Do not merge.

Backport changes/fixes related to Jpeg tile-based decoder from Honeycomb
to Gingerbread.

Bug: 3309014

//////////////////////////////////////////////////////////////////////////
This is a combination of 5 commits.

Fix 3118622, where tile-based JPEG decode does not handle the region
width correctly in progressive JPEGs when the h_samp_factor differs from one color
component to another.

To decode a region in a progressive JPEG, we decode one iMCU row at a
time, the width of which equals the region width.
However, for each color component the region width in DCT blocks depends on its h_samp_factor.
The change ensures we get the correct region width from our recorded MCU_column number.

Bug: 3118622
Change-Id: I6d3e30f946e0395c0719aee0c8e694824ab3d27f

libjpeg: Remove the old assembly code for ARM.

A much better one is coming.

Change-Id: I60d8c227d573fcbff10af363d69405e9fbd0c147

libjpeg: Use the new fast-and-accurate IDCT method for ARMv6+ devices.

As another AA&N implementation, it runs 9-10% faster than jidctfst.S
and 11-15% faster than jidctfst.c. As another IDCT method, it runs
17-20% faster than JDCT_ISLOW method and provides the same accuracy
or even better.

Change-Id: I81783c310d6dac5aaf84c03a4cf20662f466564c

libjpeg: Make both JDCT_IFAST and JDCT_ISLOW use armv6_idct.

Change-Id: Iae9c402ec7e1c6b078f404fec995162c8091f383

Fix the JPEG tile decode issue in the case of JPEGs having restart
markers.

The fix stores the restart information on each index point and restores
the restart information when we do the tile decode.

Bug: 3312406
////////////////////////////////////////////////////////////////////////////
diff --git a/Android.mk b/Android.mk
index 9e1c42e..2670652 100644
--- a/Android.mk
+++ b/Android.mk
@@ -10,8 +10,8 @@
 	jdatadst.c jdatasrc.c jdcoefct.c jdcolor.c jddctmgr.c jdhuff.c \
 	jdinput.c jdmainct.c jdmarker.c jdmaster.c jdmerge.c jdphuff.c \
 	jdpostct.c jdsample.c jdtrans.c jerror.c jfdctflt.c jfdctfst.c \
-	jfdctint.c jidctflt.c jidctred.c jquant1.c \
-	jquant2.c jutils.c jmemmgr.c \
+	jfdctint.c jidctflt.c jidctfst.c jidctint.c jidctred.c jquant1.c \
+	jquant2.c jutils.c jmemmgr.c armv6_idct.S
 
 # use ashmem as libjpeg decoder's backing store
 LOCAL_CFLAGS += -DUSE_ANDROID_ASHMEM
@@ -23,21 +23,6 @@
 #LOCAL_SRC_FILES += \
 #	jmem-android.c
 
-
-# the assembler is only for the ARM version, don't break the Linux sim
-ifneq ($(TARGET_ARCH),arm)
-ANDROID_JPEG_NO_ASSEMBLER := true
-endif
-
-# temp fix until we understand why this broke cnn.com
-#ANDROID_JPEG_NO_ASSEMBLER := true
-
-ifeq ($(strip $(ANDROID_JPEG_NO_ASSEMBLER)),true)
-LOCAL_SRC_FILES += jidctint.c jidctfst.c
-else
-LOCAL_SRC_FILES += jidctint.c jidctfst.S
-endif
-
 LOCAL_CFLAGS += -DAVOID_TABLES 
 LOCAL_CFLAGS += -O3 -fstrict-aliasing -fprefetch-loop-arrays
 #LOCAL_CFLAGS += -march=armv6j
@@ -45,6 +30,9 @@
 # enable tile based decode
 LOCAL_CFLAGS += -DANDROID_TILE_BASED_DECODE
 
+# enable armv6 idct assembly
+LOCAL_CFLAGS += -DANDROID_ARMV6_IDCT
+
 LOCAL_MODULE:= libjpeg
 
 LOCAL_SHARED_LIBRARIES := \
diff --git a/armv6_idct.S b/armv6_idct.S
new file mode 100644
index 0000000..18e4e8a
--- /dev/null
+++ b/armv6_idct.S
@@ -0,0 +1,366 @@
+/*
+ * Copyright (C) 2010 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * This is a fast-and-accurate implementation of inverse Discrete Cosine
+ * Transform (IDCT) for ARMv6+. It also performs dequantization of the input
+ * coefficients just like other methods.
+ *
+ * This implementation is based on the scaled 1-D DCT algorithm proposed by
+ * Arai, Agui, and Nakajima. The following code is based on the figure 4-8
+ * on page 52 of the JPEG textbook by Pennebaker and Mitchell. Coefficients
+ * are (almost) directly mapped into registers.
+ *
+ * The accuracy is achieved by using SMULWy and SMLAWy instructions. Both
+ * multiply 32 bits by 16 bits and store the top 32 bits of the result. It
+ * makes 32-bit fixed-point arithmetic possible without overflow. That is
+ * why jpeg_idct_ifast(), which is written in C, cannot be improved.
+ *
+ * More tricks are used to gain more speed. First of all, we use as many
+ * registers as possible. ARM processor has 16 registers including sp (r13)
+ * and pc (r15), so only 14 registers can be used without limitations. In
+ * general, we let r0 to r7 hold the coefficients; r10 and r11 hold four
+ * 16-bit constants; r12 and r14 hold two of the four arguments; and r8 holds
+ * an intermediate value. In the second pass, r9 is the loop counter. In the
+ * first pass, r8 to r11 are used to hold quantization values, so the loop
+ * counter is held by sp. Yes, the stack pointer. Since it must be aligned to
+ * a 4-byte boundary all the time, we align it to a 32-byte boundary and use
+ * bits 2 to 4 (bit 0 = LSB). As a result, we actually use 14.1 registers. :-)
+ *
+ * Second, we rearrange quantization values to access them sequentially. The
+ * table is first transposed, and the new columns are placed in the order of
+ * 7, 5, 1, 3, 0, 2, 4, 6. Thus we can use LDMDB to load four values at a
+ * time. Rearranging coefficients also helps, but that requires changing a
+ * dozen files, which does not seem worth it. In addition, we choose to scale
+ * up quantization values by 13 bits, so the coefficients are scaled up by
+ * 16 bits after both passes. Then we can pack and saturate them two at a
+ * time using PKHTB and USAT16 instructions.
+ *
+ * Third, we reorder the instructions to avoid bubbles in the pipeline. This
+ * is done by hand according to the cycle timings and the interlock behavior
+ * described in the technical reference manual of ARM1136JF-S. We also take
+ * advantage of dual issue processors by interleaving instructions with
+ * dependencies. It has been benchmarked on four devices and all the results
+ * showed distinguishable improvements. Note that PLD instructions actually
+ * slow things down, so they are removed at the last minute. In the future,
+ * this might be further improved using a system profiler.
+ */
+
+#ifdef __arm__
+#include <machine/cpu-features.h>
+#endif
+
+#if __ARM_ARCH__ >= 6
+
+// void armv6_idct(short *coefs, int *quans, unsigned char **rows, int col)
+    .arm
+    .text
+    .align
+    .global armv6_idct
+    .func   armv6_idct
+
+armv6_idct:
+    // Push r4-r12 and r14 (lr); r0-r3, sp (r13) and pc (r15) are not saved.
+    stmdb   sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, r14}
+
+    // r12 = quans + 256 bytes, r14 = coefs + 16 bytes; both pre-decrement in pass 1.
+    sub     r4, sp, #236
+    bic     sp, r4, #31
+    add     r5, sp, #224
+    add     r12, r1, #256
+    stm     r5, {r2, r3, r4}
+    add     r14, r0, #16
+
+pass1_head:
+    // Load quantization values. (q[0, 2, 4, 6])
+    ldmdb   r12!, {r8, r9, r10, r11}
+
+    // Load coefficients. (c[4, 1, 2, 3, 0, 5, 6, 7])
+    ldrsh   r4, [r14, #-2] !
+    ldrsh   r1, [r14, #16]
+    ldrsh   r2, [r14, #32]
+    ldrsh   r3, [r14, #48]
+    ldrsh   r0, [r14, #64]
+    ldrsh   r5, [r14, #80]
+    ldrsh   r6, [r14, #96]
+    ldrsh   r7, [r14, #112]
+
+    // r4 = q[0] * c[0];
+    mul     r4, r8, r4
+
+    // Check if ACs are all zero.
+    cmp     r0, #0
+    orreqs  r8, r1, r2
+    orreqs  r8, r3, r5
+    orreqs  r8, r6, r7
+    beq     pass1_zero
+
+    // Step 1: Dequantizations.
+
+    // r2 = q[2] * c[2];
+    // r0 = q[4] * c[4] + r4;
+    // r6 = q[6] * c[6] + r2;
+    mul     r2, r9, r2
+    mla     r0, r10, r0, r4
+    mla     r6, r11, r6, r2
+
+    // Load quantization values. (q[7, 5, 1, 3])
+    ldmdb   r12!, {r8, r9, r10, r11}
+
+    // r4 = r4 * 2 - r0 = -(r0 - r4 * 2);
+    // r2 = r2 * 2 - r6 = -(r6 - r2 * 2);
+    rsb     r4, r0, r4, lsl #1
+    rsb     r2, r6, r2, lsl #1
+
+    // r7 = q[7] * c[7];
+    // r5 = q[5] * c[5];
+    // r1 = q[1] * c[1] + r7;
+    // r3 = q[3] * c[3] + r5;
+    mul     r7, r8, r7
+    mul     r5, r9, r5
+    mla     r1, r10, r1, r7
+    mla     r3, r11, r3, r5
+
+    // Load constants.
+    ldrd    r10, constants
+
+    // Step 2: Rotations and Butterflies.
+
+    // r7 = r1 - r7 * 2;
+    // r1 = r1 - r3;
+    // r5 = r5 * 2 - r3 = -(r3 - r5 * 2);
+    // r3 = r1 + r3 * 2;
+    // r8 = r5 + r7;
+    sub     r7, r1, r7, lsl #1
+    sub     r1, r1, r3
+    rsb     r5, r3, r5, lsl #1
+    add     r3, r1, r3, lsl #1
+    add     r8, r5, r7
+
+    // r2 = r2 * 1.41421 = r2 * 27146 / 65536 + r2;
+    // r8 = r8 * 1.84776 / 8 = r8 * 15137 / 65536;
+    // r1 = r1 * 1.41421 = r1 * 27146 / 65536 + r1;
+    smlawt  r2, r2, r10, r2
+    smulwb  r8, r8, r10
+    smlawt  r1, r1, r10, r1
+
+    // r0 = r0 + r6;
+    // r2 = r2 - r6;
+    // r6 = r0 - r6 * 2;
+    add     r0, r0, r6
+    sub     r2, r2, r6
+    sub     r6, r0, r6, lsl #1
+
+    // r5 = r5 * -2.61313 / 8 + r8 = r5 * -21407 / 65536 + r8;
+    // r8 = r7 * -1.08239 / 8 + r8 = r7 * -8867 / 65536 + r8;
+    smlawt  r5, r5, r11, r8
+    smlawb  r8, r7, r11, r8
+
+    // r4 = r4 + r2;
+    // r0 = r0 + r3;
+    // r2 = r4 - r2 * 2;
+    add     r4, r4, r2
+    add     r0, r0, r3
+    sub     r2, r4, r2, lsl #1
+
+    // r7 = r5 * 8 - r3 = -(r3 - r5 * 8);
+    // r3 = r0 - r3 * 2;
+    // r1 = r1 - r7;
+    // r4 = r4 + r7;
+    // r5 = r8 * 8 - r1 = -(r1 - r8 * 8);
+    // r7 = r4 - r7 * 2;
+    rsb     r7, r3, r5, lsl #3
+    sub     r3, r0, r3, lsl #1
+    sub     r1, r1, r7
+    add     r4, r4, r7
+    rsb     r5, r1, r8, lsl #3
+    sub     r7, r4, r7, lsl #1
+
+    // r2 = r2 + r1;
+    // r6 = r6 + r5;
+    // r1 = r2 - r1 * 2;
+    // r5 = r6 - r5 * 2;
+    add     r2, r2, r1
+    add     r6, r6, r5
+    sub     r1, r2, r1, lsl #1
+    sub     r5, r6, r5, lsl #1
+
+    // Step 3: Reorder and Save.
+
+    str     r0, [sp, #-4] !
+    str     r4, [sp, #32]
+    str     r2, [sp, #64]
+    str     r6, [sp, #96]
+    str     r5, [sp, #128]
+    str     r1, [sp, #160]
+    str     r7, [sp, #192]
+    str     r3, [sp, #224]
+    b       pass1_tail
+
+    // Precomputed 16-bit constants: 27146, 15137, -21407, -8867.
+    // Put them in the middle since LDRD only accepts offsets from -255 to 255.
+    .align  3
+constants:
+    .word   0x6a0a3b21
+    .word   0xac61dd5d
+
+pass1_zero:
+    str     r4, [sp, #-4] !
+    str     r4, [sp, #32]
+    str     r4, [sp, #64]
+    str     r4, [sp, #96]
+    str     r4, [sp, #128]
+    str     r4, [sp, #160]
+    str     r4, [sp, #192]
+    str     r4, [sp, #224]
+    sub     r12, r12, #16
+
+pass1_tail:
+    ands    r9, sp, #31
+    bne     pass1_head
+
+    // r12 = rows, r14 = col.
+    ldr     r12, [sp, #256]
+    ldr     r14, [sp, #260]
+
+    // Load constants.
+    ldrd    r10, constants
+
+pass2_head:
+    // Load coefficients. (c[0, 1, 2, 3, 4, 5, 6, 7])
+    ldmia   sp!, {r0, r1, r2, r3, r4, r5, r6, r7}
+
+    // r0 = r0 + 0x00808000;
+    add     r0, r0, #0x00800000
+    add     r0, r0, #0x00008000
+
+    // Step 1: Analog to the first pass.
+
+    // r0 = r0 + r4;
+    // r6 = r6 + r2;
+    add     r0, r0, r4
+    add     r6, r6, r2
+
+    // r4 = r0 - r4 * 2;
+    // r2 = r2 * 2 - r6 = -(r6 - r2 * 2);
+    sub     r4, r0, r4, lsl #1
+    rsb     r2, r6, r2, lsl #1
+
+    // r1 = r1 + r7;
+    // r3 = r3 + r5;
+    add     r1, r1, r7
+    add     r3, r3, r5
+
+    // Step 2: Rotations and Butterflies.
+
+    // r7 = r1 - r7 * 2;
+    // r1 = r1 - r3;
+    // r5 = r5 * 2 - r3 = -(r3 - r5 * 2);
+    // r3 = r1 + r3 * 2;
+    // r8 = r5 + r7;
+    sub     r7, r1, r7, lsl #1
+    sub     r1, r1, r3
+    rsb     r5, r3, r5, lsl #1
+    add     r3, r1, r3, lsl #1
+    add     r8, r5, r7
+
+    // r2 = r2 * 1.41421 = r2 * 27146 / 65536 + r2;
+    // r8 = r8 * 1.84776 / 8 = r8 * 15137 / 65536;
+    // r1 = r1 * 1.41421 = r1 * 27146 / 65536 + r1;
+    smlawt  r2, r2, r10, r2
+    smulwb  r8, r8, r10
+    smlawt  r1, r1, r10, r1
+
+    // r0 = r0 + r6;
+    // r2 = r2 - r6;
+    // r6 = r0 - r6 * 2;
+    add     r0, r0, r6
+    sub     r2, r2, r6
+    sub     r6, r0, r6, lsl #1
+
+    // r5 = r5 * -2.61313 / 8 + r8 = r5 * -21407 / 65536 + r8;
+    // r8 = r7 * -1.08239 / 8 + r8 = r7 * -8867 / 65536 + r8;
+    smlawt  r5, r5, r11, r8
+    smlawb  r8, r7, r11, r8
+
+    // r4 = r4 + r2;
+    // r0 = r0 + r3;
+    // r2 = r4 - r2 * 2;
+    add     r4, r4, r2
+    add     r0, r0, r3
+    sub     r2, r4, r2, lsl #1
+
+    // r7 = r5 * 8 - r3 = -(r3 - r5 * 8);
+    // r3 = r0 - r3 * 2;
+    // r1 = r1 - r7;
+    // r4 = r4 + r7;
+    // r5 = r8 * 8 - r1 = -(r1 - r8 * 8);
+    // r7 = r4 - r7 * 2;
+    rsb     r7, r3, r5, lsl #3
+    sub     r3, r0, r3, lsl #1
+    sub     r1, r1, r7
+    add     r4, r4, r7
+    rsb     r5, r1, r8, lsl #3
+    sub     r7, r4, r7, lsl #1
+
+    // r2 = r2 + r1;
+    // r6 = r6 + r5;
+    // r1 = r2 - r1 * 2;
+    // r5 = r6 - r5 * 2;
+    add     r2, r2, r1
+    add     r6, r6, r5
+    sub     r1, r2, r1, lsl #1
+    sub     r5, r6, r5, lsl #1
+
+    // Step 3: Reorder and Save.
+
+    // Load output pointer.
+    ldr     r8, [r12], #4
+
+    // For little endian: r6, r2, r4, r0, r3, r7, r1, r5.
+    pkhtb   r6, r6, r4, asr #16
+    pkhtb   r2, r2, r0, asr #16
+    pkhtb   r3, r3, r1, asr #16
+    pkhtb   r7, r7, r5, asr #16
+    usat16  r6, #8, r6
+    usat16  r2, #8, r2
+    usat16  r3, #8, r3
+    usat16  r7, #8, r7
+    orr     r0, r2, r6, lsl #8
+    orr     r1, r7, r3, lsl #8
+
+#ifdef __ARMEB__
+    // Reverse bytes for big endian.
+    rev     r0, r0
+    rev     r1, r1
+#endif
+
+    // Use STR instead of STRD to support unaligned access.
+    str     r0, [r8, r14] !
+    str     r1, [r8, #4]
+
+pass2_tail:
+    adds    r9, r9, #0x10000000
+    bpl     pass2_head
+
+    ldr     sp, [sp, #8]
+    add     sp, sp, #236
+
+    ldmia   sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, r14}
+    bx      lr
+    .endfunc
+
+#endif
diff --git a/jdcoefct.c b/jdcoefct.c
index 9e8040b..e6e9506 100644
--- a/jdcoefct.c
+++ b/jdcoefct.c
@@ -277,15 +277,24 @@
   unsigned int MCUs_per_row = cinfo->MCUs_per_row;
 #ifdef ANDROID_TILE_BASED_DECODE
   if (cinfo->tile_decode) {
+    int iMCU_width_To_MCU_width;
+    if (cinfo->comps_in_scan > 1) {
+      // Interleaved
+      iMCU_width_To_MCU_width = 1;
+    } else {
+      // Non-interleaved
+      iMCU_width_To_MCU_width = cinfo->cur_comp_info[0]->h_samp_factor;
+    }
     MCUs_per_row = jmin(MCUs_per_row,
         (cinfo->coef->column_right_boundary - cinfo->coef->column_left_boundary)
-        * cinfo->entropy->index->MCU_sample_size * cinfo->max_h_samp_factor);
+        * cinfo->entropy->index->MCU_sample_size * iMCU_width_To_MCU_width);
   }
 #endif
 
   /* Loop to process one whole iMCU row */
   for (yoffset = coef->MCU_vert_offset; yoffset < coef->MCU_rows_per_iMCU_row;
        yoffset++) {
+   // configure huffman decoder
 #ifdef ANDROID_TILE_BASED_DECODE
     if (cinfo->tile_decode) {
       huffman_scan_header scan_header =
@@ -296,8 +305,10 @@
               [col_offset + yoffset * scan_header.MCUs_per_row]);
     }
 #endif
+
+    // zero all blocks
     for (MCU_col_num = coef->MCU_ctr; MCU_col_num < MCUs_per_row;
-	 MCU_col_num++) {
+          MCU_col_num++) {
       /* Construct list of pointers to DCT blocks belonging to this MCU */
       blkn = 0;			/* index of current DCT block within MCU */
       for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
@@ -309,7 +320,7 @@
             coef->MCU_buffer[blkn++] = buffer_ptr++;
 #ifdef ANDROID_TILE_BASED_DECODE
             if (cinfo->tile_decode && cinfo->input_scan_number == 0) {
-              // need to do pre-zero ourself.
+              // need to do pre-zero ourselves.
               jzero_far((void FAR *) coef->MCU_buffer[blkn-1],
                         (size_t) (SIZEOF(JBLOCK)));
             }
@@ -317,12 +328,14 @@
           }
         }
       }
+
+
       /* Try to fetch the MCU. */
       if (! (*cinfo->entropy->decode_mcu) (cinfo, coef->MCU_buffer)) {
-	/* Suspension forced; update state counters and exit */
-	coef->MCU_vert_offset = yoffset;
-	coef->MCU_ctr = MCU_col_num;
-	return JPEG_SUSPENDED;
+        /* Suspension forced; update state counters and exit */
+        coef->MCU_vert_offset = yoffset;
+        coef->MCU_ctr = MCU_col_num;
+        return JPEG_SUSPENDED;
       }
     }
     /* Completed an MCU row, but perhaps not an iMCU row */
@@ -584,14 +597,14 @@
     int start_block = 0;
 #if ANDROID_TILE_BASED_DECODE
     if (cinfo->tile_decode) {
+      // width_in_blocks for a component depends on its h_samp_factor.
       width_in_blocks = jmin(width_in_blocks,
         (cinfo->coef->MCU_column_right_boundary -
          cinfo->coef->MCU_column_left_boundary) *
-         cinfo->max_h_samp_factor /
          compptr->h_samp_factor);
       start_block = coef->pub.MCU_columns_to_skip *
-        cinfo->max_h_samp_factor / compptr->h_samp_factor;
-    }
+         compptr->h_samp_factor;
+   }
 #endif
     /* Loop over all DCT blocks to be processed. */
     for (block_row = 0; block_row < block_rows; block_row++) {
diff --git a/jddctmgr.c b/jddctmgr.c
index bbf8d0e..74a96db 100644
--- a/jddctmgr.c
+++ b/jddctmgr.c
@@ -20,6 +20,35 @@
 #include "jpeglib.h"
 #include "jdct.h"		/* Private declarations for DCT subsystem */
 
+#ifdef ANDROID_ARMV6_IDCT
+  #undef ANDROID_ARMV6_IDCT
+  #ifdef __arm__
+    #include <machine/cpu-features.h>
+    #if __ARM_ARCH__ >= 6
+      #define ANDROID_ARMV6_IDCT
+    #else
+      #warning "ANDROID_ARMV6_IDCT is disabled"
+    #endif
+  #endif
+#endif
+
+#ifdef ANDROID_ARMV6_IDCT
+
+/* Intentionally declare the prototype with arguments of primitive types instead
+ * of type-defined ones. This will at least generate some warnings if jmorecfg.h
+ * is changed and becomes incompatible with the assembly code.
+ */
+extern void armv6_idct(short *coefs, int *quans, unsigned char **rows, int col);
+
+void jpeg_idct_armv6 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+		 JCOEFPTR coef_block,
+		 JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  IFAST_MULT_TYPE *dct_table = (IFAST_MULT_TYPE *)compptr->dct_table;
+  armv6_idct(coef_block, dct_table, output_buf, output_col);
+}
+
+#endif
 
 /*
  * The decompressor input side (jdinput.c) saves away the appropriate
@@ -115,6 +144,13 @@
 #endif
     case DCTSIZE:
       switch (cinfo->dct_method) {
+#ifdef ANDROID_ARMV6_IDCT
+      case JDCT_ISLOW:
+      case JDCT_IFAST:
+	method_ptr = jpeg_idct_armv6;
+	method = JDCT_IFAST;
+	break;
+#else /* ANDROID_ARMV6_IDCT */
 #ifdef DCT_ISLOW_SUPPORTED
       case JDCT_ISLOW:
 	method_ptr = jpeg_idct_islow;
@@ -127,6 +163,7 @@
 	method = JDCT_IFAST;
 	break;
 #endif
+#endif /* ANDROID_ARMV6_IDCT */
 #ifdef DCT_FLOAT_SUPPORTED
       case JDCT_FLOAT:
 	method_ptr = jpeg_idct_float;
@@ -181,6 +218,27 @@
 	 * IFAST_SCALE_BITS.
 	 */
 	IFAST_MULT_TYPE * ifmtbl = (IFAST_MULT_TYPE *) compptr->dct_table;
+#ifdef ANDROID_ARMV6_IDCT
+	/* Precomputed values scaled up by 15 bits. */
+	static const unsigned short scales[DCTSIZE2] = {
+	  32768, 45451, 42813, 38531, 32768, 25746, 17734,  9041,
+	  45451, 63042, 59384, 53444, 45451, 35710, 24598, 12540,
+	  42813, 59384, 55938, 50343, 42813, 33638, 23170, 11812,
+	  38531, 53444, 50343, 45308, 38531, 30274, 20853, 10631,
+	  32768, 45451, 42813, 38531, 32768, 25746, 17734,  9041,
+	  25746, 35710, 33638, 30274, 25746, 20228, 13933,  7103,
+	  17734, 24598, 23170, 20853, 17734, 13933,  9598,  4893,
+	   9041, 12540, 11812, 10631,  9041,  7103,  4893,  2494,
+	};
+	/* Inverse map of [7, 5, 1, 3, 0, 2, 4, 6]. */
+	static const char orders[DCTSIZE] = {4, 2, 5, 3, 6, 1, 7, 0};
+	/* Reorder the columns after transposing. */
+	for (i = 0; i < DCTSIZE2; ++i) {
+	  int j = ((i & 7) << 3) + orders[i >> 3];
+	  ifmtbl[j] = (qtbl->quantval[i] * scales[i] + 2) >> 2;
+	}
+#else /* ANDROID_ARMV6_IDCT */
+
 #define CONST_BITS 14
 	static const INT16 aanscales[DCTSIZE2] = {
 	  /* precomputed values scaled up by 14 bits */
@@ -201,6 +259,7 @@
 				  (INT32) aanscales[i]),
 		    CONST_BITS-IFAST_SCALE_BITS);
 	}
+#endif /* ANDROID_ARMV6_IDCT */
       }
       break;
 #endif
diff --git a/jdhuff.c b/jdhuff.c
index 0d704a5..bc5d4fd 100644
--- a/jdhuff.c
+++ b/jdhuff.c
@@ -295,20 +295,10 @@
 		      int nbits)
 /* Load up the bit buffer to a depth of at least nbits */
 {
-  j_decompress_ptr cinfo = state->cinfo;
-  if (cinfo->tile_decode &&
-      cinfo->restart_interval == 0 &&
-      cinfo->unread_marker >= 0xd0 &&
-      cinfo->unread_marker <= 0xd7 &&
-      nbits > bits_left
-      ) {
-      // Skip the restart marker.
-    cinfo->marker->next_restart_num = cinfo->unread_marker - 0xd0;
-    process_restart(cinfo);
-  }
   /* Copy heavily used state fields into locals (hopefully registers) */
   register const JOCTET * next_input_byte = state->next_input_byte;
   register size_t bytes_in_buffer = state->bytes_in_buffer;
+  j_decompress_ptr cinfo = state->cinfo;
 
   /* Attempt to load at least MIN_GET_BITS bits into get_buffer. */
   /* (It is assumed that no request will be for more than that many bits.) */
@@ -509,24 +499,20 @@
 }
 
 /*
- * Configure the Huffman decoder reader position and bit buffer.
+ * Save the current Huffman decode position and the DC coefficients
+ * for each component into bitstream_offset and dc_info[], respectively.
  */
-GLOBAL(void)
-jpeg_configure_huffman_decoder(j_decompress_ptr cinfo,
-        huffman_offset_data offset)
+METHODDEF(void)
+get_huffman_decoder_configuration(j_decompress_ptr cinfo,
+        huffman_offset_data *offset)
 {
-  unsigned int bitstream_offset = offset.bitstream_offset;
-  int blkn, i;
-
-  cinfo->restart_interval = 0;
-  cinfo->unread_marker = 0;
-
-  unsigned int byte_offset = bitstream_offset >> LOG_TWO_BIT_BUF_SIZE;
-  unsigned int bit_in_bit_buffer =
-      bitstream_offset & ((1 << LOG_TWO_BIT_BUF_SIZE) - 1);
-
-  jset_input_stream_position_bit(cinfo, byte_offset,
-          bit_in_bit_buffer, offset.get_buffer);
+  huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+  short int *dc_info = offset->prev_dc;
+  int i;
+  jpeg_get_huffman_decoder_configuration(cinfo, offset);
+  for (i = 0; i < cinfo->comps_in_scan; i++) {
+    dc_info[i] = entropy->saved.last_dc_val[i];
+  }
 }
 
 /*
@@ -546,6 +532,10 @@
 	return;
   }
 
+  // Save restarts_to_go and next_restart_num
+  offset->restarts_to_go = (unsigned short) entropy->restarts_to_go;
+  offset->next_restart_num = cinfo->marker->next_restart_num;
+
   offset->bitstream_offset =
       (jget_input_stream_position(cinfo) << LOG_TWO_BIT_BUF_SIZE)
       + entropy->bitstate.bits_left;
@@ -570,20 +560,28 @@
 }
 
 /*
- * Save the current Huffman deocde position and the DC coefficients
- * for each component into bitstream_offset and dc_info[], respectively.
+ * Configure the Huffman decoder reader position and bit buffer.
  */
-METHODDEF(void)
-get_huffman_decoder_configuration(j_decompress_ptr cinfo,
-        huffman_offset_data *offset)
+GLOBAL(void)
+jpeg_configure_huffman_decoder(j_decompress_ptr cinfo,
+        huffman_offset_data offset)
 {
   huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
-  short int *dc_info = offset->prev_dc;
-  int i;
-  jpeg_get_huffman_decoder_configuration(cinfo, offset);
-  for (i = 0; i < cinfo->comps_in_scan; i++) {
-    dc_info[i] = entropy->saved.last_dc_val[i];
-  }
+
+  // Restore restarts_to_go and next_restart_num
+  cinfo->unread_marker = 0;
+  entropy->restarts_to_go = offset.restarts_to_go;
+  cinfo->marker->next_restart_num = offset.next_restart_num;
+
+  unsigned int bitstream_offset = offset.bitstream_offset;
+  int blkn, i;
+
+  unsigned int byte_offset = bitstream_offset >> LOG_TWO_BIT_BUF_SIZE;
+  unsigned int bit_in_bit_buffer =
+      bitstream_offset & ((1 << LOG_TWO_BIT_BUF_SIZE) - 1);
+
+  jset_input_stream_position_bit(cinfo, byte_offset,
+          bit_in_bit_buffer, offset.get_buffer);
 }
 
 /*
diff --git a/jdphuff.c b/jdphuff.c
index a58cdd2..922017e 100644
--- a/jdphuff.c
+++ b/jdphuff.c
@@ -632,21 +632,6 @@
 }
 
 /*
- * Configure the Huffman decoder to decode the image
- * starting from (iMCU_row_offset, iMCU_col_offset).
- */
-METHODDEF(void)
-configure_huffman_decoder(j_decompress_ptr cinfo, huffman_offset_data offset)
-{
-  int i;
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
-  jpeg_configure_huffman_decoder(cinfo, offset);
-  entropy->saved.EOBRUN = offset.EOBRUN;
-  for (i = 0; i < cinfo->comps_in_scan; i++)
-    entropy->saved.last_dc_val[i] = offset.prev_dc[i];
-}
-
-/*
  * Save the current Huffman deocde position and the DC coefficients
  * for each component into bitstream_offset and dc_info[], respectively.
  */
@@ -656,16 +641,103 @@
 {
   int i;
   phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
-  jpeg_get_huffman_decoder_configuration(cinfo, offset);
+  jpeg_get_huffman_decoder_configuration_progressive(cinfo, offset);
   offset->EOBRUN = entropy->saved.EOBRUN;
   for (i = 0; i < cinfo->comps_in_scan; i++)
     offset->prev_dc[i] = entropy->saved.last_dc_val[i];
 }
 
+
+/*
+ * Save the current Huffman decoder position and the bit buffer
+ * into bitstream_offset and get_buffer, respectively.
+ */
+GLOBAL(void)
+jpeg_get_huffman_decoder_configuration_progressive(j_decompress_ptr cinfo,
+        huffman_offset_data *offset)
+{
+  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
+
+  if (cinfo->restart_interval) {
+    // We are at the end of a data segment
+    if (entropy->restarts_to_go == 0)
+      if (! process_restart(cinfo))
+	return;
+  }
+
+  // Save restarts_to_go and next_restart_num.
+  offset->restarts_to_go = (unsigned short) entropy->restarts_to_go;
+  offset->next_restart_num = cinfo->marker->next_restart_num;
+
+  offset->bitstream_offset =
+      (jget_input_stream_position(cinfo) << LOG_TWO_BIT_BUF_SIZE)
+      + entropy->bitstate.bits_left;
+
+  offset->get_buffer = entropy->bitstate.get_buffer;
+}
+
+
+/*
+ * Configure the Huffman decoder to decode the image
+ * starting from (iMCU_row_offset, iMCU_col_offset).
+ */
+METHODDEF(void)
+configure_huffman_decoder(j_decompress_ptr cinfo, huffman_offset_data offset)
+{
+  int i;
+  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
+  jpeg_configure_huffman_decoder_progressive(cinfo, offset);
+  entropy->saved.EOBRUN = offset.EOBRUN;
+  for (i = 0; i < cinfo->comps_in_scan; i++)
+    entropy->saved.last_dc_val[i] = offset.prev_dc[i];
+}
+
+/*
+ * Configure the Huffman decoder reader position and bit buffer.
+ */
+GLOBAL(void)
+jpeg_configure_huffman_decoder_progressive(j_decompress_ptr cinfo,
+        huffman_offset_data offset)
+{
+	phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
+
+  // Restore restarts_to_go and next_restart_num
+  cinfo->unread_marker = 0;
+  entropy->restarts_to_go = offset.restarts_to_go;
+  cinfo->marker->next_restart_num = offset.next_restart_num;
+
+  unsigned int bitstream_offset = offset.bitstream_offset;
+  int blkn, i;
+
+  unsigned int byte_offset = bitstream_offset >> LOG_TWO_BIT_BUF_SIZE;
+  unsigned int bit_in_bit_buffer =
+      bitstream_offset & ((1 << LOG_TWO_BIT_BUF_SIZE) - 1);
+
+  jset_input_stream_position_bit(cinfo, byte_offset,
+          bit_in_bit_buffer, offset.get_buffer);
+}
+
+GLOBAL(void)
+jpeg_configure_huffman_index_scan(j_decompress_ptr cinfo,
+        huffman_index *index, int scan_no, int offset)
+{
+  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
+  if (scan_no >= index->scan_count) {
+    index->scan = realloc(index->scan,
+                    (scan_no + 1) * sizeof(huffman_scan_header));
+    index->mem_used += (scan_no - index->scan_count + 1)
+      * (sizeof(huffman_scan_header) + cinfo->total_iMCU_rows
+      * sizeof(huffman_offset_data*));
+    index->scan_count = scan_no + 1;
+  }
+  index->scan[scan_no].offset = (huffman_offset_data**)malloc(
+          cinfo->total_iMCU_rows * sizeof(huffman_offset_data*));
+  index->scan[scan_no].bitstream_offset = offset;
+}
+
 /*
  * Module initialization routine for progressive Huffman entropy decoding.
  */
-
 GLOBAL(void)
 jinit_phuff_decoder (j_decompress_ptr cinfo)
 {
@@ -697,22 +769,4 @@
       *coef_bit_ptr++ = -1;
 }
 
-GLOBAL(void)
-jpeg_configure_huffman_index_scan(j_decompress_ptr cinfo,
-        huffman_index *index, int scan_no, int offset)
-{
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
-  if (scan_no >= index->scan_count) {
-    index->scan = realloc(index->scan,
-                    (scan_no + 1) * sizeof(huffman_scan_header));
-    index->mem_used += (scan_no - index->scan_count + 1)
-      * (sizeof(huffman_scan_header) + cinfo->total_iMCU_rows
-      * sizeof(huffman_offset_data*));
-    index->scan_count = scan_no + 1;
-  }
-  index->scan[scan_no].offset = (huffman_offset_data**)malloc(
-          cinfo->total_iMCU_rows * sizeof(huffman_offset_data*));
-  index->scan[scan_no].bitstream_offset = offset;
-}
-
 #endif /* D_PROGRESSIVE_SUPPORTED */
diff --git a/jidctfst.S b/jidctfst.S
deleted file mode 100644
index 34e1c24..0000000
--- a/jidctfst.S
+++ /dev/null
@@ -1,476 +0,0 @@
-/*
- * Copyright (C) 2008 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <machine/cpu-features.h>
-
-    .text
-    .align
-
-    .global jpeg_idct_ifast
-    .func   jpeg_idct_ifast
-
-// NOTE: sb=r9, fp=r11 ip=r12, sp=r13, lr=r14, pc=r15
-
-// jpeg_idct_ifast (j_decompress_ptr       cinfo,
-//                 jpeg_component_info *   compptr,
-//                 short*                  coef_block,
-//                 unsigned char*          output_buf,
-//                 int                     output_col)
-
-#define  local_TMP0123       sp
-#define  local_TMP0          [sp, #0]
-#define  local_TMP1          [sp, #4]
-#define  local_TMP2          [sp, #8]
-#define  local_TMP3          [sp, #12]
-#define  local_RANGE_TABLE   [sp, #16]
-#define  local_OUTPUT_COL    [sp, #20]
-#define  local_OUTPUT_BUF    [sp, #24]
-#define  local_UNUSED        [sp, #28]
-#define  off_WORKSPACE       32
-#define  local_WORKSPACE     [sp, #offWORKSPACE]
-#define  local_SIZE          (off_WORKSPACE + 8*8*4)
-
-#define  off_DECOMPRESS_range_limit_base  324
-#define  off_COMPINFO_quanttable          80
-
-#define  DCTSIZE   8
-#define  VY(x)   ((x)*DCTSIZE*2)
-#define  QY(x)   ((x)*DCTSIZE*4)
-
-#define  VX(x)   ((x)*2)
-#define  QX(x)   ((x)*4)
-
-#define  FIX_1_414213562    #362
-#define  FIX_1_082392200    #277
-#define  FIX_1_847759065    #473
-#define  FIX_2_613125930    #669
-
-#define  RANGE_MASK   1023
-
-
-
-jpeg_idct_ifast:
-    PLD     (r2, #0)
-    stmdb   sp!, {r4,r5, r6,r7, r8,r9, r10,r11, r12,lr}
-    ldr     r4, [sp, #4*10]
-    sub     sp, #local_SIZE
-
-    ldr     r10,[r1, #off_COMPINFO_quanttable]         // r10 = quanttable
-    str     r4, local_OUTPUT_COL
-    str     r3, local_OUTPUT_BUF
-    ldr     r5, [r0, #off_DECOMPRESS_range_limit_base]
-    add     r5, r5, #128
-    str     r5, local_RANGE_TABLE
-    mov     fp, r2                                      // fp = coef_block
-    add     ip, sp, #off_WORKSPACE
-
-VLoopTail:
-    ldrsh    r0, [fp, #VY(0)]
-    ldrsh    r1, [fp, #VY(1)]
-    ldrsh    r2, [fp, #VY(2)]
-    ldrsh    r3, [fp, #VY(3)]
-    ldrsh    r4, [fp, #VY(4)]
-    ldrsh    r5, [fp, #VY(5)]
-    ldrsh    r6, [fp, #VY(6)]
-    ldrsh    r7, [fp, #VY(7)]
-
-    cmp      r1, #0
-    orreqs   r8, r2, r3
-    orreqs   r8, r4, r5
-    orreqs   r8, r6, r7
-    beq      VLoopHeadZero
-
-VLoopHead:
-    // tmp0 = DEQUANTIZE(in[DCTSIZE*0], quant[DCTSIZE*0]   (r0)
-    // tmp2 = DEQUANTIZE(in[DCTSIZE*4], quant[DCTSIZE*4]   (r4)
-    // tmp1 = DEQUANTIZE(in[DCTSIZE*2], quant[DCTSIZE*2]   (r2)
-    // tmp3 = DEQUANTIZE(in[DCTSIZE*6], quant[DCTSIZE*6]   (r6)
-    // tmp10 = tmp0 + tmp2   (r0)
-    // tmp11 = tmp0 - tmp2   (r4)
-
-    ldr      r9, [r10, #QY(4)]
-    ldr      r8, [r10, #QY(0)]
-#if __ARM_HAVE_HALFWORD_MULTIPLY
-    smulbb   r4, r9, r4
-    smlabb   r0, r8, r0, r4
-#else
-    mul      r4, r9, r4
-    mul      r0, r8, r0
-    add      r0, r4
-#endif
-    ldr      r9, [r10, #QY(6)]
-    ldr      r8, [r10, #QY(2)]
-    sub      r4, r0, r4, lsl #1
-#if __ARM_HAVE_HALFWORD_MULTIPLY
-    smulbb   r6, r9, r6
-    smlabb   r2, r8, r2, r6
-#else
-    mul      r6, r9, r6
-    mul      r2, r8, r2
-    add      r2, r6
-#endif
-
-    // tmp13 = tmp1 + tmp3                                       (r2)
-    // tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13    (r6)
-    // FIX_1_4142... = 362 = 45*8 + 2
-    sub      r6, r2, r6, lsl #1
-    mov      r8, #360
-    add      r8, r8, #2
-    mul      r9, r6, r8
-
-    // tmp0 = tmp10 + tmp13;   (r0)
-    // tmp3 = tmp10 - tmp13;   (r8)
-    // tmp1 = tmp11 + tmp12;   (r4)
-    // tmp2 = tmp11 - tmp12;   (r6)
-    add     r0, r0, r2
-    rsb     r6, r2, r9, asr #8
-    sub     r8, r0, r2, lsl #1
-    add     r4, r4, r6
-    sub     r6, r4, r6, lsl #1
-
-    stmia   local_TMP0123, {r0, r4, r6, r8}
-
-    // NOTE: be sure to not user r0,r4,r6,r8 soon after stm above
-
-    // odd part
-    // tmp4 = DEQUANTIZE( in[DCTSIZE*1], quant[DCTSIZE*1] )   (r1)
-    // tmp6 = DEQUANTIZE( in[DCTSIZE*5], quant[DCTSIZE*5] )   (r5)
-    // tmp5 = DEQUANTIZE( in[DCTSIZE*3], quant[DCTSIZE*3] )   (r3)
-    // tmp7 = DEQUANTIZE( in[DCTSIZE*7], quant[DCTSIZE*7] )   (r7)
-    // z13 = tmp6 + tmp5;  (r0)
-    // z10 = tmp6 - tmp5;  (r2)
-    // z11 = tmp4 + tmp7;  (r4)
-    // z12 = tmp4 - tmp7;  (r6)
-
-    ldr     r2, [r10, #QY(1)]
-    ldr     r9, [r10, #QY(5)]
-#if __ARM_HAVE_HALFWORD_MULTIPLY
-    smulbb  r1, r2, r1
-#else
-    mul     r1, r2, r1
-#endif
-    ldr     r2, [r10, #QY(3)]
-#if __ARM_HAVE_HALFWORD_MULTIPLY
-    smulbb  r5, r9, r5
-#else
-    mul     r5, r9, r5
-#endif
-    ldr     r9, [r10, #QY(7)]
-#if __ARM_HAVE_HALFWORD_MULTIPLY
-    smlabb  r0, r2, r3, r5
-    smlabb  r4, r9, r7, r1
-#else
-    mul     r0, r2, r3
-    add     r0, r5
-    mul     r4, r9, r7
-    add     r4, r1
-#endif
-    rsb  r2, r0, r5, lsl #1
-    rsb  r6, r4, r1, lsl #1
-
-    // tmp7 = z11 + z13;                             (r7)
-    // tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); (r1)
-    // FIX_... = 360 + 2
-    add   r7, r4, r0
-    sub   r1, r4, r0
-    mov   r8, #360
-    add   r8, r8, #2
-    mul   r1, r8, r1
-
-    // z5 = MULTIPLY(z10 + z12, FIX_1_847759065);        (r8)
-    // tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5;      (r0)
-    // tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5;    (r2)
-    // FIX_1_8477... = 473 = 472 + 1
-    // FIX_1_082...  = 277 = 276 + 1
-    // FIX_2_...     = 669 = 668 + 1
-    add     r8, r2, r6
-    mov     r9, #472
-    mla     r8, r9, r8, r8
-    mov     r9, #276
-    mla     r0, r6, r9, r6
-    mov     r9, #668
-    mla     r2, r9, r2, r2
-    sub     r0, r0, r8
-    rsb     r2, r2, r8
-
-    // tmp6 = tmp12 - tmp7;  (r6)
-    // tmp5 = tmp11 - tmp6;  (r5)
-    // tmp4 = tmp10 + tmp5;  (r4)
-    rsb  r6, r7, r2, asr #8
-    rsb  r5, r6, r1, asr #8
-    add  r4, r5, r0, asr #8
-
-    ldmia local_TMP0123, {r0, r1, r2, r3}
-
-    // wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7);
-    // wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7);
-    // wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6);
-    // wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6);
-    // wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5);
-    // wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5);
-    // wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4);
-    // wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4);
-
-    add   r0, r0, r7
-    sub   r7, r0, r7, lsl #1
-    add   r1, r1, r6
-    sub   r6, r1, r6, lsl #1
-    add   r2, r2, r5
-    sub   r5, r2, r5, lsl #1
-    sub   r3, r3, r4
-    add   r4, r3, r4, lsl #1
-
-    str   r0, [ip, #QY(0)]
-    str   r1, [ip, #QY(1)]
-    str   r2, [ip, #QY(2)]
-    str   r3, [ip, #QY(3)]
-    str   r4, [ip, #QY(4)]
-    str   r5, [ip, #QY(5)]
-    str   r6, [ip, #QY(6)]
-    str   r7, [ip, #QY(7)]
-
-    // inptr++;                    /* advance pointers to next column */
-    // quantptr++;
-    // wsptr++;
-    add  fp, fp, #2
-    add  r10, r10, #4
-    add  ip, ip, #4
-    add  r0, sp, #(off_WORKSPACE + 4*8)
-    cmp  ip, r0
-    bne  VLoopTail
-
-
-
-HLoopStart:
-    // reset pointers
-    PLD     (sp, #off_WORKSPACE)
-    add     ip, sp, #off_WORKSPACE
-    ldr     r10, local_RANGE_TABLE
-
-HLoopTail:
-    // output = *output_buf++ + output_col
-    ldr      r0, local_OUTPUT_BUF
-    ldr      r1, local_OUTPUT_COL
-    ldr      r2, [r0], #4
-    str      r0, local_OUTPUT_BUF
-    add      fp, r2, r1
-
-    PLD      (ip, #32)
-    ldmia    ip!, {r0-r7}
-
-    cmp      r1, #0
-    orreqs   r8, r2, r3
-    orreqs   r8, r4, r5
-    orreqs   r8, r6, r7
-    beq      HLoopTailZero
-
-HLoopHead:
-    // tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]);    (r0)
-    // tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]);    (r4)
-    add     r0, r0, r4
-    sub     r4, r0, r4, lsl #1
-
-    // tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]);                                   (r2)
-    // tmp12 = MULTIPLY((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6], FIX_1_414213562) - tmp13;  (r6)
-    // FIX_... = 360 + 2
-    add     r2, r2, r6
-    sub     r6, r2, r6, lsl #1
-    mov     r8, #360
-    add     r8, r8, #2
-    mul     r6, r8, r6
-
-    // tmp0 = tmp10 + tmp13;   (r0)
-    // tmp3 = tmp10 - tmp13;   (r8)
-    // tmp1 = tmp11 + tmp12;   (r4)
-    // tmp2 = tmp11 - tmp12;   (r6)
-    add     r0, r0, r2
-    rsb     r6, r2, r6, asr #8
-    sub     r8, r0, r2, lsl #1
-    add     r4, r4, r6
-    sub     r6, r4, r6, lsl #1
-
-    stmia   local_TMP0123, {r0, r4, r6, r8}
-
-    // Odd part
-
-    // z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3];  (r0)
-    // z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3];  (r2)
-    // z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7];  (r4)
-    // z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7];  (r6)
-    add  r0, r5, r3
-    sub  r2, r5, r3
-    add  r4, r1, r7
-    sub  r6, r1, r7
-
-    // tmp7 = z11 + z13;                             (r7)
-    // tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); (r1)
-    // FIX_... = 360 + 2
-    add   r7, r4, r0
-    sub   r1, r4, r0
-    mov   r8, #360
-    add   r8, r8, #2
-    mul   r1, r8, r1
-
-    // z5 = MULTIPLY(z10 + z12, FIX_1_847759065);        (r8)
-    // tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5;      (r0)
-    // tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5;    (r2)
-    // FIX_1_8477... = 473 = 472 + 1
-    // FIX_1_082...  = 277 = 276 + 1
-    // FIX_2_...     = 669 = 668 + 1
-    add  r8, r2, r6
-    mov  r9, #472
-    mla  r8, r9, r8, r8
-    mov  r9, #276
-    mla  r0, r6, r9, r6
-    mov  r9, #668
-    mla  r2, r9, r2, r2
-    sub  r0, r0, r8
-    sub  r2, r8, r2
-
-    // tmp6 = tmp12 - tmp7;  (r6)
-    // tmp5 = tmp11 - tmp6;  (r5)
-    // tmp4 = tmp10 + tmp5;  (r4)
-    rsb  r6, r7, r2, asr #8
-    rsb  r5, r6, r1, asr #8
-    add  r4, r5, r0, asr #8
-
-    ldmia local_TMP0123, {r0, r1, r2, r3}
-
-    // outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3) & RANGE_MASK];
-    // outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3) & RANGE_MASK];
-    // outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3) & RANGE_MASK];
-    // outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3) & RANGE_MASK];
-    // outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3) & RANGE_MASK];
-    // outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3) & RANGE_MASK];
-    // outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3) & RANGE_MASK];
-    // outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3) & RANGE_MASK];
-
-    mov    r8, #128
-    add    r0, r0, r7
-    sub    r7, r0, r7, lsl #1
-    add    r0, r8, r0, asr #5
-    add    r7, r8, r7, asr #5
-    add    r1, r1, r6
-    sub    r6, r1, r6, lsl #1
-    add    r1, r8, r1, asr #5
-    add    r6, r8, r6, asr #5
-    add    r2, r2, r5
-    sub    r5, r2, r5, lsl #1
-    add    r2, r8, r2, asr #5
-    add    r5, r8, r5, asr #5
-    sub    r3, r3, r4
-    add    r4, r3, r4, lsl #1
-    add    r3, r8, r3, asr #5
-    add    r4, r8, r4, asr #5
-
-#if __ARM_ARCH__ >= 6
-    usat   r0, #8, r0
-    usat   r1, #8, r1
-    usat   r2, #8, r2
-    usat   r3, #8, r3
-    usat   r4, #8, r4
-    usat   r5, #8, r5
-    usat   r6, #8, r6
-    usat   r7, #8, r7
-#else
-    cmp    r0, #255
-    mvnhi  r0, r0, asr #31
-    andhi  r0, #255
-    cmp    r7, #255
-    mvnhi  r7, r7, asr #31
-    cmp    r1, #255
-    mvnhi  r1, r1, asr #31
-    andhi  r1, #255
-    cmp    r6, #255
-    mvnhi  r6, r6, asr #31
-    andhi  r6, #255
-    cmp    r2, #255
-    mvnhi  r2, r2, asr #31
-    andhi  r2, #255
-    cmp    r5, #255
-    mvnhi  r5, r5, asr #31
-    andhi  r5, #255
-    cmp    r3, #255
-    mvnhi  r3, r3, asr #31
-    cmp    r4, #255
-    mvnhi  r4, r4, asr #31
-    andhi  r4, #255
-#endif
-
-    // r3 r2 r1 r0
-    orr    r0, r0, r1, lsl #8
-    orr    r0, r0, r2, lsl #16
-    orr    r0, r0, r3, lsl #24
-
-    // r7 r6 r5 r4
-    orr    r1, r4, r5, lsl #8
-    orr    r1, r1, r6, lsl #16
-    orr    r1, r1, r7, lsl #24
-    stmia  fp, {r0, r1}
-
-    add    r0, sp, #(off_WORKSPACE + 8*8*4)
-    cmp    ip, r0
-    bne    HLoopTail
-
-Exit:
-    add    sp, sp, #local_SIZE
-    ldmia  sp!, {r4,r5, r6,r7, r8,r9, r10,r11, r12,lr}
-    bx     lr
-
-
-VLoopHeadZero:
-// ok, all AC coefficients are 0
-    ldr      r1, [r10, #QY(0)]
-    add      fp, fp, #2
-    add      r10, r10, #4
-    mul      r0, r1, r0
-    str      r0, [ip, #QY(0)]
-    str      r0, [ip, #QY(1)]
-    str      r0, [ip, #QY(2)]
-    str      r0, [ip, #QY(3)]
-    str      r0, [ip, #QY(4)]
-    str      r0, [ip, #QY(5)]
-    str      r0, [ip, #QY(6)]
-    str      r0, [ip, #QY(7)]
-    add      ip, ip, #4
-    add      r0, sp, #(off_WORKSPACE + 4*8)
-    cmp      ip, r0
-    beq      HLoopStart
-    b        VLoopTail
-
-HLoopTailZero:
-    mov      r0, r0, asr #5
-    add      r0, #128
-
-#if __ARM_ARCH__ >= 6
-    usat     r0, #8, r0
-#else
-    cmp      r0, #255
-    mvnhi    r0, r0, asr #31
-    andhi    r0, r0, #255
-#endif
-
-    orr      r0, r0, lsl #8
-    orr      r0, r0, lsl #16
-    mov      r1, r0
-    stmia    fp, {r0, r1}
-
-    add      r0, sp, #(off_WORKSPACE + 64*4)
-    cmp      ip, r0
-    beq      Exit
-    b        HLoopTail
-
-    .endfunc
diff --git a/jpeglib.h b/jpeglib.h
index 83bed4a..07e6872 100644
--- a/jpeglib.h
+++ b/jpeglib.h
@@ -649,6 +649,10 @@
 
   // save the decoder current bit buffer, entropy->bitstate.get_buffer.
   INT32 get_buffer;
+
+  // save the restart info.
+  unsigned short restarts_to_go;
+  unsigned char next_restart_num;
 } huffman_offset_data;
 
 typedef struct {