diff --git a/Android.mk b/Android.mk
index 9e1c42e..2670652 100644
--- a/Android.mk
+++ b/Android.mk
@@ -10,8 +10,8 @@
 	jdatadst.c jdatasrc.c jdcoefct.c jdcolor.c jddctmgr.c jdhuff.c \
 	jdinput.c jdmainct.c jdmarker.c jdmaster.c jdmerge.c jdphuff.c \
 	jdpostct.c jdsample.c jdtrans.c jerror.c jfdctflt.c jfdctfst.c \
-	jfdctint.c jidctflt.c jidctred.c jquant1.c \
-	jquant2.c jutils.c jmemmgr.c \
+	jfdctint.c jidctflt.c jidctfst.c jidctint.c jidctred.c jquant1.c \
+	jquant2.c jutils.c jmemmgr.c armv6_idct.S
 
 # use ashmem as libjpeg decoder's backing store
 LOCAL_CFLAGS += -DUSE_ANDROID_ASHMEM
@@ -23,21 +23,6 @@
 #LOCAL_SRC_FILES += \
 #	jmem-android.c
 
-
-# the assembler is only for the ARM version, don't break the Linux sim
-ifneq ($(TARGET_ARCH),arm)
-ANDROID_JPEG_NO_ASSEMBLER := true
-endif
-
-# temp fix until we understand why this broke cnn.com
-#ANDROID_JPEG_NO_ASSEMBLER := true
-
-ifeq ($(strip $(ANDROID_JPEG_NO_ASSEMBLER)),true)
-LOCAL_SRC_FILES += jidctint.c jidctfst.c
-else
-LOCAL_SRC_FILES += jidctint.c jidctfst.S
-endif
-
 LOCAL_CFLAGS += -DAVOID_TABLES 
 LOCAL_CFLAGS += -O3 -fstrict-aliasing -fprefetch-loop-arrays
 #LOCAL_CFLAGS += -march=armv6j
@@ -45,6 +30,9 @@
 # enable tile based decode
 LOCAL_CFLAGS += -DANDROID_TILE_BASED_DECODE
 
+# enable armv6 idct assembly
+LOCAL_CFLAGS += -DANDROID_ARMV6_IDCT
+
 LOCAL_MODULE:= libjpeg
 
 LOCAL_SHARED_LIBRARIES := \
diff --git a/armv6_idct.S b/armv6_idct.S
new file mode 100644
index 0000000..18e4e8a
--- /dev/null
+++ b/armv6_idct.S
@@ -0,0 +1,366 @@
+/*
+ * Copyright (C) 2010 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * This is a fast-and-accurate implementation of inverse Discrete Cosine
+ * Transform (IDCT) for ARMv6+. It also performs dequantization of the input
+ * coefficients just like other methods.
+ *
+ * This implementation is based on the scaled 1-D DCT algorithm proposed by
+ * Arai, Agui, and Nakajima. The following code is based on the figure 4-8
+ * on page 52 of the JPEG textbook by Pennebaker and Mitchell. Coefficients
+ * are (almost) directly mapped into registers.
+ *
+ * The accuracy is achieved by using SMULWy and SMLAWy instructions. Both
+ * multiply 32 bits by 16 bits and store the top 32 bits of the result. It
+ * makes 32-bit fixed-point arithmetic possible without overflow. That is
+ * why jpeg_idct_ifast(), which is written in C, cannot be improved.
+ *
+ * More tricks are used to gain more speed. First of all, we use as many
+ * registers as possible. ARM processor has 16 registers including sp (r13)
+ * and pc (r15), so only 14 registers can be used without limitations. In
+ * general, we let r0 to r7 hold the coefficients; r10 and r11 hold four
+ * 16-bit constants; r12 and r14 hold two of the four arguments; and r8 holds
+ * an intermediate value. In the second pass, r9 is the loop counter. In the
+ * first pass, r8 to r11 are used to hold quantization values, so the loop
+ * counter is held by sp. Yes, the stack pointer. Since it must be aligned
+ * to 4-byte boundary all the time, we align it to 32-byte boundary and use
+ * bit 3 to bit 5. As a result, we actually use 14.1 registers. :-)
+ *
+ * Second, we rearrange quantization values to access them sequentially. The
+ * table is first transposed, and the new columns are placed in the order of
+ * 7, 5, 1, 3, 0, 2, 4, 6. Thus we can use LDMDB to load four values at a
+ * time. Rearranging coefficients also helps, but that requires changing a
+ * dozen files, which does not seem worth it. In addition, we choose to scale
+ * up quantization values by 13 bits, so the coefficients are scaled up by
+ * 16 bits after both passes. Then we can pack and saturate them two at a
+ * time using PKHTB and USAT16 instructions.
+ *
+ * Third, we reorder the instructions to avoid bubbles in the pipeline. This
+ * is done by hand according to the cycle timings and the interlock behavior
+ * described in the technical reference manual of ARM1136JF-S. We also take
+ * advantage of dual issue processors by interleaving instructions with
+ * dependencies. It has been benchmarked on four devices and all the results
+ * showed distinguishable improvements. Note that PLD instructions actually
+ * slow things down, so they are removed at the last minute. In the future,
+ * this might be further improved using a system profiler.
+ */
+
+#ifdef __arm__
+#include <machine/cpu-features.h>
+#endif
+
+#if __ARM_ARCH__ >= 6
+
+// void armv6_idct(short *coefs, int *quans, unsigned char **rows, int col)
+    .arm
+    .text
+    .align
+    .global armv6_idct
+    .func   armv6_idct
+
+armv6_idct:
+    // Save r4-r12 and lr (r14); r0-r3, sp (r13) and pc (r15) are not pushed.
+    stmdb   sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, r14}
+
+    // r12 = quans, r14 = coefs.
+    sub     r4, sp, #236
+    bic     sp, r4, #31
+    add     r5, sp, #224
+    add     r12, r1, #256
+    stm     r5, {r2, r3, r4}
+    add     r14, r0, #16
+
+pass1_head:
+    // Load quantization values. (q[0, 2, 4, 6])
+    ldmdb   r12!, {r8, r9, r10, r11}
+
+    // Load coefficients. (c[4, 1, 2, 3, 0, 5, 6, 7])
+    ldrsh   r4, [r14, #-2] !
+    ldrsh   r1, [r14, #16]
+    ldrsh   r2, [r14, #32]
+    ldrsh   r3, [r14, #48]
+    ldrsh   r0, [r14, #64]
+    ldrsh   r5, [r14, #80]
+    ldrsh   r6, [r14, #96]
+    ldrsh   r7, [r14, #112]
+
+    // r4 = q[0] * c[0];
+    mul     r4, r8, r4
+
+    // Check if ACs are all zero.
+    cmp     r0, #0
+    orreqs  r8, r1, r2
+    orreqs  r8, r3, r5
+    orreqs  r8, r6, r7
+    beq     pass1_zero
+
+    // Step 1: Dequantizations.
+
+    // r2 = q[2] * c[2];
+    // r0 = q[4] * c[4] + r4;
+    // r6 = q[6] * c[6] + r2;
+    mul     r2, r9, r2
+    mla     r0, r10, r0, r4
+    mla     r6, r11, r6, r2
+
+    // Load quantization values. (q[7, 5, 1, 3])
+    ldmdb   r12!, {r8, r9, r10, r11}
+
+    // r4 = r4 * 2 - r0 = -(r0 - r4 * 2);
+    // r2 = r2 * 2 - r6 = -(r6 - r2 * 2);
+    rsb     r4, r0, r4, lsl #1
+    rsb     r2, r6, r2, lsl #1
+
+    // r7 = q[7] * c[7];
+    // r5 = q[5] * c[5];
+    // r1 = q[1] * c[1] + r7;
+    // r3 = q[3] * c[3] + r5;
+    mul     r7, r8, r7
+    mul     r5, r9, r5
+    mla     r1, r10, r1, r7
+    mla     r3, r11, r3, r5
+
+    // Load constants.
+    ldrd    r10, constants
+
+    // Step 2: Rotations and Butterflies.
+
+    // r7 = r1 - r7 * 2;
+    // r1 = r1 - r3;
+    // r5 = r5 * 2 - r3 = -(r3 - r5 * 2);
+    // r3 = r1 + r3 * 2;
+    // r8 = r5 + r7;
+    sub     r7, r1, r7, lsl #1
+    sub     r1, r1, r3
+    rsb     r5, r3, r5, lsl #1
+    add     r3, r1, r3, lsl #1
+    add     r8, r5, r7
+
+    // r2 = r2 * 1.41421 = r2 * 27146 / 65536 + r2;
+    // r8 = r8 * 1.84776 / 8 = r8 * 15137 / 65536;
+    // r1 = r1 * 1.41421 = r1 * 27146 / 65536 + r1;
+    smlawt  r2, r2, r10, r2
+    smulwb  r8, r8, r10
+    smlawt  r1, r1, r10, r1
+
+    // r0 = r0 + r6;
+    // r2 = r2 - r6;
+    // r6 = r0 - r6 * 2;
+    add     r0, r0, r6
+    sub     r2, r2, r6
+    sub     r6, r0, r6, lsl #1
+
+    // r5 = r5 * -2.61313 / 8 + r8 = r5 * -21407 / 65536 + r8;
+    // r8 = r7 * -1.08239 / 8 + r8 = r7 * -8867 / 65536 + r8;
+    smlawt  r5, r5, r11, r8
+    smlawb  r8, r7, r11, r8
+
+    // r4 = r4 + r2;
+    // r0 = r0 + r3;
+    // r2 = r4 - r2 * 2;
+    add     r4, r4, r2
+    add     r0, r0, r3
+    sub     r2, r4, r2, lsl #1
+
+    // r7 = r5 * 8 - r3 = -(r3 - r5 * 8);
+    // r3 = r0 - r3 * 2;
+    // r1 = r1 - r7;
+    // r4 = r4 + r7;
+    // r5 = r8 * 8 - r1 = -(r1 - r8 * 8);
+    // r7 = r4 - r7 * 2;
+    rsb     r7, r3, r5, lsl #3
+    sub     r3, r0, r3, lsl #1
+    sub     r1, r1, r7
+    add     r4, r4, r7
+    rsb     r5, r1, r8, lsl #3
+    sub     r7, r4, r7, lsl #1
+
+    // r2 = r2 + r1;
+    // r6 = r6 + r5;
+    // r1 = r2 - r1 * 2;
+    // r5 = r6 - r5 * 2;
+    add     r2, r2, r1
+    add     r6, r6, r5
+    sub     r1, r2, r1, lsl #1
+    sub     r5, r6, r5, lsl #1
+
+    // Step 3: Reorder and Save.
+
+    str     r0, [sp, #-4] !
+    str     r4, [sp, #32]
+    str     r2, [sp, #64]
+    str     r6, [sp, #96]
+    str     r5, [sp, #128]
+    str     r1, [sp, #160]
+    str     r7, [sp, #192]
+    str     r3, [sp, #224]
+    b       pass1_tail
+
+    // Precomputed 16-bit constants: 27146, 15137, -21407, -8867.
+    // Put them in the middle since LDRD only accepts offsets from -255 to 255.
+    .align  3
+constants:
+    .word   0x6a0a3b21
+    .word   0xac61dd5d
+
+pass1_zero:
+    str     r4, [sp, #-4] !
+    str     r4, [sp, #32]
+    str     r4, [sp, #64]
+    str     r4, [sp, #96]
+    str     r4, [sp, #128]
+    str     r4, [sp, #160]
+    str     r4, [sp, #192]
+    str     r4, [sp, #224]
+    sub     r12, r12, #16
+
+pass1_tail:
+    ands    r9, sp, #31
+    bne     pass1_head
+
+    // r12 = rows, r14 = col.
+    ldr     r12, [sp, #256]
+    ldr     r14, [sp, #260]
+
+    // Load constants.
+    ldrd    r10, constants
+
+pass2_head:
+    // Load coefficients. (c[0, 1, 2, 3, 4, 5, 6, 7])
+    ldmia   sp!, {r0, r1, r2, r3, r4, r5, r6, r7}
+
+    // r0 = r0 + 0x00808000;
+    add     r0, r0, #0x00800000
+    add     r0, r0, #0x00008000
+
+    // Step 1: Analog to the first pass.
+
+    // r0 = r0 + r4;
+    // r6 = r6 + r2;
+    add     r0, r0, r4
+    add     r6, r6, r2
+
+    // r4 = r0 - r4 * 2;
+    // r2 = r2 * 2 - r6 = -(r6 - r2 * 2);
+    sub     r4, r0, r4, lsl #1
+    rsb     r2, r6, r2, lsl #1
+
+    // r1 = r1 + r7;
+    // r3 = r3 + r5;
+    add     r1, r1, r7
+    add     r3, r3, r5
+
+    // Step 2: Rotations and Butterflies.
+
+    // r7 = r1 - r7 * 2;
+    // r1 = r1 - r3;
+    // r5 = r5 * 2 - r3 = -(r3 - r5 * 2);
+    // r3 = r1 + r3 * 2;
+    // r8 = r5 + r7;
+    sub     r7, r1, r7, lsl #1
+    sub     r1, r1, r3
+    rsb     r5, r3, r5, lsl #1
+    add     r3, r1, r3, lsl #1
+    add     r8, r5, r7
+
+    // r2 = r2 * 1.41421 = r2 * 27146 / 65536 + r2;
+    // r8 = r8 * 1.84776 / 8 = r8 * 15137 / 65536;
+    // r1 = r1 * 1.41421 = r1 * 27146 / 65536 + r1;
+    smlawt  r2, r2, r10, r2
+    smulwb  r8, r8, r10
+    smlawt  r1, r1, r10, r1
+
+    // r0 = r0 + r6;
+    // r2 = r2 - r6;
+    // r6 = r0 - r6 * 2;
+    add     r0, r0, r6
+    sub     r2, r2, r6
+    sub     r6, r0, r6, lsl #1
+
+    // r5 = r5 * -2.61313 / 8 + r8 = r5 * -21407 / 65536 + r8;
+    // r8 = r7 * -1.08239 / 8 + r8 = r7 * -8867 / 65536 + r8;
+    smlawt  r5, r5, r11, r8
+    smlawb  r8, r7, r11, r8
+
+    // r4 = r4 + r2;
+    // r0 = r0 + r3;
+    // r2 = r4 - r2 * 2;
+    add     r4, r4, r2
+    add     r0, r0, r3
+    sub     r2, r4, r2, lsl #1
+
+    // r7 = r5 * 8 - r3 = -(r3 - r5 * 8);
+    // r3 = r0 - r3 * 2;
+    // r1 = r1 - r7;
+    // r4 = r4 + r7;
+    // r5 = r8 * 8 - r1 = -(r1 - r8 * 8);
+    // r7 = r4 - r7 * 2;
+    rsb     r7, r3, r5, lsl #3
+    sub     r3, r0, r3, lsl #1
+    sub     r1, r1, r7
+    add     r4, r4, r7
+    rsb     r5, r1, r8, lsl #3
+    sub     r7, r4, r7, lsl #1
+
+    // r2 = r2 + r1;
+    // r6 = r6 + r5;
+    // r1 = r2 - r1 * 2;
+    // r5 = r6 - r5 * 2;
+    add     r2, r2, r1
+    add     r6, r6, r5
+    sub     r1, r2, r1, lsl #1
+    sub     r5, r6, r5, lsl #1
+
+    // Step 3: Reorder and Save.
+
+    // Load output pointer.
+    ldr     r8, [r12], #4
+
+    // For little endian: r6, r2, r4, r0, r3, r7, r1, r5.
+    pkhtb   r6, r6, r4, asr #16
+    pkhtb   r2, r2, r0, asr #16
+    pkhtb   r3, r3, r1, asr #16
+    pkhtb   r7, r7, r5, asr #16
+    usat16  r6, #8, r6
+    usat16  r2, #8, r2
+    usat16  r3, #8, r3
+    usat16  r7, #8, r7
+    orr     r0, r2, r6, lsl #8
+    orr     r1, r7, r3, lsl #8
+
+#ifdef __ARMEB__
+    // Reverse bytes for big endian.
+    rev     r0, r0
+    rev     r1, r1
+#endif
+
+    // Use STR instead of STRD to support unaligned access.
+    str     r0, [r8, r14] !
+    str     r1, [r8, #4]
+
+pass2_tail:
+    adds    r9, r9, #0x10000000
+    bpl     pass2_head
+
+    ldr     sp, [sp, #8]
+    add     sp, sp, #236
+
+    ldmia   sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, r14}
+    bx      lr
+    .endfunc
+
+#endif
diff --git a/jdcoefct.c b/jdcoefct.c
index 9e8040b..e6e9506 100644
--- a/jdcoefct.c
+++ b/jdcoefct.c
@@ -277,15 +277,24 @@
   unsigned int MCUs_per_row = cinfo->MCUs_per_row;
 #ifdef ANDROID_TILE_BASED_DECODE
   if (cinfo->tile_decode) {
+    int iMCU_width_To_MCU_width;
+    if (cinfo->comps_in_scan > 1) {
+      // Interleaved
+      iMCU_width_To_MCU_width = 1;
+    } else {
+      // Non-interleaved
+      iMCU_width_To_MCU_width = cinfo->cur_comp_info[0]->h_samp_factor;
+    }
     MCUs_per_row = jmin(MCUs_per_row,
         (cinfo->coef->column_right_boundary - cinfo->coef->column_left_boundary)
-        * cinfo->entropy->index->MCU_sample_size * cinfo->max_h_samp_factor);
+        * cinfo->entropy->index->MCU_sample_size * iMCU_width_To_MCU_width);
   }
 #endif
 
   /* Loop to process one whole iMCU row */
   for (yoffset = coef->MCU_vert_offset; yoffset < coef->MCU_rows_per_iMCU_row;
        yoffset++) {
+   // configure huffman decoder
 #ifdef ANDROID_TILE_BASED_DECODE
     if (cinfo->tile_decode) {
       huffman_scan_header scan_header =
@@ -296,8 +305,10 @@
               [col_offset + yoffset * scan_header.MCUs_per_row]);
     }
 #endif
+
+    // zero all blocks
     for (MCU_col_num = coef->MCU_ctr; MCU_col_num < MCUs_per_row;
-	 MCU_col_num++) {
+          MCU_col_num++) {
       /* Construct list of pointers to DCT blocks belonging to this MCU */
       blkn = 0;			/* index of current DCT block within MCU */
       for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
@@ -309,7 +320,7 @@
             coef->MCU_buffer[blkn++] = buffer_ptr++;
 #ifdef ANDROID_TILE_BASED_DECODE
             if (cinfo->tile_decode && cinfo->input_scan_number == 0) {
-              // need to do pre-zero ourself.
+              // need to do pre-zero ourselves.
               jzero_far((void FAR *) coef->MCU_buffer[blkn-1],
                         (size_t) (SIZEOF(JBLOCK)));
             }
@@ -317,12 +328,14 @@
           }
         }
       }
+
+
       /* Try to fetch the MCU. */
       if (! (*cinfo->entropy->decode_mcu) (cinfo, coef->MCU_buffer)) {
-	/* Suspension forced; update state counters and exit */
-	coef->MCU_vert_offset = yoffset;
-	coef->MCU_ctr = MCU_col_num;
-	return JPEG_SUSPENDED;
+        /* Suspension forced; update state counters and exit */
+        coef->MCU_vert_offset = yoffset;
+        coef->MCU_ctr = MCU_col_num;
+        return JPEG_SUSPENDED;
       }
     }
     /* Completed an MCU row, but perhaps not an iMCU row */
@@ -584,14 +597,14 @@
     int start_block = 0;
 #if ANDROID_TILE_BASED_DECODE
     if (cinfo->tile_decode) {
+      // width_in_blocks for a component depends on its h_samp_factor.
       width_in_blocks = jmin(width_in_blocks,
         (cinfo->coef->MCU_column_right_boundary -
          cinfo->coef->MCU_column_left_boundary) *
-         cinfo->max_h_samp_factor /
          compptr->h_samp_factor);
       start_block = coef->pub.MCU_columns_to_skip *
-        cinfo->max_h_samp_factor / compptr->h_samp_factor;
-    }
+         compptr->h_samp_factor;
+   }
 #endif
     /* Loop over all DCT blocks to be processed. */
     for (block_row = 0; block_row < block_rows; block_row++) {
diff --git a/jddctmgr.c b/jddctmgr.c
index bbf8d0e..74a96db 100644
--- a/jddctmgr.c
+++ b/jddctmgr.c
@@ -20,6 +20,35 @@
 #include "jpeglib.h"
 #include "jdct.h"		/* Private declarations for DCT subsystem */
 
+#ifdef ANDROID_ARMV6_IDCT
+  #undef ANDROID_ARMV6_IDCT
+  #ifdef __arm__
+    #include <machine/cpu-features.h>
+    #if __ARM_ARCH__ >= 6
+      #define ANDROID_ARMV6_IDCT
+    #else
+      #warning "ANDROID_ARMV6_IDCT is disabled"
+    #endif
+  #endif
+#endif
+
+#ifdef ANDROID_ARMV6_IDCT
+
+/* Intentionally declare the prototype with arguments of primitive types instead
+ * of type-defined ones. This will at least generate some warnings if jmorecfg.h
+ * is changed and becomes incompatible with the assembly code.
+ */
+extern void armv6_idct(short *coefs, int *quans, unsigned char **rows, int col);
+
+void jpeg_idct_armv6 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+		 JCOEFPTR coef_block,
+		 JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  IFAST_MULT_TYPE *dct_table = (IFAST_MULT_TYPE *)compptr->dct_table;
+  armv6_idct(coef_block, dct_table, output_buf, output_col);
+}
+
+#endif
 
 /*
  * The decompressor input side (jdinput.c) saves away the appropriate
@@ -115,6 +144,13 @@
 #endif
     case DCTSIZE:
       switch (cinfo->dct_method) {
+#ifdef ANDROID_ARMV6_IDCT
+      case JDCT_ISLOW:
+      case JDCT_IFAST:
+	method_ptr = jpeg_idct_armv6;
+	method = JDCT_IFAST;
+	break;
+#else /* ANDROID_ARMV6_IDCT */
 #ifdef DCT_ISLOW_SUPPORTED
       case JDCT_ISLOW:
 	method_ptr = jpeg_idct_islow;
@@ -127,6 +163,7 @@
 	method = JDCT_IFAST;
 	break;
 #endif
+#endif /* ANDROID_ARMV6_IDCT */
 #ifdef DCT_FLOAT_SUPPORTED
       case JDCT_FLOAT:
 	method_ptr = jpeg_idct_float;
@@ -181,6 +218,27 @@
 	 * IFAST_SCALE_BITS.
 	 */
 	IFAST_MULT_TYPE * ifmtbl = (IFAST_MULT_TYPE *) compptr->dct_table;
+#ifdef ANDROID_ARMV6_IDCT
+	/* Precomputed values scaled up by 15 bits. */
+	static const unsigned short scales[DCTSIZE2] = {
+	  32768, 45451, 42813, 38531, 32768, 25746, 17734,  9041,
+	  45451, 63042, 59384, 53444, 45451, 35710, 24598, 12540,
+	  42813, 59384, 55938, 50343, 42813, 33638, 23170, 11812,
+	  38531, 53444, 50343, 45308, 38531, 30274, 20853, 10631,
+	  32768, 45451, 42813, 38531, 32768, 25746, 17734,  9041,
+	  25746, 35710, 33638, 30274, 25746, 20228, 13933,  7103,
+	  17734, 24598, 23170, 20853, 17734, 13933,  9598,  4893,
+	   9041, 12540, 11812, 10631,  9041,  7103,  4893,  2494,
+	};
+	/* Inverse map of [7, 5, 1, 3, 0, 2, 4, 6]. */
+	static const char orders[DCTSIZE] = {4, 2, 5, 3, 6, 1, 7, 0};
+	/* Reorder the columns after transposing. */
+	for (i = 0; i < DCTSIZE2; ++i) {
+	  int j = ((i & 7) << 3) + orders[i >> 3];
+	  ifmtbl[j] = (qtbl->quantval[i] * scales[i] + 2) >> 2;
+	}
+#else /* ANDROID_ARMV6_IDCT */
+
 #define CONST_BITS 14
 	static const INT16 aanscales[DCTSIZE2] = {
 	  /* precomputed values scaled up by 14 bits */
@@ -201,6 +259,7 @@
 				  (INT32) aanscales[i]),
 		    CONST_BITS-IFAST_SCALE_BITS);
 	}
+#endif /* ANDROID_ARMV6_IDCT */
       }
       break;
 #endif
diff --git a/jdhuff.c b/jdhuff.c
index 0d704a5..bc5d4fd 100644
--- a/jdhuff.c
+++ b/jdhuff.c
@@ -295,20 +295,10 @@
 		      int nbits)
 /* Load up the bit buffer to a depth of at least nbits */
 {
-  j_decompress_ptr cinfo = state->cinfo;
-  if (cinfo->tile_decode &&
-      cinfo->restart_interval == 0 &&
-      cinfo->unread_marker >= 0xd0 &&
-      cinfo->unread_marker <= 0xd7 &&
-      nbits > bits_left
-      ) {
-      // Skip the restart marker.
-    cinfo->marker->next_restart_num = cinfo->unread_marker - 0xd0;
-    process_restart(cinfo);
-  }
   /* Copy heavily used state fields into locals (hopefully registers) */
   register const JOCTET * next_input_byte = state->next_input_byte;
   register size_t bytes_in_buffer = state->bytes_in_buffer;
+  j_decompress_ptr cinfo = state->cinfo;
 
   /* Attempt to load at least MIN_GET_BITS bits into get_buffer. */
   /* (It is assumed that no request will be for more than that many bits.) */
@@ -509,24 +499,20 @@
 }
 
 /*
- * Configure the Huffman decoder reader position and bit buffer.
+ * Save the current Huffman decode position and the DC coefficients
+ * for each component into bitstream_offset and dc_info[], respectively.
  */
-GLOBAL(void)
-jpeg_configure_huffman_decoder(j_decompress_ptr cinfo,
-        huffman_offset_data offset)
+METHODDEF(void)
+get_huffman_decoder_configuration(j_decompress_ptr cinfo,
+        huffman_offset_data *offset)
 {
-  unsigned int bitstream_offset = offset.bitstream_offset;
-  int blkn, i;
-
-  cinfo->restart_interval = 0;
-  cinfo->unread_marker = 0;
-
-  unsigned int byte_offset = bitstream_offset >> LOG_TWO_BIT_BUF_SIZE;
-  unsigned int bit_in_bit_buffer =
-      bitstream_offset & ((1 << LOG_TWO_BIT_BUF_SIZE) - 1);
-
-  jset_input_stream_position_bit(cinfo, byte_offset,
-          bit_in_bit_buffer, offset.get_buffer);
+  huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+  short int *dc_info = offset->prev_dc;
+  int i;
+  jpeg_get_huffman_decoder_configuration(cinfo, offset);
+  for (i = 0; i < cinfo->comps_in_scan; i++) {
+    dc_info[i] = entropy->saved.last_dc_val[i];
+  }
 }
 
 /*
@@ -546,6 +532,10 @@
 	return;
   }
 
+  // Save restarts_to_go and next_restart_num
+  offset->restarts_to_go = (unsigned short) entropy->restarts_to_go;
+  offset->next_restart_num = cinfo->marker->next_restart_num;
+
   offset->bitstream_offset =
       (jget_input_stream_position(cinfo) << LOG_TWO_BIT_BUF_SIZE)
       + entropy->bitstate.bits_left;
@@ -570,20 +560,28 @@
 }
 
 /*
- * Save the current Huffman deocde position and the DC coefficients
- * for each component into bitstream_offset and dc_info[], respectively.
+ * Configure the Huffman decoder reader position and bit buffer.
  */
-METHODDEF(void)
-get_huffman_decoder_configuration(j_decompress_ptr cinfo,
-        huffman_offset_data *offset)
+GLOBAL(void)
+jpeg_configure_huffman_decoder(j_decompress_ptr cinfo,
+        huffman_offset_data offset)
 {
   huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
-  short int *dc_info = offset->prev_dc;
-  int i;
-  jpeg_get_huffman_decoder_configuration(cinfo, offset);
-  for (i = 0; i < cinfo->comps_in_scan; i++) {
-    dc_info[i] = entropy->saved.last_dc_val[i];
-  }
+
+  // Restore restarts_to_go and next_restart_num
+  cinfo->unread_marker = 0;
+  entropy->restarts_to_go = offset.restarts_to_go;
+  cinfo->marker->next_restart_num = offset.next_restart_num;
+
+  unsigned int bitstream_offset = offset.bitstream_offset;
+  int blkn, i;
+
+  unsigned int byte_offset = bitstream_offset >> LOG_TWO_BIT_BUF_SIZE;
+  unsigned int bit_in_bit_buffer =
+      bitstream_offset & ((1 << LOG_TWO_BIT_BUF_SIZE) - 1);
+
+  jset_input_stream_position_bit(cinfo, byte_offset,
+          bit_in_bit_buffer, offset.get_buffer);
 }
 
 /*
diff --git a/jdphuff.c b/jdphuff.c
index a58cdd2..922017e 100644
--- a/jdphuff.c
+++ b/jdphuff.c
@@ -632,21 +632,6 @@
 }
 
 /*
- * Configure the Huffman decoder to decode the image
- * starting from (iMCU_row_offset, iMCU_col_offset).
- */
-METHODDEF(void)
-configure_huffman_decoder(j_decompress_ptr cinfo, huffman_offset_data offset)
-{
-  int i;
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
-  jpeg_configure_huffman_decoder(cinfo, offset);
-  entropy->saved.EOBRUN = offset.EOBRUN;
-  for (i = 0; i < cinfo->comps_in_scan; i++)
-    entropy->saved.last_dc_val[i] = offset.prev_dc[i];
-}
-
-/*
  * Save the current Huffman deocde position and the DC coefficients
  * for each component into bitstream_offset and dc_info[], respectively.
  */
@@ -656,16 +641,103 @@
 {
   int i;
   phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
-  jpeg_get_huffman_decoder_configuration(cinfo, offset);
+  jpeg_get_huffman_decoder_configuration_progressive(cinfo, offset);
   offset->EOBRUN = entropy->saved.EOBRUN;
   for (i = 0; i < cinfo->comps_in_scan; i++)
     offset->prev_dc[i] = entropy->saved.last_dc_val[i];
 }
 
+
+/*
+ * Save the current Huffman decoder position and the bit buffer
+ * into bitstream_offset and get_buffer, respectively.
+ */
+GLOBAL(void)
+jpeg_get_huffman_decoder_configuration_progressive(j_decompress_ptr cinfo,
+        huffman_offset_data *offset)
+{
+  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
+
+  if (cinfo->restart_interval) {
+    // We are at the end of a data segment
+    if (entropy->restarts_to_go == 0)
+      if (! process_restart(cinfo))
+	return;
+  }
+
+  // Save restarts_to_go and next_restart_num.
+  offset->restarts_to_go = (unsigned short) entropy->restarts_to_go;
+  offset->next_restart_num = cinfo->marker->next_restart_num;
+
+  offset->bitstream_offset =
+      (jget_input_stream_position(cinfo) << LOG_TWO_BIT_BUF_SIZE)
+      + entropy->bitstate.bits_left;
+
+  offset->get_buffer = entropy->bitstate.get_buffer;
+}
+
+
+/*
+ * Configure the Huffman decoder to decode the image
+ * starting from (iMCU_row_offset, iMCU_col_offset).
+ */
+METHODDEF(void)
+configure_huffman_decoder(j_decompress_ptr cinfo, huffman_offset_data offset)
+{
+  int i;
+  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
+  jpeg_configure_huffman_decoder_progressive(cinfo, offset);
+  entropy->saved.EOBRUN = offset.EOBRUN;
+  for (i = 0; i < cinfo->comps_in_scan; i++)
+    entropy->saved.last_dc_val[i] = offset.prev_dc[i];
+}
+
+/*
+ * Configure the Huffman decoder reader position and bit buffer.
+ */
+GLOBAL(void)
+jpeg_configure_huffman_decoder_progressive(j_decompress_ptr cinfo,
+        huffman_offset_data offset)
+{
+	phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
+
+  // Restore restarts_to_go and next_restart_num
+  cinfo->unread_marker = 0;
+  entropy->restarts_to_go = offset.restarts_to_go;
+  cinfo->marker->next_restart_num = offset.next_restart_num;
+
+  unsigned int bitstream_offset = offset.bitstream_offset;
+  int blkn, i;
+
+  unsigned int byte_offset = bitstream_offset >> LOG_TWO_BIT_BUF_SIZE;
+  unsigned int bit_in_bit_buffer =
+      bitstream_offset & ((1 << LOG_TWO_BIT_BUF_SIZE) - 1);
+
+  jset_input_stream_position_bit(cinfo, byte_offset,
+          bit_in_bit_buffer, offset.get_buffer);
+}
+
+GLOBAL(void)
+jpeg_configure_huffman_index_scan(j_decompress_ptr cinfo,
+        huffman_index *index, int scan_no, int offset)
+{
+  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
+  if (scan_no >= index->scan_count) {
+    index->scan = realloc(index->scan,
+                    (scan_no + 1) * sizeof(huffman_scan_header));
+    index->mem_used += (scan_no - index->scan_count + 1)
+      * (sizeof(huffman_scan_header) + cinfo->total_iMCU_rows
+      * sizeof(huffman_offset_data*));
+    index->scan_count = scan_no + 1;
+  }
+  index->scan[scan_no].offset = (huffman_offset_data**)malloc(
+          cinfo->total_iMCU_rows * sizeof(huffman_offset_data*));
+  index->scan[scan_no].bitstream_offset = offset;
+}
+
 /*
  * Module initialization routine for progressive Huffman entropy decoding.
  */
-
 GLOBAL(void)
 jinit_phuff_decoder (j_decompress_ptr cinfo)
 {
@@ -697,22 +769,4 @@
       *coef_bit_ptr++ = -1;
 }
 
-GLOBAL(void)
-jpeg_configure_huffman_index_scan(j_decompress_ptr cinfo,
-        huffman_index *index, int scan_no, int offset)
-{
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
-  if (scan_no >= index->scan_count) {
-    index->scan = realloc(index->scan,
-                    (scan_no + 1) * sizeof(huffman_scan_header));
-    index->mem_used += (scan_no - index->scan_count + 1)
-      * (sizeof(huffman_scan_header) + cinfo->total_iMCU_rows
-      * sizeof(huffman_offset_data*));
-    index->scan_count = scan_no + 1;
-  }
-  index->scan[scan_no].offset = (huffman_offset_data**)malloc(
-          cinfo->total_iMCU_rows * sizeof(huffman_offset_data*));
-  index->scan[scan_no].bitstream_offset = offset;
-}
-
 #endif /* D_PROGRESSIVE_SUPPORTED */
diff --git a/jidctfst.S b/jidctfst.S
deleted file mode 100644
index 34e1c24..0000000
--- a/jidctfst.S
+++ /dev/null
@@ -1,476 +0,0 @@
-/*
- * Copyright (C) 2008 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <machine/cpu-features.h>
-
-    .text
-    .align
-
-    .global jpeg_idct_ifast
-    .func   jpeg_idct_ifast
-
-// NOTE: sb=r9, fp=r11 ip=r12, sp=r13, lr=r14, pc=r15
-
-// jpeg_idct_ifast (j_decompress_ptr       cinfo,
-//                 jpeg_component_info *   compptr,
-//                 short*                  coef_block,
-//                 unsigned char*          output_buf,
-//                 int                     output_col)
-
-#define  local_TMP0123       sp
-#define  local_TMP0          [sp, #0]
-#define  local_TMP1          [sp, #4]
-#define  local_TMP2          [sp, #8]
-#define  local_TMP3          [sp, #12]
-#define  local_RANGE_TABLE   [sp, #16]
-#define  local_OUTPUT_COL    [sp, #20]
-#define  local_OUTPUT_BUF    [sp, #24]
-#define  local_UNUSED        [sp, #28]
-#define  off_WORKSPACE       32
-#define  local_WORKSPACE     [sp, #offWORKSPACE]
-#define  local_SIZE          (off_WORKSPACE + 8*8*4)
-
-#define  off_DECOMPRESS_range_limit_base  324
-#define  off_COMPINFO_quanttable          80
-
-#define  DCTSIZE   8
-#define  VY(x)   ((x)*DCTSIZE*2)
-#define  QY(x)   ((x)*DCTSIZE*4)
-
-#define  VX(x)   ((x)*2)
-#define  QX(x)   ((x)*4)
-
-#define  FIX_1_414213562    #362
-#define  FIX_1_082392200    #277
-#define  FIX_1_847759065    #473
-#define  FIX_2_613125930    #669
-
-#define  RANGE_MASK   1023
-
-
-
-jpeg_idct_ifast:
-    PLD     (r2, #0)
-    stmdb   sp!, {r4,r5, r6,r7, r8,r9, r10,r11, r12,lr}
-    ldr     r4, [sp, #4*10]
-    sub     sp, #local_SIZE
-
-    ldr     r10,[r1, #off_COMPINFO_quanttable]         // r10 = quanttable
-    str     r4, local_OUTPUT_COL
-    str     r3, local_OUTPUT_BUF
-    ldr     r5, [r0, #off_DECOMPRESS_range_limit_base]
-    add     r5, r5, #128
-    str     r5, local_RANGE_TABLE
-    mov     fp, r2                                      // fp = coef_block
-    add     ip, sp, #off_WORKSPACE
-
-VLoopTail:
-    ldrsh    r0, [fp, #VY(0)]
-    ldrsh    r1, [fp, #VY(1)]
-    ldrsh    r2, [fp, #VY(2)]
-    ldrsh    r3, [fp, #VY(3)]
-    ldrsh    r4, [fp, #VY(4)]
-    ldrsh    r5, [fp, #VY(5)]
-    ldrsh    r6, [fp, #VY(6)]
-    ldrsh    r7, [fp, #VY(7)]
-
-    cmp      r1, #0
-    orreqs   r8, r2, r3
-    orreqs   r8, r4, r5
-    orreqs   r8, r6, r7
-    beq      VLoopHeadZero
-
-VLoopHead:
-    // tmp0 = DEQUANTIZE(in[DCTSIZE*0], quant[DCTSIZE*0]   (r0)
-    // tmp2 = DEQUANTIZE(in[DCTSIZE*4], quant[DCTSIZE*4]   (r4)
-    // tmp1 = DEQUANTIZE(in[DCTSIZE*2], quant[DCTSIZE*2]   (r2)
-    // tmp3 = DEQUANTIZE(in[DCTSIZE*6], quant[DCTSIZE*6]   (r6)
-    // tmp10 = tmp0 + tmp2   (r0)
-    // tmp11 = tmp0 - tmp2   (r4)
-
-    ldr      r9, [r10, #QY(4)]
-    ldr      r8, [r10, #QY(0)]
-#if __ARM_HAVE_HALFWORD_MULTIPLY
-    smulbb   r4, r9, r4
-    smlabb   r0, r8, r0, r4
-#else
-    mul      r4, r9, r4
-    mul      r0, r8, r0
-    add      r0, r4
-#endif
-    ldr      r9, [r10, #QY(6)]
-    ldr      r8, [r10, #QY(2)]
-    sub      r4, r0, r4, lsl #1
-#if __ARM_HAVE_HALFWORD_MULTIPLY
-    smulbb   r6, r9, r6
-    smlabb   r2, r8, r2, r6
-#else
-    mul      r6, r9, r6
-    mul      r2, r8, r2
-    add      r2, r6
-#endif
-
-    // tmp13 = tmp1 + tmp3                                       (r2)
-    // tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13    (r6)
-    // FIX_1_4142... = 362 = 45*8 + 2
-    sub      r6, r2, r6, lsl #1
-    mov      r8, #360
-    add      r8, r8, #2
-    mul      r9, r6, r8
-
-    // tmp0 = tmp10 + tmp13;   (r0)
-    // tmp3 = tmp10 - tmp13;   (r8)
-    // tmp1 = tmp11 + tmp12;   (r4)
-    // tmp2 = tmp11 - tmp12;   (r6)
-    add     r0, r0, r2
-    rsb     r6, r2, r9, asr #8
-    sub     r8, r0, r2, lsl #1
-    add     r4, r4, r6
-    sub     r6, r4, r6, lsl #1
-
-    stmia   local_TMP0123, {r0, r4, r6, r8}
-
-    // NOTE: be sure to not user r0,r4,r6,r8 soon after stm above
-
-    // odd part
-    // tmp4 = DEQUANTIZE( in[DCTSIZE*1], quant[DCTSIZE*1] )   (r1)
-    // tmp6 = DEQUANTIZE( in[DCTSIZE*5], quant[DCTSIZE*5] )   (r5)
-    // tmp5 = DEQUANTIZE( in[DCTSIZE*3], quant[DCTSIZE*3] )   (r3)
-    // tmp7 = DEQUANTIZE( in[DCTSIZE*7], quant[DCTSIZE*7] )   (r7)
-    // z13 = tmp6 + tmp5;  (r0)
-    // z10 = tmp6 - tmp5;  (r2)
-    // z11 = tmp4 + tmp7;  (r4)
-    // z12 = tmp4 - tmp7;  (r6)
-
-    ldr     r2, [r10, #QY(1)]
-    ldr     r9, [r10, #QY(5)]
-#if __ARM_HAVE_HALFWORD_MULTIPLY
-    smulbb  r1, r2, r1
-#else
-    mul     r1, r2, r1
-#endif
-    ldr     r2, [r10, #QY(3)]
-#if __ARM_HAVE_HALFWORD_MULTIPLY
-    smulbb  r5, r9, r5
-#else
-    mul     r5, r9, r5
-#endif
-    ldr     r9, [r10, #QY(7)]
-#if __ARM_HAVE_HALFWORD_MULTIPLY
-    smlabb  r0, r2, r3, r5
-    smlabb  r4, r9, r7, r1
-#else
-    mul     r0, r2, r3
-    add     r0, r5
-    mul     r4, r9, r7
-    add     r4, r1
-#endif
-    rsb  r2, r0, r5, lsl #1
-    rsb  r6, r4, r1, lsl #1
-
-    // tmp7 = z11 + z13;                             (r7)
-    // tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); (r1)
-    // FIX_... = 360 + 2
-    add   r7, r4, r0
-    sub   r1, r4, r0
-    mov   r8, #360
-    add   r8, r8, #2
-    mul   r1, r8, r1
-
-    // z5 = MULTIPLY(z10 + z12, FIX_1_847759065);        (r8)
-    // tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5;      (r0)
-    // tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5;    (r2)
-    // FIX_1_8477... = 473 = 472 + 1
-    // FIX_1_082...  = 277 = 276 + 1
-    // FIX_2_...     = 669 = 668 + 1
-    add     r8, r2, r6
-    mov     r9, #472
-    mla     r8, r9, r8, r8
-    mov     r9, #276
-    mla     r0, r6, r9, r6
-    mov     r9, #668
-    mla     r2, r9, r2, r2
-    sub     r0, r0, r8
-    rsb     r2, r2, r8
-
-    // tmp6 = tmp12 - tmp7;  (r6)
-    // tmp5 = tmp11 - tmp6;  (r5)
-    // tmp4 = tmp10 + tmp5;  (r4)
-    rsb  r6, r7, r2, asr #8
-    rsb  r5, r6, r1, asr #8
-    add  r4, r5, r0, asr #8
-
-    ldmia local_TMP0123, {r0, r1, r2, r3}
-
-    // wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7);
-    // wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7);
-    // wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6);
-    // wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6);
-    // wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5);
-    // wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5);
-    // wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4);
-    // wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4);
-
-    add   r0, r0, r7
-    sub   r7, r0, r7, lsl #1
-    add   r1, r1, r6
-    sub   r6, r1, r6, lsl #1
-    add   r2, r2, r5
-    sub   r5, r2, r5, lsl #1
-    sub   r3, r3, r4
-    add   r4, r3, r4, lsl #1
-
-    str   r0, [ip, #QY(0)]
-    str   r1, [ip, #QY(1)]
-    str   r2, [ip, #QY(2)]
-    str   r3, [ip, #QY(3)]
-    str   r4, [ip, #QY(4)]
-    str   r5, [ip, #QY(5)]
-    str   r6, [ip, #QY(6)]
-    str   r7, [ip, #QY(7)]
-
-    // inptr++;                    /* advance pointers to next column */
-    // quantptr++;
-    // wsptr++;
-    add  fp, fp, #2
-    add  r10, r10, #4
-    add  ip, ip, #4
-    add  r0, sp, #(off_WORKSPACE + 4*8)
-    cmp  ip, r0
-    bne  VLoopTail
-
-
-
-HLoopStart:
-    // reset pointers
-    PLD     (sp, #off_WORKSPACE)
-    add     ip, sp, #off_WORKSPACE
-    ldr     r10, local_RANGE_TABLE
-
-HLoopTail:
-    // output = *output_buf++ + output_col
-    ldr      r0, local_OUTPUT_BUF
-    ldr      r1, local_OUTPUT_COL
-    ldr      r2, [r0], #4
-    str      r0, local_OUTPUT_BUF
-    add      fp, r2, r1
-
-    PLD      (ip, #32)
-    ldmia    ip!, {r0-r7}
-
-    cmp      r1, #0
-    orreqs   r8, r2, r3
-    orreqs   r8, r4, r5
-    orreqs   r8, r6, r7
-    beq      HLoopTailZero
-
-HLoopHead:
-    // tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]);    (r0)
-    // tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]);    (r4)
-    add     r0, r0, r4
-    sub     r4, r0, r4, lsl #1
-
-    // tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]);                                   (r2)
-    // tmp12 = MULTIPLY((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6], FIX_1_414213562) - tmp13;  (r6)
-    // FIX_... = 360 + 2
-    add     r2, r2, r6
-    sub     r6, r2, r6, lsl #1
-    mov     r8, #360
-    add     r8, r8, #2
-    mul     r6, r8, r6
-
-    // tmp0 = tmp10 + tmp13;   (r0)
-    // tmp3 = tmp10 - tmp13;   (r8)
-    // tmp1 = tmp11 + tmp12;   (r4)
-    // tmp2 = tmp11 - tmp12;   (r6)
-    add     r0, r0, r2
-    rsb     r6, r2, r6, asr #8
-    sub     r8, r0, r2, lsl #1
-    add     r4, r4, r6
-    sub     r6, r4, r6, lsl #1
-
-    stmia   local_TMP0123, {r0, r4, r6, r8}
-
-    // Odd part
-
-    // z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3];  (r0)
-    // z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3];  (r2)
-    // z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7];  (r4)
-    // z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7];  (r6)
-    add  r0, r5, r3
-    sub  r2, r5, r3
-    add  r4, r1, r7
-    sub  r6, r1, r7
-
-    // tmp7 = z11 + z13;                             (r7)
-    // tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); (r1)
-    // FIX_... = 360 + 2
-    add   r7, r4, r0
-    sub   r1, r4, r0
-    mov   r8, #360
-    add   r8, r8, #2
-    mul   r1, r8, r1
-
-    // z5 = MULTIPLY(z10 + z12, FIX_1_847759065);        (r8)
-    // tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5;      (r0)
-    // tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5;    (r2)
-    // FIX_1_8477... = 473 = 472 + 1
-    // FIX_1_082...  = 277 = 276 + 1
-    // FIX_2_...     = 669 = 668 + 1
-    add  r8, r2, r6
-    mov  r9, #472
-    mla  r8, r9, r8, r8
-    mov  r9, #276
-    mla  r0, r6, r9, r6
-    mov  r9, #668
-    mla  r2, r9, r2, r2
-    sub  r0, r0, r8
-    sub  r2, r8, r2
-
-    // tmp6 = tmp12 - tmp7;  (r6)
-    // tmp5 = tmp11 - tmp6;  (r5)
-    // tmp4 = tmp10 + tmp5;  (r4)
-    rsb  r6, r7, r2, asr #8
-    rsb  r5, r6, r1, asr #8
-    add  r4, r5, r0, asr #8
-
-    ldmia local_TMP0123, {r0, r1, r2, r3}
-
-    // outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3) & RANGE_MASK];
-    // outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3) & RANGE_MASK];
-    // outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3) & RANGE_MASK];
-    // outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3) & RANGE_MASK];
-    // outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3) & RANGE_MASK];
-    // outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3) & RANGE_MASK];
-    // outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3) & RANGE_MASK];
-    // outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3) & RANGE_MASK];
-
-    mov    r8, #128
-    add    r0, r0, r7
-    sub    r7, r0, r7, lsl #1
-    add    r0, r8, r0, asr #5
-    add    r7, r8, r7, asr #5
-    add    r1, r1, r6
-    sub    r6, r1, r6, lsl #1
-    add    r1, r8, r1, asr #5
-    add    r6, r8, r6, asr #5
-    add    r2, r2, r5
-    sub    r5, r2, r5, lsl #1
-    add    r2, r8, r2, asr #5
-    add    r5, r8, r5, asr #5
-    sub    r3, r3, r4
-    add    r4, r3, r4, lsl #1
-    add    r3, r8, r3, asr #5
-    add    r4, r8, r4, asr #5
-
-#if __ARM_ARCH__ >= 6
-    usat   r0, #8, r0
-    usat   r1, #8, r1
-    usat   r2, #8, r2
-    usat   r3, #8, r3
-    usat   r4, #8, r4
-    usat   r5, #8, r5
-    usat   r6, #8, r6
-    usat   r7, #8, r7
-#else
-    cmp    r0, #255
-    mvnhi  r0, r0, asr #31
-    andhi  r0, #255
-    cmp    r7, #255
-    mvnhi  r7, r7, asr #31
-    cmp    r1, #255
-    mvnhi  r1, r1, asr #31
-    andhi  r1, #255
-    cmp    r6, #255
-    mvnhi  r6, r6, asr #31
-    andhi  r6, #255
-    cmp    r2, #255
-    mvnhi  r2, r2, asr #31
-    andhi  r2, #255
-    cmp    r5, #255
-    mvnhi  r5, r5, asr #31
-    andhi  r5, #255
-    cmp    r3, #255
-    mvnhi  r3, r3, asr #31
-    cmp    r4, #255
-    mvnhi  r4, r4, asr #31
-    andhi  r4, #255
-#endif
-
-    // r3 r2 r1 r0
-    orr    r0, r0, r1, lsl #8
-    orr    r0, r0, r2, lsl #16
-    orr    r0, r0, r3, lsl #24
-
-    // r7 r6 r5 r4
-    orr    r1, r4, r5, lsl #8
-    orr    r1, r1, r6, lsl #16
-    orr    r1, r1, r7, lsl #24
-    stmia  fp, {r0, r1}
-
-    add    r0, sp, #(off_WORKSPACE + 8*8*4)
-    cmp    ip, r0
-    bne    HLoopTail
-
-Exit:
-    add    sp, sp, #local_SIZE
-    ldmia  sp!, {r4,r5, r6,r7, r8,r9, r10,r11, r12,lr}
-    bx     lr
-
-
-VLoopHeadZero:
-// ok, all AC coefficients are 0
-    ldr      r1, [r10, #QY(0)]
-    add      fp, fp, #2
-    add      r10, r10, #4
-    mul      r0, r1, r0
-    str      r0, [ip, #QY(0)]
-    str      r0, [ip, #QY(1)]
-    str      r0, [ip, #QY(2)]
-    str      r0, [ip, #QY(3)]
-    str      r0, [ip, #QY(4)]
-    str      r0, [ip, #QY(5)]
-    str      r0, [ip, #QY(6)]
-    str      r0, [ip, #QY(7)]
-    add      ip, ip, #4
-    add      r0, sp, #(off_WORKSPACE + 4*8)
-    cmp      ip, r0
-    beq      HLoopStart
-    b        VLoopTail
-
-HLoopTailZero:
-    mov      r0, r0, asr #5
-    add      r0, #128
-
-#if __ARM_ARCH__ >= 6
-    usat     r0, #8, r0
-#else
-    cmp      r0, #255
-    mvnhi    r0, r0, asr #31
-    andhi    r0, r0, #255
-#endif
-
-    orr      r0, r0, lsl #8
-    orr      r0, r0, lsl #16
-    mov      r1, r0
-    stmia    fp, {r0, r1}
-
-    add      r0, sp, #(off_WORKSPACE + 64*4)
-    cmp      ip, r0
-    beq      Exit
-    b        HLoopTail
-
-    .endfunc
diff --git a/jpeglib.h b/jpeglib.h
index 83bed4a..07e6872 100644
--- a/jpeglib.h
+++ b/jpeglib.h
@@ -649,6 +649,10 @@
 
   // save the decoder current bit buffer, entropy->bitstate.get_buffer.
   INT32 get_buffer;
+
+  // save the restart info.
+  unsigned short restarts_to_go;
+  unsigned char next_restart_num;
 } huffman_offset_data;
 
 typedef struct {
