Initial Contribution
diff --git a/Android.mk b/Android.mk
new file mode 100644
index 0000000..9cfe4f6
--- /dev/null
+++ b/Android.mk
@@ -0,0 +1,37 @@
+LOCAL_PATH:= $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_ARM_MODE := arm
+
+LOCAL_SRC_FILES := \
+	jcapimin.c jcapistd.c jccoefct.c jccolor.c jcdctmgr.c jchuff.c \
+	jcinit.c jcmainct.c jcmarker.c jcmaster.c jcomapi.c jcparam.c \
+	jcphuff.c jcprepct.c jcsample.c jctrans.c jdapimin.c jdapistd.c \
+	jdatadst.c jdatasrc.c jdcoefct.c jdcolor.c jddctmgr.c jdhuff.c \
+	jdinput.c jdmainct.c jdmarker.c jdmaster.c jdmerge.c jdphuff.c \
+	jdpostct.c jdsample.c jdtrans.c jerror.c jfdctflt.c jfdctfst.c \
+	jfdctint.c jidctflt.c jidctred.c jquant1.c \
+	jquant2.c jutils.c jmemmgr.c \
+	jmemnobs.c
+
+# the assembler is only for the ARM version, don't break the Linux sim
+ifneq ($(TARGET_ARCH),arm)
+ANDROID_JPEG_NO_ASSEMBLER := true
+endif
+
+# temp fix until we understand why this broke cnn.com
+#ANDROID_JPEG_NO_ASSEMBLER := true
+
+ifeq ($(strip $(ANDROID_JPEG_NO_ASSEMBLER)),true)
+LOCAL_SRC_FILES += jidctint.c jidctfst.c
+else
+LOCAL_SRC_FILES += jidctint.c jidctfst.S
+endif
+
+LOCAL_CFLAGS += -DAVOID_TABLES 
+LOCAL_CFLAGS += -O3 -fstrict-aliasing -fprefetch-loop-arrays
+#LOCAL_CFLAGS += -march=armv6j
+
+LOCAL_MODULE:= libjpeg
+
+include $(BUILD_STATIC_LIBRARY)
diff --git a/MODULE_LICENSE_BSD_LIKE b/MODULE_LICENSE_BSD_LIKE
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/MODULE_LICENSE_BSD_LIKE
diff --git a/NOTICE b/NOTICE
new file mode 100644
index 0000000..007625f
--- /dev/null
+++ b/NOTICE
@@ -0,0 +1,38 @@
+This software is based in part on the work of the Independent JPEG Group.
+
+----------------------
+
+The authors make NO WARRANTY or representation, either express or implied,
+with respect to this software, its quality, accuracy, merchantability, or
+fitness for a particular purpose.  This software is provided "AS IS", and you,
+its user, assume the entire risk as to its quality and accuracy.
+
+This software is copyright (C) 1991-1998, Thomas G. Lane.
+All Rights Reserved except as specified below.
+
+Permission is hereby granted to use, copy, modify, and distribute this
+software (or portions thereof) for any purpose, without fee, subject to these
+conditions:
+(1) If any part of the source code for this software is distributed, then this
+README file must be included, with this copyright and no-warranty notice
+unaltered; and any additions, deletions, or changes to the original files
+must be clearly indicated in accompanying documentation.
+(2) If only executable code is distributed, then the accompanying
+documentation must state that "this software is based in part on the work of
+the Independent JPEG Group".
+(3) Permission for use of this software is granted only if the user accepts
+full responsibility for any undesirable consequences; the authors accept
+NO LIABILITY for damages of any kind.
+
+These conditions apply to any software derived from or based on the IJG code,
+not just to the unmodified library.  If you use our work, you ought to
+acknowledge us.
+
+Permission is NOT granted for the use of any IJG author's name or company name
+in advertising or publicity relating to this software or products derived from
+it.  This software may be referred to only as "the Independent JPEG Group's
+software".
+
+We specifically permit and encourage the use of this software as the basis of
+commercial products, provided that all warranty or liability claims are
+assumed by the product vendor.
diff --git a/jccolor.c b/jccolor.c
index 0a8a4b5..57a76c3 100644
--- a/jccolor.c
+++ b/jccolor.c
@@ -12,6 +12,8 @@
 #include "jinclude.h"
 #include "jpeglib.h"
 
+// this enables unrolling null_convert's loop, and reading/writing ints for speed
+#define ENABLE_ANDROID_NULL_CONVERT
 
 /* Private subobject */
 
@@ -298,6 +300,36 @@
   }
 }
 
+#ifdef ENABLE_ANDROID_NULL_CONVERT
+
+typedef unsigned long UINT32;
+
+#define B0(n)   ((n) & 0xFF)
+#define B1(n)   (((n) >> 8) & 0xFF)
+#define B2(n)   (((n) >> 16) & 0xFF)
+#define B3(n)   ((n) >> 24)
+
+#define PACK(a, b, c, d)    ((a) | ((b) << 8) | ((c) << 16) | ((d) << 24))
+
+static int ptr_is_quad(const void* p)
+{
+    return (((const char*)p - (const char*)0) & 3) == 0;
+}
+
+static void copyquads(const UINT32 in[], UINT32 out0[], UINT32 out1[], UINT32 out2[], int col4)
+{
+    do {
+        UINT32 src0 = *in++;
+        UINT32 src1 = *in++;
+        UINT32 src2 = *in++;
+        // LEndian
+        *out0++ = PACK(B0(src0), B3(src0), B2(src1), B1(src2));
+        *out1++ = PACK(B1(src0), B0(src1), B3(src1), B2(src2));
+        *out2++ = PACK(B2(src0), B1(src1), B0(src2), B3(src2));
+    } while (--col4 != 0);
+}
+
+#endif
 
 /*
  * Convert some rows of samples to the JPEG colorspace.
@@ -317,6 +349,42 @@
   int nc = cinfo->num_components;
   JDIMENSION num_cols = cinfo->image_width;
 
+#ifdef ENABLE_ANDROID_NULL_CONVERT
+    if (1 == num_rows && 3 == nc && num_cols > 0) {
+        JSAMPROW inptr = *input_buf;
+        JSAMPROW outptr0 = output_buf[0][output_row];
+        JSAMPROW outptr1 = output_buf[1][output_row];
+        JSAMPROW outptr2 = output_buf[2][output_row];
+        
+        int col = num_cols;
+        int col4 = col >> 2;
+        if (col4 > 0 && ptr_is_quad(inptr) && ptr_is_quad(outptr0) &&
+                        ptr_is_quad(outptr1) && ptr_is_quad(outptr2)) {
+            
+            const UINT32* in = (const UINT32*)inptr;
+            UINT32* out0 = (UINT32*)outptr0;
+            UINT32* out1 = (UINT32*)outptr1;
+            UINT32* out2 = (UINT32*)outptr2;
+            copyquads(in, out0, out1, out2, col4);
+            col &= 3;
+            if (0 == col)
+                return;
+            col4 <<= 2;
+            inptr += col4 * 3;  /* we read this 3 times per in copyquads */
+            outptr0 += col4;
+            outptr1 += col4;
+            outptr2 += col4;
+            /* fall through to while-loop */
+        }
+        do {
+            *outptr0++ = *inptr++;
+            *outptr1++ = *inptr++;
+            *outptr2++ = *inptr++;
+        } while (--col != 0);
+        return;
+    }
+SLOW:
+#endif
   while (--num_rows >= 0) {
     /* It seems fastest to make a separate pass for each component. */
     for (ci = 0; ci < nc; ci++) {
diff --git a/jconfig.h b/jconfig.h
new file mode 100644
index 0000000..15a9817
--- /dev/null
+++ b/jconfig.h
@@ -0,0 +1,156 @@
+/* android jconfig.h */
+/*
+ * jconfig.doc
+ *
+ * Copyright (C) 1991-1994, Thomas G. Lane.
+ * This file is part of the Independent JPEG Group's software.
+ * For conditions of distribution and use, see the accompanying README file.
+ *
+ * This file documents the configuration options that are required to
+ * customize the JPEG software for a particular system.
+ *
+ * The actual configuration options for a particular installation are stored
+ * in jconfig.h.  On many machines, jconfig.h can be generated automatically
+ * or copied from one of the "canned" jconfig files that we supply.  But if
+ * you need to generate a jconfig.h file by hand, this file tells you how.
+ *
+ * DO NOT EDIT THIS FILE --- IT WON'T ACCOMPLISH ANYTHING.
+ * EDIT A COPY NAMED JCONFIG.H.
+ */
+
+
+/*
+ * These symbols indicate the properties of your machine or compiler.
+ * #define the symbol if yes, #undef it if no.
+ */
+
+/* Does your compiler support function prototypes?
+ * (If not, you also need to use ansi2knr, see install.doc)
+ */
+#define HAVE_PROTOTYPES
+
+/* Does your compiler support the declaration "unsigned char" ?
+ * How about "unsigned short" ?
+ */
+#define HAVE_UNSIGNED_CHAR
+#define HAVE_UNSIGNED_SHORT
+
+/* Define "void" as "char" if your compiler doesn't know about type void.
+ * NOTE: be sure to define void such that "void *" represents the most general
+ * pointer type, e.g., that returned by malloc().
+ */
+/* #define void char */
+
+/* Define "const" as empty if your compiler doesn't know the "const" keyword.
+ */
+/* #define const */
+
+/* Define this if an ordinary "char" type is unsigned.
+ * If you're not sure, leaving it undefined will work at some cost in speed.
+ * If you defined HAVE_UNSIGNED_CHAR then the speed difference is minimal.
+ */
+#undef CHAR_IS_UNSIGNED
+
+/* Define this if your system has an ANSI-conforming <stddef.h> file.
+ */
+#define HAVE_STDDEF_H
+
+/* Define this if your system has an ANSI-conforming <stdlib.h> file.
+ */
+#define HAVE_STDLIB_H
+
+/* Define this if your system does not have an ANSI/SysV <string.h>,
+ * but does have a BSD-style <strings.h>.
+ */
+#undef NEED_BSD_STRINGS
+
+/* Define this if your system does not provide typedef size_t in any of the
+ * ANSI-standard places (stddef.h, stdlib.h, or stdio.h), but places it in
+ * <sys/types.h> instead.
+ */
+#undef NEED_SYS_TYPES_H
+
+/* For 80x86 machines, you need to define NEED_FAR_POINTERS,
+ * unless you are using a large-data memory model or 80386 flat-memory mode.
+ * On less brain-damaged CPUs this symbol must not be defined.
+ * (Defining this symbol causes large data structures to be referenced through
+ * "far" pointers and to be allocated with a special version of malloc.)
+ */
+#undef NEED_FAR_POINTERS
+
+/* Define this if your linker needs global names to be unique in less
+ * than the first 15 characters.
+ */
+#undef NEED_SHORT_EXTERNAL_NAMES
+
+/* Although a real ANSI C compiler can deal perfectly well with pointers to
+ * unspecified structures (see "incomplete types" in the spec), a few pre-ANSI
+ * and pseudo-ANSI compilers get confused.  To keep one of these bozos happy,
+ * define INCOMPLETE_TYPES_BROKEN.  This is not recommended unless you
+ * actually get "missing structure definition" warnings or errors while
+ * compiling the JPEG code.
+ */
+#undef INCOMPLETE_TYPES_BROKEN
+
+
+/*
+ * The following options affect code selection within the JPEG library,
+ * but they don't need to be visible to applications using the library.
+ * To minimize application namespace pollution, the symbols won't be
+ * defined unless JPEG_INTERNALS has been defined.
+ */
+
+#ifdef JPEG_INTERNALS
+
+/* Define this if your compiler implements ">>" on signed values as a logical
+ * (unsigned) shift; leave it undefined if ">>" is a signed (arithmetic) shift,
+ * which is the normal and rational definition.
+ */
+#undef RIGHT_SHIFT_IS_UNSIGNED
+
+
+#endif /* JPEG_INTERNALS */
+
+
+/*
+ * The remaining options do not affect the JPEG library proper,
+ * but only the sample applications cjpeg/djpeg (see cjpeg.c, djpeg.c).
+ * Other applications can ignore these.
+ */
+
+#ifdef JPEG_CJPEG_DJPEG
+
+/* These defines indicate which image (non-JPEG) file formats are allowed. */
+
+#define BMP_SUPPORTED		/* BMP image file format */
+#define GIF_SUPPORTED		/* GIF image file format */
+#define PPM_SUPPORTED		/* PBMPLUS PPM/PGM image file format */
+#undef RLE_SUPPORTED		/* Utah RLE image file format */
+#define TARGA_SUPPORTED		/* Targa image file format */
+
+/* Define this if you want to name both input and output files on the command
+ * line, rather than using stdout and optionally stdin.  You MUST do this if
+ * your system can't cope with binary I/O to stdin/stdout.  See comments at
+ * head of cjpeg.c or djpeg.c.
+ */
+#undef TWO_FILE_COMMANDLINE
+
+/* Define this if your system needs explicit cleanup of temporary files.
+ * This is crucial under MS-DOS, where the temporary "files" may be areas
+ * of extended memory; on most other systems it's not as important.
+ */
+#undef NEED_SIGNAL_CATCHER
+
+/* By default, we open image files with fopen(...,"rb") or fopen(...,"wb").
+ * This is necessary on systems that distinguish text files from binary files,
+ * and is harmless on most systems that don't.  If you have one of the rare
+ * systems that complains about the "b" spec, define this symbol.
+ */
+#undef DONT_USE_B_MODE
+
+/* Define this if you want percent-done progress reports from cjpeg/djpeg.
+ */
+#undef PROGRESS_REPORT
+
+
+#endif /* JPEG_CJPEG_DJPEG */
diff --git a/jdcolor.c b/jdcolor.c
index 6c04dfe..202360c 100644
--- a/jdcolor.c
+++ b/jdcolor.c
@@ -28,6 +28,26 @@
 typedef my_color_deconverter * my_cconvert_ptr;
 
 
+#ifdef ANDROID_RGB
+
+/* Declarations for ordered dithering.
+ * 
+ * We use a 4x4 ordered dither array with each row packed into 32 bits. This
+ * array is sufficient for dithering RGB_888 to RGB_565.
+ */
+
+#define DITHER_MASK         0x3
+#define DITHER_ROTATE(x)    (((x)<<24) | (((x)>>8)&0x00FFFFFF))
+static const INT32 dither_matrix[4] = {
+  0x0008020A,
+  0x0C040E06,
+  0x030B0109,
+  0x0F070D05
+};
+
+#endif
+
+
 /**************** YCbCr -> RGB conversion: most common case **************/
 
 /*
@@ -76,26 +96,26 @@
 
   cconvert->Cr_r_tab = (int *)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				(MAXJSAMPLE+1) * SIZEOF(int));
+                                (MAXJSAMPLE+1) * SIZEOF(int));
   cconvert->Cb_b_tab = (int *)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				(MAXJSAMPLE+1) * SIZEOF(int));
+                                (MAXJSAMPLE+1) * SIZEOF(int));
   cconvert->Cr_g_tab = (INT32 *)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				(MAXJSAMPLE+1) * SIZEOF(INT32));
+                                (MAXJSAMPLE+1) * SIZEOF(INT32));
   cconvert->Cb_g_tab = (INT32 *)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				(MAXJSAMPLE+1) * SIZEOF(INT32));
+                                (MAXJSAMPLE+1) * SIZEOF(INT32));
 
   for (i = 0, x = -CENTERJSAMPLE; i <= MAXJSAMPLE; i++, x++) {
     /* i is the actual input pixel value, in the range 0..MAXJSAMPLE */
     /* The Cb or Cr value we are thinking of is x = i - CENTERJSAMPLE */
     /* Cr=>R value is nearest int to 1.40200 * x */
     cconvert->Cr_r_tab[i] = (int)
-		    RIGHT_SHIFT(FIX(1.40200) * x + ONE_HALF, SCALEBITS);
+                    RIGHT_SHIFT(FIX(1.40200) * x + ONE_HALF, SCALEBITS);
     /* Cb=>B value is nearest int to 1.77200 * x */
     cconvert->Cb_b_tab[i] = (int)
-		    RIGHT_SHIFT(FIX(1.77200) * x + ONE_HALF, SCALEBITS);
+                    RIGHT_SHIFT(FIX(1.77200) * x + ONE_HALF, SCALEBITS);
     /* Cr=>G value is scaled-up -0.71414 * x */
     cconvert->Cr_g_tab[i] = (- FIX(0.71414)) * x;
     /* Cb=>G value is scaled-up -0.34414 * x */
@@ -104,7 +124,6 @@
   }
 }
 
-
 /*
  * Convert some rows of samples to the output colorspace.
  *
@@ -148,17 +167,338 @@
       /* Range-limiting is essential due to noise introduced by DCT losses. */
       outptr[RGB_RED] =   range_limit[y + Crrtab[cr]];
       outptr[RGB_GREEN] = range_limit[y +
-			      ((int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
-						 SCALEBITS))];
+                              ((int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
+                                                 SCALEBITS))];
       outptr[RGB_BLUE] =  range_limit[y + Cbbtab[cb]];
       outptr += RGB_PIXELSIZE;
     }
   }
 }
 
+#ifdef ANDROID_RGB
+METHODDEF(void)
+ycc_rgba_8888_convert (j_decompress_ptr cinfo,
+         JSAMPIMAGE input_buf, JDIMENSION input_row,
+         JSAMPARRAY output_buf, int num_rows)
+{
+  my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
+  register int y, cb, cr;
+  register JSAMPROW outptr;
+  register JSAMPROW inptr0, inptr1, inptr2;
+  register JDIMENSION col;
+  JDIMENSION num_cols = cinfo->output_width;
+  /* copy these pointers into registers if possible */
+  register JSAMPLE * range_limit = cinfo->sample_range_limit;
+  register int * Crrtab = cconvert->Cr_r_tab;
+  register int * Cbbtab = cconvert->Cb_b_tab;
+  register INT32 * Crgtab = cconvert->Cr_g_tab;
+  register INT32 * Cbgtab = cconvert->Cb_g_tab;
+  SHIFT_TEMPS
 
-/**************** Cases other than YCbCr -> RGB **************/
+  while (--num_rows >= 0) {
+    inptr0 = input_buf[0][input_row];
+    inptr1 = input_buf[1][input_row];
+    inptr2 = input_buf[2][input_row];
+    input_row++;
+    outptr = *output_buf++;
+    for (col = 0; col < num_cols; col++) {
+      y  = GETJSAMPLE(inptr0[col]);
+      cb = GETJSAMPLE(inptr1[col]);
+      cr = GETJSAMPLE(inptr2[col]);
+      /* Range-limiting is essential due to noise introduced by DCT losses. */
+      outptr[RGB_RED] =   range_limit[y + Crrtab[cr]];
+      outptr[RGB_GREEN] = range_limit[y +
+                              ((int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
+                                                 SCALEBITS))];
+      outptr[RGB_BLUE] =  range_limit[y + Cbbtab[cb]];
+      outptr[RGB_ALPHA] =  0xFF;
+      outptr += 4;
+    }
+  }
+}
 
+METHODDEF(void)
+ycc_rgb_565_convert (j_decompress_ptr cinfo,
+         JSAMPIMAGE input_buf, JDIMENSION input_row,
+         JSAMPARRAY output_buf, int num_rows)
+{
+  my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
+  register int y, cb, cr;
+  register JSAMPROW outptr;
+  register JSAMPROW inptr0, inptr1, inptr2;
+  register JDIMENSION col;
+  JDIMENSION num_cols = cinfo->output_width;
+  /* copy these pointers into registers if possible */
+  register JSAMPLE * range_limit = cinfo->sample_range_limit;
+  register int * Crrtab = cconvert->Cr_r_tab;
+  register int * Cbbtab = cconvert->Cb_b_tab;
+  register INT32 * Crgtab = cconvert->Cr_g_tab;
+  register INT32 * Cbgtab = cconvert->Cb_g_tab;
+  SHIFT_TEMPS
+
+  while (--num_rows >= 0) {
+    INT32 rgb;
+    unsigned int r, g, b;
+    inptr0 = input_buf[0][input_row];
+    inptr1 = input_buf[1][input_row];
+    inptr2 = input_buf[2][input_row];
+    input_row++;
+    outptr = *output_buf++;
+    
+    if (PACK_NEED_ALIGNMENT(outptr)) {
+        y  = GETJSAMPLE(*inptr0++);
+        cb = GETJSAMPLE(*inptr1++);
+        cr = GETJSAMPLE(*inptr2++);
+        r = range_limit[y + Crrtab[cr]];
+        g = range_limit[y + ((int)RIGHT_SHIFT(Cbgtab[cb]+Crgtab[cr], SCALEBITS))];
+        b = range_limit[y + Cbbtab[cb]];
+        rgb = PACK_SHORT_565(r,g,b);
+        *(INT16*)outptr = rgb;
+        outptr += 2;
+        num_cols--;
+    }
+    for (col = 0; col < (num_cols>>1); col++) {
+      y  = GETJSAMPLE(*inptr0++);
+      cb = GETJSAMPLE(*inptr1++);
+      cr = GETJSAMPLE(*inptr2++);
+      r = range_limit[y + Crrtab[cr]];
+      g = range_limit[y + ((int)RIGHT_SHIFT(Cbgtab[cb]+Crgtab[cr], SCALEBITS))];
+      b = range_limit[y + Cbbtab[cb]];
+      rgb = PACK_SHORT_565(r,g,b);
+
+      y  = GETJSAMPLE(*inptr0++);
+      cb = GETJSAMPLE(*inptr1++);
+      cr = GETJSAMPLE(*inptr2++);
+      r = range_limit[y + Crrtab[cr]];
+      g = range_limit[y + ((int)RIGHT_SHIFT(Cbgtab[cb]+Crgtab[cr], SCALEBITS))];
+      b = range_limit[y + Cbbtab[cb]];
+      rgb = PACK_TWO_PIXELS(rgb, PACK_SHORT_565(r,g,b));
+      WRITE_TWO_ALIGNED_PIXELS(outptr, rgb);
+      outptr += 4;
+    }
+    if (num_cols&1) {
+      y  = GETJSAMPLE(*inptr0);
+      cb = GETJSAMPLE(*inptr1);
+      cr = GETJSAMPLE(*inptr2);
+      r = range_limit[y + Crrtab[cr]];
+      g = range_limit[y + ((int)RIGHT_SHIFT(Cbgtab[cb]+Crgtab[cr], SCALEBITS))];
+      b = range_limit[y + Cbbtab[cb]];
+      rgb = PACK_SHORT_565(r,g,b);
+      *(INT16*)outptr = rgb;
+    }
+  }
+}
+
+METHODDEF(void)
+ycc_rgb_565D_convert (j_decompress_ptr cinfo,
+         JSAMPIMAGE input_buf, JDIMENSION input_row,
+         JSAMPARRAY output_buf, int num_rows)
+{
+  my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
+  register int y, cb, cr;
+  register JSAMPROW outptr;
+  register JSAMPROW inptr0, inptr1, inptr2;
+  register JDIMENSION col;
+  JDIMENSION num_cols = cinfo->output_width;
+  /* copy these pointers into registers if possible */
+  register JSAMPLE * range_limit = cinfo->sample_range_limit;
+  register int * Crrtab = cconvert->Cr_r_tab;
+  register int * Cbbtab = cconvert->Cb_b_tab;
+  register INT32 * Crgtab = cconvert->Cr_g_tab;
+  register INT32 * Cbgtab = cconvert->Cb_g_tab;
+  INT32 d0 = dither_matrix[cinfo->output_scanline & DITHER_MASK];
+  SHIFT_TEMPS
+
+  while (--num_rows >= 0) {
+    INT32 rgb;
+    unsigned int r, g, b;
+    inptr0 = input_buf[0][input_row];
+    inptr1 = input_buf[1][input_row];
+    inptr2 = input_buf[2][input_row];
+    input_row++;
+    outptr = *output_buf++;
+    if (PACK_NEED_ALIGNMENT(outptr)) {
+        y  = GETJSAMPLE(*inptr0++);
+        cb = GETJSAMPLE(*inptr1++);
+        cr = GETJSAMPLE(*inptr2++);
+        r = range_limit[DITHER_565_R(y + Crrtab[cr], d0)];
+        g = range_limit[DITHER_565_G(y + ((int)RIGHT_SHIFT(Cbgtab[cb]+Crgtab[cr], SCALEBITS)), d0)];
+        b = range_limit[DITHER_565_B(y + Cbbtab[cb], d0)];
+        rgb = PACK_SHORT_565(r,g,b);
+        *(INT16*)outptr = rgb;
+        outptr += 2;
+        num_cols--;
+    }
+    for (col = 0; col < (num_cols>>1); col++) {
+      y  = GETJSAMPLE(*inptr0++);
+      cb = GETJSAMPLE(*inptr1++);
+      cr = GETJSAMPLE(*inptr2++);
+      r = range_limit[DITHER_565_R(y + Crrtab[cr], d0)];
+      g = range_limit[DITHER_565_G(y + ((int)RIGHT_SHIFT(Cbgtab[cb]+Crgtab[cr], SCALEBITS)), d0)];
+      b = range_limit[DITHER_565_B(y + Cbbtab[cb], d0)];
+      d0 = DITHER_ROTATE(d0);
+      rgb = PACK_SHORT_565(r,g,b);
+      y  = GETJSAMPLE(*inptr0++);
+      cb = GETJSAMPLE(*inptr1++);
+      cr = GETJSAMPLE(*inptr2++);
+      r = range_limit[DITHER_565_R(y + Crrtab[cr], d0)];
+      g = range_limit[DITHER_565_G(y + ((int)RIGHT_SHIFT(Cbgtab[cb]+Crgtab[cr], SCALEBITS)), d0)];
+      b = range_limit[DITHER_565_B(y + Cbbtab[cb], d0)];
+      d0 = DITHER_ROTATE(d0);
+      rgb = PACK_TWO_PIXELS(rgb, PACK_SHORT_565(r,g,b));
+      WRITE_TWO_ALIGNED_PIXELS(outptr, rgb);
+      outptr += 4;
+    }
+    if (num_cols&1) {
+      y  = GETJSAMPLE(*inptr0);
+      cb = GETJSAMPLE(*inptr1);
+      cr = GETJSAMPLE(*inptr2);
+      r = range_limit[DITHER_565_R(y + Crrtab[cr], d0)];
+      g = range_limit[DITHER_565_G(y + ((int)RIGHT_SHIFT(Cbgtab[cb]+Crgtab[cr], SCALEBITS)), d0)];
+      b = range_limit[DITHER_565_B(y + Cbbtab[cb], d0)];
+      rgb = PACK_SHORT_565(r,g,b);
+      *(INT16*)outptr = rgb;
+    }
+  }
+}
+
+#endif
+
+/**************** Cases other than YCbCr -> RGB(A) **************/
+
+#ifdef ANDROID_RGB
+METHODDEF(void)
+rgb_rgba_8888_convert (j_decompress_ptr cinfo,
+         JSAMPIMAGE input_buf, JDIMENSION input_row,
+         JSAMPARRAY output_buf, int num_rows)
+{
+  register JSAMPROW outptr;
+  register JSAMPROW inptr0, inptr1, inptr2;
+  register JDIMENSION col;
+  JDIMENSION num_cols = cinfo->output_width;
+  SHIFT_TEMPS
+
+  while (--num_rows >= 0) {
+    inptr0 = input_buf[0][input_row];
+    inptr1 = input_buf[1][input_row];
+    inptr2 = input_buf[2][input_row];
+    input_row++;
+    outptr = *output_buf++;
+    for (col = 0; col < num_cols; col++) {
+      *outptr++ = *inptr0++;
+      *outptr++ = *inptr1++;
+      *outptr++ = *inptr2++;
+      *outptr++ = 0xFF;
+    }
+  }
+}
+
+METHODDEF(void)
+rgb_rgb_565_convert (j_decompress_ptr cinfo,
+         JSAMPIMAGE input_buf, JDIMENSION input_row,
+         JSAMPARRAY output_buf, int num_rows)
+{
+  register JSAMPROW outptr;
+  register JSAMPROW inptr0, inptr1, inptr2;
+  register JDIMENSION col;
+  JDIMENSION num_cols = cinfo->output_width;
+  SHIFT_TEMPS
+
+  while (--num_rows >= 0) {
+    INT32 rgb;
+    unsigned int r, g, b;
+    inptr0 = input_buf[0][input_row];
+    inptr1 = input_buf[1][input_row];
+    inptr2 = input_buf[2][input_row];
+    input_row++;
+    outptr = *output_buf++;
+    if (PACK_NEED_ALIGNMENT(outptr)) {
+        r = GETJSAMPLE(*inptr0++);
+        g = GETJSAMPLE(*inptr1++);
+        b = GETJSAMPLE(*inptr2++);
+        rgb = PACK_SHORT_565(r,g,b);
+        *(INT16*)outptr = rgb;
+        outptr += 2;
+        num_cols--;
+    }
+    for (col = 0; col < (num_cols>>1); col++) {
+      r = GETJSAMPLE(*inptr0++);
+      g = GETJSAMPLE(*inptr1++);
+      b = GETJSAMPLE(*inptr2++);
+      rgb = PACK_SHORT_565(r,g,b);
+      r = GETJSAMPLE(*inptr0++);
+      g = GETJSAMPLE(*inptr1++);
+      b = GETJSAMPLE(*inptr2++);
+      rgb = PACK_TWO_PIXELS(rgb, PACK_SHORT_565(r,g,b));
+      WRITE_TWO_ALIGNED_PIXELS(outptr, rgb);
+      outptr += 4;
+    }
+    if (num_cols&1) {
+      r = GETJSAMPLE(*inptr0);
+      g = GETJSAMPLE(*inptr1);
+      b = GETJSAMPLE(*inptr2);
+      rgb = PACK_SHORT_565(r,g,b);
+      *(INT16*)outptr = rgb;
+    }
+  }
+}
+
+
+METHODDEF(void)
+rgb_rgb_565D_convert (j_decompress_ptr cinfo,
+         JSAMPIMAGE input_buf, JDIMENSION input_row,
+         JSAMPARRAY output_buf, int num_rows)
+{
+  register JSAMPROW outptr;
+  register JSAMPROW inptr0, inptr1, inptr2;
+  register JDIMENSION col;
+  register JSAMPLE * range_limit = cinfo->sample_range_limit;
+  JDIMENSION num_cols = cinfo->output_width;
+  INT32 d0 = dither_matrix[cinfo->output_scanline & DITHER_MASK];
+  SHIFT_TEMPS
+
+  while (--num_rows >= 0) {
+    INT32 rgb;
+    unsigned int r, g, b;
+    inptr0 = input_buf[0][input_row];
+    inptr1 = input_buf[1][input_row];
+    inptr2 = input_buf[2][input_row];
+    input_row++;
+    outptr = *output_buf++;
+    if (PACK_NEED_ALIGNMENT(outptr)) {
+        r = range_limit[DITHER_565_R(GETJSAMPLE(*inptr0++), d0)];
+        g = range_limit[DITHER_565_G(GETJSAMPLE(*inptr1++), d0)];
+        b = range_limit[DITHER_565_B(GETJSAMPLE(*inptr2++), d0)];
+        rgb = PACK_SHORT_565(r,g,b);
+        *(INT16*)outptr = rgb;
+        outptr += 2;
+        num_cols--;
+    }
+    for (col = 0; col < (num_cols>>1); col++) {
+      r = range_limit[DITHER_565_R(GETJSAMPLE(*inptr0++), d0)];
+      g = range_limit[DITHER_565_G(GETJSAMPLE(*inptr1++), d0)];
+      b = range_limit[DITHER_565_B(GETJSAMPLE(*inptr2++), d0)];
+      d0 = DITHER_ROTATE(d0);
+      rgb = PACK_SHORT_565(r,g,b);
+      r = range_limit[DITHER_565_R(GETJSAMPLE(*inptr0++), d0)];
+      g = range_limit[DITHER_565_G(GETJSAMPLE(*inptr1++), d0)];
+      b = range_limit[DITHER_565_B(GETJSAMPLE(*inptr2++), d0)];
+      d0 = DITHER_ROTATE(d0);
+      rgb = PACK_TWO_PIXELS(rgb, PACK_SHORT_565(r,g,b));
+      WRITE_TWO_ALIGNED_PIXELS(outptr, rgb);
+      outptr += 4;
+    }
+    if (num_cols&1) {
+      r = range_limit[DITHER_565_R(GETJSAMPLE(*inptr0), d0)];
+      g = range_limit[DITHER_565_G(GETJSAMPLE(*inptr1), d0)];
+      b = range_limit[DITHER_565_B(GETJSAMPLE(*inptr2), d0)];
+      rgb = PACK_SHORT_565(r,g,b);
+      *(INT16*)outptr = rgb;
+    }
+  }
+}
+
+#endif
 
 /*
  * Color conversion for no colorspace change: just copy the data,
@@ -233,6 +573,110 @@
   }
 }
 
+#ifdef ANDROID_RGB
+METHODDEF(void)
+gray_rgba_8888_convert (j_decompress_ptr cinfo,
+          JSAMPIMAGE input_buf, JDIMENSION input_row,
+          JSAMPARRAY output_buf, int num_rows)
+{
+  register JSAMPROW inptr, outptr;
+  register JDIMENSION col;
+  JDIMENSION num_cols = cinfo->output_width;
+
+  while (--num_rows >= 0) {
+    inptr = input_buf[0][input_row++];
+    outptr = *output_buf++;
+    for (col = 0; col < num_cols; col++) {
+      /* We can dispense with GETJSAMPLE() here */
+      outptr[RGB_RED] = outptr[RGB_GREEN] = outptr[RGB_BLUE] = inptr[col];
+      outptr[RGB_ALPHA] = 0xff;
+      outptr += 4;
+    }
+  }
+}
+
+METHODDEF(void)
+gray_rgb_565_convert (j_decompress_ptr cinfo,
+          JSAMPIMAGE input_buf, JDIMENSION input_row,
+          JSAMPARRAY output_buf, int num_rows)
+{
+  register JSAMPROW inptr, outptr;
+  register JDIMENSION col;
+  JDIMENSION num_cols = cinfo->output_width;
+
+  while (--num_rows >= 0) {
+    INT32 rgb;
+    unsigned int g;
+    inptr = input_buf[0][input_row++];
+    outptr = *output_buf++;
+    if (PACK_NEED_ALIGNMENT(outptr)) {
+        g = *inptr++;
+        rgb = PACK_SHORT_565(g, g, g);
+        *(INT16*)outptr = rgb;
+        outptr += 2;
+        num_cols--;
+    }
+    for (col = 0; col < (num_cols>>1); col++) {
+      g = *inptr++;
+      rgb = PACK_SHORT_565(g, g, g);
+      g = *inptr++;
+      rgb = PACK_TWO_PIXELS(rgb, PACK_SHORT_565(g, g, g));
+      WRITE_TWO_ALIGNED_PIXELS(outptr, rgb);
+      outptr += 4;
+    }
+    if (num_cols&1) {
+      g = *inptr;
+      rgb = PACK_SHORT_565(g, g, g);
+      *(INT16*)outptr = rgb;
+    }
+  }
+}
+
+METHODDEF(void)
+gray_rgb_565D_convert (j_decompress_ptr cinfo,
+          JSAMPIMAGE input_buf, JDIMENSION input_row,
+          JSAMPARRAY output_buf, int num_rows)
+{
+  register JSAMPROW inptr, outptr;
+  register JDIMENSION col;
+  register JSAMPLE * range_limit = cinfo->sample_range_limit;
+  JDIMENSION num_cols = cinfo->output_width;
+  INT32 d0 = dither_matrix[cinfo->output_scanline & DITHER_MASK];
+
+  while (--num_rows >= 0) {
+    INT32 rgb;
+    unsigned int g;
+    inptr = input_buf[0][input_row++];
+    outptr = *output_buf++;
+    if (PACK_NEED_ALIGNMENT(outptr)) {
+        g = *inptr++;
+        g = range_limit[DITHER_565_R(g, d0)];
+        rgb = PACK_SHORT_565(g, g, g);
+        *(INT16*)outptr = rgb;
+        outptr += 2;
+        num_cols--;
+    }
+    for (col = 0; col < (num_cols>>1); col++) {
+      g = *inptr++;
+      g = range_limit[DITHER_565_R(g, d0)];
+      rgb = PACK_SHORT_565(g, g, g);
+      d0 = DITHER_ROTATE(d0);
+      g = *inptr++;
+      g = range_limit[DITHER_565_R(g, d0)];
+      rgb = PACK_TWO_PIXELS(rgb, PACK_SHORT_565(g, g, g));
+      d0 = DITHER_ROTATE(d0);
+      WRITE_TWO_ALIGNED_PIXELS(outptr, rgb);
+      outptr += 4;
+    }
+    if (num_cols&1) {
+      g = *inptr;
+      g = range_limit[DITHER_565_R(g, d0)];
+      rgb = PACK_SHORT_565(g, g, g);
+      *(INT16*)outptr = rgb;
+    }
+  }
+}
+#endif
 
 /*
  * Adobe-style YCCK->CMYK conversion.
@@ -272,11 +716,11 @@
       cb = GETJSAMPLE(inptr1[col]);
       cr = GETJSAMPLE(inptr2[col]);
       /* Range-limiting is essential due to noise introduced by DCT losses. */
-      outptr[0] = range_limit[MAXJSAMPLE - (y + Crrtab[cr])];	/* red */
-      outptr[1] = range_limit[MAXJSAMPLE - (y +			/* green */
-			      ((int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
-						 SCALEBITS)))];
-      outptr[2] = range_limit[MAXJSAMPLE - (y + Cbbtab[cb])];	/* blue */
+      outptr[0] = range_limit[MAXJSAMPLE - (y + Crrtab[cr])];   /* red */
+      outptr[1] = range_limit[MAXJSAMPLE - (y +                 /* green */
+                              ((int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
+                                                 SCALEBITS)))];
+      outptr[2] = range_limit[MAXJSAMPLE - (y + Cbbtab[cb])];   /* blue */
       /* K passes through unchanged */
       outptr[3] = inptr3[col];	/* don't need GETJSAMPLE here */
       outptr += 4;
@@ -368,6 +812,47 @@
       ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
     break;
 
+#ifdef ANDROID_RGB
+  case JCS_RGBA_8888:
+    cinfo->out_color_components = 4;
+    if (cinfo->jpeg_color_space == JCS_YCbCr) {
+      cconvert->pub.color_convert = ycc_rgba_8888_convert;
+      build_ycc_rgb_table(cinfo);
+    } else if (cinfo->jpeg_color_space == JCS_GRAYSCALE) {
+      cconvert->pub.color_convert = gray_rgba_8888_convert;
+    } else if (cinfo->jpeg_color_space == JCS_RGB) {
+      cconvert->pub.color_convert = rgb_rgba_8888_convert;
+    } else
+      ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
+    break;
+
+  case JCS_RGB_565:
+    cinfo->out_color_components = RGB_PIXELSIZE;
+    if (cinfo->dither_mode == JDITHER_NONE) {
+      if (cinfo->jpeg_color_space == JCS_YCbCr) {
+        cconvert->pub.color_convert = ycc_rgb_565_convert;
+        build_ycc_rgb_table(cinfo);
+      } else if (cinfo->jpeg_color_space == JCS_GRAYSCALE) {
+        cconvert->pub.color_convert = gray_rgb_565_convert;
+      } else if (cinfo->jpeg_color_space == JCS_RGB) {
+        cconvert->pub.color_convert = rgb_rgb_565_convert;
+      } else
+        ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
+    } else {
+      /* only ordered dither is supported */
+      if (cinfo->jpeg_color_space == JCS_YCbCr) {
+        cconvert->pub.color_convert = ycc_rgb_565D_convert;
+        build_ycc_rgb_table(cinfo);
+      } else if (cinfo->jpeg_color_space == JCS_GRAYSCALE) {
+        cconvert->pub.color_convert = gray_rgb_565D_convert;
+      } else if (cinfo->jpeg_color_space == JCS_RGB) {
+        cconvert->pub.color_convert = rgb_rgb_565D_convert;
+      } else
+        ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
+    }
+    break;
+#endif
+    
   case JCS_CMYK:
     cinfo->out_color_components = 4;
     if (cinfo->jpeg_color_space == JCS_YCCK) {
diff --git a/jdmaster.c b/jdmaster.c
index 2802c5b..8925013 100644
--- a/jdmaster.c
+++ b/jdmaster.c
@@ -47,11 +47,24 @@
   /* Merging is the equivalent of plain box-filter upsampling */
   if (cinfo->do_fancy_upsampling || cinfo->CCIR601_sampling)
     return FALSE;
+
+#ifdef ANDROID_RGB
+  /* jdmerge.c only supports YCC=>RGB565 and YCC=>RGB color conversion */
+  if (cinfo->jpeg_color_space != JCS_YCbCr || 
+      cinfo->num_components != 3 ||
+      cinfo->out_color_components != 3 ||
+      (cinfo->out_color_space != JCS_RGB_565 && 
+         cinfo->out_color_space != JCS_RGB)) {
+    return FALSE;
+  }
+#else
   /* jdmerge.c only supports YCC=>RGB color conversion */
   if (cinfo->jpeg_color_space != JCS_YCbCr || cinfo->num_components != 3 ||
       cinfo->out_color_space != JCS_RGB ||
       cinfo->out_color_components != RGB_PIXELSIZE)
     return FALSE;
+#endif
+
   /* and it only handles 2h1v or 2h2v sampling ratios */
   if (cinfo->comp_info[0].h_samp_factor != 2 ||
       cinfo->comp_info[1].h_samp_factor != 1 ||
@@ -179,11 +192,17 @@
     cinfo->out_color_components = RGB_PIXELSIZE;
     break;
 #endif /* else share code with YCbCr */
+#ifdef ANDROID_RGB
+  case JCS_RGB_565:
+#endif
   case JCS_YCbCr:
     cinfo->out_color_components = 3;
     break;
   case JCS_CMYK:
   case JCS_YCCK:
+#ifdef ANDROID_RGB
+  case JCS_RGBA_8888:
+#endif
     cinfo->out_color_components = 4;
     break;
   default:			/* else must be same colorspace as in file */
@@ -217,7 +236,7 @@
  * For most steps we can mathematically guarantee that the initial value
  * of x is within MAXJSAMPLE+1 of the legal range, so a table running from
  * -(MAXJSAMPLE+1) to 2*MAXJSAMPLE+1 is sufficient.  But for the initial
- * limiting step (just after the IDCT), a wildly out-of-range value is 
+ * limiting step (just after the IDCT), a wildly out-of-range value is
  * possible if the input data is corrupt.  To avoid any chance of indexing
  * off the end of memory and getting a bad-pointer trap, we perform the
  * post-IDCT limiting thus:
diff --git a/jdmerge.c b/jdmerge.c
index 3744446..77f3308 100644
--- a/jdmerge.c
+++ b/jdmerge.c
@@ -38,6 +38,24 @@
 
 #ifdef UPSAMPLE_MERGING_SUPPORTED
 
+#ifdef ANDROID_RGB
+
+/* Declarations for ordered dithering.
+ * We use a 4x4 ordered dither array packed into 32 bits per row (one byte
+ * per entry).  This array is sufficient for dithering RGB_888 to RGB_565.
+ * DITHER_ROTATE rotates a row word right by one byte to reach the next entry.
+ */
+
+#define DITHER_MASK         0x3  /* scanline & DITHER_MASK selects the matrix row */
+#define DITHER_ROTATE(x)    ((((x)&0xFF)<<24) | (((x)>>8)&0x00FFFFFF))  /* mask first: never shift stale high bits out of a signed INT32 */
+static const INT32 dither_matrix[4] = {
+  0x0008020A,
+  0x0C040E06,
+  0x030B0109,
+  0x0F070D05
+};
+
+#endif
 
 /* Private subobject */
 
@@ -154,8 +172,14 @@
 
   if (upsample->spare_full) {
     /* If we have a spare row saved from a previous cycle, just return it. */
+      JDIMENSION size = upsample->out_row_width;
+#ifdef ANDROID_RGB
+    if (cinfo->out_color_space == JCS_RGB_565)
+      size = cinfo->output_width*2;
+#endif
     jcopy_sample_rows(& upsample->spare_row, 0, output_buf + *out_row_ctr, 0,
-		      1, upsample->out_row_width);
+		      1, size);
+
     num_rows = 1;
     upsample->spare_full = FALSE;
   } else {
@@ -255,14 +279,14 @@
     cblue = Cbbtab[cb];
     /* Fetch 2 Y values and emit 2 pixels */
     y  = GETJSAMPLE(*inptr0++);
-    outptr[RGB_RED] =   range_limit[y + cred];
+    outptr[RGB_RED] = range_limit[y + cred];
     outptr[RGB_GREEN] = range_limit[y + cgreen];
-    outptr[RGB_BLUE] =  range_limit[y + cblue];
+    outptr[RGB_BLUE] = range_limit[y + cblue];
     outptr += RGB_PIXELSIZE;
     y  = GETJSAMPLE(*inptr0++);
-    outptr[RGB_RED] =   range_limit[y + cred];
+    outptr[RGB_RED] = range_limit[y + cred];
     outptr[RGB_GREEN] = range_limit[y + cgreen];
-    outptr[RGB_BLUE] =  range_limit[y + cblue];
+    outptr[RGB_BLUE] = range_limit[y + cblue];
     outptr += RGB_PIXELSIZE;
   }
   /* If image width is odd, do the last output column separately */
@@ -273,13 +297,148 @@
     cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
     y  = GETJSAMPLE(*inptr0);
-    outptr[RGB_RED] =   range_limit[y + cred];
+    outptr[RGB_RED] = range_limit[y + cred];
     outptr[RGB_GREEN] = range_limit[y + cgreen];
-    outptr[RGB_BLUE] =  range_limit[y + cblue];
+    outptr[RGB_BLUE] = range_limit[y + cblue];
   }
 }
 
 
+#ifdef ANDROID_RGB
+METHODDEF(void)  /* 2h1v merged upsample + YCbCr => RGB565 conversion, no dithering */
+h2v1_merged_upsample_565 (j_decompress_ptr cinfo,
+              JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+              JSAMPARRAY output_buf)
+{
+  my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+  register int y, cred, cgreen, cblue;
+  int cb, cr;
+  register JSAMPROW outptr;
+  JSAMPROW inptr0, inptr1, inptr2;
+  JDIMENSION col;
+  /* copy these pointers into registers if possible */
+  register JSAMPLE * range_limit = cinfo->sample_range_limit;
+  int * Crrtab = upsample->Cr_r_tab;
+  int * Cbbtab = upsample->Cb_b_tab;
+  INT32 * Crgtab = upsample->Cr_g_tab;
+  INT32 * Cbgtab = upsample->Cb_g_tab;
+  unsigned int r, g, b;
+  INT32 rgb;
+  SHIFT_TEMPS
+
+  inptr0 = input_buf[0][in_row_group_ctr];  /* component 0 row (read as y) */
+  inptr1 = input_buf[1][in_row_group_ctr];  /* component 1 row (read as cb) */
+  inptr2 = input_buf[2][in_row_group_ctr];  /* component 2 row (read as cr) */
+  outptr = output_buf[0];
+  /* Loop for each pair of output pixels: one chroma sample serves two Y samples */
+  for (col = cinfo->output_width >> 1; col > 0; col--) {
+    /* Do the chroma part of the calculation */
+    cb = GETJSAMPLE(*inptr1++);
+    cr = GETJSAMPLE(*inptr2++);
+    cred = Crrtab[cr];
+    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+    cblue = Cbbtab[cb];
+    /* Fetch 2 Y values and emit 2 pixels */
+    y  = GETJSAMPLE(*inptr0++);
+    r = range_limit[y + cred];
+    g = range_limit[y + cgreen];
+    b = range_limit[y + cblue];
+    rgb = PACK_SHORT_565(r,g,b);  /* first pixel of the pair */
+    y  = GETJSAMPLE(*inptr0++);
+    r = range_limit[y + cred];
+    g = range_limit[y + cgreen];
+    b = range_limit[y + cblue];
+    rgb = PACK_TWO_PIXELS(rgb, PACK_SHORT_565(r,g,b));  /* second pixel goes in the upper 16 bits */
+    WRITE_TWO_PIXELS(outptr, rgb);
+    outptr += 4;  /* two 16-bit pixels; assumes a 1-byte JSAMPLE -- TODO confirm */
+  }
+  /* If image width is odd, do the last output column separately */
+  if (cinfo->output_width & 1) {
+    cb = GETJSAMPLE(*inptr1);
+    cr = GETJSAMPLE(*inptr2);
+    cred = Crrtab[cr];
+    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+    cblue = Cbbtab[cb];
+    y  = GETJSAMPLE(*inptr0);
+    r = range_limit[y + cred];
+    g = range_limit[y + cgreen];
+    b = range_limit[y + cblue];
+    rgb = PACK_SHORT_565(r,g,b);
+    *(INT16*)outptr = rgb;  /* store only the single remaining 16-bit pixel */
+  }
+}
+
+
+METHODDEF(void)  /* 2h1v merged upsample + YCbCr => RGB565 conversion with ordered dither */
+h2v1_merged_upsample_565D (j_decompress_ptr cinfo,
+              JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+              JSAMPARRAY output_buf)
+{
+  my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+  register int y, cred, cgreen, cblue;
+  int cb, cr;
+  register JSAMPROW outptr;
+  JSAMPROW inptr0, inptr1, inptr2;
+  JDIMENSION col;
+  /* copy these pointers into registers if possible */
+  register JSAMPLE * range_limit = cinfo->sample_range_limit;
+  int * Crrtab = upsample->Cr_r_tab;
+  int * Cbbtab = upsample->Cb_b_tab;
+  INT32 * Crgtab = upsample->Cr_g_tab;
+  INT32 * Cbgtab = upsample->Cb_g_tab;
+  /* d0: dither matrix row for this scanline, rotated one entry per pixel */
+  INT32 d0 = dither_matrix[cinfo->output_scanline & DITHER_MASK];
+  unsigned int r, g, b;
+  INT32 rgb;
+  SHIFT_TEMPS
+
+  inptr0 = input_buf[0][in_row_group_ctr];  /* component 0 row (read as y) */
+  inptr1 = input_buf[1][in_row_group_ctr];  /* component 1 row (read as cb) */
+  inptr2 = input_buf[2][in_row_group_ctr];  /* component 2 row (read as cr) */
+  outptr = output_buf[0];
+  /* Loop for each pair of output pixels: one chroma sample serves two Y samples */
+  for (col = cinfo->output_width >> 1; col > 0; col--) {
+    /* Do the chroma part of the calculation */
+    cb = GETJSAMPLE(*inptr1++);
+    cr = GETJSAMPLE(*inptr2++);
+    cred = Crrtab[cr];
+    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+    cblue = Cbbtab[cb];
+    /* Fetch 2 Y values and emit 2 pixels */
+    y  = GETJSAMPLE(*inptr0++);
+    r = range_limit[DITHER_565_R(y + cred, d0)];
+    g = range_limit[DITHER_565_G(y + cgreen, d0)];
+    b = range_limit[DITHER_565_B(y + cblue, d0)];
+    d0 = DITHER_ROTATE(d0);  /* next dither entry for the next pixel */
+    rgb = PACK_SHORT_565(r,g,b);
+    y  = GETJSAMPLE(*inptr0++);
+    r = range_limit[DITHER_565_R(y + cred, d0)];
+    g = range_limit[DITHER_565_G(y + cgreen, d0)];
+    b = range_limit[DITHER_565_B(y + cblue, d0)];
+    d0 = DITHER_ROTATE(d0);
+    rgb = PACK_TWO_PIXELS(rgb, PACK_SHORT_565(r,g,b));  /* second pixel goes in the upper 16 bits */
+    WRITE_TWO_PIXELS(outptr, rgb);
+    outptr += 4;  /* two 16-bit pixels; assumes a 1-byte JSAMPLE -- TODO confirm */
+  }
+  /* If image width is odd, do the last output column separately */
+  if (cinfo->output_width & 1) {
+    cb = GETJSAMPLE(*inptr1);
+    cr = GETJSAMPLE(*inptr2);
+    cred = Crrtab[cr];
+    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+    cblue = Cbbtab[cb];
+    y  = GETJSAMPLE(*inptr0);
+    r = range_limit[DITHER_565_R(y + cred, d0)];
+    g = range_limit[DITHER_565_G(y + cgreen, d0)];
+    b = range_limit[DITHER_565_B(y + cblue, d0)];
+    rgb = PACK_SHORT_565(r,g,b);
+    *(INT16*)outptr = rgb;  /* store only the single remaining 16-bit pixel */
+  }
+}
+
+
+#endif
+
 /*
  * Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
  */
@@ -319,24 +478,24 @@
     cblue = Cbbtab[cb];
     /* Fetch 4 Y values and emit 4 pixels */
     y  = GETJSAMPLE(*inptr00++);
-    outptr0[RGB_RED] =   range_limit[y + cred];
+    outptr0[RGB_RED] = range_limit[y + cred];
     outptr0[RGB_GREEN] = range_limit[y + cgreen];
-    outptr0[RGB_BLUE] =  range_limit[y + cblue];
+    outptr0[RGB_BLUE] = range_limit[y + cblue];
     outptr0 += RGB_PIXELSIZE;
     y  = GETJSAMPLE(*inptr00++);
-    outptr0[RGB_RED] =   range_limit[y + cred];
+    outptr0[RGB_RED] = range_limit[y + cred];
     outptr0[RGB_GREEN] = range_limit[y + cgreen];
-    outptr0[RGB_BLUE] =  range_limit[y + cblue];
+    outptr0[RGB_BLUE] = range_limit[y + cblue];
     outptr0 += RGB_PIXELSIZE;
     y  = GETJSAMPLE(*inptr01++);
-    outptr1[RGB_RED] =   range_limit[y + cred];
+    outptr1[RGB_RED] = range_limit[y + cred];
     outptr1[RGB_GREEN] = range_limit[y + cgreen];
-    outptr1[RGB_BLUE] =  range_limit[y + cblue];
+    outptr1[RGB_BLUE] = range_limit[y + cblue];
     outptr1 += RGB_PIXELSIZE;
     y  = GETJSAMPLE(*inptr01++);
-    outptr1[RGB_RED] =   range_limit[y + cred];
+    outptr1[RGB_RED] = range_limit[y + cred];
     outptr1[RGB_GREEN] = range_limit[y + cgreen];
-    outptr1[RGB_BLUE] =  range_limit[y + cblue];
+    outptr1[RGB_BLUE] = range_limit[y + cblue];
     outptr1 += RGB_PIXELSIZE;
   }
   /* If image width is odd, do the last output column separately */
@@ -347,17 +506,197 @@
     cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
     y  = GETJSAMPLE(*inptr00);
-    outptr0[RGB_RED] =   range_limit[y + cred];
+    outptr0[RGB_RED] = range_limit[y + cred];
     outptr0[RGB_GREEN] = range_limit[y + cgreen];
-    outptr0[RGB_BLUE] =  range_limit[y + cblue];
+    outptr0[RGB_BLUE] = range_limit[y + cblue];
     y  = GETJSAMPLE(*inptr01);
-    outptr1[RGB_RED] =   range_limit[y + cred];
+    outptr1[RGB_RED] = range_limit[y + cred];
     outptr1[RGB_GREEN] = range_limit[y + cgreen];
-    outptr1[RGB_BLUE] =  range_limit[y + cblue];
+    outptr1[RGB_BLUE] = range_limit[y + cblue];
   }
 }
 
 
+#ifdef ANDROID_RGB
+
+METHODDEF(void)  /* 2h2v merged upsample + YCbCr => RGB565 conversion, no dithering */
+h2v2_merged_upsample_565 (j_decompress_ptr cinfo,
+              JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+              JSAMPARRAY output_buf)
+{
+  my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+  register int y, cred, cgreen, cblue;
+  int cb, cr;
+  register JSAMPROW outptr0, outptr1;
+  JSAMPROW inptr00, inptr01, inptr1, inptr2;
+  JDIMENSION col;
+  /* copy these pointers into registers if possible */
+  register JSAMPLE * range_limit = cinfo->sample_range_limit;
+  int * Crrtab = upsample->Cr_r_tab;
+  int * Cbbtab = upsample->Cb_b_tab;
+  INT32 * Crgtab = upsample->Cr_g_tab;
+  INT32 * Cbgtab = upsample->Cb_g_tab;
+  unsigned int r, g, b;
+  INT32 rgb;
+  SHIFT_TEMPS
+
+  inptr00 = input_buf[0][in_row_group_ctr*2];      /* upper component-0 row of the group */
+  inptr01 = input_buf[0][in_row_group_ctr*2 + 1];  /* lower component-0 row of the group */
+  inptr1 = input_buf[1][in_row_group_ctr];         /* component 1 row (read as cb) */
+  inptr2 = input_buf[2][in_row_group_ctr];         /* component 2 row (read as cr) */
+  outptr0 = output_buf[0];
+  outptr1 = output_buf[1];
+  /* Loop for each group of output pixels: one chroma sample serves a 2x2 block */
+  for (col = cinfo->output_width >> 1; col > 0; col--) {
+    /* Do the chroma part of the calculation */
+    cb = GETJSAMPLE(*inptr1++);
+    cr = GETJSAMPLE(*inptr2++);
+    cred = Crrtab[cr];
+    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+    cblue = Cbbtab[cb];
+    /* Fetch 4 Y values and emit 4 pixels */
+    y  = GETJSAMPLE(*inptr00++);
+    r = range_limit[y + cred];
+    g = range_limit[y + cgreen];
+    b = range_limit[y + cblue];
+    rgb = PACK_SHORT_565(r,g,b);
+    y  = GETJSAMPLE(*inptr00++);
+    r = range_limit[y + cred];
+    g = range_limit[y + cgreen];
+    b = range_limit[y + cblue];
+    rgb = PACK_TWO_PIXELS(rgb, PACK_SHORT_565(r,g,b));
+    WRITE_TWO_PIXELS(outptr0, rgb);  /* upper output row */
+    outptr0 += 4;  /* two 16-bit pixels; assumes a 1-byte JSAMPLE -- TODO confirm */
+    y  = GETJSAMPLE(*inptr01++);
+    r = range_limit[y + cred];
+    g = range_limit[y + cgreen];
+    b = range_limit[y + cblue];
+    rgb = PACK_SHORT_565(r,g,b);
+    y  = GETJSAMPLE(*inptr01++);
+    r = range_limit[y + cred];
+    g = range_limit[y + cgreen];
+    b = range_limit[y + cblue];
+    rgb = PACK_TWO_PIXELS(rgb, PACK_SHORT_565(r,g,b));
+    WRITE_TWO_PIXELS(outptr1, rgb);  /* lower output row */
+    outptr1 += 4;
+  }
+  /* If image width is odd, do the last output column separately */
+  if (cinfo->output_width & 1) {
+    cb = GETJSAMPLE(*inptr1);
+    cr = GETJSAMPLE(*inptr2);
+    cred = Crrtab[cr];
+    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+    cblue = Cbbtab[cb];
+    y  = GETJSAMPLE(*inptr00);
+    r = range_limit[y + cred];
+    g = range_limit[y + cgreen];
+    b = range_limit[y + cblue];
+    rgb = PACK_SHORT_565(r,g,b);
+    *(INT16*)outptr0 = rgb;  /* single remaining pixel of the upper row */
+   y  = GETJSAMPLE(*inptr01);
+   r = range_limit[y + cred];
+   g = range_limit[y + cgreen];
+   b = range_limit[y + cblue];
+   rgb = PACK_SHORT_565(r,g,b);
+   *(INT16*)outptr1 = rgb;  /* single remaining pixel of the lower row */
+  }
+}
+
+
+
+METHODDEF(void)  /* 2h2v merged upsample + YCbCr => RGB565 conversion with ordered dither */
+h2v2_merged_upsample_565D (j_decompress_ptr cinfo,
+              JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+              JSAMPARRAY output_buf)
+{
+  my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+  register int y, cred, cgreen, cblue;
+  int cb, cr;
+  register JSAMPROW outptr0, outptr1;
+  JSAMPROW inptr00, inptr01, inptr1, inptr2;
+  JDIMENSION col;
+  /* copy these pointers into registers if possible */
+  register JSAMPLE * range_limit = cinfo->sample_range_limit;
+  int * Crrtab = upsample->Cr_r_tab;
+  int * Cbbtab = upsample->Cb_b_tab;
+  INT32 * Crgtab = upsample->Cr_g_tab;
+  INT32 * Cbgtab = upsample->Cb_g_tab;
+  /* d0/d1: dither matrix rows for output_scanline and output_scanline+1 */
+  INT32 d0 = dither_matrix[cinfo->output_scanline & DITHER_MASK];
+  INT32 d1 = dither_matrix[(cinfo->output_scanline+1) & DITHER_MASK];
+  unsigned int r, g, b;
+  INT32 rgb;
+  SHIFT_TEMPS
+
+  inptr00 = input_buf[0][in_row_group_ctr*2];      /* upper component-0 row of the group */
+  inptr01 = input_buf[0][in_row_group_ctr*2 + 1];  /* lower component-0 row of the group */
+  inptr1 = input_buf[1][in_row_group_ctr];         /* component 1 row (read as cb) */
+  inptr2 = input_buf[2][in_row_group_ctr];         /* component 2 row (read as cr) */
+  outptr0 = output_buf[0];
+  outptr1 = output_buf[1];
+  /* Loop for each group of output pixels */
+  for (col = cinfo->output_width >> 1; col > 0; col--) {
+    /* NOTE(review): here d0/d1 alternate per column in BOTH rows; the odd tail uses d0=row0, d1=row1 -- confirm */
+    /* Do the chroma part of the calculation */
+    cb = GETJSAMPLE(*inptr1++);
+    cr = GETJSAMPLE(*inptr2++);
+    cred = Crrtab[cr];
+    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+    cblue = Cbbtab[cb];
+    /* Fetch 4 Y values and emit 4 pixels */
+    y  = GETJSAMPLE(*inptr00++);
+    r = range_limit[DITHER_565_R(y + cred, d0)];
+    g = range_limit[DITHER_565_G(y + cgreen, d0)];
+    b = range_limit[DITHER_565_B(y + cblue, d0)];
+    d0 = DITHER_ROTATE(d0);
+    rgb = PACK_SHORT_565(r,g,b);
+    y  = GETJSAMPLE(*inptr00++);
+    r = range_limit[DITHER_565_R(y + cred, d1)];
+    g = range_limit[DITHER_565_G(y + cgreen, d1)];
+    b = range_limit[DITHER_565_B(y + cblue, d1)];
+    d1 = DITHER_ROTATE(d1);
+    rgb = PACK_TWO_PIXELS(rgb, PACK_SHORT_565(r,g,b));
+    WRITE_TWO_PIXELS(outptr0, rgb);  /* upper output row */
+    outptr0 += 4;  /* two 16-bit pixels; assumes a 1-byte JSAMPLE -- TODO confirm */
+    y  = GETJSAMPLE(*inptr01++);
+    r = range_limit[DITHER_565_R(y + cred, d0)];
+    g = range_limit[DITHER_565_G(y + cgreen, d0)];
+    b = range_limit[DITHER_565_B(y + cblue, d0)];
+    d0 = DITHER_ROTATE(d0);
+    rgb = PACK_SHORT_565(r,g,b);
+    y  = GETJSAMPLE(*inptr01++);
+    r = range_limit[DITHER_565_R(y + cred, d1)];
+    g = range_limit[DITHER_565_G(y + cgreen, d1)];
+    b = range_limit[DITHER_565_B(y + cblue, d1)];
+    d1 = DITHER_ROTATE(d1);
+    rgb = PACK_TWO_PIXELS(rgb, PACK_SHORT_565(r,g,b));
+    WRITE_TWO_PIXELS(outptr1, rgb);  /* lower output row */
+    outptr1 += 4;
+  }
+  /* If image width is odd, do the last output column separately */
+  if (cinfo->output_width & 1) {
+    cb = GETJSAMPLE(*inptr1);
+    cr = GETJSAMPLE(*inptr2);
+    cred = Crrtab[cr];
+    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+    cblue = Cbbtab[cb];
+    y  = GETJSAMPLE(*inptr00);
+    r = range_limit[DITHER_565_R(y + cred, d0)];
+    g = range_limit[DITHER_565_G(y + cgreen, d0)];
+    b = range_limit[DITHER_565_B(y + cblue, d0)];
+    rgb = PACK_SHORT_565(r,g,b);
+    *(INT16*)outptr0 = rgb;  /* single remaining pixel of the upper row */
+    y  = GETJSAMPLE(*inptr01);
+    r = range_limit[DITHER_565_R(y + cred, d1)];
+    g = range_limit[DITHER_565_G(y + cgreen, d1)];
+    b = range_limit[DITHER_565_B(y + cblue, d1)];
+    rgb = PACK_SHORT_565(r,g,b);
+    *(INT16*)outptr1 = rgb;  /* single remaining pixel of the lower row */
+  }
+}
+
+#endif
+
 /*
  * Module initialization routine for merged upsampling/color conversion.
  *
@@ -379,10 +718,19 @@
   upsample->pub.need_context_rows = FALSE;
 
   upsample->out_row_width = cinfo->output_width * cinfo->out_color_components;
-
+  
   if (cinfo->max_v_samp_factor == 2) {
     upsample->pub.upsample = merged_2v_upsample;
     upsample->upmethod = h2v2_merged_upsample;
+#ifdef ANDROID_RGB
+    if (cinfo->out_color_space == JCS_RGB_565) {
+        if (cinfo->dither_mode == JDITHER_NONE) {
+            upsample->upmethod = h2v2_merged_upsample_565;
+        } else {
+            upsample->upmethod = h2v2_merged_upsample_565D;
+        }
+    }
+#endif
     /* Allocate a spare row buffer */
     upsample->spare_row = (JSAMPROW)
       (*cinfo->mem->alloc_large) ((j_common_ptr) cinfo, JPOOL_IMAGE,
@@ -390,6 +738,15 @@
   } else {
     upsample->pub.upsample = merged_1v_upsample;
     upsample->upmethod = h2v1_merged_upsample;
+#ifdef ANDROID_RGB
+    if (cinfo->out_color_space == JCS_RGB_565) {
+        if (cinfo->dither_mode == JDITHER_NONE) {
+            upsample->upmethod = h2v1_merged_upsample_565;
+        } else {
+            upsample->upmethod = h2v1_merged_upsample_565D;
+        }
+    }
+#endif
     /* No spare row needed */
     upsample->spare_row = NULL;
   }
diff --git a/jidctfst.S b/jidctfst.S
new file mode 100644
index 0000000..88fb661
--- /dev/null
+++ b/jidctfst.S
@@ -0,0 +1,452 @@
+/*
+ * Copyright (C) 2008 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+    .text
+    .align
+
+    .global jpeg_idct_ifast
+    .func   jpeg_idct_ifast
+
+// NOTE: sb=r9, fp=r11 ip=r12, sp=r13, lr=r14, pc=r15
+
+// jpeg_idct_ifast (j_decompress_ptr       cinfo,
+//                 jpeg_component_info *   compptr,
+//                 short*                  coef_block,
+//                 unsigned char*          output_buf,
+//                 int                     output_col)
+
+#if defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__)
+#define ARMv6 1
+#endif
+
+#define  local_TMP0123       sp  /* base address for stmia/ldmia of the four TMP slots */
+#define  local_TMP0          [sp, #0]
+#define  local_TMP1          [sp, #4]
+#define  local_TMP2          [sp, #8]
+#define  local_TMP3          [sp, #12]
+#define  local_RANGE_TABLE   [sp, #16]
+#define  local_OUTPUT_COL    [sp, #20]
+#define  local_OUTPUT_BUF    [sp, #24]
+#define  local_UNUSED        [sp, #28]
+#define  off_WORKSPACE       32  /* byte offset of the 8x8 word workspace within the frame */
+#define  local_WORKSPACE     [sp, #off_WORKSPACE]
+#define  local_SIZE          (off_WORKSPACE + 8*8*4)
+
+#define  off_DECOMPRESS_range_limit_base  324
+#define  off_COMPINFO_quanttable          80
+
+#define  DCTSIZE   8
+#define  VY(x)   ((x)*DCTSIZE*2)
+#define  QY(x)   ((x)*DCTSIZE*4)
+
+#define  VX(x)   ((x)*2)
+#define  QX(x)   ((x)*4)
+
+#define  FIX_1_414213562    #362
+#define  FIX_1_082392200    #277
+#define  FIX_1_847759065    #473
+#define  FIX_2_613125930    #669
+
+#define  RANGE_MASK   1023
+
+
+
+jpeg_idct_ifast:
+    pld     [r2, #0]                                    // prefetch the start of coef_block
+    stmdb   sp!, {r4,r5, r6,r7, r8,r9, r10,r11, r12,lr} // push 10 registers (40 bytes)
+    ldr     r4, [sp, #4*10]                             // r4 = first stack argument (output_col), just above the saved registers
+    sub     sp, #local_SIZE                             // reserve the local slots plus the 8x8 word workspace
+
+    ldr     r10,[r1, #off_COMPINFO_quanttable]         // r10 = quanttable
+    str     r4, local_OUTPUT_COL
+    str     r3, local_OUTPUT_BUF
+    ldr     r5, [r0, #off_DECOMPRESS_range_limit_base] // hard-coded struct offset -- must match the C struct layout
+    add     r5, r5, #128                                // bias by 128 (presumably CENTERJSAMPLE -- TODO confirm)
+    str     r5, local_RANGE_TABLE
+    mov     fp, r2                                      // fp = coef_block
+    add     ip, sp, #off_WORKSPACE                      // ip = write pointer into the workspace
+
+VLoopTail:
+    ldrsh    r0, [fp, #VY(0)]       // load the 8 signed 16-bit coefficients of the current column (rows 0..7)
+    ldrsh    r1, [fp, #VY(1)]
+    ldrsh    r2, [fp, #VY(2)]
+    ldrsh    r3, [fp, #VY(3)]
+    ldrsh    r4, [fp, #VY(4)]
+    ldrsh    r5, [fp, #VY(5)]
+    ldrsh    r6, [fp, #VY(6)]
+    ldrsh    r7, [fp, #VY(7)]
+
+    cmp      r1, #0                 // Z stays set only while every coefficient tested so far is zero
+    orreqs   r8, r2, r3             // each orreqs executes only if Z is still set, then updates Z
+    orreqs   r8, r4, r5
+    orreqs   r8, r6, r7
+    beq      VLoopHeadZero          // rows 1..7 all zero: take the DC-only shortcut
+
+VLoopHead:
+    // tmp0 = DEQUANTIZE(in[DCTSIZE*0], quant[DCTSIZE*0]   (r0)
+    // tmp2 = DEQUANTIZE(in[DCTSIZE*4], quant[DCTSIZE*4]   (r4)
+    // tmp1 = DEQUANTIZE(in[DCTSIZE*2], quant[DCTSIZE*2]   (r2)
+    // tmp3 = DEQUANTIZE(in[DCTSIZE*6], quant[DCTSIZE*6]   (r6)
+    // tmp10 = tmp0 + tmp2   (r0)
+    // tmp11 = tmp0 - tmp2   (r4)
+
+    ldr      r9, [r10, #QY(4)]
+    ldr      r8, [r10, #QY(0)]
+    smulbb   r4, r9, r4
+    smlabb   r0, r8, r0, r4
+    ldr      r9, [r10, #QY(6)]
+    ldr      r8, [r10, #QY(2)]
+    sub      r4, r0, r4, lsl #1
+    smulbb   r6, r9, r6
+    smlabb   r2, r8, r2, r6
+
+    // tmp13 = tmp1 + tmp3                                       (r2)
+    // tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13    (r6)
+    // FIX_1_4142... = 362 = 45*8 + 2
+    sub      r6, r2, r6, lsl #1
+    mov      r8, #360
+    add      r8, r8, #2
+    mul      r9, r6, r8
+
+    // tmp0 = tmp10 + tmp13;   (r0)
+    // tmp3 = tmp10 - tmp13;   (r8)
+    // tmp1 = tmp11 + tmp12;   (r4)
+    // tmp2 = tmp11 - tmp12;   (r6)
+    add     r0, r0, r2
+    rsb     r6, r2, r9, asr #8
+    sub     r8, r0, r2, lsl #1
+    add     r4, r4, r6
+    sub     r6, r4, r6, lsl #1
+
+    stmia   local_TMP0123, {r0, r4, r6, r8}
+
+    // NOTE: be sure not to use r0,r4,r6,r8 soon after the stm above
+
+    // odd part
+    // tmp4 = DEQUANTIZE( in[DCTSIZE*1], quant[DCTSIZE*1] )   (r1)
+    // tmp6 = DEQUANTIZE( in[DCTSIZE*5], quant[DCTSIZE*5] )   (r5)
+    // tmp5 = DEQUANTIZE( in[DCTSIZE*3], quant[DCTSIZE*3] )   (r3)
+    // tmp7 = DEQUANTIZE( in[DCTSIZE*7], quant[DCTSIZE*7] )   (r7)
+    // z13 = tmp6 + tmp5;  (r0)
+    // z10 = tmp6 - tmp5;  (r2)
+    // z11 = tmp4 + tmp7;  (r4)
+    // z12 = tmp4 - tmp7;  (r6)
+
+    ldr     r2, [r10, #QY(1)]
+    ldr     r9, [r10, #QY(5)]
+    smulbb  r1, r2, r1
+    ldr     r2, [r10, #QY(3)]
+    smulbb  r5, r9, r5
+    ldr     r9, [r10, #QY(7)]
+    smlabb  r0, r2, r3, r5
+    smlabb  r4, r9, r7, r1
+    rsb  r2, r0, r5, lsl #1
+    rsb  r6, r4, r1, lsl #1
+
+    // tmp7 = z11 + z13;                             (r7)
+    // tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); (r1)
+    // FIX_... = 360 + 2
+    add   r7, r4, r0
+    sub   r1, r4, r0
+    mov   r8, #360
+    add   r8, r8, #2
+    mul   r1, r8, r1
+
+    // z5 = MULTIPLY(z10 + z12, FIX_1_847759065);        (r8)
+    // tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5;      (r0)
+    // tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5;    (r2)
+    // FIX_1_8477... = 473 = 472 + 1
+    // FIX_1_082...  = 277 = 276 + 1
+    // FIX_2_...     = 669 = 668 + 1
+    add     r8, r2, r6
+    mov     r9, #472
+    mla     r8, r9, r8, r8
+    mov     r9, #276
+    mla     r0, r6, r9, r6
+    mov     r9, #668
+    mla     r2, r9, r2, r2
+    sub     r0, r0, r8
+    rsb     r2, r2, r8
+
+    // tmp6 = tmp12 - tmp7;  (r6)
+    // tmp5 = tmp11 - tmp6;  (r5)
+    // tmp4 = tmp10 + tmp5;  (r4)
+    rsb  r6, r7, r2, asr #8
+    rsb  r5, r6, r1, asr #8
+    add  r4, r5, r0, asr #8
+
+    ldmia local_TMP0123, {r0, r1, r2, r3}
+
+    // wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7);
+    // wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7);
+    // wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6);
+    // wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6);
+    // wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5);
+    // wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5);
+    // wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4);
+    // wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4);
+
+    add   r0, r0, r7
+    sub   r7, r0, r7, lsl #1
+    add   r1, r1, r6
+    sub   r6, r1, r6, lsl #1
+    add   r2, r2, r5
+    sub   r5, r2, r5, lsl #1
+    sub   r3, r3, r4
+    add   r4, r3, r4, lsl #1
+
+    str   r0, [ip, #QY(0)]
+    str   r1, [ip, #QY(1)]
+    str   r2, [ip, #QY(2)]
+    str   r3, [ip, #QY(3)]
+    str   r4, [ip, #QY(4)]
+    str   r5, [ip, #QY(5)]
+    str   r6, [ip, #QY(6)]
+    str   r7, [ip, #QY(7)]
+
+    // inptr++;                    /* advance pointers to next column */
+    // quantptr++;
+    // wsptr++;
+    add  fp, fp, #2
+    add  r10, r10, #4
+    add  ip, ip, #4
+    add  r0, sp, #(off_WORKSPACE + 4*8)
+    cmp  ip, r0
+    bne  VLoopTail
+
+
+
+HLoopStart:
+    // reset pointers
+    pld     [sp, #off_WORKSPACE]
+    add     ip, sp, #off_WORKSPACE   // ip = read pointer, back at the start of the workspace
+    ldr     r10, local_RANGE_TABLE   // NOTE(review): r10 is not read again in this pass; clamping below is arithmetic
+
+HLoopTail:
+    // output = *output_buf++ + output_col
+    ldr      r0, local_OUTPUT_BUF
+    ldr      r1, local_OUTPUT_COL
+    ldr      r2, [r0], #4            // r2 = current row pointer, r0 post-incremented to the next one
+    str      r0, local_OUTPUT_BUF
+    add      fp, r2, r1              // fp = destination of this row's 8 output bytes
+
+    pld      [ip, #32]               // prefetch the following workspace row
+    ldmia    ip!, {r0-r7}            // load one workspace row (8 words) and advance ip
+
+    cmp      r1, #0                  // same chained all-zero test as in pass 1, on elements 1..7
+    orreqs   r8, r2, r3
+    orreqs   r8, r4, r5
+    orreqs   r8, r6, r7
+    beq      HLoopTailZero           // only element 0 is non-zero: constant row
+
+HLoopHead:
+    // tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]);    (r0)
+    // tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]);    (r4)
+    add     r0, r0, r4
+    sub     r4, r0, r4, lsl #1
+
+    // tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]);                                   (r2)
+    // tmp12 = MULTIPLY((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6], FIX_1_414213562) - tmp13;  (r6)
+    // FIX_... = 360 + 2
+    add     r2, r2, r6
+    sub     r6, r2, r6, lsl #1
+    mov     r8, #360
+    add     r8, r8, #2
+    mul     r6, r8, r6
+
+    // tmp0 = tmp10 + tmp13;   (r0)
+    // tmp3 = tmp10 - tmp13;   (r8)
+    // tmp1 = tmp11 + tmp12;   (r4)
+    // tmp2 = tmp11 - tmp12;   (r6)
+    add     r0, r0, r2
+    rsb     r6, r2, r6, asr #8
+    sub     r8, r0, r2, lsl #1
+    add     r4, r4, r6
+    sub     r6, r4, r6, lsl #1
+
+    stmia   local_TMP0123, {r0, r4, r6, r8}
+
+    // Odd part
+
+    // z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3];  (r0)
+    // z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3];  (r2)
+    // z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7];  (r4)
+    // z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7];  (r6)
+    add  r0, r5, r3
+    sub  r2, r5, r3
+    add  r4, r1, r7
+    sub  r6, r1, r7
+
+    // tmp7 = z11 + z13;                             (r7)
+    // tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); (r1)
+    // FIX_... = 360 + 2
+    add   r7, r4, r0
+    sub   r1, r4, r0
+    mov   r8, #360
+    add   r8, r8, #2
+    mul   r1, r8, r1
+
+    // z5 = MULTIPLY(z10 + z12, FIX_1_847759065);        (r8)
+    // tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5;      (r0)
+    // tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5;    (r2)
+    // FIX_1_8477... = 473 = 472 + 1
+    // FIX_1_082...  = 277 = 276 + 1
+    // FIX_2_...     = 669 = 668 + 1
+    add  r8, r2, r6
+    mov  r9, #472
+    mla  r8, r9, r8, r8
+    mov  r9, #276
+    mla  r0, r6, r9, r6
+    mov  r9, #668
+    mla  r2, r9, r2, r2
+    sub  r0, r0, r8
+    sub  r2, r8, r2
+
+    // tmp6 = tmp12 - tmp7;  (r6)
+    // tmp5 = tmp11 - tmp6;  (r5)
+    // tmp4 = tmp10 + tmp5;  (r4)
+    rsb  r6, r7, r2, asr #8
+    rsb  r5, r6, r1, asr #8
+    add  r4, r5, r0, asr #8
+
+    ldmia local_TMP0123, {r0, r1, r2, r3}
+
+    // outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3) & RANGE_MASK];
+    // outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3) & RANGE_MASK];
+    // outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3) & RANGE_MASK];
+    // outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3) & RANGE_MASK];
+    // outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3) & RANGE_MASK];
+    // outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3) & RANGE_MASK];
+    // outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3) & RANGE_MASK];
+    // outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3) & RANGE_MASK];
+
+    mov    r8, #128
+    add    r0, r0, r7
+    sub    r7, r0, r7, lsl #1
+    add    r0, r8, r0, asr #5
+    add    r7, r8, r7, asr #5
+    add    r1, r1, r6
+    sub    r6, r1, r6, lsl #1
+    add    r1, r8, r1, asr #5
+    add    r6, r8, r6, asr #5
+    add    r2, r2, r5
+    sub    r5, r2, r5, lsl #1
+    add    r2, r8, r2, asr #5
+    add    r5, r8, r5, asr #5
+    sub    r3, r3, r4
+    add    r4, r3, r4, lsl #1
+    add    r3, r8, r3, asr #5
+    add    r4, r8, r4, asr #5
+
+#ifdef ARMv6
+    usat   r0, #8, r0            // unsigned saturate each result to 8 bits (0..255)
+    usat   r1, #8, r1
+    usat   r2, #8, r2
+    usat   r3, #8, r3
+    usat   r4, #8, r4
+    usat   r5, #8, r5
+    usat   r6, #8, r6
+    usat   r7, #8, r7
+#else
+    cmp    r0, #255              // unsigned compare: HI for values above 255 and for negative values
+    mvnhi  r0, r0, asr #31       // negative -> 0, too large -> 0xFFFFFFFF
+    andhi  r0, #255              // 0xFFFFFFFF -> 255
+    cmp    r7, #255
+    mvnhi  r7, r7, asr #31       // no andhi needed: r7 is only used with lsl #24 below, which drops the upper bits
+    cmp    r1, #255
+    mvnhi  r1, r1, asr #31
+    andhi  r1, #255
+    cmp    r6, #255
+    mvnhi  r6, r6, asr #31
+    andhi  r6, #255
+    cmp    r2, #255
+    mvnhi  r2, r2, asr #31
+    andhi  r2, #255
+    cmp    r5, #255
+    mvnhi  r5, r5, asr #31
+    andhi  r5, #255
+    cmp    r3, #255
+    mvnhi  r3, r3, asr #31       // no andhi needed: r3 is only used with lsl #24 below, which drops the upper bits
+    cmp    r4, #255
+    mvnhi  r4, r4, asr #31
+    andhi  r4, #255
+#endif
+
+    // r3 r2 r1 r0
+    orr    r0, r0, r1, lsl #8
+    orr    r0, r0, r2, lsl #16
+    orr    r0, r0, r3, lsl #24
+
+    // r7 r6 r5 r4
+    orr    r1, r4, r5, lsl #8
+    orr    r1, r1, r6, lsl #16
+    orr    r1, r1, r7, lsl #24
+    stmia  fp, {r0, r1}
+
+    add    r0, sp, #(off_WORKSPACE + 8*8*4)
+    cmp    ip, r0
+    bne    HLoopTail
+
+Exit:
+    add    sp, sp, #local_SIZE                           // release locals and workspace
+    ldmia  sp!, {r4,r5, r6,r7, r8,r9, r10,r11, r12,lr}   // restore the registers pushed on entry
+    bx     lr
+
+
+VLoopHeadZero:
+// ok, all AC coefficients are 0
+    ldr      r1, [r10, #QY(0)]      // r1 = quant table entry for row 0 of this column
+    add      fp, fp, #2             // inptr++
+    add      r10, r10, #4           // quantptr++
+    mul      r0, r1, r0             // r0 = dequantized row-0 (DC) term
+    str      r0, [ip, #QY(0)]       // replicate it down the whole workspace column
+    str      r0, [ip, #QY(1)]
+    str      r0, [ip, #QY(2)]
+    str      r0, [ip, #QY(3)]
+    str      r0, [ip, #QY(4)]
+    str      r0, [ip, #QY(5)]
+    str      r0, [ip, #QY(6)]
+    str      r0, [ip, #QY(7)]
+    add      ip, ip, #4             // wsptr++
+    add      r0, sp, #(off_WORKSPACE + 4*8)   // end of the first workspace row: all 8 columns done
+    cmp      ip, r0
+    beq      HLoopStart
+    b        VLoopTail
+
+HLoopTailZero:
+    mov      r0, r0, asr #5         // same 5-bit descale as the general path in HLoopHead
+    add      r0, #128               // same +128 bias as the general path
+
+#ifdef ARMv6
+    usat     r0, #8, r0             // unsigned saturate to 0..255
+#else
+    cmp      r0, #255               // unsigned compare: HI for values above 255 and for negative values
+    mvnhi    r0, r0, asr #31        // negative -> 0, too large -> 0xFFFFFFFF
+    andhi    r0, r0, #255           // 0xFFFFFFFF -> 255
+#endif
+
+    orr      r0, r0, lsl #8         // replicate the byte into all four byte lanes of r0
+    orr      r0, r0, lsl #16
+    mov      r1, r0
+    stmia    fp, {r0, r1}           // store 8 identical output bytes
+
+    add      r0, sp, #(off_WORKSPACE + 64*4)   // end of the workspace: all 8 rows done
+    cmp      ip, r0
+    beq      Exit
+    b        HLoopTail
+
+    .endfunc
diff --git a/jmorecfg.h b/jmorecfg.h
index 54a7d1c..d699a2c 100644
--- a/jmorecfg.h
+++ b/jmorecfg.h
@@ -10,6 +10,28 @@
  * optimizations.  Most users will not need to touch this file.
  */
 
+/*
+ * Define ANDROID_RGB to enable the Android-specific color spaces and helpers:
+ *   JCS_RGBA_8888 support
+ *   JCS_RGB_565 support
+ *
+ */
+
+#define ANDROID_RGB
+
+#ifdef ANDROID_RGB
+#define PACK_SHORT_565(r,g,b)  ((((r)<<8)&0xf800)|(((g)<<3)&0x7E0)|((b)>>3))  /* RGB565: R in bits 15-11, G in 10-5 (6 bits), B in 4-0 */
+#define PACK_TWO_PIXELS(l,r)   (((r)<<16) | (l))  /* l in the low 16 bits, r in the high 16 bits */
+#define PACK_NEED_ALIGNMENT(ptr) (((int)(ptr))&3)  /* nonzero when ptr is not 4-byte aligned */
+#define WRITE_TWO_PIXELS(addr, pixels) do {     /* two 16-bit stores: low half first, then high half */ \
+         ((INT16*)(addr))[0] = (pixels);        \
+         ((INT16*)(addr))[1] = (pixels)>>16;    \
+    } while(0)
+#define WRITE_TWO_ALIGNED_PIXELS(addr, pixels)  ((*(INT32*)(addr)) = (pixels))  /* one 32-bit store */
+#define DITHER_565_R(r, dither) ((r) + ((dither)&0xFF))
+#define DITHER_565_G(g, dither) ((g) + (((dither)&0xFF)>>1))  /* half the dither byte that R and B receive */
+#define DITHER_565_B(b, dither) ((b) + ((dither)&0xFF))
+#endif
 
 /*
  * Define BITS_IN_JSAMPLE as either
@@ -314,8 +336,10 @@
 #define RGB_RED		0	/* Offset of Red in an RGB scanline element */
 #define RGB_GREEN	1	/* Offset of Green */
 #define RGB_BLUE	2	/* Offset of Blue */
-#define RGB_PIXELSIZE	3	/* JSAMPLEs per RGB scanline element */
-
+#ifdef ANDROID_RGB
+#define RGB_ALPHA   3   /* Offset of Alpha; lies past RGB_PIXELSIZE, so presumably only meaningful for 4-sample RGBA elements -- confirm */
+#endif
+#define RGB_PIXELSIZE   3   /* JSAMPLEs per RGB scanline element */
 
 /* Definitions for speed-related optimizations. */
 
diff --git a/jpeglib.h b/jpeglib.h
index d1be8dd..0f3a547 100644
--- a/jpeglib.h
+++ b/jpeglib.h
@@ -128,9 +128,9 @@
   /* The decompressor output side may not use these variables. */
   int dc_tbl_no;		/* DC entropy table selector (0..3) */
   int ac_tbl_no;		/* AC entropy table selector (0..3) */
-  
+
   /* Remaining fields should be treated as private by applications. */
-  
+
   /* These values are computed during compression or decompression startup: */
   /* Component's size in DCT blocks.
    * Any dummy blocks added to complete an MCU are not counted; therefore
@@ -209,7 +209,11 @@
 	JCS_RGB,		/* red/green/blue */
 	JCS_YCbCr,		/* Y/Cb/Cr (also known as YUV) */
 	JCS_CMYK,		/* C/M/Y/K */
-	JCS_YCCK		/* Y/Cb/Cr/K */
+	JCS_YCCK,		/* Y/Cb/Cr/K */
+#ifdef ANDROID_RGB
+    JCS_RGBA_8888,  /* red/green/blue/alpha */
+    JCS_RGB_565     /* red/green/blue in 565 format */
+#endif
 } J_COLOR_SPACE;
 
 /* DCT/IDCT algorithm options. */
@@ -298,14 +302,14 @@
 
   jpeg_component_info * comp_info;
   /* comp_info[i] describes component that appears i'th in SOF */
-  
+
   JQUANT_TBL * quant_tbl_ptrs[NUM_QUANT_TBLS];
   /* ptrs to coefficient quantization tables, or NULL if not defined */
-  
+
   JHUFF_TBL * dc_huff_tbl_ptrs[NUM_HUFF_TBLS];
   JHUFF_TBL * ac_huff_tbl_ptrs[NUM_HUFF_TBLS];
   /* ptrs to Huffman coding tables, or NULL if not defined */
-  
+
   UINT8 arith_dc_L[NUM_ARITH_TBLS]; /* L values for DC arith-coding tables */
   UINT8 arith_dc_U[NUM_ARITH_TBLS]; /* U values for DC arith-coding tables */
   UINT8 arith_ac_K[NUM_ARITH_TBLS]; /* Kx values for AC arith-coding tables */
@@ -345,7 +349,7 @@
   UINT16 X_density;		/* Horizontal pixel density */
   UINT16 Y_density;		/* Vertical pixel density */
   boolean write_Adobe_marker;	/* should an Adobe marker be written? */
-  
+
   /* State variable: index of next scanline to be written to
    * jpeg_write_scanlines().  Application may use this to control its
    * processing loop, e.g., "while (next_scanline < image_height)".
@@ -370,7 +374,7 @@
    * There are v_samp_factor * DCTSIZE sample rows of each component in an
    * "iMCU" (interleaved MCU) row.
    */
-  
+
   /*
    * These fields are valid during any one scan.
    * They describe the components and MCUs actually appearing in the scan.
@@ -378,10 +382,10 @@
   int comps_in_scan;		/* # of JPEG components in this scan */
   jpeg_component_info * cur_comp_info[MAX_COMPS_IN_SCAN];
   /* *cur_comp_info[i] describes component that appears i'th in SOS */
-  
+
   JDIMENSION MCUs_per_row;	/* # of MCUs across the image */
   JDIMENSION MCU_rows_in_scan;	/* # of MCU rows in the image */
-  
+
   int blocks_in_MCU;		/* # of DCT blocks per MCU */
   int MCU_membership[C_MAX_BLOCKS_IN_MCU];
   /* MCU_membership[i] is index in cur_comp_info of component owning */
@@ -652,7 +656,7 @@
 #define JMSG_LENGTH_MAX  200	/* recommended size of format_message buffer */
   /* Reset error state variables at start of a new image */
   JMETHOD(void, reset_error_mgr, (j_common_ptr cinfo));
-  
+
   /* The message ID code and any parameters are saved here.
    * A message can have one string parameter or up to 8 int parameters.
    */
@@ -662,11 +666,11 @@
     int i[8];
     char s[JMSG_STR_PARM_MAX];
   } msg_parm;
-  
+
   /* Standard state variables for error facility */
-  
+
   int trace_level;		/* max msg_level that will be displayed */
-  
+
   /* For recoverable corrupt-data errors, we emit a warning message,
    * but keep going unless emit_message chooses to abort.  emit_message
    * should count warnings in num_warnings.  The surrounding application
@@ -824,7 +828,7 @@
 /* Short forms of external names for systems with brain-damaged linkers.
  * We shorten external names to be unique in the first six letters, which
  * is good enough for all known systems.
- * (If your compiler itself needs names to be unique in less than 15 
+ * (If your compiler itself needs names to be unique in less than 15
  * characters, you are out of luck.  Get a better compiler.)
  */