am f7e17efa: am 80519db0: am 40dfc1d1: am 8fe7969d: am dc20ac98: improve the handling of images with duplicate component IDs

* commit 'f7e17efa07f55cf64199942e488cba7d8b2d5b13':
  improve the handling of images with duplicate component IDs
diff --git a/Android.mk b/Android.mk
index 5bb0657..6e78c74 100644
--- a/Android.mk
+++ b/Android.mk
@@ -40,9 +40,17 @@
   LOCAL_SRC_FILES += jidctintelsse.c
 endif
 
-# enable armv6 idct assembly
 ifeq ($(strip $(TARGET_ARCH)),arm)
-  LOCAL_CFLAGS += -DANDROID_ARMV6_IDCT
+  ifeq ($(ARCH_ARM_HAVE_NEON),true)
+    #use NEON accelerations
+    LOCAL_CFLAGS += -DNV_ARM_NEON
+    LOCAL_SRC_FILES += \
+        jsimd_arm_neon.S \
+        jsimd_neon.c
+  else
+    # enable armv6 idct assembly
+    LOCAL_CFLAGS += -DANDROID_ARMV6_IDCT
+  endif
 endif
 
 # use mips assembler IDCT implementation if MIPS DSP-ASE is present
diff --git a/NOTICE b/NOTICE
index 007625f..70e356f 100644
--- a/NOTICE
+++ b/NOTICE
@@ -36,3 +36,59 @@
 We specifically permit and encourage the use of this software as the basis of
 commercial products, provided that all warranty or liability claims are
 assumed by the product vendor.
+
+
+----------------------
+
+
+ ARM NEON optimizations for libjpeg-turbo
+
+ Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies).
+ All rights reserved.
+ Contact: Alexander Bokovoy <alexander.bokovoy@nokia.com>
+
+ This software is provided 'as-is', without any express or implied
+ warranty.  In no event will the authors be held liable for any damages
+ arising from the use of this software.
+
+ Permission is granted to anyone to use this software for any purpose,
+ including commercial applications, and to alter it and redistribute it
+ freely, subject to the following restrictions:
+
+ 1. The origin of this software must not be misrepresented; you must not
+    claim that you wrote the original software. If you use this software
+    in a product, an acknowledgment in the product documentation would be
+    appreciated but is not required.
+ 2. Altered source versions must be plainly marked as such, and must not be
+    misrepresented as being the original software.
+ 3. This notice may not be removed or altered from any source distribution.
+
+
+ ----------------------
+
+
+ Copyright (c) 2011,  NVIDIA CORPORATION. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+  * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+  * Redistributions in binary form must reproduce the above copyright
+    notice, this list of conditions and the following disclaimer in the
+    documentation and/or other materials provided with the distribution.
+  * Neither the name of the NVIDIA CORPORATION nor the names of its
+    contributors may be used to endorse or promote products derived
+    from this software without specific prior written permission.
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/jdcolor.c b/jdcolor.c
index 202360c..d9048eb 100644
--- a/jdcolor.c
+++ b/jdcolor.c
@@ -11,7 +11,9 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
-
+#ifdef NV_ARM_NEON
+#include "jsimd_neon.h"
+#endif
 
 /* Private subobject */
 
@@ -816,7 +818,15 @@
   case JCS_RGBA_8888:
     cinfo->out_color_components = 4;
     if (cinfo->jpeg_color_space == JCS_YCbCr) {
+#if defined(NV_ARM_NEON) && defined(__ARM_HAVE_NEON)
+      if (cap_neon_ycc_rgb()) {
+        cconvert->pub.color_convert = jsimd_ycc_rgba8888_convert;
+      } else {
+        cconvert->pub.color_convert = ycc_rgba_8888_convert;
+      }
+#else
       cconvert->pub.color_convert = ycc_rgba_8888_convert;
+#endif
       build_ycc_rgb_table(cinfo);
     } else if (cinfo->jpeg_color_space == JCS_GRAYSCALE) {
       cconvert->pub.color_convert = gray_rgba_8888_convert;
@@ -830,7 +840,15 @@
     cinfo->out_color_components = RGB_PIXELSIZE;
     if (cinfo->dither_mode == JDITHER_NONE) {
       if (cinfo->jpeg_color_space == JCS_YCbCr) {
+#if defined(NV_ARM_NEON) && defined(__ARM_HAVE_NEON)
+        if (cap_neon_ycc_rgb())  {
+          cconvert->pub.color_convert = jsimd_ycc_rgb565_convert;
+        } else {
+          cconvert->pub.color_convert = ycc_rgb_565_convert;
+        }
+#else
         cconvert->pub.color_convert = ycc_rgb_565_convert;
+#endif
         build_ycc_rgb_table(cinfo);
       } else if (cinfo->jpeg_color_space == JCS_GRAYSCALE) {
         cconvert->pub.color_convert = gray_rgb_565_convert;
diff --git a/jddctmgr.c b/jddctmgr.c
index 8bc9668..980f8f3 100644
--- a/jddctmgr.c
+++ b/jddctmgr.c
@@ -32,6 +32,10 @@
   #endif
 #endif
 
+#ifdef NV_ARM_NEON
+#include "jsimd_neon.h"
+#endif
+
 #ifdef ANDROID_ARMV6_IDCT
 
 /* Intentionally declare the prototype with arguments of primitive types instead
@@ -145,11 +149,27 @@
       method = JDCT_ISLOW;	/* jidctred uses islow-style table */
       break;
     case 2:
+#if defined(NV_ARM_NEON) && defined(__ARM_HAVE_NEON)
+      if (cap_neon_idct_2x2()) {
+        method_ptr = jsimd_idct_2x2;
+      } else {
+        method_ptr = jpeg_idct_2x2;
+      }
+#else
       method_ptr = jpeg_idct_2x2;
+#endif
       method = JDCT_ISLOW;	/* jidctred uses islow-style table */
       break;
     case 4:
+#if defined(NV_ARM_NEON) && defined(__ARM_HAVE_NEON)
+	  if (cap_neon_idct_4x4()) {
+        method_ptr = jsimd_idct_4x4;
+      } else {
+        method_ptr = jpeg_idct_4x4;
+      }
+#else
       method_ptr = jpeg_idct_4x4;
+#endif
       method = JDCT_ISLOW;	/* jidctred uses islow-style table */
       break;
 #endif
@@ -184,7 +204,15 @@
 #endif
 #ifdef DCT_IFAST_SUPPORTED
       case JDCT_IFAST:
-	method_ptr = jpeg_idct_ifast;
+#if defined(NV_ARM_NEON) && defined(__ARM_HAVE_NEON)
+        if (cap_neon_idct_ifast()) {
+          method_ptr = jsimd_idct_ifast;
+        } else {
+          method_ptr = jpeg_idct_ifast;
+        }
+#else
+        method_ptr = jpeg_idct_ifast;
+#endif
 	method = JDCT_IFAST;
 	break;
 #endif
diff --git a/jmorecfg.h b/jmorecfg.h
index b327264..71cb393 100644
--- a/jmorecfg.h
+++ b/jmorecfg.h
@@ -366,12 +366,12 @@
 #ifndef MULTIPLIER
 #ifdef ANDROID_INTELSSE2_IDCT
   #define MULTIPLIER short
+#elif defined(ANDROID_MIPS_IDCT)
+  #define MULTIPLIER  short
+#elif defined(NV_ARM_NEON)	/* NEON IDCT code loads the table as 16-bit lanes */
+  #define MULTIPLIER short
 #else
-  #ifdef ANDROID_MIPS_IDCT
-    #define MULTIPLIER  short
-  #else
-    #define MULTIPLIER  int		/* type for fastest integer multiply */
-  #endif
+  #define MULTIPLIER  int		/* type for fastest integer multiply */
 #endif
 #endif
 
diff --git a/jsimd_arm_neon.S b/jsimd_arm_neon.S
new file mode 100644
index 0000000..cc0d540
--- /dev/null
+++ b/jsimd_arm_neon.S
@@ -0,0 +1,913 @@
+/*
+ * ARM NEON optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies).
+ * All rights reserved.
+ * Contact: Alexander Bokovoy <alexander.bokovoy@nokia.com>
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+/* Copyright (c) 2011,  NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of the NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits /* mark stack as non-executable */
+#endif
+
+.text
+.fpu neon
+.arch armv7a
+.object_arch armv7a
+.arm
+
+
+#define RESPECT_STRICT_ALIGNMENT 1
+
+/*****************************************************************************/
+
+/* Supplementary macro for setting function attributes */
+.macro asm_function fname
+    .func \fname
+    .global \fname
+#ifdef __ELF__
+    .hidden \fname
+    .type \fname, %function
+#endif
+\fname:
+.endm
+
+/* Transpose a block of 4x4 coefficients in four 64-bit registers */
+.macro transpose_4x4 x0, x1, x2, x3
+    vtrn.16 \x0, \x1
+    vtrn.16 \x2, \x3
+    vtrn.32 \x0, \x2
+    vtrn.32 \x1, \x3
+.endm
+
+/*****************************************************************************/
+
+/*
+ * jsimd_idct_ifast_neon
+ *
+ * This function contains a fast, not so accurate integer implementation of
+ * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
+ * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
+ * function from jidctfst.c.
+ *
+ * TODO: somewhat better instruction scheduling is needed.
+ */
+
+#define XFIX_1_082392200 d0[0]
+#define XFIX_1_414213562 d0[1]
+#define XFIX_1_847759065 d0[2]
+#define XFIX_2_613125930 d0[3]
+
+.balign 16
+jsimd_idct_ifast_neon_consts:
+    .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */
+    .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */
+    .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */
+    .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */
+
+/* 1-D IDCT helper macro */
+
+.macro idct_helper  x0, x1, x2, x3, x4, x5, x6, x7, \
+                    t10, t11, t12, t13, t14
+
+    vsub.s16        \t10, \x0, \x4
+    vadd.s16        \x4,  \x0, \x4
+    vswp.s16        \t10, \x0
+    vsub.s16        \t11, \x2, \x6
+    vadd.s16        \x6,  \x2, \x6
+    vswp.s16        \t11, \x2
+    vsub.s16        \t10, \x3, \x5
+    vadd.s16        \x5,  \x3, \x5
+    vswp.s16        \t10, \x3
+    vsub.s16        \t11, \x1, \x7
+    vadd.s16        \x7,  \x1, \x7
+    vswp.s16        \t11, \x1
+
+    vqdmulh.s16     \t13, \x2,  d0[1]
+    vadd.s16        \t12, \x3,  \x3
+    vadd.s16        \x2,  \x2,  \t13
+    vqdmulh.s16     \t13, \x3,  d0[3]
+    vsub.s16        \t10,  \x1, \x3
+    vadd.s16        \t12, \t12, \t13
+    vqdmulh.s16     \t13, \t10, d0[2]
+    vsub.s16        \t11, \x7,  \x5
+    vadd.s16        \t10, \t10, \t13
+    vqdmulh.s16     \t13, \t11, d0[1]
+    vadd.s16        \t11, \t11, \t13
+
+    vqdmulh.s16     \t13, \x1,  d0[0]
+    vsub.s16        \x2,  \x6,  \x2
+    vsub.s16        \t14, \x0,  \x2
+    vadd.s16        \x2,  \x0,  \x2
+    vadd.s16        \x0,  \x4,  \x6
+    vsub.s16        \x4,  \x4,  \x6
+    vadd.s16        \x1,  \x1,  \t13
+    vadd.s16        \t13, \x7,  \x5
+    vsub.s16        \t12, \t13, \t12
+    vsub.s16        \t12, \t12, \t10
+    vadd.s16        \t11, \t12, \t11
+    vsub.s16        \t10, \x1,  \t10
+    vadd.s16        \t10, \t10, \t11
+
+    vsub.s16        \x7,  \x0,  \t13
+    vadd.s16        \x0,  \x0,  \t13
+    vadd.s16        \x6,  \t14, \t12
+    vsub.s16        \x1,  \t14, \t12
+    vsub.s16        \x5,  \x2,  \t11
+    vadd.s16        \x2,  \x2,  \t11
+    vsub.s16        \x3,  \x4,  \t10
+    vadd.s16        \x4,  \x4,  \t10
+.endm
+
+asm_function jsimd_idct_ifast_neon
+
+    DCT_TABLE       .req r0
+    COEF_BLOCK      .req r1
+    OUTPUT_BUF      .req r2
+    OUTPUT_COL      .req r3
+    TMP             .req ip
+
+    vpush           {d8-d15}
+
+    /* Load constants */
+    adr             TMP, jsimd_idct_ifast_neon_consts
+    vld1.16         {d0}, [TMP, :64]
+
+    /* Load all COEF_BLOCK into NEON registers with the following allocation:
+     *       0 1 2 3 | 4 5 6 7
+     *      ---------+--------
+     *   0 | d4      | d5
+     *   1 | d6      | d7
+     *   2 | d8      | d9
+     *   3 | d10     | d11
+     *   4 | d12     | d13
+     *   5 | d14     | d15
+     *   6 | d16     | d17
+     *   7 | d18     | d19
+     */
+    vld1.16         {d4, d5, d6, d7}, [COEF_BLOCK]!
+    vld1.16         {d8, d9, d10, d11}, [COEF_BLOCK]!
+    vld1.16         {d12, d13, d14, d15}, [COEF_BLOCK]!
+    vld1.16         {d16, d17, d18, d19}, [COEF_BLOCK]!
+    /* Dequantize */
+    vld1.16         {d20, d21, d22, d23}, [DCT_TABLE]!
+    vmul.s16        q2, q2, q10
+    vld1.16         {d24, d25, d26, d27}, [DCT_TABLE]!
+    vmul.s16        q3, q3, q11
+    vmul.s16        q4, q4, q12
+    vld1.16         {d28, d29, d30, d31}, [DCT_TABLE]!
+    vmul.s16        q5, q5, q13
+    vmul.s16        q6, q6, q14
+    vld1.16         {d20, d21, d22, d23}, [DCT_TABLE]!
+    vmul.s16        q7, q7, q15
+    vmul.s16        q8, q8, q10
+    vmul.s16        q9, q9, q11
+
+    /* Pass 1 : process columns from input, store into work array.*/
+    idct_helper     q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, q13, q14
+    /* Transpose */
+    vtrn.16 q2, q3
+    vtrn.16 q4, q5
+    vtrn.32 q2, q4
+    vtrn.32 q3, q5
+
+    vtrn.16 q6, q7
+    vtrn.16 q8, q9
+    vtrn.32 q6, q8
+    vtrn.32 q7, q9
+
+    vswp            d12, d5
+    vswp            d14, d7
+    vswp            d16, d9
+    vswp            d18, d11
+
+    /* Pass 2 */
+    idct_helper     q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, q13, q14
+    /* Transpose */
+
+    vtrn.16 q2, q3
+    vtrn.16 q4, q5
+    vtrn.32 q2, q4
+    vtrn.32 q3, q5
+
+    vtrn.16 q6, q7
+    vtrn.16 q8, q9
+    vtrn.32 q6, q8
+    vtrn.32 q7, q9
+
+    vswp            d12, d5
+    vswp            d14, d7
+    vswp            d16, d9
+    vswp            d18, d11
+
+    /* Descale and range limit */
+    vmov.s16        q15, #(0x80 << 5)
+    vqadd.s16       q2, q2, q15
+    vqadd.s16       q3, q3, q15
+    vqadd.s16       q4, q4, q15
+    vqadd.s16       q5, q5, q15
+    vqadd.s16       q6, q6, q15
+    vqadd.s16       q7, q7, q15
+    vqadd.s16       q8, q8, q15
+    vqadd.s16       q9, q9, q15
+    vqshrun.s16     d4, q2, #5
+    vqshrun.s16     d6, q3, #5
+    vqshrun.s16     d8, q4, #5
+    vqshrun.s16     d10, q5, #5
+    vqshrun.s16     d12, q6, #5
+    vqshrun.s16     d14, q7, #5
+    vqshrun.s16     d16, q8, #5
+    vqshrun.s16     d18, q9, #5
+
+    /* Store results to the output buffer */
+    .irp            x, d4, d6, d8, d10, d12, d14, d16, d18
+    ldr             TMP, [OUTPUT_BUF], #4
+    add             TMP, TMP, OUTPUT_COL
+    vst1.8          {\x}, [TMP]!
+    .endr
+
+    vpop            {d8-d15}
+    bx              lr
+
+    .unreq          DCT_TABLE
+    .unreq          COEF_BLOCK
+    .unreq          OUTPUT_BUF
+    .unreq          OUTPUT_COL
+    .unreq          TMP
+.endfunc
+
+.purgem idct_helper
+
+/*****************************************************************************/
+
+/*
+ * jsimd_idct_4x4_neon
+ *
+ * This function contains inverse-DCT code for getting reduced-size
+ * 4x4 pixels output from an 8x8 DCT block. It uses the same  calculations
+ * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
+ * function from jpeg-6b (jidctred.c).
+ *
+ * NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which
+ *       requires far fewer arithmetic operations and hence should be faster.
+ *       The primary purpose of this particular NEON optimized function is
+ *       bit exact compatibility with jpeg-6b.
+ *
+ * TODO: a bit better instructions scheduling can be achieved by expanding
+ *       idct_helper/transpose_4x4 macros and reordering instructions,
+ *       but readability will suffer somewhat.
+ */
+
+#define CONST_BITS  13
+
+#define FIX_0_211164243  (1730)  /* FIX(0.211164243) */
+#define FIX_0_509795579  (4176)  /* FIX(0.509795579) */
+#define FIX_0_601344887  (4926)  /* FIX(0.601344887) */
+#define FIX_0_720959822  (5906)  /* FIX(0.720959822) */
+#define FIX_0_765366865  (6270)  /* FIX(0.765366865) */
+#define FIX_0_850430095  (6967)  /* FIX(0.850430095) */
+#define FIX_0_899976223  (7373)  /* FIX(0.899976223) */
+#define FIX_1_061594337  (8697)  /* FIX(1.061594337) */
+#define FIX_1_272758580  (10426) /* FIX(1.272758580) */
+#define FIX_1_451774981  (11893) /* FIX(1.451774981) */
+#define FIX_1_847759065  (15137) /* FIX(1.847759065) */
+#define FIX_2_172734803  (17799) /* FIX(2.172734803) */
+#define FIX_2_562915447  (20995) /* FIX(2.562915447) */
+#define FIX_3_624509785  (29692) /* FIX(3.624509785) */
+
+.balign 16
+jsimd_idct_4x4_neon_consts:
+    .short     FIX_1_847759065     /* d0[0] */
+    .short     -FIX_0_765366865    /* d0[1] */
+    .short     -FIX_0_211164243    /* d0[2] */
+    .short     FIX_1_451774981     /* d0[3] */
+    .short     -FIX_2_172734803    /* d1[0] */
+    .short     FIX_1_061594337     /* d1[1] */
+    .short     -FIX_0_509795579    /* d1[2] */
+    .short     -FIX_0_601344887    /* d1[3] */
+    .short     FIX_0_899976223     /* d2[0] */
+    .short     FIX_2_562915447     /* d2[1] */
+    .short     1 << (CONST_BITS+1) /* d2[2] */
+    .short     0                   /* d2[3] */
+
+.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
+    vmull.s16       q14, \x4,  d2[2]
+    vmlal.s16       q14, \x8,  d0[0]
+    vmlal.s16       q14, \x14, d0[1]
+
+    vmull.s16       q13, \x16, d1[2]
+    vmlal.s16       q13, \x12, d1[3]
+    vmlal.s16       q13, \x10, d2[0]
+    vmlal.s16       q13, \x6,  d2[1]
+
+    vmull.s16       q15, \x4,  d2[2]
+    vmlsl.s16       q15, \x8,  d0[0]
+    vmlsl.s16       q15, \x14, d0[1]
+
+    vmull.s16       q12, \x16, d0[2]
+    vmlal.s16       q12, \x12, d0[3]
+    vmlal.s16       q12, \x10, d1[0]
+    vmlal.s16       q12, \x6,  d1[1]
+
+    vadd.s32        q10, q14, q13
+    vsub.s32        q14, q14, q13
+
+.if \shift > 16
+    vrshr.s32       q10,  q10, #\shift
+    vrshr.s32       q14,  q14, #\shift
+    vmovn.s32       \y26, q10
+    vmovn.s32       \y29, q14
+.else
+    vrshrn.s32      \y26, q10, #\shift
+    vrshrn.s32      \y29, q14, #\shift
+.endif
+
+    vadd.s32        q10, q15, q12
+    vsub.s32        q15, q15, q12
+
+.if \shift > 16
+    vrshr.s32       q10,  q10, #\shift
+    vrshr.s32       q15,  q15, #\shift
+    vmovn.s32       \y27, q10
+    vmovn.s32       \y28, q15
+.else
+    vrshrn.s32      \y27, q10, #\shift
+    vrshrn.s32      \y28, q15, #\shift
+.endif
+
+.endm
+
+asm_function jsimd_idct_4x4_neon
+
+    DCT_TABLE       .req r0
+    COEF_BLOCK      .req r1
+    OUTPUT_BUF      .req r2
+    OUTPUT_COL      .req r3
+    TMP1            .req r0
+    TMP2            .req r1
+    TMP3            .req r2
+    TMP4            .req ip
+
+    vpush           {d8-d15}
+
+    /* Load constants (d3 is just used for padding) */
+    adr             TMP4, jsimd_idct_4x4_neon_consts
+    vld1.16         {d0, d1, d2, d3}, [TMP4, :128]
+
+    /* Load all COEF_BLOCK into NEON registers with the following allocation:
+     *       0 1 2 3 | 4 5 6 7
+     *      ---------+--------
+     *   0 | d4      | d5
+     *   1 | d6      | d7
+     *   2 | d8      | d9
+     *   3 | d10     | d11
+     *   4 | -       | -
+     *   5 | d12     | d13
+     *   6 | d14     | d15
+     *   7 | d16     | d17
+     */
+    vld1.16         {d4, d5, d6, d7}, [COEF_BLOCK]!
+    vld1.16         {d8, d9, d10, d11}, [COEF_BLOCK]!
+    add COEF_BLOCK, COEF_BLOCK, #16
+    vld1.16         {d12, d13, d14, d15}, [COEF_BLOCK]!
+    vld1.16         {d16, d17}, [COEF_BLOCK]!
+    /* dequantize */
+    vld1.16         {d18, d19, d20, d21}, [DCT_TABLE]!
+    vmul.s16        q2, q2, q9
+    vld1.16         {d22, d23, d24, d25}, [DCT_TABLE]!
+    vmul.s16        q3, q3, q10
+    vmul.s16        q4, q4, q11
+    add             DCT_TABLE, DCT_TABLE, #16
+    vld1.16         {d26, d27, d28, d29}, [DCT_TABLE]!
+    vmul.s16        q5, q5, q12
+    vmul.s16        q6, q6, q13
+    vld1.16         {d30, d31}, [DCT_TABLE]!
+    vmul.s16        q7, q7, q14
+    vmul.s16        q8, q8, q15
+
+
+    /* Pass 1 */
+    idct_helper     d4, d6, d8, d10, d12, d14, d16, 12, d4, d6, d8, d10
+    transpose_4x4   d4, d6, d8, d10
+    idct_helper     d5, d7, d9, d11, d13, d15, d17, 12, d5, d7, d9, d11
+    transpose_4x4   d5, d7, d9, d11
+
+    /* Pass 2 */
+    idct_helper     d4, d6, d8, d10, d7, d9, d11, 19, d26, d27, d28, d29
+    transpose_4x4   d26, d27, d28, d29
+
+    /* Range limit */
+    vmov.u16        q15, #0x80
+    vadd.s16        q13, q13, q15
+    vadd.s16        q14, q14, q15
+    vqmovun.s16     d26, q13
+    vqmovun.s16     d27, q14
+
+    /* Store results to the output buffer */
+    ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
+    add             TMP1, TMP1, OUTPUT_COL
+    add             TMP2, TMP2, OUTPUT_COL
+    add             TMP3, TMP3, OUTPUT_COL
+    add             TMP4, TMP4, OUTPUT_COL
+
+#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
+    /* We can use much less instructions on little endian systems if the
+     * OS kernel is not configured to trap unaligned memory accesses
+     */
+    vst1.32         {d26[0]}, [TMP1]!
+    vst1.32         {d27[0]}, [TMP3]!
+    vst1.32         {d26[1]}, [TMP2]!
+    vst1.32         {d27[1]}, [TMP4]!
+#else
+    vst1.8          {d26[0]}, [TMP1]!
+    vst1.8          {d27[0]}, [TMP3]!
+    vst1.8          {d26[1]}, [TMP1]!
+    vst1.8          {d27[1]}, [TMP3]!
+    vst1.8          {d26[2]}, [TMP1]!
+    vst1.8          {d27[2]}, [TMP3]!
+    vst1.8          {d26[3]}, [TMP1]!
+    vst1.8          {d27[3]}, [TMP3]!
+
+    vst1.8          {d26[4]}, [TMP2]!
+    vst1.8          {d27[4]}, [TMP4]!
+    vst1.8          {d26[5]}, [TMP2]!
+    vst1.8          {d27[5]}, [TMP4]!
+    vst1.8          {d26[6]}, [TMP2]!
+    vst1.8          {d27[6]}, [TMP4]!
+    vst1.8          {d26[7]}, [TMP2]!
+    vst1.8          {d27[7]}, [TMP4]!
+#endif
+
+    vpop            {d8-d15}
+    bx              lr
+
+    .unreq          DCT_TABLE
+    .unreq          COEF_BLOCK
+    .unreq          OUTPUT_BUF
+    .unreq          OUTPUT_COL
+    .unreq          TMP1
+    .unreq          TMP2
+    .unreq          TMP3
+    .unreq          TMP4
+.endfunc
+
+.purgem idct_helper
+
+/*****************************************************************************/
+
+/*
+ * jsimd_idct_2x2_neon
+ *
+ * This function contains inverse-DCT code for getting reduced-size
+ * 2x2 pixels output from an 8x8 DCT block. It uses the same  calculations
+ * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
+ * function from jpeg-6b (jidctred.c).
+ *
+ * NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which
+ *       requires far fewer arithmetic operations and hence should be faster.
+ *       The primary purpose of this particular NEON optimized function is
+ *       bit exact compatibility with jpeg-6b.
+ */
+
+.balign 8
+jsimd_idct_2x2_neon_consts:
+    .short     -FIX_0_720959822    /* d0[0] */
+    .short     FIX_0_850430095     /* d0[1] */
+    .short     -FIX_1_272758580    /* d0[2] */
+    .short     FIX_3_624509785     /* d0[3] */
+
+.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
+    vshll.s16  q14,  \x4,  #15
+    vmull.s16  q13,  \x6,  d0[3]
+    vmlal.s16  q13,  \x10, d0[2]
+    vmlal.s16  q13,  \x12, d0[1]
+    vmlal.s16  q13,  \x16, d0[0]
+
+    vadd.s32   q10,  q14,  q13
+    vsub.s32   q14,  q14,  q13
+
+.if \shift > 16
+    vrshr.s32  q10,  q10,  #\shift
+    vrshr.s32  q14,  q14,  #\shift
+    vmovn.s32  \y26, q10
+    vmovn.s32  \y27, q14
+.else
+    vrshrn.s32 \y26, q10,  #\shift
+    vrshrn.s32 \y27, q14,  #\shift
+.endif
+
+.endm
+
+asm_function jsimd_idct_2x2_neon
+
+    DCT_TABLE       .req r0
+    COEF_BLOCK      .req r1
+    OUTPUT_BUF      .req r2
+    OUTPUT_COL      .req r3
+    TMP1            .req r0 /* same register as DCT_TABLE; used only after dequantization */
+    TMP2            .req ip
+
+    vpush           {d8-d15} /* d10-d13 (q5, q6) are overwritten below */
+
+    /* Load constants */
+    adr             TMP2, jsimd_idct_2x2_neon_consts
+    vld1.16         {d0}, [TMP2, :64]
+
+    /* Load all COEF_BLOCK into NEON registers with the following allocation:
+     *       0 1 2 3 | 4 5 6 7
+     *      ---------+--------
+     *   0 | d4      | d5
+     *   1 | d6      | d7
+     *   2 | -       | -
+     *   3 | d10     | d11
+     *   4 | -       | -
+     *   5 | d12     | d13
+     *   6 | -       | -
+     *   7 | d16     | d17
+     */
+
+    vld1.16         {d4, d5, d6, d7}, [COEF_BLOCK]!
+    add             COEF_BLOCK, COEF_BLOCK, #16 /* skip row 2 (8 x 16-bit) */
+    vld1.16         {d10, d11}, [COEF_BLOCK]!
+    add             COEF_BLOCK, COEF_BLOCK, #16 /* skip row 4 */
+    vld1.16         {d12, d13}, [COEF_BLOCK]!
+    add             COEF_BLOCK, COEF_BLOCK, #16 /* skip row 6 */
+    vld1.16         {d16, d17}, [COEF_BLOCK]!
+    /* Dequantize */
+    vld1.16         {d18, d19, d20, d21}, [DCT_TABLE]!
+    vmul.s16        q2, q2, q9
+    vmul.s16        q3, q3, q10
+    add             DCT_TABLE, DCT_TABLE, #16 /* skip table row 2 */
+    vld1.16         {d24, d25}, [DCT_TABLE]!
+    vmul.s16        q5, q5, q12
+    add             DCT_TABLE, DCT_TABLE, #16 /* skip table row 4 */
+    vld1.16         {d26, d27}, [DCT_TABLE]!
+    vmul.s16        q6, q6, q13
+    add             DCT_TABLE, DCT_TABLE, #16 /* skip table row 6 */
+    vld1.16         {d30, d31}, [DCT_TABLE]!
+    vmul.s16        q8, q8, q15
+
+    /* Pass 1 */
+    vmull.s16       q13, d6,  d0[3]
+    vmlal.s16       q13, d10, d0[2]
+    vmlal.s16       q13, d12, d0[1]
+    vmlal.s16       q13, d16, d0[0]
+    vmull.s16       q12, d7,  d0[3]
+    vmlal.s16       q12, d11, d0[2]
+    vmlal.s16       q12, d13, d0[1]
+    vmlal.s16       q12, d17, d0[0]
+    vshll.s16       q14, d4,  #15
+    vshll.s16       q15, d5,  #15
+    vadd.s32        q10, q14, q13
+    vsub.s32        q14, q14, q13
+    vrshrn.s32      d4,  q10, #13
+    vrshrn.s32      d6,  q14, #13
+    vadd.s32        q10, q15, q12
+    vsub.s32        q14, q15, q12
+    vrshrn.s32      d5,  q10, #13
+    vrshrn.s32      d7,  q14, #13
+    vtrn.16         q2,  q3
+    vtrn.32         q3,  q5
+
+    /* Pass 2 */
+    idct_helper     d4, d6, d10, d7, d11, 20, d26, d27
+
+    /* Range limit */
+    vmov.u16        q15, #0x80
+    vadd.s16        q13, q13, q15 /* q13 = d26:d27, i.e. both pass-2 outputs */
+    vqmovun.s16     d26, q13
+    vqmovun.s16     d27, q13 /* d26 was just overwritten; only lanes 4-7 (from d27) are stored */
+
+    /* Store results to the output buffer */
+    ldmia           OUTPUT_BUF, {TMP1, TMP2} /* first two pointers stored at OUTPUT_BUF */
+    add             TMP1, TMP1, OUTPUT_COL
+    add             TMP2, TMP2, OUTPUT_COL
+
+    vst1.8          {d26[0]}, [TMP1]! /* first pointer, byte 0 */
+    vst1.8          {d27[4]}, [TMP1]! /* first pointer, byte 1 */
+    vst1.8          {d26[1]}, [TMP2]! /* second pointer, byte 0 */
+    vst1.8          {d27[5]}, [TMP2]! /* second pointer, byte 1 */
+
+    vpop            {d8-d15}
+    bx              lr
+
+    .unreq          DCT_TABLE
+    .unreq          COEF_BLOCK
+    .unreq          OUTPUT_BUF
+    .unreq          OUTPUT_COL
+    .unreq          TMP1
+    .unreq          TMP2
+.endfunc
+
+.purgem idct_helper
+
+/*****************************************************************************/
+
+/*
+ * jsimd_ycc_rgba8888_convert_neon
+ * jsimd_ycc_rgb565_convert_neon
+ * Colorspace conversion YCbCr -> RGB
+ */
+
+
+.macro do_load size
+    .if \size == 8
+        vld1.8  {d4}, [U]!
+        vld1.8  {d5}, [V]!
+        vld1.8  {d0}, [Y]!
+        pld     [Y, #64]
+        pld     [U, #64]
+        pld     [V, #64]
+    .elseif \size == 4
+        vld1.8  {d4[0]}, [U]!
+        vld1.8  {d4[1]}, [U]!
+        vld1.8  {d4[2]}, [U]!
+        vld1.8  {d4[3]}, [U]!
+        vld1.8  {d5[0]}, [V]!
+        vld1.8  {d5[1]}, [V]!
+        vld1.8  {d5[2]}, [V]!
+        vld1.8  {d5[3]}, [V]!
+        vld1.8  {d0[0]}, [Y]!
+        vld1.8  {d0[1]}, [Y]!
+        vld1.8  {d0[2]}, [Y]!
+        vld1.8  {d0[3]}, [Y]!
+    .elseif \size == 2
+        vld1.8  {d4[4]}, [U]!
+        vld1.8  {d4[5]}, [U]!
+        vld1.8  {d5[4]}, [V]!
+        vld1.8  {d5[5]}, [V]!
+        vld1.8  {d0[4]}, [Y]!
+        vld1.8  {d0[5]}, [Y]!
+    .elseif \size == 1
+        vld1.8  {d4[6]}, [U]!
+        vld1.8  {d5[6]}, [V]!
+        vld1.8  {d0[6]}, [Y]!
+    .else
+        .error unsupported macroblock size
+    .endif
+.endm
+
+
+/* do_store bpp, size: store size pixels (8, 4, 2 or 1) at [RGB], advancing
+ * RGB past them.  Reads d10/d11/d12 (plus d13 for 32 bpp); the 16 bpp path
+ * also overwrites d26/d27.  Sizes 4, 2 and 1 use lanes 0-3, 4-5 and 6. */
+.macro do_store bpp, size
+    .if \bpp == 16
+            /* pack RGB565: d26 = low byte GGGBBBBB, d27 = high byte RRRRRGGG */
+            vmov      d27, d10          /* start from red; its top 5 bits are kept */
+            vsri.u8   d27, d11, #5      /* low 3 bits <- green bits 7..5 */
+            vsli.u8   d26, d11, #3      /* bits 7..3 <- green bits 4..0 (4..2 survive) */
+            vsri.u8   d26, d12, #3     /* low 5 bits <- blue bits 7..3 */
+
+        .if \size == 8
+            vst2.8  {d26, d27}, [RGB]!  /* interleaved: low byte, then high byte */
+        .elseif \size == 4
+            vst2.8  {d26[0], d27[0]}, [RGB]!
+            vst2.8  {d26[1], d27[1]}, [RGB]!
+            vst2.8  {d26[2], d27[2]}, [RGB]!
+            vst2.8  {d26[3], d27[3]}, [RGB]!
+        .elseif \size == 2
+            vst2.8  {d26[4], d27[4]}, [RGB]!
+            vst2.8  {d26[5], d27[5]}, [RGB]!
+        .elseif \size == 1
+            vst2.8  {d26[6], d27[6]}, [RGB]!
+        .else
+            .error unsupported macroblock size
+        .endif
+    .elseif \bpp == 24
+        .if \size == 8
+            vst3.8  {d10, d11, d12}, [RGB]!  /* 3 bytes per pixel: d10, d11, d12 */
+        .elseif \size == 4
+            vst3.8  {d10[0], d11[0], d12[0]}, [RGB]!
+            vst3.8  {d10[1], d11[1], d12[1]}, [RGB]!
+            vst3.8  {d10[2], d11[2], d12[2]}, [RGB]!
+            vst3.8  {d10[3], d11[3], d12[3]}, [RGB]!
+        .elseif \size == 2
+            vst3.8  {d10[4], d11[4], d12[4]}, [RGB]!
+            vst3.8  {d10[5], d11[5], d12[5]}, [RGB]!
+        .elseif \size == 1
+            vst3.8  {d10[6], d11[6], d12[6]}, [RGB]!
+        .else
+            .error unsupported macroblock size
+        .endif
+    .elseif \bpp == 32
+        .if \size == 8
+            vst4.8  {d10, d11, d12, d13}, [RGB]!  /* 4 bytes per pixel; d13 is the 4th */
+        .elseif \size == 4
+            vst4.8  {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
+            vst4.8  {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
+            vst4.8  {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
+            vst4.8  {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
+        .elseif \size == 2
+            vst4.8  {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
+            vst4.8  {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
+        .elseif \size == 1
+            vst4.8  {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
+        .else
+            .error unsupported macroblock size
+        .endif
+    .else
+        .error unsupported bpp
+    .endif
+.endm
+
+.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs
+/* YCbCr -> RGB on 8 lanes: reads d0 (Y), d4 (U), d5 (V) plus constants q1, d1. */
+.macro do_yuv_to_rgb
+    vaddw.u8        q3, q1, d4     /* q3 = u - 128 */
+    vaddw.u8        q4, q1, d5     /* q4 = v - 128 */
+    vmull.s16       q10, d6, d1[1] /* multiply by -11277 */
+    vmlal.s16       q10, d8, d1[2] /* multiply by -23401 */
+    vmull.s16       q11, d7, d1[1] /* multiply by -11277 */
+    vmlal.s16       q11, d9, d1[2] /* multiply by -23401 */
+    vmull.s16       q12, d8, d1[0] /* multiply by 22971 */
+    vmull.s16       q13, d9, d1[0] /* multiply by 22971 */
+    vmull.s16       q14, d6, d1[3] /* multiply by 29033 */
+    vmull.s16       q15, d7, d1[3] /* multiply by 29033 */
+    vrshrn.s32      d20, q10, #15  /* d20:d21 (= q10) <- green term, rounded >> 15 */
+    vrshrn.s32      d21, q11, #15
+    vrshrn.s32      d24, q12, #14  /* d24:d25 (= q12) <- red term, rounded >> 14 */
+    vrshrn.s32      d25, q13, #14
+    vrshrn.s32      d28, q14, #14  /* d28:d29 (= q14) <- blue term, rounded >> 14 */
+    vrshrn.s32      d29, q15, #14
+    vaddw.u8        q10, q10, d0   /* add luma (d0) to each chroma term */
+    vaddw.u8        q12, q12, d0
+    vaddw.u8        q14, q14, d0
+    vqmovun.s16     d1\g_offs, q10 /* saturate to unsigned 8 bits and narrow */
+    vqmovun.s16     d1\r_offs, q12
+    vqmovun.s16     d1\b_offs, q14
+.endm
+
+/* Apple gas crashes on adrl, work around that by using adr.
+ * But this requires a copy of these constants for each function.
+ */
+
+.balign 16
+jsimd_ycc_\colorid\()_neon_consts:
+    .short          0,      0,     0,      0      /* -> d0: padding only */
+    .short          22971, -11277, -23401, 29033  /* -> d1: V->R, U->G, V->G, U->B factors */
+    .short          -128,  -128,   -128,   -128   /* -> d2: chroma bias (low half of q1) */
+    .short          -128,  -128,   -128,   -128   /* -> d3: chroma bias (high half of q1) */
+/* Args: r0 = width, r1 = planes, r2 = start row, r3 = output rows, [sp] = num_rows */
+asm_function jsimd_ycc_\colorid\()_convert_neon
+    OUTPUT_WIDTH    .req r0
+    INPUT_BUF       .req r1
+    INPUT_ROW       .req r2
+    OUTPUT_BUF      .req r3
+    NUM_ROWS        .req r4
+
+    INPUT_BUF0      .req r5
+    INPUT_BUF1      .req r6
+    INPUT_BUF2      .req INPUT_BUF
+
+    RGB             .req r7
+    Y               .req r8
+    U               .req r9
+    V               .req r10
+    N               .req ip
+
+    /* Load constants to d1, d2, d3 (d0 is just used for padding) */
+    adr             ip, jsimd_ycc_\colorid\()_neon_consts
+    vld1.16         {d0, d1, d2, d3}, [ip, :128]
+
+    /* Save ARM registers and handle input arguments */
+    push            {r4, r5, r6, r7, r8, r9, r10, lr}
+    ldr             NUM_ROWS, [sp, #(4 * 8)]  /* 5th argument sits above the 8 pushed words */
+    ldr             INPUT_BUF0, [INPUT_BUF]
+    ldr             INPUT_BUF1, [INPUT_BUF, #4]
+    ldr             INPUT_BUF2, [INPUT_BUF, #8]  /* aliases r1, so it must be loaded last */
+    .unreq          INPUT_BUF
+
+    /* Save NEON registers */
+    vpush           {d8-d15}
+
+    /* Initially set d10, d11, d12, d13 to 0xFF (d13 is stored as the 4th byte at 32 bpp) */
+    vmov.u8         q5, #255
+    vmov.u8         q6, #255
+
+    /* Outer loop over scanlines; skipped entirely when NUM_ROWS < 1 */
+    cmp             NUM_ROWS, #1
+    blt             9f
+0:
+    ldr             Y, [INPUT_BUF0, INPUT_ROW, lsl #2]  /* row pointer from plane 0 */
+    ldr             U, [INPUT_BUF1, INPUT_ROW, lsl #2]  /* row pointer from plane 1 */
+    mov             N, OUTPUT_WIDTH
+    ldr             V, [INPUT_BUF2, INPUT_ROW, lsl #2]  /* row pointer from plane 2 */
+    add             INPUT_ROW, INPUT_ROW, #1
+    ldr             RGB, [OUTPUT_BUF], #4  /* fetch output row pointer, step to the next */
+
+    /* Inner loop over pixels, 8 at a time */
+    subs            N, N, #8
+    blt             2f                     /* fewer than 8 pixels in the row */
+1:
+    do_load         8
+    do_yuv_to_rgb
+    do_store        \bpp, 8
+    subs            N, N, #8
+    bge             1b
+    tst             N, #7                  /* N is -8..-1 here; low 3 bits = leftover count */
+    beq             8f                     /* nothing left over in this row */
+2:
+    tst             N, #4                  /* tail: bit 2 set -> a group of 4 is pending */
+    beq             3f
+    do_load         4
+3:
+    tst             N, #2                  /* bit 1 set -> a group of 2 is pending */
+    beq             4f
+    do_load         2
+4:
+    tst             N, #1                  /* bit 0 set -> one last pixel is pending */
+    beq             5f
+    do_load         1
+5:
+    do_yuv_to_rgb
+    tst             N, #4                  /* store only the groups that were loaded */
+    beq             6f
+    do_store        \bpp, 4
+6:
+    tst             N, #2
+    beq             7f
+    do_store        \bpp, 2
+7:
+    tst             N, #1
+    beq             8f
+    do_store        \bpp, 1
+8:
+    subs            NUM_ROWS, NUM_ROWS, #1
+    bgt             0b                     /* more scanlines to convert */
+9:
+    /* Restore all registers and return */
+    vpop            {d8-d15}
+    pop             {r4, r5, r6, r7, r8, r9, r10, pc}
+
+    .unreq          OUTPUT_WIDTH
+    .unreq          INPUT_ROW
+    .unreq          OUTPUT_BUF
+    .unreq          NUM_ROWS
+    .unreq          INPUT_BUF0
+    .unreq          INPUT_BUF1
+    .unreq          INPUT_BUF2
+    .unreq          RGB
+    .unreq          Y
+    .unreq          U
+    .unreq          V
+    .unreq          N
+.endfunc
+
+.purgem do_yuv_to_rgb
+
+.endm
+/* R/G/B columns: digit N makes the converter write that channel to d1N (d10..d12). */
+/*--------------------------------- id ----- bpp R  G  B */
+generate_jsimd_ycc_rgb_convert_neon rgba8888, 32, 0, 1, 2
+generate_jsimd_ycc_rgb_convert_neon rgb565,  16, 0, 1, 2
+
+
+.purgem do_load
+.purgem do_store
+
+/*****************************************************************************/
diff --git a/jsimd_neon.c b/jsimd_neon.c
new file mode 100644
index 0000000..6ad1bd3
--- /dev/null
+++ b/jsimd_neon.c
@@ -0,0 +1,170 @@
+/* Copyright (c) 2011,  NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of the NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+#include "jdct.h"
+#include <machine/cpu-features.h>
+
+
+#if defined(NV_ARM_NEON) && defined(__ARM_HAVE_NEON)
+/* External NEON routines (defined outside this file), wrapped by the functions below. */
+EXTERN(void) jsimd_ycc_rgba8888_convert_neon
+        JPP((JDIMENSION out_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_rgb565_convert_neon
+        JPP((JDIMENSION out_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+
+EXTERN(void) jsimd_idct_ifast_neon JPP((void * dct_table,
+                                        JCOEFPTR coef_block,
+                                        JSAMPARRAY output_buf,
+                                        JDIMENSION output_col));
+
+EXTERN(void) jsimd_idct_2x2_neon JPP((void * dct_table,
+                                        JCOEFPTR coef_block,
+                                        JSAMPARRAY output_buf,
+                                        JDIMENSION output_col));
+
+EXTERN(void) jsimd_idct_4x4_neon JPP((void * dct_table,
+                                        JCOEFPTR coef_block,
+                                        JSAMPARRAY output_buf,
+                                        JDIMENSION output_col));
+
+/* Convert num_rows rows of YCbCr input to RGBA8888 via the NEON routine. */
+GLOBAL(void)
+jsimd_ycc_rgba8888_convert (j_decompress_ptr cinfo,
+                            JSAMPIMAGE input_buf, JDIMENSION input_row,
+                            JSAMPARRAY output_buf, int num_rows)
+{
+    /* The routine takes the output width directly; nothing else in cinfo is read. */
+    jsimd_ycc_rgba8888_convert_neon(cinfo->output_width,
+                                    input_buf,
+                                    input_row,
+                                    output_buf, num_rows);
+}
+
+/* Convert num_rows rows of YCbCr input to RGB565 via the NEON routine. */
+GLOBAL(void)
+jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo,
+                          JSAMPIMAGE input_buf, JDIMENSION input_row,
+                          JSAMPARRAY output_buf, int num_rows)
+{
+    /* The routine takes the output width directly; nothing else in cinfo is read. */
+    jsimd_ycc_rgb565_convert_neon(cinfo->output_width,
+                                  input_buf,
+                                  input_row,
+                                  output_buf, num_rows);
+}
+
+/* Forward one block to the NEON ifast IDCT using compptr's table; cinfo is unused. */
+GLOBAL(void)
+jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                  JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+    jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+
+/* Forward one block to the NEON 2x2 IDCT using compptr's table; cinfo is unused. */
+GLOBAL(void)
+jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+    jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+/* Forward one block to the NEON 4x4 IDCT using compptr's table; cinfo is unused. */
+GLOBAL(void)
+jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+    jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+
+/* Returns 1 when the build configuration checked below holds, otherwise 0.
+ * Presumably consulted before selecting jsimd_idct_2x2 -- confirm at the caller. */
+GLOBAL(int)
+cap_neon_idct_2x2 (void)
+{
+    /* All conditions must hold; the && chain evaluates to exactly 1 or 0. */
+    return (DCTSIZE == 8)
+        && (sizeof(JCOEF) == 2)
+        && (BITS_IN_JSAMPLE == 8)
+        && (sizeof(JDIMENSION) == 4)
+        && (sizeof(ISLOW_MULT_TYPE) == 2);
+}
+
+/* Returns 1 when the build configuration checked below holds, otherwise 0.
+ * The conditions are the same ones cap_neon_idct_2x2 tests.
+ * Presumably consulted before selecting jsimd_idct_4x4 -- confirm at the caller. */
+GLOBAL(int)
+cap_neon_idct_4x4 (void)
+{
+    /* All conditions must hold; the && chain evaluates to exactly 1 or 0. */
+    return (DCTSIZE == 8)
+        && (sizeof(JCOEF) == 2)
+        && (BITS_IN_JSAMPLE == 8)
+        && (sizeof(JDIMENSION) == 4)
+        && (sizeof(ISLOW_MULT_TYPE) == 2);
+}
+
+/* Returns 1 when the build configuration checked below holds, otherwise 0.
+ * Unlike the 2x2/4x4 queries this one tests IFAST_MULT_TYPE and IFAST_SCALE_BITS.
+ * Presumably consulted before selecting jsimd_idct_ifast -- confirm at the caller. */
+GLOBAL(int)
+cap_neon_idct_ifast (void)
+{
+    /* All conditions must hold; the && chain evaluates to exactly 1 or 0. */
+    /* The multiplier type must be 2 bytes wide and the scale exactly 2 bits. */
+    return (DCTSIZE == 8)
+        && (sizeof(JCOEF) == 2)
+        && (BITS_IN_JSAMPLE == 8)
+        && (sizeof(JDIMENSION) == 4)
+        && (sizeof(IFAST_MULT_TYPE) == 2)
+        && (IFAST_SCALE_BITS == 2);
+}
+
+/* Returns 1 when samples are 8 bits, JDIMENSION is 4 bytes and RGB_PIXELSIZE
+ * is 3 or 4; otherwise 0.  Presumably consulted before selecting the NEON
+ * colour converters -- confirm at the caller. */
+GLOBAL(int)
+cap_neon_ycc_rgb (void)
+{
+    /* All conditions must hold; the && chain evaluates to exactly 1 or 0. */
+    return (BITS_IN_JSAMPLE == 8)
+        && (sizeof(JDIMENSION) == 4)
+        && ((RGB_PIXELSIZE == 3) || (RGB_PIXELSIZE == 4));
+}
+
+#endif
+
+
diff --git a/jsimd_neon.h b/jsimd_neon.h
new file mode 100644
index 0000000..24320aa
--- /dev/null
+++ b/jsimd_neon.h
@@ -0,0 +1,78 @@
+/* Copyright (c) 2011,  NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of the NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __JSIMD_NEON_H__
+#define __JSIMD_NEON_H__
+#include "jinclude.h"
+#include "jpeglib.h"
+#include <machine/cpu-features.h>
+
+#if defined(NV_ARM_NEON) && defined(__ARM_HAVE_NEON)
+
+#if defined(ANDROID_ARMV6_IDCT)
+#error "ANDROID_ARMV6_IDCT and NV_ARM_NEON are mutually exclusive modes, both have been defined"
+#endif
+
+/* Run the NEON 2x2 IDCT on one coefficient block using compptr->dct_table. */
+EXTERN(void) jsimd_idct_2x2
+        JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+             JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+
+/* Run the NEON ifast IDCT on one coefficient block using compptr->dct_table. */
+EXTERN(void) jsimd_idct_ifast
+        JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+             JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+
+/* Run the NEON 4x4 IDCT on one coefficient block using compptr->dct_table. */
+EXTERN(void) jsimd_idct_4x4
+        JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+             JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+
+/* 1 if the build-time checks tied to jsimd_idct_2x2 pass, else 0. */
+EXTERN(int) cap_neon_idct_2x2 JPP((void));
+
+/* 1 if the build-time checks tied to jsimd_idct_4x4 pass, else 0. */
+EXTERN(int) cap_neon_idct_4x4 JPP((void));
+
+/* 1 if the build-time checks tied to jsimd_idct_ifast pass, else 0. */
+EXTERN(int) cap_neon_idct_ifast JPP((void));
+
+/* 1 if the build-time checks tied to the colour converters pass, else 0. */
+EXTERN(int) cap_neon_ycc_rgb JPP((void));
+
+/* Convert num_rows rows of YCbCr input, starting at input_row, to RGB565. */
+EXTERN(void) jsimd_ycc_rgb565_convert
+        JPP((j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+             JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows));
+
+/* Convert num_rows rows of YCbCr input, starting at input_row, to RGBA8888. */
+EXTERN(void) jsimd_ycc_rgba8888_convert
+        JPP((j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+             JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows));
+#endif
+
+#endif