JPEG decode: new blitter on BYT

BZ: 140105

1. Use CM for YUV->RGBA_linear blit on BYT
2. Use FOURCC instead of HAL_PIXEL_FORMAT in RenderTarget.pixel_format
3. Implement scaling output for BYT
4. Implement header-only parsing to optimize boundaryMode decode
5. Let libjpeg enable/disable available capabilities
6. Implement 422h->nv12+nv21+yv12+yuy2 blit (for usb-camera)

Change-Id: Ie3df2707134533968721e942bbc81440e37efe0a
Signed-off-by: Cheng Yao <yao.cheng@intel.com>
diff --git a/imagedecoder/Android.mk b/imagedecoder/Android.mk
index 70928e7..153d594 100644
--- a/imagedecoder/Android.mk
+++ b/imagedecoder/Android.mk
@@ -11,7 +11,6 @@
 LOCAL_C_INCLUDES += \
     $(LOCAL_PATH) \
     $(TARGET_OUT_HEADERS)/libva \
-    $(TARGET_OUT_HEADERS)/libmix_videovpp
 
 LOCAL_COPY_HEADERS_TO  := libjpegdec
 
@@ -29,19 +28,29 @@
     libhardware
 
 LOCAL_LDLIBS += -lpthread
-LOCAL_CFLAGS += -Wno-multichar
+LOCAL_CFLAGS += -Wno-multichar -DLOG_TAG=\"ImageDecoder\"
+LOCAL_CFLAGS += -DLOG_NDEBUG=0
 
 ifeq ($(TARGET_BOARD_PLATFORM),baytrail)
+GPGPU_OBJ_NAME := libjpeg_cm_genx.isa
+GPGPU_OBJS += $(PRODUCT_OUT)/system/lib/$(GPGPU_OBJ_NAME)
+LOCAL_C_INCLUDES += $(TARGET_OUT_HEADERS)/ufo
 LOCAL_SRC_FILES += JPEGBlitter_gen.cpp
 LOCAL_SRC_FILES += JPEGDecoder_gen.cpp
+LOCAL_C_INCLUDES += $(TOP)/vendor/intel/hardware/PRIVATE/ufo/inc
+LOCAL_CFLAGS += -Wno-non-virtual-dtor -DGFXGEN
+LOCAL_LDFLAGS += -L$(INTEL_CM_RUNTIME)/lib/x86/ -l:igfxcmrt32.so
+$(GPGPU_OBJS):
+	cp $(LOCAL_PATH)/$(GPGPU_OBJ_NAME) $@
 else
 LOCAL_SRC_FILES += JPEGBlitter_img.cpp
 LOCAL_SRC_FILES += JPEGDecoder_img.cpp
 endif
-
 LOCAL_MODULE:= libjpegdec
 LOCAL_MODULE_TAGS := optional
 
+$(LOCAL_MODULE): $(GPGPU_OBJS)
+
 include $(BUILD_SHARED_LIBRARY)
 
 ifeq ($(TARGET_BOARD_PLATFORM),baytrail)
@@ -53,7 +62,6 @@
 LOCAL_C_INCLUDES += \
     $(LOCAL_PATH) \
     $(TARGET_OUT_HEADERS)/libva \
-    $(TARGET_OUT_HEADERS)/libmix_videovpp
 
 LOCAL_SHARED_LIBRARIES += \
     libcutils \
@@ -66,6 +74,7 @@
 
 LOCAL_LDLIBS += -lpthread
 LOCAL_CFLAGS += -Wno-multichar
+LOCAL_CFLAGS += -DLOG_NDEBUG=0
 
 LOCAL_MODULE:= testjpegdec
 LOCAL_MODULE_TAGS := optional
@@ -78,12 +87,15 @@
 LOCAL_SRC_FILES += \
     JPEGDecoder_libjpeg_wrapper.cpp
 
+ifeq ($(TARGET_BOARD_PLATFORM),baytrail)
+LOCAL_CFLAGS += -DGFXGEN
+endif
+
 LOCAL_C_INCLUDES += \
     $(LOCAL_PATH) \
     $(call include-path-for, jpeg) \
     $(TARGET_OUT_HEADERS)/libva \
     $(TARGET_OUT_HEADERS)/libjpegdec \
-    $(TARGET_OUT_HEADERS)/libmix_videovpp
 
 LOCAL_COPY_HEADERS_TO  := libjpeg_hw
 
@@ -94,12 +106,15 @@
     libcutils \
     libutils \
     liblog  \
+    libva \
+    libva-android \
     libjpegdec \
     libhardware
 
 LOCAL_LDLIBS += -lpthread
-LOCAL_CFLAGS += -Wno-multichar
+LOCAL_CFLAGS += -Wno-multichar -DLOG_TAG=\"ImageDecoder\"
 LOCAL_CFLAGS += -DUSE_INTEL_JPEGDEC
+LOCAL_CFLAGS += -DLOG_NDEBUG=0
 
 LOCAL_MODULE:= libjpeg_hw
 LOCAL_MODULE_TAGS := optional
diff --git a/imagedecoder/ImageDecoderTrace.h b/imagedecoder/ImageDecoderTrace.h
index 466b606..a3dadc0 100644
--- a/imagedecoder/ImageDecoderTrace.h
+++ b/imagedecoder/ImageDecoderTrace.h
@@ -50,11 +50,6 @@
 #else
 // for Android OS
 
-#ifdef LOG_TAG
-#undef LOG_TAG
-#endif
-#define LOG_TAG "ImageDecoder"
-
 #ifdef LOG_NDEBUG
 #undef LOG_NDEBUG
 #endif
diff --git a/imagedecoder/JPEGBlitter.cpp b/imagedecoder/JPEGBlitter.cpp
index cb1e917..f6f0f95 100644
--- a/imagedecoder/JPEGBlitter.cpp
+++ b/imagedecoder/JPEGBlitter.cpp
@@ -1,6 +1,5 @@
 /* INTEL CONFIDENTIAL
 * Copyright (c) 2013 Intel Corporation.  All rights reserved.
-* Copyright (c) Imagination Technologies Limited, UK
 *
 * The source code contained or described herein and all documents
 * related to the source code ("Material") are owned by Intel
@@ -25,64 +24,29 @@
 *    Yao Cheng <yao.cheng@intel.com>
 *
 */
-//#define LOG_NDEBUG 0
 
 #include <va/va.h>
 #include <va/va_tpi.h>
 #include "JPEGBlitter.h"
 #include "JPEGDecoder.h"
-
 #ifdef NDEBUG
 #undef NDEBUG
 #endif
 #include <assert.h>
-//#define LOG_TAG "JPEGBlitter"
 
-JpegBlitter::JpegBlitter()
+JpegBlitter::JpegBlitter(VADisplay display, VAConfigID vpCfgId, VAContextID vpCtxId)
     :mDecoder(NULL),
-    mConfigId(VA_INVALID_ID),
-    mContextId(VA_INVALID_ID)
+    mDisplay(display),
+    mConfigId(vpCfgId),
+    mContextId(vpCtxId),
+    mPrivate(NULL),
+    mInitialized(false)
 {
-    // empty
 }
 
 JpegBlitter::~JpegBlitter()
 {
-    if (mDecoder) {
-        destroyContext();
-    }
+    deinit();
 }
 
-void JpegBlitter::destroyContext()
-{
-    if (mDecoder == NULL)
-        return;
-
-    Mutex::Autolock autoLock(mLock);
-    if (mDecoder) {
-        vaDestroyContext(mDecoder->mDisplay, mContextId);
-        mContextId = VA_INVALID_ID;
-        vaDestroyConfig(mDecoder->mDisplay, mConfigId);
-        mConfigId = VA_INVALID_ID;
-        mDecoder = NULL;
-    }
-}
-
-void JpegBlitter::setDecoder(JpegDecoder &decoder)
-{
-    destroyContext();
-    Mutex::Autolock autoLock(mLock);
-    mDecoder = &decoder;
-    VAConfigAttrib  vpp_attrib;
-    VAStatus st;
-    vpp_attrib.type  = VAConfigAttribRTFormat;
-    vpp_attrib.value = VA_RT_FORMAT_YUV420;
-    st = vaCreateConfig(mDecoder->mDisplay, VAProfileNone,
-                                VAEntrypointVideoProc,
-                                &vpp_attrib,
-                                1, &mConfigId);
-    assert(st == VA_STATUS_SUCCESS);
-    st = vaCreateContext(mDecoder->mDisplay, mConfigId, 1920, 1080, 0, NULL, 0, &mContextId);
-    assert(st == VA_STATUS_SUCCESS);
-}
 
diff --git a/imagedecoder/JPEGBlitter.h b/imagedecoder/JPEGBlitter.h
index b9fcc08..9f828fe 100644
--- a/imagedecoder/JPEGBlitter.h
+++ b/imagedecoder/JPEGBlitter.h
@@ -1,6 +1,5 @@
 /* INTEL CONFIDENTIAL
 * Copyright (c) 2013 Intel Corporation.  All rights reserved.
-* Copyright (c) Imagination Technologies Limited, UK
 *
 * The source code contained or described herein and all documents
 * related to the source code ("Material") are owned by Intel
@@ -29,25 +28,49 @@
 #ifndef JPEG_BLITTER_H
 #define JPEG_BLITTER_H
 
-#include <VideoVPPBase.h>
 #include "JPEGCommon.h"
 #include <utils/threads.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <hardware/gralloc.h>
+
+using namespace android;
 
 class JpegDecoder;
+typedef void* BlitEvent;
 
 class JpegBlitter
 {
 public:
-    JpegBlitter();
+    JpegBlitter(VADisplay display, VAConfigID vpCfgId, VAContextID vpCtxId);
     virtual ~JpegBlitter();
-    virtual void setDecoder(JpegDecoder &decoder);
-    virtual JpegDecodeStatus blit(RenderTarget &src, RenderTarget &dst);
+    virtual void init(JpegDecoder &dec);
+    virtual void deinit();
+    virtual JpegDecodeStatus blit(RenderTarget &src, RenderTarget &dst, int scale_factor);
+    virtual JpegDecodeStatus getRgbaTile(RenderTarget &src,
+                                         uint8_t *sysmem,
+                                         int left, int top, int width, int height, int scale_factor);
+    virtual JpegDecodeStatus blitToLinearRgba(RenderTarget &src,
+                                              uint8_t *sysmem,
+                                              uint32_t width, uint32_t height,
+                                              BlitEvent &event, int scale_factor);
+    virtual JpegDecodeStatus blitToCameraSurfaces(RenderTarget &src,
+                                                  buffer_handle_t dst_nv12,
+                                                  buffer_handle_t dst_yuy2,
+                                                  uint8_t *dst_nv21,
+                                                  uint8_t *dst_yv12,
+                                                  uint32_t width, uint32_t height,
+                                                  BlitEvent &event);
+    virtual void syncBlit(BlitEvent &event);
 private:
     mutable Mutex mLock;
-    virtual void destroyContext();
     JpegDecoder *mDecoder;
+    VADisplay mDisplay;
     VAConfigID mConfigId;
     VAContextID mContextId;
+    void *mPrivate;
+    bool mInitialized;
 };
 
 #endif
diff --git a/imagedecoder/JPEGBlitter_gen.cpp b/imagedecoder/JPEGBlitter_gen.cpp
index b1167d3..e819883 100644
--- a/imagedecoder/JPEGBlitter_gen.cpp
+++ b/imagedecoder/JPEGBlitter_gen.cpp
@@ -1,6 +1,5 @@
 /* INTEL CONFIDENTIAL
 * Copyright (c) 2013 Intel Corporation.  All rights reserved.
-* Copyright (c) Imagination Technologies Limited, UK
 *
 * The source code contained or described herein and all documents
 * related to the source code ("Material") are owned by Intel
@@ -25,32 +24,45 @@
 *    Yao Cheng <yao.cheng@intel.com>
 *
 */
-//#define LOG_NDEBUG 0
 
 #include "JPEGBlitter.h"
 #include "JPEGCommon_Gen.h"
 #include "JPEGDecoder.h"
-
+#include <utils/Timers.h>
 #include <va/va.h>
 #include <va/va_tpi.h>
 #include "ImageDecoderTrace.h"
 
+#include <cm/cm_rt.h>
 #ifdef NDEBUG
 #undef NDEBUG
 #endif
-
 #include <assert.h>
 
+#define NV12_INTERMEDIATE 0
+#define PRE_INIT_CM 1
+#define BLIT_METHOD_CM 1 // 0 for VA+GpuCopy method, 1 for pure CM method
+#define DUMP_RGBA 0
+
+#define CM_KERNEL_FUNC_NAME yuv_tiled_to_rgba_linear
+
 #define JD_CHECK(err, label) \
         if (err) { \
-            ETRACE("%s::%d: failed: %d", __PRETTY_FUNCTION__, __LINE__, err); \
+            ETRACE("%s::%d: failed: %d", __FUNCTION__, __LINE__, err); \
             goto label; \
         }
 
 #define JD_CHECK_RET(err, label, retcode) \
         if (err) { \
             status = retcode; \
-            ETRACE("%s::%d: failed: %d", __PRETTY_FUNCTION__, __LINE__, err); \
+            ETRACE("%s::%d: failed: %d", __FUNCTION__, __LINE__, err); \
+            goto label; \
+        }
+
+#define JD_CM_CHECK_RET(err, label, retcode) \
+        if (err) { \
+            status = retcode; \
+            ETRACE("CM %s::%d: failed: 0x%08x", __FUNCTION__, __LINE__, err); \
             goto label; \
         }
 
@@ -59,11 +71,13 @@
     switch(fourcc) {
     case VA_FOURCC_NV12:
     case VA_FOURCC_YUY2:
-    case VA_FOURCC_422H:
-    case VA_FOURCC_422V:
+    case VA_FOURCC_UYVY:
+    case VA_FOURCC('4','0','0','P'):
     case VA_FOURCC_411P:
     case VA_FOURCC_411R:
     case VA_FOURCC_IMC3:
+    case VA_FOURCC_422H:
+    case VA_FOURCC_422V:
     case VA_FOURCC_444P:
     case VA_FOURCC_YV12:
         return VAProcColorStandardBT601;
@@ -72,246 +86,7 @@
     }
 }
 
-void write_to_file(const char *file, const VAImage *pImg, const uint8_t *pSrc)
-{
-    FILE *fp = fopen(file, "wb");
-    if (!fp) {
-        return;
-    }
-    const uint8_t *pY, *pU, *pV, *pYUYV, *pRGBA, *pUV;
-    float h_samp_factor, v_samp_factor;
-    int row, col;
-    char fourccstr[5];
-    VTRACE("Dumping %s buffer to %s", fourcc2str(fourccstr, pImg->format.fourcc), file);
-    switch (pImg->format.fourcc) {
-    case VA_FOURCC_IMC3:
-        h_samp_factor = 1;
-        v_samp_factor = 0.5;
-        break;
-    case VA_FOURCC_422H:
-        h_samp_factor = 0.5;
-        v_samp_factor = 1;
-        break;
-    case VA_FOURCC_444P:
-        h_samp_factor = 1;
-        v_samp_factor = 1;
-        break;
-    case VA_FOURCC_YUY2:
-    {
-        pYUYV = pSrc + pImg->offsets[0];
-        VTRACE("YUY2 output width %u stride %u", pImg->width, pImg->pitches[0]);
-        for (row = 0; row < pImg->height; ++row) {
-            fwrite(pYUYV, 2, pImg->width, fp);
-            pYUYV += pImg->pitches[0];
-        }
-    }
-    fclose(fp);
-    return;
-    case VA_FOURCC_NV12:
-    {
-        pY = pSrc + pImg->offsets[0];
-        pUV = pSrc + pImg->offsets[1];
-        VTRACE("NV12 output width %u stride %u, %u", pImg->width, pImg->pitches[0], pImg->pitches[1]);
-        for (row = 0; row < pImg->height; ++row) {
-            fwrite(pY, 1, pImg->width, fp);
-            pY += pImg->pitches[0];
-        }
-        for (row = 0; row < pImg->height/2; ++row) {
-            fwrite(pUV, 1, pImg->width, fp);
-            pUV += pImg->pitches[1];
-        }
-    }
-    fclose(fp);
-    return;
-    case VA_FOURCC_RGBA:
-    case VA_FOURCC_BGRA:
-    case VA_FOURCC_ARGB:
-    case VA_FOURCC('A', 'B', 'G', 'R'):
-    {
-        pRGBA = pSrc + pImg->offsets[0];
-        VTRACE("RGBA output width %u stride %u", pImg->width, pImg->pitches[0]);
-        for (row = 0; row < pImg->height; ++row) {
-            fwrite(pRGBA, 4, pImg->width, fp);
-            pRGBA += pImg->pitches[0];
-        }
-    }
-    fclose(fp);
-    return;
-    default:
-        // non-supported
-        {
-            char fourccstr[5];
-            ETRACE("%s: Not-supported input YUV format", fourcc2str(fourccstr, pImg->format.fourcc));
-        }
-        return;
-    }
-    pY = pSrc + pImg->offsets[0];
-    pU = pSrc + pImg->offsets[1];
-    pV = pSrc + pImg->offsets[2];
-    // Y
-    for (row = 0; row < pImg->height; ++row) {
-        fwrite(pY, 1, pImg->width, fp);
-        pY += pImg->pitches[0];
-    }
-    // U
-    for (row = 0; row < pImg->height * v_samp_factor; ++row) {
-        fwrite(pU, 1, pImg->width * h_samp_factor, fp);
-        pU += pImg->pitches[1];
-    }
-    // V
-    for (row = 0; row < pImg->height * v_samp_factor; ++row) {
-        fwrite(pV, 1, pImg->width * h_samp_factor, fp);
-        pV += pImg->pitches[2];
-    }
-    fclose(fp);
-}
-
-static void write_to_YUY2(uint8_t *pDst,
-                          uint32_t dst_w,
-                          uint32_t dst_h,
-                          uint32_t dst_stride,
-                          const VAImage *pImg,
-                          const uint8_t *pSrc)
-{
-    const uint8_t *pY, *pU, *pV;
-    float h_samp_factor, v_samp_factor;
-    int row, col;
-    char fourccstr[5];
-    uint32_t copy_w = (dst_w < pImg->width)? dst_w: pImg->width;
-    uint32_t copy_h = (dst_h < pImg->height)? dst_h: pImg->height;
-    switch (pImg->format.fourcc) {
-    case VA_FOURCC_IMC3:
-        h_samp_factor = 0.5;
-        v_samp_factor = 0.5;
-        break;
-    case VA_FOURCC_422H:
-        h_samp_factor = 0.5;
-        v_samp_factor = 1;
-        break;
-    case VA_FOURCC_444P:
-        h_samp_factor = 1;
-        v_samp_factor = 1;
-        break;
-    default:
-        // non-supported
-        ETRACE("%s to YUY2: Not-supported input YUV format", fourcc2str(fourccstr, pImg->format.fourcc));
-        return;
-    }
-    pY = pSrc + pImg->offsets[0];
-    pU = pSrc + pImg->offsets[1];
-    pV = pSrc + pImg->offsets[2];
-    for (row = 0; row < copy_h; ++row) {
-        for (col = 0; col < copy_w; ++col) {
-            // Y
-            *(pDst + 2 * col) = *(pY + col);
-            uint32_t actual_col = h_samp_factor * col;
-            if (col % 2 == 1) {
-                // U
-                *(pDst + 2 * col + 1) = *(pU + actual_col);
-            }
-            else {
-                // V
-                *(pDst + 2 * col + 1) = *(pV + actual_col);
-            }
-        }
-        pDst += dst_stride;
-        pY += pImg->pitches[0];
-        uint32_t actual_row = row * v_samp_factor;
-        pU = pSrc + pImg->offsets[1] + actual_row * pImg->pitches[1];
-        pV = pSrc + pImg->offsets[2] + actual_row * pImg->pitches[2];
-    }
-}
-
-static void dumpSurface(const char* filename, VADisplay display, VASurfaceID surface)
-{
-    VAStatus st;
-    VAImage img;
-    uint8_t *buf;
-    st = vaDeriveImage(display, surface, &img);
-    if (st) {
-        ETRACE("vaDeriveImage failed with %d", st);
-        return;
-    }
-    uint32_t in_fourcc = img.format.fourcc;
-    VTRACE("Start dumping %s surface to %s", fourcc2str(NULL, in_fourcc), filename);
-    st = vaMapBuffer(display, img.buf, (void **)&buf);
-    if (st) {
-        ETRACE("vaMapBuffer failed with %d", st);
-        vaDestroyImage(display, img.image_id);
-        return;
-    }
-    VTRACE("start write_to_file");
-    write_to_file(filename, &img, buf);
-    vaUnmapBuffer(display, img.buf);
-    vaDestroyImage(display, img.image_id);
-}
-
-static void dumpGallocBuffer(const char* filename,
-                                buffer_handle_t handle,
-                                int width,
-                                int height,
-                                uint32_t fourcc)
-{
-    // NOT IMPLEMENTED
-}
-
-
-static JpegDecodeStatus swBlit(VADisplay display, VAContextID context,
-                 VASurfaceID in_surf, VARectangle *in_rect, uint32_t in_fourcc,
-                 VASurfaceID out_surf, VARectangle *out_rect, uint32_t out_fourcc)
-{
-    assert(out_fourcc == VA_FOURCC_YUY2);
-    assert((in_fourcc == VA_FOURCC_IMC3) || (in_fourcc == VA_FOURCC_422H) || (in_fourcc == VA_FOURCC_444P));
-    VAStatus st;
-    char str[10];
-    JpegDecodeStatus status;
-    VAImage in_img, out_img;
-    in_img.image_id = VA_INVALID_ID;
-    in_img.buf = VA_INVALID_ID;
-    out_img.image_id = VA_INVALID_ID;
-    out_img.buf = VA_INVALID_ID;
-    uint8_t *in_buf, *out_buf;
-    in_buf = out_buf = NULL;
-    st = vaDeriveImage(display, in_surf, &in_img);
-    JD_CHECK_RET(st, cleanup, JD_BLIT_FAILURE);
-    st = vaDeriveImage(display, out_surf, &out_img);
-    JD_CHECK_RET(st, cleanup, JD_BLIT_FAILURE);
-    st = vaMapBuffer(display, in_img.buf, (void **)&in_buf);
-    JD_CHECK_RET(st, cleanup, JD_BLIT_FAILURE);
-    st = vaMapBuffer(display, out_img.buf, (void **)&out_buf);
-    JD_CHECK_RET(st, cleanup, JD_BLIT_FAILURE);
-    VTRACE("%s in: %s, %ux%u, size %u, offset=%u,%u,%u, pitch=%u,%u,%u", __FUNCTION__,
-        fourcc2str(NULL, in_fourcc),
-        in_img.width,
-        in_img.height,
-        in_img.data_size,
-        in_img.offsets[0], in_img.offsets[1], in_img.offsets[2],
-        in_img.pitches[0], in_img.pitches[1], in_img.pitches[2]);
-    VTRACE("%s out: %s, %ux%u, size %u, offset=%u,%u,%u, pitch=%u,%u,%u", __FUNCTION__,
-        fourcc2str(NULL, out_fourcc),
-        out_img.width,
-        out_img.height,
-        out_img.data_size,
-        out_img.offsets[0], out_img.offsets[1], out_img.offsets[2],
-        out_img.pitches[0], out_img.pitches[1], out_img.pitches[2]);
-    write_to_YUY2(out_buf, out_img.width, out_img.height, out_img.pitches[0], &in_img, in_buf);
-    vaUnmapBuffer(display, in_img.buf);
-    vaUnmapBuffer(display, out_img.buf);
-    vaDestroyImage(display, in_img.image_id);
-    vaDestroyImage(display, out_img.image_id);
-    VTRACE("%s Finished SW CSC %s=>%s", __FUNCTION__, fourcc2str(str, in_fourcc), fourcc2str(str + 5, out_fourcc));
-    return JD_SUCCESS;
-
-cleanup:
-    ETRACE("%s failed to do swBlit %s=>%s", __FUNCTION__, fourcc2str(str, in_fourcc), fourcc2str(str + 5, out_fourcc));
-    if (in_buf != NULL) vaUnmapBuffer(display, in_img.buf);
-    if (out_buf != NULL) vaUnmapBuffer(display, out_img.buf);
-    if (in_img.image_id != VA_INVALID_ID) vaDestroyImage(display, in_img.image_id);
-    if (out_img.image_id != VA_INVALID_ID) vaDestroyImage(display, out_img.image_id);
-    return status;
-}
-
-static JpegDecodeStatus hwBlit(VADisplay display, VAContextID context,
+static JpegDecodeStatus vaVppBlit(VADisplay display, VAContextID context,
                  VASurfaceID in_surf, VARectangle *in_rect, uint32_t in_fourcc,
                  VASurfaceID out_surf, VARectangle *out_rect, uint32_t out_fourcc)
 {
@@ -324,9 +99,6 @@
     nsecs_t t1, t2;
 
     memset(&vpp_param, 0, sizeof(VAProcPipelineParameterBuffer));
-#if PRE_TOUCH_SURFACE
-    //zeroSurfaces(display, &out_surf, 1);
-#endif
     t1 = systemTime();
     vpp_param.surface                 = in_surf;
     vpp_param.output_region           = out_rect;
@@ -371,9 +143,9 @@
     JD_CHECK_RET(vpp_status, cleanup, JD_BLIT_FAILURE);
     t2 = systemTime();
     VTRACE("Finished HW CSC %s(%d,%d,%u,%u)=>%s(%d,%d,%u,%u) for %f ms",
-        fourcc2str(str, in_fourcc),
+        fourcc2str(in_fourcc, str),
         in_rect->x, in_rect->y, in_rect->width, in_rect->height,
-        fourcc2str(str + 5, out_fourcc),
+        fourcc2str(out_fourcc, str + 5),
         out_rect->x, out_rect->y, out_rect->width, out_rect->height,
         ns2us(t2 - t1)/1000.0);
 
@@ -388,38 +160,175 @@
                  VASurfaceID in_surf, VARectangle *in_rect, uint32_t in_fourcc,
                  VASurfaceID out_surf, VARectangle *out_rect, uint32_t out_fourcc)
 {
+    char fourccstr[10];
+    ALOGD("%s, in %s, out %s", __FUNCTION__, fourcc2str(in_fourcc, fourccstr), fourcc2str(out_fourcc, fourccstr + 5));
     if (((in_fourcc == VA_FOURCC_422H) ||
+        (in_fourcc == VA_FOURCC_444P) ||
+        (in_fourcc == VA_FOURCC_IMC3) ||
+        (in_fourcc == VA_FOURCC_411P) ||
+        (in_fourcc == VA_FOURCC_422V) ||
         (in_fourcc == VA_FOURCC_NV12) ||
         (in_fourcc == VA_FOURCC_YUY2) ||
+        (in_fourcc == VA_FOURCC_UYVY) ||
         (in_fourcc == VA_FOURCC_YV12) ||
+        (in_fourcc == VA_FOURCC_BGRA) ||
         (in_fourcc == VA_FOURCC_RGBA))
         &&
         ((out_fourcc == VA_FOURCC_422H) ||
+        (out_fourcc == VA_FOURCC_444P) ||
+        (out_fourcc == VA_FOURCC_IMC3) ||
+        (out_fourcc == VA_FOURCC_411P) ||
+        (out_fourcc == VA_FOURCC_422V) ||
         (out_fourcc == VA_FOURCC_NV12) ||
         (out_fourcc == VA_FOURCC_YV12) ||
         (out_fourcc == VA_FOURCC_YUY2) ||
+        (out_fourcc == VA_FOURCC_UYVY) ||
+        (out_fourcc == VA_FOURCC_BGRA) ||
         (out_fourcc == VA_FOURCC_RGBA))) {
-        return hwBlit(display, context, in_surf, in_rect, in_fourcc,
+        return vaVppBlit(display, context, in_surf, in_rect, in_fourcc,
                out_surf, out_rect, out_fourcc);
     }
     else {
-        return swBlit(display, context, in_surf, in_rect, in_fourcc,
-               out_surf, out_rect, out_fourcc);
+        return JD_INPUT_FORMAT_UNSUPPORTED;
     }
 }
 
-JpegDecodeStatus JpegBlitter::blit(RenderTarget &src, RenderTarget &dst)
+static CmDevice *pDev = NULL;
+static CmProgram *pProgram = NULL;
+static CmKernel *pKernel = NULL;
+static Mutex cmLock;
+void JpegBlitter::init(JpegDecoder &dec)
 {
+    if (!mInitialized) {
+        Mutex::Autolock autoLock(mLock);
+        if (!mInitialized) {
+            mDecoder = &dec;
+#if PRE_INIT_CM
+            nsecs_t t1, t2;
+            t1 = t2 = systemTime();
+#if BLIT_METHOD_CM
+#define ISA_FILE "/system/lib/libjpeg_cm_genx.isa"
+            if (!pDev || !pProgram) {
+                VTRACE("%s waiting for cm lock", __FUNCTION__);
+                Mutex::Autolock autoCmLock(cmLock);
+                VTRACE("%s got cm lock", __FUNCTION__);
+                if (!pDev || !pProgram) {
+                    ITRACE("%s CM is not initialized yet, pre-init it", __FUNCTION__);
+                    UINT ver;
+                    INT result;
+                    FILE* pIsaFile = NULL;
+                    int codeSize;
+                    BYTE* pIsaBytes = NULL;
+                    result = CreateCmDevice(pDev, ver, mDisplay);
+                    if (result != CM_SUCCESS) {
+                        ETRACE("%s CreateCmDevice failed: %d", __FUNCTION__, result);
+                        VTRACE("%s release cm lock", __FUNCTION__);
+                        abort();
+                    }
+
+                    pIsaFile = fopen(ISA_FILE, "rb");
+                    if (pIsaFile==NULL) {
+                        ETRACE("%s fopen failed", __FUNCTION__);
+                        DestroyCmDevice(pDev);
+                        VTRACE("%s release cm lock", __FUNCTION__);
+                        abort();
+                    }
+                    fseek (pIsaFile, 0, SEEK_END);
+                    codeSize = ftell (pIsaFile);
+                    rewind(pIsaFile);
+                    if (codeSize==0) {
+                        ETRACE("%s codesize failed", __FUNCTION__);
+                        DestroyCmDevice(pDev);
+                        fclose(pIsaFile);
+                        VTRACE("%s release cm lock", __FUNCTION__);
+                        abort();
+                    }
+                    pIsaBytes = (BYTE*) malloc(codeSize);
+                    if (pIsaBytes==NULL) {
+                        ETRACE("%s malloc failed", __FUNCTION__);
+                        DestroyCmDevice(pDev);
+                        fclose(pIsaFile);
+                        abort();
+                    }
+                    if (fread(pIsaBytes, 1, codeSize, pIsaFile) != codeSize) {
+                        ETRACE("%s fread failed", __FUNCTION__);
+                        free(pIsaBytes);
+                        DestroyCmDevice(pDev);
+                        fclose(pIsaFile);
+                        VTRACE("%s release cm lock", __FUNCTION__);
+                        abort();
+                    }
+                    fclose(pIsaFile);
+                    pIsaFile = NULL;
+
+                    result = pDev->LoadProgram(pIsaBytes, codeSize, pProgram);
+                    if (result != CM_SUCCESS) {
+                        ETRACE("%s LoadProgram failed: %d", __FUNCTION__, result);
+                        free(pIsaBytes);
+                        DestroyCmDevice(pDev);
+                        VTRACE("%s release cm lock", __FUNCTION__);
+                        abort();
+                    }
+                    free(pIsaBytes);
+                    pIsaBytes = NULL;
+
+                    t2 = systemTime();
+                    VTRACE("%s CM pre-init succeded, took %.2f ms", __FUNCTION__, (t2-t1)/1000000.0);
+                }
+                VTRACE("%s release cm lock", __FUNCTION__);
+            }
+#else
+            if (!pDev) {
+                ITRACE("%s CM is not initialized yet, pre-init it", __FUNCTION__);
+                UINT ver;
+                INT result;
+                result = CreateCmDevice(pDev, ver, mDisplay);
+                if (result != CM_SUCCESS || !pDev) {
+                    ETRACE("%s CreateCmDevice returns %d", __FUNCTION__, result);
+                    abort();
+                }
+                t2 = systemTime();
+                VTRACE("%s CM pre-init succeded, took %.2f ms", __FUNCTION__, (t2-t1)/1000000.0);
+            }
+#endif
+#endif
+            mInitialized = true;
+        }
+    }
+}
+
+void JpegBlitter::deinit()
+{
+    if (mInitialized) {
+        Mutex::Autolock autoLock(mLock);
+        if (mInitialized) {
+#if PRE_INIT_CM
+#if BLIT_METHOD_CM
+            //if (pIsaBytes && pProgram && pDev) {
+            //    free(pIsaBytes);
+            //    pDev->DestroyProgram(pProgram);
+            //    DestroyCmDevice(pDev);
+            //}
+#endif
+#endif
+            mInitialized = false;
+        }
+    }
+}
+
+JpegDecodeStatus JpegBlitter::blit(RenderTarget &src, RenderTarget &dst, int scale_factor)
+{
+    assert(mInitialized);
     if (mDecoder == NULL)
         return JD_UNINITIALIZED;
     JpegDecodeStatus st;
     uint32_t src_fourcc, dst_fourcc;
     char tmp[10];
-    src_fourcc = pixelFormat2Fourcc(src.pixel_format);
-    dst_fourcc = pixelFormat2Fourcc(dst.pixel_format);
+    src_fourcc = src.pixel_format;
+    dst_fourcc = dst.pixel_format;
     VASurfaceID src_surf = mDecoder->getSurfaceID(src);
     if (src_surf == VA_INVALID_ID) {
-        ETRACE("%s invalid src %s target", __FUNCTION__, fourcc2str(NULL, src_fourcc));
+        ETRACE("%s invalid src %s target", __FUNCTION__, fourcc2str(src_fourcc));
         return JD_INVALID_RENDER_TARGET;
     }
     VASurfaceID dst_surf = mDecoder->getSurfaceID(dst);
@@ -432,10 +341,721 @@
         }
     }
 
-    VTRACE("%s blitting from %s to %s", __FUNCTION__, fourcc2str(tmp, src_fourcc), fourcc2str(tmp + 5, dst_fourcc));
+    VTRACE("%s blitting from %s to %s", __FUNCTION__, fourcc2str(src_fourcc, tmp), fourcc2str(dst_fourcc, tmp + 5));
     st = vaBlit(mDecoder->mDisplay, mContextId, src_surf, &src.rect, src_fourcc,
                 dst_surf, &dst.rect, dst_fourcc);
 
     return st;
 }
 
+// Blits `src` into a temporary Y-tiled RGBA VA surface, then copies that surface
+// to `sysmem` through the CM runtime. The temporary surface is released on every
+// path once it has been created. `scale_factor` is not used by this path.
+static JpegDecodeStatus blitToLinearRgba_va_gpucopy(JpegDecoder *decoder,
+        VADisplay dp, VAContextID ctx, RenderTarget &src,
+        uint8_t *sysmem, uint32_t width, uint32_t height, int scale_factor)
+{
+    CmQueue *pQueue = NULL;
+    CmSurface2D *pSurf= NULL;
+    CmEvent *pEvent = NULL;
+    INT result;
+    UINT ver;
+    RenderTarget target;
+    VASurfaceID surf;
+    nsecs_t t1, t2, t3, t4;
+    JpegDecodeStatus ret = JD_BLIT_FAILURE; // outcome of the CM stage
+    target.type = RenderTarget::INTERNAL_BUF;
+    target.pixel_format = VA_FOURCC_RGBA;
+    target.handle = generateHandle();
+    target.width = aligned_width(width, SURF_TILING_Y);
+    target.height = aligned_height(height, SURF_TILING_Y);
+    target.stride = aligned_width(width, SURF_TILING_Y);
+    target.rect.x = target.rect.y = 0;
+    target.rect.width = width;
+    target.rect.height = height;
+    VASurfaceID src_surf = decoder->getSurfaceID(src);
+    if (src_surf == VA_INVALID_ID) {
+        ETRACE("%s invalid src %s target", __FUNCTION__, fourcc2str(src.pixel_format));
+        return JD_INVALID_RENDER_TARGET;
+    }
+    JpegDecodeStatus st = decoder->createSurfaceFromRenderTarget(target, &surf);
+    if (st != JD_SUCCESS || surf == VA_INVALID_ID) {
+        ETRACE("%s failed to create surface for RGBA linear target", __FUNCTION__);
+        return JD_RESOURCE_FAILURE;
+    }
+    st = vaBlit(dp, ctx, src_surf, &src.rect, src.pixel_format,
+                surf, &target.rect, target.pixel_format);
+    if (st != JD_SUCCESS) {
+        ETRACE("%s failed to VA blit to RGBA", __FUNCTION__);
+        decoder->destroySurface(target); // do not leak the temporary surface
+        return JD_RESOURCE_FAILURE;
+    }
+
+#if DUMP_RGBA
+    uint8_t *data;
+    uint32_t offsets[3];
+    uint32_t pitches[3];
+    JpegDecoder::MapHandle hnd = decoder->mapData(target, (void**)&data, offsets, pitches);
+    assert(hnd);
+    char fname[128];
+    sprintf(fname, "/sdcard/%dx%d.rgba", target.stride, target.height);
+    FILE *fdump = fopen(fname, "wb");
+    assert(fdump);
+    fwrite(data, 4, target.height * target.stride, fdump);
+    fclose(fdump);
+    decoder->unmapData(target, hnd);
+#endif
+
+    t1 = t2 = t3 = systemTime();
+#if PRE_INIT_CM
+    assert(pDev); // checked before first use; the device is pre-created in this mode
+#else
+    result = CreateCmDevice(pDev, ver, dp);
+    if (result != CM_SUCCESS || !pDev) {
+        ETRACE("%s CreateCmDevice returns %d", __FUNCTION__, result);
+        decoder->destroySurface(target);
+        return JD_BLIT_FAILURE;
+    }
+#endif
+    result = pDev->CreateSurface2D(surf, pSurf);
+    if (result != CM_SUCCESS || !pSurf) {
+        ETRACE("%s CmCreateSurface2D returns %d", __FUNCTION__, result);
+        goto release;
+    }
+    result = pDev->CreateQueue( pQueue);
+    if (result != CM_SUCCESS || !pQueue) {
+        ETRACE("%s CmCreateQueue returns %d", __FUNCTION__, result);
+        goto release;
+    }
+    t2 = systemTime();
+    // NOTE(review): pEvent is never waited on; confirm this copy completes before returning.
+    result = pQueue->EnqueueCopyGPUToCPU(pSurf, sysmem, pEvent);
+    if (result != CM_SUCCESS) {
+        ETRACE("%s CmEnqueueCopyGPUToCPU returns %d", __FUNCTION__, result);
+        goto release;
+    }
+    t3 = systemTime();
+    ret = JD_SUCCESS;
+release:
+    if (pSurf) {
+        result = pDev->DestroySurface(pSurf);
+        if (result != CM_SUCCESS) {
+            WTRACE("%s CmDestroySurface returns %d", __FUNCTION__, result);
+        }
+    }
+#if !PRE_INIT_CM
+    // Only a device created by this call is destroyed; a pre-initialized one is left alone.
+    result = DestroyCmDevice(pDev);
+    if (result != CM_SUCCESS) {
+        WTRACE("%s DestroyCmDevice failed %d", __FUNCTION__, result);
+    }
+#endif
+    t4 = systemTime();
+    st = decoder->destroySurface(target);
+    if (st) {
+        WTRACE("%s: failed to destroy VA surface", __FUNCTION__);
+    }
+    ITRACE("%s: cm GpuCopy took %.2f+%.2f+%.2f ms", __FUNCTION__,
+        (t2 - t1)/1000000.0,
+        (t3 - t2)/1000000.0,
+        (t4 - t3)/1000000.0);
+    return (ret == JD_SUCCESS) ? st : ret;
+}
+
+// Converts one tile of the decoded surface `src` to RGBA with the CM kernel
+// yuv_tiled_to_rgba_tile. `left`/`top`/`width`/`height` are handed to the kernel and
+// `sysmem` is wrapped as a width*height*4 byte CM buffer; `scale_factor` is unused.
+// Waits for the kernel to finish. Returns JD_SUCCESS or a failure status.
+JpegDecodeStatus JpegBlitter::getRgbaTile(RenderTarget &src,
+                                     uint8_t *sysmem,
+                                     int left, int top, int width, int height, int scale_factor)
+{
+#define ISA_FILE "/system/lib/libjpeg_cm_genx.isa"
+#define CM_GPU_TASK_WIDTH 8
+#define CM_GPU_TASK_HEIGHT 8
+    VASurfaceID srcVaId;
+
+    srcVaId = mDecoder->getSurfaceID(src);
+    if (srcVaId == VA_INVALID_ID) // same rejection blit() applies to an unknown source
+        return JD_INVALID_RENDER_TARGET;
+    JpegDecodeStatus status = JD_SUCCESS;
+    uint32_t aligned_w = width;//aligned_width(width, SURF_TILING_Y);
+    uint32_t aligned_h = height;//aligned_height(height, SURF_TILING_Y);
+
+    CmThreadSpace *pThreadSpace = NULL;
+    CmTask *pKernelArray = NULL;
+    CmQueue *pQueue = NULL;
+    CmSurface2D *pInSurf= NULL;
+    SurfaceIndex *pInSurfId = NULL;
+    CmBufferUP *pOutBuf = NULL;
+    SurfaceIndex *pOutBufId = NULL;
+    CmEvent *pEvent = NULL;
+    UINT ver;
+    int threadswidth, threadsheight;
+    INT result;
+    DWORD dwTimeOutMs = -1;
+    uint32_t cm_in_fourcc;
+    threadswidth = aligned_w/CM_GPU_TASK_WIDTH;
+    threadsheight = aligned_h/CM_GPU_TASK_HEIGHT;
+    nsecs_t t1, t2, t3, t4, t5, t6, t7;
+    VTRACE("%s before holding cm lock", __FUNCTION__);
+    Mutex::Autolock autoLock(cmLock);
+    VTRACE("%s got cm lock", __FUNCTION__);
+    t1 = t2 = t3 = t4 = t5 = t6 = t7 = systemTime();
+
+#if PRE_INIT_CM
+    assert(pDev && pProgram);
+#else
+    FILE* pIsaFile = NULL;
+    int codeSize;
+    BYTE* pIsaBytes = NULL;
+    result = CreateCmDevice(pDev, ver, mDecoder->mDisplay);
+    JD_CM_CHECK_RET(result, cleanup, JD_BLIT_FAILURE);
+
+    pIsaFile = fopen(ISA_FILE, "rb");
+    if (pIsaFile==NULL) {
+        ETRACE("%s fopen failed", __FUNCTION__);
+        status = JD_BLIT_FAILURE;
+        goto cleanup; // cleanup releases the file, the ISA bytes and the device
+    }
+    fseek (pIsaFile, 0, SEEK_END);
+    codeSize = ftell (pIsaFile);
+    rewind(pIsaFile);
+    if (codeSize<=0) { // ftell reports errors as -1
+        ETRACE("%s codesize failed", __FUNCTION__);
+        status = JD_BLIT_FAILURE;
+        goto cleanup;
+    }
+    pIsaBytes = (BYTE*) malloc(codeSize);
+    if (pIsaBytes==NULL) {
+        ETRACE("%s malloc failed", __FUNCTION__);
+        status = JD_BLIT_FAILURE;
+        goto cleanup;
+    }
+    if (fread(pIsaBytes, 1, codeSize, pIsaFile) != (size_t)codeSize) {
+        ETRACE("%s fread failed", __FUNCTION__);
+        status = JD_BLIT_FAILURE;
+        goto cleanup;
+    }
+    fclose(pIsaFile);
+    pIsaFile = NULL;
+
+    result = pDev->LoadProgram(pIsaBytes, codeSize, pProgram);
+    JD_CM_CHECK_RET(result, cleanup, JD_BLIT_FAILURE);
+    free(pIsaBytes);
+    pIsaBytes = NULL;
+    VTRACE("%s cm init succeded", __FUNCTION__);
+#endif
+
+    t2 = systemTime();
+    // create thread space
+    result = pDev->CreateKernel(pProgram, CM_KERNEL_FUNCTION(yuv_tiled_to_rgba_tile), pKernel);
+    JD_CM_CHECK_RET(result, cleanup, JD_BLIT_FAILURE);
+
+    result = pDev->CreateSurface2D(srcVaId, pInSurf);
+    JD_CM_CHECK_RET(result, cleanup, JD_BLIT_FAILURE);
+    result = pInSurf->GetIndex(pInSurfId);
+    JD_CM_CHECK_RET(result, cleanup, JD_BLIT_FAILURE);
+    // create bufferUp from dst ptr
+    result = pDev->CreateBufferUP(aligned_w * aligned_h * 4, sysmem, pOutBuf);
+    JD_CM_CHECK_RET(result, cleanup, JD_BLIT_FAILURE);
+    result = pOutBuf->GetIndex(pOutBufId);
+    JD_CM_CHECK_RET(result, cleanup, JD_BLIT_FAILURE);
+
+    result = pDev->CreateQueue( pQueue);
+    JD_CM_CHECK_RET(result, cleanup, JD_BLIT_FAILURE);
+    result = pDev->CreateThreadSpace(threadswidth, threadsheight, pThreadSpace);
+    JD_CM_CHECK_RET(result, cleanup, JD_BLIT_FAILURE);
+    result = pKernel->SetThreadCount( threadswidth* threadsheight );
+    JD_CM_CHECK_RET(result, cleanup, JD_BLIT_FAILURE);
+    // enqueue csc
+    pKernel->SetKernelArg(0,sizeof(SurfaceIndex),pInSurfId);
+    pKernel->SetKernelArg(1,sizeof(SurfaceIndex),pOutBufId);
+    pKernel->SetKernelArg(2,sizeof(int),&left);
+    pKernel->SetKernelArg(3,sizeof(int),&top);
+    pKernel->SetKernelArg(4,sizeof(int),&aligned_w);
+    pKernel->SetKernelArg(5,sizeof(int),&aligned_h);
+
+    cm_in_fourcc = src.pixel_format;
+
+    pKernel->SetKernelArg(6,sizeof(uint32_t),&cm_in_fourcc);
+    result = pDev->CreateTask(pKernelArray);
+    JD_CM_CHECK_RET(result, cleanup, JD_BLIT_FAILURE);
+    result = pKernelArray->AddKernel (pKernel);
+    JD_CM_CHECK_RET(result, cleanup, JD_BLIT_FAILURE);
+    result = pQueue->Enqueue(pKernelArray, pEvent, pThreadSpace);
+    JD_CM_CHECK_RET(result, cleanup, JD_BLIT_FAILURE);
+    // wait kernel finish
+    t3 = systemTime();
+    result = pEvent->WaitForTaskFinished(dwTimeOutMs);
+    JD_CM_CHECK_RET(result, cleanup, JD_BLIT_FAILURE);
+    // NOTE(review): pEvent is not released here; confirm the CM queue reclaims it.
+    t4 = systemTime();
+
+cleanup:
+    // destroy thread space/house cleaning
+    if (pOutBuf) pDev->DestroyBufferUP(pOutBuf);
+    t5 = systemTime();
+    if (pInSurf) pDev->DestroySurface(pInSurf);
+    t6 = systemTime();
+    if (pKernelArray) pDev->DestroyTask(pKernelArray);
+    if (pThreadSpace) pDev->DestroyThreadSpace(pThreadSpace);
+    if (pDev && pKernel) pDev->DestroyKernel(pKernel);
+#if PRE_INIT_CM
+#else
+    if (pIsaBytes) free(pIsaBytes);
+    if (pIsaFile) fclose(pIsaFile);
+    if (pDev && pProgram) pDev->DestroyProgram(pProgram);
+    if (pDev) DestroyCmDevice(pDev);
+#endif
+    t7 = systemTime();
+
+    VTRACE("%s blit with CM %ux%u took %.2f + %.2f + %.2f + %.2f + %.2f + %.2f ms", __FUNCTION__,
+        width, height,
+        (t2 - t1)/1000000.0,
+        (t3 - t2)/1000000.0,
+        (t4 - t3)/1000000.0,
+        (t5 - t4)/1000000.0,
+        (t6 - t5)/1000000.0,
+        (t7 - t6)/1000000.0);
+    VTRACE("%s release cm lock", __FUNCTION__);
+    return status;
+}
+
+
+// Converts the whole decoded surface `src` to linear RGBA in `sysmem` with the CM
+// kernel CM_KERNEL_FUNC_NAME; the output is sized from width/height divided by
+// `scale_factor` (must be > 0). Waits for the kernel, so `event` is always NULL on return.
+static JpegDecodeStatus blitToLinearRgba_cm(JpegDecoder *decoder,
+        VADisplay dp, VAContextID ctx, RenderTarget &src, uint8_t *sysmem, uint32_t width, uint32_t height,
+        BlitEvent &event, int scale_factor)
+{
+#define ISA_FILE "/system/lib/libjpeg_cm_genx.isa"
+#undef CM_GPU_TASK_WIDTH // an earlier function in this file defines a different width
+#define CM_GPU_TASK_WIDTH 32
+#define CM_GPU_TASK_HEIGHT 8
+    event = NULL; // this path is synchronous; there is never a pending event
+    if (scale_factor <= 0) // would divide by zero / wrap the output size below
+        return JD_BLIT_FAILURE;
+    VASurfaceID srcVaId;
+    VTRACE("%s before holding cm lock", __FUNCTION__);
+    Mutex::Autolock autoLock(cmLock);
+    VTRACE("%s got cm lock", __FUNCTION__);
+    srcVaId = decoder->getSurfaceID(src);
+    JpegDecodeStatus status = JD_SUCCESS;
+    uint32_t aligned_in_w = aligned_width(width, SURF_TILING_Y);
+    uint32_t aligned_in_h = aligned_height(height, SURF_TILING_Y);
+    uint32_t aligned_out_w = aligned_width(width/scale_factor, SURF_TILING_Y);
+    uint32_t aligned_out_h = aligned_height(height/scale_factor, SURF_TILING_Y);
+
+#if NV12_INTERMEDIATE
+    RenderTarget nv12_target;
+    VASurfaceID nv12_surf_id;
+    nv12_target.type = RenderTarget::INTERNAL_BUF;
+    nv12_target.pixel_format = VA_FOURCC_NV12;
+    nv12_target.handle = generateHandle();
+    nv12_target.width = aligned_in_w;
+    nv12_target.height = aligned_in_h;
+    nv12_target.stride = aligned_in_w;
+    nv12_target.rect.x = nv12_target.rect.y = 0;
+    nv12_target.rect.width = width;
+    nv12_target.rect.height = height;
+    status = decoder->createSurfaceFromRenderTarget(nv12_target, &nv12_surf_id);
+    if (status != JD_SUCCESS || nv12_surf_id == VA_INVALID_ID) {
+        ETRACE("%s failed to create surface for NV12 target", __FUNCTION__);
+        return JD_RESOURCE_FAILURE;
+    }
+    status = vaBlit(dp, ctx, srcVaId, &src.rect, src.pixel_format,
+        nv12_surf_id, &nv12_target.rect, VA_FOURCC_NV12);
+    if (status != JD_SUCCESS) {
+        ETRACE("%s failed to VA blit to NV12", __FUNCTION__);
+        decoder->destroySurface(nv12_target);
+        return JD_BLIT_FAILURE;
+    }
+    srcVaId = nv12_surf_id;
+#endif
+
+    CmThreadSpace *pThreadSpace = NULL;
+    CmTask *pKernelArray = NULL;
+    CmQueue *pQueue = NULL;
+    CmSurface2D *pInSurf= NULL;
+    SurfaceIndex *pInSurfId = NULL;
+    CmBufferUP *pOutBuf = NULL;
+    SurfaceIndex *pOutBufId = NULL;
+    CmEvent *pEvent = NULL;
+    UINT ver;
+    int threadswidth, threadsheight;
+    INT result;
+    DWORD dwTimeOutMs = -1;
+    uint32_t cm_in_fourcc;
+    threadswidth = aligned_in_w/CM_GPU_TASK_WIDTH;
+    threadsheight = aligned_in_h/CM_GPU_TASK_HEIGHT;
+    nsecs_t t1, t2, t3, t4, t5, t6, t7, t8, t9;
+    t1 = t2 = t3 = t4 = t5 = t6 = t7 = t8 = t9 = systemTime();
+#if PRE_INIT_CM
+    assert(pDev && pProgram);
+#else
+    FILE* pIsaFile = NULL;
+    int codeSize;
+    BYTE* pIsaBytes = NULL;
+    result = CreateCmDevice(pDev, ver, dp);
+    JD_CM_CHECK_RET(result, cleanup, JD_BLIT_FAILURE);
+    pIsaFile = fopen(ISA_FILE, "rb");
+    if (pIsaFile==NULL) {
+        ETRACE("%s fopen failed", __FUNCTION__);
+        status = JD_BLIT_FAILURE;
+        goto cleanup; // cleanup releases the file, the ISA bytes and the device
+    }
+    fseek (pIsaFile, 0, SEEK_END);
+    codeSize = ftell (pIsaFile);
+    rewind(pIsaFile);
+    if (codeSize<=0) { // ftell reports errors as -1
+        ETRACE("%s codesize failed", __FUNCTION__);
+        status = JD_BLIT_FAILURE;
+        goto cleanup;
+    }
+    pIsaBytes = (BYTE*) malloc(codeSize);
+    if (pIsaBytes==NULL) {
+        ETRACE("%s malloc failed", __FUNCTION__);
+        status = JD_BLIT_FAILURE;
+        goto cleanup;
+    }
+    if (fread(pIsaBytes, 1, codeSize, pIsaFile) != (size_t)codeSize) {
+        ETRACE("%s fread failed", __FUNCTION__);
+        status = JD_BLIT_FAILURE;
+        goto cleanup;
+    }
+    fclose(pIsaFile);
+    pIsaFile = NULL;
+
+    result = pDev->LoadProgram(pIsaBytes, codeSize, pProgram);
+    JD_CM_CHECK_RET(result, cleanup, JD_BLIT_FAILURE);
+    free(pIsaBytes);
+    pIsaBytes = NULL;
+    VTRACE("%s cm init succeded", __FUNCTION__);
+#endif
+
+    // create thread space
+    result = pDev->CreateKernel(pProgram, CM_KERNEL_FUNCTION(CM_KERNEL_FUNC_NAME), pKernel);
+    JD_CM_CHECK_RET(result, cleanup, JD_BLIT_FAILURE);
+
+    VTRACE("Creating CmSurface from VASurface %d", srcVaId);
+    t2 = systemTime();
+    result = pDev->CreateSurface2D(srcVaId, pInSurf);
+    JD_CM_CHECK_RET(result, cleanup, JD_BLIT_FAILURE);
+
+    result = pInSurf->GetIndex(pInSurfId);
+    JD_CM_CHECK_RET(result, cleanup, JD_BLIT_FAILURE);
+    // create bufferUp from dst ptr
+    VTRACE("CmSurfaceID got");
+    t3 = systemTime();
+    result = pDev->CreateBufferUP(aligned_out_w * aligned_out_h * 4, sysmem, pOutBuf);
+    JD_CM_CHECK_RET(result, cleanup, JD_BLIT_FAILURE);
+    result = pOutBuf->GetIndex(pOutBufId);
+    JD_CM_CHECK_RET(result, cleanup, JD_BLIT_FAILURE);
+    t4 = systemTime();
+    result = pDev->CreateQueue( pQueue);
+    JD_CM_CHECK_RET(result, cleanup, JD_BLIT_FAILURE);
+    result = pDev->CreateThreadSpace(threadswidth, threadsheight, pThreadSpace);
+    JD_CM_CHECK_RET(result, cleanup, JD_BLIT_FAILURE);
+    result = pKernel->SetThreadCount( threadswidth* threadsheight );
+    JD_CM_CHECK_RET(result, cleanup, JD_BLIT_FAILURE);
+    // enqueue csc
+    pKernel->SetKernelArg(0,sizeof(SurfaceIndex),pInSurfId);
+    pKernel->SetKernelArg(1,sizeof(SurfaceIndex),pOutBufId);
+    pKernel->SetKernelArg(2,sizeof(int),&aligned_out_w);
+#if NV12_INTERMEDIATE
+    cm_in_fourcc = VA_FOURCC_NV12;
+#else
+    cm_in_fourcc = src.pixel_format;
+#endif
+    pKernel->SetKernelArg(3,sizeof(uint32_t),&cm_in_fourcc);
+    pKernel->SetKernelArg(4,sizeof(int), &scale_factor);
+    result = pDev->CreateTask(pKernelArray);
+    JD_CM_CHECK_RET(result, cleanup, JD_BLIT_FAILURE);
+    result = pKernelArray->AddKernel (pKernel);
+    JD_CM_CHECK_RET(result, cleanup, JD_BLIT_FAILURE);
+    result = pQueue->Enqueue(pKernelArray, pEvent, pThreadSpace);
+    JD_CM_CHECK_RET(result, cleanup, JD_BLIT_FAILURE);
+    // wait kernel finish
+    t5 = systemTime();
+    result = pEvent->WaitForTaskFinished(dwTimeOutMs);
+    JD_CM_CHECK_RET(result, cleanup, JD_BLIT_FAILURE);
+    t6 = systemTime();
+
+cleanup:
+#if NV12_INTERMEDIATE
+    if (nv12_surf_id != VA_INVALID_ID) decoder->destroySurface(nv12_target);
+#endif
+    // destroy thread space/house cleaning
+    if (pOutBuf) pDev->DestroyBufferUP(pOutBuf);
+    t7 = systemTime();
+    if (pInSurf) pDev->DestroySurface(pInSurf);
+    t8 = systemTime();
+    if (pKernelArray) pDev->DestroyTask(pKernelArray);
+    if (pThreadSpace) pDev->DestroyThreadSpace(pThreadSpace);
+    if (pDev && pKernel) pDev->DestroyKernel(pKernel);
+#if PRE_INIT_CM
+#else
+    if (pIsaBytes) free(pIsaBytes);
+    if (pIsaFile) fclose(pIsaFile);
+    if (pDev && pProgram) pDev->DestroyProgram(pProgram);
+    if (pDev) DestroyCmDevice(pDev);
+#endif
+    t9 = systemTime();
+    VTRACE("%s blit with CM %ux%u(%dx) took %.2f + %.2f + %.2f + %.2f + %.2f + %.2f + %.2f + %.2f ms", __FUNCTION__,
+        width, height, scale_factor,
+        (t2 - t1)/1000000.0,
+        (t3 - t2)/1000000.0,
+        (t4 - t3)/1000000.0,
+        (t5 - t4)/1000000.0,
+        (t6 - t5)/1000000.0,
+        (t7 - t6)/1000000.0,
+        (t8 - t7)/1000000.0,
+        (t9 - t8)/1000000.0);
+    VTRACE("%s release cm lock", __FUNCTION__);
+    return status;
+}
+
+// Takes the decoder lock, then forwards to whichever backend BLIT_METHOD_CM selects.
+JpegDecodeStatus JpegBlitter::blitToLinearRgba(RenderTarget &src, uint8_t *sysmem,
+        uint32_t width, uint32_t height, BlitEvent &event, int scale_factor)
+{
+    Mutex::Autolock decoderGuard(mDecoder->mLock);
+    VADisplay dpy = mDecoder->mDisplay;
+#if BLIT_METHOD_CM
+    return blitToLinearRgba_cm(mDecoder, dpy, mContextId, src, sysmem, width, height, event, scale_factor);
+#else
+    return blitToLinearRgba_va_gpucopy(mDecoder, dpy, mContextId, src, sysmem, width, height, scale_factor);
+#endif
+}
+
+// Converts the decoded 4:2:2 surface `src` into the NV12 and YUY2 gralloc buffers and,
+// when `dst_nv21` / `dst_yv12` are non-NULL, into those system-memory buffers as well.
+// The kernel is only enqueued: `event` receives the pending task (NULL on failure).
+JpegDecodeStatus JpegBlitter::blitToCameraSurfaces(RenderTarget &src,
+                                                   buffer_handle_t dst_nv12,
+                                                   buffer_handle_t dst_yuy2,
+                                                   uint8_t *dst_nv21,
+                                                   uint8_t *dst_yv12,
+                                                   uint32_t width, uint32_t height,
+                                                   BlitEvent &event)
+{
+#define CM_GPU_TASK_WIDTH 32
+#define CM_GPU_TASK_HEIGHT 8
+    event = NULL; // callers never see a stale event when this call fails
+    VASurfaceID nv12_surf_id = VA_INVALID_ID, yuy2_surf_id = VA_INVALID_ID;
+    VASurfaceID srcVaId = mDecoder->getSurfaceID(src);
+    JpegDecodeStatus status = JD_SUCCESS;
+    uint32_t aligned_w = aligned_width(width, SURF_TILING_Y);
+    uint32_t aligned_h = aligned_height(height, SURF_TILING_Y);
+
+    CmThreadSpace *pThreadSpace = NULL;
+    CmTask *pKernelArray = NULL;
+    CmQueue *pQueue = NULL;
+    CmSurface2D *pInSurf= NULL;
+    SurfaceIndex *pInSurfId = NULL;
+    CmSurface2D *pOutNV12Surf= NULL;
+    SurfaceIndex *pOutNV12SurfId = NULL;
+    CmSurface2D *pOutYUY2Surf= NULL;
+    SurfaceIndex *pOutYUY2SurfId = NULL;
+    CmBufferUP *pOutNV21Surf = NULL;
+    SurfaceIndex *pOutNV21SurfId = NULL;
+    CmBufferUP *pOutYV12Surf = NULL;
+    SurfaceIndex *pOutYV12SurfId = NULL;
+    uint8_t do_nv21 = 0, do_yv12 = 0;
+    CmEvent *pEvent = NULL;
+    RenderTarget nv12_target, yuy2_target;
+    UINT ver;
+    int threadswidth, threadsheight;
+    INT result;
+    threadswidth = aligned_w/CM_GPU_TASK_WIDTH;
+    threadsheight = aligned_h/CM_GPU_TASK_HEIGHT;
+    nsecs_t t1, t2, t3, t4, t5;
+    t1 = t2 = t3 = t4 = t5 = systemTime();
+    VTRACE("%s before holding cm lock", __FUNCTION__);
+    Mutex::Autolock autoLock(cmLock);
+
+#if PRE_INIT_CM
+    assert(pDev && pProgram);
+#else
+    FILE* pIsaFile = NULL;
+    int codeSize;
+    BYTE* pIsaBytes = NULL;
+    result = CreateCmDevice(pDev, ver, mDecoder->mDisplay);
+    JD_CM_CHECK_RET(result, cleanup, JD_BLIT_FAILURE);
+    pIsaFile = fopen(ISA_FILE, "rb");
+    if (pIsaFile==NULL) {
+        ETRACE("%s fopen failed", __FUNCTION__);
+        status = JD_BLIT_FAILURE;
+        goto cleanup; // cleanup releases the file, the ISA bytes and the device
+    }
+    fseek (pIsaFile, 0, SEEK_END);
+    codeSize = ftell (pIsaFile);
+    rewind(pIsaFile);
+    if (codeSize<=0) { // ftell reports errors as -1
+        ETRACE("%s codesize failed", __FUNCTION__);
+        status = JD_BLIT_FAILURE;
+        goto cleanup;
+    }
+    pIsaBytes = (BYTE*) malloc(codeSize);
+    if (pIsaBytes==NULL) {
+        ETRACE("%s malloc failed", __FUNCTION__);
+        status = JD_BLIT_FAILURE;
+        goto cleanup;
+    }
+    if (fread(pIsaBytes, 1, codeSize, pIsaFile) != (size_t)codeSize) {
+        ETRACE("%s fread failed", __FUNCTION__);
+        status = JD_BLIT_FAILURE;
+        goto cleanup;
+    }
+    fclose(pIsaFile);
+    pIsaFile = NULL;
+    result = pDev->LoadProgram(pIsaBytes, codeSize, pProgram);
+    JD_CM_CHECK_RET(result, cleanup, JD_BLIT_FAILURE);
+    free(pIsaBytes);
+    pIsaBytes = NULL;
+    VTRACE("%s cm init succeded", __FUNCTION__);
+#endif
+
+    t2 = systemTime();
+    // create thread space
+    result = pDev->CreateKernel(pProgram, CM_KERNEL_FUNCTION(yuv422h_tiled_to_camera_surfaces), pKernel);
+    JD_CM_CHECK_RET(result, cleanup, JD_BLIT_FAILURE);
+
+    // src surface
+    result = pDev->CreateSurface2D(srcVaId, pInSurf);
+    JD_CM_CHECK_RET(result, cleanup, JD_BLIT_FAILURE);
+    result = pInSurf->GetIndex(pInSurfId);
+    JD_CM_CHECK_RET(result, cleanup, JD_BLIT_FAILURE);
+    // dst nv12 + yuy2
+    nv12_target.handle = (int)dst_nv12;
+    nv12_target.type = RenderTarget::ANDROID_GRALLOC;
+    nv12_target.height = aligned_height(height, SURF_TILING_Y);
+    nv12_target.width = aligned_width(width, SURF_TILING_Y);
+    nv12_target.pixel_format = VA_FOURCC_NV12;
+    nv12_target.stride = nv12_target.width;
+    nv12_target.rect.x = nv12_target.rect.y = 0;
+    nv12_target.rect.width = nv12_target.width;
+    nv12_target.rect.height = nv12_target.height;
+    status = mDecoder->createSurfaceFromRenderTarget(nv12_target, &nv12_surf_id);
+    yuy2_target.handle = (int)dst_yuy2;
+    yuy2_target.type = RenderTarget::ANDROID_GRALLOC;
+    yuy2_target.height = aligned_height(height, SURF_TILING_Y);
+    yuy2_target.width = aligned_width(width, SURF_TILING_Y);
+    yuy2_target.pixel_format = VA_FOURCC_YUY2;
+    yuy2_target.stride = yuy2_target.width * 2;
+    yuy2_target.rect.x = yuy2_target.rect.y = 0;
+    yuy2_target.rect.width = yuy2_target.width;
+    yuy2_target.rect.height = yuy2_target.height;
+    if (status == JD_SUCCESS)
+        status = mDecoder->createSurfaceFromRenderTarget(yuy2_target, &yuy2_surf_id);
+    if (status != JD_SUCCESS || nv12_surf_id == VA_INVALID_ID || yuy2_surf_id == VA_INVALID_ID) {
+        ETRACE("%s failed to create NV12/YUY2 surfaces", __FUNCTION__);
+        status = JD_RESOURCE_FAILURE;
+        goto cleanup;
+    }
+    result = pDev->CreateSurface2D(nv12_surf_id, pOutNV12Surf);
+    JD_CM_CHECK_RET(result, cleanup, JD_BLIT_FAILURE);
+    result = pOutNV12Surf->GetIndex(pOutNV12SurfId);
+    JD_CM_CHECK_RET(result, cleanup, JD_BLIT_FAILURE);
+    result = pDev->CreateSurface2D(yuy2_surf_id, pOutYUY2Surf);
+    JD_CM_CHECK_RET(result, cleanup, JD_BLIT_FAILURE);
+    result = pOutYUY2Surf->GetIndex(pOutYUY2SurfId);
+    JD_CM_CHECK_RET(result, cleanup, JD_BLIT_FAILURE);
+    // dst nv21
+    if (dst_nv21) {
+        result = pDev->CreateBufferUP(aligned_w * aligned_h * 3 / 2, dst_nv21, pOutNV21Surf);
+        JD_CM_CHECK_RET(result, cleanup, JD_BLIT_FAILURE);
+        result = pOutNV21Surf->GetIndex(pOutNV21SurfId);
+        JD_CM_CHECK_RET(result, cleanup, JD_BLIT_FAILURE);
+        do_nv21 = 1;
+    }
+    else {
+        pOutNV21SurfId = pInSurfId; // placeholder index; do_nv21 stays 0
+    }
+    // dst yv12
+    if (dst_yv12) {
+        result = pDev->CreateBufferUP(aligned_w * aligned_h * 3 / 2, dst_yv12, pOutYV12Surf);
+        JD_CM_CHECK_RET(result, cleanup, JD_BLIT_FAILURE);
+        result = pOutYV12Surf->GetIndex(pOutYV12SurfId);
+        JD_CM_CHECK_RET(result, cleanup, JD_BLIT_FAILURE);
+        do_yv12 = 1;
+    }
+    else {
+        pOutYV12SurfId = pInSurfId; // placeholder index; do_yv12 stays 0
+    }
+    result = pDev->CreateQueue( pQueue);
+    JD_CM_CHECK_RET(result, cleanup, JD_BLIT_FAILURE);
+    result = pDev->CreateThreadSpace(threadswidth, threadsheight, pThreadSpace);
+    JD_CM_CHECK_RET(result, cleanup, JD_BLIT_FAILURE);
+    result = pKernel->SetThreadCount( threadswidth* threadsheight );
+    JD_CM_CHECK_RET(result, cleanup, JD_BLIT_FAILURE);
+    // enqueue csc
+    pKernel->SetKernelArg(0,sizeof(SurfaceIndex),pInSurfId);
+    pKernel->SetKernelArg(1,sizeof(SurfaceIndex),pOutNV12SurfId);
+    pKernel->SetKernelArg(2,sizeof(SurfaceIndex),pOutYUY2SurfId);
+    pKernel->SetKernelArg(3,sizeof(SurfaceIndex),pOutNV21SurfId);
+    pKernel->SetKernelArg(4,sizeof(SurfaceIndex),pOutYV12SurfId);
+    pKernel->SetKernelArg(5,sizeof(int),&aligned_h);
+    pKernel->SetKernelArg(6,sizeof(int),&aligned_w);
+    pKernel->SetKernelArg(7,sizeof(uint8_t),&do_nv21);
+    pKernel->SetKernelArg(8,sizeof(uint8_t),&do_yv12);
+    result = pDev->CreateTask(pKernelArray);
+    JD_CM_CHECK_RET(result, cleanup, JD_BLIT_FAILURE);
+    result = pKernelArray->AddKernel (pKernel);
+    JD_CM_CHECK_RET(result, cleanup, JD_BLIT_FAILURE);
+    result = pQueue->Enqueue(pKernelArray, pEvent, pThreadSpace);
+    JD_CM_CHECK_RET(result, cleanup, JD_BLIT_FAILURE);
+    // no wait here: the caller synchronizes later through `event`
+    t3 = systemTime();
+    // NOTE(review): cleanup below releases buffers/kernel while the task may still run — confirm CM defers that.
+    event = (BlitEvent)pEvent;
+    t4 = systemTime();
+
+cleanup:
+    // destroy thread space/house cleaning
+    if (pOutYV12Surf) pDev->DestroyBufferUP(pOutYV12Surf);
+    if (pOutNV21Surf) pDev->DestroyBufferUP(pOutNV21Surf);
+    if (pOutYUY2Surf) pDev->DestroySurface(pOutYUY2Surf);
+    if (pOutNV12Surf) pDev->DestroySurface(pOutNV12Surf);
+    if (pInSurf) pDev->DestroySurface(pInSurf);
+    if (nv12_surf_id != VA_INVALID_ID) mDecoder->destroySurface(nv12_target);
+    if (yuy2_surf_id != VA_INVALID_ID) mDecoder->destroySurface(yuy2_target);
+    if (pKernelArray) pDev->DestroyTask(pKernelArray);
+    if (pThreadSpace) pDev->DestroyThreadSpace(pThreadSpace);
+    if (pDev && pKernel) pDev->DestroyKernel(pKernel);
+#if PRE_INIT_CM
+#else
+    if (pIsaBytes) free(pIsaBytes);
+    if (pIsaFile) fclose(pIsaFile);
+    if (pDev && pProgram) pDev->DestroyProgram(pProgram);
+    if (pDev) DestroyCmDevice(pDev);
+#endif
+    t5 = systemTime();
+    VTRACE("%s blit with CM took %.2f + %.2f + %.2f + %.2f ms", __FUNCTION__,
+        (t2 - t1)/1000000.0,
+        (t3 - t2)/1000000.0,
+        (t4 - t3)/1000000.0,
+        (t5 - t4)/1000000.0);
+    return status;
+}
+
+// Waits (timeout value -1) for the CM task behind `event`. A NULL event returns
+// immediately; `event` is reset to NULL only after a successful wait.
+void JpegBlitter::syncBlit(BlitEvent &event)
+{
+    const nsecs_t started = systemTime();
+    if (event == NULL)
+        return;
+
+    CmEvent *cmEvent = (CmEvent*)event;
+    const DWORD timeoutMs = -1;
+    if (cmEvent->WaitForTaskFinished(timeoutMs) != CM_SUCCESS) {
+        ETRACE("%s: Failed to sync blit event", __FUNCTION__);
+        return;
+    }
+    event = NULL;
+    VTRACE("%s: syncBlit took %.2f ms", __FUNCTION__, (systemTime()-started)/1000000.0);
+}
+
diff --git a/imagedecoder/JPEGBlitter_img.cpp b/imagedecoder/JPEGBlitter_img.cpp
index d56ba98..d714a14 100644
--- a/imagedecoder/JPEGBlitter_img.cpp
+++ b/imagedecoder/JPEGBlitter_img.cpp
@@ -1,6 +1,5 @@
 /* INTEL CONFIDENTIAL
 * Copyright (c) 2013 Intel Corporation.  All rights reserved.
-* Copyright (c) Imagination Technologies Limited, UK
 *
 * The source code contained or described herein and all documents
 * related to the source code ("Material") are owned by Intel
@@ -28,8 +27,40 @@
 
 #include "JPEGBlitter.h"
 
-JpegDecodeStatus JpegBlitter::blit(RenderTarget &src, RenderTarget &dst)
+JpegDecodeStatus JpegBlitter::blit(RenderTarget &src, RenderTarget &dst, int scale_factor)
 {
     return JD_OUTPUT_FORMAT_UNSUPPORTED;
 }
 
+JpegDecodeStatus JpegBlitter::blitToLinearRgba(RenderTarget &src, uint8_t *sysmem, uint32_t width, uint32_t height, BlitEvent &event, int scale_factor)
+{
+    return JD_OUTPUT_FORMAT_UNSUPPORTED; // stub: no linear-RGBA blit in this implementation; arguments (including `event`) are untouched
+}
+JpegDecodeStatus JpegBlitter::getRgbaTile(RenderTarget &src,
+                                     uint8_t *sysmem,
+                                     int left, int top, int width, int height, int scale_factor)
+{
+    return JD_OUTPUT_FORMAT_UNSUPPORTED; // stub: tile extraction is not implemented here; `sysmem` is not written
+}
+void JpegBlitter::init(JpegDecoder& /*dec*/)
+{
+    // Do nothing: the blit entry points added next to this one are stubs, so there is no state to set up.
+}
+void JpegBlitter::deinit()
+{
+    // Do nothing: init() above acquires nothing, so there is nothing to release.
+}
+void JpegBlitter::syncBlit(BlitEvent &event)
+{
+    // Do nothing: the stub blit functions here never assign `event`, so there is nothing to wait for.
+}
+JpegDecodeStatus JpegBlitter::blitToCameraSurfaces(RenderTarget &src,
+                                                   buffer_handle_t dst_nv12,
+                                                   buffer_handle_t dst_yuy2,
+                                                   uint8_t *dst_nv21,
+                                                   uint8_t *dst_yv12,
+                                                   uint32_t width, uint32_t height,
+                                                   BlitEvent &event)
+{
+    return JD_OUTPUT_FORMAT_UNSUPPORTED; // stub: no destination buffer is written and `event` is left as passed in
+}
diff --git a/imagedecoder/JPEGCommon.h b/imagedecoder/JPEGCommon.h
index 6df6fcd..790ad35 100644
--- a/imagedecoder/JPEGCommon.h
+++ b/imagedecoder/JPEGCommon.h
@@ -1,6 +1,5 @@
 /* INTEL CONFIDENTIAL
 * Copyright (c) 2013 Intel Corporation.  All rights reserved.
-* Copyright (c) Imagination Technologies Limited, UK
 *
 * The source code contained or described herein and all documents
 * related to the source code ("Material") are owned by Intel
@@ -33,23 +32,66 @@
 #include <va/va_dec_jpeg.h>
 #include <sys/types.h>
 #include <string.h>
+#include <utils/Vector.h>
+
+using namespace android;
 
 #define JPEG_MAX_COMPONENTS 4
 #define JPEG_MAX_QUANT_TABLES 4
 
+#define SURF_TILING_NONE    0
+#define SURF_TILING_X       1
+#define SURF_TILING_Y       2
 
-#define RENDERTARGET_INTERNAL_BUFFER (RenderTarget::ANDROID_GRALLOC + 1)
+extern uint32_t aligned_width(uint32_t width, int tiling);
+extern uint32_t aligned_height(uint32_t height, int tiling);
+
+struct RenderTarget { // describes one surface/buffer handed to the decoder or blitter
+    enum bufType{ // kind of buffer that `handle` refers to
+        KERNEL_DRM,
+        ANDROID_GRALLOC, // used by the blitter for buffer_handle_t destinations
+        INTERNAL_BUF, // used by the blitter for temporary surfaces it creates itself
+        USER_PTR,
+    };
+
+    int width; // surface width; blitter callers pass tiling-aligned values
+    int height; // surface height; blitter callers pass tiling-aligned values
+    int stride; // NOTE(review): callers set width for RGBA/NV12 but width*2 for YUY2 — units unclear, confirm
+    bufType type;
+    int format; // NOTE(review): not set by the blitter code in this change; meaning to be confirmed
+    int pixel_format; // VA FOURCC (e.g. VA_FOURCC_NV12), no longer a HAL_PIXEL_FORMAT value
+    int handle; // buffer identity: a generateHandle() value or a buffer_handle_t cast to int
+    VARectangle rect; // region of the surface passed to vaBlit
+};
 
 struct JpegInfo
 {
-    // in
-    uint8_t *buf;
-    size_t bufsize;
+    // in: use either buf+bufsize or inputs
+    union {
+        struct {
+            uint8_t *buf;
+            uint32_t bufsize;
+        };
+        android::Vector<uint8_t> *inputs;
+    };
+    bool use_vector_input;
+    bool need_header_only;
+    // internal use
+    uint32_t component_order;
+    uint32_t dqt_ind;
+    uint32_t dht_ind;
+    uint32_t scan_ind;
+    bool frame_marker_found;
+    bool soi_parsed;
+    bool sof_parsed;
+    bool dqt_parsed;
+    bool dht_parsed;
+    bool sos_parsed;
+    bool dri_parsed;
     // out
     uint32_t image_width;
     uint32_t image_height;
     uint32_t image_color_fourcc;
-    int      image_pixel_format;
     VAPictureParameterBufferJPEGBaseline picture_param_buf;
     VASliceParameterBufferJPEGBaseline slice_param_buf[JPEG_MAX_COMPONENTS];
     VAIQMatrixBufferJPEGBaseline qmatrix_buf;
@@ -80,10 +122,13 @@
     JD_BLIT_FAILURE,
     JD_ERROR_BITSTREAM,
     JD_RENDER_TARGET_BUSY,
+    JD_IMAGE_TOO_SMALL,
+    JD_INSUFFICIENT_BYTE,
+    JD_UNIMPLEMENTED,
 };
 
 
-inline char * fourcc2str(char * str, uint32_t fourcc)
+inline char * fourcc2str(uint32_t fourcc, char * str = NULL)
 {
     static char tmp[5];
     if (str == NULL) {
@@ -104,6 +149,7 @@
     case VA_FOURCC_422H:
     case VA_FOURCC_422V:
     case VA_FOURCC_YUY2:
+    case VA_FOURCC_UYVY:
         return VA_RT_FORMAT_YUV422;
     case VA_FOURCC_IMC3:
     case VA_FOURCC_YV12:
@@ -112,12 +158,16 @@
     case VA_FOURCC_444P:
         return VA_RT_FORMAT_YUV444;
     case VA_FOURCC_411P:
+    case VA_FOURCC_411R:
         return VA_RT_FORMAT_YUV411;
+    case VA_FOURCC('4','0','0','P'):
+        return VA_RT_FORMAT_YUV400;
     case VA_FOURCC_BGRA:
     case VA_FOURCC_ARGB:
     case VA_FOURCC_RGBA:
         return VA_RT_FORMAT_RGB32;
     default:
+        // Add if needed
         return -1;
     }
 }
@@ -141,6 +191,10 @@
         return VA_FOURCC_411P;
     }
     else if (h1 == 1 && h2 == 1 && h3 == 1 &&
+            v1 == 4 && v2 == 1 && v3 == 1) {
+        return VA_FOURCC_411R;
+    }
+    else if (h1 == 1 && h2 == 1 && h3 == 1 &&
             v1 == 2 && v2 == 1 && v3 == 1) {
         return VA_FOURCC_422V;
     }
@@ -168,18 +222,23 @@
     case VA_FOURCC_NV12:
     case VA_FOURCC_444P:
     case VA_FOURCC_411P:
+    case VA_FOURCC_411R:
+    case VA_FOURCC('4','0','0','P'):
         return 1;
     case VA_FOURCC_YUY2:
+    case VA_FOURCC_UYVY:
         return 2;
     case VA_FOURCC_BGRA:
     case VA_FOURCC_ARGB:
     case VA_FOURCC_RGBA:
         return 4;
     default:
+        // Add if needed
         return 1;
     }
 }
 
+// Platform dependent
 extern int fourcc2PixelFormat(uint32_t fourcc);
 extern uint32_t pixelFormat2Fourcc(int pixel_format);
 
diff --git a/imagedecoder/JPEGCommon_Gen.h b/imagedecoder/JPEGCommon_Gen.h
index ce3bf08..a07098a 100644
--- a/imagedecoder/JPEGCommon_Gen.h
+++ b/imagedecoder/JPEGCommon_Gen.h
@@ -1,6 +1,5 @@
 /* INTEL CONFIDENTIAL
 * Copyright (c) 2013 Intel Corporation.  All rights reserved.
-* Copyright (c) Imagination Technologies Limited, UK
 *
 * The source code contained or described herein and all documents
 * related to the source code ("Material") are owned by Intel
diff --git a/imagedecoder/JPEGCommon_Img.h b/imagedecoder/JPEGCommon_Img.h
index 3473d20..1b568f9 100644
--- a/imagedecoder/JPEGCommon_Img.h
+++ b/imagedecoder/JPEGCommon_Img.h
@@ -1,6 +1,5 @@
 /* INTEL CONFIDENTIAL
 * Copyright (c) 2013 Intel Corporation.  All rights reserved.
-* Copyright (c) Imagination Technologies Limited, UK
 *
 * The source code contained or described herein and all documents
 * related to the source code ("Material") are owned by Intel
diff --git a/imagedecoder/JPEGDecoder.cpp b/imagedecoder/JPEGDecoder.cpp
index 5e05464..624d226 100644
--- a/imagedecoder/JPEGDecoder.cpp
+++ b/imagedecoder/JPEGDecoder.cpp
@@ -26,7 +26,6 @@
 *    Yao Cheng <yao.cheng@intel.com>
 *
 */
-//#define LOG_NDEBUG 0
 
 #include <va/va.h>
 #include <va/va_tpi.h>
@@ -40,8 +39,6 @@
 #endif
 #include <assert.h>
 
-//#define LOG_TAG "ImageDecoder"
-
 #define JPEG_MAX_SETS_HUFFMAN_TABLES 2
 
 #define TABLE_CLASS_DC  0
@@ -67,20 +64,41 @@
             goto label; \
         }
 
-JpegDecoder::JpegDecoder()
+static int handlectr = 0;
+int generateHandle()
+{
+    return handlectr++;
+}
+
+JpegDecoder::JpegDecoder(VADisplay display, VAConfigID vpCfgId, VAContextID vpCtxId, bool use_blitter)
     :mInitialized(false),
-    mDisplay(0),
+    mDisplay(display),
     mConfigId(VA_INVALID_ID),
     mContextId(VA_INVALID_ID),
     mParser(NULL),
-    mBlitter(NULL)
+    mBlitter(NULL),
+    mParserInitialized(false),
+    mDispCreated(false)
 {
     mParser = new CJPEGParse;
-    mBlitter = new JpegBlitter;
-    Display dpy;
-    int va_major_version, va_minor_version;
-    mDisplay = vaGetDisplay(&dpy);
-    vaInitialize(mDisplay, &va_major_version, &va_minor_version);
+    mBsParser = new JpegBitstreamParser;
+    if (!display) {
+        assert(vpCfgId == VA_INVALID_ID);
+        assert(vpCtxId == VA_INVALID_ID);
+        assert(use_blitter == false);
+        Display dpy;
+        int va_major_version, va_minor_version;
+        mDisplay = vaGetDisplay(&dpy);
+        vaInitialize(mDisplay, &va_major_version, &va_minor_version);
+        mDispCreated = true;
+    }
+    if (use_blitter) {
+        assert(display != NULL);
+        assert(vpCfgId != VA_INVALID_ID);
+        assert(vpCtxId != VA_INVALID_ID);
+        mBlitter = new JpegBlitter(display, vpCfgId,vpCtxId);
+    }
+    VTRACE("%s CTOR succeded", __FUNCTION__);
 }
 JpegDecoder::~JpegDecoder()
 {
@@ -88,73 +106,82 @@
         WTRACE("Freeing JpegDecoder: not destroyed yet. Force destroy resource");
         deinit();
     }
+    if (mBlitter)
+        mBlitter->deinit();
     delete mBlitter;
-    vaTerminate(mDisplay);
+    if (mDispCreated)
+        vaTerminate(mDisplay);
     delete mParser;
+    delete mBsParser;
+    VTRACE("%s DTOR succeded", __FUNCTION__);
 }
 
 JpegDecoder::MapHandle JpegDecoder::mapData(RenderTarget &target, void ** data, uint32_t * offsets, uint32_t * pitches)
 {
-    JpegDecoder::MapHandle handle;
-    handle.img = NULL;
-    handle.valid = false;
+    VAImage *img = NULL;
     VASurfaceID surf_id = getSurfaceID(target);
     if (surf_id != VA_INVALID_ID) {
-        handle.img = new VAImage();
-        if (handle.img == NULL) {
+        img = new VAImage();
+        if (img == NULL) {
             ETRACE("%s: create VAImage fail", __FUNCTION__);
-            return handle;
+            return 0;
         }
         VAStatus st;
-        st = vaDeriveImage(mDisplay, surf_id, handle.img);
+        st = vaDeriveImage(mDisplay, surf_id, img);
         if (st != VA_STATUS_SUCCESS) {
-            delete handle.img;
-            handle.img = NULL;
+            delete img;
+            img = NULL;
             ETRACE("%s: vaDeriveImage fail %d", __FUNCTION__, st);
-            return handle;
+            return 0;
         }
-        st = vaMapBuffer(mDisplay, handle.img->buf, data);
+        st = vaMapBuffer(mDisplay, img->buf, data);
         if (st != VA_STATUS_SUCCESS) {
-            vaDestroyImage(mDisplay, handle.img->image_id);
-            delete handle.img;
-            handle.img = NULL;
+            vaDestroyImage(mDisplay, img->image_id);
+            delete img;
+            img = NULL;
             ETRACE("%s: vaMapBuffer fail %d", __FUNCTION__, st);
-            return handle;
+            return 0;
         }
-        handle.valid = true;
-        offsets[0] = handle.img->offsets[0];
-        offsets[1] = handle.img->offsets[1];
-        offsets[2] = handle.img->offsets[2];
-        pitches[0] = handle.img->pitches[0];
-        pitches[1] = handle.img->pitches[1];
-        pitches[2] = handle.img->pitches[2];
-        return handle;
+        offsets[0] = img->offsets[0];
+        offsets[1] = img->offsets[1];
+        offsets[2] = img->offsets[2];
+        pitches[0] = img->pitches[0];
+        pitches[1] = img->pitches[1];
+        pitches[2] = img->pitches[2];
+        VTRACE("%s: successfully mapped RenderTarget %p, handle %d, data=%p, offsets=[%u,%u,%u], pitches=[%u,%u,%u], size=%u, %ux%u, to handle.img %p",
+            __FUNCTION__, &target, target.handle, *data, offsets[0], offsets[1], offsets[2],
+            pitches[0], pitches[1], pitches[2], img->data_size,
+            img->width, img->height, img);
+
+        return (uint32_t)img;
     }
     ETRACE("%s: get Surface ID fail", __FUNCTION__);
-    return handle;
+    return 0;
 }
 
 void JpegDecoder::unmapData(RenderTarget &target, JpegDecoder::MapHandle maphandle)
 {
-    if (maphandle.valid == false)
-        return;
-    if (maphandle.img != NULL) {
-        vaUnmapBuffer(mDisplay, maphandle.img->buf);
-        vaDestroyImage(mDisplay, maphandle.img->image_id);
-        delete maphandle.img;
+    if (maphandle != 0) {
+        vaUnmapBuffer(mDisplay, ((VAImage*)maphandle)->buf);
+        vaDestroyImage(mDisplay, ((VAImage*)maphandle)->image_id);
+        VTRACE("%s deleting VAImage %p", __FUNCTION__, ((VAImage*)maphandle));
+        delete ((VAImage*)maphandle);
     }
 }
 
 JpegDecodeStatus JpegDecoder::init(int w, int h, RenderTarget **targets, int num)
 {
-    if (mInitialized)
+    if (mInitialized) {
+        VTRACE("%s already initialized", __FUNCTION__);
         return JD_ALREADY_INITIALIZED;
+    }
     Mutex::Autolock autoLock(mLock);
-    mBlitter->setDecoder(*this);
     if (!mInitialized) {
+        nsecs_t now = systemTime();
         mGrallocSurfaceMap.clear();
         mDrmSurfaceMap.clear();
         mNormalSurfaceMap.clear();
+        mUserptrSurfaceMap.clear();
         VAStatus st;
         VASurfaceID surfid;
         for (int i = 0; i < num; ++i) {
@@ -164,6 +191,8 @@
                     __FUNCTION__, targets[i]->handle);
                 return JD_RESOURCE_FAILURE;
             }
+            VTRACE("%s successfully created surface %u for renderTarget %p, handle %d",
+                __FUNCTION__, surfid, targets[i], targets[i]->handle);
         }
         VAConfigAttrib attrib;
 
@@ -182,14 +211,17 @@
         size_t gmsize = mGrallocSurfaceMap.size();
         size_t dmsize = mDrmSurfaceMap.size();
         size_t nmsize = mNormalSurfaceMap.size();
+        size_t umsize = mUserptrSurfaceMap.size();
         VASurfaceID *surfaces = new VASurfaceID[gmsize + dmsize + nmsize];
-        for (size_t i = 0; i < gmsize + dmsize + nmsize; ++i) {
+        for (size_t i = 0; i < gmsize + dmsize + nmsize + umsize; ++i) {
             if (i < gmsize)
                 surfaces[i] = mGrallocSurfaceMap.valueAt(i);
             else if (i < gmsize + dmsize)
                 surfaces[i] = mDrmSurfaceMap.valueAt(i - gmsize);
-            else
+            else if (i < gmsize + dmsize + nmsize)
                 surfaces[i] = mNormalSurfaceMap.valueAt(i - gmsize - dmsize);
+            else
+                surfaces[i] = mUserptrSurfaceMap.valueAt(i - gmsize - dmsize - nmsize);
         }
         st = vaCreateContext(mDisplay, mConfigId,
             w, h,
@@ -202,34 +234,121 @@
             return JD_INITIALIZATION_ERROR;
         }
 
-        VTRACE("vaconfig = %u, vacontext = %u", mConfigId, mContextId);
+        VTRACE("JpegDecoder::init took %.2f ms", (systemTime() - now)/1000000.0);
         mInitialized = true;
     }
     return JD_SUCCESS;
 }
 
-JpegDecodeStatus JpegDecoder::blit(RenderTarget &src, RenderTarget &dst)
+JpegDecodeStatus JpegDecoder::blit(RenderTarget &src, RenderTarget &dst, int scale_factor)
 {
-    return mBlitter->blit(src, dst);
+    if (mBlitter) {
+        mBlitter->init(*this);
+        return mBlitter->blit(src, dst, scale_factor);
+    }
+    else
+        return JD_BLIT_FAILURE;
 }
 
-JpegDecodeStatus JpegDecoder::parse(JpegInfo &jpginfo)
+JpegDecodeStatus JpegDecoder::getRgbaTile(RenderTarget &src,
+                                     uint8_t *sysmem,
+                                     int left, int top, int width, int height, int scale_factor)
 {
-    uint32_t component_order = 0 ;
-    uint32_t dqt_ind = 0;
-    uint32_t dht_ind = 0;
-    uint32_t scan_ind = 0;
-    bool frame_marker_found = false;
+    if (mBlitter) {
+        nsecs_t now = systemTime();
+        mBlitter->init(*this);
+        nsecs_t t1 = systemTime();
+        JpegDecodeStatus st = mBlitter->getRgbaTile(src, sysmem, left, top, width, height, scale_factor);
+        VTRACE("Decoder::%s took %.2f + %.2f ms", __FUNCTION__,
+            (t1-now)/1000000.0, (systemTime()-t1)/1000000.0);
+        return st;
+    }
+    else
+        return JD_BLIT_FAILURE;
+
+}
+
+JpegDecodeStatus JpegDecoder::blitToLinearRgba(RenderTarget &src, uint8_t *sysmem, uint32_t width, uint32_t height, BlitEvent &event, int scale_factor)
+{
+    if (mBlitter) {
+        nsecs_t now = systemTime();
+        mBlitter->init(*this);
+        nsecs_t t1 = systemTime();
+        JpegDecodeStatus st = mBlitter->blitToLinearRgba(src, sysmem, width, height, event, scale_factor);
+        VTRACE("Decoder::%s took %.2f + %.2f ms", __FUNCTION__,
+            (t1-now)/1000000.0, (systemTime()-t1)/1000000.0);
+        return st;
+    }
+    else
+        return JD_BLIT_FAILURE;
+}
+
+JpegDecodeStatus JpegDecoder::blitToCameraSurfaces(RenderTarget &src,
+                                                   buffer_handle_t dst_nv12,
+                                                   buffer_handle_t dst_yuy2,
+                                                   uint8_t *dst_nv21,
+                                                   uint8_t *dst_yv12,
+                                                   uint32_t width, uint32_t height, BlitEvent &event)
+{
+    if (mBlitter) {
+        nsecs_t now = systemTime();
+        mBlitter->init(*this);
+        nsecs_t t1 = systemTime();
+        JpegDecodeStatus st = mBlitter->blitToCameraSurfaces(src, dst_nv12, dst_yuy2, dst_nv21, dst_yv12, width, height, event);
+        VTRACE("Decoder::%s took %.2f + %.2f ms", __FUNCTION__,
+            (t1-now)/1000000.0, (systemTime()-t1)/1000000.0);
+        return st;
+    }
+    else
+        return JD_BLIT_FAILURE;
+}
+
+void JpegDecoder::syncBlit(BlitEvent &event)
+{
+    assert(mBlitter);
+    mBlitter->syncBlit(event);
+}
+
+JpegDecodeStatus JpegDecoder::parseHeader(JpegInfo &jpginfo)
+{
+#define ROLLBACK_IF_FAIL(stmt) \
+    do { \
+        if (!(stmt)) { /* parenthesized so any expression can be negated safely */ \
+            VTRACE("%s::%d, parser failed at offset %u, remaining bytes %u, total bytes %u", \
+                __FUNCTION__, __LINE__, mBsParser->getByteOffset(), mBsParser->getRemainingBytes(), \
+                bufsize); /* bufsize is uint32_t, hence %u */ \
+            goto rollback; \
+        } \
+    } while(0);
+
     int i;
+    uint32_t bufsize;
+    if (!mParserInitialized) {
+        Mutex::Autolock autoLock(mLock);
+        if (!mParserInitialized) {
+            if (jpginfo.use_vector_input)
+                mBsParser->set(jpginfo.inputs);
+            else
+                mBsParser->set(jpginfo.buf, jpginfo.bufsize);
+            mParserInitialized = true;
+        }
+    }
+    if (jpginfo.use_vector_input)
+        bufsize = jpginfo.inputs->size();
+    else
+        bufsize = jpginfo.bufsize;
 
-    parserInitialize(mParser, jpginfo.buf, jpginfo.bufsize);
+    uint8_t marker;
+    uint32_t rollbackoff;
+    rollbackoff = mBsParser->getByteOffset();
+    ROLLBACK_IF_FAIL(mBsParser->tryGetNextMarker(&marker));
 
-    uint8_t marker = mParser->getNextMarker(mParser);
-
-    while (marker != CODE_EOI &&( !mParser->endOfBuffer(mParser))) {
+    while (marker != CODE_EOI &&( !mBsParser->endOfBuffer())) {
         switch (marker) {
             case CODE_SOI: {
-                 jpginfo.soi_offset = mParser->getByteOffset(mParser) - 2;
+                VTRACE("%s SOI at 0x%08x", __FUNCTION__, mBsParser->getByteOffset());
+                jpginfo.soi_offset = mBsParser->getByteOffset() - 2;
+                jpginfo.soi_parsed = true;
                 break;
             }
             // If the marker is an APP marker skip over the data
@@ -249,22 +368,28 @@
             case CODE_APP13:
             case CODE_APP14:
             case CODE_APP15: {
-
-                uint32_t bytes_to_burn = mParser->readBytes(mParser, 2) - 2;
-                mParser->burnBytes(mParser, bytes_to_burn);
-                    break;
+                VTRACE("%s APP %x at 0x%08x", __FUNCTION__, marker, mBsParser->getByteOffset());
+                uint32_t bytes_to_burn;
+                ROLLBACK_IF_FAIL(mBsParser->tryReadBytes(&bytes_to_burn, 2));
+                bytes_to_burn -= 2;
+                ROLLBACK_IF_FAIL(mBsParser->tryBurnBytes(bytes_to_burn));
+                break;
             }
             // Store offset to DQT data to avoid parsing bitstream in user mode
             case CODE_DQT: {
-                if (dqt_ind < 4) {
-                    jpginfo.dqt_byte_offset[dqt_ind] = mParser->getByteOffset(mParser) - jpginfo.soi_offset;
-                    dqt_ind++;
-                    uint32_t bytes_to_burn = mParser->readBytes(mParser, 2 ) - 2;
-                    mParser->burnBytes( mParser, bytes_to_burn );
+                VTRACE("%s DQT at 0x%08x", __FUNCTION__, mBsParser->getByteOffset());
+                if (jpginfo.dqt_ind < 4) {
+                    jpginfo.dqt_byte_offset[jpginfo.dqt_ind] = mBsParser->getByteOffset() - jpginfo.soi_offset;
+                    uint32_t bytes_to_burn;
+                    ROLLBACK_IF_FAIL(mBsParser->tryReadBytes(&bytes_to_burn, 2));
+                    bytes_to_burn -= 2;
+                    ROLLBACK_IF_FAIL(mBsParser->tryBurnBytes(bytes_to_burn));
+                    jpginfo.dqt_ind++; // count the table only after it is fully consumed, so a rollback/retry cannot record it twice
                 } else {
                     ETRACE("ERROR: Decoder does not support more than 4 Quant Tables\n");
-                    return JD_ERROR_BITSTREAM;
+                    return JD_CODEC_UNSUPPORTED;
                 }
+                jpginfo.dqt_parsed = true;
                 break;
             }
             // Throw exception for all SOF marker other than SOF0
@@ -284,20 +409,31 @@
                 ETRACE("ERROR: unsupport SOF\n");
                 break;
             }
-            // Parse component information in SOF marker
             case CODE_SOF_BASELINE: {
-                frame_marker_found = true;
-
-                mParser->burnBytes(mParser, 2); // Throw away frame header length
-                uint8_t sample_precision = mParser->readNextByte(mParser);
+                VTRACE("%s SOF_BASELINE at 0x%08x", __FUNCTION__, mBsParser->getByteOffset());
+                ROLLBACK_IF_FAIL((mBsParser->getRemainingBytes() >= 10));
+                jpginfo.frame_marker_found = true;
+                bool r;
+                ROLLBACK_IF_FAIL(mBsParser->tryBurnBytes(2)); // Throw away frame header length 
+                uint8_t sample_precision;
+                ROLLBACK_IF_FAIL(mBsParser->tryReadNextByte(&sample_precision));
                 if (sample_precision != 8) {
                     ETRACE("sample_precision is not supported\n");
-                    return JD_ERROR_BITSTREAM;
+                    return JD_INPUT_FORMAT_UNSUPPORTED;
                 }
                 // Extract pic width and height
-                jpginfo.picture_param_buf.picture_height = mParser->readBytes(mParser, 2);
-                jpginfo.picture_param_buf.picture_width = mParser->readBytes(mParser, 2);
-                jpginfo.picture_param_buf.num_components = mParser->readNextByte(mParser);
+                uint32_t w, h;
+                ROLLBACK_IF_FAIL(mBsParser->tryReadBytes(&h, 2));
+                ROLLBACK_IF_FAIL(mBsParser->tryReadBytes(&w, 2));
+                ROLLBACK_IF_FAIL(mBsParser->tryReadNextByte(&jpginfo.picture_param_buf.num_components));
+                jpginfo.picture_param_buf.picture_width = w;
+                jpginfo.picture_param_buf.picture_height = h;
+                VTRACE("%s pic wxh=%ux%u, %u components", __FUNCTION__, 
+                    jpginfo.picture_param_buf.picture_width,
+                    jpginfo.picture_param_buf.picture_height,
+                    jpginfo.picture_param_buf.num_components);
+
+                ROLLBACK_IF_FAIL((mBsParser->getRemainingBytes() >= jpginfo.picture_param_buf.num_components * 3));
 
                 if (jpginfo.picture_param_buf.num_components > JPEG_MAX_COMPONENTS) {
                     ETRACE("ERROR: reached max components\n");
@@ -308,110 +444,225 @@
                     VTRACE("PERFORMANCE: %ux%u JPEG will decode faster with SW\n",
                         jpginfo.picture_param_buf.picture_width,
                         jpginfo.picture_param_buf.picture_height);
-                    return JD_ERROR_BITSTREAM;
+                    return JD_IMAGE_TOO_SMALL;
                 }
                 uint8_t comp_ind = 0;
                 for (comp_ind = 0; comp_ind < jpginfo.picture_param_buf.num_components; comp_ind++) {
-                    jpginfo.picture_param_buf.components[comp_ind].component_id = mParser->readNextByte(mParser);
-
-                    uint8_t hv_sampling = mParser->readNextByte(mParser);
+                    ROLLBACK_IF_FAIL(mBsParser->tryReadNextByte(&jpginfo.picture_param_buf.components[comp_ind].component_id));
+                    uint8_t hv_sampling;
+                    ROLLBACK_IF_FAIL(mBsParser->tryReadNextByte(&hv_sampling));
                     jpginfo.picture_param_buf.components[comp_ind].h_sampling_factor = hv_sampling >> 4;
                     jpginfo.picture_param_buf.components[comp_ind].v_sampling_factor = hv_sampling & 0xf;
-                    jpginfo.picture_param_buf.components[comp_ind].quantiser_table_selector = mParser->readNextByte(mParser);
+                    ROLLBACK_IF_FAIL(mBsParser->tryReadNextByte(&jpginfo.picture_param_buf.components[comp_ind].quantiser_table_selector));
                 }
+                jpginfo.image_width = jpginfo.picture_param_buf.picture_width;
+                jpginfo.image_height = jpginfo.picture_param_buf.picture_height;
+                jpginfo.image_color_fourcc = sampFactor2Fourcc(jpginfo.picture_param_buf.components[0].h_sampling_factor,
+                    jpginfo.picture_param_buf.components[1].h_sampling_factor,
+                    jpginfo.picture_param_buf.components[2].h_sampling_factor,
+                    jpginfo.picture_param_buf.components[0].v_sampling_factor,
+                    jpginfo.picture_param_buf.components[1].v_sampling_factor,
+                    jpginfo.picture_param_buf.components[2].v_sampling_factor);
 
+                VTRACE("%s jpg %ux%u, fourcc=%s",
+                    __FUNCTION__, jpginfo.image_width, jpginfo.image_height, fourcc2str(jpginfo.image_color_fourcc));
 
+                if (!jpegColorFormatSupported(jpginfo)) {
+                    ETRACE("%s color format not supported", fourcc2str(jpginfo.image_color_fourcc));
+                    return JD_INPUT_FORMAT_UNSUPPORTED;
+                }
+                jpginfo.sof_parsed = true;
                 break;
             }
-            // Store offset to DHT data to avoid parsing bitstream in user mode
             case CODE_DHT: {
-                if (dht_ind < 4) {
-                    jpginfo.dht_byte_offset[dht_ind] = mParser->getByteOffset(mParser) - jpginfo.soi_offset;
-                    dht_ind++;
-                    uint32_t bytes_to_burn = mParser->readBytes(mParser, 2) - 2;
-                    mParser->burnBytes(mParser, bytes_to_burn );
+                VTRACE("%s DHT at 0x%08x", __FUNCTION__, mBsParser->getByteOffset());
+                if (jpginfo.dht_ind < 4) {
+                    jpginfo.dht_byte_offset[jpginfo.dht_ind] = mBsParser->getByteOffset() - jpginfo.soi_offset;
+                    jpginfo.dht_ind++;
+                    uint32_t bytes_to_burn;
+                    if (!mBsParser->tryReadBytes(&bytes_to_burn, 2)) {
+                        VTRACE("%s failed to read 2 bytes from 0x%08x, remaining 0x%08x, total 0x%08x",
+                            __FUNCTION__, mBsParser->getByteOffset(),
+                            mBsParser->getRemainingBytes(), bufsize);
+                        jpginfo.dht_ind--;
+                        goto rollback;
+                    }
+                    bytes_to_burn -= 2;
+                    if (!mBsParser->tryBurnBytes(bytes_to_burn)) {
+                        VTRACE("%s failed to burn %x bytes from 0x%08x, remaining 0x%08x, total 0x%08x",
+                            __FUNCTION__, bytes_to_burn, mBsParser->getByteOffset(),
+                            mBsParser->getRemainingBytes(), bufsize);
+                        jpginfo.dht_ind--;
+                        goto rollback;
+                    }
                 } else {
                     ETRACE("ERROR: Decoder does not support more than 4 Huff Tables\n");
                     return JD_ERROR_BITSTREAM;
                 }
+                jpginfo.dht_parsed = true;
                 break;
             }
             // Parse component information in SOS marker
             case CODE_SOS: {
-                mParser->burnBytes(mParser, 2);
-                uint32_t component_in_scan = mParser->readNextByte(mParser);
+                VTRACE("%s SOS at 0x%08x", __FUNCTION__, mBsParser->getByteOffset());
+                ROLLBACK_IF_FAIL(mBsParser->tryBurnBytes(2));
+                uint8_t component_in_scan;
+                ROLLBACK_IF_FAIL(mBsParser->tryReadNextByte(&component_in_scan));
                 uint8_t comp_ind = 0;
-
+                ROLLBACK_IF_FAIL((mBsParser->getRemainingBytes() >= 2 * component_in_scan + 3));
                 for (comp_ind = 0; comp_ind < component_in_scan; comp_ind++) {
-                    uint8_t comp_id = mParser->readNextByte(mParser);
+                    uint8_t comp_id;
+                    ROLLBACK_IF_FAIL(mBsParser->tryReadNextByte(&comp_id)); // never match components against an unread id
                     uint8_t comp_data_ind;
                     for (comp_data_ind = 0; comp_data_ind < jpginfo.picture_param_buf.num_components; comp_data_ind++) {
                         if (comp_id == jpginfo.picture_param_buf.components[comp_data_ind].component_id) {
-                            jpginfo.slice_param_buf[scan_ind].components[comp_ind].component_selector = comp_data_ind + 1;
+                            jpginfo.slice_param_buf[jpginfo.scan_ind].components[comp_ind].component_selector = comp_data_ind + 1;
                             break;
                         }
                     }
-                    uint8_t huffman_tables = mParser->readNextByte(mParser);
-                    jpginfo.slice_param_buf[scan_ind].components[comp_ind].dc_table_selector = huffman_tables >> 4;
-                    jpginfo.slice_param_buf[scan_ind].components[comp_ind].ac_table_selector = huffman_tables & 0xf;
+                    uint8_t huffman_tables;
+                    ROLLBACK_IF_FAIL(mBsParser->tryReadNextByte(&huffman_tables));
+                    jpginfo.slice_param_buf[jpginfo.scan_ind].components[comp_ind].dc_table_selector = huffman_tables >> 4;
+                    jpginfo.slice_param_buf[jpginfo.scan_ind].components[comp_ind].ac_table_selector = huffman_tables & 0xf;
                 }
-                uint32_t curr_byte = mParser->readNextByte(mParser); // Ss
+                uint8_t curr_byte;
+                ROLLBACK_IF_FAIL(mBsParser->tryReadNextByte(&curr_byte)); // Ss
                 if (curr_byte != 0) {
-                    ETRACE("ERROR: curr_byte 0x%08x != 0\n", curr_byte);
+                    ETRACE("ERROR: curr_byte 0x%08x (position 0x%08x) != 0\n", curr_byte, mBsParser->getByteOffset());
                     return JD_ERROR_BITSTREAM;
                 }
-                curr_byte = mParser->readNextByte(mParser);  // Se
+                ROLLBACK_IF_FAIL(mBsParser->tryReadNextByte(&curr_byte));  // Se
                 if (curr_byte != 0x3f) {
-                    ETRACE("ERROR: curr_byte 0x%08x != 0x3f\n", curr_byte);
+                    ETRACE("ERROR: curr_byte 0x%08x (position 0x%08x) != 0x3f\n", curr_byte, mBsParser->getByteOffset());
                     return JD_ERROR_BITSTREAM;
                 }
-                curr_byte = mParser->readNextByte(mParser);  // Ah, Al
+                ROLLBACK_IF_FAIL(mBsParser->tryReadNextByte(&curr_byte));  // Ah, Al
                 if (curr_byte != 0) {
-                    ETRACE("ERROR: curr_byte 0x%08x != 0\n", curr_byte);
+                    ETRACE("ERROR: curr_byte 0x%08x (position 0x%08x) != 0\n", curr_byte, mBsParser->getByteOffset());
                     return JD_ERROR_BITSTREAM;
                 }
                 // Set slice control variables needed
-                jpginfo.slice_param_buf[scan_ind].slice_data_offset = mParser->getByteOffset(mParser) - jpginfo.soi_offset;
-                jpginfo.slice_param_buf[scan_ind].num_components = component_in_scan;
-                if (scan_ind) {
+                jpginfo.slice_param_buf[jpginfo.scan_ind].slice_data_offset = mBsParser->getByteOffset() - jpginfo.soi_offset;
+                jpginfo.slice_param_buf[jpginfo.scan_ind].num_components = component_in_scan;
+                jpginfo.sos_parsed = true;
+                if (jpginfo.scan_ind) {
                     /* If there is more than one scan, the slice for all but the final scan should only run up to the beginning of the next scan */
-                    jpginfo.slice_param_buf[scan_ind - 1].slice_data_size =
-                        (jpginfo.slice_param_buf[scan_ind].slice_data_offset - jpginfo.slice_param_buf[scan_ind - 1].slice_data_offset );;
+                    jpginfo.slice_param_buf[jpginfo.scan_ind - 1].slice_data_size =
+                        (jpginfo.slice_param_buf[jpginfo.scan_ind].slice_data_offset - jpginfo.slice_param_buf[jpginfo.scan_ind - 1].slice_data_offset );;
                     }
-                    scan_ind++;
+                    jpginfo.scan_ind++;
                     jpginfo.scan_ctrl_count++;   // gsDXVA2Globals.uiScanCtrlCount
                     break;
                 }
             case CODE_DRI: {
-                uint32_t size =  mParser->readBytes(mParser, 2);
-                jpginfo.slice_param_buf[scan_ind].restart_interval =  mParser->readBytes(mParser, 2);
-                mParser->burnBytes(mParser, (size - 4));
+                rollbackoff = mBsParser->getByteOffset() - 2;
+                VTRACE("%s DRI at 0x%08x", __FUNCTION__, mBsParser->getByteOffset());
+                uint32_t size;
+                ROLLBACK_IF_FAIL(mBsParser->tryReadBytes(&size, 2));
+                uint32_t ri;
+                ROLLBACK_IF_FAIL(mBsParser->tryReadBytes(&ri, 2));
+                jpginfo.slice_param_buf[jpginfo.scan_ind].restart_interval = ri;
+                ROLLBACK_IF_FAIL(mBsParser->tryBurnBytes(size - 4));
+                jpginfo.dri_parsed = true;
                 break;
             }
             default:
                 break;
         }
-
-        marker = mParser->getNextMarker(mParser);
-        // If the EOI code is found, store the byte offset before the parsing finishes
-        if( marker == CODE_EOI ) {
-            jpginfo.eoi_offset = mParser->getByteOffset(mParser);
+        if (jpginfo.need_header_only &&
+            jpginfo.soi_parsed && jpginfo.sos_parsed &&
+            jpginfo.sof_parsed && jpginfo.dqt_parsed &&
+            jpginfo.dht_parsed) {
+            VTRACE("%s: for header_only, we've got all what we need. return now", __FUNCTION__);
+            return JD_SUCCESS;
+        }
+        else {
+            VTRACE("%s: soi %d, sos %d, sof %d, dqt %d, dht %d, dri %d, remaining %u", __FUNCTION__,
+                jpginfo.soi_parsed,
+                jpginfo.sos_parsed,
+                jpginfo.sof_parsed,
+                jpginfo.dqt_parsed,
+                jpginfo.dht_parsed,
+                jpginfo.dri_parsed,
+                mBsParser->getRemainingBytes());
+        }
+        rollbackoff = mBsParser->getByteOffset();
+        if (!mBsParser->tryGetNextMarker(&marker)) {
+            VTRACE("%s: can't get next marker, offset 0x%08x, need_header_only=%d",
+                __FUNCTION__,
+                mBsParser->getByteOffset(),
+                jpginfo.need_header_only);
+            if (jpginfo.need_header_only) {
+                mBsParser->trySetByteOffset(rollbackoff);
+                return JD_INSUFFICIENT_BYTE;
+            }
+            else {
+                return JD_SUCCESS;
+            }
+        }
+        else if (marker == 0) {
+            VTRACE("%s: got non-marker %x at offset 0x%08x", __FUNCTION__, marker, mBsParser->getByteOffset());
+            return JD_SUCCESS;
         }
 
+        // If the EOI code is found, store the byte offset before the parsing finishes
+        if( marker == CODE_EOI ) {
+            jpginfo.eoi_offset = mBsParser->getByteOffset();
+            VTRACE("%s: got EOI at 0x%08x, stop parsing now", __FUNCTION__, jpginfo.eoi_offset);
+            return JD_SUCCESS;
+        }
     }
+    return JD_SUCCESS;
+rollback:
+    mBsParser->trySetByteOffset(rollbackoff);
+    return JD_INSUFFICIENT_BYTE;
+}
 
-    jpginfo.quant_tables_num = dqt_ind;
-    jpginfo.huffman_tables_num = dht_ind;
+JpegDecodeStatus JpegDecoder::parse(JpegInfo &jpginfo)
+{
+    if (!mParserInitialized) {
+        Mutex::Autolock autoLock(mLock);
+        if (!mParserInitialized) {
+            if (jpginfo.use_vector_input)
+                mBsParser->set(jpginfo.inputs);
+            else
+                mBsParser->set(jpginfo.buf, jpginfo.bufsize);
+            mParserInitialized = true;
+        }
+    }
+    JpegDecodeStatus st = parseHeader(jpginfo);
+    if (st) {
+        if (st != JD_INSUFFICIENT_BYTE)
+            ETRACE("%s header parsing failure: %d", __FUNCTION__, st);
+        return st;
+    }
+    if (jpginfo.need_header_only)
+        return JD_SUCCESS;
+    uint32_t bufsize;
+    if (jpginfo.use_vector_input) {
+        mBsParser->set(jpginfo.inputs);
+        bufsize = jpginfo.inputs->size();
+    }
+    else {
+        mBsParser->set(jpginfo.buf, jpginfo.bufsize);
+        bufsize = jpginfo.bufsize;
+    }
+    assert(mParserInitialized);
+    assert (jpginfo.soi_parsed && jpginfo.sos_parsed &&
+        jpginfo.sof_parsed && jpginfo.dqt_parsed &&
+        jpginfo.dht_parsed);
+    jpginfo.quant_tables_num = jpginfo.dqt_ind;
+    jpginfo.huffman_tables_num = jpginfo.dht_ind;
 
     /* The slice for the last scan should run up to the end of the picture */
     if (jpginfo.eoi_offset) {
-        jpginfo.slice_param_buf[scan_ind - 1].slice_data_size = (jpginfo.eoi_offset - jpginfo.slice_param_buf[scan_ind - 1].slice_data_offset);
+        jpginfo.slice_param_buf[jpginfo.scan_ind - 1].slice_data_size = (jpginfo.eoi_offset - jpginfo.slice_param_buf[jpginfo.scan_ind - 1].slice_data_offset);
     }
     else {
-        jpginfo.slice_param_buf[scan_ind - 1].slice_data_size = (jpginfo.bufsize - jpginfo.slice_param_buf[scan_ind - 1].slice_data_offset);
+        jpginfo.slice_param_buf[jpginfo.scan_ind - 1].slice_data_size = (bufsize - jpginfo.slice_param_buf[jpginfo.scan_ind - 1].slice_data_offset);
     }
     // throw AppException if SOF0 isn't found
-    if (!frame_marker_found) {
+    if (!jpginfo.frame_marker_found) {
         ETRACE("EEORR: Reached end of bitstream while trying to parse headers\n");
         return JD_ERROR_BITSTREAM;
     }
@@ -419,7 +670,7 @@
     JpegDecodeStatus status = parseTableData(jpginfo);
     if (status != JD_SUCCESS) {
         ETRACE("ERROR: Parsing table data returns %d", status);
-        return JD_ERROR_BITSTREAM;
+        return status;
     }
 
     jpginfo.image_width = jpginfo.picture_param_buf.picture_width;
@@ -430,81 +681,94 @@
         jpginfo.picture_param_buf.components[0].v_sampling_factor,
         jpginfo.picture_param_buf.components[1].v_sampling_factor,
         jpginfo.picture_param_buf.components[2].v_sampling_factor);
-    jpginfo.image_pixel_format = fourcc2PixelFormat(jpginfo.image_color_fourcc);
 
-    VTRACE("%s jpg %ux%u, fourcc=%s, pixelformat=0x%x",
-        __FUNCTION__, jpginfo.image_width, jpginfo.image_height, fourcc2str(NULL, jpginfo.image_color_fourcc),
-        jpginfo.image_pixel_format);
+    VTRACE("%s jpg %ux%u, fourcc=%s",
+        __FUNCTION__, jpginfo.image_width, jpginfo.image_height, fourcc2str(jpginfo.image_color_fourcc));
 
-    if (!jpegColorFormatSupported(jpginfo))
+    if (!jpegColorFormatSupported(jpginfo)) {
+        ETRACE("%s color format not supported", fourcc2str(jpginfo.image_color_fourcc));
         return JD_INPUT_FORMAT_UNSUPPORTED;
+    }
     return JD_SUCCESS;
 }
 
 JpegDecodeStatus JpegDecoder::createSurfaceFromRenderTarget(RenderTarget &target, VASurfaceID *surfid)
 {
-    if (target.type == RENDERTARGET_INTERNAL_BUFFER) {
-        JpegDecodeStatus st = createSurfaceInternal(target.width,
-            target.height,
-            target.pixel_format,
-            target.handle,
-            surfid);
-        if (st != JD_SUCCESS)
-            return st;
-        mNormalSurfaceMap.add(target.handle, *surfid);
-        VTRACE("%s added surface %u (internal buffer id %d) to SurfaceList",
-            __PRETTY_FUNCTION__, *surfid, target.handle);
-    }
-    else {
-        switch (target.type) {
-        case RenderTarget::KERNEL_DRM:
-            {
-                JpegDecodeStatus st = createSurfaceDrm(target.width,
-                    target.height,
-                    target.pixel_format,
-                    (unsigned long)target.handle,
-                    target.stride,
-                    surfid);
-                if (st != JD_SUCCESS)
-                    return st;
-                mDrmSurfaceMap.add((unsigned long)target.handle, *surfid);
-                VTRACE("%s added surface %u (Drm handle %d) to DrmSurfaceMap",
-                    __PRETTY_FUNCTION__, *surfid, target.handle);
-            }
-            break;
-        case RenderTarget::ANDROID_GRALLOC:
-            {
-                JpegDecodeStatus st = createSurfaceGralloc(target.width,
-                    target.height,
-                    target.pixel_format,
-                    (buffer_handle_t)target.handle,
-                    target.stride,
-                    surfid);
-                if (st != JD_SUCCESS)
-                    return st;
-                mGrallocSurfaceMap.add((buffer_handle_t)target.handle, *surfid);
-                VTRACE("%s added surface %u (Gralloc handle %d) to DrmSurfaceMap",
-                    __PRETTY_FUNCTION__, *surfid, target.handle);
-            }
-            break;
-        default:
-            return JD_RENDER_TARGET_TYPE_UNSUPPORTED;
+    // Creates a surface for |target| through the helper matching target.type and, on success, records it in that type's handle->surface map.
+    switch (target.type) {
+    case RenderTarget::KERNEL_DRM:
+        {
+            JpegDecodeStatus st = createSurfaceDrm(target.width,
+                target.height,
+                target.pixel_format,
+                (unsigned long)target.handle,
+                target.stride,
+                surfid);
+            if (st != JD_SUCCESS)
+                return st;
+            mDrmSurfaceMap.add((unsigned long)target.handle, *surfid);
+            VTRACE("%s added surface %u (Drm handle %d) to DrmSurfaceMap", __PRETTY_FUNCTION__, *surfid, target.handle);
         }
+        break;
+    case RenderTarget::ANDROID_GRALLOC:
+        {
+            JpegDecodeStatus st = createSurfaceGralloc(target.width,
+                target.height,
+                target.pixel_format,
+                (buffer_handle_t)target.handle,
+                target.stride,
+                surfid);
+            if (st != JD_SUCCESS)
+                return st;
+            mGrallocSurfaceMap.add((buffer_handle_t)target.handle, *surfid);
+            // The trace names the Gralloc map, which is where the surface was just stored.
+            VTRACE("%s added surface %u (Gralloc handle %d) to GrallocSurfaceMap", __PRETTY_FUNCTION__, *surfid, target.handle);
+        }
+        break;
+    case RenderTarget::INTERNAL_BUF:
+        {
+            JpegDecodeStatus st = createSurfaceInternal(target.width,
+                target.height,
+                target.pixel_format,
+                target.handle,
+                surfid);
+            if (st != JD_SUCCESS)
+                return st;
+            mNormalSurfaceMap.add(target.handle, *surfid);
+            VTRACE("%s added surface %u (internal buffer id %d) to SurfaceList",
+                __PRETTY_FUNCTION__, *surfid, target.handle);
+        }
+        break;
+    case RenderTarget::USER_PTR:
+        {
+            JpegDecodeStatus st = createSurfaceUserptr(target.width,
+                target.height,
+                target.pixel_format,
+                (uint8_t*)target.handle,
+                surfid);
+            if (st != JD_SUCCESS)
+                return st;
+            mUserptrSurfaceMap.add(target.handle, *surfid);
+            // The trace names the user-pointer map, which is where the surface was just stored.
+            VTRACE("%s added surface %u (user pointer handle %d) to UserptrSurfaceMap", __PRETTY_FUNCTION__, *surfid, target.handle);
+        }
+        break;
+    default:
+        return JD_RENDER_TARGET_TYPE_UNSUPPORTED;
     }
     return JD_SUCCESS;
 }
 
-JpegDecodeStatus JpegDecoder::createSurfaceInternal(int width, int height, int pixel_format, int handle, VASurfaceID *surf_id)
+JpegDecodeStatus JpegDecoder::createSurfaceInternal(int width, int height, uint32_t fourcc, int handle, VASurfaceID *surf_id)
 {
     VAStatus va_status;
     VASurfaceAttrib attrib;
     attrib.type = VASurfaceAttribPixelFormat;
     attrib.flags = VA_SURFACE_ATTRIB_SETTABLE;
     attrib.value.type = VAGenericValueTypeInteger;
-    uint32_t fourcc = pixelFormat2Fourcc(pixel_format);
     uint32_t vaformat = fourcc2VaFormat(fourcc);
     attrib.value.value.i = fourcc;
-    VTRACE("enter %s, pixel_format 0x%x, fourcc %s", __FUNCTION__, pixel_format, fourcc2str(NULL, fourcc));
+    VTRACE("enter %s, fourcc 0x%x, fourcc %s", __FUNCTION__, fourcc, fourcc2str(fourcc));
     va_status = vaCreateSurfaces(mDisplay,
                                 vaformat,
                                 width,
@@ -514,22 +778,48 @@
                                 &attrib,
                                 1);
     if (va_status != VA_STATUS_SUCCESS) {
-        ETRACE("%s: createSurface (format %u, fourcc %s) returns %d", __PRETTY_FUNCTION__, vaformat, fourcc2str(NULL, fourcc), va_status);
+        ETRACE("%s: createSurface (format %u, fourcc %s) returns %d", __PRETTY_FUNCTION__, vaformat, fourcc2str(fourcc), va_status);
         return JD_RESOURCE_FAILURE;
     }
     return JD_SUCCESS;
 }
 
+JpegDecodeStatus JpegDecoder::destroySurface(RenderTarget &target)
+{
+    // Frees the VA surface registered for |target| and forgets its handle mapping.
+    // Fails with JD_INVALID_RENDER_TARGET (unknown target) or JD_RESOURCE_FAILURE (libva error).
+    Mutex::Autolock autoLock(mLock);
+    VASurfaceID surf = getSurfaceID(target);
+    if (surf == VA_INVALID_ID) {
+        ETRACE("%s: failed to destroy surface type %d, handle %d", __FUNCTION__, target.type, target.handle);
+        return JD_INVALID_RENDER_TARGET;
+    }
+    // Destroy before unmapping: deinit() only destroys surfaces it still finds in the
+    // maps, so dropping just the entry would leak the surface.
+    VAStatus va_status = vaDestroySurfaces(mDisplay, &surf, 1);
+    if (va_status != VA_STATUS_SUCCESS) {
+        ETRACE("%s: vaDestroySurfaces(%u) returns %d", __FUNCTION__, surf, va_status);
+        return JD_RESOURCE_FAILURE;
+    }
+    switch(target.type) {
+    case RenderTarget::KERNEL_DRM:      mDrmSurfaceMap.removeItem((unsigned long)target.handle); break;
+    case RenderTarget::ANDROID_GRALLOC: mGrallocSurfaceMap.removeItem((buffer_handle_t)target.handle); break;
+    case RenderTarget::INTERNAL_BUF:    mNormalSurfaceMap.removeItem(target.handle); break;
+    case RenderTarget::USER_PTR:        mUserptrSurfaceMap.removeItem(target.handle); break;
+    default:                            break;
+    }
+    VTRACE("%s: succeeded destroying surface type %d, handle %d", __FUNCTION__, target.type, target.handle);
+    return JD_SUCCESS;
+}
+
+JpegDecodeStatus JpegDecoder::destroySurface(VASurfaceID surf)
+{   // Stub: destroying by raw surface id is not implemented; |surf| is never read.
+    return JD_UNIMPLEMENTED;
+}
+
 VASurfaceID JpegDecoder::getSurfaceID(RenderTarget &target) const
 {
     int index;
-    if (target.type == RENDERTARGET_INTERNAL_BUFFER) {
-        index = mNormalSurfaceMap.indexOfKey(target.handle);
-        if (index < 0)
-            return VA_INVALID_ID;
-        else
-            return mNormalSurfaceMap.valueAt(index);
-    }
     switch (target.type) {
     case RenderTarget::KERNEL_DRM:
         index = mDrmSurfaceMap.indexOfKey((unsigned long)target.handle);
@@ -543,6 +833,18 @@
             return VA_INVALID_ID;
         else
             return mGrallocSurfaceMap.valueAt(index);
+    case RenderTarget::INTERNAL_BUF:
+        index = mNormalSurfaceMap.indexOfKey(target.handle);
+        if (index < 0)
+            return VA_INVALID_ID;
+        else
+            return mNormalSurfaceMap.valueAt(index);
+    case RenderTarget::USER_PTR:
+        index = mUserptrSurfaceMap.indexOfKey(target.handle);
+        if (index < 0)
+            return VA_INVALID_ID;
+        else
+            return mUserptrSurfaceMap.valueAt(index);
     default:
         assert(false);
     }
@@ -576,24 +878,39 @@
     VASurfaceStatus surf_status;
     VABufferID desc_buf[5];
     uint32_t bitstream_buffer_size = 0;
+    uint8_t* bufaddr = NULL;
     uint32_t scan_idx = 0;
     uint32_t buf_idx = 0;
     uint32_t chopping = VA_SLICE_DATA_FLAG_ALL;
     uint32_t bytes_remaining;
     VASurfaceID surf_id = getSurfaceID(target);
-    if (surf_id == VA_INVALID_ID)
+    nsecs_t now = systemTime();
+    if (surf_id == VA_INVALID_ID) {
+        ETRACE("%s render_target %p, handle %d is not initailized by JpegDecoder", __FUNCTION__, &target, target.handle);
         return JD_RENDER_TARGET_NOT_INITIALIZED;
+    }
     va_status = vaQuerySurfaceStatus(mDisplay, surf_id, &surf_status);
-    if (surf_status != VASurfaceReady)
+    if (surf_status != VASurfaceReady) {
+        ETRACE("%s render_target %p, handle %d is still busy", __FUNCTION__, &target, target.handle);
         return JD_RENDER_TARGET_BUSY;
+    }
+
+    if (jpginfo.use_vector_input) {
+        bitstream_buffer_size = jpginfo.inputs->size();
+        bufaddr = const_cast<uint8_t*>(jpginfo.inputs->array());
+    }
+    else {
+        bitstream_buffer_size = jpginfo.bufsize;
+        bufaddr = jpginfo.buf;
+    }
 
     if (jpginfo.eoi_offset)
         bytes_remaining = jpginfo.eoi_offset - jpginfo.soi_offset;
     else
-        bytes_remaining = jpginfo.bufsize - jpginfo.soi_offset;
+        bytes_remaining = bitstream_buffer_size - jpginfo.soi_offset;
+
     uint32_t src_offset = jpginfo.soi_offset;
     uint32_t cpy_row;
-    bitstream_buffer_size = jpginfo.bufsize;//cinfo->src->bytes_in_buffer;//1024*1024*5;
 
     Vector<VABufferID> buf_list;
     va_status = vaBeginPicture(mDisplay, mContextId, surf_id);
@@ -601,24 +918,29 @@
         ETRACE("vaBeginPicture failed. va_status = 0x%x", va_status);
         return JD_DECODE_FAILURE;
     }
+    VTRACE("%s begin decode render target %p, handle %d", __FUNCTION__, &target, target.handle);
     va_status = vaCreateBuffer(mDisplay, mContextId, VAPictureParameterBufferType, sizeof(VAPictureParameterBufferJPEGBaseline), 1, &jpginfo.picture_param_buf, &desc_buf[buf_idx]);
     if (va_status != VA_STATUS_SUCCESS) {
         ETRACE("vaCreateBuffer VAPictureParameterBufferType failed. va_status = 0x%x", va_status);
         return JD_RESOURCE_FAILURE;
     }
+    VTRACE("%s successfully created PicParamBuf, id=%u", __FUNCTION__, desc_buf[buf_idx]);
     buf_list.add(desc_buf[buf_idx++]);
-    va_status = vaCreateBuffer(mDisplay, mContextId, VAIQMatrixBufferType, sizeof(VAIQMatrixBufferJPEGBaseline), 1, &jpginfo.qmatrix_buf, &desc_buf[buf_idx]);
 
+    va_status = vaCreateBuffer(mDisplay, mContextId, VAIQMatrixBufferType, sizeof(VAIQMatrixBufferJPEGBaseline), 1, &jpginfo.qmatrix_buf, &desc_buf[buf_idx]);
     if (va_status != VA_STATUS_SUCCESS) {
         ETRACE("vaCreateBuffer VAIQMatrixBufferType failed. va_status = 0x%x", va_status);
         return JD_RESOURCE_FAILURE;
     }
+    VTRACE("%s successfully created IQMatrixBuf, id=%u", __FUNCTION__, desc_buf[buf_idx]);
     buf_list.add(desc_buf[buf_idx++]);
+
     va_status = vaCreateBuffer(mDisplay, mContextId, VAHuffmanTableBufferType, sizeof(VAHuffmanTableBufferJPEGBaseline), 1, &jpginfo.hufman_table_buf, &desc_buf[buf_idx]);
     if (va_status != VA_STATUS_SUCCESS) {
         ETRACE("vaCreateBuffer VAHuffmanTableBufferType failed. va_status = 0x%x", va_status);
         return JD_RESOURCE_FAILURE;
     }
+    VTRACE("%s successfully created HuffmanTableBuf, id=%u", __FUNCTION__, desc_buf[buf_idx]);
     buf_list.add(desc_buf[buf_idx++]);
 
     do {
@@ -676,16 +998,18 @@
         /* Get Slice Control Buffer */
         va_status = vaCreateBuffer(mDisplay, mContextId, VASliceParameterBufferType, sizeof(VASliceParameterBufferJPEGBaseline) * dest_idx, 1, dest_scan_ctrl, &desc_buf[buf_idx]);
         if (va_status != VA_STATUS_SUCCESS) {
-            ETRACE("vaCreateBuffer VASliceParameterBufferType failed. va_status = 0x%x", va_status);
+            ETRACE("vaCreateBuffer VASliceParameterBufferType failed. va_status = 0x%x, dest_idx=%d, buf_idx=%d", va_status, dest_idx, buf_idx);
             return JD_RESOURCE_FAILURE;
         }
+        VTRACE("vaCreateBuffer VASliceParameterBufferType succeeded. va_status = 0x%x, dest_idx=%d, buf_idx=%d", va_status, dest_idx, buf_idx);
         buf_list.add(desc_buf[buf_idx++]);
-        va_status = vaCreateBuffer(mDisplay, mContextId, VASliceDataBufferType, bytes, 1, &jpginfo.buf[ src_offset ], &desc_buf[buf_idx]);
-        buf_list.add(desc_buf[buf_idx++]);
+        va_status = vaCreateBuffer(mDisplay, mContextId, VASliceDataBufferType, bytes, 1, bufaddr + src_offset, &desc_buf[buf_idx]);
         if (va_status != VA_STATUS_SUCCESS) {
             ETRACE("vaCreateBuffer VASliceDataBufferType (%u bytes) failed. va_status = 0x%x", bytes, va_status);
             return JD_RESOURCE_FAILURE;
         }
+        VTRACE("%s successfully created SliceDataBuf, id=%u", __FUNCTION__, desc_buf[buf_idx]);
+        buf_list.add(desc_buf[buf_idx++]);
         va_status = vaRenderPicture( mDisplay, mContextId, desc_buf, buf_idx);
         if (va_status != VA_STATUS_SUCCESS) {
             ETRACE("vaRenderPicture failed. va_status = 0x%x", va_status);
@@ -706,6 +1030,9 @@
         ETRACE("vaEndPicture failed. va_status = 0x%x", va_status);
         return JD_DECODE_FAILURE;
     }
+
+    VTRACE("%s successfully ended picture, rendertarget %p, handle %d", __FUNCTION__, &target, target.handle);
+    VTRACE("JpegDecoder decode took %.2f ms", (systemTime() - now)/1000000.0);
     return JD_SUCCESS;
 }
 void JpegDecoder::deinit()
@@ -719,6 +1046,7 @@
             size_t gralloc_size = mGrallocSurfaceMap.size();
             size_t drm_size = mDrmSurfaceMap.size();
             size_t internal_surf_size = mNormalSurfaceMap.size();
+            size_t up_surf_size = mUserptrSurfaceMap.size();
             for (size_t i = 0; i < gralloc_size; ++i) {
                 VASurfaceID surf_id = mGrallocSurfaceMap.valueAt(i);
                 vaDestroySurfaces(mDisplay, &surf_id, 1);
@@ -731,117 +1059,141 @@
                 VASurfaceID surf_id = mNormalSurfaceMap.valueAt(i);
                 vaDestroySurfaces(mDisplay, &surf_id, 1);
             }
+            for (size_t i = 0; i < up_surf_size; ++i) {
+                VASurfaceID surf_id = mUserptrSurfaceMap.valueAt(i);
+                vaDestroySurfaces(mDisplay, &surf_id, 1);
+            }
             mGrallocSurfaceMap.clear();
             mDrmSurfaceMap.clear();
             mNormalSurfaceMap.clear();
+            mUserptrSurfaceMap.clear();
+            mBsParser->reset();
         }
     }
 }
 
 JpegDecodeStatus JpegDecoder::parseTableData(JpegInfo &jpginfo) {
-    parserInitialize(mParser, jpginfo.buf, jpginfo.bufsize);
-    // Parse Quant tables
+    /* Logs the parser position and returns JD_ERROR_BITSTREAM from the enclosing function when stmt is false; uses mBsParser and bufsize from the call site. */
+#define REPORT_BS_ERR_IF_FAIL(stmt) \
+            do { \
+                if (!(stmt)) { \
+                    ETRACE("%s::%d, bitstream error at offset %u, remaining bytes %u, total bytes %u", \
+                        __FUNCTION__, __LINE__, mBsParser->getByteOffset(), mBsParser->getRemainingBytes(), bufsize); \
+                    return JD_ERROR_BITSTREAM; \
+                } \
+            } while(0)
+
+    assert(mParserInitialized);
     memset(&jpginfo.qmatrix_buf, 0, sizeof(jpginfo.qmatrix_buf));
     uint32_t dqt_ind = 0;
+    uint32_t bufsize;
+
+    if (jpginfo.use_vector_input)
+        bufsize = jpginfo.inputs->size();
+    else
+        bufsize = jpginfo.bufsize;
+
     for (dqt_ind = 0; dqt_ind < jpginfo.quant_tables_num; dqt_ind++) {
-        if (mParser->setByteOffset(mParser, jpginfo.dqt_byte_offset[dqt_ind])) {
-            // uint32_t uiTableBytes = mParser->readBytes( 2 ) - 2;
-            uint32_t table_bytes = mParser->readBytes( mParser, 2 ) - 2;
-            do {
-                uint32_t table_info = mParser->readNextByte(mParser);
-                table_bytes--;
-                uint32_t table_length = table_bytes > 64 ? 64 : table_bytes;
-                uint32_t table_precision = table_info >> 4;
-                if (table_precision != 0) {
-                    ETRACE("%s ERROR: Parsing table data returns %d", __FUNCTION__, JD_ERROR_BITSTREAM);
-                    return JD_ERROR_BITSTREAM;
-                }
-                uint32_t table_id = table_info & 0xf;
+        REPORT_BS_ERR_IF_FAIL(mBsParser->trySetByteOffset(jpginfo.dqt_byte_offset[dqt_ind]));
+        uint32_t table_bytes;
+        REPORT_BS_ERR_IF_FAIL(mBsParser->tryReadBytes(&table_bytes, 2 ));
+        table_bytes -= 2;
+        do {
+            uint8_t table_info;
+            REPORT_BS_ERR_IF_FAIL(mBsParser->tryReadNextByte(&table_info));
+            table_bytes--;
+            uint32_t table_length = table_bytes > 64 ? 64 : table_bytes;
+            uint32_t table_precision = table_info >> 4;
+            REPORT_BS_ERR_IF_FAIL ((table_precision == 0));
+            uint32_t table_id = table_info & 0xf;
 
-                jpginfo.qmatrix_buf.load_quantiser_table[table_id] = 1;
+            jpginfo.qmatrix_buf.load_quantiser_table[table_id] = 1;
 
-                if (table_id < JPEG_MAX_QUANT_TABLES) {
-                    // Pull Quant table data from bitstream
-                    uint32_t byte_ind;
-                    for (byte_ind = 0; byte_ind < table_length; byte_ind++) {
-                        jpginfo.qmatrix_buf.quantiser_table[table_id][byte_ind] = mParser->readNextByte(mParser);
-                    }
-                } else {
-                    ETRACE("%s DQT table ID is not supported", __FUNCTION__);
-                    mParser->burnBytes(mParser, table_length);
+            if (table_id < JPEG_MAX_QUANT_TABLES) {
+                // Pull Quant table data from bitstream
+                uint32_t byte_ind;
+                for (byte_ind = 0; byte_ind < table_length; byte_ind++) {
+                    REPORT_BS_ERR_IF_FAIL(mBsParser->tryReadNextByte(&jpginfo.qmatrix_buf.quantiser_table[table_id][byte_ind]));
                 }
-                table_bytes -= table_length;
-            } while (table_bytes);
-        }
+            } else {
+                ETRACE("%s DQT table ID is not supported", __FUNCTION__);
+                REPORT_BS_ERR_IF_FAIL(mBsParser->tryBurnBytes(table_length));
+            }
+            table_bytes -= table_length;
+        } while (table_bytes);
     }
 
     // Parse Huffman tables
     memset(&jpginfo.hufman_table_buf, 0, sizeof(jpginfo.hufman_table_buf));
     uint32_t dht_ind = 0;
     for (dht_ind = 0; dht_ind < jpginfo.huffman_tables_num; dht_ind++) {
-        if (mParser->setByteOffset(mParser, jpginfo.dht_byte_offset[dht_ind])) {
-            uint32_t table_bytes = mParser->readBytes( mParser, 2 ) - 2;
-            do {
-                uint32_t table_info = mParser->readNextByte(mParser);
-                table_bytes--;
-                uint32_t table_class = table_info >> 4; // Identifies whether the table is for AC or DC
-                uint32_t table_id = table_info & 0xf;
-                jpginfo.hufman_table_buf.load_huffman_table[table_id] = 1;
+        REPORT_BS_ERR_IF_FAIL(mBsParser->trySetByteOffset(jpginfo.dht_byte_offset[dht_ind]));
+        uint32_t table_bytes;
+        REPORT_BS_ERR_IF_FAIL(mBsParser->tryReadBytes( &table_bytes, 2 ));
+        table_bytes -= 2;
+        do {
+            uint8_t table_info;
+            REPORT_BS_ERR_IF_FAIL(mBsParser->tryReadNextByte(&table_info));
+            table_bytes--;
+            uint32_t table_class = table_info >> 4; // Identifies whether the table is for AC or DC
+            uint32_t table_id = table_info & 0xf;
+            jpginfo.hufman_table_buf.load_huffman_table[table_id] = 1;
 
-                if ((table_class < TABLE_CLASS_NUM) && (table_id < JPEG_MAX_SETS_HUFFMAN_TABLES)) {
-                    if (table_class == 0) {
-                        uint8_t* bits = mParser->getCurrentIndex(mParser);
-                        // Find out the number of entries in the table
-                        uint32_t table_entries = 0;
-                        uint32_t bit_ind;
-                        for (bit_ind = 0; bit_ind < 16; bit_ind++) {
-                            jpginfo.hufman_table_buf.huffman_table[table_id].num_dc_codes[bit_ind] = bits[bit_ind];
-                            table_entries += jpginfo.hufman_table_buf.huffman_table[table_id].num_dc_codes[bit_ind];
-                        }
-
-                        // Create table of code values
-                        mParser->burnBytes(mParser, 16);
-                        table_bytes -= 16;
-                        uint32_t tbl_ind;
-                        for (tbl_ind = 0; tbl_ind < table_entries; tbl_ind++) {
-                            jpginfo.hufman_table_buf.huffman_table[table_id].dc_values[tbl_ind] = mParser->readNextByte(mParser);
-                            table_bytes--;
-                        }
-
-                    } else { // for AC class
-                        uint8_t* bits = mParser->getCurrentIndex(mParser);
-                        // Find out the number of entries in the table
-                        uint32_t table_entries = 0;
-                        uint32_t bit_ind = 0;
-                        for (bit_ind = 0; bit_ind < 16; bit_ind++) {
-                            jpginfo.hufman_table_buf.huffman_table[table_id].num_ac_codes[bit_ind] = bits[bit_ind];
-                            table_entries += jpginfo.hufman_table_buf.huffman_table[table_id].num_ac_codes[bit_ind];
-                        }
-
-                        // Create table of code values
-                        mParser->burnBytes(mParser, 16);
-                        table_bytes -= 16;
-                        uint32_t tbl_ind = 0;
-                        for (tbl_ind = 0; tbl_ind < table_entries; tbl_ind++) {
-                            jpginfo.hufman_table_buf.huffman_table[table_id].ac_values[tbl_ind] = mParser->readNextByte(mParser);
-                            table_bytes--;
-                        }
-                    }//end of else
-                } else {
+            if ((table_class < TABLE_CLASS_NUM) && (table_id < JPEG_MAX_SETS_HUFFMAN_TABLES)) {
+                if (table_class == 0) {
+                    //const uint8_t* bits = mBsParser->getCurrentIndex();
                     // Find out the number of entries in the table
-                    ETRACE("%s DHT table ID is not supported", __FUNCTION__);
                     uint32_t table_entries = 0;
-                    uint32_t bit_ind = 0;
-                    for(bit_ind = 0; bit_ind < 16; bit_ind++) {
-                        table_entries += mParser->readNextByte(mParser);
+                    uint32_t bit_ind;
+                    for (bit_ind = 0; bit_ind < 16; bit_ind++) {
+                        jpginfo.hufman_table_buf.huffman_table[table_id].num_dc_codes[bit_ind] = mBsParser->itemAt(mBsParser->getByteOffset() + bit_ind);
+                        table_entries += jpginfo.hufman_table_buf.huffman_table[table_id].num_dc_codes[bit_ind];
+                    }
+
+                    // Create table of code values
+                    REPORT_BS_ERR_IF_FAIL(mBsParser->tryBurnBytes(16));
+                    table_bytes -= 16;
+                    uint32_t tbl_ind;
+                    for (tbl_ind = 0; tbl_ind < table_entries; tbl_ind++) {
+                        REPORT_BS_ERR_IF_FAIL(mBsParser->tryReadNextByte(&jpginfo.hufman_table_buf.huffman_table[table_id].dc_values[tbl_ind]));
                         table_bytes--;
                     }
-                    mParser->burnBytes(mParser, table_entries);
-                    table_bytes -= table_entries;
-                }
 
-            } while (table_bytes);
-        }
+                } else { // for AC class
+                    //const uint8_t* bits = mBsParser->getCurrentIndex();
+                    // Find out the number of entries in the table
+                    uint32_t table_entries = 0;
+                    uint32_t bit_ind = 0;
+                    for (bit_ind = 0; bit_ind < 16; bit_ind++) {
+                        jpginfo.hufman_table_buf.huffman_table[table_id].num_ac_codes[bit_ind] = mBsParser->itemAt(mBsParser->getByteOffset() + bit_ind);//bits[bit_ind];
+                        table_entries += jpginfo.hufman_table_buf.huffman_table[table_id].num_ac_codes[bit_ind];
+                    }
+
+                    // Create table of code values
+                    REPORT_BS_ERR_IF_FAIL(mBsParser->tryBurnBytes(16));
+                    table_bytes -= 16;
+                    uint32_t tbl_ind = 0;
+                    for (tbl_ind = 0; tbl_ind < table_entries; tbl_ind++) {
+                        REPORT_BS_ERR_IF_FAIL(mBsParser->tryReadNextByte(&jpginfo.hufman_table_buf.huffman_table[table_id].ac_values[tbl_ind]));
+                        table_bytes--;
+                    }
+                }//end of else
+            } else {
+                // Find out the number of entries in the table
+                ETRACE("%s DHT table ID is not supported", __FUNCTION__);
+                uint32_t table_entries = 0;
+                uint32_t bit_ind = 0;
+                for(bit_ind = 0; bit_ind < 16; bit_ind++) {
+                    uint8_t tmp;
+                    REPORT_BS_ERR_IF_FAIL(mBsParser->tryReadNextByte(&tmp));
+                    table_entries += tmp;
+                    table_bytes--;
+                }
+                REPORT_BS_ERR_IF_FAIL(mBsParser->tryBurnBytes(table_entries));
+                table_bytes -= table_entries;
+            }
+
+        } while (table_bytes);
     }
 
     return JD_SUCCESS;
diff --git a/imagedecoder/JPEGDecoder.h b/imagedecoder/JPEGDecoder.h
index 9c0cd9a..754bf6d 100644
--- a/imagedecoder/JPEGDecoder.h
+++ b/imagedecoder/JPEGDecoder.h
@@ -1,6 +1,5 @@
 /* INTEL CONFIDENTIAL
 * Copyright (c) 2012, 2013 Intel Corporation.  All rights reserved.
-* Copyright (c) Imagination Technologies Limited, UK
 *
 * The source code contained or described herein and all documents
 * related to the source code ("Material") are owned by Intel
@@ -31,29 +30,35 @@
 #ifndef JPEGDEC_H
 #define JPEGDEC_H
 
-#include <VideoVPPBase.h>
 #include <utils/KeyedVector.h>
 #include <utils/threads.h>
+#include <hardware/gralloc.h>
 #include "JPEGCommon.h"
+#include <va/va.h>
+#include <va/va_drmcommon.h>
+#include <va/va_vpp.h>
+#include <va/va_android.h>
+#include <va/va_tpi.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+
 using namespace android;
 
 struct CJPEGParse;
+class JpegBitstreamParser;
 class JpegBlitter;
+typedef void* BlitEvent;
+
+extern int generateHandle();
 
 // Non thread-safe
 class JpegDecoder
 {
 friend class JpegBlitter;
 public:
-    struct MapHandle
-    {
-    friend class JpegDecoder;
-    public:
-        bool valid;
-    private:
-        VAImage *img;
-    };
-    JpegDecoder();
+    typedef uint32_t MapHandle;
+    JpegDecoder(VADisplay display = NULL, VAConfigID vpCfgId = VA_INVALID_ID, VAContextID vpCtxId = VA_INVALID_ID, bool use_blitter = false);
     virtual ~JpegDecoder();
     virtual JpegDecodeStatus init(int width, int height, RenderTarget **targets, int num);
     virtual void deinit();
@@ -61,27 +66,50 @@
     virtual JpegDecodeStatus decode(JpegInfo &jpginfo, RenderTarget &target);
     virtual JpegDecodeStatus sync(RenderTarget &target);
     virtual bool busy(RenderTarget &target) const;
-    virtual JpegDecodeStatus blit(RenderTarget &src, RenderTarget &dst);
+    virtual JpegDecodeStatus blit(RenderTarget &src, RenderTarget &dst, int scale_factor);
+    virtual JpegDecodeStatus getRgbaTile(RenderTarget &src,
+                                         uint8_t *sysmem,
+                                         int left, int top, int width, int height, int scale_factor);
+    virtual JpegDecodeStatus blitToLinearRgba(RenderTarget &src,
+                                              uint8_t *sysmem,
+                                              uint32_t width, uint32_t height,
+                                              BlitEvent &event, int scale_factor);
+    virtual JpegDecodeStatus blitToCameraSurfaces(RenderTarget &src,
+                                                   buffer_handle_t dst_nv12,
+                                                   buffer_handle_t dst_yuy2,
+                                                   uint8_t *dst_nv21,
+                                                   uint8_t *dst_yv12,
+                                                   uint32_t width, uint32_t height,
+                                                   BlitEvent &event);
+    virtual void syncBlit(BlitEvent &event);
     virtual MapHandle mapData(RenderTarget &target, void ** data, uint32_t * offsets, uint32_t * pitches);
     virtual void unmapData(RenderTarget &target, MapHandle maphandle);
-private:
+    virtual VASurfaceID getSurfaceID(RenderTarget &target) const;
+    virtual JpegDecodeStatus createSurfaceFromRenderTarget(RenderTarget &target, VASurfaceID *surf_id);
+    virtual JpegDecodeStatus destroySurface(RenderTarget &target);
+    virtual JpegDecodeStatus destroySurface(VASurfaceID surf_id);
+protected:
     bool mInitialized;
     mutable Mutex mLock;
     VADisplay mDisplay;
     VAConfigID mConfigId;
     VAContextID mContextId;
     CJPEGParse *mParser;
+    JpegBitstreamParser *mBsParser;
+    bool mParserInitialized;
     JpegBlitter *mBlitter;
+    bool mDispCreated;
     KeyedVector<buffer_handle_t, VASurfaceID> mGrallocSurfaceMap;
     KeyedVector<unsigned long, VASurfaceID> mDrmSurfaceMap;
     KeyedVector<int, VASurfaceID> mNormalSurfaceMap;
-    virtual VASurfaceID getSurfaceID(RenderTarget &target) const;
+    KeyedVector<int, VASurfaceID> mUserptrSurfaceMap;
+    virtual JpegDecodeStatus parseHeader(JpegInfo &jpginfo);
     virtual JpegDecodeStatus parseTableData(JpegInfo &jpginfo);
     virtual bool jpegColorFormatSupported(JpegInfo &jpginfo) const;
-    virtual JpegDecodeStatus createSurfaceFromRenderTarget(RenderTarget &target, VASurfaceID *surf_id);
-    virtual JpegDecodeStatus createSurfaceInternal(int width, int height, int pixel_format, int handle, VASurfaceID *surf_id);
-    virtual JpegDecodeStatus createSurfaceDrm(int width, int height, int pixel_format, unsigned long boname, int stride, VASurfaceID *surf_id);
-    virtual JpegDecodeStatus createSurfaceGralloc(int width, int height, int pixel_format, buffer_handle_t handle, int stride, VASurfaceID *surf_id);
+    virtual JpegDecodeStatus createSurfaceInternal(int width, int height, uint32_t fourcc, int handle, VASurfaceID *surf_id);
+    virtual JpegDecodeStatus createSurfaceUserptr(int width, int height, uint32_t fourcc, uint8_t* ptr, VASurfaceID *surf_id);
+    virtual JpegDecodeStatus createSurfaceDrm(int width, int height, uint32_t fourcc, unsigned long boname, int stride, VASurfaceID *surf_id);
+    virtual JpegDecodeStatus createSurfaceGralloc(int width, int height, uint32_t fourcc, buffer_handle_t handle, int stride, VASurfaceID *surf_id);
 };
 
 
diff --git a/imagedecoder/JPEGDecoder_gen.cpp b/imagedecoder/JPEGDecoder_gen.cpp
index 8fa25a8..0f1fb2a 100644
--- a/imagedecoder/JPEGDecoder_gen.cpp
+++ b/imagedecoder/JPEGDecoder_gen.cpp
@@ -1,6 +1,5 @@
 /* INTEL CONFIDENTIAL
 * Copyright (c) 2013 Intel Corporation.  All rights reserved.
-* Copyright (c) Imagination Technologies Limited, UK
 *
 * The source code contained or described herein and all documents
 * related to the source code ("Material") are owned by Intel
@@ -25,11 +24,11 @@
 *    Yao Cheng <yao.cheng@intel.com>
 *
 */
-//#define LOG_NDEBUG 0
 
 #include "va/va.h"
 #include "va/va_vpp.h"
 #include "va/va_drmcommon.h"
+#include "va/va_tpi.h"
 #include "JPEGDecoder.h"
 #include "ImageDecoderTrace.h"
 #include <string.h>
@@ -37,6 +36,35 @@
 #include <time.h>
 #include "JPEGCommon_Gen.h"
 
+uint32_t aligned_height(uint32_t height, int tiling)
+{
+    // Round height up to a multiple of the tile height of the tiling mode.
+    uint32_t rows;
+    if (tiling == SURF_TILING_Y) {
+        rows = 32;      // Y-tile (128 x 32): NV12, 411P, IMC3, 422H, 422V, 444P
+    } else if (tiling == SURF_TILING_X) {
+        rows = 8;       // X-tile (512 x 8)
+    } else {
+        return height;  // Linear: other
+    }
+    const uint32_t mask = rows - 1;
+    return (height + mask) & ~mask;
+}
+uint32_t aligned_width(uint32_t width, int tiling)
+{
+    // Round width up to a multiple of the tile width of the tiling mode.
+    uint32_t cols;
+    if (tiling == SURF_TILING_Y) {
+        cols = 128;     // Y-tile (128 x 32): NV12, 411P, IMC3, 422H, 422V, 444P
+    } else if (tiling == SURF_TILING_X) {
+        cols = 512;     // X-tile (512 x 8)
+    } else {
+        return width;   // Linear: other
+    }
+    const uint32_t mask = cols - 1;
+    return (width + mask) & ~mask;
+}
+
 int fourcc2PixelFormat(uint32_t fourcc)
 {
     switch(fourcc) {
@@ -50,8 +78,6 @@
         return HAL_PIXEL_FORMAT_NV12_TILED_INTEL;
     case VA_FOURCC_RGBA:
         return HAL_PIXEL_FORMAT_RGBA_8888;
-    case VA_FOURCC_422V:
-    case VA_FOURCC_411P:
     default:
         return -1;
     }
@@ -74,8 +100,6 @@
     }
 }
 
-//#define LOG_TAG "ImageDecoder"
-
 #define JD_CHECK(err, label) \
         if (err) { \
             ETRACE("%s::%d: failed: %d", __PRETTY_FUNCTION__, __LINE__, err); \
@@ -93,17 +117,19 @@
 {
     return (jpginfo.image_color_fourcc == VA_FOURCC_IMC3) ||
         (jpginfo.image_color_fourcc == VA_FOURCC_422H) ||
+        (jpginfo.image_color_fourcc == VA_FOURCC_422V) ||
+        (jpginfo.image_color_fourcc == VA_FOURCC_411P) ||
+        (jpginfo.image_color_fourcc == VA_FOURCC('4','0','0','P')) ||
         (jpginfo.image_color_fourcc == VA_FOURCC_444P);
 }
 
-JpegDecodeStatus JpegDecoder::createSurfaceDrm(int width, int height, int pixel_format, unsigned long boname, int stride, VASurfaceID *surf_id)
+JpegDecodeStatus JpegDecoder::createSurfaceDrm(int width, int height, uint32_t fourcc, unsigned long boname, int stride, VASurfaceID *surf_id)
 {
     VAStatus st;
     VASurfaceAttrib                 attrib_list;
     VASurfaceAttribExternalBuffers  vaSurfaceExternBuf;
-    uint32_t fourcc = pixelFormat2Fourcc(pixel_format);
+    memset(&vaSurfaceExternBuf, 0, sizeof (VASurfaceAttribExternalBuffers));
     vaSurfaceExternBuf.pixel_format = fourcc;
-    VTRACE("%s extBuf.pixel_format is %s", __FUNCTION__, fourcc2str(NULL, fourcc));
     vaSurfaceExternBuf.width        = width;
     vaSurfaceExternBuf.height       = height;
     vaSurfaceExternBuf.pitches[0]   = stride;
@@ -115,6 +141,22 @@
     attrib_list.value.type    = VAGenericValueTypePointer;
     attrib_list.value.value.p = (void *)&vaSurfaceExternBuf;
 
+    VTRACE("%s, vaformat=0x%x, width=%d, height=%d, attrib=", __FUNCTION__, fourcc2VaFormat(fourcc),
+        width, height);
+    VTRACE("            ext.pixel_format=0x%x", vaSurfaceExternBuf.pixel_format);
+    VTRACE("            ext.width=%u", vaSurfaceExternBuf.width);
+    VTRACE("            ext.height=%u", vaSurfaceExternBuf.height);
+    VTRACE("            ext.data_size=%u", vaSurfaceExternBuf.data_size);
+    VTRACE("            ext.num_planes=%u", vaSurfaceExternBuf.num_planes);
+    VTRACE("            ext.pitches=%u,%u,%u,%u", vaSurfaceExternBuf.pitches[0],vaSurfaceExternBuf.pitches[1],vaSurfaceExternBuf.pitches[2],vaSurfaceExternBuf.pitches[3]);
+    VTRACE("            ext.offsets=%u,%u,%u,%u", vaSurfaceExternBuf.offsets[0],vaSurfaceExternBuf.offsets[1],vaSurfaceExternBuf.offsets[2],vaSurfaceExternBuf.offsets[3]);
+    VTRACE("            ext.buffers[0]=%lu", vaSurfaceExternBuf.buffers[0]);
+    VTRACE("            ext.num_buffers=%u", vaSurfaceExternBuf.num_buffers);
+    VTRACE("            ext.flags=%u", vaSurfaceExternBuf.flags);
+    VTRACE("            attrib_list.type=%u", attrib_list.type);
+    VTRACE("            attrib_list.flags=%u", attrib_list.flags);
+    VTRACE("            attrib_list.type=%u", attrib_list.value.type);
+
     st = vaCreateSurfaces(mDisplay,
             fourcc2VaFormat(fourcc),
             width,
@@ -123,7 +165,7 @@
             1,
             &attrib_list,
             1);
-    VTRACE("%s createSurface DRM for vaformat %u, fourcc %s", __FUNCTION__, fourcc2VaFormat(fourcc), fourcc2str(NULL, fourcc));
+    VTRACE("%s createSurface DRM for vaformat %u, fourcc %s", __FUNCTION__, fourcc2VaFormat(fourcc), fourcc2str(fourcc));
     if (st != VA_STATUS_SUCCESS) {
         ETRACE("%s: vaCreateSurfaces returns %d", __PRETTY_FUNCTION__, st);
         return JD_RESOURCE_FAILURE;
@@ -131,51 +173,201 @@
     return JD_SUCCESS;
 }
 
-JpegDecodeStatus JpegDecoder::createSurfaceGralloc(int width, int height, int pixel_format, buffer_handle_t handle, int stride, VASurfaceID *surf_id)
+JpegDecodeStatus JpegDecoder::createSurfaceGralloc(int width, int height, uint32_t fourcc, buffer_handle_t handle, int stride, VASurfaceID *surf_id)
 {
-    unsigned long boname;
-    hw_module_t const* module = NULL;
-    alloc_device_t *allocdev = NULL;
-    struct gralloc_module_t *gralloc_module = NULL;
-    JpegDecodeStatus st;
+    VAStatus st;
+    VASurfaceAttrib                 attrib_list;
+    VASurfaceAttribExternalBuffers  vaSurfaceExternBuf;
+    memset(&vaSurfaceExternBuf, 0, sizeof (VASurfaceAttribExternalBuffers));
+    vaSurfaceExternBuf.pixel_format = fourcc;
+    vaSurfaceExternBuf.width        = width;
+    vaSurfaceExternBuf.height       = height;
+    vaSurfaceExternBuf.pitches[0]   = stride;
+    vaSurfaceExternBuf.buffers      = (unsigned long*)&handle;
+    vaSurfaceExternBuf.num_buffers  = 1;
+    vaSurfaceExternBuf.flags        = VA_SURFACE_ATTRIB_MEM_TYPE_ANDROID_GRALLOC;
+    attrib_list.type          = VASurfaceAttribExternalBufferDescriptor;
+    attrib_list.flags         = VA_SURFACE_ATTRIB_SETTABLE;
+    attrib_list.value.type    = VAGenericValueTypePointer;
+    attrib_list.value.value.p = (void *)&vaSurfaceExternBuf;
+    VTRACE("%s, vaformat=0x%x, width=%d, height=%d, attrib=", __FUNCTION__, fourcc2VaFormat(fourcc),
+        width, height);
+    VTRACE("            ext.pixel_format=0x%x", vaSurfaceExternBuf.pixel_format);
+    VTRACE("            ext.width=%u", vaSurfaceExternBuf.width);
+    VTRACE("            ext.height=%u", vaSurfaceExternBuf.height);
+    VTRACE("            ext.data_size=%u", vaSurfaceExternBuf.data_size);
+    VTRACE("            ext.num_planes=%u", vaSurfaceExternBuf.num_planes);
+    VTRACE("            ext.pitches=%u,%u,%u,%u", vaSurfaceExternBuf.pitches[0],vaSurfaceExternBuf.pitches[1],vaSurfaceExternBuf.pitches[2],vaSurfaceExternBuf.pitches[3]);
+    VTRACE("            ext.offsets=%u,%u,%u,%u", vaSurfaceExternBuf.offsets[0],vaSurfaceExternBuf.offsets[1],vaSurfaceExternBuf.offsets[2],vaSurfaceExternBuf.offsets[3]);
+    VTRACE("            ext.buffers[0]=%lu", vaSurfaceExternBuf.buffers[0]);
+    VTRACE("            ext.num_buffers=%u", vaSurfaceExternBuf.num_buffers);
+    VTRACE("            ext.flags=%u", vaSurfaceExternBuf.flags);
+    VTRACE("            attrib_list.type=%u", attrib_list.type);
+    VTRACE("            attrib_list.flags=%u", attrib_list.flags);
+    VTRACE("            attrib_list.type=%u", attrib_list.value.type);
 
-    uint32_t fourcc = pixelFormat2Fourcc(pixel_format);
-    VTRACE("enter %s, pixel_format 0x%x, fourcc %s", __FUNCTION__, pixel_format, fourcc2str(NULL, fourcc));
-
-    int err = hw_get_module(GRALLOC_HARDWARE_MODULE_ID, &module);
-    if (err) {
-        ETRACE("%s failed to get gralloc module", __PRETTY_FUNCTION__);
-        st = JD_RESOURCE_FAILURE;
-    }
-    JD_CHECK(err, cleanup);
-    gralloc_module = (struct gralloc_module_t *)module;
-    err = gralloc_open(module, &allocdev);
-    if (err) {
-        ETRACE("%s failed to open alloc device", __PRETTY_FUNCTION__);
-        st = JD_RESOURCE_FAILURE;
-    }
-    JD_CHECK(err, cleanup);
-    err = gralloc_module->perform(gralloc_module,
-        INTEL_UFO_GRALLOC_MODULE_PERFORM_GET_BO_NAME,
-        handle,
-        &boname);
-    if (err) {
-        ETRACE("%s failed to get boname via gralloc->perform", __PRETTY_FUNCTION__);
-        st = JD_RESOURCE_FAILURE;
-    }
-    JD_CHECK(err, cleanup);
-    VTRACE("YAO %s fourcc %s luma_stride is %d", __FUNCTION__,
-        fourcc2str(NULL, fourcc), stride);
-
-    gralloc_close(allocdev);
-    return createSurfaceDrm(width, height, pixel_format, boname, stride, surf_id);
-cleanup:
-    if (allocdev)
-        gralloc_close(allocdev);
-    return st;
+    st = vaCreateSurfaces(mDisplay,
+            fourcc2VaFormat(fourcc),
+            width,
+            height,
+            surf_id,
+            1,
+            &attrib_list,
+            1);
+    VTRACE("%s createSurface GRALLOC for vaformat %u, fourcc %s", __FUNCTION__, fourcc2VaFormat(fourcc), fourcc2str(fourcc));
+    if (st != VA_STATUS_SUCCESS) {
+        ETRACE("%s: vaCreateSurfaces returns %d", __PRETTY_FUNCTION__, st);
+        return JD_RESOURCE_FAILURE;
+     }
+    return JD_SUCCESS;
 }
 
 
+JpegDecodeStatus JpegDecoder::createSurfaceUserptr(int width, int height, uint32_t fourcc, uint8_t* ptr, VASurfaceID *surf_id)
+{
+    // Create one VA surface of width x height for fourcc and fill in an
+    // external-buffer descriptor that lays the planes out back to back
+    // starting at ptr, with pitches derived from width alone.
+    //
+    //   width, height  surface dimensions passed to vaCreateSurfaces
+    //   fourcc         VA_FOURCC_* value selecting the plane layout below
+    //   ptr            address recorded as buffers[0] of the descriptor
+    //   surf_id        receives the id of the created surface
+    //
+    // Returns JD_SUCCESS, or JD_RESOURCE_FAILURE when vaCreateSurfaces fails.
+    VAStatus st;
+    VASurfaceAttrib                 attrib_list;
+    VASurfaceAttribExternalBuffers  vaSurfaceExternBuf;
+    // buffers is an array of addresses/handles, so it has to point at a
+    // variable holding the user pointer. Casting ptr itself to unsigned long*
+    // made buffers[0] (traced below) read the first bytes of the pixel data.
+    unsigned long userptr = (unsigned long)ptr;
+
+    // Every pitch/offset starts out zero; the switch sets only the non-zero ones.
+    memset(&vaSurfaceExternBuf, 0, sizeof (VASurfaceAttribExternalBuffers));
+    vaSurfaceExternBuf.pixel_format = fourcc;
+    vaSurfaceExternBuf.width        = width;
+    vaSurfaceExternBuf.height       = height;
+    vaSurfaceExternBuf.pitches[0]   = width;
+    switch (fourcc) {
+    case VA_FOURCC_NV12:
+        // plane 1 has the same pitch as plane 0 and follows it directly
+        vaSurfaceExternBuf.pitches[1]   = width;
+        vaSurfaceExternBuf.offsets[1]   = width * height;
+        break;
+    case VA_FOURCC_YUY2:
+    case VA_FOURCC_UYVY:
+        // single plane whose pitch is twice the width
+        vaSurfaceExternBuf.pitches[0]   = width * 2;
+        break;
+    case VA_FOURCC_YV12:
+        // planes 1 and 2 use half the pitch and are width*height/4 apart
+        vaSurfaceExternBuf.pitches[1]   = width / 2;
+        vaSurfaceExternBuf.pitches[2]   = width / 2;
+        vaSurfaceExternBuf.offsets[1]   = width * height;
+        vaSurfaceExternBuf.offsets[2]   = width * height * 5 / 4;
+        break;
+    case VA_FOURCC_RGBA:
+        // single plane whose pitch is four times the width
+        vaSurfaceExternBuf.pitches[0]   = width * 4;
+        break;
+    case VA_FOURCC_411P:
+        // planes 1 and 2 are width*height apart
+        vaSurfaceExternBuf.pitches[1]   = width;
+        vaSurfaceExternBuf.pitches[2]   = width;
+        vaSurfaceExternBuf.offsets[1]   = width * height;
+        vaSurfaceExternBuf.offsets[2]   = width * height * 2;
+        break;
+    case VA_FOURCC_411R:
+        // planes 1 and 2 are width*height/4 apart
+        vaSurfaceExternBuf.pitches[1]   = width;
+        vaSurfaceExternBuf.pitches[2]   = width;
+        vaSurfaceExternBuf.offsets[1]   = width * height;
+        vaSurfaceExternBuf.offsets[2]   = width * height * 5 / 4;
+        break;
+    case VA_FOURCC_IMC3:
+        // planes 1 and 2 are width*height/2 apart
+        vaSurfaceExternBuf.pitches[1]   = width;
+        vaSurfaceExternBuf.pitches[2]   = width;
+        vaSurfaceExternBuf.offsets[1]   = width * height;
+        vaSurfaceExternBuf.offsets[2]   = width * height * 3 / 2;
+        break;
+    case VA_FOURCC_422H:
+        // planes 1 and 2 are width*height apart, as for 411P/444P; plane 2
+        // used to be given the same offset as plane 1, overlapping the two
+        vaSurfaceExternBuf.pitches[1]   = width;
+        vaSurfaceExternBuf.pitches[2]   = width;
+        vaSurfaceExternBuf.offsets[1]   = width * height;
+        vaSurfaceExternBuf.offsets[2]   = width * height * 2;
+        break;
+    case VA_FOURCC_422V:
+        // planes 1 and 2 are width*height/2 apart
+        vaSurfaceExternBuf.pitches[1]   = width;
+        vaSurfaceExternBuf.pitches[2]   = width;
+        vaSurfaceExternBuf.offsets[1]   = width * height;
+        vaSurfaceExternBuf.offsets[2]   = width * height * 3 / 2;
+        break;
+    case VA_FOURCC_444P:
+        // planes 1 and 2 are width*height apart
+        vaSurfaceExternBuf.pitches[1]   = width;
+        vaSurfaceExternBuf.pitches[2]   = width;
+        vaSurfaceExternBuf.offsets[1]   = width * height;
+        vaSurfaceExternBuf.offsets[2]   = width * height * 2;
+        break;
+    case VA_FOURCC('4','0','0','P'):
+    default:
+        // plane 0 only: pitches[0] is already width, the rest stays zero
+        break;
+    }
+
+    vaSurfaceExternBuf.buffers      = &userptr;
+    vaSurfaceExternBuf.num_buffers  = 1;
+    // the buffer is user memory, not a gralloc handle
+    vaSurfaceExternBuf.flags        = VA_SURFACE_ATTRIB_MEM_TYPE_USER_PTR;
+
+    // NOTE(review): only the memory-type attribute is handed to
+    // vaCreateSurfaces below; vaSurfaceExternBuf is filled in and traced but
+    // never attached to attrib_list. Confirm against the driver whether an
+    // external-buffer descriptor attribute is required here as well.
+    attrib_list.type          = VASurfaceAttribMemoryType;
+    attrib_list.flags         = VA_SURFACE_ATTRIB_SETTABLE;
+    attrib_list.value.type    = VAGenericValueTypeInteger;
+    attrib_list.value.value.i = VA_SURFACE_ATTRIB_MEM_TYPE_USER_PTR;
+
+    // trace the descriptor and the attribute for debugging
+    VTRACE("%s, vaformat=0x%x, width=%d, height=%d, attrib=", __FUNCTION__, fourcc2VaFormat(fourcc),
+        width, height);
+    VTRACE("            ext.pixel_format=0x%x", vaSurfaceExternBuf.pixel_format);
+    VTRACE("            ext.width=%u", vaSurfaceExternBuf.width);
+    VTRACE("            ext.height=%u", vaSurfaceExternBuf.height);
+    VTRACE("            ext.data_size=%u", vaSurfaceExternBuf.data_size);
+    VTRACE("            ext.num_planes=%u", vaSurfaceExternBuf.num_planes);
+    VTRACE("            ext.pitches=%u,%u,%u,%u", vaSurfaceExternBuf.pitches[0],vaSurfaceExternBuf.pitches[1],vaSurfaceExternBuf.pitches[2],vaSurfaceExternBuf.pitches[3]);
+    VTRACE("            ext.offsets=%u,%u,%u,%u", vaSurfaceExternBuf.offsets[0],vaSurfaceExternBuf.offsets[1],vaSurfaceExternBuf.offsets[2],vaSurfaceExternBuf.offsets[3]);
+    VTRACE("            ext.buffers[0]=%lu", vaSurfaceExternBuf.buffers[0]);
+    VTRACE("            ext.num_buffers=%u", vaSurfaceExternBuf.num_buffers);
+    VTRACE("            ext.flags=%u", vaSurfaceExternBuf.flags);
+    VTRACE("            attrib_list.type=%u", attrib_list.type);
+    VTRACE("            attrib_list.flags=%u", attrib_list.flags);
+    VTRACE("            attrib_list.value.type=%u", attrib_list.value.type);
+
+    // one surface, one attribute
+    st = vaCreateSurfaces(mDisplay,
+            fourcc2VaFormat(fourcc),
+            width,
+            height,
+            surf_id,
+            1,
+            &attrib_list,
+            1);
+    VTRACE("%s createSurface USERPTR for vaformat %u, fourcc %s", __FUNCTION__, fourcc2VaFormat(fourcc), fourcc2str(fourcc));
+    if (st != VA_STATUS_SUCCESS) {
+        ETRACE("%s: vaCreateSurfaces returns %d", __PRETTY_FUNCTION__, st);
+        return JD_RESOURCE_FAILURE;
+    }
+    return JD_SUCCESS;
+
+}
 
 
 
diff --git a/imagedecoder/JPEGDecoder_img.cpp b/imagedecoder/JPEGDecoder_img.cpp
index d90559d..165c138 100644
--- a/imagedecoder/JPEGDecoder_img.cpp
+++ b/imagedecoder/JPEGDecoder_img.cpp
@@ -1,6 +1,5 @@
 /* INTEL CONFIDENTIAL
 * Copyright (c) 2012, 2013 Intel Corporation.  All rights reserved.
-* Copyright (c) Imagination Technologies Limited, UK
 *
 * The source code contained or described herein and all documents
 * related to the source code ("Material") are owned by Intel
@@ -30,6 +29,35 @@
 #include "JPEGCommon_Img.h"
 #include "JPEGDecoder.h"
 
+uint32_t aligned_height(uint32_t height, int tiling)
+{
+    // Round height up to a multiple of the tile height of the tiling mode.
+    uint32_t rows;
+    if (tiling == SURF_TILING_Y) {
+        rows = 32;      // Y-tile (128 x 32): NV12, 411P, IMC3, 422H, 422V, 444P
+    } else if (tiling == SURF_TILING_X) {
+        rows = 8;       // X-tile (512 x 8)
+    } else {
+        return height;  // Linear: other
+    }
+    const uint32_t mask = rows - 1;
+    return (height + mask) & ~mask;
+}
+uint32_t aligned_width(uint32_t width, int tiling)
+{
+    // Round width up to a multiple of the tile width of the tiling mode.
+    uint32_t cols;
+    if (tiling == SURF_TILING_Y) {
+        cols = 128;     // Y-tile (128 x 32): NV12, 411P, IMC3, 422H, 422V, 444P
+    } else if (tiling == SURF_TILING_X) {
+        cols = 512;     // X-tile (512 x 8)
+    } else {
+        return width;   // Linear: other
+    }
+    const uint32_t mask = cols - 1;
+    return (width + mask) & ~mask;
+}
+
 int fourcc2PixelFormat(uint32_t fourcc)
 {
     switch(fourcc) {
@@ -65,19 +93,19 @@
         (jpginfo.image_color_fourcc == VA_FOURCC_444P);
 }
 
-JpegDecodeStatus JpegDecoder::createSurfaceDrm(int width, int height, int pixel_format, unsigned long boname, int stride, VASurfaceID *surf_id)
+JpegDecodeStatus JpegDecoder::createSurfaceDrm(int width, int height, uint32_t fourcc, unsigned long boname, int stride, VASurfaceID *surf_id)
 {
     return JD_RENDER_TARGET_TYPE_UNSUPPORTED;
 }
 
-JpegDecodeStatus JpegDecoder::createSurfaceGralloc(int width, int height, int pixel_format, buffer_handle_t handle, int stride, VASurfaceID *surf_id)
+JpegDecodeStatus JpegDecoder::createSurfaceGralloc(int width, int height, uint32_t fourcc, buffer_handle_t handle, int stride, VASurfaceID *surf_id)
 {
     VAStatus st;
     VASurfaceAttributeTPI attrib_tpi;
     uint32_t va_format = VA_RT_FORMAT_YUV444;
     attrib_tpi.count = 1;
     attrib_tpi.luma_stride = stride;
-    attrib_tpi.pixel_format = pixel_format;
+    attrib_tpi.pixel_format = VA_FOURCC_YV32;
     attrib_tpi.width = width;
     attrib_tpi.height = height;
     attrib_tpi.type = VAExternalMemoryAndroidGrallocBuffer;
@@ -96,4 +124,8 @@
     return JD_SUCCESS;
 }
 
+JpegDecodeStatus JpegDecoder::createSurfaceUserptr(int width, int height, uint32_t fourcc, uint8_t* ptr, VASurfaceID *surf_id)
+{
+    return JD_INVALID_RENDER_TARGET; // stub: always reports failure, no argument is used
+}
 
diff --git a/imagedecoder/JPEGDecoder_libjpeg_wrapper.cpp b/imagedecoder/JPEGDecoder_libjpeg_wrapper.cpp
index edfaac6..0e123e5 100644
--- a/imagedecoder/JPEGDecoder_libjpeg_wrapper.cpp
+++ b/imagedecoder/JPEGDecoder_libjpeg_wrapper.cpp
@@ -1,6 +1,5 @@
 /* INTEL CONFIDENTIAL
 * Copyright (c) 2012, 2013 Intel Corporation.  All rights reserved.
-* Copyright (c) Imagination Technologies Limited, UK
 *
 * The source code contained or described herein and all documents
 * related to the source code ("Material") are owned by Intel
@@ -34,8 +33,6 @@
  * to determine which path will be use (SW or HW)
  *
  */
-//#define LOG_NDEBUG 0
-#define LOG_TAG "ImageDecoder"
 
 #include <utils/Log.h>
 #include "JPEGDecoder_libjpeg_wrapper.h"
@@ -43,27 +40,42 @@
 #include <utils/threads.h>
 #include "JPEGDecoder.h"
 #include <va/va.h>
+#include <va/va_android.h>
 #include "va/va_dec_jpeg.h"
-
+#include <utils/Timers.h>
 #ifdef NDEBUG
 #undef NDEBUG
 #endif
 
 #include <assert.h>
-
+#include <utils/Vector.h>
 static Mutex jdlock;
+static VADisplay display = NULL;
+static VAConfigID vpCfgId = VA_INVALID_ID;
+static VAContextID vpCtxId = VA_INVALID_ID;
+
+#define DUMP_DECODE 0
+#define DUMP_RGBA 0
+#define RGBA_DUMP_FILE_PATTERN "/sdcard/jpeg_%dx%d_from_%s.rgba"
+#define DECODE_DUMP_FILE_PATTERN "/sdcard/jpeg_%dx%d.%s"
+
+using namespace android;
 
 struct jdva_private
 {
     JpegInfo jpg_info;
-    JpegDecoder decoder;
+    android::Vector<uint8_t> inputs;   // input bytes appended by jdva_fill_input / jdva_drain_input
+    JpegDecoder *decoder;              // allocated with new in jdva_initialize
     RenderTarget dec_buffer;
-    RenderTarget yuy2_buffer;
-    RenderTarget rgba_buffer;
+    uint8_t* rgba_out;                 // source of the rows copied by jdva_read_tile_scanline
+    BlitEvent blit_event;
+    int tile_read_x;                   // tile_read_*: set by jdva_init_read_tile_scanline
+    int tile_read_y;                   // incremented after each row in jdva_read_tile_scanline
+    int tile_read_width;
+    int tile_read_height;
+    int scale_factor;                  // divisor applied to the tile width and height
 };
 
-static int internal_buffer_handle = 0;
-
 #define JD_CHECK(err, label) \
         if (err) { \
             ALOGE("%s::%d: failed: %d", __PRETTY_FUNCTION__, __LINE__, err); \
@@ -77,6 +89,38 @@
             goto label; \
         }
 
+static void libva_vp_pre_init_locked()
+{
+    // One-time setup of the shared VA display, VPP config and context (asserts on failure).
+    if (display != NULL || vpCfgId != VA_INVALID_ID || vpCtxId != VA_INVALID_ID)
+        return;
+    // static: the display is obtained from &dpy and cached in a file-scope
+    // variable, so dpy must outlive this call in case libva keeps the pointer.
+    static Display dpy;
+    int va_major_version, va_minor_version;
+    VAConfigAttrib  vpp_attrib;
+    VAStatus st;
+    display = vaGetDisplay(&dpy);
+    st = vaInitialize(display, &va_major_version, &va_minor_version);
+    assert(st == VA_STATUS_SUCCESS);
+    vpp_attrib.type  = VAConfigAttribRTFormat;
+    vpp_attrib.value = VA_RT_FORMAT_YUV420;
+    st = vaCreateConfig(display, VAProfileNone, VAEntrypointVideoProc, &vpp_attrib, 1, &vpCfgId);
+    assert(st == VA_STATUS_SUCCESS);
+    st = vaCreateContext(display, vpCfgId, 1920, 1080, 0, NULL, 0, &vpCtxId);
+    assert(st == VA_STATUS_SUCCESS);
+}
+
+/* Placeholder for clearing the global VA context; intentionally a no-op.
+ * Explicit cleanup is not needed: when the process terminates the kernel
+ * closes the drm fd and the VA context is released automatically, so the
+ * function body is left empty.
+ */
+static void libva_vp_post_deinit_locked()
+{
+    // DO NOTHING
+}
+
 Decode_Status jdva_initialize (jd_libva_struct * jd_libva_ptr)
 {
   /*
@@ -91,21 +135,38 @@
     VAStatus va_status = VA_STATUS_SUCCESS;
     Decode_Status status = DECODE_SUCCESS;
 
+    Mutex::Autolock autoLock(jdlock);
+
+    if (display == NULL || vpCfgId == VA_INVALID_ID || vpCtxId == VA_INVALID_ID) {
+        libva_vp_pre_init_locked();
+    }
+
     if (jd_libva_ptr->initialized) {
         ALOGW("%s HW decode already initialized", __FUNCTION__);
         return DECODE_NOT_STARTED;
     }
 
     {
-        Mutex::Autolock autoLock(jdlock);
         if (!(jd_libva_ptr->initialized)) {
             jdva_private *priv = new jdva_private;
             memset(&priv->jpg_info, 0, sizeof(JpegInfo));
+            priv->jpg_info.use_vector_input = true;
             memset(&priv->dec_buffer, 0, sizeof(RenderTarget));
-            memset(&priv->yuy2_buffer, 0, sizeof(RenderTarget));
-            memset(&priv->rgba_buffer, 0, sizeof(RenderTarget));
+            priv->rgba_out = NULL;
+            priv->inputs.clear();
+            priv->jpg_info.inputs = &priv->inputs;
             jd_libva_ptr->initialized = TRUE;
             jd_libva_ptr->priv = (uint32_t)priv;
+            jd_libva_ptr->cap_available= 0x0;
+            jd_libva_ptr->cap_available |= JPEG_CAPABILITY_DECODE;
+#ifdef GFXGEN
+            jd_libva_ptr->cap_available |= JPEG_CAPABILITY_UPSAMPLE | JPEG_CAPABILITY_DOWNSCALE;
+#endif
+            jd_libva_ptr->cap_enabled = jd_libva_ptr->cap_available;
+            if (jd_libva_ptr->cap_available & JPEG_CAPABILITY_UPSAMPLE)
+                priv->decoder = new JpegDecoder(display, vpCfgId, vpCtxId, true);
+            else
+                priv->decoder = new JpegDecoder();
             status = DECODE_SUCCESS;
         }
     }
@@ -127,74 +188,127 @@
         Mutex::Autolock autoLock(jdlock);
         if (jd_libva_ptr->initialized) {
             jdva_private *p = (jdva_private*)jd_libva_ptr->priv;
+            delete p->decoder;
+            jd_libva_ptr->bitstream_buf = NULL;
+            p->inputs.clear();
             delete p;
             jd_libva_ptr->initialized = FALSE;
         }
     }
-    ALOGV("jdva_deinitialize finished");
     return;
 }
 
-RenderTarget * create_render_target(RenderTarget* target, int width, int height, int pixel_format)
+Decode_Status jdva_fill_input(j_decompress_ptr cinfo, jd_libva_struct * jd_libva_ptr)
+{
+    // Ask the source manager for one more chunk and append it to the private
+    // input vector; DECODE_DRIVER_FAIL is returned when the source refuses.
+    jdva_private *priv = (jdva_private*)jd_libva_ptr->priv;
+    if (!(*cinfo->src->fill_input_buffer)(cinfo))
+        return DECODE_DRIVER_FAIL;
+    assert(cinfo->src->next_input_byte);
+    assert(cinfo->src->bytes_in_buffer);
+    priv->inputs.appendArray(cinfo->src->next_input_byte, cinfo->src->bytes_in_buffer);
+    jd_libva_ptr->file_size += cinfo->src->bytes_in_buffer;
+    ALOGV("%s read %d bytes, file_size %u bytes, vector %u bytes", __FUNCTION__, cinfo->src->bytes_in_buffer, jd_libva_ptr->file_size, priv->inputs.size());
+    // the chunk has been copied, so report the source buffer as empty
+    cinfo->src->bytes_in_buffer = 0;
+    return DECODE_SUCCESS;
+}
+
+void jdva_drain_input(j_decompress_ptr cinfo, jd_libva_struct * jd_libva_ptr)
+{
+    // Keep reading until the source fails or hands back an empty chunk,
+    // appending every chunk to the input vector, then expose its storage.
+    const nsecs_t start = systemTime();
+    jdva_private *priv = (jdva_private*)jd_libva_ptr->priv;
+    for (;;) {
+        if (!(*cinfo->src->fill_input_buffer)(cinfo))
+            break;
+        priv->inputs.appendArray(cinfo->src->next_input_byte, cinfo->src->bytes_in_buffer);
+        jd_libva_ptr->file_size += cinfo->src->bytes_in_buffer;
+        if (cinfo->src->bytes_in_buffer == 0)
+            break;
+    }
+    jd_libva_ptr->bitstream_buf = priv->inputs.array();
+    ALOGV("%s drained input %u bytes took %.2f ms", __FUNCTION__, jd_libva_ptr->file_size,
+        (systemTime() - start)/1000000.0);
+}
+
+RenderTarget * create_render_target(RenderTarget* target, int width, int height, uint32_t fourcc)
 {
     hw_module_t const* module = NULL;
     alloc_device_t *allocdev = NULL;
     struct gralloc_module_t *gralloc_module = NULL;
     buffer_handle_t handle;
-    uint32_t fourcc;
     int stride, bpp, err;
-    fourcc = pixelFormat2Fourcc(pixel_format);
     bpp = fourcc2LumaBitsPerPixel(fourcc);
     if (target == NULL) {
         ALOGE("%s malloc new RenderTarget failed", __FUNCTION__);
         return NULL;
     }
-    ALOGV("%s created %s target %p", __FUNCTION__, fourcc2str(NULL, fourcc), target);
-    if ((fourcc == VA_FOURCC_422H) ||
-        (fourcc == VA_FOURCC_YUY2) ||
-        (fourcc == VA_FOURCC_RGBA)){
-        err = hw_get_module(GRALLOC_HARDWARE_MODULE_ID, &module);
-        if (err || !module) {
-            ALOGE("%s failed to get gralloc module", __FUNCTION__);
-            return NULL;
-        }
-        gralloc_module = (struct gralloc_module_t *)module;
-        err = gralloc_open(module, &allocdev);
-        if (err || !allocdev) {
-            ALOGE("%s failed to open alloc device", __FUNCTION__);
-            return NULL;
-        }
-        err = allocdev->alloc(allocdev,
-                width, height, pixel_format,
-                GRALLOC_USAGE_HW_RENDER,
-                &handle, &stride);
-        if (err) {
-            gralloc_close(allocdev);
-            ALOGE("%s failed to allocate surface", __FUNCTION__);
-            return NULL;
-        }
-        target->type = RenderTarget::ANDROID_GRALLOC;
-        target->handle = (int)handle;
-        target->stride = stride * bpp;
-    }
-    else {
-        *((int*)(&target->type)) = RENDERTARGET_INTERNAL_BUFFER;
-        target->handle = internal_buffer_handle++;
-    }
+    ALOGV("%s created %s target %p", __FUNCTION__, fourcc2str(fourcc), target);
+    target->type = RenderTarget::INTERNAL_BUF;
+    target->handle = generateHandle();
     target->width = width;
     target->height = height;
-    target->pixel_format = pixel_format;
+    target->pixel_format = fourcc;
     target->rect.x = target->rect.y = 0;
     target->rect.width = target->width;
     target->rect.height = target->height;
     return target;
 }
 
+Decode_Status jdva_init_read_tile_scanline(j_decompress_ptr cinfo, jd_libva_struct * jd_libva_ptr, int *x, int *y, int *w, int *h)
+{
+    // Clamp the requested tile to the image, divide its size by the stored
+    // scale factor, keep the result in the private state and report it back.
+    if (!(jd_libva_ptr->cap_enabled & JPEG_CAPABILITY_UPSAMPLE)) {
+        // should not be here
+        assert(false);
+        return DECODE_DRIVER_FAIL;
+    }
+    jdva_private * priv = (jdva_private*)jd_libva_ptr->priv;
+    if (priv->scale_factor <= 0) { // used as a divisor below
+        ALOGE("%s invalid scale factor %d", __FUNCTION__, priv->scale_factor);
+        return DECODE_DRIVER_FAIL;
+    }
+    if (priv->scale_factor != cinfo->scale_denom) {
+        ALOGV("%s scale_denom changed from %d to %d!!!!", __FUNCTION__, priv->scale_factor, cinfo->scale_denom);
+    }
+    priv->tile_read_x = (*x < cinfo->image_width)? *x: (cinfo->image_width - 1);
+    priv->tile_read_y = (*y < cinfo->image_height)? *y: (cinfo->image_height - 1);
+    priv->tile_read_width = (priv->tile_read_x + *w < cinfo->image_width)? *w: (cinfo->image_width - priv->tile_read_x);
+    priv->tile_read_width /= priv->scale_factor;
+    priv->tile_read_height = (priv->tile_read_y + *h < cinfo->image_height)? *h: (cinfo->image_height - priv->tile_read_y);
+    priv->tile_read_height /= priv->scale_factor;
+    ALOGV("%s, x=%d->%d, y=%d->%d, w=%d->%d, h=%d->%d", __FUNCTION__, *x, priv->tile_read_x, *y, priv->tile_read_y,
+        *w, priv->tile_read_width, *h, priv->tile_read_height);
+    *x = priv->tile_read_x;
+    *y = priv->tile_read_y;
+    *w = priv->tile_read_width;
+    *h = priv->tile_read_height;
+    return DECODE_SUCCESS;
+}
+Decode_Status jdva_read_tile_scanline (j_decompress_ptr cinfo, jd_libva_struct * jd_libva_ptr, char ** scanlines, unsigned int* row_ctr)
+{
+    if (!(jd_libva_ptr->cap_enabled & JPEG_CAPABILITY_UPSAMPLE)) {
+        // should not be here
+        assert(false);
+        return DECODE_DRIVER_FAIL;
+    }
+    // Copy exactly one row of the current tile out of rgba_out, then advance.
+    jdva_private *priv = (jdva_private*)jd_libva_ptr->priv;
+    *row_ctr = 1;
+    const uint8_t *row = priv->rgba_out + priv->tile_read_y * cinfo->image_width * 4 + priv->tile_read_x * 4;
+    memcpy(scanlines[0], row, priv->tile_read_width * 4);
+    priv->tile_read_y++;
+    return DECODE_SUCCESS;
+}
+
 void free_render_target(RenderTarget *target)
 {
     if (target == NULL)
         return;
-    uint32_t fourcc = pixelFormat2Fourcc(target->pixel_format);
+    uint32_t fourcc = target->pixel_format;
     if (target->type == RenderTarget::ANDROID_GRALLOC) {
         buffer_handle_t handle = (buffer_handle_t)target->handle;
         hw_module_t const* module = NULL;
@@ -214,218 +328,190 @@
         allocdev->free(allocdev, handle);
         gralloc_close(allocdev);
     }
-    ALOGV("%s deleting %s target %p", __FUNCTION__, fourcc2str(NULL, fourcc), target);
+    ALOGV("%s deleting %s target %p", __FUNCTION__, fourcc2str(fourcc), target);
 }
 
-void dump_yuy2_target(RenderTarget *target, JpegDecoder *decoder, const char *filename)
-{
-    uint32_t fourcc = pixelFormat2Fourcc(target->pixel_format);
-    assert(fourcc == VA_FOURCC_YUY2);
-    uint8_t *data;
-    uint32_t offsets[3];
-    uint32_t pitches[3];
-    JpegDecoder::MapHandle maphandle = decoder->mapData(*target, (void**) &data, offsets, pitches);
-    assert (maphandle.valid);
-    FILE* fpdump = fopen(filename, "wb");
-    if (fpdump) {
-        // YUYV
-        for (int i = 0; i < target->height; ++i) {
-            fwrite(data + offsets[0] + i * pitches[0], 1, target->width * 2, fpdump);
-        }
-        fclose(fpdump);
-    }
-    else {
-        ALOGW("%s failed to create %s", __FUNCTION__, filename);
-    }
-    decoder->unmapData(*target, maphandle);
-}
-
-void dump_dec_target(RenderTarget *target, JpegDecoder *decoder, const char *filename)
-{
-    uint32_t fourcc = pixelFormat2Fourcc(target->pixel_format);
-    assert((fourcc == VA_FOURCC_IMC3) ||
-        (fourcc == VA_FOURCC_411P) ||
-        (fourcc == VA_FOURCC('4','0','0','P')) ||
-        (fourcc == VA_FOURCC_422H) ||
-        (fourcc == VA_FOURCC_422V) ||
-        (fourcc == VA_FOURCC_444P));
-    uint8_t *data;
-    uint32_t offsets[3];
-    uint32_t pitches[3];
-    JpegDecoder::MapHandle maphandle = decoder->mapData(*target, (void**) &data, offsets, pitches);
-    assert (maphandle.valid);
-    FILE* fpdump = fopen(filename, "wb");
-    if(fpdump) {
-        float hfactor, vfactor;
-        switch (fourcc) {
-            case VA_FOURCC_IMC3:
-                hfactor = 1;
-                vfactor = 0.5;
-                break;
-            case VA_FOURCC_444P:
-                hfactor = vfactor = 1;
-                break;
-            case VA_FOURCC_422H:
-                hfactor = 0.5;
-                vfactor = 1;
-                break;
-            case VA_FOURCC('4','0','0','P'):
-                hfactor = vfactor = 0;
-                break;
-            case VA_FOURCC_411P:
-                hfactor = 0.25;
-                vfactor = 1;
-                break;
-            case VA_FOURCC_422V:
-                hfactor = 0.5;
-                vfactor = 1;
-                break;
-            default:
-                hfactor = vfactor = 1;
-                break;
-        }
-        // Y
-        for (int i = 0; i < target->height; ++i) {
-            fwrite(data + offsets[0] + i * pitches[0], 1, target->width, fpdump);
-        }
-        // U
-        for (int i = 0; i < target->height * vfactor; ++i) {
-            fwrite(data + offsets[1] + i * pitches[1], 1, target->width * hfactor, fpdump);
-        }
-        // V
-        for (int i = 0; i < target->height * vfactor; ++i) {
-            fwrite(data + offsets[2] + i * pitches[2], 1, target->width * hfactor, fpdump);
-        }
-        fclose(fpdump);
-    }
-    else {
-        ALOGW("%s failed to create %s", __FUNCTION__, filename);
-    }
-    decoder->unmapData(*target, maphandle);
-}
-
+Decode_Status jdva_blit(struct jdva_private * priv);
 
 Decode_Status jdva_decode (j_decompress_ptr cinfo, jd_libva_struct * jd_libva_ptr)
 {
     JpegDecodeStatus st;
-    char **outbuf = jd_libva_ptr->output_image;
-    uint32_t lines = jd_libva_ptr->output_lines;
     jdva_private * priv = (jdva_private*)jd_libva_ptr->priv;
-    if (!priv)
-        return DECODE_DRIVER_FAIL;
-
-    JpegInfo& jpginfo = priv->jpg_info;
-
-    st = priv->decoder.decode(jpginfo, priv->dec_buffer);
-    if (st != JD_SUCCESS) {
-        ALOGE("%s: error decoding %s image", __FUNCTION__, fourcc2str(NULL, jpginfo.image_color_fourcc));
-        return DECODE_DRIVER_FAIL;
-    }
-    ALOGI("%s successfully decoded JPEG with VAAPI", __FUNCTION__);
-    RenderTarget *src_target = &priv->dec_buffer;
-    //dump_dec_target(src_target, decoder,"/sdcard/dec_dump.yuv");
-
-    bool yuy2_csc = false;
     hw_module_t const* module = NULL;
     alloc_device_t *allocdev = NULL;
     struct gralloc_module_t *gralloc_module = NULL;
     buffer_handle_t handle;
     int err;
+    char fname[256];
+    FILE *fdec;
     uint8_t *data = NULL;
     uint32_t offsets[3];
     uint32_t pitches[3];
+    nsecs_t t1, t2, t3;
     JpegDecoder::MapHandle maphandle;
-    FILE *rgbafile = NULL;
-    if (jpginfo.image_color_fourcc != VA_FOURCC_422H)
-        yuy2_csc = true;
+    if (!priv)
+        return DECODE_DRIVER_FAIL;
 
-    // CSC to YUY2 if needed
-    if (yuy2_csc) {
-        st = priv->decoder.blit(*src_target, priv->yuy2_buffer);
-        if (st != JD_SUCCESS) {
-            ALOGE("%s: error blitting to YUY2 buffer", __FUNCTION__);
-            goto cleanup;
-        }
-        //dump_yuy2_target(src_target, decoder,"/sdcard/yuy2_dump.yuv");
-        src_target = &priv->yuy2_buffer;
+    t1 = systemTime();
+    JpegInfo& jpginfo = priv->jpg_info;
+
+    if (jd_libva_ptr->cap_enabled & JPEG_CAPABILITY_DOWNSCALE) {
+        priv->scale_factor = cinfo->scale_denom;
+        cinfo->min_DCT_scaled_size = DCTSIZE/priv->scale_factor;
+        cinfo->output_width = cinfo->image_width/priv->scale_factor;
+        cinfo->output_height = cinfo->image_height/priv->scale_factor;
+    }
+    else {
+        priv->scale_factor = 1;
+        cinfo->min_DCT_scaled_size = DCTSIZE;
+        cinfo->output_width = cinfo->image_width;
+        cinfo->output_height = cinfo->image_height;
     }
 
-    st = priv->decoder.blit(*src_target, priv->rgba_buffer);
+    jdva_drain_input(cinfo, jd_libva_ptr);
+    jpginfo.need_header_only = false;
+    st = priv->decoder->parse(jpginfo);
+    switch (st) {
+    case JD_ERROR_BITSTREAM:
+        ALOGE("%s: error parsing bitstream", __FUNCTION__);
+        return DECODE_PARSER_FAIL;
+    case JD_SUCCESS:
+        break;
+    default:
+        ALOGE("%s: error in driver: parse failed", __FUNCTION__);
+        return DECODE_DRIVER_FAIL;
+    }
+
+    st = priv->decoder->decode(jpginfo, priv->dec_buffer);
     if (st != JD_SUCCESS) {
-        ALOGE("%s: error blitting to RGBA buffer", __FUNCTION__);
-        goto cleanup;
+        ALOGE("%s: error decoding %s image", __FUNCTION__, fourcc2str(jpginfo.image_color_fourcc));
+        return DECODE_DRIVER_FAIL;
     }
-    maphandle = priv->decoder.mapData(priv->rgba_buffer, (void**) &data, offsets, pitches);
+#if DUMP_DECODE
+    snprintf(fname, sizeof(fname), DECODE_DUMP_FILE_PATTERN, jpginfo.image_width, jpginfo.image_height, fourcc2str(jpginfo.image_color_fourcc)); // bounded write: fname is a fixed 256-byte array
+    fdec = fopen(fname, "wb");
+    if (fdec) {
+        maphandle = priv->decoder->mapData(priv->dec_buffer, (void**)&data, offsets, pitches);
+        int ss_x, ss_y;
+        ss_x = ss_y = -1;
+        switch(jpginfo.image_color_fourcc) {
+        case VA_FOURCC_411P:
+            ss_x = 2;
+            ss_y = 0;
+            break;
+        case VA_FOURCC_IMC3:
+            ss_x = 1;
+            ss_y = 1;
+            break;
+        case VA_FOURCC_422V:
+            ss_x = 0;
+            ss_y = 1;
+            break;
+        case VA_FOURCC_422H:
+            ss_x = 1;
+            ss_y = 0;
+            break;
+        case VA_FOURCC_444P:
+            ss_x = 0;
+            ss_y = 0;
+            break;
+        default:
+            break;
+        }
+        for (int r = 0; r < jpginfo.image_height; ++r)
+            fwrite(data + offsets[0] + pitches[0] * r, 1, jpginfo.image_width, fdec);
+        if (ss_x >=0 && ss_y >=0) {
+            for (int r = 0; r < jpginfo.image_height >> ss_y; ++r)
+                fwrite(data + offsets[1] + pitches[1] * r, 1, jpginfo.image_width >> ss_x, fdec);
+            for (int r = 0; r < jpginfo.image_height >> ss_y; ++r)
+                fwrite(data + offsets[2] + pitches[2] * r, 1, jpginfo.image_width >> ss_x, fdec);
+        }
+        priv->decoder->unmapData(priv->dec_buffer, maphandle);
+        fclose(fdec);
+        ALOGV("%s Dumped decode surface into %s", __FUNCTION__, fname);
+    }
+#endif
+    t2 = systemTime();
 
-    //rgbafile = fopen("/sdcard/rgba_dump", "wb");
+    if (!(jd_libva_ptr->cap_enabled & JPEG_CAPABILITY_UPSAMPLE)) {
+        ALOGV("%s decoded %ux%u %s JPEG for %.2f ms", __FUNCTION__,
+            priv->jpg_info.image_width, priv->jpg_info.image_height,
+            fourcc2str(priv->jpg_info.image_color_fourcc),
+            (t2-t1)/1000000.0);
+        // TODO: implement
+    }
+    else {
+        priv->rgba_out = (uint8_t*)memalign(0x1000,
+            aligned_width(cinfo->output_width, SURF_TILING_Y)
+            * aligned_height(cinfo->output_height, SURF_TILING_Y) * 4);
+        if (priv->rgba_out == NULL) {
+            ALOGE("%s failed to create RGBA buffer", __FUNCTION__);
+            return DECODE_MEMORY_FAIL;
+        }
 
-    for (uint32_t i = 0; i < lines; ++i) {
-        if (outbuf[i] != NULL) {
-            //memcpy(outbuf[i], data + offsets[0] + i * pitches[0], 4 * jpginfo.image_width);
-            for (int j = 0; j < priv->rgba_buffer.width; ++j) {
-                // BGRA -> RGBA
-                // R
-                memcpy(outbuf[i] + 4 * j, data + offsets[0] + i * pitches[0] + 4 * j + 2, 1);
-                // G
-                memcpy(outbuf[i] + 4 * j + 1, data + offsets[0] + i * pitches[0] + 4 * j + 1, 1);
-                // B
-                memcpy(outbuf[i] + 4 * j + 2, data + offsets[0] + i * pitches[0] + 4 * j, 1);
-                // A
-                memcpy(outbuf[i] + 4 * j + 3, data + offsets[0] + i * pitches[0] + 4 * j + 3, 1);
+        Decode_Status ret;
+        {
+            Mutex::Autolock autoLock(jdlock);
+            ret = jdva_blit(priv);
+            if (ret != DECODE_SUCCESS) {
+                ALOGE("%s blit %ux%u (%dx scaling) %s failed", __FUNCTION__,
+                    priv->jpg_info.image_width, priv->jpg_info.image_height,
+                    priv->scale_factor,
+                    fourcc2str(priv->jpg_info.image_color_fourcc));
+                goto cleanup;
             }
         }
-        else {
-            ALOGE("%s outbuf line %u is NULL", __FUNCTION__, i);
-        }
-        //if (rgbafile) {
-        //    fwrite(data + offsets[0] + i * pitches[0], 1, 4 * rgba_target->width, rgbafile);
-        //}
+        t3 = systemTime();
+        ALOGI("%s decode+blit %ux%u (%dx scaling) %s JPEG for %.2f+%.2f ms", __FUNCTION__,
+            priv->jpg_info.image_width, priv->jpg_info.image_height,
+            priv->scale_factor,
+            fourcc2str(priv->jpg_info.image_color_fourcc),
+            (t2-t1)/1000000.0, (t3-t2)/1000000.0);
     }
-    //if (rgbafile)
-    //    fclose(rgbafile);
-    ALOGI("%s successfully blitted RGBA from JPEG %s data", __FUNCTION__, fourcc2str(NULL, priv->jpg_info.image_color_fourcc));
-    priv->decoder.unmapData(priv->rgba_buffer, maphandle);
     return DECODE_SUCCESS;
-
 cleanup:
+    if (priv->rgba_out) {
+        free(priv->rgba_out);
+        priv->rgba_out = NULL;
+    }
     return DECODE_DRIVER_FAIL;
 }
 
+// Copy up to max_lines decoded RGBA rows, starting at cinfo->output_scanline, from
+// priv->rgba_out into scanlines[]; *row_ctr receives the number of rows copied.
+Decode_Status jdva_read_scanlines (j_decompress_ptr cinfo, jd_libva_struct * jd_libva_ptr, char ** scanlines, unsigned int* row_ctr, unsigned int max_lines)
+{
+    if (!(jd_libva_ptr->cap_enabled & JPEG_CAPABILITY_UPSAMPLE)) {
+        assert(false); // should not be here
+        return DECODE_DRIVER_FAIL;
+    }
+    jdva_private *ctx = (jdva_private*)jd_libva_ptr->priv;
+    const uint32_t first_row = cinfo->output_scanline;
+    unsigned int copied = 0;
+    for (; first_row + copied < cinfo->output_height && copied < max_lines; ++copied)
+        memcpy(scanlines[copied], ctx->rgba_out + (first_row + copied) * aligned_width(cinfo->output_width, SURF_TILING_Y) * 4, cinfo->output_width * 4);
+    *row_ctr = copied;
+    return DECODE_SUCCESS;
+}
+
 Decode_Status jdva_create_resource (jd_libva_struct * jd_libva_ptr)
 {
     VAStatus va_status = VA_STATUS_SUCCESS;
     Decode_Status status = DECODE_SUCCESS;
-    RenderTarget *dec_target, *yuy2_target, *rgba_target;
-    dec_target = yuy2_target = rgba_target = NULL;
     JpegDecodeStatus st;
     Mutex::Autolock autoLock(jdlock);
     jdva_private *priv = (jdva_private*)jd_libva_ptr->priv;
     jd_libva_ptr->image_width = priv->jpg_info.picture_param_buf.picture_width;
     jd_libva_ptr->image_height = priv->jpg_info.picture_param_buf.picture_height;
-    dec_target = create_render_target(&priv->dec_buffer, jd_libva_ptr->image_width,jd_libva_ptr->image_height,fourcc2PixelFormat(priv->jpg_info.image_color_fourcc));
-    if (dec_target == NULL) {
+    // Failure is reported through the return value; the address &priv->dec_buffer itself can never be NULL.
+    if (create_render_target(&priv->dec_buffer, jd_libva_ptr->image_width,jd_libva_ptr->image_height,priv->jpg_info.image_color_fourcc) == NULL) {
         ALOGE("%s failed to create decode render target", __FUNCTION__);
         return DECODE_MEMORY_FAIL;
     }
-    rgba_target = create_render_target(&priv->rgba_buffer, jd_libva_ptr->image_width,jd_libva_ptr->image_height, HAL_PIXEL_FORMAT_RGBA_8888);
-    if (rgba_target == NULL) {
-        ALOGE("%s failed to create YUY2 csc buffer", __FUNCTION__);
-        free_render_target(dec_target);
-        return DECODE_MEMORY_FAIL;
-    }
-    yuy2_target = create_render_target(&priv->yuy2_buffer, jd_libva_ptr->image_width,jd_libva_ptr->image_height, HAL_PIXEL_FORMAT_YCbCr_422_I);
-    if (yuy2_target == NULL) {
-        ALOGE("%s failed to create RGBA csc buffer", __FUNCTION__);
-        free_render_target(dec_target);
-        free_render_target(rgba_target);
-        return DECODE_MEMORY_FAIL;
-    }
-    RenderTarget *targetlist[3] = { dec_target, yuy2_target, rgba_target };
-    st = priv->decoder.init(jd_libva_ptr->image_width, jd_libva_ptr->image_height, targetlist, 3);
+    RenderTarget *targets = &priv->dec_buffer;
+    st = priv->decoder->init(jd_libva_ptr->image_width, jd_libva_ptr->image_height, &targets, 1);
     if (st != JD_SUCCESS) {
-        free_render_target(dec_target);
-        free_render_target(rgba_target);
-        free_render_target(yuy2_target);
+        free_render_target(&priv->dec_buffer);
         ALOGE("%s failed to initialize resources for decoder: %d", __FUNCTION__, st);
         return DECODE_DRIVER_FAIL;
     }
@@ -455,10 +541,12 @@
     ALOGV("%s deiniting priv 0x%x", __FUNCTION__, jd_libva_ptr->priv);
     jdva_private *priv = (jdva_private*)jd_libva_ptr->priv;
     if (priv) {
-        priv->decoder.deinit();
+        priv->decoder->deinit();
         free_render_target(&priv->dec_buffer);
-        free_render_target(&priv->yuy2_buffer);
-        free_render_target(&priv->rgba_buffer);
+        if (priv->rgba_out) {
+            free(priv->rgba_out);
+            priv->rgba_out = NULL;
+        }
     }
   /*
    * It is safe to destroy Surface/Config/Context severl times
@@ -477,9 +565,17 @@
     if (!priv)
         return DECODE_DRIVER_FAIL;
     JpegInfo& jpginfo = priv->jpg_info;
-    jpginfo.buf = jd_libva_ptr->bitstream_buf;
-    jpginfo.bufsize = jd_libva_ptr->file_size;
-    JpegDecodeStatus st = priv->decoder.parse(jpginfo);
+    JpegDecodeStatus st;
+
+    Decode_Status res;
+    jpginfo.need_header_only = true;
+    do {
+        res = jdva_fill_input(cinfo, jd_libva_ptr);
+        if (res) {
+            return res;
+        }
+        st = priv->decoder->parse(jpginfo);
+    } while (st == JD_INSUFFICIENT_BYTE);
     if (st != JD_SUCCESS) {
         ALOGE("%s parser for HW decode failed: %d", __FUNCTION__, st);
         return DECODE_PARSER_FAIL;
@@ -492,8 +588,44 @@
     cinfo->image_height = jpginfo.picture_param_buf.picture_height;  /* nominal image height */
     cinfo->num_components = jpginfo.picture_param_buf.num_components;       /* # of color components in JPEG image */
     cinfo->jpeg_color_space = JCS_YCbCr; /* colorspace of JPEG image */
-    cinfo->out_color_space = JCS_RGB; /* colorspace for output */
+    cinfo->out_color_space = JCS_RGB; /* set default colorspace for output */
     cinfo->src->bytes_in_buffer = jd_libva_ptr->file_size;
+    cinfo->scale_num = cinfo->scale_denom = 1; /* set default value */
     return DECODE_SUCCESS;
 }
 
+// Converts the decoded surface in priv->dec_buffer into linear RGBA in priv->rgba_out via
+// JpegDecoder::blitToLinearRgba, passing priv->scale_factor through to the blitter.
+// Returns DECODE_SUCCESS on success, DECODE_DRIVER_FAIL when the blit reports an error.
+// The caller (jdva_decode) allocates priv->rgba_out before calling.
+Decode_Status jdva_blit(struct jdva_private * priv)
+{
+    const nsecs_t t1 = systemTime();
+    const JpegDecodeStatus st = priv->decoder->blitToLinearRgba(priv->dec_buffer, priv->rgba_out,
+        priv->jpg_info.image_width,
+        priv->jpg_info.image_height,
+        priv->blit_event, priv->scale_factor);
+    if (st != JD_SUCCESS) {
+        ALOGE("%s: error blitting to RGBA buffer", __FUNCTION__);
+        return DECODE_DRIVER_FAIL;
+    }
+    const nsecs_t t2 = systemTime();
+#if DUMP_RGBA
+    // Debug aid: dump the RGBA output to a file named after the output size and source fourcc.
+    char fname[256];
+    // snprintf bounds the write to fname regardless of what the pattern expands to
+    snprintf(fname, sizeof(fname), RGBA_DUMP_FILE_PATTERN, priv->jpg_info.output_width, priv->jpg_info.output_height, fourcc2str(priv->jpg_info.image_color_fourcc));
+    FILE *fdec = fopen(fname, "wb");
+    if (fdec) {
+        fwrite(priv->rgba_out, 1, priv->jpg_info.output_width * priv->jpg_info.output_height * 4, fdec);
+        fclose(fdec);
+        ALOGV("%s Dumped RGBA output into %s", __FUNCTION__, fname);
+    }
+#endif
+    ALOGV("%s blitted %ux%u RGBA from JPEG %s data for %.2f ms", __FUNCTION__,
+        priv->jpg_info.image_width, priv->jpg_info.image_height,
+        fourcc2str(priv->jpg_info.image_color_fourcc),
+        (t2-t1)/1000000.0);
+    return DECODE_SUCCESS;
+}
+
diff --git a/imagedecoder/JPEGDecoder_libjpeg_wrapper.h b/imagedecoder/JPEGDecoder_libjpeg_wrapper.h
index c9d060b..72a216a 100644
--- a/imagedecoder/JPEGDecoder_libjpeg_wrapper.h
+++ b/imagedecoder/JPEGDecoder_libjpeg_wrapper.h
@@ -1,6 +1,5 @@
 /* INTEL CONFIDENTIAL
 * Copyright (c) 2012, 2013 Intel Corporation.  All rights reserved.
-* Copyright (c) Imagination Technologies Limited, UK
 *
 * The source code contained or described herein and all documents
 * related to the source code ("Material") are owned by Intel
@@ -41,7 +40,7 @@
 #define JPEG_MAX_QUANT_TABLES 4
 
 typedef struct {
-    uint8_t* bitstream_buf;
+    const uint8_t* bitstream_buf;
     uint32_t image_width;
     uint32_t image_height;
 
@@ -53,24 +52,30 @@
 
     uint32_t file_size;
     uint32_t rotation;
+    int      tile_mode;
 
-    char ** output_image;
-    uint32_t output_lines;
+    uint32_t cap_available;
+    uint32_t cap_enabled;
 
     uint32_t priv;
 } jd_libva_struct;
 
 typedef enum {
-    DECODE_NOT_STARTED = -6,
-    DECODE_INVALID_DATA = -5,
-    DECODE_DRIVER_FAIL = -4,
-    DECODE_PARSER_FAIL = -3,
+    DECODE_NOT_STARTED = -7,
+    DECODE_INVALID_DATA = -6,
+    DECODE_DRIVER_FAIL = -5,
+    DECODE_PARSER_FAIL = -4,
+    DECODE_PARSER_INSUFFICIENT_BYTES = -3,
     DECODE_MEMORY_FAIL = -2,
     DECODE_FAIL = -1,
     DECODE_SUCCESS = 0,
 
 } IMAGE_DECODE_STATUS;
 
+#define JPEG_CAPABILITY_DECODE     0x0
+#define JPEG_CAPABILITY_UPSAMPLE   0x1
+#define JPEG_CAPABILITY_DOWNSCALE  0x2
+
 /*********************** for libjpeg ****************************/
 typedef int32_t Decode_Status;
 extern jd_libva_struct jd_libva;
@@ -79,7 +84,12 @@
 #endif
 Decode_Status jdva_initialize (jd_libva_struct * jd_libva_ptr);
 void jdva_deinitialize (jd_libva_struct * jd_libva_ptr);
+Decode_Status jdva_fill_input(j_decompress_ptr cinfo, jd_libva_struct * jd_libva_ptr);
+void jdva_drain_input(j_decompress_ptr cinfo, jd_libva_struct * jd_libva_ptr);
 Decode_Status jdva_decode (j_decompress_ptr cinfo, jd_libva_struct * jd_libva_ptr);
+Decode_Status jdva_read_scanlines (j_decompress_ptr cinfo, jd_libva_struct * jd_libva_ptr, char ** scanlines, unsigned int* row_ctr, unsigned int max_lines);
+Decode_Status jdva_init_read_tile_scanline(j_decompress_ptr cinfo, jd_libva_struct * jd_libva_ptr, int *x, int *y, int *w, int *h);
+Decode_Status jdva_read_tile_scanline (j_decompress_ptr cinfo, jd_libva_struct * jd_libva_ptr, char ** scanlines, unsigned int* row_ctr);
 Decode_Status jdva_create_resource (jd_libva_struct * jd_libva_ptr);
 Decode_Status jdva_release_resource (jd_libva_struct * jd_libva_ptr);
 Decode_Status jdva_parse_bitstream(j_decompress_ptr cinfo, jd_libva_struct * jd_libva_ptr);
diff --git a/imagedecoder/JPEGParser.cpp b/imagedecoder/JPEGParser.cpp
index 1d6ab26..c2ca299 100644
--- a/imagedecoder/JPEGParser.cpp
+++ b/imagedecoder/JPEGParser.cpp
@@ -104,11 +104,94 @@
     return parser->end_of_buff;
 }
 
-uint8_t* getCurrentIndex(CJPEGParse* parser) {
+const uint8_t* getCurrentIndex(CJPEGParse* parser) {
     return parser->stream_buff + parser->parse_index;
 }
 
-void parserInitialize(CJPEGParse* parser,  uint8_t* stream_buff, uint32_t buff_size) {
+uint32_t getRemainingBytes(CJPEGParse* parser) { // buffer-backed: buff_size - parse_index - 1, clamped at 0
+    return (parser->parse_index < parser->buff_size) ? parser->buff_size - parser->parse_index - 1 : 0; // clamp: the unsigned subtraction would wrap when the index is at/after the end (e.g. empty buffer)
+}
+
+bool endOfBufferStr(CJPEGParse* parser);
+
+// Vector-backed readNextByte: returns the byte at parse_index and advances past it, or
+// returns 0 without advancing when parse_index is not inside parser->inputs.
+// end_of_buff is raised whenever parse_index equals inputs->size() on exit.
+uint8_t readNextByteStr(CJPEGParse* parser) {
+    uint8_t value = 0;
+    if (parser->parse_index < parser->inputs->size()) {
+        value = parser->inputs->itemAt(parser->parse_index++);
+    }
+    // the read above may have consumed the final byte
+    if (parser->parse_index == parser->inputs->size()) {
+        parser->end_of_buff = true;
+    }
+    return value;
+}
+
+// Vector-backed readBytes: packs up to bytes_to_read bytes into one word in big-endian
+// order (earlier bytes are shifted further left). Stops early once end_of_buff is set,
+// leaving the remaining low-order positions zero.
+uint32_t readBytesStr( CJPEGParse* parser, uint32_t bytes_to_read ) {
+    uint32_t value = 0;
+    for (uint32_t left = bytes_to_read; left != 0 && !endOfBufferStr(parser); --left)
+        value |= (uint32_t)readNextByteStr(parser) << ((left - 1) * 8);
+    return value;
+}
+
+// Vector-backed burnBytes: skips bytes_to_burn bytes. If that lands on or past the end of
+// parser->inputs, parse_index is set to size() - 1 and end_of_buff is raised.
+void burnBytesStr( CJPEGParse* parser, uint32_t bytes_to_burn ) {
+    parser->parse_index += bytes_to_burn;
+    if (parser->parse_index < parser->inputs->size()) return;
+    parser->parse_index = parser->inputs->size() - 1;
+    parser->end_of_buff = true;
+}
+
+// Vector-backed getNextMarker: skips forward through the first 0xff byte (or until
+// end_of_buff) and returns the byte after it; one extra 0xff is skipped if present.
+uint8_t getNextMarkerStr(CJPEGParse* parser) {
+    // consume bytes up to and including the first 0xff
+    while (!endOfBufferStr(parser) && readNextByteStr(parser) != 0xff) {
+    }
+    /* check the next byte to make sure we don't miss the real marker */
+    const uint8_t candidate = readNextByteStr(parser);
+    if (candidate != 0xff) {
+        return candidate;
+    }
+    return readNextByteStr(parser);
+}
+
+// Vector-backed setByteOffset: moves parse_index to byte_offset when that offset lies
+// inside parser->inputs and returns true; otherwise leaves the parser unchanged and
+// returns false.
+bool setByteOffsetStr(CJPEGParse* parser, uint32_t byte_offset)
+{
+    if (byte_offset >= parser->inputs->size()) {
+        return false;
+    }
+    // end_of_buff is not cleared here (a reset was left commented out: end_of_buff = false)
+    parser->parse_index = byte_offset;
+    return true;
+}
+
+uint32_t getByteOffsetStr(CJPEGParse* parser) { // vector-backed getByteOffset: current read position
+    return parser->parse_index;
+}
+
+bool endOfBufferStr(CJPEGParse* parser) { // vector-backed endOfBuffer: reports the end_of_buff flag as-is
+    return parser->end_of_buff;
+}
+
+const uint8_t* getCurrentIndexStr(CJPEGParse* parser) { // vector-backed getCurrentIndex: address of the byte at parse_index
+    return parser->inputs->array() + parser->parse_index;
+}
+
+uint32_t getRemainingBytesStr(CJPEGParse* parser) { // vector-backed: inputs->size() - parse_index - 1, clamped at 0
+    return (parser->parse_index < parser->inputs->size()) ? parser->inputs->size() - parser->parse_index - 1 : 0; // clamp: the unsigned subtraction would wrap when the index is at/after the end (e.g. empty vector)
+}
+
+void parserInitialize(CJPEGParse* parser,  const uint8_t* stream_buff, uint32_t buff_size) {
     parser->parse_index = 0;
     parser->buff_size = buff_size;
     parser->stream_buff = stream_buff;
@@ -121,4 +204,21 @@
     parser->endOfBuffer = endOfBuffer;
     parser->getCurrentIndex = getCurrentIndex;
     parser->setByteOffset= setByteOffset;
+    parser->getRemainingBytes = getRemainingBytes;
 }
+
+void parserInitialize(CJPEGParse* parser, android::Vector<uint8_t> *inputs) { // vector overload: binds parser to inputs and installs the *Str callbacks
+    parser->parse_index = 0; // start reading at the first byte
+    parser->inputs = inputs; // stream_buff and buff_size are not touched by this overload
+    parser->end_of_buff = false;
+    parser->readNextByte = readNextByteStr;
+    parser->readBytes = readBytesStr;
+    parser->burnBytes = burnBytesStr;
+    parser->getNextMarker = getNextMarkerStr;
+    parser->getByteOffset = getByteOffsetStr;
+    parser->endOfBuffer = endOfBufferStr;
+    parser->getCurrentIndex = getCurrentIndexStr;
+    parser->setByteOffset= setByteOffsetStr;
+    parser->getRemainingBytes = getRemainingBytesStr;
+}
+
diff --git a/imagedecoder/JPEGParser.h b/imagedecoder/JPEGParser.h
index be6ac4d..1dfc1bd 100644
--- a/imagedecoder/JPEGParser.h
+++ b/imagedecoder/JPEGParser.h
@@ -30,7 +30,8 @@
 #define _JPEG_PARSE_H_
 
 #include <stdint.h>
-
+#include <utils/Vector.h>
+using namespace std;
 // Marker Codes
 #define CODE_SOF_BASELINE 0xC0
 #define CODE_SOF1         0xC1
@@ -78,9 +79,10 @@
 #define CODE_APP15        0xEF
 
 struct CJPEGParse {
-    uint8_t* stream_buff;
+    const uint8_t* stream_buff;
     uint32_t parse_index;
     uint32_t buff_size;
+    android::Vector<uint8_t> *inputs;
     bool end_of_buff;
     uint8_t (*readNextByte)(CJPEGParse* parser);
     uint32_t (*readBytes)( CJPEGParse* parser, uint32_t bytes_to_read );
@@ -88,10 +90,129 @@
     uint8_t (*getNextMarker)(CJPEGParse* parser);
     uint32_t (*getByteOffset)(CJPEGParse* parser);
     bool (*endOfBuffer)(CJPEGParse* parser);
-    uint8_t* (*getCurrentIndex)(CJPEGParse* parser);
+    const uint8_t* (*getCurrentIndex)(CJPEGParse* parser);
     bool (*setByteOffset)( CJPEGParse* parser, uint32_t byte_offset );
+    uint32_t (*getRemainingBytes)(CJPEGParse* parser);
 };
 
-void parserInitialize(CJPEGParse* parser,  uint8_t* stream_buff, uint32_t buff_size);
+void parserInitialize(CJPEGParse* parser, const uint8_t* stream_buff, uint32_t buff_size);
+void parserInitialize(CJPEGParse* parser, android::Vector<uint8_t> *inputs);
+
+// Bounds-checked facade over CJPEGParse. The try* members consult getRemainingBytes()
+// before touching the stream and return false instead of reading when too few bytes
+// are left. There is no constructor: call set() before using any other member.
+class JpegBitstreamParser
+{
+public:
+    // Parse from a byte vector (the vector overload of parserInitialize is used).
+    void set(android::Vector<uint8_t>* inputs)
+    {
+        use_vector = true;
+        parserInitialize(&parser, inputs);
+    }
+    // Parse from a raw buffer of bufsize bytes.
+    void set(const uint8_t *buf, uint32_t bufsize)
+    {
+        use_vector = false;
+        parserInitialize(&parser, buf, bufsize);
+    }
+    // Reads one byte into *byte; false (and *byte untouched) if getRemainingBytes() < 1.
+    bool tryReadNextByte(uint8_t *byte)
+    {
+        if (parser.getRemainingBytes(&parser) < 1)
+            return false;
+        *byte = parser.readNextByte(&parser);
+        return true;
+    }
+    // Reads bytes_to_read bytes into *bytes; false if fewer than that remain.
+    bool tryReadBytes(uint32_t *bytes, uint32_t bytes_to_read)
+    {
+        if (parser.getRemainingBytes(&parser) < bytes_to_read)
+            return false;
+        *bytes = parser.readBytes(&parser, bytes_to_read);
+        return true;
+    }
+    // Skips bytes_to_burn bytes; false if fewer than that remain.
+    bool tryBurnBytes(uint32_t bytes_to_burn)
+    {
+        if (parser.getRemainingBytes(&parser) < bytes_to_burn)
+            return false;
+        parser.burnBytes(&parser, bytes_to_burn);
+        return true;
+    }
+    // Scans for the next 0xff-prefixed marker and stores its code in *marker. If a read
+    // is refused first, parse_index is restored to where the search began and false is
+    // returned (*marker may have been overwritten by then).
+    bool tryGetNextMarker(uint8_t *marker)
+    {
+        const uint32_t search_start = parser.getByteOffset(&parser);
+        if (scanForMarker(marker))
+            return true;
+        parser.parse_index = search_start;
+        return false;
+    }
+    // Current read offset as reported by the installed callback.
+    uint32_t getByteOffset()
+    {
+        return parser.getByteOffset(&parser);
+    }
+    // Whatever the installed endOfBuffer callback reports.
+    bool endOfBuffer()
+    {
+        return parser.endOfBuffer(&parser);
+    }
+    // Pointer returned by the installed getCurrentIndex callback.
+    const uint8_t* getCurrentIndex()
+    {
+        return parser.getCurrentIndex(&parser);
+    }
+    // Moves the read offset to byte_offset; false if it is not below the input size.
+    bool trySetByteOffset(uint32_t byte_offset)
+    {
+        const uint32_t bufsize = use_vector ? parser.inputs->size() : parser.buff_size;
+        if (bufsize <= byte_offset)
+            return false;
+        parser.setByteOffset(&parser, byte_offset);
+        return true;
+    }
+    // Bytes still available according to the installed callback.
+    uint32_t getRemainingBytes()
+    {
+        return parser.getRemainingBytes(&parser);
+    }
+    // Input byte at index; no bounds check is made here.
+    const uint8_t itemAt(uint32_t index)
+    {
+        return use_vector ? parser.inputs->itemAt(index) : parser.stream_buff[index];
+    }
+    // Detaches from any input; the callbacks and end_of_buff are left as they were.
+    void reset()
+    {
+        use_vector = false;
+        parser.parse_index = 0;
+        parser.buff_size = 0;
+        parser.stream_buff = NULL;
+        parser.inputs = NULL;
+    }
+private:
+    // Marker scan behind tryGetNextMarker(); returns false as soon as a read is refused.
+    bool scanForMarker(uint8_t *marker)
+    {
+        // consume bytes up to and including the first 0xff
+        while (!parser.endOfBuffer(&parser)) {
+            if (!tryReadNextByte(marker))
+                return false;
+            if (*marker == 0xff)
+                break;
+        }
+        /* check the next byte to make sure we don't miss the real marker */
+        if (!tryReadNextByte(marker))
+            return false;
+        // a second 0xff is skipped; the byte after it is taken as the marker
+        return (*marker != 0xff) ? true : tryReadNextByte(marker);
+    }
+    CJPEGParse parser;
+    bool use_vector;
+};
 #endif // _JPEG_PARSE_H_
 
diff --git a/imagedecoder/libjpeg_cm_genx.isa b/imagedecoder/libjpeg_cm_genx.isa
new file mode 100644
index 0000000..0947267
--- /dev/null
+++ b/imagedecoder/libjpeg_cm_genx.isa
Binary files differ
diff --git a/imagedecoder/test/testdecode.cpp b/imagedecoder/test/testdecode.cpp
index 6823b85..3dde5b4 100644
--- a/imagedecoder/test/testdecode.cpp
+++ b/imagedecoder/test/testdecode.cpp
@@ -4,21 +4,20 @@
 #include <utils/threads.h>
 #include <utils/Timers.h>
 #include <stdio.h>
+#ifdef NDEBUG
 #undef NDEBUG
+#endif
 #include <assert.h>
 #include <hardware/gralloc.h>
 
-#define JPGFILE "/sdcard/1280x720xYUV422H.jpg"
+static char jpgfile[100];
 
-RenderTarget& init_render_target(RenderTarget &target, int width, int height, int pixel_format)
+RenderTarget& init_render_target_drm(RenderTarget &target, int width, int height, uint32_t fourcc, buffer_handle_t *handle)
 {
     hw_module_t const* module = NULL;
     alloc_device_t *allocdev = NULL;
     struct gralloc_module_t *gralloc_module = NULL;
-    buffer_handle_t handle;
-    uint32_t fourcc;
     int stride, bpp, err;
-    fourcc = pixelFormat2Fourcc(pixel_format);
     bpp = fourcc2LumaBitsPerPixel(fourcc);
     err = hw_get_module(GRALLOC_HARDWARE_MODULE_ID, &module);
     if (err || !module) {
@@ -34,31 +33,211 @@
     err = allocdev->alloc(allocdev,
             width,
             height,
-            pixel_format,
+            fourcc2PixelFormat(fourcc),
+            GRALLOC_USAGE_HW_RENDER,
+            handle,
+            &stride);
+    if (err) {
+        gralloc_close(allocdev);
+        printf("%s failed to allocate surface %d, %dx%d, pixelformat %x\n", __PRETTY_FUNCTION__, err,
+            width, height, fourcc2PixelFormat(fourcc));
+        assert(false);
+    }
+    unsigned long boname;
+    err = gralloc_module->perform(gralloc_module,
+        INTEL_UFO_GRALLOC_MODULE_PERFORM_GET_BO_NAME,
+        *handle,
+        &boname);
+    assert(!err);
+    target.type = RenderTarget::KERNEL_DRM;
+    target.handle = (int)boname;
+    switch(fourcc) {
+    case VA_FOURCC_NV12:
+    case VA_FOURCC_422H:
+    case VA_FOURCC_422V:
+    case VA_FOURCC_IMC3:
+    case VA_FOURCC_444P:
+    case VA_FOURCC_411P:
+    case VA_FOURCC('4','0','0','P'):
+        target.width = aligned_width(width, SURF_TILING_Y);
+        target.height = aligned_height(height, SURF_TILING_Y);
+        break;
+    default:
+        target.width = aligned_width(width, SURF_TILING_NONE);
+        target.height = aligned_height(height, SURF_TILING_NONE);
+        break;
+    }
+    target.format = fourcc2VaFormat(fourcc);
+    target.pixel_format = fourcc;
+    target.rect.x = target.rect.y = 0;
+    target.rect.width = width;
+    target.rect.height = height;
+    target.stride = stride * bpp;
+    gralloc_close(allocdev);
+    return target;
+}
+
+RenderTarget& init_render_target_gralloc(RenderTarget &target, int width, int height, uint32_t fourcc)
+{
+    hw_module_t const* module = NULL;
+    alloc_device_t *allocdev = NULL;
+    struct gralloc_module_t *gralloc_module = NULL;
+    buffer_handle_t handle;
+    int stride, bpp, err;
+    bpp = fourcc2LumaBitsPerPixel(fourcc);
+    err = hw_get_module(GRALLOC_HARDWARE_MODULE_ID, &module);
+    if (err || !module) {
+        printf("%s failed to get gralloc module\n", __PRETTY_FUNCTION__);
+        assert(false);
+    }
+    gralloc_module = (struct gralloc_module_t *)module;
+    err = gralloc_open(module, &allocdev);
+    if (err || !allocdev) {
+        printf("%s failed to open alloc device\n", __PRETTY_FUNCTION__);
+        assert(false);
+    }
+    err = allocdev->alloc(allocdev,
+            width,
+            height,
+            fourcc2PixelFormat(fourcc),
             GRALLOC_USAGE_HW_RENDER,
             &handle,
             &stride);
     if (err) {
         gralloc_close(allocdev);
         printf("%s failed to allocate surface %d, %dx%d, pixelformat %x\n", __PRETTY_FUNCTION__, err,
-            width, height, pixel_format);
+            width, height, fourcc2PixelFormat(fourcc));
         assert(false);
     }
     target.type = RenderTarget::ANDROID_GRALLOC;
     target.handle = (int)handle;
-    target.width = width;
-    target.height = height;
-    target.pixel_format = pixel_format;
+    switch(fourcc) {
+    case VA_FOURCC_NV12:
+    case VA_FOURCC_YUY2:
+    case VA_FOURCC_UYVY:
+    case VA_FOURCC_422H:
+    case VA_FOURCC_422V:
+    case VA_FOURCC_IMC3:
+    case VA_FOURCC_444P:
+    case VA_FOURCC_411P:
+    case VA_FOURCC('4','0','0','P'):
+        target.width = aligned_width(width, SURF_TILING_Y);
+        target.height = aligned_height(height, SURF_TILING_Y);
+        break;
+    default:
+        target.width = aligned_width(width, SURF_TILING_NONE);
+        target.height = aligned_height(height, SURF_TILING_NONE);
+        break;
+    }
+
+    target.format = fourcc2VaFormat(fourcc);
+    target.pixel_format = fourcc;
     target.rect.x = target.rect.y = 0;
-    target.rect.width = target.width;
-    target.rect.height = target.height;
+    target.rect.width = width;
+    target.rect.height = height;
     target.stride = stride * bpp;
+    gralloc_close(allocdev);
     return target;
 }
 
-void deinit_render_target(RenderTarget &target)
+/**
+ * Initializes target as a USER_PTR render target backed by a newly allocated,
+ * 0x1000-aligned heap buffer sized for a linear width x height image.
+ * target.stride is not set here. The buffer pointer is stored in
+ * target.handle and is released by deinit_render_target().
+ * @param target  render target to fill in; also the return value
+ * @param width   image width in pixels (height likewise)
+ * @param fourcc  VA fourcc selecting the bytes-per-pixel budget
+ */
+RenderTarget& init_render_target_userptr(RenderTarget &target, int width, int height, uint32_t fourcc)
 {
-    buffer_handle_t handle = (buffer_handle_t)target.handle;
+    // Size arithmetic is done in size_t so large images cannot overflow int.
+    const size_t pixels = (size_t)width * (size_t)height;
+    size_t mallocsize;
+    void * userptr = NULL;
+    target.type = RenderTarget::USER_PTR;
+
+    // all linear, no alignment
+    switch(fourcc) {
+    case VA_FOURCC_NV12:
+    case VA_FOURCC_411R:
+        mallocsize = pixels * 3 / 2;
+        break;
+    case VA_FOURCC_YUY2:
+    case VA_FOURCC_UYVY:
+    case VA_FOURCC_422V:
+    case VA_FOURCC_IMC3:
+        mallocsize = pixels * 2;
+        break;
+    case VA_FOURCC('4','0','0','P'):
+        mallocsize = pixels;
+        break;
+    case VA_FOURCC_RGBA:
+    case VA_FOURCC_BGRA:
+        mallocsize = pixels * 4;
+        break;
+    case VA_FOURCC_422H:
+    case VA_FOURCC_444P:
+    case VA_FOURCC_411P:
+    default:
+        mallocsize = pixels * 3;
+        break;
+    }
+    userptr = memalign(0x1000, mallocsize);
+    if (!userptr) {
+        printf("%s failed to allocate %u bytes\n", __PRETTY_FUNCTION__, (unsigned)mallocsize);
+        assert(false);
+    }
+    target.width = width;
+    target.height = height;
+    target.format = fourcc2VaFormat(fourcc);
+    target.pixel_format = fourcc;
+    target.rect.x = target.rect.y = 0;
+    target.rect.width = target.width;
+    target.rect.height = target.height;
+    // NOTE(review): handle is an int, so this only round-trips on 32-bit builds.
+    target.handle = (int)userptr;
+    //target.stride = stride * bpp;
+    return target;
+}
+
+/**
+ * Initializes target as an INTERNAL_BUF render target with a generated handle.
+ * The listed YUV fourccs get SURF_TILING_Y-aligned dimensions; every other
+ * fourcc uses SURF_TILING_NONE alignment. target.stride is not set here.
+ */
+RenderTarget& init_render_target(RenderTarget &target, int width, int height, uint32_t fourcc)
+{
+    target.type = RenderTarget::INTERNAL_BUF;
+    target.handle = generateHandle();
+    switch(fourcc) {
+    case VA_FOURCC_NV12:
+    case VA_FOURCC_YUY2:
+    case VA_FOURCC_UYVY:
+    case VA_FOURCC_422H:
+    case VA_FOURCC_422V:
+    case VA_FOURCC_IMC3:
+    case VA_FOURCC_444P:
+    case VA_FOURCC_411P:
+    case VA_FOURCC('4','0','0','P'):
+        target.width = aligned_width(width, SURF_TILING_Y);
+        target.height = aligned_height(height, SURF_TILING_Y);
+        break;
+    default:
+        target.width = aligned_width(width, SURF_TILING_NONE);
+        target.height = aligned_height(height, SURF_TILING_NONE);
+        break;
+    }
+    target.format = fourcc2VaFormat(fourcc);
+    target.pixel_format = fourcc;
+    target.rect.x = target.rect.y = 0;
+    target.rect.width = target.width;
+    target.rect.height = target.height;
+    return target;
+}
+
+void deinit_render_target(RenderTarget &target, buffer_handle_t *handle = NULL)
+{
     hw_module_t const* module = NULL;
     alloc_device_t *allocdev = NULL;
     struct gralloc_module_t *gralloc_module = NULL;
@@ -73,24 +252,66 @@
         printf("%s failed to get gralloc module\n", __PRETTY_FUNCTION__);
         return;
     }
-    allocdev->free(allocdev, handle);
+    if (handle && target.type == RenderTarget::KERNEL_DRM)
+        allocdev->free(allocdev, *handle);
+    else if (target.type == RenderTarget::ANDROID_GRALLOC)
+        allocdev->free(allocdev, (buffer_handle_t)target.handle);
+    else if (target.type == RenderTarget::USER_PTR)
+        free((void*)target.handle);
     gralloc_close(allocdev);
 }
 
-void decode_blit_functionality_test()
+void decode_blit_functionality_test(RenderTarget::bufType type, uint32_t format, int scale_factor)
 {
     JpegDecodeStatus st;
+    VAStatus vast;
     JpegInfo jpginfo;
     hw_module_t const* module = NULL;
     alloc_device_t *allocdev = NULL;
     struct gralloc_module_t *gralloc_module = NULL;
-    buffer_handle_t handle;
-    JpegDecoder decoder;
-    JpegBlitter blitter;
-    blitter.setDecoder(decoder);
-    RenderTarget targets[5];
-    RenderTarget *dec_target, *blit_nv12_target, *blit_rgba_target, *blit_yuy2_target, *blit_yv12_target;
-    FILE* fp = fopen(JPGFILE, "rb");
+    VAStatus vret;
+    char decdumpfile[100];
+    char origdecdumpfile[100];
+    char nv12dumpfile[100];
+    char nv21dumpfile[100];
+    char yuy2dumpfile[100];
+    char yv12dumpfile[100];
+    char rgbadumpfile[100];
+    FILE* fpdump = NULL;
+    memset(&jpginfo, 0, sizeof(JpegInfo));
+    memset(decdumpfile, 0, sizeof(decdumpfile));
+    memset(origdecdumpfile, 0, sizeof(origdecdumpfile));
+    memset(nv12dumpfile, 0, sizeof(nv12dumpfile));
+    memset(nv21dumpfile, 0, sizeof(nv21dumpfile));
+    memset(yuy2dumpfile, 0, sizeof(yuy2dumpfile));
+    memset(yv12dumpfile, 0, sizeof(yv12dumpfile));
+    memset(rgbadumpfile, 0, sizeof(rgbadumpfile));
+    VADisplay display = NULL;
+    VAConfigID vpCfgId = VA_INVALID_ID;
+    VAContextID vpCtxId = VA_INVALID_ID;
+    typedef uint32_t Display;
+    Display dpy;
+    int va_major_version, va_minor_version;
+    VAConfigAttrib  vpp_attrib;
+    display = vaGetDisplay(&dpy);
+    vast = vaInitialize(display, &va_major_version, &va_minor_version);
+    assert(vast == VA_STATUS_SUCCESS);
+    vpp_attrib.type  = VAConfigAttribRTFormat;
+    vpp_attrib.value = VA_RT_FORMAT_YUV420;
+    vast = vaCreateConfig(display, VAProfileNone,
+                                VAEntrypointVideoProc,
+                                &vpp_attrib,
+                                1, &vpCfgId);
+    assert(vast == VA_STATUS_SUCCESS);
+    vast = vaCreateContext(display, vpCfgId, 1920, 1080, 0, NULL, 0, &vpCtxId);
+    assert(vast == VA_STATUS_SUCCESS);
+    JpegDecoder decoder(display, vpCfgId, vpCtxId, true);
+
+    RenderTarget dec_target;
+    buffer_handle_t dec_handle, nv12_handle, yuy2_handle;
+    uint8_t *nv12_mem, *yuy2_mem, *nv21_mem, *yv12_mem, *rgba_mem;
+    int stride;
+    FILE* fp = fopen(jpgfile, "rb");
     assert(fp);
     fseek(fp, 0, SEEK_END);
     jpginfo.bufsize = ftell(fp);
@@ -100,329 +321,325 @@
     fclose(fp);
 
     printf("finished loading src file: size %u\n", jpginfo.bufsize);
+    jpginfo.need_header_only = false;
+    jpginfo.use_vector_input = false;
     st = decoder.parse(jpginfo);
     assert(st == JD_SUCCESS);
+    printf("parse succeeded: %ux%u\n", jpginfo.image_width, jpginfo.image_height);
 
-    init_render_target(targets[0], jpginfo.image_width, jpginfo.image_height, jpginfo.image_pixel_format);
-    init_render_target(targets[1], jpginfo.image_width, jpginfo.image_height, HAL_PIXEL_FORMAT_NV12_TILED_INTEL);
-    init_render_target(targets[2], jpginfo.image_width, jpginfo.image_height, HAL_PIXEL_FORMAT_RGBA_8888);
-    init_render_target(targets[3], jpginfo.image_width, jpginfo.image_height, HAL_PIXEL_FORMAT_YCbCr_422_I);
-    init_render_target(targets[4], jpginfo.image_width, jpginfo.image_height, HAL_PIXEL_FORMAT_YV12);
-    dec_target = &targets[0];
-    blit_nv12_target = &targets[1];
-    blit_rgba_target = &targets[2];
-    blit_yuy2_target = &targets[3];
-    blit_yv12_target = &targets[4];
-    dec_target->rect.x = blit_nv12_target->rect.x = blit_yuy2_target->rect.x = blit_rgba_target->rect.x = blit_yv12_target->rect.x = 0;
-    dec_target->rect.y = blit_nv12_target->rect.y = blit_yuy2_target->rect.y = blit_rgba_target->rect.y = blit_yv12_target->rect.y = 0;
-    dec_target->rect.width = blit_nv12_target->rect.width = blit_yuy2_target->rect.width = blit_rgba_target->rect.width = blit_yv12_target->rect.width = jpginfo.image_width;
-    dec_target->rect.height = blit_nv12_target->rect.height = blit_yuy2_target->rect.height = blit_rgba_target->rect.height = blit_yv12_target->rect.height = jpginfo.image_height;
-    RenderTarget* targetlist[5] = {dec_target, blit_nv12_target, blit_rgba_target, blit_yuy2_target, blit_yv12_target };
-    //st = decoder.init(jpginfo.image_width, jpginfo.image_height, targetlist, 5);
-    st = decoder.init(jpginfo.image_width, jpginfo.image_height, &dec_target, 1);
+    if (format == 0)
+        format = jpginfo.image_color_fourcc;
+
+    char buftypename[100];
+    switch(type) {
+    case RenderTarget::KERNEL_DRM:
+        sprintf(buftypename, "DRM");
+        init_render_target_drm(dec_target, jpginfo.image_width, jpginfo.image_height, format, &dec_handle);
+        break;
+    case RenderTarget::ANDROID_GRALLOC:
+        sprintf(buftypename, "GRALLOC");
+        init_render_target_gralloc(dec_target, jpginfo.image_width, jpginfo.image_height, format);
+        break;
+    case RenderTarget::INTERNAL_BUF:
+        sprintf(buftypename, "DRIVER");
+        init_render_target(dec_target, jpginfo.image_width, jpginfo.image_height, format);
+        break;
+    default:
+        assert(0);
+        break;
+    }
+
+    uint32_t aligned_w = aligned_width(jpginfo.image_width, SURF_TILING_Y);   // full-size surface width
+    uint32_t aligned_h = aligned_height(jpginfo.image_height, SURF_TILING_Y); // full-size surface height
+    uint32_t aligned_scaled_w = aligned_width(jpginfo.image_width / scale_factor, SURF_TILING_Y);
+    uint32_t aligned_scaled_h = aligned_height(jpginfo.image_height / scale_factor, SURF_TILING_Y); // rows use height alignment
+    int err;
+    err = hw_get_module(GRALLOC_HARDWARE_MODULE_ID, &module);
+    if (err || !module) {
+        printf("%s failed to get gralloc module\n", __PRETTY_FUNCTION__);
+        assert(false);
+    }
+    gralloc_module = (struct gralloc_module_t *)module;
+    err = gralloc_open(module, &allocdev);
+    if (err || !allocdev) {
+        printf("%s failed to open alloc device\n", __PRETTY_FUNCTION__);
+        assert(false);
+    }
+    err = allocdev->alloc(allocdev,
+            aligned_w,
+            aligned_h,
+            fourcc2PixelFormat(VA_FOURCC_NV12),
+            GRALLOC_USAGE_HW_RENDER,
+            &nv12_handle,
+            &stride);
+    if (err) {
+        printf("%s failed to allocate surface %d, %dx%d, pixelformat %x\n", __PRETTY_FUNCTION__, err,
+            aligned_w, aligned_h, fourcc2PixelFormat(VA_FOURCC_NV12));
+        assert(false);
+    }
+    err = allocdev->alloc(allocdev,
+            aligned_w,
+            aligned_h,
+            fourcc2PixelFormat(VA_FOURCC_YUY2),
+            GRALLOC_USAGE_HW_RENDER,
+            &yuy2_handle,
+            &stride);
+    if (err) {
+        printf("%s failed to allocate surface %d, %dx%d, pixelformat %x\n", __PRETTY_FUNCTION__, err,
+            aligned_w, aligned_h, fourcc2PixelFormat(VA_FOURCC_YUY2));
+        assert(false);
+    }
+
+    nv21_mem = (uint8_t*)memalign(0x1000, aligned_w * aligned_h * 3 / 2);
+    yv12_mem = (uint8_t*)memalign(0x1000, aligned_w * aligned_h * 3 / 2);
+    rgba_mem = (uint8_t*)memalign(0x1000, aligned_scaled_w * aligned_scaled_h * 4);
+    assert(nv21_mem && yv12_mem && rgba_mem);
+
+    sprintf(decdumpfile, "/sdcard/jpeg_%s_dec_%dx%d.%s", buftypename, jpginfo.image_width, jpginfo.image_height, fourcc2str(format));
+    sprintf(origdecdumpfile, "/sdcard/jpeg_%s_dec_orig_%dx%d.yuv", buftypename, aligned_w, aligned_h);
+    sprintf(nv12dumpfile, "/sdcard/jpeg_%s_out_%dx%d.nv12", buftypename, aligned_w, aligned_h);
+    sprintf(nv21dumpfile, "/sdcard/jpeg_%s_out_%dx%d.nv21", buftypename, aligned_w, aligned_h);
+    sprintf(yuy2dumpfile, "/sdcard/jpeg_%s_out_%dx%d.yuy2", buftypename, aligned_w, aligned_h);
+    sprintf(yv12dumpfile, "/sdcard/jpeg_%s_out_%dx%d.yv12", buftypename, aligned_w, aligned_h);
+    sprintf(rgbadumpfile, "/sdcard/jpeg_%s_out_%dx%d.rgba", buftypename, aligned_scaled_w, aligned_scaled_h);
+
+    RenderTarget* targetlist[1] = {&dec_target};
+    st = decoder.init(jpginfo.image_width, jpginfo.image_height, targetlist, 1);
     assert(st == JD_SUCCESS);
 
-    //jpginfo.render_target = dec_target;
-    st = decoder.decode(jpginfo, *dec_target);
+    st = decoder.decode(jpginfo, dec_target);
     printf("decode returns %d\n", st);
     assert(st == JD_SUCCESS);
 
     uint8_t *data;
     uint32_t offsets[3];
     uint32_t pitches[3];
-    JpegDecoder::MapHandle maphandle = decoder.mapData(*dec_target, (void**) &data, offsets, pitches);
-    assert (maphandle.valid);
-    FILE* fpdump = fopen("/sdcard/dec_dump.yuv", "wb");
+
+    JpegDecoder::MapHandle maphandle = decoder.mapData(dec_target, (void**) &data, offsets, pitches);
+    assert (maphandle);
+    fpdump = fopen(decdumpfile, "wb");
     assert(fpdump);
-    // Y
-    for (int i = 0; i < dec_target->height; ++i) {
-        fwrite(data + offsets[0] + i * pitches[0], 1, dec_target->width, fpdump);
+    int hs, vs, nv12, yuy2, uyvy;
+    hs = vs = nv12 = yuy2 = uyvy = 0;
+    switch(format) {
+    case VA_FOURCC_NV12:
+        nv12 = 1;
+        break;
+    case VA_FOURCC_YUY2:
+        yuy2 = 1;
+        break;
+    case VA_FOURCC_UYVY:
+        uyvy = 1;
+        break;
+    case VA_FOURCC('4','0','0','P'):
+        hs = vs = 0;
+        break;
+    case VA_FOURCC_411P:
+        hs = 4;
+        vs = 1;
+        break;
+    case VA_FOURCC_411R:
+        hs = 1;
+        vs = 4;
+        break;
+    case VA_FOURCC_IMC3:
+        hs = 2;
+        vs = 2;
+        break;
+    case VA_FOURCC_422H:
+        hs = 2;
+        vs = 1;
+        break;
+    case VA_FOURCC_422V:
+        hs = 1;
+        vs = 2;
+        break;
+    case VA_FOURCC_444P:
+        hs = vs = 1;
+        break;
+    default:
+        printf("Invalid format %x\n", format);
+        assert(false);
+        break;
     }
-    // U
-    for (int i = 0; i < dec_target->height; ++i) {
-        fwrite(data + offsets[1] + i * pitches[1], 1, dec_target->width/2, fpdump);
+    if (nv12) {
+        for (int i = 0; i < jpginfo.image_height; ++i) {
+            fwrite(data + offsets[0] + i * pitches[0], 1, jpginfo.image_width, fpdump);
+        }
+        for (int i = 0; i < jpginfo.image_height/2; ++i) {
+            fwrite(data + offsets[1] + i * pitches[1], 1, jpginfo.image_width, fpdump);
+        }
     }
-    // V
-    for (int i = 0; i < dec_target->height; ++i) {
-        fwrite(data + offsets[2] + i * pitches[2], 1, dec_target->width/2, fpdump);
+    else if (yuy2 || uyvy) {
+        for (int i = 0; i < jpginfo.image_height; ++i) {
+            fwrite(data + offsets[0] + i * pitches[0], 2, jpginfo.image_width, fpdump);
+        }
+    }
+    else { // yuv planar
+        // Y
+        for (int i = 0; i < jpginfo.image_height; ++i) {
+            fwrite(data + offsets[0] + i * pitches[0], 1, jpginfo.image_width, fpdump);
+        }
+        if (hs != 0 && vs != 0) {
+            // U
+            for (int i = 0; i < jpginfo.image_height / vs; ++i) {
+                fwrite(data + offsets[1] + i * pitches[1], 1, jpginfo.image_width/hs, fpdump);
+            }
+            // V
+            for (int i = 0; i < jpginfo.image_height / vs; ++i) {
+                fwrite(data + offsets[2] + i * pitches[2], 1, jpginfo.image_width/hs, fpdump);
+            }
+        }
     }
     fclose(fpdump);
-    printf("Dumped decoded YUV to /sdcard/dec_dump.yuv\n");
-    decoder.unmapData(*dec_target, maphandle);
+    printf("Dumped decoded YUV to %s\n", decdumpfile);
+    decoder.unmapData(dec_target, maphandle);
 
-    st = decoder.blit(*dec_target, *blit_nv12_target);
+    BlitEvent ev;
+
+    st = decoder.blitToLinearRgba(dec_target, rgba_mem, aligned_w, aligned_h, ev, scale_factor);
     assert(st == JD_SUCCESS);
 
-    maphandle = decoder.mapData(*blit_nv12_target, (void**) &data, offsets, pitches);
-    assert (maphandle.valid);
-    fpdump = fopen("/sdcard/nv12_dump.yuv", "wb");
+    decoder.syncBlit(ev);
+    fpdump = fopen(rgbadumpfile, "wb");
     assert(fpdump);
-    // Y
-    for (int i = 0; i < blit_nv12_target->height; ++i) {
-        fwrite(data + offsets[0] + i * pitches[0], 1, blit_nv12_target->width, fpdump);
-    }
-    // UV
-    for (int i = 0; i < blit_nv12_target->height/2; ++i) {
-        fwrite(data + offsets[1] + i * pitches[1], 1, blit_nv12_target->width, fpdump);
-    }
+    fwrite(rgba_mem, 4, aligned_scaled_w * aligned_scaled_h, fpdump);
     fclose(fpdump);
-    printf("Dumped converted NV12 to /sdcard/nv12_dump.yuv\n");
-    decoder.unmapData(*blit_nv12_target, maphandle);
+    printf("Dumped RGBA into %s\n", rgbadumpfile);
 
-    st = decoder.blit(*dec_target, *blit_yuy2_target);
-    assert(st == JD_SUCCESS);
-    maphandle = decoder.mapData(*blit_yuy2_target, (void**) &data, offsets, pitches);
-    assert (maphandle.valid);
-    fpdump = fopen("/sdcard/yuy2_dump.yuv", "wb");
-    assert(fpdump);
-    // YUYV
-    for (int i = 0; i < blit_yuy2_target->height; ++i) {
-        fwrite(data + offsets[0] + i * pitches[0], 2, blit_yuy2_target->width, fpdump);
-    }
-    fclose(fpdump);
-    printf("Dumped converted YUY2 to /sdcard/yuy2_dump.yuv\n");
-    decoder.unmapData(*blit_yuy2_target, maphandle);
+    // test blit_to_camera_surfaces
+    if (format == VA_FOURCC_422H) {
 
-    st = decoder.blit(*dec_target, *blit_rgba_target);
-    assert(st == JD_SUCCESS);
-    maphandle = decoder.mapData(*blit_rgba_target, (void**) &data, offsets, pitches);
-    assert (maphandle.valid);
-    fpdump = fopen("/sdcard/rgba_dump.yuv", "wb");
-    assert(fpdump);
-    // RGBA
-    for (int i = 0; i < blit_rgba_target->height; ++i) {
-        fwrite(data + offsets[0] + i * pitches[0], 4, blit_rgba_target->width, fpdump);
+        RenderTarget nv12_dst, yuy2_dst;
+        nsecs_t t1, t2;
+        init_render_target_gralloc(nv12_dst, aligned_w, aligned_h, VA_FOURCC_NV12);
+        init_render_target_gralloc(yuy2_dst, aligned_w, aligned_h, VA_FOURCC_YUY2);
+        t1 = systemTime();
+        st = decoder.blit(dec_target, nv12_dst, 1);
+        st = decoder.blit(dec_target, yuy2_dst, 1);
+        t2 = systemTime();
+        printf("422H->NV12+YUY2 VA took %.2f ms\n", (t2-t1)/1000000.0);
+        deinit_render_target(nv12_dst);
+        deinit_render_target(yuy2_dst);
+        t1 = systemTime();
+        st = decoder.blitToCameraSurfaces(dec_target, nv12_handle, yuy2_handle,
+                                          NULL, NULL,
+                                          aligned_w, aligned_h,
+                                          ev);
+        t2 = systemTime();
+        decoder.syncBlit(ev);
+        printf("422H->NV12+YUY2 CM took %.2f ms\n", (t2-t1)/1000000.0);
+        t1 = systemTime();
+        st = decoder.blitToCameraSurfaces(dec_target, nv12_handle, yuy2_handle,
+                                          nv21_mem, yv12_mem,
+                                          aligned_w, aligned_h,
+                                          ev);
+        t2 = systemTime();
+        decoder.syncBlit(ev);
+        printf("422H->NV12+YUY2+NV21+YV12 CM took %.2f ms\n", (t2-t1)/1000000.0);
+        assert(st == JD_SUCCESS);
+        fpdump = fopen(nv21dumpfile, "wb");
+        assert(fpdump);
+        fwrite(nv21_mem, 1, aligned_w * aligned_h* 3 /2, fpdump);
+        fclose(fpdump);
+        printf("Dumped NV21 into %s\n", nv21dumpfile);
+        fpdump = fopen(yv12dumpfile, "wb");
+        assert(fpdump);
+        fwrite(yv12_mem, 1, aligned_w * aligned_h * 3 / 2, fpdump);
+        fclose(fpdump);
+        printf("Dumped YV12 into %s\n", yv12dumpfile);
+        gralloc_module->lock(gralloc_module, nv12_handle, GRALLOC_USAGE_SW_READ_OFTEN, 0, 0, aligned_w, aligned_h, (void**)&nv12_mem);
+        fpdump = fopen(nv12dumpfile, "wb");
+        assert(fpdump);
+        fwrite(nv12_mem, 1, aligned_w * aligned_h * 3 / 2, fpdump);
+        fclose(fpdump);
+        gralloc_module->unlock(gralloc_module, nv12_handle);
+        printf("Dumped NV12 into %s\n", nv12dumpfile);
+        gralloc_module->lock(gralloc_module, yuy2_handle, GRALLOC_USAGE_SW_READ_OFTEN, 0, 0, aligned_w, aligned_h, (void**)&yuy2_mem);
+        fpdump = fopen(yuy2dumpfile, "wb");
+        assert(fpdump);
+        fwrite(yuy2_mem, 2, aligned_w * aligned_h, fpdump);
+        fclose(fpdump);
+        gralloc_module->unlock(gralloc_module, yuy2_handle);
+        printf("Dumped YUY2 into %s\n", yuy2dumpfile);
     }
-    fclose(fpdump);
-    printf("Dumped converted RGBA to /sdcard/rgba_dump.yuv\n");
-    decoder.unmapData(*blit_rgba_target, maphandle);
-
-    st = decoder.blit(*dec_target, *blit_yv12_target);
-    assert(st == JD_SUCCESS);
-    maphandle = decoder.mapData(*blit_yv12_target, (void**) &data, offsets, pitches);
-    assert (maphandle.valid);
-    fpdump = fopen("/sdcard/yv12_dump.yuv", "wb");
-    assert(fpdump);
-    // YV12
-    for (int i = 0; i < blit_yv12_target->height; ++i) {
-        fwrite(data + offsets[0] + i * pitches[0], 1, blit_yv12_target->width, fpdump);
-    }
-    for (int i = 0; i < blit_yv12_target->height/2; ++i) {
-        fwrite(data + offsets[1] + i * pitches[1], 1, blit_yv12_target->width/2, fpdump);
-    }
-    for (int i = 0; i < blit_yv12_target->height/2; ++i) {
-        fwrite(data + offsets[2] + i * pitches[2], 1, blit_yv12_target->width/2, fpdump);
-    }
-    fclose(fpdump);
-    printf("Dumped converted YV12 to /sdcard/yv12_dump.yuv\n");
-    decoder.unmapData(*blit_yv12_target, maphandle);
-
 
     decoder.deinit();
 
-    deinit_render_target(*dec_target);
-    deinit_render_target(*blit_nv12_target);
-    deinit_render_target(*blit_yuy2_target);
-    deinit_render_target(*blit_rgba_target);
-    deinit_render_target(*blit_yv12_target);
+    allocdev->free(allocdev, nv12_handle);
+    allocdev->free(allocdev, yuy2_handle);
+    free(nv21_mem);
+    free(yv12_mem);
+    free(rgba_mem);
+
+    switch(type) {
+    case RenderTarget::KERNEL_DRM:
+        deinit_render_target(dec_target, &dec_handle);
+        break;
+    default:
+        deinit_render_target(dec_target);
+        break;
+    }
     delete[] jpginfo.buf;
-
-}
-
-enum target_state
-{
-    TARGET_FREE,
-    TARGET_DECODE,
-    TARGET_BLIT,
-};
-
-struct thread_param
-{
-    JpegDecoder *decoder;
-    RenderTarget *targets;
-    RenderTarget *nv12_targets;
-    RenderTarget *yuy2_targets;
-    RenderTarget *imc3_targets;
-    size_t target_count;
-    target_state *states;
-};
-
-static Mutex state_lock;
-
-void read_new_frame(JpegInfo &jpginfo)
-{
-    memset(&jpginfo, 0, sizeof(JpegInfo));
-    FILE* fp = fopen(JPGFILE, "rb");
-    assert(fp);
-    fseek(fp, 0, SEEK_END);
-    jpginfo.bufsize = ftell(fp);
-    fseek(fp, 0, SEEK_SET);
-    jpginfo.buf = new uint8_t[jpginfo.bufsize];
-    fread(jpginfo.buf, 1, jpginfo.bufsize, fp);
-    fclose(fp);
-}
-
-static bool exit_thread = false;
-
-#define VPP_DECODE_BATCH
-
-void* decode_frame_threadproc(void* data)
-{
-    thread_param *param = (thread_param*) data;
-    JpegInfo *jpginfos = new JpegInfo[param->target_count];
-    int surface_id = 0;
-    int blit_surface_id = (surface_id + param->target_count - 1) % param->target_count;
-    while(!exit_thread) {
-        printf("%s blit %d and decode %d\n", __FUNCTION__, blit_surface_id, surface_id);
-        RenderTarget& cur_target = param->targets[surface_id];
-#ifdef VPP_DECODE_BATCH
-        RenderTarget& blit_target = param->targets[blit_surface_id];
-        RenderTarget& blit_nv12_target = param->nv12_targets[blit_surface_id];
-        RenderTarget& blit_yuy2_target = param->yuy2_targets[blit_surface_id];
-        if (param->states[blit_surface_id] == TARGET_BLIT) {
-            printf("%s blit with surface %d\n", __FUNCTION__, blit_surface_id);
-            nsecs_t t1 = systemTime();
-            if (param->decoder->busy(blit_target)) {
-                param->decoder->sync(blit_target);
-                nsecs_t t2 = systemTime();
-                printf("%s wait surface %d decode took %f ms\n", __FUNCTION__, blit_surface_id, ns2us(t2 - t1)/1000.0);
-                param->states[blit_surface_id] = TARGET_FREE;
-            }
-            t1 = systemTime();
-            param->decoder->blit(blit_target, blit_nv12_target);
-            nsecs_t t2 = systemTime();
-            param->decoder->blit(blit_target, blit_yuy2_target);
-            nsecs_t t3 = systemTime();
-            printf("%s blit %d NV12 took %f ms, YUY2 took %f ms\n",
-                __FUNCTION__,
-                blit_surface_id, ns2us(t2 - t1)/1000.0,
-                ns2us(t3 - t2)/1000.0);
-            param->states[blit_surface_id] = TARGET_FREE;
-        }
-#endif
-        if (param->states[surface_id] != TARGET_FREE) {
-            printf("%s wait surface %d blit finish\n", __FUNCTION__, surface_id);
-            nsecs_t t1 = systemTime();
-            while (param->states[surface_id] != TARGET_FREE) {
-                usleep(1000);
-            }
-            nsecs_t t2 = systemTime();
-            printf("%s wait surface %d for decode/blit finish took %f ms\n", __FUNCTION__, surface_id, ns2us(t2 - t1)/1000.0);
-        }
-        JpegInfo &jpginfo = jpginfos[surface_id];
-        read_new_frame(jpginfo);
-        nsecs_t t3 = systemTime();
-        param->decoder->parse(jpginfo);
-        nsecs_t t4 = systemTime();
-        printf("%s parse surface %d took %f ms\n", __FUNCTION__, surface_id, ns2us(t4 - t3)/1000.0);
-        param->states[surface_id] = TARGET_DECODE;
-        param->decoder->decode(jpginfo, cur_target);
-        nsecs_t t5 = systemTime();
-        printf("%s decode surface %d took %f ms\n", __FUNCTION__, surface_id, ns2us(t5 - t4)/1000.0);
-        param->states[surface_id] = TARGET_BLIT;
-        surface_id  = (surface_id + 1) % param->target_count;
-        blit_surface_id  = (blit_surface_id + 1) % param->target_count;
-    }
-    delete[] jpginfos;
-    return NULL;
-}
-
-void* blit_frame_threadproc(void* data)
-{
-    thread_param *param = (thread_param*) data;
-    int surface_id = 0;
-    while(!exit_thread) {
-        printf("%s blit %d->%d\n", __FUNCTION__, surface_id, surface_id);
-        RenderTarget& dec_target = param->targets[surface_id];
-        RenderTarget& blit_target = param->nv12_targets[surface_id];
-        if (param->states[surface_id] != TARGET_BLIT) {
-            printf("%s wait surface %d decoding finish\n", __FUNCTION__, surface_id);
-            nsecs_t t1 = systemTime();
-            while (param->states[surface_id] != TARGET_BLIT) {
-                usleep(100);
-            }
-            nsecs_t t2 = systemTime();
-            printf("%s wait surface %d for decode finish took %f ms\n", __FUNCTION__, surface_id, ns2us(t2 - t1)/1000.0);
-        }
-        nsecs_t t3 = systemTime();
-        param->decoder->blit(dec_target, blit_target);
-        nsecs_t t4 = systemTime();
-        printf("%s blit surface %d took %f ms\n", __FUNCTION__, surface_id, ns2us(t4 - t3)/1000.0);
-        param->states[surface_id] = TARGET_FREE;
-        surface_id  = (surface_id + 1) % param->target_count;
-    }
-    return NULL;
-}
-
-void parallel_decode_blit_test()
-{
-    RenderTarget **all_targets = new RenderTarget*[12];
-    RenderTarget dec_targets[12];
-    RenderTarget nv12_targets[12];
-    RenderTarget yuy2_targets[12];
-    RenderTarget imc3_targets[12];
-    JpegInfo jpginfos[12];
-    target_state states[12];
-    for (int i = 0; i < 12; ++i) {
-        init_render_target(dec_targets[i], 1280, 720, fourcc2PixelFormat(VA_FOURCC_422H)); // 422H
-        init_render_target(nv12_targets[i], 1280, 720, fourcc2PixelFormat(VA_FOURCC_NV12)); // NV12 for video encode
-        init_render_target(yuy2_targets[i], 1280, 720, fourcc2PixelFormat(VA_FOURCC_YUY2)); // YUY2 for overlay
-        //init_render_target(imc3_targets[i], 1280, 720, HAL_PIXEL_FORMAT_IMC3); // IMC3 for libjpeg encode
-        jpginfos[i].buf = new uint8_t[2 * 1024 * 1024];
-        all_targets[i] = &dec_targets[i];
-        //all_targets[i + 12] = &nv12_targets[i];
-        //all_targets[i + 24] = &yuy2_targets[i];
-        //all_targets[i + 36] = &imc3_targets[i];
-        states[i] = TARGET_FREE;
-    }
-
-    exit_thread = false;
-
-    pthread_attr_t dec_attr, blit_attr;
-    pthread_attr_init(&dec_attr);
-    pthread_attr_init(&blit_attr);
-    pthread_attr_setdetachstate(&dec_attr, PTHREAD_CREATE_JOINABLE);
-    pthread_attr_setdetachstate(&blit_attr, PTHREAD_CREATE_JOINABLE);
-    pthread_t dec_thread, blit_thread;
-    thread_param param;
-    param.nv12_targets = nv12_targets;
-    param.yuy2_targets = yuy2_targets;
-    param.imc3_targets = imc3_targets;
-    param.targets = dec_targets;
-    param.target_count = 12;
-    param.decoder = new JpegDecoder();
-    //param.decoder->init(1280, 720, all_targets, 36);
-    param.decoder->init(1280, 720, all_targets, 12);
-    param.states = states;
-    pthread_create(&dec_thread, &dec_attr, decode_frame_threadproc, (void*)&param);
-#ifndef VPP_DECODE_BATCH
-    pthread_create(&blit_thread, &blit_attr, blit_frame_threadproc, (void*)&param);
-#endif
-    pthread_attr_destroy(&blit_attr);
-    pthread_attr_destroy(&dec_attr);
-
-    // test for 1 minute
-    usleep(60 * 1000 * 1000);
-    exit_thread = true;
-    void *dummy;
-    pthread_join(dec_thread, &dummy);
-#ifndef VPP_DECODE_BATCH
-    pthread_join(blit_thread, &dummy);
-#endif
-
-    for (int i = 0; i < 12; ++i) {
-        delete[] jpginfos[i].buf;
-        deinit_render_target(dec_targets[i]);
-        deinit_render_target(nv12_targets[i]);
-        deinit_render_target(yuy2_targets[i]);
-        //deinit_render_target(imc3_targets[i]);
-    }
-    delete[] all_targets;
+    gralloc_close(allocdev);
+    vaDestroyContext(display, vpCtxId);
+    vaDestroyConfig(display, vpCfgId);
+    vaTerminate(display);
 }
 
 int main(int argc, char ** argv)
 {
-    //decode_blit_functionality_test();
-    parallel_decode_blit_test();
+    int res, scale;
+    uint32_t format = 0;  // 0 = YUV planar decode output (the default, per the usage text below)
+    scale = 1;  // default scaling factor: no down-scale
+    memset(jpgfile, 0, sizeof(jpgfile));  // zero-fill so the bounded copy of -i below always stays NUL-terminated
+    while ((res = getopt(argc, argv, "i:f:s:")) >= 0) {
+        switch (res) {
+            case 'i':
+                {
+                    strncpy(jpgfile, optarg, sizeof(jpgfile) - 1);  // bounded: an over-long path is truncated instead of overflowing jpgfile
+                    break;
+                }
+            case 's':
+                {
+                    scale = atoi(optarg);
+                    break;
+                }
+            case 'f':
+                {
+                    if (strcmp(optarg, "NV12") == 0) {
+                        format = VA_FOURCC_NV12;
+                    }
+                    else if (strcmp(optarg, "YUY2") == 0) {
+                        format = VA_FOURCC_YUY2;
+                    }
+                    else if (strcmp(optarg, "UYVY") == 0) {
+                        format = VA_FOURCC_UYVY;
+                    }
+                    else {
+                        format = 0;
+                        printf("INVALID output decode format, using YUV planar\n");
+                    }
+                    break;
+                }
+            default:
+                printf("usage: testjpegdec -i <filename> [-f <decode output format FOURCC>] [-s <scaling_factor>]\n");
+                exit(-1);
+        }
+    }
+    if (strcmp(jpgfile, "") == 0) {  // -i is mandatory: no input file means nothing to decode
+        printf("usage: testjpegdec -i <filename> [-f <decode output format FOURCC>] [-s <scaling_factor>]\n");
+        printf("       available output FOURCC: NV12, YUY2, UYVY, 0. 0 by default (YUV planar)\n");
+        printf("       available scaling_factor: 1, 2, 4, 8. 1 by default (no down-scale)\n");
+        exit(-1);
+    }
+    printf("----- DRM surface type test -----\n");
+    //decode_blit_functionality_test(RenderTarget::KERNEL_DRM, 0);
+    printf("----- GRALLOC surface type test -----\n");
+    //decode_blit_functionality_test(RenderTarget::ANDROID_GRALLOC, 0);
+    printf("----- Normal surface type test, scale %d-----\n", scale);
+    decode_blit_functionality_test(RenderTarget::INTERNAL_BUF, format, scale);
+    printf("----- Userptr surface type test -----\n");
+    //decode_blit_functionality_test(RenderTarget::USER_PTR, format);
     return 0;
 }