DO NOT MERGE Sync libwebp ver 0.2.0

Change-Id: Ibbd20e72d28122a9c972621a9752673a28c95d5c
diff --git a/ChangeLog b/ChangeLog
index fe7d3a1..1fcff1a 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -2,3 +2,4 @@
 - 5/11: release version 0.1.2
 - 6/11: Added encoder (version 0.1.2) as well
 - 7/11: Updated WebP with head change#Ia53f845b
+- 8/12: release version 0.2.0 (head change#I3da2063b)
diff --git a/NEWS b/NEWS
index afa1349..9232f5c 100644
--- a/NEWS
+++ b/NEWS
@@ -1 +1,2 @@
 - 9/10: initial release version 0.1
+- 8/12: release version 0.2.0 (head change#I3da2063b).
diff --git a/README b/README
index 22db086..d3f4062 100644
--- a/README
+++ b/README
@@ -1,43 +1,103 @@
           __   __  ____  ____  ____
          /  \\/  \/  _ \/  _ )/  _ \
          \       /   __/  _  \   __/
-          \__\__/\____/\_____/__/ _________   ____ ____
-               \    \ /  _ \/ _/ /    \    \ /  _ \  _ \
-               /   \ \   __/  \_/   / /   \ \   __/    /_
-               \_____/_____/____/____/\_____/_____/_/\__/v0.1
-             
+          \__\__/\____/\_____/__/ ____  ___
+                / _/ /    \    \ /  _ \/ _/
+               /  \_/   / /   \ \   __/  \__
+               \____/____/\_____/_____/____/v0.2.0
+
 Description:
 ============
 
-WEBP decoder: libwebpdecode.so is a simple library for
-decoding WEBP image files. 
+WebP codec: library to encode and decode images in WebP format. This package
+contains the library that can be used in other programs to add WebP support,
+as well as the command line tools 'cwebp' and 'dwebp'.
 
-See http://code.google.com/speed/webp
+See http://developers.google.com/speed/webp
 
+Latest sources are available from http://www.webmproject.org/code/
 
 It is released under the same license as the WebM project.
 See http://www.webmproject.org/license/software/ or the
 file "COPYING" file for details. An additional intellectual
 property rights grant can be found in the file PATENTS.
 
-
-API:
-====
-
-This is mainly just one function to call, so just have a look at
-the file src/webp/decode.h for the details and variants:
-
-#include "webp/decode.h"
-uint8_t* WebPDecodeRGB(const uint8_t* data, uint32_t data_size,
-                       int *width, int *height);
-
-A lower-level API is available from the header file <webp/decode_vp8.h>
-
-
 Building:
 =========
 
-If everything goes right, then:
+Windows build (without experimental features):
+----------------------------------------------
+
+By running:
+
+  nmake /f Makefile.vc CFG=release-static RTLIBCFG=static OBJDIR=output
+
+the directory output\release-static\(x64|x86)\bin will contain the tools
+cwebp.exe and dwebp.exe. The directory output\release-static\(x64|x86)\lib will
+contain the libwebp static library.
+The target architecture (x86/x64) is detected by Makefile.vc from the Visual
+Studio compiler (cl.exe) available in the system path.
+
+Windows build (with experimental features):
+-------------------------------------------
+
+This release requires the zlib library. This library is not common under
+Windows nor can it be replaced with calls to the Windows API, so you will need
+to download it or build it yourself.
+
+You can either:
+(a) Link zlib dynamically (as a DLL).
+(b) Link zlib statically.
+
+Linking it dynamically is easier to do (as you can download a precompiled DLL),
+but a bit more cumbersome to use - you need to keep zlib1.dll in the same
+directory as the created EXEs.
+
+a. With dynamic zlib
+~~~~~~~~~~~~~~~~~~~~
+1. Download and unpack the archive from http://zlib.net/zlib125-dll.zip.
+2. Compile libwebp with the command:
+  nmake /f Makefile.vc CFG=release-static RTLIBCFG=static OBJDIR=output \
+      ZDLLDIR=C:\path\to\the\unpacked\archive EXPERIMENTAL=y
+3. Copy C:\path\to\the\unpacked\archive\zlib1.dll to
+   output\release-static\x86\bin for binaries there to work.
+Note: only a 32-bit DLL is currently available for download, so for a 64-bit
+build, you will need to build the DLL yourself.
+
+b. With static zlib
+~~~~~~~~~~~~~~~~~~~
+
+1. Download and unpack the source code from http://zlib.net/zlib125.zip.
+2. For a 32-bit build of zlib compatible with libwebp:
+  a. Edit zlib's win32\Makefile.msc, adding "-SAFESEH" to ASFLAGS.
+  b. Compile zlib with the command:
+    nmake /f win32/Makefile.msc LOC="-DASMV -DASMINF -MT" \
+        OBJA="inffas32.obj match686.obj"
+3. Compile libwebp with the command:
+  nmake /f Makefile.vc CFG=release-static RTLIBCFG=static OBJDIR=output \
+      ZLIBDIR=C:\path\to\the\unpacked\archive EXPERIMENTAL=y
+4. Use the binaries in output\release-static\x86\bin.
+Note: a 64-bit build of zlib requires different modifications of
+Makefile.msc.
+
+
+Unix build using makefile.unix:
+-------------------------------
+
+On platforms with GNU tools installed (gcc and make), running
+
+  make -f makefile.unix
+
+will build the binaries examples/cwebp and examples/dwebp, along
+with the static library src/libwebp.a. No system-wide installation
+is supplied, as this is a simple alternative to the full installation
+system based on the autoconf tools (see below).
+Please refer to the makefile.unix for additional details and customizations.
+
+Using autoconf tools:
+---------------------
+When building from git sources, you will need to run autogen.sh to generate the
+configure script.
 
 ./configure
 make
@@ -46,19 +106,423 @@
 should be all you need to have the following files
 
 /usr/local/include/webp/decode.h
-/usr/local/include/webp/decode_vp8.h
-/usr/local/lib/libwebpdecode.*
+/usr/local/include/webp/encode.h
+/usr/local/include/webp/types.h
+/usr/local/lib/libwebp.*
+/usr/local/bin/cwebp
+/usr/local/bin/dwebp
 
 installed.
 
+Note: The encoding and decoding libraries are compiled separately
+(as src/dec/libwebpdecode.* and src/enc/libwebpencode.*). They
+can be installed independently using a minor modification in the
+corresponding Makefile.am configure files (see comments there).
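+
+Once installed, a program can be linked against the library with a command
+along these lines (illustrative only; depending on the system, you may also
+need -I/usr/local/include and -L/usr/local/lib):
+
+  gcc -o myapp myapp.c -lwebp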
 
-Decoding example:
-=================
+SWIG bindings:
+--------------
 
-there's a decoding example in example/dwebp.c which will take a .webp file and
-decode it to a PPM image file. This is simply to demonstrate use of the API.
-You can verify the file test.webp decodes to exactly the same as test_ref.ppm:
-  `cd examples && ./dwebp test.webp -o test.ppm && diff test.ppm test_ref.ppm`
+To generate language bindings from swig/libwebp.i, swig-1.3
+(http://www.swig.org) is required. 2.0 may work, but has not been tested.
+
+Currently the following functions are mapped:
+Decode:
+  WebPGetDecoderVersion
+  WebPGetInfo
+  WebPDecodeRGBA
+  WebPDecodeARGB
+  WebPDecodeBGRA
+  WebPDecodeBGR
+  WebPDecodeRGB
+
+Encode:
+  WebPGetEncoderVersion
+  WebPEncodeRGBA
+  WebPEncodeBGRA
+  WebPEncodeRGB
+  WebPEncodeBGR
+  WebPEncodeLosslessRGBA
+  WebPEncodeLosslessBGRA
+  WebPEncodeLosslessRGB
+  WebPEncodeLosslessBGR
+
+Java bindings:
+
+To build the swig-generated JNI wrapper code, at least JDK-1.5 (or equivalent)
+is necessary for enum support. The output is intended to be a shared object /
+DLL that can be loaded via System.loadLibrary("webp_jni").
+
+Encoding tool:
+==============
+
+The examples/ directory contains tools for encoding (cwebp) and
+decoding (dwebp) images.
+
+The easiest use should look like:
+  cwebp input.png -q 80 -o output.webp
+which will convert the input file to a WebP file using a quality factor of 80
+on a 0->100 scale (0 being the lowest quality, 100 being the best; the
+default value is 75).
+You might want to try the -lossless flag too, which will compress the source
+(in RGBA format) without any loss. The -q quality parameter will in this case
+control the amount of processing time spent trying to make the output file as
+small as possible.
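+
+For instance (an illustrative command with hypothetical file names):
+  cwebp -lossless input.png -o output_lossless.webp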
+
+A longer list of options is available using the -longhelp command line flag:
+
+> cwebp -longhelp
+Usage:
+ cwebp [-preset <...>] [options] in_file [-o out_file]
+
+If input size (-s) for an image is not specified, it is assumed to be a PNG,
+JPEG or TIFF file.
+options:
+  -h / -help  ............ short help
+  -H / -longhelp  ........ long help
+  -q <float> ............. quality factor (0:small..100:big)
+  -alpha_q <int> ......... Transparency-compression quality (0..100).
+  -preset <string> ....... Preset setting, one of:
+                            default, photo, picture,
+                            drawing, icon, text
+     -preset must come first, as it overwrites other parameters.
+  -m <int> ............... compression method (0=fast, 6=slowest)
+  -segments <int> ........ number of segments to use (1..4)
+  -size <int> ............ Target size (in bytes)
+  -psnr <float> .......... Target PSNR (in dB. typically: 42)
+
+  -s <int> <int> ......... Input size (width x height) for YUV
+  -sns <int> ............. Spatial Noise Shaping (0:off, 100:max)
+  -f <int> ............... filter strength (0=off..100)
+  -sharpness <int> ....... filter sharpness (0:most .. 7:least sharp)
+  -strong ................ use strong filter instead of simple.
+  -partition_limit <int> . limit quality to fit the 512k limit on
+                           the first partition (0=no degradation ... 100=full)
+  -pass <int> ............ analysis pass number (1..10)
+  -crop <x> <y> <w> <h> .. crop picture with the given rectangle
+  -resize <w> <h> ........ resize picture (after any cropping)
+  -map <int> ............. print map of extra info.
+  -print_psnr ............ prints averaged PSNR distortion.
+  -print_ssim ............ prints averaged SSIM distortion.
+  -print_lsim ............ prints local-similarity distortion.
+  -d <file.pgm> .......... dump the compressed output (PGM file).
+  -alpha_method <int> .... Transparency-compression method (0..1)
+  -alpha_filter <string> . predictive filtering for alpha plane.
+                           One of: none, fast (default) or best.
+  -alpha_cleanup ......... Clean RGB values in transparent area.
+  -noalpha ............... discard any transparency information.
+  -lossless .............. Encode image losslessly.
+  -hint <string> ......... Specify image characteristics hint.
+                           One of: photo, picture or graph
+
+  -short ................. condense printed message
+  -quiet ................. don't print anything.
+  -version ............... print version number and exit.
+  -noasm ................. disable all assembly optimizations.
+  -v ..................... verbose, e.g. print encoding/decoding times
+  -progress .............. report encoding progress
+
+Experimental Options:
+  -af .................... auto-adjust filter strength.
+  -pre <int> ............. pre-processing filter
+
+
+The main options you might want to try in order to further tune the
+visual quality are:
+ -preset
+ -sns
+ -f
+ -m
+
+Namely:
+  * 'preset' will set up a default encoding configuration targeting a
+     particular type of input. It should appear first in the list of options,
+     so that subsequent options can take effect on top of this preset.
+     Default value is 'default'.
+  * 'sns' will progressively turn on (when going from 0 to 100) some additional
+     visual optimizations (like: segmentation map re-enforcement). This option
+     will balance the bit allocation differently. It tries to take bits from the
+     "easy" parts of the picture and use them in the "difficult" ones instead.
+     Usually, raising the sns value (at fixed -q value) leads to larger files,
+     but with better quality.
+     Typical value is around '75'.
+  * 'f' option directly links to the filtering strength used by the codec's
+     in-loop processing. The higher the value, the smoother the
+     highly-compressed area will look. This is particularly useful when aiming
+     at very small files. Typical values are around 20-30. Note that using the
+     option -strong will change the type of filtering. Use "-f 0" to turn
+     filtering off.
+  * 'm' controls the trade-off between encoding speed and quality. Default is 4.
+     You can try -m 5 or -m 6 to explore more (time-consuming) encoding
+     possibilities. A lower value will result in faster encoding at the expense
+     of quality.
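+
+As an illustration, these options can be combined on a single command line
+(the values below are just a plausible starting point, not a recommendation):
+
+  cwebp -preset photo -sns 75 -f 25 -m 5 input.jpg -o output.webp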
+
+Decoding tool:
+==============
+
+There is a decoding sample in examples/dwebp.c which will take
+a .webp file and decode it to a PNG image file (amongst other formats).
+This is simply to demonstrate the use of the API. You can verify the
+file test.webp decodes to exactly the same output as test_ref.ppm by using:
+
+ cd examples
+ ./dwebp test.webp -ppm -o test.ppm
+ diff test.ppm test_ref.ppm
+
+The full list of options is available using -h:
+
+> dwebp -h
+Usage: dwebp in_file [options] [-o out_file]
+
+Decodes the WebP image file to PNG format [Default]
+Use following options to convert into alternate image formats:
+  -pam ......... save the raw RGBA samples as a color PAM
+  -ppm ......... save the raw RGB samples as a color PPM
+  -pgm ......... save the raw YUV samples as a grayscale PGM
+                 file with IMC4 layout.
+ Other options are:
+  -version  .... print version number and exit.
+  -nofancy ..... don't use the fancy YUV420 upscaler.
+  -nofilter .... disable in-loop filtering.
+  -mt .......... use multi-threading
+  -crop <x> <y> <w> <h> ... crop output with the given rectangle
+  -scale <w> <h> .......... scale the output (*after* any cropping)
+  -alpha ....... only save the alpha plane.
+  -h     ....... this help message.
+  -v     ....... verbose (e.g. print encoding/decoding times)
+  -noasm ....... disable all assembly optimizations.
+
+Visualization tool:
+===================
+
+There's a little self-serve visualization tool called 'vwebp' under the
+examples/ directory. It uses OpenGL to open a simple drawing window and show
+a decoded WebP file. It's not yet integrated in the automake or makefile.unix
+build system, but you can try to manually compile it using the recommendations
+at the top of the source file.
+
+Usage: 'vwebp my_picture.webp'
+
+
+Encoding API:
+=============
+
+The main encoding functions are available in the header src/webp/encode.h
+The ready-to-use ones are:
+size_t WebPEncodeRGB(const uint8_t* rgb, int width, int height, int stride,
+                     float quality_factor, uint8_t** output);
+size_t WebPEncodeBGR(const uint8_t* bgr, int width, int height, int stride,
+                     float quality_factor, uint8_t** output);
+size_t WebPEncodeRGBA(const uint8_t* rgba, int width, int height, int stride,
+                      float quality_factor, uint8_t** output);
+size_t WebPEncodeBGRA(const uint8_t* bgra, int width, int height, int stride,
+                      float quality_factor, uint8_t** output);
+
+They will convert raw RGB samples to WebP data. The only control supplied
+is the quality factor.
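+
+As an illustrative sketch (the 'rgb' buffer, its dimensions and the error
+handling are assumptions of this example, not part of the API):
+
+  #include <stdlib.h>      // for free()
+  #include "webp/encode.h"
+
+  // 'rgb' points to width x height packed RGB triplets, 'stride' bytes/row.
+  uint8_t* output = NULL;
+  const size_t output_size =
+      WebPEncodeRGB(rgb, width, height, stride, 80.f, &output);
+  if (output_size == 0) {
+    // ... encoding error ...
+  } else {
+    // ... write 'output_size' bytes from 'output' to a file ...
+  }
+  free(output);   // the compressed data must be released by the caller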
+
+There are some variants for using the lossless format:
+
+size_t WebPEncodeLosslessRGB(const uint8_t* rgb, int width, int height,
+                             int stride, uint8_t** output);
+size_t WebPEncodeLosslessBGR(const uint8_t* bgr, int width, int height,
+                             int stride, uint8_t** output);
+size_t WebPEncodeLosslessRGBA(const uint8_t* rgba, int width, int height,
+                              int stride, uint8_t** output);
+size_t WebPEncodeLosslessBGRA(const uint8_t* bgra, int width, int height,
+                              int stride, uint8_t** output);
+
+Of course in this case, no quality factor is needed since the compression
+occurs without loss of the input values, at the expense of larger output sizes.
+
+Advanced encoding API:
+----------------------
+
+A more advanced API is based on the WebPConfig and WebPPicture structures.
+
+WebPConfig contains the encoding settings and is not tied to a particular
+picture.
+WebPPicture contains input data, on which some WebPConfig will be used for
+compression.
+The encoding flow looks like:
+
+-------------------------------------- BEGIN PSEUDO EXAMPLE
+
+#include <webp/encode.h>
+
+  // Set up a config, starting from a preset and tuning some additional
+  // parameters.
+  WebPConfig config;
+  if (!WebPConfigPreset(&config, WEBP_PRESET_PHOTO, quality_factor)) {
+    return 0;   // version error
+  }
+  // ... additional tuning
+  config.sns_strength = 90;
+  config.filter_sharpness = 6;
+  config_error = WebPValidateConfig(&config);  // not mandatory, but useful
+
+  // Setup the input data
+  WebPPicture pic;
+  if (!WebPPictureInit(&pic)) {
+    return 0;  // version error
+  }
+  pic.width = width;
+  pic.height = height;
+  // allocate the picture of dimension width x height
+  if (!WebPPictureAllocate(&pic)) {
+    return 0;   // memory error
+  }
+  // at this point, 'pic' has been initialized as a container,
+  // and can receive the Y/U/V samples.
+  // Alternatively, one could use ready-made import functions like
+  // WebPPictureImportRGB(), which will take care of memory allocation.
+  // In any case, past this point, one will have to call
+  // WebPPictureFree(&pic) to reclaim memory.
+
+  // Set up a byte-output write method. WebPMemoryWriter, for instance,
+  // or a custom callback such as the hypothetical MyFileWriter used here.
+  WebPMemoryWriter wrt;
+  pic.writer = MyFileWriter;
+  pic.custom_ptr = my_opaque_structure_to_make_MyFileWriter_work;
+  // (when using WebPMemoryWriter instead, initialize 'wrt' here and point
+  //  pic.writer and pic.custom_ptr at it)
+
+  // Compress!
+  int ok = WebPEncode(&config, &pic);   // ok = 0 => error occurred!
+  WebPPictureFree(&pic);  // must be called independently of the 'ok' result.
+
+  // output data should have been handled by the writer at that point.
+
+-------------------------------------- END PSEUDO EXAMPLE
+
+Decoding API:
+=============
+
+This is mainly just one function to call:
+
+#include "webp/decode.h"
+uint8_t* WebPDecodeRGB(const uint8_t* data, size_t data_size,
+                       int* width, int* height);
+
+Please have a look at the file src/webp/decode.h for the details.
+There are variants for decoding in BGR/RGBA/ARGB/BGRA order, along with
+decoding to raw Y'CbCr samples. One can also decode the image directly into a
+pre-allocated buffer.
+
+To detect a WebP file and gather the picture's dimensions, the function:
+  int WebPGetInfo(const uint8_t* data, size_t data_size,
+                  int* width, int* height);
+is supplied. No decoding is involved when using it.
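+
+A minimal sketch tying these calls together (reading the file into 'data' /
+'data_size' and the error handling are left to the application):
+
+  #include <stdlib.h>      // for free()
+  #include "webp/decode.h"
+
+  int width = 0, height = 0;
+  if (!WebPGetInfo(data, data_size, &width, &height)) {
+    // ... not a valid WebP bitstream ...
+  }
+  uint8_t* const rgb = WebPDecodeRGB(data, data_size, &width, &height);
+  if (rgb == NULL) {
+    // ... decoding error ...
+  }
+  // ... use the width x height RGB samples pointed to by 'rgb' ...
+  free(rgb);   // the returned buffer must be released with free()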
+
+Incremental decoding API:
+=========================
+
+When data is being progressively transmitted, pictures can still be decoded
+incrementally using a slightly more involved API. Decoder state is stored in
+an instance of the WebPIDecoder object. This object can be created for
+decoding either RGB or Y'CbCr samples.
+For instance:
+
+  WebPDecBuffer buffer;
+  WebPInitDecBuffer(&buffer);
+  buffer.colorspace = MODE_BGR;
+  ...
+  WebPIDecoder* idec = WebPINewDecoder(&buffer);
+
+As data is made progressively available, this incremental-decoder object
+can be used to decode the picture further. There are two (mutually exclusive)
+ways to pass freshly arrived data:
+
+either by appending the fresh bytes:
+
+  WebPIAppend(idec, fresh_data, size_of_fresh_data);
+
+or by just mentioning the new size of the transmitted data:
+
+  WebPIUpdate(idec, buffer, size_of_transmitted_buffer);
+
+Note that 'buffer' can be modified between each call to WebPIUpdate, in
+particular when the buffer is resized to accommodate larger data.
+
+These functions will return the decoding status: either VP8_STATUS_SUSPENDED if
+decoding is not finished yet or VP8_STATUS_OK when decoding is done. Any other
+status is an error condition.
+
+The 'idec' object must always be released (even upon an error condition) by
+calling WebPIDelete(idec).
+
+To retrieve partially decoded picture samples, one must use the corresponding
+method: WebPIDecGetRGB or WebPIDecGetYUVA.
+These return the decoded samples and report the last displayable pixel row.
+
+Lastly, note that decoding can also be performed into a pre-allocated pixel
+buffer. This buffer must be passed when creating a WebPIDecoder, calling
+WebPINewRGB() or WebPINewYUVA().
+
+Please have a look at the src/webp/decode.h header for further details.
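+
+Putting the pieces above together, a typical incremental decoding loop could
+look like the following sketch ('has_more_data()', 'new_data' and
+'new_data_size' are assumed to come from the application's transport layer):
+
+  WebPDecBuffer buffer;
+  WebPInitDecBuffer(&buffer);
+  buffer.colorspace = MODE_RGBA;
+  WebPIDecoder* const idec = WebPINewDecoder(&buffer);
+
+  VP8StatusCode status = VP8_STATUS_SUSPENDED;
+  while (status == VP8_STATUS_SUSPENDED && has_more_data()) {
+    // ... receive 'new_data' / 'new_data_size' ...
+    status = WebPIAppend(idec, new_data, new_data_size);
+    if (status == VP8_STATUS_OK || status == VP8_STATUS_SUSPENDED) {
+      int last_y, width, height, stride;
+      uint8_t* const rgba =
+          WebPIDecGetRGB(idec, &last_y, &width, &height, &stride);
+      // ... if non-NULL, refresh rows 0..last_y of the display from 'rgba' ...
+    } else {
+      break;   // error condition
+    }
+  }
+  WebPIDelete(idec);   // must always be called to release the decoder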
+
+Advanced Decoding API:
+======================
+
+WebP decoding supports an advanced API which provides on-the-fly cropping and
+rescaling, a feature that is very useful in memory-constrained environments
+such as mobile phones. Basically, the memory usage will scale with the
+output's size, not the input's, when one only needs a quick preview or a
+zoomed-in portion of an otherwise too-large picture. Some CPU can be saved
+too, incidentally.
+
+-------------------------------------- BEGIN PSEUDO EXAMPLE
+     // A) Init a configuration object
+     WebPDecoderConfig config;
+     CHECK(WebPInitDecoderConfig(&config));
+
+     // B) optional: retrieve the bitstream's features.
+     CHECK(WebPGetFeatures(data, data_size, &config.input) == VP8_STATUS_OK);
+
+     // C) Adjust 'config' options, if needed
+     config.options.no_fancy_upsampling = 1;
+     config.options.use_scaling = 1;
+     config.options.scaled_width = scaledWidth();
+     config.options.scaled_height = scaledHeight();
+     // etc.
+
+     // D) Specify the output colorspace via the 'config' output options.
+     // Optionally, an external image decode buffer can also be specified.
+     config.output.colorspace = MODE_BGRA;
+     // Optionally, the config.output can be pointed to an external buffer as
+     // well for decoding the image. This externally supplied memory buffer
+     // should be big enough to store the decoded picture.
+     config.output.u.RGBA.rgba = (uint8_t*) memory_buffer;
+     config.output.u.RGBA.stride = scanline_stride;
+     config.output.u.RGBA.size = total_size_of_the_memory_buffer;
+     config.output.is_external_memory = 1;
+
+     // E) Decode the WebP image. There are two variants for decoding the image.
+     // The first one (E.1) decodes the full image and the second one (E.2)
+     // decodes the image incrementally using small input buffers.
+     // Either variant can be used to decode the WebP image.
+
+     // E.1) Decode full image.
+     CHECK(WebPDecode(data, data_size, &config) == VP8_STATUS_OK);
+
+     // E.2) Decode image incrementally.
+     WebPIDecoder* const idec = WebPIDecode(NULL, NULL, &config);
+     CHECK(idec != NULL);
+     while (bytes_remaining > 0) {
+       VP8StatusCode status = WebPIAppend(idec, input, bytes_read);
+       if (status == VP8_STATUS_OK || status == VP8_STATUS_SUSPENDED) {
+         bytes_remaining -= bytes_read;
+       } else {
+         break;
+       }
+     }
+     WebPIDelete(idec);
+
+     // F) Decoded image is now in config.output (and config.output.u.RGBA).
+     // It can be saved, displayed or otherwise processed.
+
+     // G) Reclaim memory allocated in config's object. It's safe to call
+     // this function even if the memory is external and wasn't allocated
+     // by WebPDecode().
+     WebPFreeDecBuffer(&config.output);
+
+-------------------------------------- END PSEUDO EXAMPLE
 
 Bugs:
 =====
@@ -72,3 +536,4 @@
 ========
 
 Email: webp-discuss@webmproject.org
+Web: http://groups.google.com/a/webmproject.org/group/webp-discuss
diff --git a/README.android b/README.android
index c467de4..96dc441 100644
--- a/README.android
+++ b/README.android
@@ -23,6 +23,7 @@
 - Fixed the Endian'ness bug for Color-Configs (RGB_565 & ARGB_4444).
   The fix is similar to jpeglib handling for JCS_RGB_565 & JCS_RGBA_8888
   color configs. Added the code under "ANDROID_WEBP_RGB" flag.
+- Sync-patch with libwebp ver 0.2.0 (head change#I3da2063b).
 
 The Android.mk file creates WebP Decoder and Encoder static libraries which
 can be added to any application by Adding to LOCAL_STATIC_LIBRARIES
diff --git a/include/webp/decode.h b/include/webp/decode.h
index ccb4c36..43b6c58 100644
--- a/include/webp/decode.h
+++ b/include/webp/decode.h
@@ -1,11 +1,11 @@
-// Copyright 2010 Google Inc.
+// Copyright 2010 Google Inc. All Rights Reserved.
 //
 // This code is licensed under the same terms as WebM:
 //  Software License Agreement:  http://www.webmproject.org/license/software/
 //  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
-//  Main decoding functions for WEBP images.
+//  Main decoding functions for WebP images.
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
@@ -18,7 +18,7 @@
 extern "C" {
 #endif
 
-#define WEBP_DECODER_ABI_VERSION 0x0002
+#define WEBP_DECODER_ABI_VERSION 0x0200    // MAJOR(8b) + MINOR(8b)
 
 // Return the decoder's version number, packed in hexadecimal using 8bits for
 // each of major/minor/revision. E.g: v2.5.7 is 0x020507.
@@ -27,42 +27,46 @@
 // Retrieve basic header information: width, height.
 // This function will also validate the header and return 0 in
 // case of formatting error.
-// Pointers *width/*height can be passed NULL if deemed irrelevant.
-WEBP_EXTERN(int) WebPGetInfo(const uint8_t* data, uint32_t data_size,
+// Pointers 'width' and 'height' can be passed NULL if deemed irrelevant.
+WEBP_EXTERN(int) WebPGetInfo(const uint8_t* data, size_t data_size,
                              int* width, int* height);
 
-// Decodes WEBP images pointed to by *data and returns RGB samples, along
-// with the dimensions in *width and *height.
+// Decodes WebP images pointed to by 'data' and returns RGBA samples, along
+// with the dimensions in *width and *height. The ordering of samples in
+// memory is R, G, B, A, R, G, B, A... in scan order (endian-independent).
 // The returned pointer should be deleted calling free().
 // Returns NULL in case of error.
-WEBP_EXTERN(uint8_t*) WebPDecodeRGB(const uint8_t* data, uint32_t data_size,
+WEBP_EXTERN(uint8_t*) WebPDecodeRGBA(const uint8_t* data, size_t data_size,
+                                     int* width, int* height);
+
+// Same as WebPDecodeRGBA, but returning A, R, G, B, A, R, G, B... ordered data.
+WEBP_EXTERN(uint8_t*) WebPDecodeARGB(const uint8_t* data, size_t data_size,
+                                     int* width, int* height);
+
+// Same as WebPDecodeRGBA, but returning B, G, R, A, B, G, R, A... ordered data.
+WEBP_EXTERN(uint8_t*) WebPDecodeBGRA(const uint8_t* data, size_t data_size,
+                                     int* width, int* height);
+
+// Same as WebPDecodeRGBA, but returning R, G, B, R, G, B... ordered data.
+// If the bitstream contains transparency, it is ignored.
+WEBP_EXTERN(uint8_t*) WebPDecodeRGB(const uint8_t* data, size_t data_size,
                                     int* width, int* height);
 
-// Same as WebPDecodeRGB, but returning RGBA data.
-WEBP_EXTERN(uint8_t*) WebPDecodeRGBA(const uint8_t* data, uint32_t data_size,
-                                     int* width, int* height);
-
-// Same as WebPDecodeRGBA, but returning ARGB data.
-WEBP_EXTERN(uint8_t*) WebPDecodeARGB(const uint8_t* data, uint32_t data_size,
-                                     int* width, int* height);
-
-// This variant decode to BGR instead of RGB.
-WEBP_EXTERN(uint8_t*) WebPDecodeBGR(const uint8_t* data, uint32_t data_size,
+// Same as WebPDecodeRGB, but returning B, G, R, B, G, R... ordered data.
+WEBP_EXTERN(uint8_t*) WebPDecodeBGR(const uint8_t* data, size_t data_size,
                                     int* width, int* height);
-// This variant decodes to BGRA instead of RGBA.
-WEBP_EXTERN(uint8_t*) WebPDecodeBGRA(const uint8_t* data, uint32_t data_size,
-                                     int* width, int* height);
 
-// Decode WEBP images stored in *data in Y'UV format(*). The pointer returned is
-// the Y samples buffer. Upon return, *u and *v will point to the U and V
-// chroma data. These U and V buffers need NOT be free()'d, unlike the returned
-// Y luma one. The dimension of the U and V planes are both (*width + 1) / 2
-// and (*height + 1)/ 2.
+
+// Decode WebP images pointed to by 'data' to Y'UV format(*). The pointer
+// returned is the Y samples buffer. Upon return, *u and *v will point to
+// the U and V chroma data. These U and V buffers need NOT be free()'d,
+// unlike the returned Y luma one. The dimension of the U and V planes
+// are both (*width + 1) / 2 and (*height + 1)/ 2.
 // Upon return, the Y buffer has a stride returned as '*stride', while U and V
 // have a common stride returned as '*uv_stride'.
 // Return NULL in case of error.
 // (*) Also named Y'CbCr. See: http://en.wikipedia.org/wiki/YCbCr
-WEBP_EXTERN(uint8_t*) WebPDecodeYUV(const uint8_t* data, uint32_t data_size,
+WEBP_EXTERN(uint8_t*) WebPDecodeYUV(const uint8_t* data, size_t data_size,
                                     int* width, int* height,
                                     uint8_t** u, uint8_t** v,
                                     int* stride, int* uv_stride);
@@ -75,22 +79,24 @@
 // The parameter 'output_stride' specifies the distance (in bytes)
 // between scanlines. Hence, output_buffer_size is expected to be at least
 // output_stride x picture-height.
-WEBP_EXTERN(uint8_t*) WebPDecodeRGBInto(
-    const uint8_t* data, uint32_t data_size,
-    uint8_t* output_buffer, int output_buffer_size, int output_stride);
 WEBP_EXTERN(uint8_t*) WebPDecodeRGBAInto(
-    const uint8_t* data, uint32_t data_size,
-    uint8_t* output_buffer, int output_buffer_size, int output_stride);
+    const uint8_t* data, size_t data_size,
+    uint8_t* output_buffer, size_t output_buffer_size, int output_stride);
 WEBP_EXTERN(uint8_t*) WebPDecodeARGBInto(
-    const uint8_t* data, uint32_t data_size,
-    uint8_t* output_buffer, int output_buffer_size, int output_stride);
-// BGR variants
-WEBP_EXTERN(uint8_t*) WebPDecodeBGRInto(
-    const uint8_t* data, uint32_t data_size,
-    uint8_t* output_buffer, int output_buffer_size, int output_stride);
+    const uint8_t* data, size_t data_size,
+    uint8_t* output_buffer, size_t output_buffer_size, int output_stride);
 WEBP_EXTERN(uint8_t*) WebPDecodeBGRAInto(
-    const uint8_t* data, uint32_t data_size,
-    uint8_t* output_buffer, int output_buffer_size, int output_stride);
+    const uint8_t* data, size_t data_size,
+    uint8_t* output_buffer, size_t output_buffer_size, int output_stride);
+
+// RGB and BGR variants. Here too the transparency information, if present,
+// will be dropped and ignored.
+WEBP_EXTERN(uint8_t*) WebPDecodeRGBInto(
+    const uint8_t* data, size_t data_size,
+    uint8_t* output_buffer, size_t output_buffer_size, int output_stride);
+WEBP_EXTERN(uint8_t*) WebPDecodeBGRInto(
+    const uint8_t* data, size_t data_size,
+    uint8_t* output_buffer, size_t output_buffer_size, int output_stride);
 
 // WebPDecodeYUVInto() is a variant of WebPDecodeYUV() that operates directly
 // into pre-allocated luma/chroma plane buffers. This function requires the
@@ -100,29 +106,56 @@
 // Pointer to the luma plane ('*luma') is returned or NULL if an error occurred
 // during decoding (or because some buffers were found to be too small).
 WEBP_EXTERN(uint8_t*) WebPDecodeYUVInto(
-    const uint8_t* data, uint32_t data_size,
-    uint8_t* luma, int luma_size, int luma_stride,
-    uint8_t* u, int u_size, int u_stride,
-    uint8_t* v, int v_size, int v_stride);
+    const uint8_t* data, size_t data_size,
+    uint8_t* luma, size_t luma_size, int luma_stride,
+    uint8_t* u, size_t u_size, int u_stride,
+    uint8_t* v, size_t v_size, int v_stride);
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Output colorspaces and buffer
 
 // Colorspaces
+// Note: the naming describes the byte-ordering of packed samples in memory.
+// For instance, MODE_BGRA relates to samples ordered as B,G,R,A,B,G,R,A,...
+// Non-capital names (e.g.: MODE_Argb) relate to pre-multiplied RGB channels.
+// RGB-565 and RGBA-4444 are also endian-agnostic and byte-oriented.
 typedef enum { MODE_RGB = 0, MODE_RGBA = 1,
                MODE_BGR = 2, MODE_BGRA = 3,
                MODE_ARGB = 4, MODE_RGBA_4444 = 5,
                MODE_RGB_565 = 6,
+               // RGB-premultiplied transparent modes (alpha value is preserved)
+               MODE_rgbA = 7,
+               MODE_bgrA = 8,
+               MODE_Argb = 9,
+               MODE_rgbA_4444 = 10,
                // YUV modes must come after RGB ones.
-               MODE_YUV = 7, MODE_YUVA = 8,  // yuv 4:2:0
-               MODE_LAST = 9
+               MODE_YUV = 11, MODE_YUVA = 12,  // yuv 4:2:0
+               MODE_LAST = 13
              } WEBP_CSP_MODE;
 
-// Generic structure for describing the sample buffer.
+// Some useful macros:
+static WEBP_INLINE int WebPIsPremultipliedMode(WEBP_CSP_MODE mode) {
+  return (mode == MODE_rgbA || mode == MODE_bgrA || mode == MODE_Argb ||
+          mode == MODE_rgbA_4444);
+}
+
+static WEBP_INLINE int WebPIsAlphaMode(WEBP_CSP_MODE mode) {
+  return (mode == MODE_RGBA || mode == MODE_BGRA || mode == MODE_ARGB ||
+          mode == MODE_RGBA_4444 || mode == MODE_YUVA ||
+          WebPIsPremultipliedMode(mode));
+}
+
+static WEBP_INLINE int WebPIsRGBMode(WEBP_CSP_MODE mode) {
+  return (mode < MODE_YUV);
+}
+
+//------------------------------------------------------------------------------
+// WebPDecBuffer: Generic structure for describing the output sample buffer.
+
 typedef struct {    // view as RGBA
   uint8_t* rgba;    // pointer to RGBA samples
   int stride;       // stride in bytes from one scanline to the next.
-  int size;         // total size of the *rgba buffer.
+  size_t size;      // total size of the *rgba buffer.
 } WebPRGBABuffer;
 
 typedef struct {              // view as YUVA
@@ -130,9 +163,9 @@
   int y_stride;               // luma stride
   int u_stride, v_stride;     // chroma strides
   int a_stride;               // alpha stride
-  int y_size;                 // luma plane size
-  int u_size, v_size;         // chroma planes size
-  int a_size;                 // alpha-plane size
+  size_t y_size;              // luma plane size
+  size_t u_size, v_size;      // chroma planes size
+  size_t a_size;              // alpha-plane size
 } WebPYUVABuffer;
 
 // Output buffer
@@ -144,25 +177,27 @@
     WebPRGBABuffer RGBA;
     WebPYUVABuffer YUVA;
   } u;                       // Nameless union of buffer parameters.
+  uint32_t       pad[4];     // padding for later use
+
   uint8_t* private_memory;   // Internally allocated memory (only when
                              // is_external_memory is false). Should not be used
                              // externally, but accessed via the buffer union.
 } WebPDecBuffer;
 
 // Internal, version-checked, entry point
-WEBP_EXTERN(int) WebPInitDecBufferInternal(WebPDecBuffer* const, int);
+WEBP_EXTERN(int) WebPInitDecBufferInternal(WebPDecBuffer*, int);
 
 // Initialize the structure as empty. Must be called before any other use.
 // Returns false in case of version mismatch
-static inline int WebPInitDecBuffer(WebPDecBuffer* const buffer) {
+static WEBP_INLINE int WebPInitDecBuffer(WebPDecBuffer* buffer) {
   return WebPInitDecBufferInternal(buffer, WEBP_DECODER_ABI_VERSION);
 }
 
 // Free any memory associated with the buffer. Must always be called last.
 // Note: doesn't free the 'buffer' structure itself.
-WEBP_EXTERN(void) WebPFreeDecBuffer(WebPDecBuffer* const buffer);
+WEBP_EXTERN(void) WebPFreeDecBuffer(WebPDecBuffer* buffer);
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Enumeration of the status codes
 
 typedef enum {
@@ -176,7 +211,7 @@
   VP8_STATUS_NOT_ENOUGH_DATA
 } VP8StatusCode;
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Incremental decoding
 //
 // This API allows streamlined decoding of partial data.
@@ -185,7 +220,10 @@
 // picture is only partially decoded, pending additional input.
 // Code example:
 //
-//   WebPIDecoder* const idec = WebPINew(mode);
+//   WebPInitDecBuffer(&buffer);
+//   buffer.colorspace = mode;
+//   ...
+//   WebPIDecoder* idec = WebPINewDecoder(&buffer);
 //   while (has_more_data) {
 //     // ... (get additional data)
 //     status = WebPIAppend(idec, new_data, new_data_size);
@@ -195,7 +233,7 @@
 //
 //     // The above call decodes the current available buffer.
 //     // Part of the image can now be refreshed by calling to
-//     // WebPIDecGetRGB()/WebPIDecGetYUV() etc.
+//     // WebPIDecGetRGB()/WebPIDecGetYUVA() etc.
 //   }
 //   WebPIDelete(idec);
 
@@ -207,42 +245,47 @@
 // is kept, which means that the lifespan of 'output_buffer' must be larger than
 // that of the returned WebPIDecoder object.
 // Returns NULL if the allocation failed.
-WEBP_EXTERN(WebPIDecoder*) WebPINewDecoder(WebPDecBuffer* const output_buffer);
-
-// Creates a WebPIDecoder object. Returns NULL in case of failure.
-// TODO(skal): DEPRECATED. Prefer using WebPINewDecoder().
-WEBP_EXTERN(WebPIDecoder*) WebPINew(WEBP_CSP_MODE mode);
+WEBP_EXTERN(WebPIDecoder*) WebPINewDecoder(WebPDecBuffer* output_buffer);
 
 // This function allocates and initializes an incremental-decoder object, which
-// will output the r/g/b(/a) samples specified by 'mode' into a preallocated
+// will output the RGB/A samples specified by 'csp' into a preallocated
 // buffer 'output_buffer'. The size of this buffer is at least
 // 'output_buffer_size' and the stride (distance in bytes between two scanlines)
 // is specified by 'output_stride'. Returns NULL if the allocation failed.
 WEBP_EXTERN(WebPIDecoder*) WebPINewRGB(
-    WEBP_CSP_MODE mode,
-    uint8_t* output_buffer, int output_buffer_size, int output_stride);
+    WEBP_CSP_MODE csp,
+    uint8_t* output_buffer, size_t output_buffer_size, int output_stride);
 
 // This function allocates and initializes an incremental-decoder object, which
 // will output the raw luma/chroma samples into a preallocated planes. The luma
 // plane is specified by its pointer 'luma', its size 'luma_size' and its stride
 // 'luma_stride'. Similarly, the chroma-u plane is specified by the 'u',
-// 'u_size' and 'u_stride' parameters, and the chroma-v plane by 'v', 'v_size'
-// and 'v_size'.
+// 'u_size' and 'u_stride' parameters, and the chroma-v plane by 'v', 'v_size'
+// and 'v_stride'. The same holds for the alpha plane. The 'a' pointer can be
+// passed NULL in case one is not interested in the transparency plane.
 // Returns NULL if the allocation failed.
+WEBP_EXTERN(WebPIDecoder*) WebPINewYUVA(
+    uint8_t* luma, size_t luma_size, int luma_stride,
+    uint8_t* u, size_t u_size, int u_stride,
+    uint8_t* v, size_t v_size, int v_stride,
+    uint8_t* a, size_t a_size, int a_stride);
+
+// Deprecated version of the above, without the alpha plane.
+// Kept for backward compatibility.
 WEBP_EXTERN(WebPIDecoder*) WebPINewYUV(
-    uint8_t* luma, int luma_size, int luma_stride,
-    uint8_t* u, int u_size, int u_stride,
-    uint8_t* v, int v_size, int v_stride);
+    uint8_t* luma, size_t luma_size, int luma_stride,
+    uint8_t* u, size_t u_size, int u_stride,
+    uint8_t* v, size_t v_size, int v_stride);
 
 // Deletes the WebPIDecoder object and associated memory. Must always be called
-// if WebPINew, WebPINewRGB or WebPINewYUV succeeded.
-WEBP_EXTERN(void) WebPIDelete(WebPIDecoder* const idec);
+// if WebPINewDecoder, WebPINewRGB or WebPINewYUV succeeded.
+WEBP_EXTERN(void) WebPIDelete(WebPIDecoder* idec);
 
 // Copies and decodes the next available data. Returns VP8_STATUS_OK when
 // the image is successfully decoded. Returns VP8_STATUS_SUSPENDED when more
 // data is expected. Returns error in other cases.
 WEBP_EXTERN(VP8StatusCode) WebPIAppend(
-    WebPIDecoder* const idec, const uint8_t* data, uint32_t data_size);
+    WebPIDecoder* idec, const uint8_t* data, size_t data_size);
 
 // A variant of the above function to be used when data buffer contains
 // partial data from the beginning. In this case data buffer is not copied
@@ -250,23 +293,34 @@
 // Note that the value of the 'data' pointer can change between calls to
 // WebPIUpdate, for instance when the data buffer is resized to fit larger data.
 WEBP_EXTERN(VP8StatusCode) WebPIUpdate(
-    WebPIDecoder* const idec, const uint8_t* data, uint32_t data_size);
+    WebPIDecoder* idec, const uint8_t* data, size_t data_size);
 
-// Returns the r/g/b/(a) image decoded so far. Returns NULL if output params
-// are not initialized yet. The r/g/b/(a) output type corresponds to the mode
-// specified in WebPINew()/WebPINewRGB(). *last_y is the index of last decoded
-// row in raster scan order. Some pointers (*last_y, *width etc.) can be NULL if
-// corresponding information is not needed.
+// Returns the RGB/A image decoded so far. Returns NULL if output params
+// are not initialized yet. The RGB/A output type corresponds to the colorspace
+// specified during call to WebPINewDecoder() or WebPINewRGB().
+// *last_y is the index of last decoded row in raster scan order. Some pointers
+// (*last_y, *width etc.) can be NULL if corresponding information is not
+// needed.
 WEBP_EXTERN(uint8_t*) WebPIDecGetRGB(
-    const WebPIDecoder* const idec, int* last_y,
+    const WebPIDecoder* idec, int* last_y,
     int* width, int* height, int* stride);
 
-// Same as above function to get YUV image. Returns pointer to the luma plane
-// or NULL in case of error.
-WEBP_EXTERN(uint8_t*) WebPIDecGetYUV(
-    const WebPIDecoder* const idec, int* last_y,
-    uint8_t** u, uint8_t** v,
-    int* width, int* height, int* stride, int* uv_stride);
+// Same as above function to get a YUVA image. Returns pointer to the luma
+// plane or NULL in case of error. If there is no alpha information,
+// the alpha pointer '*a' will be returned as NULL.
+WEBP_EXTERN(uint8_t*) WebPIDecGetYUVA(
+    const WebPIDecoder* idec, int* last_y,
+    uint8_t** u, uint8_t** v, uint8_t** a,
+    int* width, int* height, int* stride, int* uv_stride, int* a_stride);
+
+// Deprecated alpha-less version of WebPIDecGetYUVA(): it will ignore the
+// alpha information (if present). Kept for backward compatibility.
+static WEBP_INLINE uint8_t* WebPIDecGetYUV(
+    const WebPIDecoder* idec, int* last_y, uint8_t** u, uint8_t** v,
+    int* width, int* height, int* stride, int* uv_stride) {
+  return WebPIDecGetYUVA(idec, last_y, u, v, NULL, width, height,
+                         stride, uv_stride, NULL);
+}
 
 // Generic call to retrieve information about the displayable area.
 // If non NULL, the left/right/width/height pointers are filled with the visible
@@ -275,11 +329,9 @@
 // Otherwise returns the pointer to the internal representation. This structure
 // is read-only, tied to WebPIDecoder's lifespan and should not be modified.
 WEBP_EXTERN(const WebPDecBuffer*) WebPIDecodedArea(
-    const WebPIDecoder* const idec,
-    int* const left, int* const top,
-    int* const width, int* const height);
+    const WebPIDecoder* idec, int* left, int* top, int* width, int* height);
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Advanced decoding parametrization
 //
 //  Code sample for using the advanced decoding API
@@ -314,27 +366,30 @@
 
 // Features gathered from the bitstream
 typedef struct {
-  int width;        // the original width, as read from the bitstream
-  int height;       // the original height, as read from the bitstream
-  int has_alpha;    // true if bitstream contains an alpha channel
+  int width;        // Width in pixels, as read from the bitstream.
+  int height;       // Height in pixels, as read from the bitstream.
+  int has_alpha;    // True if the bitstream contains an alpha channel.
+
+  // Unused for now:
+  int bitstream_version;        // should be 0 for now. TODO(later)
   int no_incremental_decoding;  // if true, using incremental decoding is not
                                 // recommended.
   int rotate;                   // TODO(later)
   int uv_sampling;              // should be 0 for now. TODO(later)
-  int bitstream_version;        // should be 0 for now. TODO(later)
+  uint32_t pad[3];              // padding for later use
 } WebPBitstreamFeatures;
 
 // Internal, version-checked, entry point
 WEBP_EXTERN(VP8StatusCode) WebPGetFeaturesInternal(
-    const uint8_t*, uint32_t, WebPBitstreamFeatures* const, int);
+    const uint8_t*, size_t, WebPBitstreamFeatures*, int);
 
 // Retrieve features from the bitstream. The *features structure is filled
 // with information gathered from the bitstream.
 // Returns false in case of error or version mismatch.
 // In case of error, features->bitstream_status will reflect the error code.
-static inline
-  VP8StatusCode WebPGetFeatures(const uint8_t* data, uint32_t data_size,
-                                WebPBitstreamFeatures* const features) {
+static WEBP_INLINE VP8StatusCode WebPGetFeatures(
+    const uint8_t* data, size_t data_size,
+    WebPBitstreamFeatures* features) {
   return WebPGetFeaturesInternal(data, data_size, features,
                                  WEBP_DECODER_ABI_VERSION);
 }
@@ -349,8 +404,12 @@
   int crop_width, crop_height;        // dimension of the cropping area
   int use_scaling;                    // if true, scaling is applied _afterward_
   int scaled_width, scaled_height;    // final resolution
+  int use_threads;                    // if true, use multi-threaded decoding
+
+  // Unused for now:
   int force_rotation;                 // forced rotation (to be applied _last_)
   int no_enhancement;                 // if true, discard enhancement layer
+  uint32_t pad[6];                    // padding for later use
 } WebPDecoderOptions;
 
 // Main object storing the configuration for advanced decoding.
@@ -361,32 +420,32 @@
 } WebPDecoderConfig;
 
 // Internal, version-checked, entry point
-WEBP_EXTERN(int) WebPInitDecoderConfigInternal(WebPDecoderConfig* const, int);
+WEBP_EXTERN(int) WebPInitDecoderConfigInternal(WebPDecoderConfig*, int);
 
 // Initialize the configuration as empty. This function must always be
 // called first, unless WebPGetFeatures() is to be called.
 // Returns false in case of mismatched version.
-static inline int WebPInitDecoderConfig(WebPDecoderConfig* const config) {
+static WEBP_INLINE int WebPInitDecoderConfig(WebPDecoderConfig* config) {
   return WebPInitDecoderConfigInternal(config, WEBP_DECODER_ABI_VERSION);
 }
 
-// Instantiate a new incremental decoder object with requested configuration.
-// The bitstream can be passed using *data and data_size parameter,
-// in which case the features will be parsed and stored into config->input.
-// Otherwise, 'data' can be NULL and now parsing will occur.
-// Note that 'config' can be NULL too, in which case a default configuration is
-// used.
+// Instantiate a new incremental decoder object with the requested
+// configuration. The bitstream can be passed using 'data' and 'data_size'
+// parameter, in which case the features will be parsed and stored into
+// config->input. Otherwise, 'data' can be NULL and no parsing will occur.
+// Note that 'config' can be NULL too, in which case a default configuration
+// is used.
 // The return WebPIDecoder object must always be deleted calling WebPIDelete().
 // Returns NULL in case of error (and config->status will then reflect
 // the error condition).
-WEBP_EXTERN(WebPIDecoder*) WebPIDecode(const uint8_t* data, uint32_t data_size,
-                                       WebPDecoderConfig* const config);
+WEBP_EXTERN(WebPIDecoder*) WebPIDecode(const uint8_t* data, size_t data_size,
+                                       WebPDecoderConfig* config);
 
 // Non-incremental version. This version decodes the full data at once, taking
-// 'config' into account. Return decoding status (VP8_STATUS_OK if decoding
-// was successful).
-WEBP_EXTERN(VP8StatusCode) WebPDecode(const uint8_t* data, uint32_t data_size,
-                                      WebPDecoderConfig* const config);
+// 'config' into account. Returns decoding status (which should be VP8_STATUS_OK
+// if the decoding was successful).
+WEBP_EXTERN(VP8StatusCode) WebPDecode(const uint8_t* data, size_t data_size,
+                                      WebPDecoderConfig* config);
 
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
diff --git a/include/webp/decode_vp8.h b/include/webp/decode_vp8.h
index af276ad..cf1654f 100644
--- a/include/webp/decode_vp8.h
+++ b/include/webp/decode_vp8.h
@@ -1,154 +1,14 @@
-// Copyright 2010 Google Inc.
+// Copyright 2010 Google Inc. All Rights Reserved.
 //
 // This code is licensed under the same terms as WebM:
 //  Software License Agreement:  http://www.webmproject.org/license/software/
 //  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
-//  Low-level API for VP8 decoder
-//
-// Author: Skal (pascal.massimino@gmail.com)
+//  Dummy file retained to honor decode_vp8.h include in
+//  skia/src/images/SkImageDecoder_libwebp.cpp
 
 #ifndef WEBP_WEBP_DECODE_VP8_H_
 #define WEBP_WEBP_DECODE_VP8_H_
 
-#include "./decode.h"
-
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
-//-----------------------------------------------------------------------------
-// Lower-level API
-//
-// These functions provide fine-grained control of the decoding process.
-// The call flow should resemble:
-//
-//   VP8Io io;
-//   VP8InitIo(&io);
-//   io.data = data;
-//   io.data_size = size;
-//   /* customize io's functions (setup()/put()/teardown()) if needed. */
-//
-//   VP8Decoder* dec = VP8New();
-//   bool ok = VP8Decode(dec);
-//   if (!ok) printf("Error: %s\n", VP8StatusMessage(dec));
-//   VP8Delete(dec);
-//   return ok;
-
-// Input / Output
-typedef struct VP8Io VP8Io;
-typedef int (*VP8IoPutHook)(const VP8Io* io);
-typedef int (*VP8IoSetupHook)(VP8Io* io);
-typedef void (*VP8IoTeardownHook)(const VP8Io* io);
-
-struct VP8Io {
-  // set by VP8GetHeaders()
-  int width, height;         // picture dimensions, in pixels (invariable).
-                             // These are the original, uncropped dimensions.
-                             // The actual area passed to put() is stored
-                             // in mb_w / mb_h fields.
-
-  // set before calling put()
-  int mb_y;                  // position of the current rows (in pixels)
-  int mb_w;                  // number of columns in the sample
-  int mb_h;                  // number of rows in the sample
-  const uint8_t* y, *u, *v;  // rows to copy (in yuv420 format)
-  int y_stride;              // row stride for luma
-  int uv_stride;             // row stride for chroma
-
-  void* opaque;              // user data
-
-  // called when fresh samples are available. Currently, samples are in
-  // YUV420 format, and can be up to width x 24 in size (depending on the
-  // in-loop filtering level, e.g.). Should return false in case of error
-  // or abort request. The actual size of the area to update is mb_w x mb_h
-  // in size, taking cropping into account.
-  VP8IoPutHook put;
-
-  // called just before starting to decode the blocks.
-  // Should returns 0 in case of error.
-  VP8IoSetupHook setup;
-
-  // called just after block decoding is finished (or when an error occurred).
-  VP8IoTeardownHook teardown;
-
-  // this is a recommendation for the user-side yuv->rgb converter. This flag
-  // is set when calling setup() hook and can be overwritten by it. It then
-  // can be taken into consideration during the put() method.
-  int fancy_upsampling;
-
-  // Input buffer.
-  uint32_t data_size;
-  const uint8_t* data;
-
-  // If true, in-loop filtering will not be performed even if present in the
-  // bitstream. Switching off filtering may speed up decoding at the expense
-  // of more visible blocking. Note that output will also be non-compliant
-  // with the VP8 specifications.
-  int bypass_filtering;
-
-  // Cropping parameters.
-  int use_cropping;
-  int crop_left, crop_right, crop_top, crop_bottom;
-
-  // Scaling parameters.
-  int use_scaling;
-  int scaled_width, scaled_height;
-
-  // pointer to the alpha data (if present) corresponding to the rows
-  const uint8_t* a;
-};
-
-// Internal, version-checked, entry point
-WEBP_EXTERN(int) VP8InitIoInternal(VP8Io* const, int);
-
-// Set the custom IO function pointers and user-data. The setter for IO hooks
-// should be called before initiating incremental decoding. Returns true if
-// WebPIDecoder object is successfully modified, false otherwise.
-WEBP_EXTERN(int) WebPISetIOHooks(WebPIDecoder* const idec,
-                                 VP8IoPutHook put,
-                                 VP8IoSetupHook setup,
-                                 VP8IoTeardownHook teardown,
-                                 void* user_data);
-
-// Main decoding object. This is an opaque structure.
-typedef struct VP8Decoder VP8Decoder;
-
-// Create a new decoder object.
-WEBP_EXTERN(VP8Decoder*) VP8New(void);
-
-// Must be called to make sure 'io' is initialized properly.
-// Returns false in case of version mismatch. Upon such failure, no other
-// decoding function should be called (VP8Decode, VP8GetHeaders, ...)
-static inline int VP8InitIo(VP8Io* const io) {
-  return VP8InitIoInternal(io, WEBP_DECODER_ABI_VERSION);
-}
-
-// Start decoding a new picture. Returns true if ok.
-WEBP_EXTERN(int) VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io);
-
-// Decode a picture. Will call VP8GetHeaders() if it wasn't done already.
-// Returns false in case of error.
-WEBP_EXTERN(int) VP8Decode(VP8Decoder* const dec, VP8Io* const io);
-
-// Return current status of the decoder:
-WEBP_EXTERN(VP8StatusCode) VP8Status(VP8Decoder* const dec);
-
-// return readable string corresponding to the last status.
-WEBP_EXTERN(const char*) VP8StatusMessage(VP8Decoder* const dec);
-
-// Resets the decoder in its initial state, reclaiming memory.
-// Not a mandatory call between calls to VP8Decode().
-WEBP_EXTERN(void) VP8Clear(VP8Decoder* const dec);
-
-// Destroy the decoder object.
-WEBP_EXTERN(void) VP8Delete(VP8Decoder* const dec);
-
-//-----------------------------------------------------------------------------
-
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
-
 #endif  /* WEBP_WEBP_DECODE_VP8_H_ */
diff --git a/include/webp/encode.h b/include/webp/encode.h
index af6f0a2..d87426b 100644
--- a/include/webp/encode.h
+++ b/include/webp/encode.h
@@ -1,4 +1,4 @@
-// Copyright 2011 Google Inc.
+// Copyright 2011 Google Inc. All Rights Reserved.
 //
 // This code is licensed under the same terms as WebM:
 //  Software License Agreement:  http://www.webmproject.org/license/software/
@@ -12,26 +12,27 @@
 #ifndef WEBP_WEBP_ENCODE_H_
 #define WEBP_WEBP_ENCODE_H_
 
-#include <stdlib.h>
-
 #include "./types.h"
 
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 
-#define WEBP_ENCODER_ABI_VERSION 0x0002
+#define WEBP_ENCODER_ABI_VERSION 0x0200    // MAJOR(8b) + MINOR(8b)
 
 // Return the encoder's version number, packed in hexadecimal using 8bits for
 // each of major/minor/revision. E.g: v2.5.7 is 0x020507.
 WEBP_EXTERN(int) WebPGetEncoderVersion(void);
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // One-stop-shop call! No questions asked:
 
 // Returns the size of the compressed data (pointed to by *output), or 0 if
 // an error occurred. The compressed data must be released by the caller
 // using the call 'free(*output)'.
+// These functions compress using the lossy format, and the quality_factor
+// can go from 0 (smaller output, lower quality) to 100 (best quality,
+// larger output).
 WEBP_EXTERN(size_t) WebPEncodeRGB(const uint8_t* rgb,
                                   int width, int height, int stride,
                                   float quality_factor, uint8_t** output);
@@ -45,31 +46,71 @@
                                    int width, int height, int stride,
                                    float quality_factor, uint8_t** output);
 
-//-----------------------------------------------------------------------------
+// These functions are the equivalent of the above, but compressing in a
+// lossless manner. Files are usually larger than lossy format, but will
+// not suffer any compression loss.
+WEBP_EXTERN(size_t) WebPEncodeLosslessRGB(const uint8_t* rgb,
+                                          int width, int height, int stride,
+                                          uint8_t** output);
+WEBP_EXTERN(size_t) WebPEncodeLosslessBGR(const uint8_t* bgr,
+                                          int width, int height, int stride,
+                                          uint8_t** output);
+WEBP_EXTERN(size_t) WebPEncodeLosslessRGBA(const uint8_t* rgba,
+                                           int width, int height, int stride,
+                                           uint8_t** output);
+WEBP_EXTERN(size_t) WebPEncodeLosslessBGRA(const uint8_t* bgra,
+                                           int width, int height, int stride,
+                                           uint8_t** output);
+
+//------------------------------------------------------------------------------
 // Coding parameters
 
-typedef struct {
-  float quality;         // between 0 (smallest file) and 100 (biggest)
-  int target_size;       // if non-zero, set the desired target size in bytes.
-                         // Takes precedence over the 'compression' parameter.
-  float target_PSNR;     // if non-zero, specifies the minimal distortion to
-                         // try to achieve. Takes precedence over target_size.
-  int method;            // quality/speed trade-off (0=fast, 6=slower-better)
-  int segments;          // maximum number of segments to use, in [1..4]
-  int sns_strength;      // Spatial Noise Shaping. 0=off, 100=maximum.
-  int filter_strength;   // range: [0 = off .. 100 = strongest]
-  int filter_sharpness;  // range: [0 = off .. 7 = least sharp]
-  int filter_type;       // filtering type: 0 = simple, 1 = strong
-                         // (only used if filter_strength > 0 or autofilter > 0)
-  int autofilter;        // Auto adjust filter's strength [0 = off, 1 = on]
-  int pass;              // number of entropy-analysis passes (in [1..10]).
+// Image characteristics hint for the underlying encoder.
+typedef enum {
+  WEBP_HINT_DEFAULT = 0,  // default preset.
+  WEBP_HINT_PICTURE,      // digital picture, like portrait, inner shot
+  WEBP_HINT_PHOTO,        // outdoor photograph, with natural lighting
+  WEBP_HINT_GRAPH,        // Discrete tone image (graph, map-tile etc).
+  WEBP_HINT_LAST
+} WebPImageHint;
 
-  int show_compressed;   // if true, export the compressed picture back.
-                         // In-loop filtering is not applied.
-  int preprocessing;     // preprocessing filter (0=none, 1=segment-smooth)
-  int partitions;        // log2(number of token partitions) in [0..3]
-                         // Default is set to 0 for easier progressive decoding.
-  int alpha_compression;  // Algorithm for optimizing the alpha plane (0 = none)
+typedef struct {
+  int lossless;           // Lossless encoding (0=lossy(default), 1=lossless).
+  float quality;          // between 0 (smallest file) and 100 (biggest)
+  int method;             // quality/speed trade-off (0=fast, 6=slower-better)
+
+  WebPImageHint image_hint;  // Hint for image type (lossless only for now).
+
+  // Parameters related to lossy compression only:
+  int target_size;        // if non-zero, set the desired target size in bytes.
+                          // Takes precedence over the 'compression' parameter.
+  float target_PSNR;      // if non-zero, specifies the minimal distortion to
+                          // try to achieve. Takes precedence over target_size.
+  int segments;           // maximum number of segments to use, in [1..4]
+  int sns_strength;       // Spatial Noise Shaping. 0=off, 100=maximum.
+  int filter_strength;    // range: [0 = off .. 100 = strongest]
+  int filter_sharpness;   // range: [0 = off .. 7 = least sharp]
+  int filter_type;        // filtering type: 0 = simple, 1 = strong (only used
+                          // if filter_strength > 0 or autofilter > 0)
+  int autofilter;         // Auto adjust filter's strength [0 = off, 1 = on]
+  int alpha_compression;  // Algorithm for encoding the alpha plane (0 = none,
+                          // 1 = compressed with WebP lossless). Default is 1.
+  int alpha_filtering;    // Predictive filtering method for alpha plane.
+                          //  0: none, 1: fast, 2: best. Default is 1.
+  int alpha_quality;      // Between 0 (smallest size) and 100 (lossless).
+                          // Default is 100.
+  int pass;               // number of entropy-analysis passes (in [1..10]).
+
+  int show_compressed;    // if true, export the compressed picture back.
+                          // In-loop filtering is not applied.
+  int preprocessing;      // preprocessing filter (0=none, 1=segment-smooth)
+  int partitions;         // log2(number of token partitions) in [0..3]. Default
+                          // is set to 0 for easier progressive decoding.
+  int partition_limit;    // quality degradation allowed to fit the 512k limit
+                          // on prediction modes coding (0: no degradation,
+                          // 100: maximum possible degradation).
+
+  uint32_t pad[8];        // padding for later use
 } WebPConfig;
 
 // Enumerate some predefined settings for WebPConfig, depending on the type
@@ -84,13 +125,13 @@
 } WebPPreset;
 
 // Internal, version-checked, entry point
-WEBP_EXTERN(int) WebPConfigInitInternal(
-    WebPConfig* const, WebPPreset, float, int);
+WEBP_EXTERN(int) WebPConfigInitInternal(WebPConfig*, WebPPreset, float, int);
 
 // Should always be called, to initialize a fresh WebPConfig structure before
-// modification. Returns 0 in case of version mismatch. WebPConfigInit() must
-// have succeeded before using the 'config' object.
-static inline int WebPConfigInit(WebPConfig* const config) {
+// modification. Returns false in case of version mismatch. WebPConfigInit()
+// must have succeeded before using the 'config' object.
+// Note that the default values are lossless=0 and quality=75.
+static WEBP_INLINE int WebPConfigInit(WebPConfig* config) {
   return WebPConfigInitInternal(config, WEBP_PRESET_DEFAULT, 75.f,
                                 WEBP_ENCODER_ABI_VERSION);
 }
@@ -98,25 +139,27 @@
 // This function will initialize the configuration according to a predefined
 // set of parameters (referred to by 'preset') and a given quality factor.
 // This function can be called as a replacement to WebPConfigInit(). Will
-// return 0 in case of error.
-static inline int WebPConfigPreset(WebPConfig* const config,
-                                   WebPPreset preset, float quality) {
+// return false in case of error.
+static WEBP_INLINE int WebPConfigPreset(WebPConfig* config,
+                                        WebPPreset preset, float quality) {
   return WebPConfigInitInternal(config, preset, quality,
                                 WEBP_ENCODER_ABI_VERSION);
 }
 
-// Returns 1 if all parameters are in valid range and the configuration is OK.
-WEBP_EXTERN(int) WebPValidateConfig(const WebPConfig* const config);
+// Returns true if 'config' is non-NULL and all configuration parameters are
+// within their valid ranges.
+WEBP_EXTERN(int) WebPValidateConfig(const WebPConfig* config);
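
A short sketch of the init-tweak-validate sequence these entry points suggest; SetupConfig and the particular field values are illustrative assumptions, not recommendations from the header:

  #include "webp/encode.h"

  // Hypothetical setup: start from the default preset, adjust a few fields,
  // then let the library confirm everything is still within valid ranges.
  static int SetupConfig(WebPConfig* const config) {
    if (!WebPConfigPreset(config, WEBP_PRESET_DEFAULT, 75.f)) {
      return 0;                   // NULL config or ABI version mismatch.
    }
    config->method = 6;           // slowest but best quality/size trade-off.
    config->sns_strength = 90;    // strong spatial noise shaping.
    config->filter_strength = 60;
    return WebPValidateConfig(config);
  }
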
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Input / Output
 
 typedef struct WebPPicture WebPPicture;   // main structure for I/O
 
-// non-essential structure for storing auxiliary statistics
+// Structure for storing auxiliary statistics (mostly for lossy encoding).
 typedef struct {
-  float PSNR[4];          // peak-signal-to-noise ratio for Y/U/V/All
   int coded_size;         // final size
+
+  float PSNR[5];          // peak-signal-to-noise ratio for Y/U/V/All/Alpha
   int block_count[3];     // number of intra4/intra16/skipped macroblocks
   int header_bytes[2];    // approximate number of bytes spent for header
                           // and mode-partition #0
@@ -128,13 +171,46 @@
 
   int alpha_data_size;    // size of the transparency data
   int layer_data_size;    // size of the enhancement layer data
+
+  // lossless encoder statistics
+  uint32_t lossless_features;  // bit0:predictor bit1:cross-color transform
+                               // bit2:subtract-green bit3:color indexing
+  int histogram_bits;          // number of precision bits of histogram
+  int transform_bits;          // precision bits for transform
+  int cache_bits;              // number of bits for color cache lookup
+  int palette_size;            // number of colors in the palette, if used
+  int lossless_size;           // final lossless size
+
+  uint32_t pad[4];        // padding for later use
 } WebPAuxStats;
 
-// Signature for output function. Should return 1 if writing was successful.
+// Signature for output function. Should return true if writing was successful.
 // data/data_size is the segment of data to write, and 'picture' is for
 // reference (and so one can make use of picture->custom_ptr).
 typedef int (*WebPWriterFunction)(const uint8_t* data, size_t data_size,
-                                  const WebPPicture* const picture);
+                                  const WebPPicture* picture);
+
+// WebPMemoryWrite: a special WebPWriterFunction that writes to memory using
+// the following WebPMemoryWriter object (to be set as a custom_ptr).
+typedef struct {
+  uint8_t* mem;       // final buffer (of size 'max_size', larger than 'size').
+  size_t   size;      // final size
+  size_t   max_size;  // total capacity
+  uint32_t pad[1];    // padding for later use
+} WebPMemoryWriter;
+
+// The following must be called first before any use.
+WEBP_EXTERN(void) WebPMemoryWriterInit(WebPMemoryWriter* writer);
+
+// The custom writer to be used with WebPMemoryWriter as custom_ptr. Upon
+// completion, writer.mem and writer.size will hold the coded data.
+WEBP_EXTERN(int) WebPMemoryWrite(const uint8_t* data, size_t data_size,
+                                 const WebPPicture* picture);
+
+// Progress hook, called from time to time to report progress. It can return
+// false to request an abort of the encoding process, or true otherwise.
+typedef int (*WebPProgressHook)(int percent, const WebPPicture* picture);
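
To make the writer and progress-hook contracts above concrete, here is a pair of hypothetical callbacks: a writer that streams the bytes to a FILE* kept in picture->custom_ptr, and a progress hook that reports but never aborts:

  #include <stdio.h>
  #include "webp/encode.h"

  // Hypothetical writer: returns true only if all bytes could be written.
  static int FileWriter(const uint8_t* data, size_t data_size,
                        const WebPPicture* picture) {
    FILE* const out = (FILE*)picture->custom_ptr;
    return data_size == 0 || fwrite(data, data_size, 1, out) == 1;
  }

  // Hypothetical progress hook: prints the percentage; returning 0 here
  // would abort the encoding with VP8_ENC_ERROR_USER_ABORT.
  static int PrintProgress(int percent, const WebPPicture* picture) {
    (void)picture;    // unused in this sketch.
    fprintf(stderr, "\rencoding... %d%%", percent);
    return 1;
  }
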
 
 typedef enum {
   // chroma sampling
@@ -159,25 +235,49 @@
   VP8_ENC_ERROR_NULL_PARAMETER,           // a pointer parameter is NULL
   VP8_ENC_ERROR_INVALID_CONFIGURATION,    // configuration is invalid
   VP8_ENC_ERROR_BAD_DIMENSION,            // picture has invalid width/height
-  VP8_ENC_ERROR_PARTITION0_OVERFLOW,      // partition is too bigger than 16M
-  VP8_ENC_ERROR_PARTITION_OVERFLOW,       // partition is too bigger than 512k
+  VP8_ENC_ERROR_PARTITION0_OVERFLOW,      // partition is bigger than 512k
+  VP8_ENC_ERROR_PARTITION_OVERFLOW,       // partition is bigger than 16M
   VP8_ENC_ERROR_BAD_WRITE,                // error while flushing bytes
+  VP8_ENC_ERROR_FILE_TOO_BIG,             // file is bigger than 4G
+  VP8_ENC_ERROR_USER_ABORT,               // abort request by user
+  VP8_ENC_ERROR_LAST                      // list terminator. always last.
 } WebPEncodingError;
 
+// maximum width/height allowed (inclusive), in pixels
+#define WEBP_MAX_DIMENSION 16383
+
+// Main exchange structure (input samples, output bytes, statistics)
 struct WebPPicture {
-  // input
+
+  //   INPUT
+  //////////////
+  // Main flag telling the encoder to select between ARGB and YUV input.
+  // It is recommended to use ARGB input (*argb, argb_stride) for lossless
+  // compression, and YUV input (*y, *u, *v, etc.) for lossy compression,
+  // since these are the respective native colorspaces of these formats.
+  int use_argb;
+
+  // YUV input (mostly used for input to lossy compression)
   WebPEncCSP colorspace;     // colorspace: should be YUV420 for now (=Y'CbCr).
-  int width, height;         // dimensions.
+  int width, height;         // dimensions (less or equal to WEBP_MAX_DIMENSION)
   uint8_t *y, *u, *v;        // pointers to luma/chroma planes.
   int y_stride, uv_stride;   // luma/chroma strides.
-  uint8_t *a;                // pointer to the alpha plane
+  uint8_t* a;                // pointer to the alpha plane
   int a_stride;              // stride of the alpha plane
+  uint32_t pad1[2];          // padding for later use
 
-  // output
+  // ARGB input (mostly used for input to lossless compression)
+  uint32_t* argb;            // Pointer to argb (32 bit) plane.
+  int argb_stride;           // This stride is in pixel units, not bytes.
+  uint32_t pad2[3];          // padding for later use
+
+  //   OUTPUT
+  ///////////////
+  // Byte-emission hook, to store compressed bytes as they are ready.
   WebPWriterFunction writer;  // can be NULL
   void* custom_ptr;           // can be used by the writer.
 
-  // map for extra information
+  // map for extra information (only for lossy compression mode)
   int extra_info_type;    // 1: intra type, 2: segment, 3: quant
                           // 4: intra-16 prediction mode,
                           // 5: chroma prediction mode,
@@ -187,84 +287,174 @@
                           // will be filled with a macroblock map, depending
                           // on extra_info_type.
 
-  // where to store statistics, if not NULL:
+  //   STATS AND REPORTS
+  ///////////////////////////
+  // Pointer to side statistics (updated only if not NULL)
   WebPAuxStats* stats;
 
-  // original samples (for non-YUV420 modes)
+  // Error code for the latest error encountered during encoding
+  WebPEncodingError error_code;
+
+  // If not NULL, report progress during encoding.
+  WebPProgressHook progress_hook;
+
+  void* user_data;        // this field is free to be set to any value and
+                          // used during callbacks (e.g. progress reporting).
+
+  uint32_t pad3[3];       // padding for later use
+
+  // Unused for now: original samples (for non-YUV420 modes)
   uint8_t *u0, *v0;
   int uv0_stride;
 
-  WebPEncodingError error_code;   // error code in case of problem.
+  uint32_t pad4[7];       // padding for later use
+
+  // PRIVATE FIELDS
+  ////////////////////
+  void* memory_;          // row chunk of memory for yuva planes
+  void* memory_argb_;     // and for argb too.
+  void* pad5[2];          // padding for later use
 };
 
 // Internal, version-checked, entry point
-WEBP_EXTERN(int) WebPPictureInitInternal(WebPPicture* const, int);
+WEBP_EXTERN(int) WebPPictureInitInternal(WebPPicture*, int);
 
-// Should always be called, to initialize the structure. Returns 0 in case of
-// version mismatch. WebPPictureInit() must have succeeded before using the
+// Should always be called, to initialize the structure. Returns false in case
+// of version mismatch. WebPPictureInit() must have succeeded before using the
 // 'picture' object.
-static inline int WebPPictureInit(WebPPicture* const picture) {
+// Note that, by default, use_argb is false and colorspace is WEBP_YUV420.
+static WEBP_INLINE int WebPPictureInit(WebPPicture* picture) {
   return WebPPictureInitInternal(picture, WEBP_ENCODER_ABI_VERSION);
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // WebPPicture utils
 
 // Convenience allocation / deallocation based on picture->width/height:
 // Allocate y/u/v buffers as per colorspace/width/height specification.
 // Note! This function will free the previous buffer if needed.
-// Returns 0 in case of memory error.
-WEBP_EXTERN(int) WebPPictureAlloc(WebPPicture* const picture);
+// Returns false in case of memory error.
+WEBP_EXTERN(int) WebPPictureAlloc(WebPPicture* picture);
 
-// Release memory allocated by WebPPictureAlloc() or WebPPictureImport*()
-// Note that this function does _not_ free the memory pointed to by 'picture'.
-WEBP_EXTERN(void) WebPPictureFree(WebPPicture* const picture);
+// Release the memory allocated by WebPPictureAlloc() or WebPPictureImport*().
+// Note that this function does _not_ free the memory used by the 'picture'
+// object itself.
+// Besides memory (which is reclaimed) all other fields of 'picture' are
+// preserved.
+WEBP_EXTERN(void) WebPPictureFree(WebPPicture* picture);
 
-// Copy the pixels of *src into *dst, using WebPPictureAlloc.
-// Returns 0 in case of memory allocation error.
-WEBP_EXTERN(int) WebPPictureCopy(const WebPPicture* const src,
-                                 WebPPicture* const dst);
+// Copy the pixels of *src into *dst, using WebPPictureAlloc. Upon return,
+// *dst will fully own the copied pixels (this is not a view).
+// Returns false in case of memory allocation error.
+WEBP_EXTERN(int) WebPPictureCopy(const WebPPicture* src, WebPPicture* dst);
+
+// Compute PSNR, SSIM or LSIM distortion metric between two pictures.
+// Results are in dB, stored in result[] in the Y/U/V/Alpha/All order.
+// Returns false in case of error (e.g. mismatched pic1/pic2 dimensions).
+// Warning: this function is rather CPU-intensive.
+WEBP_EXTERN(int) WebPPictureDistortion(
+    const WebPPicture* pic1, const WebPPicture* pic2,
+    int metric_type,           // 0 = PSNR, 1 = SSIM, 2 = LSIM
+    float result[5]);
 
 // self-crops a picture to the rectangle defined by top/left/width/height.
-// Returns 0 in case of memory allocation error, or if the rectangle is
+// Returns false in case of memory allocation error, or if the rectangle is
 // outside of the source picture.
-WEBP_EXTERN(int) WebPPictureCrop(WebPPicture* const picture,
+// The cropping rectangle is defined by the top-left corner pixel
+// coordinates (left, top) as well as its width and height. This rectangle
+// must be fully contained inside the source picture. If the picture uses
+// the YUV420 colorspace, the top and left coordinates will be snapped to
+// even values.
+WEBP_EXTERN(int) WebPPictureCrop(WebPPicture* picture,
                                  int left, int top, int width, int height);
 
+// Extracts a view from 'src' picture into 'dst'. The rectangle for the view
+// is defined by the top-left corner pixel coordinates (left, top) as well
+// as its width and height. This rectangle must be fully contained inside
+// the 'src' source picture. If the source picture uses the YUV420 colorspace,
+// the top and left coordinates will be snapped to even values.
+// Picture 'src' must out-live the 'dst' picture. Self-extraction of a view is
+// allowed ('src' equal to 'dst') as a means of fast-cropping (but note that
+// doing so, the original dimensions will be lost).
+// Returns false in case of memory allocation error or invalid parameters.
+WEBP_EXTERN(int) WebPPictureView(const WebPPicture* src,
+                                 int left, int top, int width, int height,
+                                 WebPPicture* dst);
+
+// Returns true if the 'picture' is actually a view and therefore does
+// not own the memory for pixels.
+WEBP_EXTERN(int) WebPPictureIsView(const WebPPicture* picture);
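
A hedged sketch of the fast-cropping idiom the view API enables: encoding a fixed-size tile of an already-allocated picture without copying its pixels. EncodeTile, the 256x256 tile size and the use of the memory writer are assumptions of this example:

  #include "webp/encode.h"

  // 'big' is assumed to be an initialized, allocated WebPPicture that
  // out-lives the view taken below.
  static int EncodeTile(const WebPConfig* const config, WebPPicture* const big,
                        int left, int top, WebPMemoryWriter* const wrt) {
    WebPPicture tile;
    int ok;
    if (!WebPPictureInit(&tile)) return 0;
    // The view shares the pixel memory of 'big' instead of copying it.
    if (!WebPPictureView(big, left, top, 256, 256, &tile)) return 0;
    WebPMemoryWriterInit(wrt);
    tile.writer = WebPMemoryWrite;        // collect the output in memory.
    tile.custom_ptr = wrt;
    ok = WebPEncode(config, &tile);
    WebPPictureFree(&tile);               // a view owns no pixel memory.
    return ok;                            // wrt->mem / wrt->size hold the data.
  }
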
+
 // Rescale a picture to new dimension width x height.
 // No gamma correction is applied.
 // Returns false in case of error (invalid parameter or insufficient memory).
-WEBP_EXTERN(int) WebPPictureRescale(WebPPicture* const pic,
-                                    int width, int height);
+WEBP_EXTERN(int) WebPPictureRescale(WebPPicture* pic, int width, int height);
 
 // Colorspace conversion function to import RGB samples.
 // Previous buffer will be free'd, if any.
 // *rgb buffer should have a size of at least height * rgb_stride.
-// Returns 0 in case of memory error.
+// Returns false in case of memory error.
 WEBP_EXTERN(int) WebPPictureImportRGB(
-    WebPPicture* const picture, const uint8_t* const rgb, int rgb_stride);
-// Same, but for RGBA buffer
+    WebPPicture* picture, const uint8_t* rgb, int rgb_stride);
+// Same, but for RGBA buffer.
 WEBP_EXTERN(int) WebPPictureImportRGBA(
-    WebPPicture* const picture, const uint8_t* const rgba, int rgba_stride);
+    WebPPicture* picture, const uint8_t* rgba, int rgba_stride);
+// Same, but for an RGBA buffer: imports the RGB samples directly from the
+// 32-bit input buffer, ignoring the alpha channel. This avoids having to copy
+// the data to a temporary 24-bit RGB buffer just to import the RGB.
+WEBP_EXTERN(int) WebPPictureImportRGBX(
+    WebPPicture* picture, const uint8_t* rgbx, int rgbx_stride);
 
-// Variant of the above, but taking BGR(A) input:
+// Variants of the above, but taking BGR(A|X) input.
 WEBP_EXTERN(int) WebPPictureImportBGR(
-    WebPPicture* const picture, const uint8_t* const bgr, int bgr_stride);
+    WebPPicture* picture, const uint8_t* bgr, int bgr_stride);
 WEBP_EXTERN(int) WebPPictureImportBGRA(
-    WebPPicture* const picture, const uint8_t* const bgra, int bgra_stride);
+    WebPPicture* picture, const uint8_t* bgra, int bgra_stride);
+WEBP_EXTERN(int) WebPPictureImportBGRX(
+    WebPPicture* picture, const uint8_t* bgrx, int bgrx_stride);
 
-//-----------------------------------------------------------------------------
+// Converts picture->argb data to the YUVA format specified by 'colorspace'.
+// Upon return, picture->use_argb is set to false. The presence of real
+// non-opaque transparent values is detected, and 'colorspace' will be
+// adjusted accordingly. Note that this method is lossy.
+// Returns false in case of error.
+WEBP_EXTERN(int) WebPPictureARGBToYUVA(WebPPicture* picture,
+                                       WebPEncCSP colorspace);
+
+// Converts picture->yuv to picture->argb and sets picture->use_argb to true.
+// The input format must be YUV_420 or YUV_420A.
+// Note that the use of this method is discouraged if one has access to the
+// raw ARGB samples, since using YUV420 is comparatively lossy. Also, the
+// conversion from YUV420 to ARGB incurs a small loss too.
+// Returns false in case of error.
+WEBP_EXTERN(int) WebPPictureYUVAToARGB(WebPPicture* picture);
+
+// Helper function: given a width x height plane of YUV(A) samples
+// (with stride 'stride'), clean up the YUV samples under fully transparent
+// areas, to help compressibility (no guarantee, though).
+WEBP_EXTERN(void) WebPCleanupTransparentArea(WebPPicture* picture);
+
+// Scan the picture 'picture' for the presence of non fully opaque alpha values.
+// Returns true in that case. Otherwise returns false (indicating, for
+// example, that the alpha plane can be ignored altogether).
+WEBP_EXTERN(int) WebPPictureHasTransparency(const WebPPicture* picture);
+
+//------------------------------------------------------------------------------
 // Main call
 
 // Main encoding call, after config and picture have been initialized.
-// 'picture' must be less than 16384x16384 in dimension, and the 'config' object
-// must be a valid one.
+// 'picture' must be less than 16384x16384 in dimension (cf WEBP_MAX_DIMENSION),
+// and the 'config' object must be a valid one.
 // Returns false in case of error, true otherwise.
 // In case of error, picture->error_code is updated accordingly.
-WEBP_EXTERN(int) WebPEncode(
-    const WebPConfig* const config, WebPPicture* const picture);
+// 'picture' can hold the source samples in either YUV(A) or ARGB format,
+// depending on the value of 'picture->use_argb'. It is highly recommended to
+// use the former for lossy encoding, and the latter for lossless encoding
+// (when config.lossless is true). Automatic conversion from one format to
+// another is provided, but both conversions incur some loss.
+WEBP_EXTERN(int) WebPEncode(const WebPConfig* config, WebPPicture* picture);
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
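
Putting this header's pieces together, a minimal end-to-end sketch: import a packed RGBA buffer, route the output through a WebPMemoryWriter, and encode either lossily or losslessly. CompressRGBA is hypothetical, and releasing the writer's buffer with free() is an assumption modeled on the one-shot API above:

  #include <stdlib.h>
  #include "webp/encode.h"

  // Returns the compressed size, or 0 on error (*out is left untouched then).
  static size_t CompressRGBA(const uint8_t* rgba, int width, int height,
                             int lossless, uint8_t** out) {
    WebPConfig config;
    WebPPicture pic;
    WebPMemoryWriter wrt;
    size_t size = 0;
    int ok;

    if (!WebPConfigInit(&config) || !WebPPictureInit(&pic)) return 0;
    config.lossless = lossless;
    config.quality = 75.f;

    pic.width = width;
    pic.height = height;
    pic.use_argb = lossless;         // ARGB input is recommended for lossless.
    ok = WebPPictureImportRGBA(&pic, rgba, 4 * width);  // packed RGBA stride.
    if (ok) {
      WebPMemoryWriterInit(&wrt);
      pic.writer = WebPMemoryWrite;
      pic.custom_ptr = &wrt;
      ok = WebPEncode(&config, &pic);
      if (ok) {
        *out = wrt.mem;              // to be free()'d by the caller.
        size = wrt.size;
      } else {
        free(wrt.mem);               // inspect pic.error_code for the reason.
      }
    }
    WebPPictureFree(&pic);
    return size;
  }
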
diff --git a/include/webp/format_constants.h b/include/webp/format_constants.h
new file mode 100644
index 0000000..7ce498f
--- /dev/null
+++ b/include/webp/format_constants.h
@@ -0,0 +1,90 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+//  Internal header for constants related to WebP file format.
+//
+// Author: Urvang (urvang@google.com)
+
+#ifndef WEBP_WEBP_FORMAT_CONSTANTS_H_
+#define WEBP_WEBP_FORMAT_CONSTANTS_H_
+
+// VP8 related constants.
+#define VP8_SIGNATURE 0x9d012a              // Signature in VP8 data.
+#define VP8_MAX_PARTITION0_SIZE (1 << 19)   // max size of mode partition
+#define VP8_MAX_PARTITION_SIZE  (1 << 24)   // max size for token partition
+#define VP8_FRAME_HEADER_SIZE 10  // Size of the frame header within VP8 data.
+
+// VP8L related constants.
+#define VP8L_SIGNATURE_SIZE          1      // VP8L signature size.
+#define VP8L_MAGIC_BYTE              0x2f   // VP8L signature byte.
+#define VP8L_IMAGE_SIZE_BITS         14     // Number of bits used to store
+                                            // width and height.
+#define VP8L_VERSION_BITS            3      // 3 bits reserved for version.
+#define VP8L_VERSION                 0      // version 0
+#define VP8L_FRAME_HEADER_SIZE       5      // Size of the VP8L frame header.
+
+#define MAX_PALETTE_SIZE             256
+#define MAX_CACHE_BITS               11
+#define HUFFMAN_CODES_PER_META_CODE  5
+#define ARGB_BLACK                   0xff000000
+
+#define DEFAULT_CODE_LENGTH          8
+#define MAX_ALLOWED_CODE_LENGTH      15
+
+#define NUM_LITERAL_CODES            256
+#define NUM_LENGTH_CODES             24
+#define NUM_DISTANCE_CODES           40
+#define CODE_LENGTH_CODES            19
+
+#define MIN_HUFFMAN_BITS             2  // min number of Huffman bits
+#define MAX_HUFFMAN_BITS             9  // max number of Huffman bits
+
+#define TRANSFORM_PRESENT            1  // The bit to be written when next data
+                                        // to be read is a transform.
+#define NUM_TRANSFORMS               4  // Maximum number of allowed transforms
+                                        // in a bitstream.
+typedef enum {
+  PREDICTOR_TRANSFORM      = 0,
+  CROSS_COLOR_TRANSFORM    = 1,
+  SUBTRACT_GREEN           = 2,
+  COLOR_INDEXING_TRANSFORM = 3
+} VP8LImageTransformType;
+
+// Alpha related constants.
+#define ALPHA_HEADER_LEN            1
+#define ALPHA_NO_COMPRESSION        0
+#define ALPHA_LOSSLESS_COMPRESSION  1
+#define ALPHA_PREPROCESSED_LEVELS   1
+
+// Mux related constants.
+#define TAG_SIZE           4     // Size of a chunk tag (e.g. "VP8L").
+#define CHUNK_SIZE_BYTES   4     // Size needed to store chunk's size.
+#define CHUNK_HEADER_SIZE  8     // Size of a chunk header.
+#define RIFF_HEADER_SIZE   12    // Size of the RIFF header ("RIFFnnnnWEBP").
+#define FRAME_CHUNK_SIZE   15    // Size of a FRM chunk.
+#define LOOP_CHUNK_SIZE    2     // Size of a LOOP chunk.
+#define TILE_CHUNK_SIZE    6     // Size of a TILE chunk.
+#define VP8X_CHUNK_SIZE    10    // Size of a VP8X chunk.
+
+#define TILING_FLAG_BIT    0x01  // Set if tiles are possibly used.
+#define ANIMATION_FLAG_BIT 0x02  // Set if some animation is expected
+#define ICC_FLAG_BIT       0x04  // Whether ICC is present or not.
+#define METADATA_FLAG_BIT  0x08  // Set if some META chunk is possibly present.
+#define ALPHA_FLAG_BIT     0x10  // Should be same as the ALPHA_FLAG in mux.h
+#define ROTATION_FLAG_BITS 0xe0  // all 3 bits for rotation + symmetry
+
+#define MAX_CANVAS_SIZE     (1 << 24)    // 24-bit max for VP8X width/height.
+#define MAX_IMAGE_AREA      (1ULL << 32) // 32-bit max for width x height.
+#define MAX_LOOP_COUNT      (1 << 16)    // maximum value for loop-count
+#define MAX_DURATION        (1 << 24)    // maximum duration
+#define MAX_POSITION_OFFSET (1 << 24)    // maximum frame/tile x/y offset
+
+// Maximum chunk payload is such that adding the header and padding won't
+// overflow a uint32_t.
+#define MAX_CHUNK_PAYLOAD (~0U - CHUNK_HEADER_SIZE - 1)
+
+#endif  /* WEBP_WEBP_FORMAT_CONSTANTS_H_ */
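
For illustration, a hedged sketch that uses these constants to probe a buffer: it assumes the standard RIFF layout ('RIFF' tag, 4-byte size, 'WEBP' tag) followed by a chunk header, and checks whether the first chunk is a VP8L bitstream:

  #include <string.h>
  #include "webp/format_constants.h"
  #include "webp/types.h"

  // Hypothetical probe: does 'data' look like a WebP container whose first
  // chunk carries a lossless (VP8L) bitstream?
  static int LooksLikeLosslessWebP(const uint8_t* data, size_t size) {
    if (size < RIFF_HEADER_SIZE + CHUNK_HEADER_SIZE + VP8L_SIGNATURE_SIZE) {
      return 0;
    }
    if (memcmp(data, "RIFF", TAG_SIZE) != 0 ||
        memcmp(data + 8, "WEBP", TAG_SIZE) != 0) {
      return 0;    // not a RIFF/WEBP container.
    }
    if (memcmp(data + RIFF_HEADER_SIZE, "VP8L", TAG_SIZE) != 0) {
      return 0;    // first chunk is not a VP8L bitstream.
    }
    // The payload starts right after the chunk header and must begin with
    // the one-byte signature 0x2f.
    return data[RIFF_HEADER_SIZE + CHUNK_HEADER_SIZE] == VP8L_MAGIC_BYTE;
  }
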
diff --git a/include/webp/types.h b/include/webp/types.h
index c48527b..3e27190 100644
--- a/include/webp/types.h
+++ b/include/webp/types.h
@@ -1,4 +1,4 @@
-// Copyright 2010 Google Inc.
+// Copyright 2010 Google Inc. All Rights Reserved.
 //
 // This code is licensed under the same terms as WebM:
 //  Software License Agreement:  http://www.webmproject.org/license/software/
@@ -12,11 +12,15 @@
 #ifndef WEBP_WEBP_TYPES_H_
 #define WEBP_WEBP_TYPES_H_
 
+#include <stddef.h>  // for size_t
+
 #ifndef _MSC_VER
 #include <inttypes.h>
-#ifdef ANSI
-#define inline
-#endif  /* ANSI */
+#ifdef __STRICT_ANSI__
+#define WEBP_INLINE
+#else  /* __STRICT_ANSI__ */
+#define WEBP_INLINE inline
+#endif
 #else
 typedef signed   char int8_t;
 typedef unsigned char uint8_t;
@@ -26,7 +30,7 @@
 typedef unsigned int uint32_t;
 typedef unsigned long long int uint64_t;
 typedef long long int int64_t;
-#define inline __forceinline
+#define WEBP_INLINE __forceinline
 #endif  /* _MSC_VER */
 
 #ifndef WEBP_EXTERN
@@ -35,4 +39,7 @@
 #define WEBP_EXTERN(type) extern type
 #endif  /* WEBP_EXTERN */
 
+// Macro to check ABI compatibility (same major revision number)
+#define WEBP_ABI_IS_INCOMPATIBLE(a, b) (((a) >> 8) != ((b) >> 8))
+
 #endif  /* WEBP_WEBP_TYPES_H_ */
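
A small sketch of how WEBP_ABI_IS_INCOMPATIBLE() is meant to be used by the version-checked entry points elsewhere in this package; MyInit, MyInitInternal and the version value are hypothetical:

  #include "webp/types.h"

  #define MY_ABI_VERSION 0x0200   // illustrative value: MAJOR(8b) + MINOR(8b)

  // Only the major byte has to match for the caller to be considered
  // compatible with this (hypothetical) library revision.
  static int MyInitInternal(void* const object, int version) {
    if (WEBP_ABI_IS_INCOMPATIBLE(version, MY_ABI_VERSION)) {
      return 0;    // compiled against a different major revision.
    }
    (void)object;  // real initialization would go here.
    return 1;
  }

  static WEBP_INLINE int MyInit(void* const object) {
    return MyInitInternal(object, MY_ABI_VERSION);
  }
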
diff --git a/src/dec/Android.mk b/src/dec/Android.mk
index 2f15866..ab795ae 100644
--- a/src/dec/Android.mk
+++ b/src/dec/Android.mk
@@ -16,27 +16,43 @@
 
 include $(CLEAR_VARS)
 LOCAL_SRC_FILES := \
-	alpha.c \
-	bits.c \
-	buffer.c \
-	dsp.c \
-	dsp_sse2.c \
-	frame.c \
-	idec.c \
-	io.c \
-	io_sse2.c \
-	layer.c \
-	quant.c \
-	tree.c \
-	vp8.c \
-	webp.c \
-	yuv.c
+        alpha.c \
+        buffer.c \
+        frame.c \
+        idec.c \
+        io.c \
+        layer.c \
+        quant.c \
+        tree.c \
+        vp8.c \
+        vp8l.c \
+        webp.c \
+        ../dsp/cpu.c \
+        ../dsp/dec.c \
+        ../dsp/dec_neon.c \
+        ../dsp/dec_sse2.c \
+        ../dsp/enc.c \
+        ../dsp/enc_sse2.c \
+        ../dsp/lossless.c \
+        ../dsp/upsampling.c \
+        ../dsp/upsampling_sse2.c \
+        ../dsp/yuv.c \
+        ../utils/bit_reader.c \
+        ../utils/bit_writer.c \
+        ../utils/color_cache.c \
+        ../utils/filters.c \
+        ../utils/huffman.c \
+        ../utils/huffman_encode.c \
+        ../utils/quant_levels.c \
+        ../utils/rescaler.c \
+        ../utils/thread.c \
+        ../utils/utils.c
 
 LOCAL_CFLAGS := -DANDROID
 
 LOCAL_C_INCLUDES += \
-	$(LOCAL_PATH) \
-	$(LOCAL_PATH)/../../include
+        $(LOCAL_PATH) \
+        $(LOCAL_PATH)/../../include
 
 LOCAL_MODULE:= libwebp-decode
 
diff --git a/src/dec/alpha.c b/src/dec/alpha.c
index 585695e..3c19a89 100644
--- a/src/dec/alpha.c
+++ b/src/dec/alpha.c
@@ -1,4 +1,4 @@
-// Copyright 2011 Google Inc.
+// Copyright 2011 Google Inc. All Rights Reserved.
 //
 // This code is licensed under the same terms as WebM:
 //  Software License Agreement:  http://www.webmproject.org/license/software/
@@ -10,60 +10,132 @@
 // Author: Skal (pascal.massimino@gmail.com)
 
 #include <stdlib.h>
-#include "vp8i.h"
-
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-
-#include "zlib.h"
+#include "./vp8i.h"
+#include "./vp8li.h"
+#include "../utils/filters.h"
+#include "../utils/quant_levels.h"
+#include "webp/format_constants.h"
 
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 
-//-----------------------------------------------------------------------------
+// TODO(skal): move to dsp/ ?
+static void CopyPlane(const uint8_t* src, int src_stride,
+                      uint8_t* dst, int dst_stride, int width, int height) {
+  while (height-- > 0) {
+    memcpy(dst, src, width);
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+//------------------------------------------------------------------------------
+// Decodes the compressed alpha data 'data' of size 'data_size' into 'output'.
+// The 'output' buffer should be pre-allocated and must be of dimension
+// 'height' x 'stride', matching that of the image.
+//
+// Returns 1 if the compressed alpha plane was decoded successfully, and
+//         0 if either:
+//           the bit-stream header is invalid (bad compression mode or filter),
+//           or the selected decompression method reported an error.
+
+static int DecodeAlpha(const uint8_t* data, size_t data_size,
+                       int width, int height, int stride, uint8_t* output) {
+  uint8_t* decoded_data = NULL;
+  const size_t decoded_size = height * width;
+  uint8_t* unfiltered_data = NULL;
+  WEBP_FILTER_TYPE filter;
+  int pre_processing;
+  int rsrv;
+  int ok = 0;
+  int method;
+
+  assert(width > 0 && height > 0 && stride >= width);
+  assert(data != NULL && output != NULL);
+
+  if (data_size <= ALPHA_HEADER_LEN) {
+    return 0;
+  }
+
+  method = (data[0] >> 0) & 0x03;
+  filter = (data[0] >> 2) & 0x03;
+  pre_processing = (data[0] >> 4) & 0x03;
+  rsrv = (data[0] >> 6) & 0x03;
+  if (method < ALPHA_NO_COMPRESSION ||
+      method > ALPHA_LOSSLESS_COMPRESSION ||
+      filter >= WEBP_FILTER_LAST ||
+      pre_processing > ALPHA_PREPROCESSED_LEVELS ||
+      rsrv != 0) {
+    return 0;
+  }
+
+  if (method == ALPHA_NO_COMPRESSION) {
+    ok = (data_size >= decoded_size);
+    decoded_data = (uint8_t*)data + ALPHA_HEADER_LEN;
+  } else {
+    decoded_data = (uint8_t*)malloc(decoded_size);
+    if (decoded_data == NULL) return 0;
+    ok = VP8LDecodeAlphaImageStream(width, height,
+                                    data + ALPHA_HEADER_LEN,
+                                    data_size - ALPHA_HEADER_LEN,
+                                    decoded_data);
+  }
+
+  if (ok) {
+    WebPFilterFunc unfilter_func = WebPUnfilters[filter];
+    if (unfilter_func != NULL) {
+      unfiltered_data = (uint8_t*)malloc(decoded_size);
+      if (unfiltered_data == NULL) {
+        ok = 0;
+        goto Error;
+      }
+      // TODO(vikas): Implement on-the-fly decoding & filter mechanism to decode
+      // and apply filter per image-row.
+      unfilter_func(decoded_data, width, height, 1, width, unfiltered_data);
+      // Construct raw_data (height x stride) from alpha data (height x width).
+      CopyPlane(unfiltered_data, width, output, stride, width, height);
+      free(unfiltered_data);
+    } else {
+      // Construct raw_data (height x stride) from alpha data (height x width).
+      CopyPlane(decoded_data, width, output, stride, width, height);
+    }
+    if (pre_processing == ALPHA_PREPROCESSED_LEVELS) {
+      ok = DequantizeLevels(decoded_data, width, height);
+    }
+  }
+
+ Error:
+  if (method != ALPHA_NO_COMPRESSION) {
+    free(decoded_data);
+  }
+  return ok;
+}
+
+//------------------------------------------------------------------------------
 
 const uint8_t* VP8DecompressAlphaRows(VP8Decoder* const dec,
                                       int row, int num_rows) {
-  uint8_t* output = dec->alpha_plane_;
   const int stride = dec->pic_hdr_.width_;
-  if (row < 0 || row + num_rows > dec->pic_hdr_.height_) {
-    return NULL;    // sanity check
+
+  if (row < 0 || num_rows < 0 || row + num_rows > dec->pic_hdr_.height_) {
+    return NULL;    // sanity check.
   }
+
   if (row == 0) {
-    // TODO(skal): for now, we just decompress everything during the first call.
-    // Later, we'll decode progressively, but we need to store the
-    // z_stream state.
-    const uint8_t* data = dec->alpha_data_;
-    size_t data_size = dec->alpha_data_size_;
-    const size_t output_size = stride * dec->pic_hdr_.height_;
-    int ret = Z_OK;
-    z_stream strm;
-
-    memset(&strm, 0, sizeof(strm));
-    if (inflateInit(&strm) != Z_OK) {
-      return 0;
-    }
-    strm.avail_in = data_size;
-    strm.next_in = (unsigned char*)data;
-    do {
-      strm.avail_out = output_size;
-      strm.next_out = output;
-      ret = inflate(&strm, Z_NO_FLUSH);
-      if (ret == Z_NEED_DICT || ret == Z_DATA_ERROR || ret == Z_MEM_ERROR) {
-        break;
-      }
-    } while (strm.avail_out == 0);
-
-    inflateEnd(&strm);
-    if (ret != Z_STREAM_END) {
-      return NULL;    // error
+    // Decode everything during the first call.
+    if (!DecodeAlpha(dec->alpha_data_, (size_t)dec->alpha_data_size_,
+                     dec->pic_hdr_.width_, dec->pic_hdr_.height_, stride,
+                     dec->alpha_plane_)) {
+      // TODO(urvang): Add a test where DecodeAlpha fails to test this.
+      return NULL;  // Error.
     }
   }
-  return output + row * stride;
+
+  // Return a pointer to the current decoded row.
+  return dec->alpha_plane_ + row * stride;
 }
 
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
-
-#endif    // WEBP_EXPERIMENTAL_FEATURES
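
For reference, a hedged sketch of the one-byte alpha-chunk header that DecodeAlpha() parses above; it mirrors the decoder's bit layout and is not the actual encoder-side code:

  #include "webp/format_constants.h"
  #include "webp/types.h"

  // Bits 0-1: compression method, 2-3: filter, 4-5: pre-processing,
  // 6-7: reserved (must be 0).
  static WEBP_INLINE uint8_t PackAlphaHeader(int method, int filter,
                                             int pre_processing) {
    return (uint8_t)(((method         & 0x03) << 0) |
                     ((filter         & 0x03) << 2) |
                     ((pre_processing & 0x03) << 4));
  }
  // e.g. PackAlphaHeader(ALPHA_LOSSLESS_COMPRESSION, 0, 0) yields 0x01.
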
diff --git a/src/dec/bits.c b/src/dec/bits.c
deleted file mode 100644
index da3b777..0000000
--- a/src/dec/bits.c
+++ /dev/null
@@ -1,79 +0,0 @@
-// Copyright 2010 Google Inc.
-//
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
-// -----------------------------------------------------------------------------
-//
-// Boolean decoder
-//
-// Author: Skal (pascal.massimino@gmail.com)
-
-#include "bits.h"
-
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
-//-----------------------------------------------------------------------------
-// VP8BitReader
-
-void VP8InitBitReader(VP8BitReader* const br,
-                      const uint8_t* const start, const uint8_t* const end) {
-  assert(br);
-  assert(start);
-  assert(start <= end);
-  br->range_   = 255 - 1;
-  br->buf_     = start;
-  br->buf_end_ = end;
-  br->value_   = 0;
-  br->missing_ = 8;
-  br->eof_     = 0;
-}
-
-const uint8_t kVP8Log2Range[128] = {
-     7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
-  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  0
-};
-
-// range = ((range + 1) << kVP8Log2Range[range]) - 1
-const uint8_t kVP8NewRange[128] = {
-  127, 127, 191, 127, 159, 191, 223, 127, 143, 159, 175, 191, 207, 223, 239,
-  127, 135, 143, 151, 159, 167, 175, 183, 191, 199, 207, 215, 223, 231, 239,
-  247, 127, 131, 135, 139, 143, 147, 151, 155, 159, 163, 167, 171, 175, 179,
-  183, 187, 191, 195, 199, 203, 207, 211, 215, 219, 223, 227, 231, 235, 239,
-  243, 247, 251, 127, 129, 131, 133, 135, 137, 139, 141, 143, 145, 147, 149,
-  151, 153, 155, 157, 159, 161, 163, 165, 167, 169, 171, 173, 175, 177, 179,
-  181, 183, 185, 187, 189, 191, 193, 195, 197, 199, 201, 203, 205, 207, 209,
-  211, 213, 215, 217, 219, 221, 223, 225, 227, 229, 231, 233, 235, 237, 239,
-  241, 243, 245, 247, 249, 251, 253, 127
-};
-
-//-----------------------------------------------------------------------------
-// Higher-level calls
-
-uint32_t VP8GetValue(VP8BitReader* const br, int bits) {
-  uint32_t v = 0;
-  while (bits-- > 0) {
-    v |= VP8GetBit(br, 0x80) << bits;
-  }
-  return v;
-}
-
-int32_t VP8GetSignedValue(VP8BitReader* const br, int bits) {
-  const int value = VP8GetValue(br, bits);
-  return VP8Get(br) ? -value : value;
-}
-
-//-----------------------------------------------------------------------------
-
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
diff --git a/src/dec/bits.h b/src/dec/bits.h
deleted file mode 100644
index e33b0f8..0000000
--- a/src/dec/bits.h
+++ /dev/null
@@ -1,108 +0,0 @@
-// Copyright 2010 Google Inc.
-//
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
-// -----------------------------------------------------------------------------
-//
-// Boolean decoder
-//
-// Author: Skal (pascal.massimino@gmail.com)
-
-#ifndef WEBP_DEC_BITS_H_
-#define WEBP_DEC_BITS_H_
-
-#include <assert.h>
-#include "webp/decode_vp8.h"
-
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
-//-----------------------------------------------------------------------------
-// Bitreader and code-tree reader
-
-typedef struct {
-  const uint8_t* buf_;        // next byte to be read
-  const uint8_t* buf_end_;    // end of read buffer
-  int eof_;                   // true if input is exhausted
-
-  // boolean decoder
-  uint32_t range_;            // current range minus 1. In [127, 254] interval.
-  uint32_t value_;            // current value
-  int missing_;               // number of missing bits in value_ (8bit)
-} VP8BitReader;
-
-// Initialize the bit reader and the boolean decoder.
-void VP8InitBitReader(VP8BitReader* const br,
-                      const uint8_t* const start, const uint8_t* const end);
-
-// return the next value made of 'num_bits' bits
-uint32_t VP8GetValue(VP8BitReader* const br, int num_bits);
-static inline uint32_t VP8Get(VP8BitReader* const br) {
-  return VP8GetValue(br, 1);
-}
-
-// return the next value with sign-extension.
-int32_t VP8GetSignedValue(VP8BitReader* const br, int num_bits);
-
-// Read a bit with proba 'prob'. Speed-critical function!
-extern const uint8_t kVP8Log2Range[128];
-extern const uint8_t kVP8NewRange[128];
-static inline uint32_t VP8GetByte(VP8BitReader* const br) {
-  assert(br);
-  if (br->buf_ < br->buf_end_) {
-    assert(br->buf_);
-    return *br->buf_++;
-  }
-  br->eof_ = 1;
-  return 0xff;
-}
-
-static inline uint32_t VP8BitUpdate(VP8BitReader* const br, uint32_t split) {
-  uint32_t bit;
-  const uint32_t value_split = (split + 1) << 8;
-  // Make sure we have a least 8 bits in 'value_'
-  if (br->missing_ > 0) {
-    br->value_ |= VP8GetByte(br) << br->missing_;
-    br->missing_ -= 8;
-  }
-  bit = (br->value_ >= value_split);
-  if (bit) {
-    br->range_ -= split + 1;
-    br->value_ -= value_split;
-  } else {
-    br->range_ = split;
-  }
-  return bit;
-}
-
-static inline void VP8Shift(VP8BitReader* const br) {
-  // range_ is in [0..127] interval here.
-  const int shift = kVP8Log2Range[br->range_];
-  br->range_ = kVP8NewRange[br->range_];
-  br->value_ <<= shift;
-  br->missing_ += shift;
-}
-
-static inline uint32_t VP8GetBit(VP8BitReader* const br, int prob) {
-  const uint32_t split = (br->range_ * prob) >> 8;
-  const uint32_t bit = VP8BitUpdate(br, split);
-  if (br->range_ < 0x7f) {
-    VP8Shift(br);
-  }
-  return bit;
-}
-
-static inline int VP8GetSigned(VP8BitReader* const br, int v) {
-  const uint32_t split = br->range_ >> 1;
-  const uint32_t bit = VP8BitUpdate(br, split);
-  VP8Shift(br);
-  return bit ? -v : v;
-}
-
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
-
-#endif  // WEBP_DEC_BITS_H_
diff --git a/src/dec/buffer.c b/src/dec/buffer.c
index c433d63..c159f6f 100644
--- a/src/dec/buffer.c
+++ b/src/dec/buffer.c
@@ -1,4 +1,4 @@
-// Copyright 2011 Google Inc.
+// Copyright 2011 Google Inc. All Rights Reserved.
 //
 // This code is licensed under the same terms as WebM:
 //  Software License Agreement:  http://www.webmproject.org/license/software/
@@ -10,44 +10,64 @@
 // Author: Skal (pascal.massimino@gmail.com)
 
 #include <stdlib.h>
-#include "vp8i.h"
-#include "webpi.h"
+
+#include "./vp8i.h"
+#include "./webpi.h"
+#include "../utils/utils.h"
 
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // WebPDecBuffer
 
 // Number of bytes per pixel for the different color-spaces.
-static const int kModeBpp[MODE_LAST] = { 3, 4, 3, 4, 4, 2, 2, 1, 1 };
+static const int kModeBpp[MODE_LAST] = {
+  3, 4, 3, 4, 4, 2, 2,
+  4, 4, 4, 2,    // pre-multiplied modes
+  1, 1 };
+
+// Check that webp_csp_mode is within the bounds of WEBP_CSP_MODE.
+// Convert to an integer to handle both the unsigned/signed enum cases
+// without the need for casting to remove type limit warnings.
+static int IsValidColorspace(int webp_csp_mode) {
+  return (webp_csp_mode >= MODE_RGB && webp_csp_mode < MODE_LAST);
+}
 
 static VP8StatusCode CheckDecBuffer(const WebPDecBuffer* const buffer) {
   int ok = 1;
-  WEBP_CSP_MODE mode = buffer->colorspace;
+  const WEBP_CSP_MODE mode = buffer->colorspace;
   const int width = buffer->width;
   const int height = buffer->height;
-  if (mode >= MODE_YUV) {   // YUV checks
+  if (!IsValidColorspace(mode)) {
+    ok = 0;
+  } else if (!WebPIsRGBMode(mode)) {   // YUV checks
     const WebPYUVABuffer* const buf = &buffer->u.YUVA;
-    const int size = buf->y_stride * height;
-    const int u_size = buf->u_stride * ((height + 1) / 2);
-    const int v_size = buf->v_stride * ((height + 1) / 2);
-    const int a_size = buf->a_stride * height;
-    ok &= (size <= buf->y_size);
+    const uint64_t y_size = (uint64_t)buf->y_stride * height;
+    const uint64_t u_size = (uint64_t)buf->u_stride * ((height + 1) / 2);
+    const uint64_t v_size = (uint64_t)buf->v_stride * ((height + 1) / 2);
+    const uint64_t a_size = (uint64_t)buf->a_stride * height;
+    ok &= (y_size <= buf->y_size);
     ok &= (u_size <= buf->u_size);
     ok &= (v_size <= buf->v_size);
-    ok &= (a_size <= buf->a_size);
     ok &= (buf->y_stride >= width);
     ok &= (buf->u_stride >= (width + 1) / 2);
     ok &= (buf->v_stride >= (width + 1) / 2);
-    if (buf->a) {
+    ok &= (buf->y != NULL);
+    ok &= (buf->u != NULL);
+    ok &= (buf->v != NULL);
+    if (mode == MODE_YUVA) {
       ok &= (buf->a_stride >= width);
+      ok &= (a_size <= buf->a_size);
+      ok &= (buf->a != NULL);
     }
   } else {    // RGB checks
     const WebPRGBABuffer* const buf = &buffer->u.RGBA;
-    ok &= (buf->stride * height <= buf->size);
+    const uint64_t size = (uint64_t)buf->stride * height;
+    ok &= (size <= buf->size);
     ok &= (buf->stride >= width * kModeBpp[mode]);
+    ok &= (buf->rgba != NULL);
   }
   return ok ? VP8_STATUS_OK : VP8_STATUS_INVALID_PARAM;
 }
@@ -55,24 +75,22 @@
 static VP8StatusCode AllocateBuffer(WebPDecBuffer* const buffer) {
   const int w = buffer->width;
   const int h = buffer->height;
+  const WEBP_CSP_MODE mode = buffer->colorspace;
 
-  if (w <= 0 || h <= 0) {
+  if (w <= 0 || h <= 0 || !IsValidColorspace(mode)) {
     return VP8_STATUS_INVALID_PARAM;
   }
 
   if (!buffer->is_external_memory && buffer->private_memory == NULL) {
     uint8_t* output;
-    WEBP_CSP_MODE mode = buffer->colorspace;
-    int stride;
     int uv_stride = 0, a_stride = 0;
-    int uv_size = 0;
-    uint64_t size, a_size = 0, total_size;
+    uint64_t uv_size = 0, a_size = 0, total_size;
     // We need memory and it hasn't been allocated yet.
     // => initialize output buffer, now that dimensions are known.
-    stride = w * kModeBpp[mode];
-    size = (uint64_t)stride * h;
+    const int stride = w * kModeBpp[mode];
+    const uint64_t size = (uint64_t)stride * h;
 
-    if (mode >= MODE_YUV) {
+    if (!WebPIsRGBMode(mode)) {
       uv_stride = (w + 1) / 2;
       uv_size = (uint64_t)uv_stride * ((h + 1) / 2);
       if (mode == MODE_YUVA) {
@@ -83,36 +101,33 @@
     total_size = size + 2 * uv_size + a_size;
 
     // Security/sanity checks
-    if (((size_t)total_size != total_size) || (total_size >= (1ULL << 40))) {
-      return VP8_STATUS_INVALID_PARAM;
-    }
-
-    buffer->private_memory = output = (uint8_t*)malloc((size_t)total_size);
+    output = (uint8_t*)WebPSafeMalloc(total_size, sizeof(*output));
     if (output == NULL) {
       return VP8_STATUS_OUT_OF_MEMORY;
     }
+    buffer->private_memory = output;
 
-    if (mode >= MODE_YUV) {   // YUVA initialization
+    if (!WebPIsRGBMode(mode)) {   // YUVA initialization
       WebPYUVABuffer* const buf = &buffer->u.YUVA;
       buf->y = output;
       buf->y_stride = stride;
-      buf->y_size = size;
+      buf->y_size = (size_t)size;
       buf->u = output + size;
       buf->u_stride = uv_stride;
-      buf->u_size = uv_size;
+      buf->u_size = (size_t)uv_size;
       buf->v = output + size + uv_size;
       buf->v_stride = uv_stride;
-      buf->v_size = uv_size;
+      buf->v_size = (size_t)uv_size;
       if (mode == MODE_YUVA) {
         buf->a = output + size + 2 * uv_size;
       }
-      buf->a_size = a_size;
+      buf->a_size = (size_t)a_size;
       buf->a_stride = a_stride;
     } else {  // RGBA initialization
       WebPRGBABuffer* const buf = &buffer->u.RGBA;
       buf->rgba = output;
       buf->stride = stride;
-      buf->size = size;
+      buf->size = (size_t)size;
     }
   }
   return CheckDecBuffer(buffer);
@@ -140,7 +155,7 @@
       if (options->scaled_width <= 0 || options->scaled_height <= 0) {
         return VP8_STATUS_INVALID_PARAM;
       }
-      w  = options->scaled_width;
+      w = options->scaled_width;
       h = options->scaled_height;
     }
   }
@@ -151,18 +166,20 @@
   return AllocateBuffer(out);
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // constructors / destructors
 
-int WebPInitDecBufferInternal(WebPDecBuffer* const buffer, int version) {
-  if (version != WEBP_DECODER_ABI_VERSION) return 0;  // version mismatch
-  if (!buffer) return 0;
+int WebPInitDecBufferInternal(WebPDecBuffer* buffer, int version) {
+  if (WEBP_ABI_IS_INCOMPATIBLE(version, WEBP_DECODER_ABI_VERSION)) {
+    return 0;  // version mismatch
+  }
+  if (buffer == NULL) return 0;
   memset(buffer, 0, sizeof(*buffer));
   return 1;
 }
 
-void WebPFreeDecBuffer(WebPDecBuffer* const buffer) {
-  if (buffer) {
+void WebPFreeDecBuffer(WebPDecBuffer* buffer) {
+  if (buffer != NULL) {
     if (!buffer->is_external_memory)
       free(buffer->private_memory);
     buffer->private_memory = NULL;
@@ -171,9 +188,9 @@
 
 void WebPCopyDecBuffer(const WebPDecBuffer* const src,
                        WebPDecBuffer* const dst) {
-  if (src && dst) {
+  if (src != NULL && dst != NULL) {
     *dst = *src;
-    if (src->private_memory) {
+    if (src->private_memory != NULL) {
       dst->is_external_memory = 1;   // dst buffer doesn't own the memory.
       dst->private_memory = NULL;
     }
@@ -182,16 +199,16 @@
 
 // Copy and transfer ownership from src to dst (beware of parameter order!)
 void WebPGrabDecBuffer(WebPDecBuffer* const src, WebPDecBuffer* const dst) {
-  if (src && dst) {
+  if (src != NULL && dst != NULL) {
     *dst = *src;
-    if (src->private_memory) {
+    if (src->private_memory != NULL) {
       src->is_external_memory = 1;   // src relinquishes ownership
       src->private_memory = NULL;
     }
   }
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
diff --git a/src/dec/decode_vp8.h b/src/dec/decode_vp8.h
new file mode 100644
index 0000000..ee914c8
--- /dev/null
+++ b/src/dec/decode_vp8.h
@@ -0,0 +1,182 @@
+// Copyright 2010 Google Inc. All Rights Reserved.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+//  Low-level API for VP8 decoder
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#ifndef WEBP_WEBP_DECODE_VP8_H_
+#define WEBP_WEBP_DECODE_VP8_H_
+
+#include "webp/decode.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+//------------------------------------------------------------------------------
+// Lower-level API
+//
+// These functions provide fine-grained control of the decoding process.
+// The call flow should resemble:
+//
+//   VP8Io io;
+//   VP8InitIo(&io);
+//   io.data = data;
+//   io.data_size = size;
+//   /* customize io's functions (setup()/put()/teardown()) if needed. */
+//
+//   VP8Decoder* dec = VP8New();
+//   bool ok = VP8Decode(dec, &io);
+//   if (!ok) printf("Error: %s\n", VP8StatusMessage(dec));
+//   VP8Delete(dec);
+//   return ok;
+
+// Input / Output
+typedef struct VP8Io VP8Io;
+typedef int (*VP8IoPutHook)(const VP8Io* io);
+typedef int (*VP8IoSetupHook)(VP8Io* io);
+typedef void (*VP8IoTeardownHook)(const VP8Io* io);
+
+struct VP8Io {
+  // set by VP8GetHeaders()
+  int width, height;         // picture dimensions, in pixels (invariable).
+                             // These are the original, uncropped dimensions.
+                             // The actual area passed to put() is stored
+                             // in mb_w / mb_h fields.
+
+  // set before calling put()
+  int mb_y;                  // position of the current rows (in pixels)
+  int mb_w;                  // number of columns in the sample
+  int mb_h;                  // number of rows in the sample
+  const uint8_t* y, *u, *v;  // rows to copy (in yuv420 format)
+  int y_stride;              // row stride for luma
+  int uv_stride;             // row stride for chroma
+
+  void* opaque;              // user data
+
+  // called when fresh samples are available. Currently, samples are in
+  // YUV420 format, and can be up to width x 24 in size (depending on the
+  // in-loop filtering level, e.g.). Should return false in case of error
+  // or abort request. The actual size of the area to update is mb_w x mb_h
+  // in size, taking cropping into account.
+  VP8IoPutHook put;
+
+  // called just before starting to decode the blocks.
+  // Must return false in case of setup error, true otherwise. If false is
+  // returned, teardown() will NOT be called. But if the setup succeeded
+  // and true is returned, then teardown() will always be called afterward.
+  VP8IoSetupHook setup;
+
+  // Called just after block decoding is finished (or when an error occurred
+  // during put()). Is NOT called if setup() failed.
+  VP8IoTeardownHook teardown;
+
+  // this is a recommendation for the user-side yuv->rgb converter. This flag
+  // is set when calling setup() hook and can be overwritten by it. It then
+  // can be taken into consideration during the put() method.
+  int fancy_upsampling;
+
+  // Input buffer.
+  size_t data_size;
+  const uint8_t* data;
+
+  // If true, in-loop filtering will not be performed even if present in the
+  // bitstream. Switching off filtering may speed up decoding at the expense
+  // of more visible blocking. Note that output will also be non-compliant
+  // with the VP8 specifications.
+  int bypass_filtering;
+
+  // Cropping parameters.
+  int use_cropping;
+  int crop_left, crop_right, crop_top, crop_bottom;
+
+  // Scaling parameters.
+  int use_scaling;
+  int scaled_width, scaled_height;
+
+  // If non-NULL, pointer to the alpha data (if present) corresponding to the
+  // start of the current row (that is, it is pre-offset by mb_y and takes
+  // cropping into account).
+  const uint8_t* a;
+};
+
+// Internal, version-checked, entry point
+int VP8InitIoInternal(VP8Io* const, int);
+
+// Set the custom IO function pointers and user-data. The setter for IO hooks
+// should be called before initiating incremental decoding. Returns true if
+// WebPIDecoder object is successfully modified, false otherwise.
+int WebPISetIOHooks(WebPIDecoder* const idec,
+                    VP8IoPutHook put,
+                    VP8IoSetupHook setup,
+                    VP8IoTeardownHook teardown,
+                    void* user_data);
+
+// Main decoding object. This is an opaque structure.
+typedef struct VP8Decoder VP8Decoder;
+
+// Create a new decoder object.
+VP8Decoder* VP8New(void);
+
+// Must be called to make sure 'io' is initialized properly.
+// Returns false in case of version mismatch. Upon such failure, no other
+// decoding function should be called (VP8Decode, VP8GetHeaders, ...)
+static WEBP_INLINE int VP8InitIo(VP8Io* const io) {
+  return VP8InitIoInternal(io, WEBP_DECODER_ABI_VERSION);
+}
+
+// Start decoding a new picture. Returns true if ok.
+int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io);
+
+// Decode a picture. Will call VP8GetHeaders() if it wasn't done already.
+// Returns false in case of error.
+int VP8Decode(VP8Decoder* const dec, VP8Io* const io);
+
+// Return current status of the decoder:
+VP8StatusCode VP8Status(VP8Decoder* const dec);
+
+// return readable string corresponding to the last status.
+const char* VP8StatusMessage(VP8Decoder* const dec);
+
+// Resets the decoder in its initial state, reclaiming memory.
+// Not a mandatory call between calls to VP8Decode().
+void VP8Clear(VP8Decoder* const dec);
+
+// Destroy the decoder object.
+void VP8Delete(VP8Decoder* const dec);
+
+//------------------------------------------------------------------------------
+// Miscellaneous VP8/VP8L bitstream probing functions.
+
+// Returns true if the next 3 bytes in data contain the VP8 signature.
+WEBP_EXTERN(int) VP8CheckSignature(const uint8_t* const data, size_t data_size);
+
+// Validates the VP8 data header and retrieves basic header information,
+// namely width and height. Returns 0 in case of a formatting error.
+// *width/*height can be passed NULL.
+WEBP_EXTERN(int) VP8GetInfo(
+    const uint8_t* data,
+    size_t data_size,    // data available so far
+    size_t chunk_size,   // total data size expected in the chunk
+    int* const width, int* const height);
+
+// Returns true if the next byte(s) in data contain a VP8L signature.
+WEBP_EXTERN(int) VP8LCheckSignature(const uint8_t* const data, size_t size);
+
+// Validates the VP8L data header and retrieves basic header information,
+// namely width, height and alpha. Returns 0 in case of a formatting error.
+// width/height/has_alpha can be passed NULL.
+WEBP_EXTERN(int) VP8LGetInfo(
+    const uint8_t* data, size_t data_size,  // data available so far
+    int* const width, int* const height, int* const has_alpha);
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
+
+#endif  /* WEBP_WEBP_DECODE_VP8_H_ */
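For orientation, the following is a minimal, hedged sketch (not part of the patch) of how the low-level decoder API declared above might be driven. It assumes the header is installed as "webp/decode_vp8.h" (as its include guard suggests), that VP8IoPutHook takes a const VP8Io* and returns int (the typedef is declared earlier in this header and not shown in this hunk), and that the input is a raw VP8 bitstream rather than a full WebP container.

    #include <stdio.h>
    #include "webp/decode_vp8.h"   // assumed install path for this header

    // put() hook: count the luma rows delivered so far via the user pointer.
    static int CountRows(const VP8Io* const io) {
      int* const rows_done = (int*)io->opaque;
      *rows_done += io->mb_h;       // mb_h rows were just made available
      return 1;                     // returning 0 would abort decoding
    }

    // Decode a raw VP8 bitstream held in data[0..data_size-1].
    static int DecodeLowLevel(const uint8_t* data, size_t data_size) {
      int rows_done = 0;
      VP8Io io;
      VP8Decoder* const dec = VP8New();
      if (dec == NULL) return 0;
      if (!VP8InitIo(&io)) {        // fails on ABI version mismatch
        VP8Delete(dec);
        return 0;
      }
      io.data = data;
      io.data_size = data_size;
      io.put = CountRows;
      io.opaque = &rows_done;
      if (!VP8GetHeaders(dec, &io) || !VP8Decode(dec, &io)) {
        fprintf(stderr, "decoding failed: %s\n", VP8StatusMessage(dec));
        VP8Delete(dec);
        return 0;
      }
      printf("delivered %d rows\n", rows_done);
      VP8Delete(dec);
      return 1;
    }

In practice, applications normally go through the higher-level WebP decoding entry points; this low-level path is mainly useful when custom output handling via the put()/setup()/teardown() hooks is needed.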
diff --git a/src/dec/frame.c b/src/dec/frame.c
index 29a0f75..9c91a48 100644
--- a/src/dec/frame.c
+++ b/src/dec/frame.c
@@ -1,4 +1,4 @@
-// Copyright 2010 Google Inc.
+// Copyright 2010 Google Inc. All Rights Reserved.
 //
 // This code is licensed under the same terms as WebM:
 //  Software License Agreement:  http://www.webmproject.org/license/software/
@@ -11,6 +11,7 @@
 
 #include <stdlib.h>
 #include "./vp8i.h"
+#include "../utils/utils.h"
 
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
@@ -19,7 +20,7 @@
 #define ALIGN_MASK (32 - 1)
 
 //------------------------------------------------------------------------------
-// Memory setup
+// Filtering
 
 // kFilterExtraRows[] = How many extra lines are needed on the MB boundary
 // for caching, given a filtering level.
@@ -28,99 +29,7 @@
 //                 U/V, so it's 8 samples total (because of the 2x upsampling).
 static const uint8_t kFilterExtraRows[3] = { 0, 2, 8 };
 
-int VP8InitFrame(VP8Decoder* const dec, VP8Io* io) {
-  const int mb_w = dec->mb_w_;
-  const int intra_pred_mode_size = 4 * mb_w * sizeof(uint8_t);
-  const int top_size = (16 + 8 + 8) * mb_w;
-  const int info_size = (mb_w + 1) * sizeof(VP8MB);
-  const int yuv_size = YUV_SIZE * sizeof(*dec->yuv_b_);
-  const int coeffs_size = 384 * sizeof(*dec->coeffs_);
-  const int cache_height = (16 + kFilterExtraRows[dec->filter_type_]) * 3 / 2;
-  const int cache_size = top_size * cache_height;
-  const int alpha_size =
-    dec->alpha_data_ ? (dec->pic_hdr_.width_ * dec->pic_hdr_.height_) : 0;
-  const int needed = intra_pred_mode_size
-                   + top_size + info_size
-                   + yuv_size + coeffs_size
-                   + cache_size + alpha_size + ALIGN_MASK;
-  uint8_t* mem;
-
-  if (needed > dec->mem_size_) {
-    free(dec->mem_);
-    dec->mem_size_ = 0;
-    dec->mem_ = (uint8_t*)malloc(needed);
-    if (dec->mem_ == NULL) {
-      return VP8SetError(dec, VP8_STATUS_OUT_OF_MEMORY,
-                         "no memory during frame initialization.");
-    }
-    dec->mem_size_ = needed;
-  }
-
-  mem = (uint8_t*)dec->mem_;
-  dec->intra_t_ = (uint8_t*)mem;
-  mem += intra_pred_mode_size;
-
-  dec->y_t_ = (uint8_t*)mem;
-  mem += 16 * mb_w;
-  dec->u_t_ = (uint8_t*)mem;
-  mem += 8 * mb_w;
-  dec->v_t_ = (uint8_t*)mem;
-  mem += 8 * mb_w;
-
-  dec->mb_info_ = ((VP8MB*)mem) + 1;
-  mem += info_size;
-
-  mem = (uint8_t*)((uintptr_t)(mem + ALIGN_MASK) & ~ALIGN_MASK);
-  assert((yuv_size & ALIGN_MASK) == 0);
-  dec->yuv_b_ = (uint8_t*)mem;
-  mem += yuv_size;
-
-  dec->coeffs_ = (int16_t*)mem;
-  mem += coeffs_size;
-
-  dec->cache_y_stride_ = 16 * mb_w;
-  dec->cache_uv_stride_ = 8 * mb_w;
-  {
-    const int extra_rows = kFilterExtraRows[dec->filter_type_];
-    const int extra_y = extra_rows * dec->cache_y_stride_;
-    const int extra_uv = (extra_rows / 2) * dec->cache_uv_stride_;
-    dec->cache_y_ = ((uint8_t*)mem) + extra_y;
-    dec->cache_u_ = dec->cache_y_ + 16 * dec->cache_y_stride_ + extra_uv;
-    dec->cache_v_ = dec->cache_u_ + 8 * dec->cache_uv_stride_ + extra_uv;
-  }
-  mem += cache_size;
-
-  // alpha plane
-  dec->alpha_plane_ = alpha_size ? (uint8_t*)mem : NULL;
-  mem += alpha_size;
-
-  // note: left-info is initialized once for all.
-  memset(dec->mb_info_ - 1, 0, (mb_w + 1) * sizeof(*dec->mb_info_));
-
-  // initialize top
-  memset(dec->intra_t_, B_DC_PRED, intra_pred_mode_size);
-
-  // prepare 'io'
-  io->mb_y = 0;
-  io->y = dec->cache_y_;
-  io->u = dec->cache_u_;
-  io->v = dec->cache_v_;
-  io->y_stride = dec->cache_y_stride_;
-  io->uv_stride = dec->cache_uv_stride_;
-  io->fancy_upsampling = 0;    // default
-  io->a = NULL;
-
-  // Init critical function pointers and look-up tables.
-  VP8DspInitTables();
-  VP8DspInit();
-
-  return 1;
-}
-
-//------------------------------------------------------------------------------
-// Filtering
-
-static inline int hev_thresh_from_level(int level, int keyframe) {
+static WEBP_INLINE int hev_thresh_from_level(int level, int keyframe) {
   if (keyframe) {
     return (level >= 40) ? 2 : (level >= 15) ? 1 : 0;
   } else {
@@ -129,11 +38,12 @@
 }
 
 static void DoFilter(const VP8Decoder* const dec, int mb_x, int mb_y) {
-  VP8MB* const mb = dec->mb_info_ + mb_x;
-  uint8_t* const y_dst = dec->cache_y_ + mb_x * 16;
+  const VP8ThreadContext* const ctx = &dec->thread_ctx_;
   const int y_bps = dec->cache_y_stride_;
-  const int level = mb->f_level_;
-  const int ilevel = mb->f_ilevel_;
+  VP8FInfo* const f_info = ctx->f_info_ + mb_x;
+  uint8_t* const y_dst = dec->cache_y_ + ctx->id_ * 16 * y_bps + mb_x * 16;
+  const int level = f_info->f_level_;
+  const int ilevel = f_info->f_ilevel_;
   const int limit = 2 * level + ilevel;
   if (level == 0) {
     return;
@@ -142,26 +52,26 @@
     if (mb_x > 0) {
       VP8SimpleHFilter16(y_dst, y_bps, limit + 4);
     }
-    if (mb->f_inner_) {
+    if (f_info->f_inner_) {
       VP8SimpleHFilter16i(y_dst, y_bps, limit);
     }
     if (mb_y > 0) {
       VP8SimpleVFilter16(y_dst, y_bps, limit + 4);
     }
-    if (mb->f_inner_) {
+    if (f_info->f_inner_) {
       VP8SimpleVFilter16i(y_dst, y_bps, limit);
     }
   } else {    // complex
-    uint8_t* const u_dst = dec->cache_u_ + mb_x * 8;
-    uint8_t* const v_dst = dec->cache_v_ + mb_x * 8;
     const int uv_bps = dec->cache_uv_stride_;
+    uint8_t* const u_dst = dec->cache_u_ + ctx->id_ * 8 * uv_bps + mb_x * 8;
+    uint8_t* const v_dst = dec->cache_v_ + ctx->id_ * 8 * uv_bps + mb_x * 8;
     const int hev_thresh =
         hev_thresh_from_level(level, dec->frm_hdr_.key_frame_);
     if (mb_x > 0) {
       VP8HFilter16(y_dst, y_bps, limit + 4, ilevel, hev_thresh);
       VP8HFilter8(u_dst, v_dst, uv_bps, limit + 4, ilevel, hev_thresh);
     }
-    if (mb->f_inner_) {
+    if (f_info->f_inner_) {
       VP8HFilter16i(y_dst, y_bps, limit, ilevel, hev_thresh);
       VP8HFilter8i(u_dst, v_dst, uv_bps, limit, ilevel, hev_thresh);
     }
@@ -169,21 +79,20 @@
       VP8VFilter16(y_dst, y_bps, limit + 4, ilevel, hev_thresh);
       VP8VFilter8(u_dst, v_dst, uv_bps, limit + 4, ilevel, hev_thresh);
     }
-    if (mb->f_inner_) {
+    if (f_info->f_inner_) {
       VP8VFilter16i(y_dst, y_bps, limit, ilevel, hev_thresh);
       VP8VFilter8i(u_dst, v_dst, uv_bps, limit, ilevel, hev_thresh);
     }
   }
 }
 
-void VP8FilterRow(const VP8Decoder* const dec) {
+// Filter the decoded macroblock row (if needed)
+static void FilterRow(const VP8Decoder* const dec) {
   int mb_x;
-  assert(dec->filter_type_ > 0);
-  if (dec->mb_y_ < dec->tl_mb_y_ || dec->mb_y_ > dec->br_mb_y_) {
-    return;
-  }
+  const int mb_y = dec->thread_ctx_.mb_y_;
+  assert(dec->thread_ctx_.filter_row_);
   for (mb_x = dec->tl_mb_x_; mb_x < dec->br_mb_x_; ++mb_x) {
-    DoFilter(dec, mb_x, dec->mb_y_);
+    DoFilter(dec, mb_x, mb_y);
   }
 }
 
@@ -191,7 +100,8 @@
 
 void VP8StoreBlock(VP8Decoder* const dec) {
   if (dec->filter_type_ > 0) {
-    VP8MB* const info = dec->mb_info_ + dec->mb_x_;
+    VP8FInfo* const info = dec->f_info_ + dec->mb_x_;
+    const int skip = dec->mb_info_[dec->mb_x_].skip_;
     int level = dec->filter_levels_[dec->segment_];
     if (dec->filter_hdr_.use_lf_delta_) {
       // TODO(skal): only CURRENT is handled for now.
@@ -215,14 +125,16 @@
     }
 
     info->f_ilevel_ = (level < 1) ? 1 : level;
-    info->f_inner_ = (!info->skip_ || dec->is_i4x4_);
+    info->f_inner_ = (!skip || dec->is_i4x4_);
   }
   {
     // Transfer samples to row cache
     int y;
-    uint8_t* const ydst = dec->cache_y_ + dec->mb_x_ * 16;
-    uint8_t* const udst = dec->cache_u_ + dec->mb_x_ * 8;
-    uint8_t* const vdst = dec->cache_v_ + dec->mb_x_ * 8;
+    const int y_offset = dec->cache_id_ * 16 * dec->cache_y_stride_;
+    const int uv_offset = dec->cache_id_ * 8 * dec->cache_uv_stride_;
+    uint8_t* const ydst = dec->cache_y_ + dec->mb_x_ * 16 + y_offset;
+    uint8_t* const udst = dec->cache_u_ + dec->mb_x_ * 8 + uv_offset;
+    uint8_t* const vdst = dec->cache_v_ + dec->mb_x_ * 8 + uv_offset;
     for (y = 0; y < 16; ++y) {
       memcpy(ydst + y * dec->cache_y_stride_,
              dec->yuv_b_ + Y_OFF + y * BPS, 16);
@@ -249,17 +161,27 @@
 
 #define MACROBLOCK_VPOS(mb_y)  ((mb_y) * 16)    // vertical position of a MB
 
-int VP8FinishRow(VP8Decoder* const dec, VP8Io* io) {
+// Finalize and transmit a complete row. Return false in case of user-abort.
+static int FinishRow(VP8Decoder* const dec, VP8Io* const io) {
+  int ok = 1;
+  const VP8ThreadContext* const ctx = &dec->thread_ctx_;
   const int extra_y_rows = kFilterExtraRows[dec->filter_type_];
   const int ysize = extra_y_rows * dec->cache_y_stride_;
   const int uvsize = (extra_y_rows / 2) * dec->cache_uv_stride_;
-  uint8_t* const ydst = dec->cache_y_ - ysize;
-  uint8_t* const udst = dec->cache_u_ - uvsize;
-  uint8_t* const vdst = dec->cache_v_ - uvsize;
-  const int first_row = (dec->mb_y_ == 0);
-  const int last_row = (dec->mb_y_ >= dec->br_mb_y_ - 1);
-  int y_start = MACROBLOCK_VPOS(dec->mb_y_);
-  int y_end = MACROBLOCK_VPOS(dec->mb_y_ + 1);
+  const int y_offset = ctx->id_ * 16 * dec->cache_y_stride_;
+  const int uv_offset = ctx->id_ * 8 * dec->cache_uv_stride_;
+  uint8_t* const ydst = dec->cache_y_ - ysize + y_offset;
+  uint8_t* const udst = dec->cache_u_ - uvsize + uv_offset;
+  uint8_t* const vdst = dec->cache_v_ - uvsize + uv_offset;
+  const int first_row = (ctx->mb_y_ == 0);
+  const int last_row = (ctx->mb_y_ >= dec->br_mb_y_ - 1);
+  int y_start = MACROBLOCK_VPOS(ctx->mb_y_);
+  int y_end = MACROBLOCK_VPOS(ctx->mb_y_ + 1);
+
+  if (ctx->filter_row_) {
+    FilterRow(dec);
+  }
+
   if (io->put) {
     if (!first_row) {
       y_start -= extra_y_rows;
@@ -267,9 +189,9 @@
       io->u = udst;
       io->v = vdst;
     } else {
-      io->y = dec->cache_y_;
-      io->u = dec->cache_u_;
-      io->v = dec->cache_v_;
+      io->y = dec->cache_y_ + y_offset;
+      io->u = dec->cache_u_ + uv_offset;
+      io->v = dec->cache_v_ + uv_offset;
     }
 
     if (!last_row) {
@@ -279,15 +201,18 @@
       y_end = io->crop_bottom;    // make sure we don't overflow on last row.
     }
     io->a = NULL;
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-    if (dec->alpha_data_) {
+    if (dec->alpha_data_ != NULL && y_start < y_end) {
+      // TODO(skal): several things to correct here:
+      // * testing presence of alpha with dec->alpha_data_ is not a good idea
+      // * we're actually decompressing the full plane only once. It should be
+      //   more obvious from the signature.
+      // * we could free alpha_data_ right after this call, but we don't own it.
       io->a = VP8DecompressAlphaRows(dec, y_start, y_end - y_start);
       if (io->a == NULL) {
         return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
                            "Could not decode alpha data.");
       }
     }
-#endif
     if (y_start < io->crop_top) {
       const int delta_y = io->crop_top - y_start;
       y_start = io->crop_top;
@@ -295,7 +220,7 @@
       io->y += dec->cache_y_stride_ * delta_y;
       io->u += dec->cache_uv_stride_ * (delta_y >> 1);
       io->v += dec->cache_uv_stride_ * (delta_y >> 1);
-      if (io->a) {
+      if (io->a != NULL) {
         io->a += io->width * delta_y;
       }
     }
@@ -303,33 +228,69 @@
       io->y += io->crop_left;
       io->u += io->crop_left >> 1;
       io->v += io->crop_left >> 1;
-      if (io->a) {
+      if (io->a != NULL) {
         io->a += io->crop_left;
       }
       io->mb_y = y_start - io->crop_top;
       io->mb_w = io->crop_right - io->crop_left;
       io->mb_h = y_end - y_start;
-      if (!io->put(io)) {
-        return 0;
-      }
+      ok = io->put(io);
     }
   }
-  // rotate top samples
-  if (!last_row) {
-    memcpy(ydst, ydst + 16 * dec->cache_y_stride_, ysize);
-    memcpy(udst, udst + 8 * dec->cache_uv_stride_, uvsize);
-    memcpy(vdst, vdst + 8 * dec->cache_uv_stride_, uvsize);
+  // rotate top samples if needed
+  if (ctx->id_ + 1 == dec->num_caches_) {
+    if (!last_row) {
+      memcpy(dec->cache_y_ - ysize, ydst + 16 * dec->cache_y_stride_, ysize);
+      memcpy(dec->cache_u_ - uvsize, udst + 8 * dec->cache_uv_stride_, uvsize);
+      memcpy(dec->cache_v_ - uvsize, vdst + 8 * dec->cache_uv_stride_, uvsize);
+    }
   }
-  return 1;
+
+  return ok;
 }
 
 #undef MACROBLOCK_VPOS
 
 //------------------------------------------------------------------------------
+
+int VP8ProcessRow(VP8Decoder* const dec, VP8Io* const io) {
+  int ok = 1;
+  VP8ThreadContext* const ctx = &dec->thread_ctx_;
+  if (!dec->use_threads_) {
+    // ctx->id_ and ctx->f_info_ are already set
+    ctx->mb_y_ = dec->mb_y_;
+    ctx->filter_row_ = dec->filter_row_;
+    ok = FinishRow(dec, io);
+  } else {
+    WebPWorker* const worker = &dec->worker_;
+    // Finish previous job *before* updating context
+    ok &= WebPWorkerSync(worker);
+    assert(worker->status_ == OK);
+    if (ok) {   // spawn a new deblocking/output job
+      ctx->io_ = *io;
+      ctx->id_ = dec->cache_id_;
+      ctx->mb_y_ = dec->mb_y_;
+      ctx->filter_row_ = dec->filter_row_;
+      if (ctx->filter_row_) {    // just swap filter info
+        VP8FInfo* const tmp = ctx->f_info_;
+        ctx->f_info_ = dec->f_info_;
+        dec->f_info_ = tmp;
+      }
+      WebPWorkerLaunch(worker);
+      if (++dec->cache_id_ == dec->num_caches_) {
+        dec->cache_id_ = 0;
+      }
+    }
+  }
+  return ok;
+}
+
+//------------------------------------------------------------------------------
 // Finish setting up the decoding parameter once user's setup() is called.
 
-VP8StatusCode VP8FinishFrameSetup(VP8Decoder* const dec, VP8Io* const io) {
+VP8StatusCode VP8EnterCritical(VP8Decoder* const dec, VP8Io* const io) {
   // Call setup() first. This may trigger additional decoding features on 'io'.
+  // Note: Afterward, we must call teardown() no matter what.
   if (io->setup && !io->setup(io)) {
     VP8SetError(dec, VP8_STATUS_USER_ABORT, "Frame setup failed");
     return dec->status_;
@@ -360,8 +321,13 @@
       dec->tl_mb_y_ = 0;
     } else {
       // For simple filter, we can filter only the cropped region.
-      dec->tl_mb_y_ = io->crop_top >> 4;
-      dec->tl_mb_x_ = io->crop_left >> 4;
+      // We include 'extra_pixels' on the other side of the boundary, since
+      // vertical or horizontal filtering of the previous macroblock can
+      // modify some abutting pixels.
+      dec->tl_mb_x_ = (io->crop_left - extra_pixels) >> 4;
+      dec->tl_mb_y_ = (io->crop_top - extra_pixels) >> 4;
+      if (dec->tl_mb_x_ < 0) dec->tl_mb_x_ = 0;
+      if (dec->tl_mb_y_ < 0) dec->tl_mb_y_ = 0;
     }
     // We need some 'extra' pixels on the right/bottom.
     dec->br_mb_y_ = (io->crop_bottom + 15 + extra_pixels) >> 4;
@@ -376,6 +342,189 @@
   return VP8_STATUS_OK;
 }
 
+int VP8ExitCritical(VP8Decoder* const dec, VP8Io* const io) {
+  int ok = 1;
+  if (dec->use_threads_) {
+    ok = WebPWorkerSync(&dec->worker_);
+  }
+
+  if (io->teardown) {
+    io->teardown(io);
+  }
+  return ok;
+}
+
+//------------------------------------------------------------------------------
+// For multi-threaded decoding we need to use 3 rows of 16 pixels as a delay
+// line.
+//
+// The reason is that the deblocking filter cannot deblock the bottom
+// horizontal edges immediately, and needs to wait for the first few rows of
+// the next macroblock to be decoded. Hence, deblocking lags behind by 4 or 8
+// pixels (depending on strength).
+// With two threads, the vertical positions of the rows being decoded are:
+// Decode:  [ 0..15][16..31][32..47][48..63][64..79][...
+// Deblock:         [ 0..11][12..27][28..43][44..59][...
+// If we use two threads and two caches of 16 pixels, the sequence would be:
+// Decode:  [ 0..15][16..31][ 0..15!!][16..31][ 0..15][...
+// Deblock:         [ 0..11][12..27!!][-4..11][12..27][...
+// The problem is that during rows [12..15!!] both the decoding and
+// deblocking threads are writing simultaneously.
+// With 3 cache lines, one gets a safe write pattern:
+// Decode:  [ 0..15][16..31][32..47][ 0..15][16..31][32..47][0..
+// Deblock:         [ 0..11][12..27][28..43][-4..11][12..27][28...
+// Note that multi-threaded output _without_ deblocking can make do with only
+// two cache lines of 16 pixels, since there is no lag. The decoding and
+// output processes then never write concurrently:
+// Decode:  [ 0..15][16..31][ 0..15][16..31][...
+// io->put:         [ 0..15][16..31][ 0..15][...
+
+#define MT_CACHE_LINES 3
+#define ST_CACHE_LINES 1   // 1 cache row only for single-threaded case
+
+// Initialize multi/single-thread worker
+static int InitThreadContext(VP8Decoder* const dec) {
+  dec->cache_id_ = 0;
+  if (dec->use_threads_) {
+    WebPWorker* const worker = &dec->worker_;
+    if (!WebPWorkerReset(worker)) {
+      return VP8SetError(dec, VP8_STATUS_OUT_OF_MEMORY,
+                         "thread initialization failed.");
+    }
+    worker->data1 = dec;
+    worker->data2 = (void*)&dec->thread_ctx_.io_;
+    worker->hook = (WebPWorkerHook)FinishRow;
+    dec->num_caches_ =
+      (dec->filter_type_ > 0) ? MT_CACHE_LINES : MT_CACHE_LINES - 1;
+  } else {
+    dec->num_caches_ = ST_CACHE_LINES;
+  }
+  return 1;
+}
+
+#undef MT_CACHE_LINES
+#undef ST_CACHE_LINES
+
+//------------------------------------------------------------------------------
+// Memory setup
+
+static int AllocateMemory(VP8Decoder* const dec) {
+  const int num_caches = dec->num_caches_;
+  const int mb_w = dec->mb_w_;
+  // Note: we use 'size_t' when there's no overflow risk, uint64_t otherwise.
+  const size_t intra_pred_mode_size = 4 * mb_w * sizeof(uint8_t);
+  const size_t top_size = (16 + 8 + 8) * mb_w;
+  const size_t mb_info_size = (mb_w + 1) * sizeof(VP8MB);
+  const size_t f_info_size =
+      (dec->filter_type_ > 0) ?
+          mb_w * (dec->use_threads_ ? 2 : 1) * sizeof(VP8FInfo)
+        : 0;
+  const size_t yuv_size = YUV_SIZE * sizeof(*dec->yuv_b_);
+  const size_t coeffs_size = 384 * sizeof(*dec->coeffs_);
+  const size_t cache_height = (16 * num_caches
+                            + kFilterExtraRows[dec->filter_type_]) * 3 / 2;
+  const size_t cache_size = top_size * cache_height;
+  // alpha_size is the only one that scales as width x height.
+  const uint64_t alpha_size = (dec->alpha_data_ != NULL) ?
+      (uint64_t)dec->pic_hdr_.width_ * dec->pic_hdr_.height_ : 0ULL;
+  const uint64_t needed = (uint64_t)intra_pred_mode_size
+                        + top_size + mb_info_size + f_info_size
+                        + yuv_size + coeffs_size
+                        + cache_size + alpha_size + ALIGN_MASK;
+  uint8_t* mem;
+
+  if (needed != (size_t)needed) return 0;  // check for overflow
+  if (needed > dec->mem_size_) {
+    free(dec->mem_);
+    dec->mem_size_ = 0;
+    dec->mem_ = WebPSafeMalloc(needed, sizeof(uint8_t));
+    if (dec->mem_ == NULL) {
+      return VP8SetError(dec, VP8_STATUS_OUT_OF_MEMORY,
+                         "no memory during frame initialization.");
+    }
+    // down-cast is ok, thanks to WebPSafeMalloc() above.
+    dec->mem_size_ = (size_t)needed;
+  }
+
+  mem = (uint8_t*)dec->mem_;
+  dec->intra_t_ = (uint8_t*)mem;
+  mem += intra_pred_mode_size;
+
+  dec->y_t_ = (uint8_t*)mem;
+  mem += 16 * mb_w;
+  dec->u_t_ = (uint8_t*)mem;
+  mem += 8 * mb_w;
+  dec->v_t_ = (uint8_t*)mem;
+  mem += 8 * mb_w;
+
+  dec->mb_info_ = ((VP8MB*)mem) + 1;
+  mem += mb_info_size;
+
+  dec->f_info_ = f_info_size ? (VP8FInfo*)mem : NULL;
+  mem += f_info_size;
+  dec->thread_ctx_.id_ = 0;
+  dec->thread_ctx_.f_info_ = dec->f_info_;
+  if (dec->use_threads_) {
+    // Secondary cache line. The deblocking process needs to use the filtering
+    // strength of the previous macroblock row while the new one is being
+    // decoded in parallel. We'll just swap the pointers.
+    dec->thread_ctx_.f_info_ += mb_w;
+  }
+
+  mem = (uint8_t*)((uintptr_t)(mem + ALIGN_MASK) & ~ALIGN_MASK);
+  assert((yuv_size & ALIGN_MASK) == 0);
+  dec->yuv_b_ = (uint8_t*)mem;
+  mem += yuv_size;
+
+  dec->coeffs_ = (int16_t*)mem;
+  mem += coeffs_size;
+
+  dec->cache_y_stride_ = 16 * mb_w;
+  dec->cache_uv_stride_ = 8 * mb_w;
+  {
+    const int extra_rows = kFilterExtraRows[dec->filter_type_];
+    const int extra_y = extra_rows * dec->cache_y_stride_;
+    const int extra_uv = (extra_rows / 2) * dec->cache_uv_stride_;
+    dec->cache_y_ = ((uint8_t*)mem) + extra_y;
+    dec->cache_u_ = dec->cache_y_
+                  + 16 * num_caches * dec->cache_y_stride_ + extra_uv;
+    dec->cache_v_ = dec->cache_u_
+                  + 8 * num_caches * dec->cache_uv_stride_ + extra_uv;
+    dec->cache_id_ = 0;
+  }
+  mem += cache_size;
+
+  // alpha plane
+  dec->alpha_plane_ = alpha_size ? (uint8_t*)mem : NULL;
+  mem += alpha_size;
+
+  // note: left-info is initialized once and for all.
+  memset(dec->mb_info_ - 1, 0, mb_info_size);
+
+  // initialize top
+  memset(dec->intra_t_, B_DC_PRED, intra_pred_mode_size);
+
+  return 1;
+}
+
+static void InitIo(VP8Decoder* const dec, VP8Io* io) {
+  // prepare 'io'
+  io->mb_y = 0;
+  io->y = dec->cache_y_;
+  io->u = dec->cache_u_;
+  io->v = dec->cache_v_;
+  io->y_stride = dec->cache_y_stride_;
+  io->uv_stride = dec->cache_uv_stride_;
+  io->a = NULL;
+}
+
+int VP8InitFrame(VP8Decoder* const dec, VP8Io* io) {
+  if (!InitThreadContext(dec)) return 0;  // call first. Sets dec->num_caches_.
+  if (!AllocateMemory(dec)) return 0;
+  InitIo(dec, io);
+  VP8DspInit();  // Init critical function pointers and look-up tables.
+  return 1;
+}
+
 //------------------------------------------------------------------------------
 // Main reconstruction function.
 
@@ -386,7 +535,7 @@
   0 + 12 * BPS,  4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS
 };
 
-static inline int CheckMode(VP8Decoder* const dec, int mode) {
+static WEBP_INLINE int CheckMode(VP8Decoder* const dec, int mode) {
   if (mode == B_DC_PRED) {
     if (dec->mb_x_ == 0) {
       return (dec->mb_y_ == 0) ? B_DC_PRED_NOTOPLEFT : B_DC_PRED_NOLEFT;
@@ -397,7 +546,7 @@
   return mode;
 }
 
-static inline void Copy32b(uint8_t* dst, uint8_t* src) {
+static WEBP_INLINE void Copy32b(uint8_t* dst, uint8_t* src) {
   *(uint32_t*)dst = *(uint32_t*)src;
 }
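
The sync-before-update-then-launch discipline in VP8ProcessRow() is the crux of the threading change above, so here is a standalone, hedged sketch of the same pattern in isolation. The WebPWorker fields (data1, data2, hook) and the Reset/Sync/Launch calls are the ones visible in this diff; WebPWorkerInit(), WebPWorkerEnd() and the exact two-void*-argument hook signature come from src/utils/thread.h, which is not shown here, so treat them as assumptions. The JobContext type and RowJob() hook are purely illustrative.

    #include "../utils/thread.h"   // WebPWorker interface (assumed location)

    typedef struct {
      int row;                     // which row this job should deblock/output
    } JobContext;

    // Worker hook: assumed to match the two-void*-argument form used for
    // FinishRow() above. Non-zero return means the job succeeded.
    static int RowJob(void* data1, void* data2) {
      JobContext* const ctx = (JobContext*)data2;
      (void)data1;
      // ... deblock and emit row ctx->row here ...
      return 1;
    }

    // Process rows 0..num_rows-1, always finishing job N before launching N+1.
    static int ProcessAllRows(int num_rows) {
      int ok = 1;
      int row;
      JobContext ctx;
      WebPWorker worker;
      WebPWorkerInit(&worker);     // assumption: resets the worker's state
      if (!WebPWorkerReset(&worker)) return 0;   // creates the thread if needed
      worker.data1 = NULL;
      worker.data2 = &ctx;
      worker.hook = RowJob;
      for (row = 0; ok && row < num_rows; ++row) {
        ok &= WebPWorkerSync(&worker);   // finish the previous job *before*
        if (ok) {                        // touching the shared context
          ctx.row = row;
          WebPWorkerLaunch(&worker);
        }
      }
      ok &= WebPWorkerSync(&worker);     // wait for the last job
      WebPWorkerEnd(&worker);            // assumption: terminates the thread
      return ok;
    }

The same structure appears in VP8ProcessRow(): the previous deblocking/output job is synced, only then is the thread context (cache id, mb_y, filter info) updated, and a new job is launched while the main thread keeps decoding the next macroblock row.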
 
diff --git a/src/dec/idec.c b/src/dec/idec.c
index 1e51f0d..7df790c 100644
--- a/src/dec/idec.c
+++ b/src/dec/idec.c
@@ -1,4 +1,4 @@
-// Copyright 2011 Google Inc.
+// Copyright 2011 Google Inc. All Rights Reserved.
 //
 // This code is licensed under the same terms as WebM:
 //  Software License Agreement:  http://www.webmproject.org/license/software/
@@ -13,16 +13,16 @@
 #include <string.h>
 #include <stdlib.h>
 
-#include "webpi.h"
-#include "vp8i.h"
+#include "./webpi.h"
+#include "./vp8i.h"
+#include "../utils/utils.h"
 
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 
-#define RIFF_HEADER_SIZE 20
-#define VP8_HEADER_SIZE 10
-#define WEBP_HEADER_SIZE (RIFF_HEADER_SIZE + VP8_HEADER_SIZE)
+// In append mode, buffer allocations increase as multiples of this value.
+// Needs to be a power of 2.
 #define CHUNK_SIZE 4096
 #define MAX_MB_SIZE 4096
 
@@ -31,21 +31,29 @@
 
 // Decoding states. State normally flows like HEADER->PARTS0->DATA->DONE.
 // If there is any error the decoder goes into state ERROR.
-typedef enum { STATE_HEADER = 0, STATE_PARTS0 = 1,
-               STATE_DATA = 2, STATE_DONE = 3,
-               STATE_ERROR = 4
+typedef enum {
+  STATE_PRE_VP8,  // All data preceding the first VP8 chunk's payload.
+  STATE_VP8_FRAME_HEADER,  // For VP8 Frame header (within VP8 chunk).
+  STATE_VP8_PARTS0,
+  STATE_VP8_DATA,
+  STATE_VP8L_HEADER,
+  STATE_VP8L_DATA,
+  STATE_DONE,
+  STATE_ERROR
 } DecState;
 
 // Operating state for the MemBuffer
-typedef enum { MEM_MODE_NONE = 0,
-               MEM_MODE_APPEND, MEM_MODE_MAP
+typedef enum {
+  MEM_MODE_NONE = 0,
+  MEM_MODE_APPEND,
+  MEM_MODE_MAP
 } MemBufferMode;
 
 // storage for partition #0 and partial data (in a rolling fashion)
 typedef struct {
   MemBufferMode mode_;  // Operation mode
-  uint32_t start_;      // start location of the data to be decoded
-  uint32_t end_;        // end location
+  size_t start_;        // start location of the data to be decoded
+  size_t end_;          // end location
   size_t buf_size_;     // size of the allocated buffer
   uint8_t* buf_;        // We don't own this buffer in case WebPIUpdate()
 
@@ -56,11 +64,13 @@
 struct WebPIDecoder {
   DecState state_;         // current decoding state
   WebPDecParams params_;   // Params to store output info
-  VP8Decoder* dec_;
+  int is_lossless_;        // for down-casting 'dec_'.
+  void* dec_;              // either a VP8Decoder or a VP8LDecoder instance
   VP8Io io_;
 
   MemBuffer mem_;          // input memory buffer.
   WebPDecBuffer output_;   // output buffer (when no external one is supplied)
+  size_t chunk_size_;      // Compressed VP8/VP8L size extracted from Header.
 };
 
 // MB context to restore in case VP8DecodeMB() fails
@@ -76,102 +86,105 @@
 //------------------------------------------------------------------------------
 // MemBuffer: incoming data handling
 
-#define REMAP(PTR, OLD_BASE, NEW_BASE) (PTR) = (NEW_BASE) + ((PTR) - OLD_BASE)
+static void RemapBitReader(VP8BitReader* const br, ptrdiff_t offset) {
+  if (br->buf_ != NULL) {
+    br->buf_ += offset;
+    br->buf_end_ += offset;
+  }
+}
 
-static inline size_t MemDataSize(const MemBuffer* mem) {
+static WEBP_INLINE size_t MemDataSize(const MemBuffer* mem) {
   return (mem->end_ - mem->start_);
 }
 
+static void DoRemap(WebPIDecoder* const idec, ptrdiff_t offset) {
+  MemBuffer* const mem = &idec->mem_;
+  const uint8_t* const new_base = mem->buf_ + mem->start_;
+  // note: for VP8, setting up idec->io_ is only really needed at the beginning
+  // of the decoding, till partition #0 is complete.
+  idec->io_.data = new_base;
+  idec->io_.data_size = MemDataSize(mem);
+
+  if (idec->dec_ != NULL) {
+    if (!idec->is_lossless_) {
+      VP8Decoder* const dec = (VP8Decoder*)idec->dec_;
+      const int last_part = dec->num_parts_ - 1;
+      if (offset != 0) {
+        int p;
+        for (p = 0; p <= last_part; ++p) {
+          RemapBitReader(dec->parts_ + p, offset);
+        }
+        // Remap partition #0 data pointer to new offset, but only in MAP
+        // mode (in APPEND mode, partition #0 is copied into a fixed buffer).
+        if (mem->mode_ == MEM_MODE_MAP) {
+          RemapBitReader(&dec->br_, offset);
+        }
+      }
+      assert(last_part >= 0);
+      dec->parts_[last_part].buf_end_ = mem->buf_ + mem->end_;
+    } else {    // Resize lossless bitreader
+      VP8LDecoder* const dec = (VP8LDecoder*)idec->dec_;
+      VP8LBitReaderSetBuffer(&dec->br_, new_base, MemDataSize(mem));
+    }
+  }
+}
+
 // Appends data to the end of MemBuffer->buf_. It expands the allocated memory
 // size if required and also updates VP8BitReader's if new memory is allocated.
 static int AppendToMemBuffer(WebPIDecoder* const idec,
                              const uint8_t* const data, size_t data_size) {
   MemBuffer* const mem = &idec->mem_;
-  VP8Decoder* const dec = idec->dec_;
-  const int last_part = dec->num_parts_ - 1;
+  const uint8_t* const old_base = mem->buf_ + mem->start_;
   assert(mem->mode_ == MEM_MODE_APPEND);
+  if (data_size > MAX_CHUNK_PAYLOAD) {
+    // security safeguard: trying to allocate more than what the format
+    // allows for a chunk should be considered suspicious.
+    return 0;
+  }
 
   if (mem->end_ + data_size > mem->buf_size_) {  // Need some free memory
-    int p;
-    uint8_t* new_buf = NULL;
-    const int num_chunks = (MemDataSize(mem) + data_size + CHUNK_SIZE - 1)
-        / CHUNK_SIZE;
-    const size_t new_size = num_chunks * CHUNK_SIZE;
-    const uint8_t* const base = mem->buf_ + mem->start_;
-
-    new_buf = (uint8_t*)malloc(new_size);
-    if (!new_buf) return 0;
-    memcpy(new_buf, base, MemDataSize(mem));
-
-    // adjust VP8BitReader pointers
-    for (p = 0; p <= last_part; ++p) {
-      if (dec->parts_[p].buf_) {
-        REMAP(dec->parts_[p].buf_, base, new_buf);
-        REMAP(dec->parts_[p].buf_end_, base, new_buf);
-      }
-    }
-
-    // adjust memory pointers
+    const size_t current_size = MemDataSize(mem);
+    const uint64_t new_size = (uint64_t)current_size + data_size;
+    const uint64_t extra_size = (new_size + CHUNK_SIZE - 1) & ~(CHUNK_SIZE - 1);
+    uint8_t* const new_buf =
+        (uint8_t*)WebPSafeMalloc(extra_size, sizeof(*new_buf));
+    if (new_buf == NULL) return 0;
+    memcpy(new_buf, old_base, current_size);
     free(mem->buf_);
     mem->buf_ = new_buf;
-    mem->buf_size_ = new_size;
-
-    mem->end_ = MemDataSize(mem);
+    mem->buf_size_ = (size_t)extra_size;
     mem->start_ = 0;
+    mem->end_ = current_size;
   }
 
   memcpy(mem->buf_ + mem->end_, data, data_size);
   mem->end_ += data_size;
   assert(mem->end_ <= mem->buf_size_);
-  dec->parts_[last_part].buf_end_ = mem->buf_ + mem->end_;
 
-  // note: setting up idec->io_ is only really needed at the beginning
-  // of the decoding, till partition #0 is complete.
-  idec->io_.data = mem->buf_ + mem->start_;
-  idec->io_.data_size = MemDataSize(mem);
+  DoRemap(idec, mem->buf_ + mem->start_ - old_base);
   return 1;
 }
 
 static int RemapMemBuffer(WebPIDecoder* const idec,
                           const uint8_t* const data, size_t data_size) {
-  int p;
   MemBuffer* const mem = &idec->mem_;
-  VP8Decoder* const dec = idec->dec_;
-  const int last_part = dec->num_parts_ - 1;
-  const uint8_t* base = mem->buf_;
-
+  const uint8_t* const old_base = mem->buf_ + mem->start_;
   assert(mem->mode_ == MEM_MODE_MAP);
-  if (data_size < mem->buf_size_) {
-    return 0;  // we cannot remap to a shorter buffer!
-  }
 
-  for (p = 0; p <= last_part; ++p) {
-    if (dec->parts_[p].buf_) {
-      REMAP(dec->parts_[p].buf_, base, data);
-      REMAP(dec->parts_[p].buf_end_, base, data);
-    }
-  }
-  dec->parts_[last_part].buf_end_ = data + data_size;
-
-  // Remap partition #0 data pointer to new offset.
-  if (dec->br_.buf_) {
-    REMAP(dec->br_.buf_, base, data);
-    REMAP(dec->br_.buf_end_, base, data);
-  }
+  if (data_size < mem->buf_size_) return 0;  // can't remap to a shorter buffer!
 
   mem->buf_ = (uint8_t*)data;
   mem->end_ = mem->buf_size_ = data_size;
 
-  idec->io_.data = data;
-  idec->io_.data_size = data_size;
+  DoRemap(idec, mem->buf_ + mem->start_ - old_base);
   return 1;
 }
 
 static void InitMemBuffer(MemBuffer* const mem) {
   mem->mode_       = MEM_MODE_NONE;
-  mem->buf_        = 0;
+  mem->buf_        = NULL;
   mem->buf_size_   = 0;
-  mem->part0_buf_  = 0;
+  mem->part0_buf_  = NULL;
   mem->part0_size_ = 0;
 }
 
@@ -193,8 +206,6 @@
   return 1;
 }
 
-#undef REMAP
-
 //------------------------------------------------------------------------------
 // Macroblock-decoding contexts
 
@@ -228,8 +239,8 @@
 
 //------------------------------------------------------------------------------
 
-static VP8StatusCode IDecError(WebPIDecoder* idec, VP8StatusCode error) {
-  if (idec->state_ == STATE_DATA) {
+static VP8StatusCode IDecError(WebPIDecoder* const idec, VP8StatusCode error) {
+  if (idec->state_ == STATE_VP8_DATA) {
     VP8Io* const io = &idec->io_;
     if (io->teardown) {
       io->teardown(io);
@@ -239,65 +250,112 @@
   return error;
 }
 
-// Header
-static VP8StatusCode DecodeHeader(WebPIDecoder* const idec) {
-  uint32_t riff_header_size, bits;
-  const uint8_t* data = idec->mem_.buf_ + idec->mem_.start_;
-  uint32_t curr_size = MemDataSize(&idec->mem_);
-  uint32_t chunk_size;
+static void ChangeState(WebPIDecoder* const idec, DecState new_state,
+                        size_t consumed_bytes) {
+  MemBuffer* const mem = &idec->mem_;
+  idec->state_ = new_state;
+  mem->start_ += consumed_bytes;
+  assert(mem->start_ <= mem->end_);
+  idec->io_.data = mem->buf_ + mem->start_;
+  idec->io_.data_size = MemDataSize(mem);
+}
 
-  if (curr_size < WEBP_HEADER_SIZE) {
-    return VP8_STATUS_SUSPENDED;
+// Headers
+static VP8StatusCode DecodeWebPHeaders(WebPIDecoder* const idec) {
+  MemBuffer* const mem = &idec->mem_;
+  const uint8_t* data = mem->buf_ + mem->start_;
+  size_t curr_size = MemDataSize(mem);
+  VP8StatusCode status;
+  WebPHeaderStructure headers;
+
+  headers.data = data;
+  headers.data_size = curr_size;
+  status = WebPParseHeaders(&headers);
+  if (status == VP8_STATUS_NOT_ENOUGH_DATA) {
+    return VP8_STATUS_SUSPENDED;  // We haven't found a VP8 chunk yet.
+  } else if (status != VP8_STATUS_OK) {
+    return IDecError(idec, status);
   }
 
-  // Validate and Skip over RIFF header
-  chunk_size = WebPCheckRIFFHeader(&data, &curr_size);
-  if (chunk_size == 0 ||
-      curr_size < VP8_HEADER_SIZE ||
-      !VP8GetInfo(data, curr_size, chunk_size, NULL, NULL, NULL)) {
+  idec->chunk_size_ = headers.compressed_size;
+  idec->is_lossless_ = headers.is_lossless;
+  if (!idec->is_lossless_) {
+    VP8Decoder* const dec = VP8New();
+    if (dec == NULL) {
+      return VP8_STATUS_OUT_OF_MEMORY;
+    }
+    idec->dec_ = dec;
+#ifdef WEBP_USE_THREAD
+    dec->use_threads_ = (idec->params_.options != NULL) &&
+                        (idec->params_.options->use_threads > 0);
+#else
+    dec->use_threads_ = 0;
+#endif
+    dec->alpha_data_ = headers.alpha_data;
+    dec->alpha_data_size_ = headers.alpha_data_size;
+    ChangeState(idec, STATE_VP8_FRAME_HEADER, headers.offset);
+  } else {
+    VP8LDecoder* const dec = VP8LNew();
+    if (dec == NULL) {
+      return VP8_STATUS_OUT_OF_MEMORY;
+    }
+    idec->dec_ = dec;
+    ChangeState(idec, STATE_VP8L_HEADER, headers.offset);
+  }
+  return VP8_STATUS_OK;
+}
+
+static VP8StatusCode DecodeVP8FrameHeader(WebPIDecoder* const idec) {
+  const uint8_t* data = idec->mem_.buf_ + idec->mem_.start_;
+  const size_t curr_size = MemDataSize(&idec->mem_);
+  uint32_t bits;
+
+  if (curr_size < VP8_FRAME_HEADER_SIZE) {
+    // Not enough data bytes to extract VP8 Frame Header.
+    return VP8_STATUS_SUSPENDED;
+  }
+  if (!VP8GetInfo(data, curr_size, idec->chunk_size_, NULL, NULL)) {
     return IDecError(idec, VP8_STATUS_BITSTREAM_ERROR);
   }
 
-  riff_header_size = idec->mem_.end_ - curr_size;
   bits = data[0] | (data[1] << 8) | (data[2] << 16);
+  idec->mem_.part0_size_ = (bits >> 5) + VP8_FRAME_HEADER_SIZE;
 
-  idec->mem_.part0_size_ = (bits >> 5) + VP8_HEADER_SIZE;
-  idec->mem_.start_ += riff_header_size;
-  assert(idec->mem_.start_ <= idec->mem_.end_);
-
-  idec->io_.data_size -= riff_header_size;
   idec->io_.data = data;
-  idec->state_ = STATE_PARTS0;
+  idec->io_.data_size = curr_size;
+  idec->state_ = STATE_VP8_PARTS0;
   return VP8_STATUS_OK;
 }
 
 // Partition #0
-static int CopyParts0Data(WebPIDecoder* idec) {
-  VP8BitReader* const br = &idec->dec_->br_;
+static int CopyParts0Data(WebPIDecoder* const idec) {
+  VP8Decoder* const dec = (VP8Decoder*)idec->dec_;
+  VP8BitReader* const br = &dec->br_;
   const size_t psize = br->buf_end_ - br->buf_;
   MemBuffer* const mem = &idec->mem_;
-  assert(!mem->part0_buf_);
+  assert(!idec->is_lossless_);
+  assert(mem->part0_buf_ == NULL);
   assert(psize > 0);
-  assert(psize <= mem->part0_size_);
+  assert(psize <= mem->part0_size_);  // Format limit: no need for runtime check
   if (mem->mode_ == MEM_MODE_APPEND) {
     // We copy and grab ownership of the partition #0 data.
     uint8_t* const part0_buf = (uint8_t*)malloc(psize);
-    if (!part0_buf) {
+    if (part0_buf == NULL) {
       return 0;
     }
     memcpy(part0_buf, br->buf_, psize);
     mem->part0_buf_ = part0_buf;
-    mem->start_ += psize;
     br->buf_ = part0_buf;
     br->buf_end_ = part0_buf + psize;
   } else {
     // Else: just keep pointers to the partition #0's data in dec_->br_.
   }
+  mem->start_ += psize;
   return 1;
 }
 
 static VP8StatusCode DecodePartition0(WebPIDecoder* const idec) {
-  VP8Decoder* const dec = idec->dec_;
+  VP8Decoder* const dec = (VP8Decoder*)idec->dec_;
   VP8Io* const io = &idec->io_;
   const WebPDecParams* const params = &idec->params_;
   WebPDecBuffer* const output = params->output;
@@ -324,43 +382,37 @@
     return IDecError(idec, dec->status_);
   }
 
-  // Allocate memory and prepare everything.
-  if (!VP8InitFrame(dec, io)) {
-    return IDecError(idec, dec->status_);
-  }
-
   if (!CopyParts0Data(idec)) {
     return IDecError(idec, VP8_STATUS_OUT_OF_MEMORY);
   }
 
-  // Finish setting up the decoding parameters.
-  if (VP8FinishFrameSetup(dec, io) != VP8_STATUS_OK) {
+  // Finish setting up the decoding parameters. Will call io->setup().
+  if (VP8EnterCritical(dec, io) != VP8_STATUS_OK) {
     return IDecError(idec, dec->status_);
   }
+
   // Note: past this point, teardown() must always be called
   // in case of error.
-  idec->state_ = STATE_DATA;
+  idec->state_ = STATE_VP8_DATA;
+  // Allocate memory and prepare everything.
+  if (!VP8InitFrame(dec, io)) {
+    return IDecError(idec, dec->status_);
+  }
   return VP8_STATUS_OK;
 }
 
 // Remaining partitions
 static VP8StatusCode DecodeRemaining(WebPIDecoder* const idec) {
-  VP8BitReader*  br;
-  VP8Decoder* const dec = idec->dec_;
+  VP8Decoder* const dec = (VP8Decoder*)idec->dec_;
   VP8Io* const io = &idec->io_;
 
   assert(dec->ready_);
 
-  br = &dec->br_;
   for (; dec->mb_y_ < dec->mb_h_; ++dec->mb_y_) {
     VP8BitReader* token_br = &dec->parts_[dec->mb_y_ & (dec->num_parts_ - 1)];
     if (dec->mb_x_ == 0) {
-      VP8MB* const left = dec->mb_info_ - 1;
-      left->nz_ = 0;
-      left->dc_nz_ = 0;
-      memset(dec->intra_l_, B_DC_PRED, sizeof(dec->intra_l_));
+      VP8InitScanline(dec);
     }
-
     for (; dec->mb_x_ < dec->mb_w_;  dec->mb_x_++) {
       MBContext context;
       SaveContext(dec, token_br, &context);
@@ -383,17 +435,14 @@
         assert(idec->mem_.start_ <= idec->mem_.end_);
       }
     }
-    if (dec->filter_type_ > 0) {
-      VP8FilterRow(dec);
-    }
-    if (!VP8FinishRow(dec, io)) {
+    if (!VP8ProcessRow(dec, io)) {
       return IDecError(idec, VP8_STATUS_USER_ABORT);
     }
     dec->mb_x_ = 0;
   }
-
-  if (io->teardown) {
-    io->teardown(io);
+  // Synchronize the thread and check for errors.
+  if (!VP8ExitCritical(dec, io)) {
+    return IDecError(idec, VP8_STATUS_USER_ABORT);
   }
   dec->ready_ = 0;
   idec->state_ = STATE_DONE;
@@ -401,39 +450,99 @@
   return VP8_STATUS_OK;
 }
 
+static int ErrorStatusLossless(WebPIDecoder* const idec, VP8StatusCode status) {
+  if (status == VP8_STATUS_SUSPENDED || status == VP8_STATUS_NOT_ENOUGH_DATA) {
+    return VP8_STATUS_SUSPENDED;
+  }
+  return IDecError(idec, status);
+}
+
+static VP8StatusCode DecodeVP8LHeader(WebPIDecoder* const idec) {
+  VP8Io* const io = &idec->io_;
+  VP8LDecoder* const dec = (VP8LDecoder*)idec->dec_;
+  const WebPDecParams* const params = &idec->params_;
+  WebPDecBuffer* const output = params->output;
+  size_t curr_size = MemDataSize(&idec->mem_);
+  assert(idec->is_lossless_);
+
+  // Wait until there's enough data to decode the header.
+  if (curr_size < (idec->chunk_size_ >> 3)) {
+    return VP8_STATUS_SUSPENDED;
+  }
+  if (!VP8LDecodeHeader(dec, io)) {
+    return ErrorStatusLossless(idec, dec->status_);
+  }
+  // Allocate/verify output buffer now.
+  dec->status_ = WebPAllocateDecBuffer(io->width, io->height, params->options,
+                                       output);
+  if (dec->status_ != VP8_STATUS_OK) {
+    return IDecError(idec, dec->status_);
+  }
+
+  idec->state_ = STATE_VP8L_DATA;
+  return VP8_STATUS_OK;
+}
+
+static VP8StatusCode DecodeVP8LData(WebPIDecoder* const idec) {
+  VP8LDecoder* const dec = (VP8LDecoder*)idec->dec_;
+  const size_t curr_size = MemDataSize(&idec->mem_);
+  assert(idec->is_lossless_);
+
+  // At present the lossless decoder cannot decode the image incrementally,
+  // so wait until all of the image data has arrived before decoding.
+  if (curr_size < idec->chunk_size_) {
+    return VP8_STATUS_SUSPENDED;
+  }
+
+  if (!VP8LDecodeImage(dec)) {
+    return ErrorStatusLossless(idec, dec->status_);
+  }
+
+  idec->state_ = STATE_DONE;
+
+  return VP8_STATUS_OK;
+}
+
   // Main decoding loop
 static VP8StatusCode IDecode(WebPIDecoder* idec) {
   VP8StatusCode status = VP8_STATUS_SUSPENDED;
-  assert(idec->dec_);
 
-  if (idec->state_ == STATE_HEADER) {
-    status = DecodeHeader(idec);
+  if (idec->state_ == STATE_PRE_VP8) {
+    status = DecodeWebPHeaders(idec);
+  } else {
+    if (idec->dec_ == NULL) {
+      return VP8_STATUS_SUSPENDED;    // can't continue if we have no decoder.
+    }
   }
-  if (idec->state_ == STATE_PARTS0) {
+  if (idec->state_ == STATE_VP8_FRAME_HEADER) {
+    status = DecodeVP8FrameHeader(idec);
+  }
+  if (idec->state_ == STATE_VP8_PARTS0) {
     status = DecodePartition0(idec);
   }
-  if (idec->state_ == STATE_DATA) {
+  if (idec->state_ == STATE_VP8_DATA) {
     status = DecodeRemaining(idec);
   }
+  if (idec->state_ == STATE_VP8L_HEADER) {
+    status = DecodeVP8LHeader(idec);
+  }
+  if (idec->state_ == STATE_VP8L_DATA) {
+    status = DecodeVP8LData(idec);
+  }
   return status;
 }
 
 //------------------------------------------------------------------------------
 // Public functions
 
-WebPIDecoder* WebPINewDecoder(WebPDecBuffer* const output_buffer) {
-  WebPIDecoder* idec = (WebPIDecoder*)calloc(1, sizeof(WebPIDecoder));
+WebPIDecoder* WebPINewDecoder(WebPDecBuffer* output_buffer) {
+  WebPIDecoder* idec = (WebPIDecoder*)calloc(1, sizeof(*idec));
   if (idec == NULL) {
     return NULL;
   }
 
-  idec->dec_ = VP8New();
-  if (idec->dec_ == NULL) {
-    free(idec);
-    return NULL;
-  }
-
-  idec->state_ = STATE_HEADER;
+  idec->state_ = STATE_PRE_VP8;
+  idec->chunk_size_ = 0;
 
   InitMemBuffer(&idec->mem_);
   WebPInitDecBuffer(&idec->output_);
@@ -446,8 +555,8 @@
   return idec;
 }
 
-WebPIDecoder* WebPIDecode(const uint8_t* data, uint32_t data_size,
-                          WebPDecoderConfig* const config) {
+WebPIDecoder* WebPIDecode(const uint8_t* data, size_t data_size,
+                          WebPDecoderConfig* config) {
   WebPIDecoder* idec;
 
   // Parse the bitstream's features, if requested:
@@ -458,7 +567,7 @@
   }
   // Create an instance of the incremental decoder
   idec = WebPINewDecoder(config ? &config->output : NULL);
-  if (!idec) {
+  if (idec == NULL) {
     return NULL;
   }
   // Finish initialization
@@ -468,9 +577,15 @@
   return idec;
 }
 
-void WebPIDelete(WebPIDecoder* const idec) {
-  if (!idec) return;
-  VP8Delete(idec->dec_);
+void WebPIDelete(WebPIDecoder* idec) {
+  if (idec == NULL) return;
+  if (idec->dec_ != NULL) {
+    if (!idec->is_lossless_) {
+      VP8Delete(idec->dec_);
+    } else {
+      VP8LDelete(idec->dec_);
+    }
+  }
   ClearMemBuffer(&idec->mem_);
   WebPFreeDecBuffer(&idec->output_);
   free(idec);
@@ -479,19 +594,12 @@
 //------------------------------------------------------------------------------
 // Wrapper toward WebPINewDecoder
 
-WebPIDecoder* WebPINew(WEBP_CSP_MODE mode) {
-  WebPIDecoder* const idec = WebPINewDecoder(NULL);
-  if (!idec) return NULL;
-  idec->output_.colorspace = mode;
-  return idec;
-}
-
 WebPIDecoder* WebPINewRGB(WEBP_CSP_MODE mode, uint8_t* output_buffer,
-                          int output_buffer_size, int output_stride) {
+                          size_t output_buffer_size, int output_stride) {
   WebPIDecoder* idec;
   if (mode >= MODE_YUV) return NULL;
   idec = WebPINewDecoder(NULL);
-  if (!idec) return NULL;
+  if (idec == NULL) return NULL;
   idec->output_.colorspace = mode;
   idec->output_.is_external_memory = 1;
   idec->output_.u.RGBA.rgba = output_buffer;
@@ -500,12 +608,13 @@
   return idec;
 }
 
-WebPIDecoder* WebPINewYUV(uint8_t* luma, int luma_size, int luma_stride,
-                          uint8_t* u, int u_size, int u_stride,
-                          uint8_t* v, int v_size, int v_stride) {
+WebPIDecoder* WebPINewYUVA(uint8_t* luma, size_t luma_size, int luma_stride,
+                           uint8_t* u, size_t u_size, int u_stride,
+                           uint8_t* v, size_t v_size, int v_stride,
+                           uint8_t* a, size_t a_size, int a_stride) {
   WebPIDecoder* const idec = WebPINewDecoder(NULL);
-  if (!idec) return NULL;
-  idec->output_.colorspace = MODE_YUV;
+  if (idec == NULL) return NULL;
+  idec->output_.colorspace = (a == NULL) ? MODE_YUV : MODE_YUVA;
   idec->output_.is_external_memory = 1;
   idec->output_.u.YUVA.y = luma;
   idec->output_.u.YUVA.y_stride = luma_stride;
@@ -516,16 +625,25 @@
   idec->output_.u.YUVA.v = v;
   idec->output_.u.YUVA.v_stride = v_stride;
   idec->output_.u.YUVA.v_size = v_size;
+  idec->output_.u.YUVA.a = a;
+  idec->output_.u.YUVA.a_stride = a_stride;
+  idec->output_.u.YUVA.a_size = a_size;
   return idec;
 }
 
+WebPIDecoder* WebPINewYUV(uint8_t* luma, size_t luma_size, int luma_stride,
+                          uint8_t* u, size_t u_size, int u_stride,
+                          uint8_t* v, size_t v_size, int v_stride) {
+  return WebPINewYUVA(luma, luma_size, luma_stride,
+                      u, u_size, u_stride,
+                      v, v_size, v_stride,
+                      NULL, 0, 0);
+}
+
 //------------------------------------------------------------------------------
 
 static VP8StatusCode IDecCheckStatus(const WebPIDecoder* const idec) {
   assert(idec);
-  if (idec->dec_ == NULL) {
-    return VP8_STATUS_USER_ABORT;
-  }
   if (idec->state_ == STATE_ERROR) {
     return VP8_STATUS_BITSTREAM_ERROR;
   }
@@ -535,8 +653,8 @@
   return VP8_STATUS_SUSPENDED;
 }
 
-VP8StatusCode WebPIAppend(WebPIDecoder* const idec, const uint8_t* data,
-                          uint32_t data_size) {
+VP8StatusCode WebPIAppend(WebPIDecoder* idec,
+                          const uint8_t* data, size_t data_size) {
   VP8StatusCode status;
   if (idec == NULL || data == NULL) {
     return VP8_STATUS_INVALID_PARAM;
@@ -556,8 +674,8 @@
   return IDecode(idec);
 }
 
-VP8StatusCode WebPIUpdate(WebPIDecoder* const idec, const uint8_t* data,
-                          uint32_t data_size) {
+VP8StatusCode WebPIUpdate(WebPIDecoder* idec,
+                          const uint8_t* data, size_t data_size) {
   VP8StatusCode status;
   if (idec == NULL || data == NULL) {
     return VP8_STATUS_INVALID_PARAM;
@@ -580,61 +698,67 @@
 //------------------------------------------------------------------------------
 
 static const WebPDecBuffer* GetOutputBuffer(const WebPIDecoder* const idec) {
-  if (!idec || !idec->dec_ || idec->state_ <= STATE_PARTS0) {
+  if (idec == NULL || idec->dec_ == NULL) {
+    return NULL;
+  }
+  if (idec->state_ <= STATE_VP8_PARTS0) {
     return NULL;
   }
   return idec->params_.output;
 }
 
-const WebPDecBuffer* WebPIDecodedArea(const WebPIDecoder* const idec,
-                                      int* const left, int* const top,
-                                      int* const width, int* const height) {
+const WebPDecBuffer* WebPIDecodedArea(const WebPIDecoder* idec,
+                                      int* left, int* top,
+                                      int* width, int* height) {
   const WebPDecBuffer* const src = GetOutputBuffer(idec);
-  if (left) *left = 0;
-  if (top) *top = 0;
+  if (left != NULL) *left = 0;
+  if (top != NULL) *top = 0;
   // TODO(skal): later include handling of rotations.
   if (src) {
-    if (width) *width = src->width;
-    if (height) *height = idec->params_.last_y;
+    if (width != NULL) *width = src->width;
+    if (height != NULL) *height = idec->params_.last_y;
   } else {
-    if (width) *width = 0;
-    if (height) *height = 0;
+    if (width != NULL) *width = 0;
+    if (height != NULL) *height = 0;
   }
   return src;
 }
 
-uint8_t* WebPIDecGetRGB(const WebPIDecoder* const idec, int* last_y,
+uint8_t* WebPIDecGetRGB(const WebPIDecoder* idec, int* last_y,
                         int* width, int* height, int* stride) {
   const WebPDecBuffer* const src = GetOutputBuffer(idec);
-  if (!src) return NULL;
+  if (src == NULL) return NULL;
   if (src->colorspace >= MODE_YUV) {
     return NULL;
   }
 
-  if (last_y) *last_y = idec->params_.last_y;
-  if (width) *width = src->width;
-  if (height) *height = src->height;
-  if (stride) *stride = src->u.RGBA.stride;
+  if (last_y != NULL) *last_y = idec->params_.last_y;
+  if (width != NULL) *width = src->width;
+  if (height != NULL) *height = src->height;
+  if (stride != NULL) *stride = src->u.RGBA.stride;
 
   return src->u.RGBA.rgba;
 }
 
-uint8_t* WebPIDecGetYUV(const WebPIDecoder* const idec, int* last_y,
-                        uint8_t** u, uint8_t** v,
-                        int* width, int* height, int *stride, int* uv_stride) {
+uint8_t* WebPIDecGetYUVA(const WebPIDecoder* idec, int* last_y,
+                         uint8_t** u, uint8_t** v, uint8_t** a,
+                         int* width, int* height,
+                         int* stride, int* uv_stride, int* a_stride) {
   const WebPDecBuffer* const src = GetOutputBuffer(idec);
-  if (!src) return NULL;
+  if (src == NULL) return NULL;
   if (src->colorspace < MODE_YUV) {
     return NULL;
   }
 
-  if (last_y) *last_y = idec->params_.last_y;
-  if (u) *u = src->u.YUVA.u;
-  if (v) *v = src->u.YUVA.v;
-  if (width) *width = src->width;
-  if (height) *height = src->height;
-  if (stride) *stride = src->u.YUVA.y_stride;
-  if (uv_stride) *uv_stride = src->u.YUVA.u_stride;
+  if (last_y != NULL) *last_y = idec->params_.last_y;
+  if (u != NULL) *u = src->u.YUVA.u;
+  if (v != NULL) *v = src->u.YUVA.v;
+  if (a != NULL) *a = src->u.YUVA.a;
+  if (width != NULL) *width = src->width;
+  if (height != NULL) *height = src->height;
+  if (stride != NULL) *stride = src->u.YUVA.y_stride;
+  if (uv_stride != NULL) *uv_stride = src->u.YUVA.u_stride;
+  if (a_stride != NULL) *a_stride = src->u.YUVA.a_stride;
 
   return src->u.YUVA.y;
 }
@@ -644,7 +768,7 @@
                     VP8IoSetupHook setup,
                     VP8IoTeardownHook teardown,
                     void* user_data) {
-  if (!idec || !idec->dec_ || idec->state_ > STATE_HEADER) {
+  if (idec == NULL || idec->state_ > STATE_PRE_VP8) {
     return 0;
   }
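
To make the reworked incremental interface above concrete, here is a hedged usage sketch built only from entry points visible in this diff (WebPINewDecoder(), WebPIAppend(), WebPIDecGetRGB(), WebPIDelete(), WebPInitDecBuffer(), WebPFreeDecBuffer()). The "webp/decode.h" include path and the MODE_RGBA output mode are assumptions, and the chunking scheme is arbitrary.

    #include <stdio.h>
    #include "webp/decode.h"   // assumed public header for these declarations

    // Feed a WebP byte stream to the incremental decoder in 'step'-sized
    // pieces and report how many rows are ready after each append.
    static int IncrementalDecode(const uint8_t* data, size_t data_size,
                                 size_t step) {
      size_t offset = 0;
      VP8StatusCode status = VP8_STATUS_SUSPENDED;
      WebPDecBuffer buffer;
      WebPIDecoder* idec;

      WebPInitDecBuffer(&buffer);
      buffer.colorspace = MODE_RGBA;           // assumption: decode to RGBA
      idec = WebPINewDecoder(&buffer);
      if (idec == NULL) return 0;

      while (offset < data_size && status == VP8_STATUS_SUSPENDED) {
        const size_t left = data_size - offset;
        const size_t size = (left < step) ? left : step;
        status = WebPIAppend(idec, data + offset, size);
        offset += size;
        if (status == VP8_STATUS_SUSPENDED) {  // report partial progress
          int last_y = 0, width = 0, height = 0, stride = 0;
          if (WebPIDecGetRGB(idec, &last_y, &width, &height, &stride) != NULL) {
            printf("decoded %d/%d rows so far\n", last_y, height);
          }
        }
      }
      WebPIDelete(idec);
      WebPFreeDecBuffer(&buffer);   // 'buffer' is external, so we free it
      return (status == VP8_STATUS_OK);
    }

Note that WebPIAppend() returns VP8_STATUS_SUSPENDED while more data is needed and VP8_STATUS_OK once the picture is complete; any other status is treated as an error here.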
 
diff --git a/src/dec/io.c b/src/dec/io.c
index f1137e4..c5746f7 100644
--- a/src/dec/io.c
+++ b/src/dec/io.c
@@ -1,4 +1,4 @@
-// Copyright 2011 Google Inc.
+// Copyright 2011 Google Inc. All Rights Reserved.
 //
 // This code is licensed under the same terms as WebM:
 //  Software License Agreement:  http://www.webmproject.org/license/software/
@@ -11,224 +11,15 @@
 
 #include <assert.h>
 #include <stdlib.h>
-#include "vp8i.h"
-#include "webpi.h"
-#include "yuv.h"
+#include "../dec/vp8i.h"
+#include "./webpi.h"
+#include "../dsp/dsp.h"
+#include "../dsp/yuv.h"
 
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 
-#define FANCY_UPSAMPLING   // undefined to remove fancy upsampling support
-
-//------------------------------------------------------------------------------
-// Fancy upsampler
-
-#ifdef FANCY_UPSAMPLING
-
-// Given samples laid out in a square as:
-//  [a b]
-//  [c d]
-// we interpolate u/v as:
-//  ([9*a + 3*b + 3*c +   d    3*a + 9*b + 3*c +   d] + [8 8]) / 16
-//  ([3*a +   b + 9*c + 3*d      a + 3*b + 3*c + 9*d]   [8 8]) / 16
-
-// We process u and v together stashed into 32bit (16bit each).
-#define LOAD_UV(u,v) ((u) | ((v) << 16))
-
-#define UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP)                                  \
-static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
-                      const uint8_t* top_u, const uint8_t* top_v,              \
-                      const uint8_t* cur_u, const uint8_t* cur_v,              \
-                      uint8_t* top_dst, uint8_t* bottom_dst, int len) {        \
-  int x;                                                                       \
-  const int last_pixel_pair = (len - 1) >> 1;                                  \
-  uint32_t tl_uv = LOAD_UV(top_u[0], top_v[0]);   /* top-left sample */        \
-  uint32_t l_uv  = LOAD_UV(cur_u[0], cur_v[0]);   /* left-sample */            \
-  if (top_y) {                                                                 \
-    const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2;                \
-    FUNC(top_y[0], uv0 & 0xff, (uv0 >> 16), top_dst);                          \
-  }                                                                            \
-  if (bottom_y) {                                                              \
-    const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2;                \
-    FUNC(bottom_y[0], uv0 & 0xff, (uv0 >> 16), bottom_dst);                    \
-  }                                                                            \
-  for (x = 1; x <= last_pixel_pair; ++x) {                                     \
-    const uint32_t t_uv = LOAD_UV(top_u[x], top_v[x]);  /* top sample */       \
-    const uint32_t uv   = LOAD_UV(cur_u[x], cur_v[x]);  /* sample */           \
-    /* precompute invariant values associated with first and second diagonals*/\
-    const uint32_t avg = tl_uv + t_uv + l_uv + uv + 0x00080008u;               \
-    const uint32_t diag_12 = (avg + 2 * (t_uv + l_uv)) >> 3;                   \
-    const uint32_t diag_03 = (avg + 2 * (tl_uv + uv)) >> 3;                    \
-    if (top_y) {                                                               \
-      const uint32_t uv0 = (diag_12 + tl_uv) >> 1;                             \
-      const uint32_t uv1 = (diag_03 + t_uv) >> 1;                              \
-      FUNC(top_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16),                          \
-           top_dst + (2 * x - 1) * XSTEP);                                     \
-      FUNC(top_y[2 * x - 0], uv1 & 0xff, (uv1 >> 16),                          \
-           top_dst + (2 * x - 0) * XSTEP);                                     \
-    }                                                                          \
-    if (bottom_y) {                                                            \
-      const uint32_t uv0 = (diag_03 + l_uv) >> 1;                              \
-      const uint32_t uv1 = (diag_12 + uv) >> 1;                                \
-      FUNC(bottom_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16),                       \
-           bottom_dst + (2 * x - 1) * XSTEP);                                  \
-      FUNC(bottom_y[2 * x + 0], uv1 & 0xff, (uv1 >> 16),                       \
-           bottom_dst + (2 * x + 0) * XSTEP);                                  \
-    }                                                                          \
-    tl_uv = t_uv;                                                              \
-    l_uv = uv;                                                                 \
-  }                                                                            \
-  if (!(len & 1)) {                                                            \
-    if (top_y) {                                                               \
-      const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2;              \
-      FUNC(top_y[len - 1], uv0 & 0xff, (uv0 >> 16),                            \
-           top_dst + (len - 1) * XSTEP);                                       \
-    }                                                                          \
-    if (bottom_y) {                                                            \
-      const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2;              \
-      FUNC(bottom_y[len - 1], uv0 & 0xff, (uv0 >> 16),                         \
-           bottom_dst + (len - 1) * XSTEP);                                    \
-    }                                                                          \
-  }                                                                            \
-}
-
-// All variants implemented.
-UPSAMPLE_FUNC(UpsampleRgbLinePair,  VP8YuvToRgb,  3)
-UPSAMPLE_FUNC(UpsampleBgrLinePair,  VP8YuvToBgr,  3)
-UPSAMPLE_FUNC(UpsampleRgbaLinePair, VP8YuvToRgba, 4)
-UPSAMPLE_FUNC(UpsampleBgraLinePair, VP8YuvToBgra, 4)
-UPSAMPLE_FUNC(UpsampleArgbLinePair, VP8YuvToArgb, 4)
-UPSAMPLE_FUNC(UpsampleRgba4444LinePair, VP8YuvToRgba4444, 2)
-UPSAMPLE_FUNC(UpsampleRgb565LinePair,  VP8YuvToRgb565,  2)
-// These two don't erase the alpha value
-UPSAMPLE_FUNC(UpsampleRgbKeepAlphaLinePair, VP8YuvToRgb, 4)
-UPSAMPLE_FUNC(UpsampleBgrKeepAlphaLinePair, VP8YuvToBgr, 4)
-UPSAMPLE_FUNC(UpsampleArgbKeepAlphaLinePair, VP8YuvToArgbKeepA, 4)
-UPSAMPLE_FUNC(UpsampleRgba4444KeepAlphaLinePair, VP8YuvToRgba4444KeepA, 2)
-
-#undef LOAD_UV
-#undef UPSAMPLE_FUNC
-
-// Fancy upsampling functions to convert YUV to RGB
-WebPUpsampleLinePairFunc WebPUpsamplers[MODE_LAST];
-WebPUpsampleLinePairFunc WebPUpsamplersKeepAlpha[MODE_LAST];
-
-static void InitUpsamplers(void) {
-  WebPUpsamplers[MODE_RGB]       = UpsampleRgbLinePair;
-  WebPUpsamplers[MODE_RGBA]      = UpsampleRgbaLinePair;
-  WebPUpsamplers[MODE_BGR]       = UpsampleBgrLinePair;
-  WebPUpsamplers[MODE_BGRA]      = UpsampleBgraLinePair;
-  WebPUpsamplers[MODE_ARGB]      = UpsampleArgbLinePair;
-  WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair;
-  WebPUpsamplers[MODE_RGB_565]   = UpsampleRgb565LinePair;
-
-  WebPUpsamplersKeepAlpha[MODE_RGB]       = UpsampleRgbLinePair;
-  WebPUpsamplersKeepAlpha[MODE_RGBA]      = UpsampleRgbKeepAlphaLinePair;
-  WebPUpsamplersKeepAlpha[MODE_BGR]       = UpsampleBgrLinePair;
-  WebPUpsamplersKeepAlpha[MODE_BGRA]      = UpsampleBgrKeepAlphaLinePair;
-  WebPUpsamplersKeepAlpha[MODE_ARGB]      = UpsampleArgbKeepAlphaLinePair;
-  WebPUpsamplersKeepAlpha[MODE_RGBA_4444] = UpsampleRgba4444KeepAlphaLinePair;
-  WebPUpsamplersKeepAlpha[MODE_RGB_565]   = UpsampleRgb565LinePair;
-
-  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
-  if (VP8DecGetCPUInfo) {
-    if (VP8DecGetCPUInfo(kSSE2)) {
-#if defined(__SSE2__) || defined(_MSC_VER)
-      WebPInitUpsamplersSSE2();
-#endif
-    }
-  }
-}
-
-#endif  // FANCY_UPSAMPLING
-
-//------------------------------------------------------------------------------
-// simple point-sampling
-
-#define SAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP)                                    \
-static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
-                      const uint8_t* u, const uint8_t* v,                      \
-                      uint8_t* top_dst, uint8_t* bottom_dst, int len) {        \
-  int i;                                                                       \
-  for (i = 0; i < len - 1; i += 2) {                                           \
-    FUNC(top_y[0], u[0], v[0], top_dst);                                       \
-    FUNC(top_y[1], u[0], v[0], top_dst + XSTEP);                               \
-    FUNC(bottom_y[0], u[0], v[0], bottom_dst);                                 \
-    FUNC(bottom_y[1], u[0], v[0], bottom_dst + XSTEP);                         \
-    top_y += 2;                                                                \
-    bottom_y += 2;                                                             \
-    u++;                                                                       \
-    v++;                                                                       \
-    top_dst += 2 * XSTEP;                                                      \
-    bottom_dst += 2 * XSTEP;                                                   \
-  }                                                                            \
-  if (i == len - 1) {    /* last one */                                        \
-    FUNC(top_y[0], u[0], v[0], top_dst);                                       \
-    FUNC(bottom_y[0], u[0], v[0], bottom_dst);                                 \
-  }                                                                            \
-}
-
-// All variants implemented.
-SAMPLE_FUNC(SampleRgbLinePair,      VP8YuvToRgb,  3)
-SAMPLE_FUNC(SampleBgrLinePair,      VP8YuvToBgr,  3)
-SAMPLE_FUNC(SampleRgbaLinePair,     VP8YuvToRgba, 4)
-SAMPLE_FUNC(SampleBgraLinePair,     VP8YuvToBgra, 4)
-SAMPLE_FUNC(SampleArgbLinePair,     VP8YuvToArgb, 4)
-SAMPLE_FUNC(SampleRgba4444LinePair, VP8YuvToRgba4444, 2)
-SAMPLE_FUNC(SampleRgb565LinePair,   VP8YuvToRgb565, 2)
-
-#undef SAMPLE_FUNC
-
-// Main methods.
-typedef void (*SampleLinePairFunc)(
-  const uint8_t* top_y, const uint8_t* bottom_y,
-  const uint8_t* u, const uint8_t* v,
-  uint8_t* top_dst, uint8_t* bottom_dst, int len);
-
-static const SampleLinePairFunc kSamplers[MODE_LAST] = {
-  SampleRgbLinePair,       // MODE_RGB
-  SampleRgbaLinePair,      // MODE_RGBA
-  SampleBgrLinePair,       // MODE_BGR
-  SampleBgraLinePair,      // MODE_BGRA
-  SampleArgbLinePair,      // MODE_ARGB
-  SampleRgba4444LinePair,  // MODE_RGBA_4444
-  SampleRgb565LinePair     // MODE_RGB_565
-};
-
-//------------------------------------------------------------------------------
-// YUV444 converter
-
-#define YUV444_FUNC(FUNC_NAME, FUNC, XSTEP)                                    \
-static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v,    \
-                      uint8_t* dst, int len) {                                 \
-  int i;                                                                       \
-  for (i = 0; i < len; ++i) FUNC(y[i], u[i], v[i], &dst[i * XSTEP]);           \
-}
-
-YUV444_FUNC(Yuv444ToRgb,      VP8YuvToRgb,  3)
-YUV444_FUNC(Yuv444ToBgr,      VP8YuvToBgr,  3)
-YUV444_FUNC(Yuv444ToRgba,     VP8YuvToRgba, 4)
-YUV444_FUNC(Yuv444ToBgra,     VP8YuvToBgra, 4)
-YUV444_FUNC(Yuv444ToArgb,     VP8YuvToArgb, 4)
-YUV444_FUNC(Yuv444ToRgba4444, VP8YuvToRgba4444, 2)
-YUV444_FUNC(Yuv444ToRgb565,   VP8YuvToRgb565, 2)
-
-#undef YUV444_FUNC
-
-typedef void (*YUV444Func)(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                           uint8_t* dst, int len);
-
-static const YUV444Func kYUV444Converters[MODE_LAST] = {
-  Yuv444ToRgb,       // MODE_RGB
-  Yuv444ToRgba,      // MODE_RGBA
-  Yuv444ToBgr,       // MODE_BGR
-  Yuv444ToBgra,      // MODE_BGRA
-  Yuv444ToArgb,      // MODE_ARGB
-  Yuv444ToRgba4444,  // MODE_RGBA_4444
-  Yuv444ToRgb565     // MODE_RGB_565
-};
-
 //------------------------------------------------------------------------------
 // Main YUV<->RGB conversion functions
 
@@ -241,11 +32,12 @@
   const int mb_w = io->mb_w;
   const int mb_h = io->mb_h;
   const int uv_w = (mb_w + 1) / 2;
+  const int uv_h = (mb_h + 1) / 2;
   int j;
   for (j = 0; j < mb_h; ++j) {
     memcpy(y_dst + j * buf->y_stride, io->y + j * io->y_stride, mb_w);
   }
-  for (j = 0; j < (mb_h + 1) / 2; ++j) {
+  for (j = 0; j < uv_h; ++j) {
     memcpy(u_dst + j * buf->u_stride, io->u + j * io->uv_stride, uv_w);
     memcpy(v_dst + j * buf->v_stride, io->v + j * io->uv_stride, uv_w);
   }
@@ -260,7 +52,7 @@
   const uint8_t* y_src = io->y;
   const uint8_t* u_src = io->u;
   const uint8_t* v_src = io->v;
-  const SampleLinePairFunc sample = kSamplers[output->colorspace];
+  const WebPSampleLinePairFunc sample = WebPSamplers[output->colorspace];
   const int mb_w = io->mb_w;
   const int last = io->mb_h - 1;
   int j;
@@ -289,7 +81,7 @@
   const uint8_t* y_src = io->y;
   const uint8_t* u_src = io->u;
   const uint8_t* v_src = io->v;
-  const YUV444Func convert = kYUV444Converters[output->colorspace];
+  const WebPYUV444Converter convert = WebPYUV444Converters[output->colorspace];
   const int mb_w = io->mb_w;
   const int last = io->mb_h;
   int j;
@@ -312,9 +104,7 @@
   int num_lines_out = io->mb_h;   // a priori guess
   const WebPRGBABuffer* const buf = &p->output->u.RGBA;
   uint8_t* dst = buf->rgba + io->mb_y * buf->stride;
-  const WebPUpsampleLinePairFunc upsample =
-      io->a ? WebPUpsamplersKeepAlpha[p->output->colorspace]
-            : WebPUpsamplers[p->output->colorspace];
+  WebPUpsampleLinePairFunc upsample = WebPUpsamplers[p->output->colorspace];
   const uint8_t* cur_y = io->y;
   const uint8_t* cur_u = io->u;
   const uint8_t* cur_v = io->v;
@@ -330,11 +120,9 @@
     upsample(NULL, cur_y, cur_u, cur_v, cur_u, cur_v, NULL, dst, mb_w);
   } else {
     // We can finish the left-over line from previous call.
-    // Warning! Don't overwrite the alpha values (if any), as they
-    // are not lagging one line behind but are already written.
     upsample(p->tmp_y, cur_y, top_u, top_v, cur_u, cur_v,
              dst - buf->stride, dst, mb_w);
-    num_lines_out++;
+    ++num_lines_out;
   }
   // Loop over each output pair of rows.
   for (; y + 2 < y_end; y += 2) {
@@ -372,151 +160,133 @@
 
 //------------------------------------------------------------------------------
 
-#ifdef WEBP_EXPERIMENTAL_FEATURES
 static int EmitAlphaYUV(const VP8Io* const io, WebPDecParams* const p) {
+  const uint8_t* alpha = io->a;
+  const WebPYUVABuffer* const buf = &p->output->u.YUVA;
   const int mb_w = io->mb_w;
   const int mb_h = io->mb_h;
-  int j;
-  const WebPYUVABuffer* const buf = &p->output->u.YUVA;
   uint8_t* dst = buf->a + io->mb_y * buf->a_stride;
-  const uint8_t* alpha = io->a;
-  if (alpha) {
+  int j;
+
+  if (alpha != NULL) {
     for (j = 0; j < mb_h; ++j) {
       memcpy(dst, alpha, mb_w * sizeof(*dst));
       alpha += io->width;
       dst += buf->a_stride;
     }
+  } else if (buf->a != NULL) {
+    // the user requested alpha, but there is none, set it to opaque.
+    for (j = 0; j < mb_h; ++j) {
+      memset(dst, 0xff, mb_w * sizeof(*dst));
+      dst += buf->a_stride;
+    }
   }
   return 0;
 }
 
+static int GetAlphaSourceRow(const VP8Io* const io,
+                             const uint8_t** alpha, int* const num_rows) {
+  int start_y = io->mb_y;
+  *num_rows = io->mb_h;
+
+  // Compensate for the 1-line delay of the fancy upscaler.
+  // This is similar to EmitFancyRGB().
+  if (io->fancy_upsampling) {
+    if (start_y == 0) {
+      // We don't process the last row yet. It'll be done during the next call.
+      --*num_rows;
+    } else {
+      --start_y;
+      // Fortunately, *alpha data is persistent, so we can go back
+      // one row and finish alpha blending, now that the fancy upscaler
+      // completed the YUV->RGB interpolation.
+      *alpha -= io->width;
+    }
+    if (io->crop_top + io->mb_y + io->mb_h == io->crop_bottom) {
+      // If it's the very last call, we process all the remaining rows!
+      *num_rows = io->crop_bottom - io->crop_top - start_y;
+    }
+  }
+  return start_y;
+}
+
 static int EmitAlphaRGB(const VP8Io* const io, WebPDecParams* const p) {
-  const int mb_w = io->mb_w;
-  const int mb_h = io->mb_h;
-  int i, j;
-  const WebPRGBABuffer* const buf = &p->output->u.RGBA;
-  uint8_t* dst = buf->rgba + io->mb_y * buf->stride;
   const uint8_t* alpha = io->a;
-  if (alpha) {
-    for (j = 0; j < mb_h; ++j) {
-      for (i = 0; i < mb_w; ++i) {
-        dst[4 * i + 3] = alpha[i];
+  if (alpha != NULL) {
+    const int mb_w = io->mb_w;
+    int i, j;
+    const WEBP_CSP_MODE colorspace = p->output->colorspace;
+    const int alpha_first =
+        (colorspace == MODE_ARGB || colorspace == MODE_Argb);
+    const WebPRGBABuffer* const buf = &p->output->u.RGBA;
+    int num_rows;
+    const int start_y = GetAlphaSourceRow(io, &alpha, &num_rows);
+    uint32_t alpha_mask = 0xff;
+
+    {
+      uint8_t* const base_rgba = buf->rgba + start_y * buf->stride;
+      uint8_t* dst = base_rgba + (alpha_first ? 0 : 3);
+      for (j = 0; j < num_rows; ++j) {
+        for (i = 0; i < mb_w; ++i) {
+          const uint32_t alpha_value = alpha[i];
+          dst[4 * i] = alpha_value;
+          alpha_mask &= alpha_value;
+        }
+        alpha += io->width;
+        dst += buf->stride;
       }
-      alpha += io->width;
-      dst += buf->stride;
+      // alpha_mask is < 0xff if there's non-trivial alpha to premultiply with.
+      if (alpha_mask != 0xff && WebPIsPremultipliedMode(colorspace)) {
+        WebPApplyAlphaMultiply(base_rgba, alpha_first,
+                               mb_w, num_rows, buf->stride);
+      }
     }
   }
   return 0;
 }
 
-#endif    /* WEBP_EXPERIMENTAL_FEATURES */
+static int EmitAlphaRGBA4444(const VP8Io* const io, WebPDecParams* const p) {
+  const uint8_t* alpha = io->a;
+  if (alpha != NULL) {
+    const int mb_w = io->mb_w;
+    int i, j;
+    const WebPRGBABuffer* const buf = &p->output->u.RGBA;
+    int num_rows;
+    const int start_y = GetAlphaSourceRow(io, &alpha, &num_rows);
+    uint32_t alpha_mask = 0x0f;
 
-//------------------------------------------------------------------------------
-// Simple picture rescaler
-
-// TODO(skal): start a common library for encoder and decoder, and factorize
-// this code in.
-
-#define RFIX 30
-#define MULT(x,y) (((int64_t)(x) * (y) + (1 << (RFIX - 1))) >> RFIX)
-
-static void InitRescaler(WebPRescaler* const wrk,
-                         int src_width, int src_height,
-                         uint8_t* dst,
-                         int dst_width, int dst_height, int dst_stride,
-                         int x_add, int x_sub, int y_add, int y_sub,
-                         int32_t* work) {
-  wrk->x_expand = (src_width < dst_width);
-  wrk->src_width = src_width;
-  wrk->src_height = src_height;
-  wrk->dst_width = dst_width;
-  wrk->dst_height = dst_height;
-  wrk->dst = dst;
-  wrk->dst_stride = dst_stride;
-  // for 'x_expand', we use bilinear interpolation
-  wrk->x_add = wrk->x_expand ? (x_sub - 1) : x_add - x_sub;
-  wrk->x_sub = wrk->x_expand ? (x_add - 1) : x_sub;
-  wrk->y_accum = y_add;
-  wrk->y_add = y_add;
-  wrk->y_sub = y_sub;
-  wrk->fx_scale = (1 << RFIX) / x_sub;
-  wrk->fy_scale = (1 << RFIX) / y_sub;
-  wrk->fxy_scale = wrk->x_expand ?
-      ((int64_t)dst_height << RFIX) / (x_sub * src_height) :
-      ((int64_t)dst_height << RFIX) / (x_add * src_height);
-  wrk->irow = work;
-  wrk->frow = work + dst_width;
-}
-
-static inline void ImportRow(const uint8_t* const src,
-                             WebPRescaler* const wrk) {
-  int x_in = 0;
-  int x_out;
-  int accum = 0;
-  if (!wrk->x_expand) {
-    int sum = 0;
-    for (x_out = 0; x_out < wrk->dst_width; ++x_out) {
-      accum += wrk->x_add;
-      for (; accum > 0; accum -= wrk->x_sub) {
-        sum += src[x_in++];
+    {
+      uint8_t* const base_rgba = buf->rgba + start_y * buf->stride;
+      uint8_t* alpha_dst = base_rgba + 1;
+      for (j = 0; j < num_rows; ++j) {
+        for (i = 0; i < mb_w; ++i) {
+          // Fill in the alpha value (converted to 4 bits).
+          const uint32_t alpha_value = alpha[i] >> 4;
+          alpha_dst[2 * i] = (alpha_dst[2 * i] & 0xf0) | alpha_value;
+          alpha_mask &= alpha_value;
+        }
+        alpha += io->width;
+        alpha_dst += buf->stride;
       }
-      {        // Emit next horizontal pixel.
-        const int32_t base = src[x_in++];
-        const int32_t frac = base * (-accum);
-        wrk->frow[x_out] = (sum + base) * wrk->x_sub - frac;
-        // fresh fractional start for next pixel
-        sum = MULT(frac, wrk->fx_scale);
+      if (alpha_mask != 0x0f && p->output->colorspace == MODE_rgbA_4444) {
+        WebPApplyAlphaMultiply4444(base_rgba, mb_w, num_rows, buf->stride);
       }
     }
-  } else {        // simple bilinear interpolation
-    int left = src[0], right = src[0];
-    for (x_out = 0; x_out < wrk->dst_width; ++x_out) {
-      if (accum < 0) {
-        left = right;
-        right = src[++x_in];
-        accum += wrk->x_add;
-      }
-      wrk->frow[x_out] = right * wrk->x_add + (left - right) * accum;
-      accum -= wrk->x_sub;
-    }
   }
-  // Accumulate the new row's contribution
-  for (x_out = 0; x_out < wrk->dst_width; ++x_out) {
-    wrk->irow[x_out] += wrk->frow[x_out];
-  }
+  return 0;
 }
 
-static void ExportRow(WebPRescaler* const wrk) {
-  int x_out;
-  const int yscale = wrk->fy_scale * (-wrk->y_accum);
-  assert(wrk->y_accum <= 0);
-  for (x_out = 0; x_out < wrk->dst_width; ++x_out) {
-    const int frac = MULT(wrk->frow[x_out], yscale);
-    const int v = MULT(wrk->irow[x_out] - frac, wrk->fxy_scale);
-    wrk->dst[x_out] = (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255;
-    wrk->irow[x_out] = frac;   // new fractional start
-  }
-  wrk->y_accum += wrk->y_add;
-  wrk->dst += wrk->dst_stride;
-}
-
-#undef MULT
-#undef RFIX
-
 //------------------------------------------------------------------------------
 // YUV rescaling (no final RGB conversion needed)
 
 static int Rescale(const uint8_t* src, int src_stride,
                    int new_lines, WebPRescaler* const wrk) {
   int num_lines_out = 0;
-  while (new_lines-- > 0) {    // import new contribution of one source row.
-    ImportRow(src, wrk);
-    src += src_stride;
-    wrk->y_accum -= wrk->y_sub;
-    while (wrk->y_accum <= 0) {      // emit output row(s)
-      ExportRow(wrk);
-      num_lines_out++;
-    }
+  while (new_lines > 0) {    // import new contributions of source rows.
+    const int lines_in = WebPRescalerImport(wrk, new_lines, src, src_stride);
+    src += lines_in * src_stride;
+    new_lines -= lines_in;
+    num_lines_out += WebPRescalerExport(wrk);    // emit output row(s)
   }
   return num_lines_out;
 }
@@ -531,19 +301,14 @@
 }
 
 static int EmitRescaledAlphaYUV(const VP8Io* const io, WebPDecParams* const p) {
-  if (io->a) {
+  if (io->a != NULL) {
     Rescale(io->a, io->width, io->mb_h, &p->scaler_a);
   }
   return 0;
 }
 
-static int IsAlphaMode(WEBP_CSP_MODE mode) {
-  return (mode == MODE_RGBA || mode == MODE_BGRA || mode == MODE_ARGB ||
-          mode == MODE_RGBA_4444 || mode == MODE_YUVA);
-}
-
 static int InitYUVRescaler(const VP8Io* const io, WebPDecParams* const p) {
-  const int has_alpha = IsAlphaMode(p->output->colorspace);
+  const int has_alpha = WebPIsAlphaMode(p->output->colorspace);
   const WebPYUVABuffer* const buf = &p->output->u.YUVA;
   const int out_width  = io->scaled_width;
   const int out_height = io->scaled_height;
@@ -565,26 +330,27 @@
     return 0;   // memory error
   }
   work = (int32_t*)p->memory;
-  InitRescaler(&p->scaler_y, io->mb_w, io->mb_h,
-               buf->y, out_width, out_height, buf->y_stride,
-               io->mb_w, out_width, io->mb_h, out_height,
-               work);
-  InitRescaler(&p->scaler_u, uv_in_width, uv_in_height,
-               buf->u, uv_out_width, uv_out_height, buf->u_stride,
-               uv_in_width, uv_out_width,
-               uv_in_height, uv_out_height,
-               work + work_size);
-  InitRescaler(&p->scaler_v, uv_in_width, uv_in_height,
-               buf->v, uv_out_width, uv_out_height, buf->v_stride,
-               uv_in_width, uv_out_width,
-               uv_in_height, uv_out_height,
-               work + work_size + uv_work_size);
+  WebPRescalerInit(&p->scaler_y, io->mb_w, io->mb_h,
+                   buf->y, out_width, out_height, buf->y_stride, 1,
+                   io->mb_w, out_width, io->mb_h, out_height,
+                   work);
+  WebPRescalerInit(&p->scaler_u, uv_in_width, uv_in_height,
+                   buf->u, uv_out_width, uv_out_height, buf->u_stride, 1,
+                   uv_in_width, uv_out_width,
+                   uv_in_height, uv_out_height,
+                   work + work_size);
+  WebPRescalerInit(&p->scaler_v, uv_in_width, uv_in_height,
+                   buf->v, uv_out_width, uv_out_height, buf->v_stride, 1,
+                   uv_in_width, uv_out_width,
+                   uv_in_height, uv_out_height,
+                   work + work_size + uv_work_size);
   p->emit = EmitRescaledYUV;
+
   if (has_alpha) {
-    InitRescaler(&p->scaler_a, io->mb_w, io->mb_h,
-                 buf->a, out_width, out_height, buf->a_stride,
-                 io->mb_w, out_width, io->mb_h, out_height,
-                 work + work_size + 2 * uv_work_size);
+    WebPRescalerInit(&p->scaler_a, io->mb_w, io->mb_h,
+                     buf->a, out_width, out_height, buf->a_stride, 1,
+                     io->mb_w, out_width, io->mb_h, out_height,
+                     work + work_size + 2 * uv_work_size);
     p->emit_alpha = EmitRescaledAlphaYUV;
   }
   return 1;
@@ -593,37 +359,25 @@
 //------------------------------------------------------------------------------
 // RGBA rescaling
 
-// import new contributions until one row is ready to be output, or all input
-// is consumed.
-static int Import(const uint8_t* src, int src_stride,
-                  int new_lines, WebPRescaler* const wrk) {
-  int num_lines_in = 0;
-  while (num_lines_in < new_lines && wrk->y_accum > 0) {
-    ImportRow(src, wrk);
-    src += src_stride;
-    ++num_lines_in;
-    wrk->y_accum -= wrk->y_sub;
-  }
-  return num_lines_in;
-}
-
 static int ExportRGB(WebPDecParams* const p, int y_pos) {
-  const YUV444Func convert = kYUV444Converters[p->output->colorspace];
+  const WebPYUV444Converter convert =
+      WebPYUV444Converters[p->output->colorspace];
   const WebPRGBABuffer* const buf = &p->output->u.RGBA;
   uint8_t* dst = buf->rgba + (p->last_y + y_pos) * buf->stride;
   int num_lines_out = 0;
   // For RGB rescaling, because of the YUV420, current scan position
   // U/V can be +1/-1 line from the Y one.  Hence the double test.
-  while (p->scaler_y.y_accum <= 0 && p->scaler_u.y_accum <= 0) {
+  while (WebPRescalerHasPendingOutput(&p->scaler_y) &&
+         WebPRescalerHasPendingOutput(&p->scaler_u)) {
     assert(p->last_y + y_pos + num_lines_out < p->output->height);
     assert(p->scaler_u.y_accum == p->scaler_v.y_accum);
-    ExportRow(&p->scaler_y);
-    ExportRow(&p->scaler_u);
-    ExportRow(&p->scaler_v);
+    WebPRescalerExportRow(&p->scaler_y);
+    WebPRescalerExportRow(&p->scaler_u);
+    WebPRescalerExportRow(&p->scaler_v);
     convert(p->scaler_y.dst, p->scaler_u.dst, p->scaler_v.dst,
             dst, p->scaler_y.dst_width);
     dst += buf->stride;
-    num_lines_out++;
+    ++num_lines_out;
   }
   return num_lines_out;
 }
@@ -634,12 +388,15 @@
   int j = 0, uv_j = 0;
   int num_lines_out = 0;
   while (j < mb_h) {
-    const int y_lines_in = Import(io->y + j * io->y_stride, io->y_stride,
-                                  mb_h - j, &p->scaler_y);
-    const int u_lines_in = Import(io->u + uv_j * io->uv_stride, io->uv_stride,
-                                  uv_mb_h - uv_j, &p->scaler_u);
-    const int v_lines_in = Import(io->v + uv_j * io->uv_stride, io->uv_stride,
-                                  uv_mb_h - uv_j, &p->scaler_v);
+    const int y_lines_in =
+        WebPRescalerImport(&p->scaler_y, mb_h - j,
+                           io->y + j * io->y_stride, io->y_stride);
+    const int u_lines_in =
+        WebPRescalerImport(&p->scaler_u, uv_mb_h - uv_j,
+                           io->u + uv_j * io->uv_stride, io->uv_stride);
+    const int v_lines_in =
+        WebPRescalerImport(&p->scaler_v, uv_mb_h - uv_j,
+                           io->v + uv_j * io->uv_stride, io->uv_stride);
     (void)v_lines_in;   // remove a gcc warning
     assert(u_lines_in == v_lines_in);
     j += y_lines_in;
@@ -651,34 +408,80 @@
 
 static int ExportAlpha(WebPDecParams* const p, int y_pos) {
   const WebPRGBABuffer* const buf = &p->output->u.RGBA;
-  uint8_t* dst = buf->rgba + (p->last_y + y_pos) * buf->stride;
+  uint8_t* const base_rgba = buf->rgba + (p->last_y + y_pos) * buf->stride;
+  const WEBP_CSP_MODE colorspace = p->output->colorspace;
+  const int alpha_first =
+      (colorspace == MODE_ARGB || colorspace == MODE_Argb);
+  uint8_t* dst = base_rgba + (alpha_first ? 0 : 3);
   int num_lines_out = 0;
-  while (p->scaler_a.y_accum <= 0) {
+  const int is_premult_alpha = WebPIsPremultipliedMode(colorspace);
+  uint32_t alpha_mask = 0xff;
+  const int width = p->scaler_a.dst_width;
+
+  while (WebPRescalerHasPendingOutput(&p->scaler_a)) {
     int i;
     assert(p->last_y + y_pos + num_lines_out < p->output->height);
-    ExportRow(&p->scaler_a);
-    for (i = 0; i < p->scaler_a.dst_width; ++i) {
-      dst[4 * i + 3] = p->scaler_a.dst[i];
+    WebPRescalerExportRow(&p->scaler_a);
+    for (i = 0; i < width; ++i) {
+      const uint32_t alpha_value = p->scaler_a.dst[i];
+      dst[4 * i] = alpha_value;
+      alpha_mask &= alpha_value;
     }
     dst += buf->stride;
-    num_lines_out++;
+    ++num_lines_out;
+  }
+  if (is_premult_alpha && alpha_mask != 0xff) {
+    WebPApplyAlphaMultiply(base_rgba, alpha_first,
+                           width, num_lines_out, buf->stride);
+  }
+  return num_lines_out;
+}
+
+static int ExportAlphaRGBA4444(WebPDecParams* const p, int y_pos) {
+  const WebPRGBABuffer* const buf = &p->output->u.RGBA;
+  uint8_t* const base_rgba = buf->rgba + (p->last_y + y_pos) * buf->stride;
+  uint8_t* alpha_dst = base_rgba + 1;
+  int num_lines_out = 0;
+  const WEBP_CSP_MODE colorspace = p->output->colorspace;
+  const int width = p->scaler_a.dst_width;
+  const int is_premult_alpha = WebPIsPremultipliedMode(colorspace);
+  uint32_t alpha_mask = 0x0f;
+
+  while (WebPRescalerHasPendingOutput(&p->scaler_a)) {
+    int i;
+    assert(p->last_y + y_pos + num_lines_out < p->output->height);
+    WebPRescalerExportRow(&p->scaler_a);
+    for (i = 0; i < width; ++i) {
+      // Fill in the alpha value (converted to 4 bits).
+      const uint32_t alpha_value = p->scaler_a.dst[i] >> 4;
+      alpha_dst[2 * i] = (alpha_dst[2 * i] & 0xf0) | alpha_value;
+      alpha_mask &= alpha_value;
+    }
+    alpha_dst += buf->stride;
+    ++num_lines_out;
+  }
+  if (is_premult_alpha && alpha_mask != 0x0f) {
+    WebPApplyAlphaMultiply4444(base_rgba, width, num_lines_out, buf->stride);
   }
   return num_lines_out;
 }
 
 static int EmitRescaledAlphaRGB(const VP8Io* const io, WebPDecParams* const p) {
-  if (io->a) {
-    int j = 0, pos = 0;
+  if (io->a != NULL) {
+    WebPRescaler* const scaler = &p->scaler_a;
+    int j = 0;
+    int pos = 0;
     while (j < io->mb_h) {
-      j += Import(io->a + j * io->width, io->width, io->mb_h - j, &p->scaler_a);
-      pos += ExportAlpha(p, pos);
+      j += WebPRescalerImport(scaler, io->mb_h - j,
+                              io->a + j * io->width, io->width);
+      pos += p->emit_alpha_row(p, pos);
     }
   }
   return 0;
 }
 
 static int InitRGBRescaler(const VP8Io* const io, WebPDecParams* const p) {
-  const int has_alpha = IsAlphaMode(p->output->colorspace);
+  const int has_alpha = WebPIsAlphaMode(p->output->colorspace);
   const int out_width  = io->scaled_width;
   const int out_height = io->scaled_height;
   const int uv_in_width  = (io->mb_w + 1) >> 1;
@@ -701,26 +504,32 @@
   }
   work = (int32_t*)p->memory;
   tmp = (uint8_t*)(work + tmp_size1);
-  InitRescaler(&p->scaler_y, io->mb_w, io->mb_h,
-               tmp + 0 * out_width, out_width, out_height, 0,
-               io->mb_w, out_width, io->mb_h, out_height,
-               work + 0 * work_size);
-  InitRescaler(&p->scaler_u, uv_in_width, uv_in_height,
-               tmp + 1 * out_width, out_width, out_height, 0,
-               io->mb_w, 2 * out_width, io->mb_h, 2 * out_height,
-               work + 1 * work_size);
-  InitRescaler(&p->scaler_v, uv_in_width, uv_in_height,
-               tmp + 2 * out_width, out_width, out_height, 0,
-               io->mb_w, 2 * out_width, io->mb_h, 2 * out_height,
-               work + 2 * work_size);
+  WebPRescalerInit(&p->scaler_y, io->mb_w, io->mb_h,
+                   tmp + 0 * out_width, out_width, out_height, 0, 1,
+                   io->mb_w, out_width, io->mb_h, out_height,
+                   work + 0 * work_size);
+  WebPRescalerInit(&p->scaler_u, uv_in_width, uv_in_height,
+                   tmp + 1 * out_width, out_width, out_height, 0, 1,
+                   io->mb_w, 2 * out_width, io->mb_h, 2 * out_height,
+                   work + 1 * work_size);
+  WebPRescalerInit(&p->scaler_v, uv_in_width, uv_in_height,
+                   tmp + 2 * out_width, out_width, out_height, 0, 1,
+                   io->mb_w, 2 * out_width, io->mb_h, 2 * out_height,
+                   work + 2 * work_size);
   p->emit = EmitRescaledRGB;
 
   if (has_alpha) {
-    InitRescaler(&p->scaler_a, io->mb_w, io->mb_h,
-                 tmp + 3 * out_width, out_width, out_height, 0,
-                 io->mb_w, out_width, io->mb_h, out_height,
-                 work + 3 * work_size);
+    WebPRescalerInit(&p->scaler_a, io->mb_w, io->mb_h,
+                     tmp + 3 * out_width, out_width, out_height, 0, 1,
+                     io->mb_w, out_width, io->mb_h, out_height,
+                     work + 3 * work_size);
     p->emit_alpha = EmitRescaledAlphaRGB;
+    if (p->output->colorspace == MODE_RGBA_4444 ||
+        p->output->colorspace == MODE_rgbA_4444) {
+      p->emit_alpha_row = ExportAlphaRGBA4444;
+    } else {
+      p->emit_alpha_row = ExportAlpha;
+    }
   }
   return 1;
 }
@@ -728,67 +537,17 @@
 //------------------------------------------------------------------------------
 // Default custom functions
 
-// Setup crop_xxx fields, mb_w and mb_h
-static int InitFromOptions(const WebPDecoderOptions* const options,
-                           VP8Io* const io) {
-  const int W = io->width;
-  const int H = io->height;
-  int x = 0, y = 0, w = W, h = H;
-
-  // Cropping
-  io->use_cropping = (options != NULL) && (options->use_cropping > 0);
-  if (io->use_cropping) {
-    w = options->crop_width;
-    h = options->crop_height;
-    // TODO(skal): take colorspace into account. Don't assume YUV420.
-    x = options->crop_left & ~1;
-    y = options->crop_top & ~1;
-    if (x < 0 || y < 0 || w <= 0 || h <= 0 || x + w > W || y + h > H) {
-      return 0;  // out of frame boundary error
-    }
-  }
-  io->crop_left   = x;
-  io->crop_top    = y;
-  io->crop_right  = x + w;
-  io->crop_bottom = y + h;
-  io->mb_w = w;
-  io->mb_h = h;
-
-  // Scaling
-  io->use_scaling = (options != NULL) && (options->use_scaling > 0);
-  if (io->use_scaling) {
-    if (options->scaled_width <= 0 || options->scaled_height <= 0) {
-      return 0;
-    }
-    io->scaled_width = options->scaled_width;
-    io->scaled_height = options->scaled_height;
-  }
-
-  // Filter
-  io->bypass_filtering = options && options->bypass_filtering;
-
-  // Fancy upsampler
-#ifdef FANCY_UPSAMPLING
-  io->fancy_upsampling = (options == NULL) || (!options->no_fancy_upsampling);
-#endif
-
-  if (io->use_scaling) {
-    // disable filter (only for large downscaling ratio).
-    io->bypass_filtering = (io->scaled_width < W * 3 / 4) &&
-                           (io->scaled_height < H * 3 / 4);
-    io->fancy_upsampling = 0;
-  }
-  return 1;
-}
-
 static int CustomSetup(VP8Io* io) {
   WebPDecParams* const p = (WebPDecParams*)io->opaque;
-  const int is_rgb = (p->output->colorspace < MODE_YUV);
+  const WEBP_CSP_MODE colorspace = p->output->colorspace;
+  const int is_rgb = WebPIsRGBMode(colorspace);
+  const int is_alpha = WebPIsAlphaMode(colorspace);
 
   p->memory = NULL;
   p->emit = NULL;
   p->emit_alpha = NULL;
-  if (!InitFromOptions(p->options, io)) {
+  p->emit_alpha_row = NULL;
+  if (!WebPIoInitFromOptions(p->options, io, is_alpha ? MODE_YUV : MODE_YUVA)) {
     return 0;
   }
 
@@ -811,18 +570,20 @@
         p->tmp_u = p->tmp_y + io->mb_w;
         p->tmp_v = p->tmp_u + uv_width;
         p->emit = EmitFancyRGB;
-        InitUpsamplers();
+        WebPInitUpsamplers();
       }
 #endif
     } else {
       p->emit = EmitYUV;
     }
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-    if (IsAlphaMode(p->output->colorspace)) {
-      // We need transparency output
-      p->emit_alpha = is_rgb ? EmitAlphaRGB : EmitAlphaYUV;
+    if (is_alpha) {  // need transparency output
+      if (WebPIsPremultipliedMode(colorspace)) WebPInitPremultiply();
+      p->emit_alpha =
+          (colorspace == MODE_RGBA_4444 || colorspace == MODE_rgbA_4444) ?
+              EmitAlphaRGBA4444
+          : is_rgb ? EmitAlphaRGB
+          : EmitAlphaYUV;
     }
-#endif
   }
 
   if (is_rgb) {
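
Note on the alpha-emission hunks above (EmitAlphaRGB, EmitAlphaRGBA4444, ExportAlpha,
ExportAlphaRGBA4444): while copying alpha they AND every value into an alpha_mask
(0xff for 8-bit alpha, 0x0f for the 4444 nibble), so the premultiply pass runs only
when at least one pixel is not fully opaque. The sketch below shows just that skip
logic; fill_alpha_and_premultiply() and apply_premultiply() are hypothetical names,
and the (v * a + 127) / 255 rounding is an assumption for the example, not the
actual WebPApplyAlphaMultiply() implementation.

  #include <stdint.h>
  #include <stddef.h>

  /* Hypothetical stand-in for WebPApplyAlphaMultiply(); the rounding used
   * here is only an assumption made for the sketch. */
  static void apply_premultiply(uint8_t* rgba, size_t num_pixels) {
    size_t i;
    for (i = 0; i < num_pixels; ++i) {
      const uint32_t a = rgba[4 * i + 3];
      rgba[4 * i + 0] = (uint8_t)((rgba[4 * i + 0] * a + 127) / 255);
      rgba[4 * i + 1] = (uint8_t)((rgba[4 * i + 1] * a + 127) / 255);
      rgba[4 * i + 2] = (uint8_t)((rgba[4 * i + 2] * a + 127) / 255);
    }
  }

  /* Copy alpha into the A channel, then premultiply only if needed. */
  static void fill_alpha_and_premultiply(uint8_t* rgba, const uint8_t* alpha,
                                         size_t num_pixels) {
    uint32_t alpha_mask = 0xff;   /* stays 0xff only if every pixel is opaque */
    size_t i;
    for (i = 0; i < num_pixels; ++i) {
      rgba[4 * i + 3] = alpha[i];
      alpha_mask &= alpha[i];
    }
    if (alpha_mask != 0xff) {     /* at least one non-opaque pixel */
      apply_premultiply(rgba, num_pixels);
    }                             /* fully opaque buffers skip the pass */
  }
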
diff --git a/src/dec/layer.c b/src/dec/layer.c
index 357ad21..a3a5bdc 100644
--- a/src/dec/layer.c
+++ b/src/dec/layer.c
@@ -1,4 +1,4 @@
-// Copyright 2011 Google Inc.
+// Copyright 2011 Google Inc. All Rights Reserved.
 //
 // This code is licensed under the same terms as WebM:
 //  Software License Agreement:  http://www.webmproject.org/license/software/
@@ -11,13 +11,14 @@
 
 #include <assert.h>
 #include <stdlib.h>
-#include "vp8i.h"
+
+#include "./vp8i.h"
 
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 
 int VP8DecodeLayer(VP8Decoder* const dec) {
   assert(dec);
diff --git a/src/dec/quant.c b/src/dec/quant.c
index 47edbf5..d54097a 100644
--- a/src/dec/quant.c
+++ b/src/dec/quant.c
@@ -1,4 +1,4 @@
-// Copyright 2010 Google Inc.
+// Copyright 2010 Google Inc. All Rights Reserved.
 //
 // This code is licensed under the same terms as WebM:
 //  Software License Agreement:  http://www.webmproject.org/license/software/
@@ -9,13 +9,13 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include "vp8i.h"
+#include "./vp8i.h"
 
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 
-static inline int clip(int v, int M) {
+static WEBP_INLINE int clip(int v, int M) {
   return v < 0 ? 0 : v > M ? M : v;
 }
 
@@ -58,7 +58,7 @@
   249, 254, 259, 264, 269, 274, 279, 284
 };
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Paragraph 9.6
 
 void VP8ParseQuant(VP8Decoder* const dec) {
@@ -94,8 +94,10 @@
       m->y1_mat_[1] = kAcTable[clip(q + 0,       127)];
 
       m->y2_mat_[0] = kDcTable[clip(q + dqy2_dc, 127)] * 2;
-      // TODO(skal): make it another table?
-      m->y2_mat_[1] = kAcTable[clip(q + dqy2_ac, 127)] * 155 / 100;
+      // For all x in [0..284], x*155/100 is bitwise equal to (x*101581) >> 16.
+      // The smallest precision for that is '(x*6349) >> 12' but 16 is a good
+      // word size.
+      m->y2_mat_[1] = (kAcTable[clip(q + dqy2_ac, 127)] * 101581) >> 16;
       if (m->y2_mat_[1] < 8) m->y2_mat_[1] = 8;
 
       m->uv_mat_[0] = kDcTable[clip(q + dquv_dc, 117)];
@@ -104,7 +106,7 @@
   }
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
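
The quant.c hunk above replaces 'x * 155 / 100' with '(x * 101581) >> 16' and notes
that the two are bitwise equal for every kAcTable value x in [0..284]. A quick
standalone check of that claim (a throwaway sketch, not part of the patch):

  #include <assert.h>
  #include <stdio.h>

  int main(void) {
    int x;
    for (x = 0; x <= 284; ++x) {   /* 284 is the largest kAcTable entry */
      assert(x * 155 / 100 == (x * 101581) >> 16);
    }
    printf("x * 155 / 100 == (x * 101581) >> 16 for all x in [0..284]\n");
    return 0;
  }
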
diff --git a/src/dec/tree.c b/src/dec/tree.c
index ed6caad..82484e4 100644
--- a/src/dec/tree.c
+++ b/src/dec/tree.c
@@ -1,4 +1,4 @@
-// Copyright 2010 Google Inc.
+// Copyright 2010 Google Inc. All Rights Reserved.
 //
 // This code is licensed under the same terms as WebM:
 //  Software License Agreement:  http://www.webmproject.org/license/software/
@@ -59,13 +59,13 @@
 };
 
 static const int8_t kMVRef4[6] = {
-  -LEFT4, 1
-    -ABOVE4, 2
+  -LEFT4, 1,
+    -ABOVE4, 2,
       -ZERO4, -NEW4
 };
 #endif
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Default probabilities
 
 // Inter
@@ -385,7 +385,7 @@
                : VP8GetBit(br, 183) ? TM_PRED : H_PRED;
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Paragraph 13
 
 static const uint8_t
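
The kMVRef4 change in tree.c above is a correctness fix rather than formatting:
without the commas, the old lines parse as arithmetic ('1 - ABOVE4', '2 - ZERO4'),
so the 6-entry table silently gets only four initializers plus two zero-filled
slots. A tiny standalone illustration of the pitfall (the enum values here are
invented for the example; only the comma placement matters):

  #include <stdio.h>
  #include <stdint.h>

  enum { LEFT4 = 1, ABOVE4 = 2, ZERO4 = 3, NEW4 = 4 };

  /* Missing commas: four arithmetic expressions, last two slots zero-filled. */
  static const int8_t broken[6] = {
    -LEFT4, 1
      -ABOVE4, 2
        -ZERO4, -NEW4
  };

  /* With the commas added, all six entries are explicit. */
  static const int8_t fixed[6] = {
    -LEFT4, 1,
      -ABOVE4, 2,
        -ZERO4, -NEW4
  };

  int main(void) {
    int i;
    for (i = 0; i < 6; ++i) {
      printf("broken[%d] = %2d   fixed[%d] = %2d\n", i, broken[i], i, fixed[i]);
    }
    return 0;
  }
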
diff --git a/src/dec/vp8.c b/src/dec/vp8.c
index 1f1ce29..b0ccfa2 100644
--- a/src/dec/vp8.c
+++ b/src/dec/vp8.c
@@ -1,4 +1,4 @@
-// Copyright 2010 Google Inc.
+// Copyright 2010 Google Inc. All Rights Reserved.
 //
 // This code is licensed under the same terms as WebM:
 //  Software License Agreement:  http://www.webmproject.org/license/software/
@@ -10,19 +10,23 @@
 // Author: Skal (pascal.massimino@gmail.com)
 
 #include <stdlib.h>
-#include "vp8i.h"
+
+#include "./vp8i.h"
+#include "./vp8li.h"
+#include "./webpi.h"
+#include "../utils/bit_reader.h"
 
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 
 int WebPGetDecoderVersion(void) {
   return (DEC_MAJ_VERSION << 16) | (DEC_MIN_VERSION << 8) | DEC_REV_VERSION;
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // VP8Decoder
 
 static void SetOk(VP8Decoder* const dec) {
@@ -31,19 +35,22 @@
 }
 
 int VP8InitIoInternal(VP8Io* const io, int version) {
-  if (version != WEBP_DECODER_ABI_VERSION)
+  if (WEBP_ABI_IS_INCOMPATIBLE(version, WEBP_DECODER_ABI_VERSION)) {
     return 0;  // mismatch error
-  if (io) {
+  }
+  if (io != NULL) {
     memset(io, 0, sizeof(*io));
   }
   return 1;
 }
 
 VP8Decoder* VP8New(void) {
-  VP8Decoder* dec = (VP8Decoder*)calloc(1, sizeof(VP8Decoder));
-  if (dec) {
+  VP8Decoder* const dec = (VP8Decoder*)calloc(1, sizeof(*dec));
+  if (dec != NULL) {
     SetOk(dec);
+    WebPWorkerInit(&dec->worker_);
     dec->ready_ = 0;
+    dec->num_parts_ = 1;
   }
   return dec;
 }
@@ -54,36 +61,46 @@
 }
 
 const char* VP8StatusMessage(VP8Decoder* const dec) {
-  if (!dec) return "no object";
+  if (dec == NULL) return "no object";
   if (!dec->error_msg_) return "OK";
   return dec->error_msg_;
 }
 
 void VP8Delete(VP8Decoder* const dec) {
-  if (dec) {
+  if (dec != NULL) {
     VP8Clear(dec);
     free(dec);
   }
 }
 
 int VP8SetError(VP8Decoder* const dec,
-                VP8StatusCode error, const char * const msg) {
-  dec->status_ = error;
-  dec->error_msg_ = msg;
-  dec->ready_ = 0;
+                VP8StatusCode error, const char* const msg) {
+  // TODO This check would be unnecessary if alpha decompression was separated
+  // from VP8ProcessRow/FinishRow. This avoids setting 'dec->status_' to
+  // something other than VP8_STATUS_BITSTREAM_ERROR on alpha decompression
+  // failure.
+  if (dec->status_ == VP8_STATUS_OK) {
+    dec->status_ = error;
+    dec->error_msg_ = msg;
+    dec->ready_ = 0;
+  }
   return 0;
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 
-int VP8GetInfo(const uint8_t* data,
-               uint32_t data_size, uint32_t chunk_size,
-               int* width, int* height, int* has_alpha) {
-  if (data_size < 10) {
+int VP8CheckSignature(const uint8_t* const data, size_t data_size) {
+  return (data_size >= 3 &&
+          data[0] == 0x9d && data[1] == 0x01 && data[2] == 0x2a);
+}
+
+int VP8GetInfo(const uint8_t* data, size_t data_size, size_t chunk_size,
+               int* const width, int* const height) {
+  if (data == NULL || data_size < VP8_FRAME_HEADER_SIZE) {
     return 0;         // not enough data
   }
   // check signature
-  if (data[3] != 0x9d || data[4] != 0x01 || data[5] != 0x2a) {
+  if (!VP8CheckSignature(data + 3, data_size - 3)) {
     return 0;         // Wrong signature.
   } else {
     const uint32_t bits = data[0] | (data[1] << 8) | (data[2] << 16);
@@ -91,14 +108,6 @@
     const int w = ((data[7] << 8) | data[6]) & 0x3fff;
     const int h = ((data[9] << 8) | data[8]) & 0x3fff;
 
-    if (has_alpha) {
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-      if (data_size < 11) return 0;
-      *has_alpha = !!(data[10] & 0x80);    // the colorspace_ bit
-#else
-      *has_alpha = 0;
-#endif
-    }
     if (!key_frame) {   // Not a keyframe.
       return 0;
     }
@@ -124,11 +133,11 @@
   }
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Header parsing
 
 static void ResetSegmentHeader(VP8SegmentHeader* const hdr) {
-  assert(hdr);
+  assert(hdr != NULL);
   hdr->use_segment_ = 0;
   hdr->update_map_ = 0;
   hdr->absolute_delta_ = 1;
@@ -139,8 +148,8 @@
 // Paragraph 9.3
 static int ParseSegmentHeader(VP8BitReader* br,
                               VP8SegmentHeader* hdr, VP8Proba* proba) {
-  assert(br);
-  assert(hdr);
+  assert(br != NULL);
+  assert(hdr != NULL);
   hdr->use_segment_ = VP8Get(br);
   if (hdr->use_segment_) {
     hdr->update_map_ = VP8Get(br);
@@ -176,7 +185,7 @@
 // is returned, and this is an unrecoverable error.
 // If the partitions were positioned ok, VP8_STATUS_OK is returned.
 static VP8StatusCode ParsePartitions(VP8Decoder* const dec,
-                                     const uint8_t* buf, uint32_t size) {
+                                     const uint8_t* buf, size_t size) {
   VP8BitReader* const br = &dec->br_;
   const uint8_t* sz = buf;
   const uint8_t* buf_end = buf + size;
@@ -244,18 +253,15 @@
   return !br->eof_;
 }
 
-static inline uint32_t get_le32(const uint8_t* const data) {
-  return data[0] | (data[1] << 8) | (data[2] << 16) | (data[3] << 24);
-}
-
 // Topmost call
 int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {
-  uint8_t* buf;
-  uint32_t buf_size;
+  const uint8_t* buf;
+  size_t buf_size;
   VP8FrameHeader* frm_hdr;
   VP8PictureHeader* pic_hdr;
   VP8BitReader* br;
   VP8StatusCode status;
+  WebPHeaderStructure headers;
 
   if (dec == NULL) {
     return 0;
@@ -266,41 +272,35 @@
                        "null VP8Io passed to VP8GetHeaders()");
   }
 
-  buf = (uint8_t*)io->data;
-  buf_size = io->data_size;
-  if (buf == NULL || buf_size <= 4) {
-    return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA,
-                       "Not enough data to parse frame header");
+  // Process Pre-VP8 chunks.
+  headers.data = io->data;
+  headers.data_size = io->data_size;
+  status = WebPParseHeaders(&headers);
+  if (status != VP8_STATUS_OK) {
+    return VP8SetError(dec, status, "Incorrect/incomplete header.");
+  }
+  if (headers.is_lossless) {
+    return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
+                       "Unexpected lossless format encountered.");
   }
 
-  // Skip over valid RIFF headers
-  if (!memcmp(buf, "RIFF", 4)) {
-    uint32_t riff_size;
-    uint32_t chunk_size;
-    if (buf_size < 20 + 4) {
-      return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA,
-                         "RIFF: Truncated header.");
-    }
-    if (memcmp(buf + 8, "WEBP", 4)) {   // wrong image file signature
-      return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
-                         "RIFF: WEBP signature not found.");
-    }
-    riff_size = get_le32(buf + 4);
-    if (riff_size < 12) {
-      return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA,
-                         "RIFF: Truncated header.");
-    }
-    if (memcmp(buf + 12, "VP8 ", 4)) {
-      return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
-                         "RIFF: Invalid compression format.");
-    }
-    chunk_size = get_le32(buf + 16);
-    if (chunk_size > riff_size - 12) {
-      return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
-                         "RIFF: Inconsistent size information.");
-    }
-    buf += 20;
-    buf_size -= 20;
+  if (dec->alpha_data_ == NULL) {
+    assert(dec->alpha_data_size_ == 0);
+    // We have NOT set alpha data yet. Set it now.
+    // (This is to ensure that dec->alpha_data_ is NOT reset to NULL if
+    // WebPParseHeaders() is called more than once, as in incremental decoding
+    // case.)
+    dec->alpha_data_ = headers.alpha_data;
+    dec->alpha_data_size_ = headers.alpha_data_size;
+  }
+
+  // Process the VP8 frame header.
+  buf = headers.data + headers.offset;
+  buf_size = headers.data_size - headers.offset;
+  assert(headers.data_size >= headers.offset);  // WebPParseHeaders' guarantee
+  if (buf_size < 4) {
+    return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA,
+                       "Truncated header.");
   }
 
   // Paragraph 9.1
@@ -328,7 +328,7 @@
       return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA,
                          "cannot parse picture header");
     }
-    if (buf[0] != 0x9d || buf[1] != 0x01 || buf[2] != 0x2a) {
+    if (!VP8CheckSignature(buf, buf_size)) {
       return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
                          "Bad code word");
     }
@@ -365,9 +365,6 @@
                        "bad partition length");
   }
 
-  dec->alpha_data_ = NULL;
-  dec->alpha_data_size_ = 0;
-
   br = &dec->br_;
   VP8InitBitReader(br, buf, buf + frm_hdr->partition_length_);
   buf += frm_hdr->partition_length_;
@@ -436,22 +433,14 @@
   if (dec->pic_hdr_.colorspace_) {
     const size_t kTrailerSize = 8;
     const uint8_t kTrailerMarker = 0x01;
-    uint8_t* const ext_buf = buf - kTrailerSize;
+    const uint8_t* ext_buf = buf - kTrailerSize;
     size_t size;
 
     if (frm_hdr->partition_length_ < kTrailerSize ||
         ext_buf[kTrailerSize - 1] != kTrailerMarker) {
- Error:
       return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
                          "RIFF: Inconsistent extra information.");
     }
-    // Alpha
-    size = (ext_buf[4] << 0) | (ext_buf[5] << 8) | (ext_buf[6] << 16);
-    if (frm_hdr->partition_length_ < size + kTrailerSize) {
-      goto Error;
-    }
-    dec->alpha_data_ = (size > 0) ? ext_buf - size : NULL;
-    dec->alpha_data_size_ = size;
 
     // Layer
     size = (ext_buf[0] << 0) | (ext_buf[1] << 8) | (ext_buf[2] << 16);
@@ -466,7 +455,7 @@
   return 1;
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Residual decoding (Paragraph 13.2 / 13.3)
 
 static const uint8_t kBands[16 + 1] = {
@@ -489,8 +478,9 @@
 // Returns the position of the last non-zero coeff plus one
 // (and 0 if there's no coeff at all)
 static int GetCoeffs(VP8BitReader* const br, ProbaArray prob,
-                     int ctx, const uint16_t dq[2], int n, int16_t* out) {
-  const uint8_t* p = prob[kBands[n]][ctx];
+                     int ctx, const quant_t dq, int n, int16_t* out) {
+  // n is either 0 or 1 here. kBands[n] is not necessary for extracting '*p'.
+  const uint8_t* p = prob[n][ctx];
   if (!VP8GetBit(br, p[0])) {   // first EOB is more a 'CBP' bit.
     return 0;
   }
@@ -579,6 +569,7 @@
   uint32_t non_zero_dc = 0;
   int x, y, ch;
 
+  nz_dc.i32 = nz_ac.i32 = 0;
   memset(dst, 0, 384 * sizeof(*dst));
   if (!dec->is_i4x4_) {    // parse DC
     int16_t dc[16] = { 0 };
@@ -645,7 +636,7 @@
 }
 #undef PACK
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Main loop
 
 int VP8DecodeMB(VP8Decoder* const dec, VP8BitReader* const token_br) {
@@ -682,15 +673,21 @@
   return (!token_br->eof_);
 }
 
+void VP8InitScanline(VP8Decoder* const dec) {
+  VP8MB* const left = dec->mb_info_ - 1;
+  left->nz_ = 0;
+  left->dc_nz_ = 0;
+  memset(dec->intra_l_, B_DC_PRED, sizeof(dec->intra_l_));
+  dec->filter_row_ =
+    (dec->filter_type_ > 0) &&
+    (dec->mb_y_ >= dec->tl_mb_y_) && (dec->mb_y_ <= dec->br_mb_y_);
+}
+
 static int ParseFrame(VP8Decoder* const dec, VP8Io* io) {
   for (dec->mb_y_ = 0; dec->mb_y_ < dec->br_mb_y_; ++dec->mb_y_) {
-    VP8MB* const left = dec->mb_info_ - 1;
     VP8BitReader* const token_br =
         &dec->parts_[dec->mb_y_ & (dec->num_parts_ - 1)];
-    left->nz_ = 0;
-    left->dc_nz_ = 0;
-    memset(dec->intra_l_, B_DC_PRED, sizeof(dec->intra_l_));
-
+    VP8InitScanline(dec);
     for (dec->mb_x_ = 0; dec->mb_x_ < dec->mb_w_;  dec->mb_x_++) {
       if (!VP8DecodeMB(dec, token_br)) {
         return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA,
@@ -701,13 +698,13 @@
       // Store data and save block's filtering params
       VP8StoreBlock(dec);
     }
-    if (dec->filter_type_ > 0) {
-      VP8FilterRow(dec);
-    }
-    if (!VP8FinishRow(dec, io)) {
+    if (!VP8ProcessRow(dec, io)) {
       return VP8SetError(dec, VP8_STATUS_USER_ABORT, "Output aborted.");
     }
   }
+  if (dec->use_threads_ && !WebPWorkerSync(&dec->worker_)) {
+    return 0;
+  }
 
   // Finish
 #ifndef ONLY_KEYFRAME_CODE
@@ -729,6 +726,7 @@
 
 // Main entry point
 int VP8Decode(VP8Decoder* const dec, VP8Io* const io) {
+  int ok = 0;
   if (dec == NULL) {
     return 0;
   }
@@ -744,38 +742,35 @@
   }
   assert(dec->ready_);
 
-  // Will allocate memory and prepare everything.
-  if (!VP8InitFrame(dec, io)) {
-    VP8Clear(dec);
-    return 0;
+  // Finish setting up the decoding parameter. Will call io->setup().
+  ok = (VP8EnterCritical(dec, io) == VP8_STATUS_OK);
+  if (ok) {   // good to go.
+    // Will allocate memory and prepare everything.
+    if (ok) ok = VP8InitFrame(dec, io);
+
+    // Main decoding loop
+    if (ok) ok = ParseFrame(dec, io);
+
+    // Exit.
+    ok &= VP8ExitCritical(dec, io);
   }
 
-  // Finish setting up the decoding parameter
-  if (VP8FinishFrameSetup(dec, io) != VP8_STATUS_OK) {
+  if (!ok) {
     VP8Clear(dec);
     return 0;
   }
 
-  // Main decoding loop
-  {
-    const int ret = ParseFrame(dec, io);
-    if (io->teardown) {
-      io->teardown(io);
-    }
-    if (!ret) {
-      VP8Clear(dec);
-      return 0;
-    }
-  }
-
   dec->ready_ = 0;
-  return 1;
+  return ok;
 }
 
 void VP8Clear(VP8Decoder* const dec) {
   if (dec == NULL) {
     return;
   }
+  if (dec->use_threads_) {
+    WebPWorkerEnd(&dec->worker_);
+  }
   if (dec->mem_) {
     free(dec->mem_);
   }
@@ -785,7 +780,7 @@
   dec->ready_ = 0;
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
diff --git a/src/dec/vp8i.h b/src/dec/vp8i.h
index 587b1cb..4382edf 100644
--- a/src/dec/vp8i.h
+++ b/src/dec/vp8i.h
@@ -1,4 +1,4 @@
-// Copyright 2010 Google Inc.
+// Copyright 2010 Google Inc. All Rights Reserved.
 //
 // This code is licensed under the same terms as WebM:
 //  Software License Agreement:  http://www.webmproject.org/license/software/
@@ -13,19 +13,22 @@
 #define WEBP_DEC_VP8I_H_
 
 #include <string.h>     // for memcpy()
-#include "bits.h"
+#include "./vp8li.h"
+#include "../utils/bit_reader.h"
+#include "../utils/thread.h"
+#include "../dsp/dsp.h"
 
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Various defines and enums
 
 // version numbers
 #define DEC_MAJ_VERSION 0
-#define DEC_MIN_VERSION 1
-#define DEC_REV_VERSION 2
+#define DEC_MIN_VERSION 2
+#define DEC_REV_VERSION 0
 
 #define ONLY_KEYFRAME_CODE      // to remove any code related to P-Frames
 
@@ -95,7 +98,7 @@
 #define U_OFF    (Y_OFF + BPS * 16 + BPS)
 #define V_OFF    (U_OFF + 16)
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Headers
 
 typedef struct {
@@ -144,27 +147,37 @@
   int mode_lf_delta_[NUM_MODE_LF_DELTAS];
 } VP8FilterHeader;
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Informations about the macroblocks.
 
-typedef struct {
-  // block type
-  uint8_t skip_:1;
-  // filter specs
-  uint8_t f_level_:6;      // filter strength: 0..63
-  uint8_t f_ilevel_:6;     // inner limit: 1..63
-  uint8_t f_inner_:1;      // do inner filtering?
-  // cbp
-  uint8_t nz_;        // non-zero AC/DC coeffs
-  uint8_t dc_nz_;     // non-zero DC coeffs
+typedef struct {  // filter specs
+  unsigned int f_level_:6;      // filter strength: 0..63
+  unsigned int f_ilevel_:6;     // inner limit: 1..63
+  unsigned int f_inner_:1;      // do inner filtering?
+} VP8FInfo;
+
+typedef struct {  // used for syntax-parsing
+  unsigned int nz_;          // non-zero AC/DC coeffs
+  unsigned int dc_nz_:1;     // non-zero DC coeffs
+  unsigned int skip_:1;      // block type
 } VP8MB;
 
 // Dequantization matrices
+typedef int quant_t[2];      // [DC / AC].  Can be 'uint16_t[2]' too (~slower).
 typedef struct {
-  uint16_t y1_mat_[2], y2_mat_[2], uv_mat_[2];    // [DC / AC]
+  quant_t y1_mat_, y2_mat_, uv_mat_;
 } VP8QuantMatrix;
 
-//-----------------------------------------------------------------------------
+// Persistent information needed by the parallel processing
+typedef struct {
+  int id_;            // cache row to process (in [0..2])
+  int mb_y_;          // macroblock position of the row
+  int filter_row_;    // true if row-filtering is needed
+  VP8FInfo* f_info_;  // filter strengths
+  VP8Io io_;          // copy of the VP8Io to pass to put()
+} VP8ThreadContext;
+
+//------------------------------------------------------------------------------
 // VP8Decoder: the main opaque structure handed over to user
 
 struct VP8Decoder {
@@ -181,6 +194,13 @@
   VP8FilterHeader  filter_hdr_;
   VP8SegmentHeader segment_hdr_;
 
+  // Worker
+  WebPWorker worker_;
+  int use_threads_;    // use multi-thread
+  int cache_id_;       // current cache row
+  int num_caches_;     // number of cached rows of 16 pixels (1, 2 or 3)
+  VP8ThreadContext thread_ctx_;  // Thread context
+
   // dimension, in macroblock units.
   int mb_w_, mb_h_;
 
@@ -219,7 +239,8 @@
   uint8_t* y_t_;         // top luma samples: 16 * mb_w_
   uint8_t* u_t_, *v_t_;  // top u/v samples: 8 * mb_w_ each
 
-  VP8MB* mb_info_;       // contextual macroblock infos (mb_w_ + 1)
+  VP8MB* mb_info_;       // contextual macroblock info (mb_w_ + 1)
+  VP8FInfo* f_info_;     // filter strength info
   uint8_t* yuv_b_;       // main block for Y/U/V (size = YUV_SIZE)
   int16_t* coeffs_;      // 384 coeffs = (16+8+8) * 4*4
 
@@ -231,7 +252,7 @@
 
   // main memory chunk for the above data. Persistent.
   void* mem_;
-  int mem_size_;
+  size_t mem_size_;
 
   // Per macroblock non-persistent infos.
   int mb_x_, mb_y_;       // current position, in macroblock units
@@ -249,31 +270,25 @@
 
   // Filtering side-info
   int filter_type_;                         // 0=off, 1=simple, 2=complex
+  int filter_row_;                          // per-row flag
   uint8_t filter_levels_[NUM_MB_SEGMENTS];  // precalculated per-segment
 
   // extensions
   const uint8_t* alpha_data_;   // compressed alpha data (if present)
   size_t alpha_data_size_;
-  uint8_t* alpha_plane_;        // output
+  uint8_t* alpha_plane_;        // output. Persistent, contains the whole data.
 
   int layer_colorspace_;
   const uint8_t* layer_data_;   // compressed layer data (if present)
   size_t layer_data_size_;
 };
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // internal functions. Not public.
 
 // in vp8.c
 int VP8SetError(VP8Decoder* const dec,
-                VP8StatusCode error, const char * const msg);
-// Validates the VP8 data-header and retrieve basic header information viz width
-// and height. Returns 0 in case of formatting error. *width/*height/*has_alpha
-// can be passed NULL.
-int VP8GetInfo(const uint8_t* data,
-               uint32_t data_size,    // data available so far
-               uint32_t chunk_size,   // total data size expect in the chunk
-               int *width, int *height, int *has_alpha);
+                VP8StatusCode error, const char* const msg);
 
 // in tree.c
 void VP8ResetProba(VP8Proba* const proba);
@@ -288,13 +303,19 @@
 // Predict a block and add residual
 void VP8ReconstructBlock(VP8Decoder* const dec);
 // Call io->setup() and finish setting up scan parameters.
-VP8StatusCode VP8FinishFrameSetup(VP8Decoder* const dec, VP8Io* const io);
-// Filter the decoded macroblock row (if needed)
-void VP8FilterRow(const VP8Decoder* const dec);
+// After this call returns, one must always call VP8ExitCritical() with the
+// same parameters. Both functions should be used in pair. Returns VP8_STATUS_OK
+// if ok, otherwise sets and returns the error status on *dec.
+VP8StatusCode VP8EnterCritical(VP8Decoder* const dec, VP8Io* const io);
+// Must always be called in pair with VP8EnterCritical().
+// Returns false in case of error.
+int VP8ExitCritical(VP8Decoder* const dec, VP8Io* const io);
+// Process the last decoded row (filtering + output)
+int VP8ProcessRow(VP8Decoder* const dec, VP8Io* const io);
 // Store a block, along with filtering params
 void VP8StoreBlock(VP8Decoder* const dec);
-// Finalize and transmit a complete row. Return false in case of user-abort.
-int VP8FinishRow(VP8Decoder* const dec, VP8Io* const io);
+// To be called at the start of a new scanline, to initialize predictors.
+void VP8InitScanline(VP8Decoder* const dec);
 // Decode one macroblock. Returns false if there is not enough data.
 int VP8DecodeMB(VP8Decoder* const dec, VP8BitReader* const token_br);
 
@@ -305,62 +326,10 @@
 // in layer.c
 int VP8DecodeLayer(VP8Decoder* const dec);
 
-// in dsp.c
-typedef void (*VP8Idct)(const int16_t* coeffs, uint8_t* dst);
-// when doing two transforms, coeffs is actually int16_t[2][16].
-typedef void (*VP8Idct2)(const int16_t* coeffs, uint8_t* dst, int do_two);
-extern VP8Idct2 VP8Transform;
-extern VP8Idct VP8TransformUV;
-extern VP8Idct VP8TransformDC;
-extern VP8Idct VP8TransformDCUV;
-extern void (*VP8TransformWHT)(const int16_t* in, int16_t* out);
-
-// *dst is the destination block, with stride BPS. Boundary samples are
-// assumed accessible when needed.
-typedef void (*VP8PredFunc)(uint8_t* dst);
-extern VP8PredFunc VP8PredLuma16[NUM_B_DC_MODES];
-extern VP8PredFunc VP8PredChroma8[NUM_B_DC_MODES];
-extern VP8PredFunc VP8PredLuma4[NUM_BMODES];
-
-void VP8DspInit(void);        // must be called before anything using the above
-void VP8DspInitTables(void);  // needs to be called no matter what.
-
-// simple filter (only for luma)
-typedef void (*VP8SimpleFilterFunc)(uint8_t* p, int stride, int thresh);
-extern VP8SimpleFilterFunc VP8SimpleVFilter16;
-extern VP8SimpleFilterFunc VP8SimpleHFilter16;
-extern VP8SimpleFilterFunc VP8SimpleVFilter16i;  // filter 3 inner edges
-extern VP8SimpleFilterFunc VP8SimpleHFilter16i;
-
-// regular filter (on both macroblock edges and inner edges)
-typedef void (*VP8LumaFilterFunc)(uint8_t* luma, int stride,
-                                  int thresh, int ithresh, int hev_t);
-typedef void (*VP8ChromaFilterFunc)(uint8_t* u, uint8_t* v, int stride,
-                                    int thresh, int ithresh, int hev_t);
-// on outter edge
-extern VP8LumaFilterFunc VP8VFilter16;
-extern VP8LumaFilterFunc VP8HFilter16;
-extern VP8ChromaFilterFunc VP8VFilter8;
-extern VP8ChromaFilterFunc VP8HFilter8;
-
-// on inner edge
-extern VP8LumaFilterFunc VP8VFilter16i;   // filtering 3 inner edges altogether
-extern VP8LumaFilterFunc VP8HFilter16i;
-extern VP8ChromaFilterFunc VP8VFilter8i;  // filtering u and v altogether
-extern VP8ChromaFilterFunc VP8HFilter8i;
-
-typedef enum {
-  kSSE2,
-  kSSE3
-} CPUFeature;
-// returns true if the CPU supports the feature.
-typedef int (*VP8CPUInfo)(CPUFeature feature);
-extern VP8CPUInfo VP8DecGetCPUInfo;
-
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
 
-#endif  // WEBP_DEC_VP8I_H_
+#endif  /* WEBP_DEC_VP8I_H_ */
diff --git a/src/dec/vp8l.c b/src/dec/vp8l.c
new file mode 100644
index 0000000..897e439
--- /dev/null
+++ b/src/dec/vp8l.c
@@ -0,0 +1,1200 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// main entry for the decoder
+//
+// Authors: Vikas Arora (vikaas.arora@gmail.com)
+//          Jyrki Alakuijala (jyrki@google.com)
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "./vp8li.h"
+#include "../dsp/lossless.h"
+#include "../dsp/yuv.h"
+#include "../utils/huffman.h"
+#include "../utils/utils.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+#define NUM_ARGB_CACHE_ROWS          16
+
+static const int kCodeLengthLiterals = 16;
+static const int kCodeLengthRepeatCode = 16;
+static const int kCodeLengthExtraBits[3] = { 2, 3, 7 };
+static const int kCodeLengthRepeatOffsets[3] = { 3, 3, 11 };
+
+// -----------------------------------------------------------------------------
+//  Five Huffman codes are used for each meta code:
+//  1. green + length prefix codes + color cache codes,
+//  2. alpha,
+//  3. red,
+//  4. blue, and,
+//  5. distance prefix codes.
+typedef enum {
+  GREEN = 0,
+  RED   = 1,
+  BLUE  = 2,
+  ALPHA = 3,
+  DIST  = 4
+} HuffIndex;
+
+static const uint16_t kAlphabetSize[HUFFMAN_CODES_PER_META_CODE] = {
+  NUM_LITERAL_CODES + NUM_LENGTH_CODES,
+  NUM_LITERAL_CODES, NUM_LITERAL_CODES, NUM_LITERAL_CODES,
+  NUM_DISTANCE_CODES
+};
+
+
+#define NUM_CODE_LENGTH_CODES       19
+static const uint8_t kCodeLengthCodeOrder[NUM_CODE_LENGTH_CODES] = {
+  17, 18, 0, 1, 2, 3, 4, 5, 16, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+};
+
+#define CODE_TO_PLANE_CODES        120
+static const uint8_t code_to_plane_lut[CODE_TO_PLANE_CODES] = {
+   0x18, 0x07, 0x17, 0x19, 0x28, 0x06, 0x27, 0x29, 0x16, 0x1a,
+   0x26, 0x2a, 0x38, 0x05, 0x37, 0x39, 0x15, 0x1b, 0x36, 0x3a,
+   0x25, 0x2b, 0x48, 0x04, 0x47, 0x49, 0x14, 0x1c, 0x35, 0x3b,
+   0x46, 0x4a, 0x24, 0x2c, 0x58, 0x45, 0x4b, 0x34, 0x3c, 0x03,
+   0x57, 0x59, 0x13, 0x1d, 0x56, 0x5a, 0x23, 0x2d, 0x44, 0x4c,
+   0x55, 0x5b, 0x33, 0x3d, 0x68, 0x02, 0x67, 0x69, 0x12, 0x1e,
+   0x66, 0x6a, 0x22, 0x2e, 0x54, 0x5c, 0x43, 0x4d, 0x65, 0x6b,
+   0x32, 0x3e, 0x78, 0x01, 0x77, 0x79, 0x53, 0x5d, 0x11, 0x1f,
+   0x64, 0x6c, 0x42, 0x4e, 0x76, 0x7a, 0x21, 0x2f, 0x75, 0x7b,
+   0x31, 0x3f, 0x63, 0x6d, 0x52, 0x5e, 0x00, 0x74, 0x7c, 0x41,
+   0x4f, 0x10, 0x20, 0x62, 0x6e, 0x30, 0x73, 0x7d, 0x51, 0x5f,
+   0x40, 0x72, 0x7e, 0x61, 0x6f, 0x50, 0x71, 0x7f, 0x60, 0x70
+};
+
+static int DecodeImageStream(int xsize, int ysize,
+                             int is_level0,
+                             VP8LDecoder* const dec,
+                             uint32_t** const decoded_data);
+
+//------------------------------------------------------------------------------
+
+int VP8LCheckSignature(const uint8_t* const data, size_t size) {
+  return (size >= 1) && (data[0] == VP8L_MAGIC_BYTE);
+}
+
+static int ReadImageInfo(VP8LBitReader* const br,
+                         int* const width, int* const height,
+                         int* const has_alpha) {
+  const uint8_t signature = VP8LReadBits(br, 8);
+  if (!VP8LCheckSignature(&signature, 1)) {
+    return 0;
+  }
+  *width = VP8LReadBits(br, VP8L_IMAGE_SIZE_BITS) + 1;
+  *height = VP8LReadBits(br, VP8L_IMAGE_SIZE_BITS) + 1;
+  *has_alpha = VP8LReadBits(br, 1);
+  VP8LReadBits(br, VP8L_VERSION_BITS);  // Read/ignore the version number.
+  return 1;
+}
+
+int VP8LGetInfo(const uint8_t* data, size_t data_size,
+                int* const width, int* const height, int* const has_alpha) {
+  if (data == NULL || data_size < VP8L_FRAME_HEADER_SIZE) {
+    return 0;         // not enough data
+  } else {
+    int w, h, a;
+    VP8LBitReader br;
+    VP8LInitBitReader(&br, data, data_size);
+    if (!ReadImageInfo(&br, &w, &h, &a)) {
+      return 0;
+    }
+    if (width != NULL) *width = w;
+    if (height != NULL) *height = h;
+    if (has_alpha != NULL) *has_alpha = a;
+    return 1;
+  }
+}
+
+//------------------------------------------------------------------------------
+
+static WEBP_INLINE int GetCopyDistance(int distance_symbol,
+                                       VP8LBitReader* const br) {
+  int extra_bits, offset;
+  if (distance_symbol < 4) {
+    return distance_symbol + 1;
+  }
+  extra_bits = (distance_symbol - 2) >> 1;
+  offset = (2 + (distance_symbol & 1)) << extra_bits;
+  return offset + VP8LReadBits(br, extra_bits) + 1;
+}
+
+static WEBP_INLINE int GetCopyLength(int length_symbol,
+                                     VP8LBitReader* const br) {
+  // Length and distance prefixes are encoded the same way.
+  return GetCopyDistance(length_symbol, br);
+}
+
+static WEBP_INLINE int PlaneCodeToDistance(int xsize, int plane_code) {
+  if (plane_code > CODE_TO_PLANE_CODES) {
+    return plane_code - CODE_TO_PLANE_CODES;
+  } else {
+    const int dist_code = code_to_plane_lut[plane_code - 1];
+    const int yoffset = dist_code >> 4;
+    const int xoffset = 8 - (dist_code & 0xf);
+    const int dist = yoffset * xsize + xoffset;
+    return (dist >= 1) ? dist : 1;
+  }
+}
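
For reference, a small standalone sketch (not part of the patch) of the distance
arithmetic above: the prefix-symbol decoding of GetCopyDistance() with the
extra-bit value passed in directly, and the 2D mapping of PlaneCodeToDistance()
applied to the first two code_to_plane_lut entries (0x18 and 0x07).

  #include <assert.h>
  #include <stdio.h>

  /* Decoded distance for a prefix symbol plus its extra-bit value (mirrors
   * GetCopyDistance() above, with the extra bits supplied directly instead
   * of being read from the bit-stream). */
  static int CopyDistance(int distance_symbol, int extra_value) {
    int extra_bits, offset;
    if (distance_symbol < 4) return distance_symbol + 1;
    extra_bits = (distance_symbol - 2) >> 1;
    offset = (2 + (distance_symbol & 1)) << extra_bits;
    return offset + extra_value + 1;
  }

  /* Converts a code_to_plane_lut entry into a pixel distance for a given
   * row width (mirrors the 'else' branch of PlaneCodeToDistance()). */
  static int PlaneEntryToDistance(int xsize, int dist_code) {
    const int yoffset = dist_code >> 4;
    const int xoffset = 8 - (dist_code & 0xf);
    const int dist = yoffset * xsize + xoffset;
    return (dist >= 1) ? dist : 1;
  }

  int main(void) {
    /* Symbol 4 spans distances 5..6, symbol 5 spans 7..8, etc. */
    assert(CopyDistance(4, 0) == 5 && CopyDistance(4, 1) == 6);
    assert(CopyDistance(5, 0) == 7 && CopyDistance(5, 1) == 8);
    /* Plane code 1 -> lut[0] = 0x18: one row up, same column. */
    assert(PlaneEntryToDistance(100, 0x18) == 100);
    /* Plane code 2 -> lut[1] = 0x07: same row, one pixel back. */
    assert(PlaneEntryToDistance(100, 0x07) == 1);
    printf("distance mapping checks passed\n");
    return 0;
  }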
+
+//------------------------------------------------------------------------------
+// Decodes the next Huffman code from the bit-stream.
+// VP8LFillBitWindow(br) needs to be called at least once every second call
+// to ReadSymbolUnsafe().
+static int ReadSymbolUnsafe(const HuffmanTree* tree, VP8LBitReader* const br) {
+  const HuffmanTreeNode* node = tree->root_;
+  assert(node != NULL);
+  while (!HuffmanTreeNodeIsLeaf(node)) {
+    node = HuffmanTreeNextNode(node, VP8LReadOneBitUnsafe(br));
+  }
+  return node->symbol_;
+}
+
+static WEBP_INLINE int ReadSymbol(const HuffmanTree* tree,
+                                  VP8LBitReader* const br) {
+  const int read_safe = (br->pos_ + 8 > br->len_);
+  if (!read_safe) {
+    return ReadSymbolUnsafe(tree, br);
+  } else {
+    const HuffmanTreeNode* node = tree->root_;
+    assert(node != NULL);
+    while (!HuffmanTreeNodeIsLeaf(node)) {
+      node = HuffmanTreeNextNode(node, VP8LReadOneBit(br));
+    }
+    return node->symbol_;
+  }
+}
+
+static int ReadHuffmanCodeLengths(
+    VP8LDecoder* const dec, const int* const code_length_code_lengths,
+    int num_symbols, int* const code_lengths) {
+  int ok = 0;
+  VP8LBitReader* const br = &dec->br_;
+  int symbol;
+  int max_symbol;
+  int prev_code_len = DEFAULT_CODE_LENGTH;
+  HuffmanTree tree;
+
+  if (!HuffmanTreeBuildImplicit(&tree, code_length_code_lengths,
+                                NUM_CODE_LENGTH_CODES)) {
+    dec->status_ = VP8_STATUS_BITSTREAM_ERROR;
+    return 0;
+  }
+
+  if (VP8LReadBits(br, 1)) {    // use length
+    const int length_nbits = 2 + 2 * VP8LReadBits(br, 3);
+    max_symbol = 2 + VP8LReadBits(br, length_nbits);
+    if (max_symbol > num_symbols) {
+      dec->status_ = VP8_STATUS_BITSTREAM_ERROR;
+      goto End;
+    }
+  } else {
+    max_symbol = num_symbols;
+  }
+
+  symbol = 0;
+  while (symbol < num_symbols) {
+    int code_len;
+    if (max_symbol-- == 0) break;
+    VP8LFillBitWindow(br);
+    code_len = ReadSymbol(&tree, br);
+    if (code_len < kCodeLengthLiterals) {
+      code_lengths[symbol++] = code_len;
+      if (code_len != 0) prev_code_len = code_len;
+    } else {
+      const int use_prev = (code_len == kCodeLengthRepeatCode);
+      const int slot = code_len - kCodeLengthLiterals;
+      const int extra_bits = kCodeLengthExtraBits[slot];
+      const int repeat_offset = kCodeLengthRepeatOffsets[slot];
+      int repeat = VP8LReadBits(br, extra_bits) + repeat_offset;
+      if (symbol + repeat > num_symbols) {
+        dec->status_ = VP8_STATUS_BITSTREAM_ERROR;
+        goto End;
+      } else {
+        const int length = use_prev ? prev_code_len : 0;
+        while (repeat-- > 0) code_lengths[symbol++] = length;
+      }
+    }
+  }
+  ok = 1;
+
+ End:
+  HuffmanTreeRelease(&tree);
+  return ok;
+}
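
From the kCodeLengthExtraBits/kCodeLengthRepeatOffsets constants, the repeat
codes 16..18 handled above expand as follows: 16 repeats the previous non-zero
length 3..6 times, 17 emits 3..10 zeros and 18 emits 11..138 zeros. A minimal
sketch (not part of the patch), with the extra-bit values supplied directly
instead of being read from the bit-stream:

  #include <assert.h>
  #include <stdio.h>

  /* Expands one repeat code (16..18) into actual code lengths, following the
   * constants above. 'extra_value' stands in for the bits read by the real
   * decoder. Returns the updated write position. */
  static int ExpandRepeat(int code, int extra_value, int prev_len,
                          int* out, int pos, int max) {
    static const int kExtraBits[3] = { 2, 3, 7 };
    static const int kOffsets[3] = { 3, 3, 11 };
    const int slot = code - 16;
    const int repeat = extra_value + kOffsets[slot];
    const int length = (code == 16) ? prev_len : 0;
    int i;
    assert(extra_value < (1 << kExtraBits[slot]));
    assert(pos + repeat <= max);
    for (i = 0; i < repeat; ++i) out[pos + i] = length;
    return pos + repeat;
  }

  int main(void) {
    int lengths[32] = { 0 };
    int pos = 0;
    lengths[pos++] = 5;                              /* literal length 5   */
    pos = ExpandRepeat(16, 1, 5, lengths, pos, 32);  /* repeat '5' 4 times */
    pos = ExpandRepeat(17, 0, 5, lengths, pos, 32);  /* three zeros        */
    assert(pos == 8);
    assert(lengths[4] == 5 && lengths[5] == 0);
    printf("expanded %d code lengths\n", pos);
    return 0;
  }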
+
+static int ReadHuffmanCode(int alphabet_size, VP8LDecoder* const dec,
+                           HuffmanTree* const tree) {
+  int ok = 0;
+  VP8LBitReader* const br = &dec->br_;
+  const int simple_code = VP8LReadBits(br, 1);
+
+  if (simple_code) {  // Read symbols, codes & code lengths directly.
+    int symbols[2];
+    int codes[2];
+    int code_lengths[2];
+    const int num_symbols = VP8LReadBits(br, 1) + 1;
+    const int first_symbol_len_code = VP8LReadBits(br, 1);
+    // The first code is either a 1-bit or an 8-bit code.
+    symbols[0] = VP8LReadBits(br, (first_symbol_len_code == 0) ? 1 : 8);
+    codes[0] = 0;
+    code_lengths[0] = num_symbols - 1;
+    // The second code (if present) is always 8 bits long.
+    if (num_symbols == 2) {
+      symbols[1] = VP8LReadBits(br, 8);
+      codes[1] = 1;
+      code_lengths[1] = num_symbols - 1;
+    }
+    ok = HuffmanTreeBuildExplicit(tree, code_lengths, codes, symbols,
+                                  alphabet_size, num_symbols);
+  } else {  // Decode Huffman-coded code lengths.
+    int* code_lengths = NULL;
+    int i;
+    int code_length_code_lengths[NUM_CODE_LENGTH_CODES] = { 0 };
+    const int num_codes = VP8LReadBits(br, 4) + 4;
+    if (num_codes > NUM_CODE_LENGTH_CODES) {
+      dec->status_ = VP8_STATUS_BITSTREAM_ERROR;
+      return 0;
+    }
+
+    code_lengths =
+        (int*)WebPSafeCalloc((uint64_t)alphabet_size, sizeof(*code_lengths));
+    if (code_lengths == NULL) {
+      dec->status_ = VP8_STATUS_OUT_OF_MEMORY;
+      return 0;
+    }
+
+    for (i = 0; i < num_codes; ++i) {
+      code_length_code_lengths[kCodeLengthCodeOrder[i]] = VP8LReadBits(br, 3);
+    }
+    ok = ReadHuffmanCodeLengths(dec, code_length_code_lengths, alphabet_size,
+                                code_lengths);
+    if (ok) {
+      ok = HuffmanTreeBuildImplicit(tree, code_lengths, alphabet_size);
+    }
+    free(code_lengths);
+  }
+  ok = ok && !br->error_;
+  if (!ok) {
+    dec->status_ = VP8_STATUS_BITSTREAM_ERROR;
+    return 0;
+  }
+  return 1;
+}
+
+static void DeleteHtreeGroups(HTreeGroup* htree_groups, int num_htree_groups) {
+  if (htree_groups != NULL) {
+    int i, j;
+    for (i = 0; i < num_htree_groups; ++i) {
+      HuffmanTree* const htrees = htree_groups[i].htrees_;
+      for (j = 0; j < HUFFMAN_CODES_PER_META_CODE; ++j) {
+        HuffmanTreeRelease(&htrees[j]);
+      }
+    }
+    free(htree_groups);
+  }
+}
+
+static int ReadHuffmanCodes(VP8LDecoder* const dec, int xsize, int ysize,
+                            int color_cache_bits, int allow_recursion) {
+  int i, j;
+  VP8LBitReader* const br = &dec->br_;
+  VP8LMetadata* const hdr = &dec->hdr_;
+  uint32_t* huffman_image = NULL;
+  HTreeGroup* htree_groups = NULL;
+  int num_htree_groups = 1;
+
+  if (allow_recursion && VP8LReadBits(br, 1)) {
+    // use meta Huffman codes.
+    const int huffman_precision = VP8LReadBits(br, 3) + 2;
+    const int huffman_xsize = VP8LSubSampleSize(xsize, huffman_precision);
+    const int huffman_ysize = VP8LSubSampleSize(ysize, huffman_precision);
+    const int huffman_pixs = huffman_xsize * huffman_ysize;
+    if (!DecodeImageStream(huffman_xsize, huffman_ysize, 0, dec,
+                           &huffman_image)) {
+      dec->status_ = VP8_STATUS_BITSTREAM_ERROR;
+      goto Error;
+    }
+    hdr->huffman_subsample_bits_ = huffman_precision;
+    for (i = 0; i < huffman_pixs; ++i) {
+      // The Huffman data is stored in the red and green bytes.
+      const int index = (huffman_image[i] >> 8) & 0xffff;
+      huffman_image[i] = index;
+      if (index >= num_htree_groups) {
+        num_htree_groups = index + 1;
+      }
+    }
+  }
+
+  if (br->error_) goto Error;
+
+  assert(num_htree_groups <= 0x10000);
+  htree_groups =
+      (HTreeGroup*)WebPSafeCalloc((uint64_t)num_htree_groups,
+                                  sizeof(*htree_groups));
+  if (htree_groups == NULL) {
+    dec->status_ = VP8_STATUS_OUT_OF_MEMORY;
+    goto Error;
+  }
+
+  for (i = 0; i < num_htree_groups; ++i) {
+    HuffmanTree* const htrees = htree_groups[i].htrees_;
+    for (j = 0; j < HUFFMAN_CODES_PER_META_CODE; ++j) {
+      int alphabet_size = kAlphabetSize[j];
+      if (j == 0 && color_cache_bits > 0) {
+        alphabet_size += 1 << color_cache_bits;
+      }
+      if (!ReadHuffmanCode(alphabet_size, dec, htrees + j)) goto Error;
+    }
+  }
+
+  // All OK. Finalize pointers and return.
+  hdr->huffman_image_ = huffman_image;
+  hdr->num_htree_groups_ = num_htree_groups;
+  hdr->htree_groups_ = htree_groups;
+  return 1;
+
+ Error:
+  free(huffman_image);
+  DeleteHtreeGroups(htree_groups, num_htree_groups);
+  return 0;
+}
+
+//------------------------------------------------------------------------------
+// Scaling.
+
+static int AllocateAndInitRescaler(VP8LDecoder* const dec, VP8Io* const io) {
+  const int num_channels = 4;
+  const int in_width = io->mb_w;
+  const int out_width = io->scaled_width;
+  const int in_height = io->mb_h;
+  const int out_height = io->scaled_height;
+  const uint64_t work_size = 2 * num_channels * (uint64_t)out_width;
+  int32_t* work;        // Rescaler work area.
+  const uint64_t scaled_data_size = num_channels * (uint64_t)out_width;
+  uint32_t* scaled_data;  // Temporary storage for scaled BGRA data.
+  const uint64_t memory_size = sizeof(*dec->rescaler) +
+                               work_size * sizeof(*work) +
+                               scaled_data_size * sizeof(*scaled_data);
+  uint8_t* memory = (uint8_t*)WebPSafeCalloc(memory_size, sizeof(*memory));
+  if (memory == NULL) {
+    dec->status_ = VP8_STATUS_OUT_OF_MEMORY;
+    return 0;
+  }
+  assert(dec->rescaler_memory == NULL);
+  dec->rescaler_memory = memory;
+
+  dec->rescaler = (WebPRescaler*)memory;
+  memory += sizeof(*dec->rescaler);
+  work = (int32_t*)memory;
+  memory += work_size * sizeof(*work);
+  scaled_data = (uint32_t*)memory;
+
+  WebPRescalerInit(dec->rescaler, in_width, in_height, (uint8_t*)scaled_data,
+                   out_width, out_height, 0, num_channels,
+                   in_width, out_width, in_height, out_height, work);
+  return 1;
+}
+
+//------------------------------------------------------------------------------
+// Export to ARGB
+
+// We have a special "export" function since we need to convert from BGRA.
+static int Export(WebPRescaler* const rescaler, WEBP_CSP_MODE colorspace,
+                  int rgba_stride, uint8_t* const rgba) {
+  const uint32_t* const src = (const uint32_t*)rescaler->dst;
+  const int dst_width = rescaler->dst_width;
+  int num_lines_out = 0;
+  while (WebPRescalerHasPendingOutput(rescaler)) {
+    uint8_t* const dst = rgba + num_lines_out * rgba_stride;
+    WebPRescalerExportRow(rescaler);
+    VP8LConvertFromBGRA(src, dst_width, colorspace, dst);
+    ++num_lines_out;
+  }
+  return num_lines_out;
+}
+
+// Emit scaled rows.
+static int EmitRescaledRows(const VP8LDecoder* const dec,
+                            const uint32_t* const data, int in_stride, int mb_h,
+                            uint8_t* const out, int out_stride) {
+  const WEBP_CSP_MODE colorspace = dec->output_->colorspace;
+  const uint8_t* const in = (const uint8_t*)data;
+  int num_lines_in = 0;
+  int num_lines_out = 0;
+  while (num_lines_in < mb_h) {
+    const uint8_t* const row_in = in + num_lines_in * in_stride;
+    uint8_t* const row_out = out + num_lines_out * out_stride;
+    num_lines_in += WebPRescalerImport(dec->rescaler, mb_h - num_lines_in,
+                                       row_in, in_stride);
+    num_lines_out += Export(dec->rescaler, colorspace, out_stride, row_out);
+  }
+  return num_lines_out;
+}
+
+// Emit rows without any scaling.
+static int EmitRows(WEBP_CSP_MODE colorspace,
+                    const uint32_t* const data, int in_stride,
+                    int mb_w, int mb_h,
+                    uint8_t* const out, int out_stride) {
+  int lines = mb_h;
+  const uint8_t* row_in = (const uint8_t*)data;
+  uint8_t* row_out = out;
+  while (lines-- > 0) {
+    VP8LConvertFromBGRA((const uint32_t*)row_in, mb_w, colorspace, row_out);
+    row_in += in_stride;
+    row_out += out_stride;
+  }
+  return mb_h;  // Num rows out == num rows in.
+}
+
+//------------------------------------------------------------------------------
+// Export to YUVA
+
+static void ConvertToYUVA(const uint32_t* const src, int width, int y_pos,
+                          const WebPDecBuffer* const output) {
+  const WebPYUVABuffer* const buf = &output->u.YUVA;
+  // first, the luma plane
+  {
+    int i;
+    uint8_t* const y = buf->y + y_pos * buf->y_stride;
+    for (i = 0; i < width; ++i) {
+      const uint32_t p = src[i];
+      y[i] = VP8RGBToY((p >> 16) & 0xff, (p >> 8) & 0xff, (p >> 0) & 0xff);
+    }
+  }
+
+  // then U/V planes
+  {
+    uint8_t* const u = buf->u + (y_pos >> 1) * buf->u_stride;
+    uint8_t* const v = buf->v + (y_pos >> 1) * buf->v_stride;
+    const int uv_width = width >> 1;
+    int i;
+    for (i = 0; i < uv_width; ++i) {
+      const uint32_t v0 = src[2 * i + 0];
+      const uint32_t v1 = src[2 * i + 1];
+      // VP8RGBToU/V expects four accumulated pixels. Hence we need to scale
+      // the r/g/b values by a factor of 2 (we just shift v0/v1 one bit less).
+      const int r = ((v0 >> 15) & 0x1fe) + ((v1 >> 15) & 0x1fe);
+      const int g = ((v0 >>  7) & 0x1fe) + ((v1 >>  7) & 0x1fe);
+      const int b = ((v0 <<  1) & 0x1fe) + ((v1 <<  1) & 0x1fe);
+      if (!(y_pos & 1)) {  // even lines: store values
+        u[i] = VP8RGBToU(r, g, b);
+        v[i] = VP8RGBToV(r, g, b);
+      } else {             // odd lines: average with previous values
+        const int tmp_u = VP8RGBToU(r, g, b);
+        const int tmp_v = VP8RGBToV(r, g, b);
+        // Approximated average-of-four. But it's an acceptable diff.
+        u[i] = (u[i] + tmp_u + 1) >> 1;
+        v[i] = (v[i] + tmp_v + 1) >> 1;
+      }
+    }
+    if (width & 1) {       // last pixel
+      const uint32_t v0 = src[2 * i + 0];
+      const int r = (v0 >> 14) & 0x3fc;
+      const int g = (v0 >>  6) & 0x3fc;
+      const int b = (v0 <<  2) & 0x3fc;
+      if (!(y_pos & 1)) {  // even lines
+        u[i] = VP8RGBToU(r, g, b);
+        v[i] = VP8RGBToV(r, g, b);
+      } else {             // odd lines (note: we could just skip this)
+        const int tmp_u = VP8RGBToU(r, g, b);
+        const int tmp_v = VP8RGBToV(r, g, b);
+        u[i] = (u[i] + tmp_u + 1) >> 1;
+        v[i] = (v[i] + tmp_v + 1) >> 1;
+      }
+    }
+  }
+  // Lastly, store alpha if needed.
+  if (buf->a != NULL) {
+    int i;
+    uint8_t* const a = buf->a + y_pos * buf->a_stride;
+    for (i = 0; i < width; ++i) a[i] = (src[i] >> 24);
+  }
+}
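
The shift-and-mask expressions in the U/V loop above are just doubled channel
values: (v >> 15) & 0x1fe keeps bits 1..8, i.e. twice the red byte, and likewise
for green and blue. A tiny standalone check (not part of the patch):

  #include <assert.h>
  #include <stdint.h>
  #include <stdio.h>

  int main(void) {
    const uint32_t p = 0x80fa1b2cu;   /* sample ARGB pixel */
    const int r = (p >> 16) & 0xff, g = (p >> 8) & 0xff, b = p & 0xff;
    /* (p >> 15) & 0x1fe keeps bits 1..8, i.e. the red channel times two;
     * same idea for green (>> 7) and blue (<< 1). */
    assert(((p >> 15) & 0x1fe) == (uint32_t)(2 * r));
    assert(((p >>  7) & 0x1fe) == (uint32_t)(2 * g));
    assert(((p <<  1) & 0x1fe) == (uint32_t)(2 * b));
    printf("2*r=%d 2*g=%d 2*b=%d\n", 2 * r, 2 * g, 2 * b);
    return 0;
  }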
+
+static int ExportYUVA(const VP8LDecoder* const dec, int y_pos) {
+  WebPRescaler* const rescaler = dec->rescaler;
+  const uint32_t* const src = (const uint32_t*)rescaler->dst;
+  const int dst_width = rescaler->dst_width;
+  int num_lines_out = 0;
+  while (WebPRescalerHasPendingOutput(rescaler)) {
+    WebPRescalerExportRow(rescaler);
+    ConvertToYUVA(src, dst_width, y_pos, dec->output_);
+    ++y_pos;
+    ++num_lines_out;
+  }
+  return num_lines_out;
+}
+
+static int EmitRescaledRowsYUVA(const VP8LDecoder* const dec,
+                                const uint32_t* const data,
+                                int in_stride, int mb_h) {
+  const uint8_t* const in = (const uint8_t*)data;
+  int num_lines_in = 0;
+  int y_pos = dec->last_out_row_;
+  while (num_lines_in < mb_h) {
+    const uint8_t* const row_in = in + num_lines_in * in_stride;
+    num_lines_in += WebPRescalerImport(dec->rescaler, mb_h - num_lines_in,
+                                       row_in, in_stride);
+    y_pos += ExportYUVA(dec, y_pos);
+  }
+  return y_pos;
+}
+
+static int EmitRowsYUVA(const VP8LDecoder* const dec,
+                        const uint32_t* const data, int in_stride,
+                        int mb_w, int num_rows) {
+  int y_pos = dec->last_out_row_;
+  const uint8_t* row_in = (const uint8_t*)data;
+  while (num_rows-- > 0) {
+    ConvertToYUVA((const uint32_t*)row_in, mb_w, y_pos, dec->output_);
+    row_in += in_stride;
+    ++y_pos;
+  }
+  return y_pos;
+}
+
+//------------------------------------------------------------------------------
+// Cropping.
+
+// Sets io->mb_y, io->mb_h & io->mb_w according to start row, end row and
+// crop options. Also updates the input data pointer, so that it points to the
+// start of the cropped window.
+// Note that 'pixel_stride' is in units of 'uint32_t' (and not bytes).
+// Returns true if the crop window is not empty.
+static int SetCropWindow(VP8Io* const io, int y_start, int y_end,
+                         const uint32_t** const in_data, int pixel_stride) {
+  assert(y_start < y_end);
+  assert(io->crop_left < io->crop_right);
+  if (y_end > io->crop_bottom) {
+    y_end = io->crop_bottom;  // make sure we don't overflow on last row.
+  }
+  if (y_start < io->crop_top) {
+    const int delta = io->crop_top - y_start;
+    y_start = io->crop_top;
+    *in_data += pixel_stride * delta;
+  }
+  if (y_start >= y_end) return 0;  // Crop window is empty.
+
+  *in_data += io->crop_left;
+
+  io->mb_y = y_start - io->crop_top;
+  io->mb_w = io->crop_right - io->crop_left;
+  io->mb_h = y_end - y_start;
+  return 1;  // Non-empty crop window.
+}
+
+//------------------------------------------------------------------------------
+
+static WEBP_INLINE int GetMetaIndex(
+    const uint32_t* const image, int xsize, int bits, int x, int y) {
+  if (bits == 0) return 0;
+  return image[xsize * (y >> bits) + (x >> bits)];
+}
+
+static WEBP_INLINE HTreeGroup* GetHtreeGroupForPos(VP8LMetadata* const hdr,
+                                                   int x, int y) {
+  const int meta_index = GetMetaIndex(hdr->huffman_image_, hdr->huffman_xsize_,
+                                      hdr->huffman_subsample_bits_, x, y);
+  assert(meta_index < hdr->num_htree_groups_);
+  return hdr->htree_groups_ + meta_index;
+}
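
A minimal sketch (not part of the patch) of the tile lookup above, assuming
VP8LSubSampleSize() rounds up, i.e. the meta-Huffman image is
ceil(width / 2^bits) tiles wide:

  #include <assert.h>
  #include <stdio.h>

  /* Tile index of pixel (x, y) inside the subsampled meta-Huffman image
   * (mirrors GetMetaIndex() above, under the rounding assumption stated). */
  static int MetaPos(int width, int bits, int x, int y) {
    const int tiles_per_row = (width + (1 << bits) - 1) >> bits;
    return tiles_per_row * (y >> bits) + (x >> bits);
  }

  int main(void) {
    /* 100-pixel-wide image, 16x16 tiles (bits = 4): 7 tiles per row. */
    assert(MetaPos(100, 4, 0, 0) == 0);
    assert(MetaPos(100, 4, 99, 0) == 6);    /* last tile of the first row   */
    assert(MetaPos(100, 4, 5, 16) == 7);    /* first tile of the second row */
    printf("meta-tile lookups OK\n");
    return 0;
  }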
+
+//------------------------------------------------------------------------------
+// Main loop, with custom row-processing function
+
+typedef void (*ProcessRowsFunc)(VP8LDecoder* const dec, int row);
+
+static void ApplyInverseTransforms(VP8LDecoder* const dec, int num_rows,
+                                   const uint32_t* const rows) {
+  int n = dec->next_transform_;
+  const int cache_pixs = dec->width_ * num_rows;
+  const int start_row = dec->last_row_;
+  const int end_row = start_row + num_rows;
+  const uint32_t* rows_in = rows;
+  uint32_t* const rows_out = dec->argb_cache_;
+
+  // Inverse transforms.
+  // TODO: most transforms only need to operate on the cropped region.
+  memcpy(rows_out, rows_in, cache_pixs * sizeof(*rows_out));
+  while (n-- > 0) {
+    VP8LTransform* const transform = &dec->transforms_[n];
+    VP8LInverseTransform(transform, start_row, end_row, rows_in, rows_out);
+    rows_in = rows_out;
+  }
+}
+
+// Processes (transforms, scales & color-converts) the rows decoded after the
+// last call.
+static void ProcessRows(VP8LDecoder* const dec, int row) {
+  const uint32_t* const rows = dec->argb_ + dec->width_ * dec->last_row_;
+  const int num_rows = row - dec->last_row_;
+
+  if (num_rows <= 0) return;  // Nothing to be done.
+  ApplyInverseTransforms(dec, num_rows, rows);
+
+  // Emit output.
+  {
+    VP8Io* const io = dec->io_;
+    const uint32_t* rows_data = dec->argb_cache_;
+    if (!SetCropWindow(io, dec->last_row_, row, &rows_data, io->width)) {
+      // Nothing to output (this time).
+    } else {
+      const WebPDecBuffer* const output = dec->output_;
+      const int in_stride = io->width * sizeof(*rows_data);
+      if (output->colorspace < MODE_YUV) {  // convert to RGBA
+        const WebPRGBABuffer* const buf = &output->u.RGBA;
+        uint8_t* const rgba = buf->rgba + dec->last_out_row_ * buf->stride;
+        const int num_rows_out = io->use_scaling ?
+            EmitRescaledRows(dec, rows_data, in_stride, io->mb_h,
+                             rgba, buf->stride) :
+            EmitRows(output->colorspace, rows_data, in_stride,
+                     io->mb_w, io->mb_h, rgba, buf->stride);
+        // Update 'last_out_row_'.
+        dec->last_out_row_ += num_rows_out;
+      } else {                              // convert to YUVA
+        dec->last_out_row_ = io->use_scaling ?
+            EmitRescaledRowsYUVA(dec, rows_data, in_stride, io->mb_h) :
+            EmitRowsYUVA(dec, rows_data, in_stride, io->mb_w, io->mb_h);
+      }
+      assert(dec->last_out_row_ <= output->height);
+    }
+  }
+
+  // Update 'last_row_'.
+  dec->last_row_ = row;
+  assert(dec->last_row_ <= dec->height_);
+}
+
+static int DecodeImageData(VP8LDecoder* const dec,
+                           uint32_t* const data, int width, int height,
+                           ProcessRowsFunc process_func) {
+  int ok = 1;
+  int col = 0, row = 0;
+  VP8LBitReader* const br = &dec->br_;
+  VP8LMetadata* const hdr = &dec->hdr_;
+  HTreeGroup* htree_group = hdr->htree_groups_;
+  uint32_t* src = data;
+  uint32_t* last_cached = data;
+  uint32_t* const src_end = data + width * height;
+  const int len_code_limit = NUM_LITERAL_CODES + NUM_LENGTH_CODES;
+  const int color_cache_limit = len_code_limit + hdr->color_cache_size_;
+  VP8LColorCache* const color_cache =
+      (hdr->color_cache_size_ > 0) ? &hdr->color_cache_ : NULL;
+  const int mask = hdr->huffman_mask_;
+
+  assert(htree_group != NULL);
+
+  while (!br->eos_ && src < src_end) {
+    int code;
+    // Only update when changing tile. Note we could use the following test:
+    //   if "(((prev_col ^ col) | (prev_row ^ row)) > mask)" -> tile changed
+    // but that's actually slower and requires storing the previous col/row.
+    if ((col & mask) == 0) {
+      htree_group = GetHtreeGroupForPos(hdr, col, row);
+    }
+    VP8LFillBitWindow(br);
+    code = ReadSymbol(&htree_group->htrees_[GREEN], br);
+    if (code < NUM_LITERAL_CODES) {   // Literal.
+      int red, green, blue, alpha;
+      red = ReadSymbol(&htree_group->htrees_[RED], br);
+      green = code;
+      VP8LFillBitWindow(br);
+      blue = ReadSymbol(&htree_group->htrees_[BLUE], br);
+      alpha = ReadSymbol(&htree_group->htrees_[ALPHA], br);
+      *src = (alpha << 24) + (red << 16) + (green << 8) + blue;
+ AdvanceByOne:
+      ++src;
+      ++col;
+      if (col >= width) {
+        col = 0;
+        ++row;
+        if ((process_func != NULL) && (row % NUM_ARGB_CACHE_ROWS == 0)) {
+          process_func(dec, row);
+        }
+        if (color_cache != NULL) {
+          while (last_cached < src) {
+            VP8LColorCacheInsert(color_cache, *last_cached++);
+          }
+        }
+      }
+    } else if (code < len_code_limit) {           // Backward reference
+      int dist_code, dist;
+      const int length_sym = code - NUM_LITERAL_CODES;
+      const int length = GetCopyLength(length_sym, br);
+      const int dist_symbol = ReadSymbol(&htree_group->htrees_[DIST], br);
+      VP8LFillBitWindow(br);
+      dist_code = GetCopyDistance(dist_symbol, br);
+      dist = PlaneCodeToDistance(width, dist_code);
+      if (src - data < dist || src_end - src < length) {
+        ok = 0;
+        goto End;
+      }
+      {
+        int i;
+        for (i = 0; i < length; ++i) src[i] = src[i - dist];
+        src += length;
+      }
+      col += length;
+      while (col >= width) {
+        col -= width;
+        ++row;
+        if ((process_func != NULL) && (row % NUM_ARGB_CACHE_ROWS == 0)) {
+          process_func(dec, row);
+        }
+      }
+      if (src < src_end) {
+        htree_group = GetHtreeGroupForPos(hdr, col, row);
+        if (color_cache != NULL) {
+          while (last_cached < src) {
+            VP8LColorCacheInsert(color_cache, *last_cached++);
+          }
+        }
+      }
+    } else if (code < color_cache_limit) {    // Color cache.
+      const int key = code - len_code_limit;
+      assert(color_cache != NULL);
+      while (last_cached < src) {
+        VP8LColorCacheInsert(color_cache, *last_cached++);
+      }
+      *src = VP8LColorCacheLookup(color_cache, key);
+      goto AdvanceByOne;
+    } else {    // Not reached.
+      ok = 0;
+      goto End;
+    }
+    ok = !br->error_;
+    if (!ok) goto End;
+  }
+  // Process the remaining rows corresponding to last row-block.
+  if (process_func != NULL) process_func(dec, row);
+
+ End:
+  if (br->error_ || !ok || (br->eos_ && src < src_end)) {
+    ok = 0;
+    dec->status_ = (!br->eos_) ?
+        VP8_STATUS_BITSTREAM_ERROR : VP8_STATUS_SUSPENDED;
+  } else if (src == src_end) {
+    dec->state_ = READ_DATA;
+  }
+
+  return ok;
+}
+
+// -----------------------------------------------------------------------------
+// VP8LTransform
+
+static void ClearTransform(VP8LTransform* const transform) {
+  free(transform->data_);
+  transform->data_ = NULL;
+}
+
+// For security reasons, we need to remap the color map to span
+// the total possible bundled values, and not just the num_colors.
+static int ExpandColorMap(int num_colors, VP8LTransform* const transform) {
+  int i;
+  const int final_num_colors = 1 << (8 >> transform->bits_);
+  uint32_t* const new_color_map =
+      (uint32_t*)WebPSafeMalloc((uint64_t)final_num_colors,
+                                sizeof(*new_color_map));
+  if (new_color_map == NULL) {
+    return 0;
+  } else {
+    uint8_t* const data = (uint8_t*)transform->data_;
+    uint8_t* const new_data = (uint8_t*)new_color_map;
+    new_color_map[0] = transform->data_[0];
+    for (i = 4; i < 4 * num_colors; ++i) {
+      // Equivalent to AddPixelEq(), on a byte-basis.
+      new_data[i] = (data[i] + new_data[i - 4]) & 0xff;
+    }
+    for (; i < 4 * final_num_colors; ++i)
+      new_data[i] = 0;  // black tail.
+    free(transform->data_);
+    transform->data_ = new_color_map;
+  }
+  return 1;
+}
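
A small standalone sketch (not part of the patch) of the sizing rule used above:
the 'bits' chosen for COLOR_INDEXING_TRANSFORM and the resulting
1 << (8 >> bits) entries the expanded color map must hold.

  #include <assert.h>
  #include <stdio.h>

  /* Number of palette entries the expanded color map must hold, given the
   * pixel-bundling factor (mirrors the 'bits' selection above and
   * 'final_num_colors' in ExpandColorMap()). */
  static int FinalNumColors(int num_colors) {
    const int bits = (num_colors > 16) ? 0
                   : (num_colors > 4) ? 1
                   : (num_colors > 2) ? 2
                   : 3;
    return 1 << (8 >> bits);   /* 256, 16, 4 or 2 entries */
  }

  int main(void) {
    assert(FinalNumColors(200) == 256);  /* one 8-bit index per byte     */
    assert(FinalNumColors(12) == 16);    /* two 4-bit indices per byte   */
    assert(FinalNumColors(3) == 4);      /* four 2-bit indices per byte  */
    assert(FinalNumColors(2) == 2);      /* eight 1-bit indices per byte */
    printf("color-map sizes OK\n");
    return 0;
  }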
+
+static int ReadTransform(int* const xsize, int const* ysize,
+                         VP8LDecoder* const dec) {
+  int ok = 1;
+  VP8LBitReader* const br = &dec->br_;
+  VP8LTransform* transform = &dec->transforms_[dec->next_transform_];
+  const VP8LImageTransformType type =
+      (VP8LImageTransformType)VP8LReadBits(br, 2);
+
+  // Each transform type can only be present once in the stream.
+  if (dec->transforms_seen_ & (1U << type)) {
+    return 0;  // Already there, let's not accept the second same transform.
+  }
+  dec->transforms_seen_ |= (1U << type);
+
+  transform->type_ = type;
+  transform->xsize_ = *xsize;
+  transform->ysize_ = *ysize;
+  transform->data_ = NULL;
+  ++dec->next_transform_;
+  assert(dec->next_transform_ <= NUM_TRANSFORMS);
+
+  switch (type) {
+    case PREDICTOR_TRANSFORM:
+    case CROSS_COLOR_TRANSFORM:
+      transform->bits_ = VP8LReadBits(br, 3) + 2;
+      ok = DecodeImageStream(VP8LSubSampleSize(transform->xsize_,
+                                               transform->bits_),
+                             VP8LSubSampleSize(transform->ysize_,
+                                               transform->bits_),
+                             0, dec, &transform->data_);
+      break;
+    case COLOR_INDEXING_TRANSFORM: {
+       const int num_colors = VP8LReadBits(br, 8) + 1;
+       const int bits = (num_colors > 16) ? 0
+                      : (num_colors > 4) ? 1
+                      : (num_colors > 2) ? 2
+                      : 3;
+       *xsize = VP8LSubSampleSize(transform->xsize_, bits);
+       transform->bits_ = bits;
+       ok = DecodeImageStream(num_colors, 1, 0, dec, &transform->data_);
+       ok = ok && ExpandColorMap(num_colors, transform);
+      break;
+    }
+    case SUBTRACT_GREEN:
+      break;
+    default:
+      assert(0);    // can't happen
+      break;
+  }
+
+  return ok;
+}
+
+// -----------------------------------------------------------------------------
+// VP8LMetadata
+
+static void InitMetadata(VP8LMetadata* const hdr) {
+  assert(hdr);
+  memset(hdr, 0, sizeof(*hdr));
+}
+
+static void ClearMetadata(VP8LMetadata* const hdr) {
+  assert(hdr);
+
+  free(hdr->huffman_image_);
+  DeleteHtreeGroups(hdr->htree_groups_, hdr->num_htree_groups_);
+  VP8LColorCacheClear(&hdr->color_cache_);
+  InitMetadata(hdr);
+}
+
+// -----------------------------------------------------------------------------
+// VP8LDecoder
+
+VP8LDecoder* VP8LNew(void) {
+  VP8LDecoder* const dec = (VP8LDecoder*)calloc(1, sizeof(*dec));
+  if (dec == NULL) return NULL;
+  dec->status_ = VP8_STATUS_OK;
+  dec->action_ = READ_DIM;
+  dec->state_ = READ_DIM;
+  return dec;
+}
+
+void VP8LClear(VP8LDecoder* const dec) {
+  int i;
+  if (dec == NULL) return;
+  ClearMetadata(&dec->hdr_);
+
+  free(dec->argb_);
+  dec->argb_ = NULL;
+  for (i = 0; i < dec->next_transform_; ++i) {
+    ClearTransform(&dec->transforms_[i]);
+  }
+  dec->next_transform_ = 0;
+  dec->transforms_seen_ = 0;
+
+  free(dec->rescaler_memory);
+  dec->rescaler_memory = NULL;
+
+  dec->output_ = NULL;   // leave no trace behind
+}
+
+void VP8LDelete(VP8LDecoder* const dec) {
+  if (dec != NULL) {
+    VP8LClear(dec);
+    free(dec);
+  }
+}
+
+static void UpdateDecoder(VP8LDecoder* const dec, int width, int height) {
+  VP8LMetadata* const hdr = &dec->hdr_;
+  const int num_bits = hdr->huffman_subsample_bits_;
+  dec->width_ = width;
+  dec->height_ = height;
+
+  hdr->huffman_xsize_ = VP8LSubSampleSize(width, num_bits);
+  hdr->huffman_mask_ = (num_bits == 0) ? ~0 : (1 << num_bits) - 1;
+}
+
+static int DecodeImageStream(int xsize, int ysize,
+                             int is_level0,
+                             VP8LDecoder* const dec,
+                             uint32_t** const decoded_data) {
+  int ok = 1;
+  int transform_xsize = xsize;
+  int transform_ysize = ysize;
+  VP8LBitReader* const br = &dec->br_;
+  VP8LMetadata* const hdr = &dec->hdr_;
+  uint32_t* data = NULL;
+  int color_cache_bits = 0;
+
+  // Read the transforms (may recurse).
+  if (is_level0) {
+    while (ok && VP8LReadBits(br, 1)) {
+      ok = ReadTransform(&transform_xsize, &transform_ysize, dec);
+    }
+  }
+
+  // Color cache
+  if (ok && VP8LReadBits(br, 1)) {
+    color_cache_bits = VP8LReadBits(br, 4);
+    ok = (color_cache_bits >= 1 && color_cache_bits <= MAX_CACHE_BITS);
+    if (!ok) {
+      dec->status_ = VP8_STATUS_BITSTREAM_ERROR;
+      goto End;
+    }
+  }
+
+  // Read the Huffman codes (may recurse).
+  ok = ok && ReadHuffmanCodes(dec, transform_xsize, transform_ysize,
+                              color_cache_bits, is_level0);
+  if (!ok) {
+    dec->status_ = VP8_STATUS_BITSTREAM_ERROR;
+    goto End;
+  }
+
+  // Finish setting up the color-cache
+  if (color_cache_bits > 0) {
+    hdr->color_cache_size_ = 1 << color_cache_bits;
+    if (!VP8LColorCacheInit(&hdr->color_cache_, color_cache_bits)) {
+      dec->status_ = VP8_STATUS_OUT_OF_MEMORY;
+      ok = 0;
+      goto End;
+    }
+  } else {
+    hdr->color_cache_size_ = 0;
+  }
+  UpdateDecoder(dec, transform_xsize, transform_ysize);
+
+  if (is_level0) {   // level 0 complete
+    dec->state_ = READ_HDR;
+    goto End;
+  }
+
+  {
+    const uint64_t total_size = (uint64_t)transform_xsize * transform_ysize;
+    data = (uint32_t*)WebPSafeMalloc(total_size, sizeof(*data));
+    if (data == NULL) {
+      dec->status_ = VP8_STATUS_OUT_OF_MEMORY;
+      ok = 0;
+      goto End;
+    }
+  }
+
+  // Use the Huffman trees to decode the LZ77 encoded data.
+  ok = DecodeImageData(dec, data, transform_xsize, transform_ysize, NULL);
+  ok = ok && !br->error_;
+
+ End:
+
+  if (!ok) {
+    free(data);
+    ClearMetadata(hdr);
+    // If a lack of data (br.eos_) resulted in BITSTREAM_ERROR, update the
+    // status appropriately.
+    if (dec->status_ == VP8_STATUS_BITSTREAM_ERROR && dec->br_.eos_) {
+      dec->status_ = VP8_STATUS_SUSPENDED;
+    }
+  } else {
+    if (decoded_data != NULL) {
+      *decoded_data = data;
+    } else {
+      // We allocate image data in this function only for transforms. At level 0
+      // (that is: not the transforms), we shouldn't have allocated anything.
+      assert(data == NULL);
+      assert(is_level0);
+    }
+    if (!is_level0) ClearMetadata(hdr);  // Clean up temporary data behind.
+  }
+  return ok;
+}
+
+//------------------------------------------------------------------------------
+// Allocate dec->argb_ and dec->argb_cache_ using dec->width_ and dec->height_
+
+static int AllocateARGBBuffers(VP8LDecoder* const dec, int final_width) {
+  const uint64_t num_pixels = (uint64_t)dec->width_ * dec->height_;
+  // Scratch buffer corresponding to top-prediction row for transforming the
+  // first row in the row-blocks.
+  const uint64_t cache_top_pixels = final_width;
+  // Scratch buffer for temporary BGRA storage.
+  const uint64_t cache_pixels = (uint64_t)final_width * NUM_ARGB_CACHE_ROWS;
+  const uint64_t total_num_pixels =
+      num_pixels + cache_top_pixels + cache_pixels;
+
+  assert(dec->width_ <= final_width);
+  dec->argb_ = (uint32_t*)WebPSafeMalloc(total_num_pixels, sizeof(*dec->argb_));
+  if (dec->argb_ == NULL) {
+    dec->argb_cache_ = NULL;    // for sanity check
+    dec->status_ = VP8_STATUS_OUT_OF_MEMORY;
+    return 0;
+  }
+  dec->argb_cache_ = dec->argb_ + num_pixels + cache_top_pixels;
+  return 1;
+}
+
+//------------------------------------------------------------------------------
+// Special row-processing that only stores the alpha data.
+
+static void ExtractAlphaRows(VP8LDecoder* const dec, int row) {
+  const int num_rows = row - dec->last_row_;
+  const uint32_t* const in = dec->argb_ + dec->width_ * dec->last_row_;
+
+  if (num_rows <= 0) return;  // Nothing to be done.
+  ApplyInverseTransforms(dec, num_rows, in);
+
+  // Extract alpha (which is stored in the green plane).
+  {
+    const int width = dec->io_->width;      // the final width (!= dec->width_)
+    const int cache_pixs = width * num_rows;
+    uint8_t* const dst = (uint8_t*)dec->io_->opaque + width * dec->last_row_;
+    const uint32_t* const src = dec->argb_cache_;
+    int i;
+    for (i = 0; i < cache_pixs; ++i) dst[i] = (src[i] >> 8) & 0xff;
+  }
+
+  dec->last_row_ = dec->last_out_row_ = row;
+}
+
+int VP8LDecodeAlphaImageStream(int width, int height, const uint8_t* const data,
+                               size_t data_size, uint8_t* const output) {
+  VP8Io io;
+  int ok = 0;
+  VP8LDecoder* const dec = VP8LNew();
+  if (dec == NULL) return 0;
+
+  dec->width_ = width;
+  dec->height_ = height;
+  dec->io_ = &io;
+
+  VP8InitIo(&io);
+  WebPInitCustomIo(NULL, &io);    // Just a sanity Init. io won't be used.
+  io.opaque = output;
+  io.width = width;
+  io.height = height;
+
+  dec->status_ = VP8_STATUS_OK;
+  VP8LInitBitReader(&dec->br_, data, data_size);
+
+  dec->action_ = READ_HDR;
+  if (!DecodeImageStream(width, height, 1, dec, NULL)) goto Err;
+
+  // Allocate output (note that dec->width_ may have changed here).
+  if (!AllocateARGBBuffers(dec, width)) goto Err;
+
+  // Decode (with special row processing).
+  dec->action_ = READ_DATA;
+  ok = DecodeImageData(dec, dec->argb_, dec->width_, dec->height_,
+                       ExtractAlphaRows);
+
+ Err:
+  VP8LDelete(dec);
+  return ok;
+}
+
+//------------------------------------------------------------------------------
+
+int VP8LDecodeHeader(VP8LDecoder* const dec, VP8Io* const io) {
+  int width, height, has_alpha;
+
+  if (dec == NULL) return 0;
+  if (io == NULL) {
+    dec->status_ = VP8_STATUS_INVALID_PARAM;
+    return 0;
+  }
+
+  dec->io_ = io;
+  dec->status_ = VP8_STATUS_OK;
+  VP8LInitBitReader(&dec->br_, io->data, io->data_size);
+  if (!ReadImageInfo(&dec->br_, &width, &height, &has_alpha)) {
+    dec->status_ = VP8_STATUS_BITSTREAM_ERROR;
+    goto Error;
+  }
+  dec->state_ = READ_DIM;
+  io->width = width;
+  io->height = height;
+
+  dec->action_ = READ_HDR;
+  if (!DecodeImageStream(width, height, 1, dec, NULL)) goto Error;
+  return 1;
+
+ Error:
+  VP8LClear(dec);
+  assert(dec->status_ != VP8_STATUS_OK);
+  return 0;
+}
+
+int VP8LDecodeImage(VP8LDecoder* const dec) {
+  VP8Io* io = NULL;
+  WebPDecParams* params = NULL;
+
+  // Sanity checks.
+  if (dec == NULL) return 0;
+
+  io = dec->io_;
+  assert(io != NULL);
+  params = (WebPDecParams*)io->opaque;
+  assert(params != NULL);
+  dec->output_ = params->output;
+  assert(dec->output_ != NULL);
+
+  // Initialization.
+  if (!WebPIoInitFromOptions(params->options, io, MODE_BGRA)) {
+    dec->status_ = VP8_STATUS_INVALID_PARAM;
+    goto Err;
+  }
+
+  if (!AllocateARGBBuffers(dec, io->width)) goto Err;
+
+  if (io->use_scaling && !AllocateAndInitRescaler(dec, io)) goto Err;
+
+  // Decode.
+  dec->action_ = READ_DATA;
+  if (!DecodeImageData(dec, dec->argb_, dec->width_, dec->height_,
+                       ProcessRows)) {
+    goto Err;
+  }
+
+  // Cleanup.
+  params->last_y = dec->last_out_row_;
+  VP8LClear(dec);
+  return 1;
+
+ Err:
+  VP8LClear(dec);
+  assert(dec->status_ != VP8_STATUS_OK);
+  return 0;
+}
+
+//------------------------------------------------------------------------------
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/src/dec/vp8li.h b/src/dec/vp8li.h
new file mode 100644
index 0000000..be50e45
--- /dev/null
+++ b/src/dec/vp8li.h
@@ -0,0 +1,121 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// Lossless decoder: internal header.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+//         Vikas Arora (vikaas.arora@gmail.com)
+
+#ifndef WEBP_DEC_VP8LI_H_
+#define WEBP_DEC_VP8LI_H_
+
+#include <string.h>     // for memcpy()
+#include "./webpi.h"
+#include "../utils/bit_reader.h"
+#include "../utils/color_cache.h"
+#include "../utils/huffman.h"
+#include "webp/format_constants.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+typedef enum {
+  READ_DATA = 0,
+  READ_HDR = 1,
+  READ_DIM = 2
+} VP8LDecodeState;
+
+typedef struct VP8LTransform VP8LTransform;
+struct VP8LTransform {
+  VP8LImageTransformType type_;   // transform type.
+  int                    bits_;   // subsampling bits defining transform window.
+  int                    xsize_;  // transform window X index.
+  int                    ysize_;  // transform window Y index.
+  uint32_t              *data_;   // transform data.
+};
+
+typedef struct {
+  HuffmanTree htrees_[HUFFMAN_CODES_PER_META_CODE];
+} HTreeGroup;
+
+typedef struct {
+  int             color_cache_size_;
+  VP8LColorCache  color_cache_;
+
+  int             huffman_mask_;
+  int             huffman_subsample_bits_;
+  int             huffman_xsize_;
+  uint32_t       *huffman_image_;
+  int             num_htree_groups_;
+  HTreeGroup     *htree_groups_;
+} VP8LMetadata;
+
+typedef struct {
+  VP8StatusCode    status_;
+  VP8LDecodeState  action_;
+  VP8LDecodeState  state_;
+  VP8Io           *io_;
+
+  const WebPDecBuffer *output_;    // shortcut to io->opaque->output
+
+  uint32_t        *argb_;          // Internal data: always in BGRA color mode.
+  uint32_t        *argb_cache_;    // Scratch buffer for temporary BGRA storage.
+
+  VP8LBitReader    br_;
+
+  int              width_;
+  int              height_;
+  int              last_row_;      // last input row decoded so far.
+  int              last_out_row_;  // last row output so far.
+
+  VP8LMetadata     hdr_;
+
+  int              next_transform_;
+  VP8LTransform    transforms_[NUM_TRANSFORMS];
+  // or'd bitset storing the transforms types.
+  uint32_t         transforms_seen_;
+
+  uint8_t         *rescaler_memory;  // Working memory for rescaling work.
+  WebPRescaler    *rescaler;         // Common rescaler for all channels.
+} VP8LDecoder;
+
+//------------------------------------------------------------------------------
+// internal functions. Not public.
+
+// in vp8l.c
+
+// Decodes a raw image stream (without header) and stores the alpha data
+// into *output, which must be of size width x height. Returns false in case
+// of error.
+int VP8LDecodeAlphaImageStream(int width, int height, const uint8_t* const data,
+                               size_t data_size, uint8_t* const output);
+
+// Allocates and initializes a new lossless decoder instance.
+VP8LDecoder* VP8LNew(void);
+
+// Decodes the image header. Returns false in case of error.
+int VP8LDecodeHeader(VP8LDecoder* const dec, VP8Io* const io);
+
+// Decodes an image. It's required to decode the lossless header before calling
+// this function. Returns false in case of error, with updated dec->status_.
+int VP8LDecodeImage(VP8LDecoder* const dec);
+
+// Resets the decoder to its initial state, reclaiming memory.
+// Preserves the dec->status_ value.
+void VP8LClear(VP8LDecoder* const dec);
+
+// Clears and deallocates a lossless decoder instance.
+void VP8LDelete(VP8LDecoder* const dec);
+
+//------------------------------------------------------------------------------
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
+
+#endif  /* WEBP_DEC_VP8LI_H_ */
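
A rough usage sketch (not part of the patch) for the alpha entry point declared
above. It assumes the caller compiles inside src/dec/ (so the internal header is
reachable) and already holds a compressed VP8L alpha stream of the stated
dimensions; error handling beyond the boolean return is omitted.

  #include <stdlib.h>
  #include "./vp8li.h"   /* internal header, as included inside the library */

  /* Returns a width*height opacity plane (one byte per pixel) that the
   * caller must free(), or NULL on allocation or decoding failure. */
  static uint8_t* DecodeAlpha(const uint8_t* alpha_data, size_t alpha_size,
                              int width, int height) {
    uint8_t* const plane = (uint8_t*)malloc((size_t)width * height);
    if (plane == NULL) return NULL;
    if (!VP8LDecodeAlphaImageStream(width, height, alpha_data, alpha_size,
                                    plane)) {
      free(plane);     /* corrupt or truncated alpha stream */
      return NULL;
    }
    return plane;
  }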
diff --git a/src/dec/webp.c b/src/dec/webp.c
index eea5e6e..54cb6d3 100644
--- a/src/dec/webp.c
+++ b/src/dec/webp.c
@@ -1,4 +1,4 @@
-// Copyright 2010 Google Inc.
+// Copyright 2010 Google Inc. All Rights Reserved.
 //
 // This code is licensed under the same terms as WebM:
 //  Software License Agreement:  http://www.webmproject.org/license/software/
@@ -10,60 +10,367 @@
 // Author: Skal (pascal.massimino@gmail.com)
 
 #include <stdlib.h>
-#include "vp8i.h"
-#include "webpi.h"
+
+#include "./vp8i.h"
+#include "./vp8li.h"
+#include "./webpi.h"
+#include "webp/format_constants.h"
 
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // RIFF layout is:
-//   0ffset  tag
+//   Offset  tag
 //   0...3   "RIFF" 4-byte tag
 //   4...7   size of image data (including metadata) starting at offset 8
 //   8...11  "WEBP"   our form-type signature
-//   12..15  "VP8 ": 4-bytes tags, describing the raw video format used
+// The RIFF container (12 bytes) is followed by appropriate chunks:
+//   12..15  "VP8 ": 4-bytes tags, signaling the use of VP8 video format
 //   16..19  size of the raw VP8 image data, starting at offset 20
 //   20....  the VP8 bytes
-// There can be extra chunks after the "VP8 " chunk (ICMT, ICOP, ...)
-// All 32-bits sizes are in little-endian order.
-// Note: chunk data must be padded to multiple of 2 in size
+// Or,
+//   12..15  "VP8L": 4-bytes tags, signaling the use of VP8L lossless format
+//   16..19  size of the raw VP8L image data, starting at offset 20
+//   20....  the VP8L bytes
+// Or,
+//   12..15  "VP8X": 4-bytes tags, describing the extended-VP8 chunk.
+//   16..19  size of the VP8X chunk starting at offset 20.
+//   20..23  VP8X flags bit-map corresponding to the chunk-types present.
+//   24..26  Width of the Canvas Image.
+//   27..29  Height of the Canvas Image.
+// There can be extra chunks after the "VP8X" chunk (ICCP, TILE, FRM, VP8,
+// META  ...)
+// All sizes are in little-endian order.
+// Note: chunk data size must be padded to multiple of 2 when written.
 
-static inline uint32_t get_le32(const uint8_t* const data) {
-  return data[0] | (data[1] << 8) | (data[2] << 16) | (data[3] << 24);
+static WEBP_INLINE uint32_t get_le24(const uint8_t* const data) {
+  return data[0] | (data[1] << 8) | (data[2] << 16);
 }
 
-// If a RIFF container is detected, validate it and skip over it.
-uint32_t WebPCheckRIFFHeader(const uint8_t** data_ptr,
-                             uint32_t* data_size_ptr) {
-  uint32_t chunk_size = 0xffffffffu;
-  if (*data_size_ptr >= 10 + 20 && !memcmp(*data_ptr, "RIFF", 4)) {
-    if (memcmp(*data_ptr + 8, "WEBP", 4)) {
-      return 0;  // wrong image file signature
+static WEBP_INLINE uint32_t get_le32(const uint8_t* const data) {
+  return (uint32_t)get_le24(data) | (data[3] << 24);
+}
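
A standalone sketch (not part of the patch) of reading the little-endian size
fields described in the layout comment above; the byte values are made up but
self-consistent (a 30-byte VP8 payload inside a 42-byte RIFF payload).

  #include <assert.h>
  #include <stdint.h>
  #include <stdio.h>
  #include <string.h>

  static uint32_t le32(const uint8_t* p) {
    return p[0] | (p[1] << 8) | (p[2] << 16) | ((uint32_t)p[3] << 24);
  }

  int main(void) {
    /* 12-byte RIFF container followed by a "VP8 " chunk header, as in the
     * layout comment above. The RIFF size covers everything after offset 8. */
    uint8_t hdr[20];
    memcpy(hdr + 0, "RIFF", 4);
    hdr[4] = 0x2a; hdr[5] = 0; hdr[6] = 0; hdr[7] = 0;        /* size = 42 */
    memcpy(hdr + 8, "WEBP", 4);
    memcpy(hdr + 12, "VP8 ", 4);
    hdr[16] = 0x1e; hdr[17] = 0; hdr[18] = 0; hdr[19] = 0;    /* size = 30 */
    assert(le32(hdr + 4) == 42);    /* RIFF payload size                    */
    assert(le32(hdr + 16) == 30);   /* VP8 bitstream size, data at offset 20 */
    printf("RIFF size %u, VP8 chunk size %u\n", le32(hdr + 4), le32(hdr + 16));
    return 0;
  }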
+
+// Validates the RIFF container (if detected) and skips over it.
+// If a RIFF container is detected, returns
+//     VP8_STATUS_BITSTREAM_ERROR for an invalid header, and
+//     VP8_STATUS_OK otherwise.
+// In case there are not enough bytes (partial RIFF container), *riff_size is
+// set to 0. Otherwise it is set to the RIFF size extracted from the header.
+static VP8StatusCode ParseRIFF(const uint8_t** const data,
+                               size_t* const data_size,
+                               size_t* const riff_size) {
+  assert(data != NULL);
+  assert(data_size != NULL);
+  assert(riff_size != NULL);
+
+  *riff_size = 0;  // Default: no RIFF present.
+  if (*data_size >= RIFF_HEADER_SIZE && !memcmp(*data, "RIFF", TAG_SIZE)) {
+    if (memcmp(*data + 8, "WEBP", TAG_SIZE)) {
+      return VP8_STATUS_BITSTREAM_ERROR;  // Wrong image file signature.
     } else {
-      const uint32_t riff_size = get_le32(*data_ptr + 4);
-      if (riff_size < 12) {
-        return 0;   // we should have at least one chunk
-      }
-      if (memcmp(*data_ptr + 12, "VP8 ", 4)) {
-        return 0;   // invalid compression format
-      }
-      chunk_size = get_le32(*data_ptr + 16);
-      if (chunk_size > riff_size - 12) {
-        return 0;  // inconsistent size information.
+      const uint32_t size = get_le32(*data + TAG_SIZE);
+      // Check that we have at least one chunk (i.e. "WEBP" + "VP8?nnnn").
+      if (size < TAG_SIZE + CHUNK_HEADER_SIZE) {
+        return VP8_STATUS_BITSTREAM_ERROR;
       }
       // We have a RIFF container. Skip it.
-      *data_ptr += 20;
-      *data_size_ptr -= 20;
-      // Note: we don't report error for odd-sized chunks.
+      *riff_size = size;
+      *data += RIFF_HEADER_SIZE;
+      *data_size -= RIFF_HEADER_SIZE;
     }
-    return chunk_size;
   }
-  return *data_size_ptr;
+  return VP8_STATUS_OK;
 }
 
-//-----------------------------------------------------------------------------
+// Validates the VP8X header and skips over it.
+// Returns VP8_STATUS_BITSTREAM_ERROR for invalid VP8X header,
+//         VP8_STATUS_NOT_ENOUGH_DATA in case of insufficient data, and
+//         VP8_STATUS_OK otherwise.
+// If a VP8X chunk is found, found_vp8x is set to true and *width_ptr,
+// *height_ptr and *flags_ptr are set to the corresponding values extracted
+// from the VP8X chunk.
+static VP8StatusCode ParseVP8X(const uint8_t** const data,
+                               size_t* const data_size,
+                               int* const found_vp8x,
+                               int* const width_ptr, int* const height_ptr,
+                               uint32_t* const flags_ptr) {
+  const uint32_t vp8x_size = CHUNK_HEADER_SIZE + VP8X_CHUNK_SIZE;
+  assert(data != NULL);
+  assert(data_size != NULL);
+  assert(found_vp8x != NULL);
+
+  *found_vp8x = 0;
+
+  if (*data_size < CHUNK_HEADER_SIZE) {
+    return VP8_STATUS_NOT_ENOUGH_DATA;  // Insufficient data.
+  }
+
+  if (!memcmp(*data, "VP8X", TAG_SIZE)) {
+    int width, height;
+    uint32_t flags;
+    const uint32_t chunk_size = get_le32(*data + TAG_SIZE);
+    if (chunk_size != VP8X_CHUNK_SIZE) {
+      return VP8_STATUS_BITSTREAM_ERROR;  // Wrong chunk size.
+    }
+
+    // Verify that enough data is available to validate the VP8X chunk.
+    if (*data_size < vp8x_size) {
+      return VP8_STATUS_NOT_ENOUGH_DATA;  // Insufficient data.
+    }
+    flags = get_le32(*data + 8);
+    width = 1 + get_le24(*data + 12);
+    height = 1 + get_le24(*data + 15);
+    if (width * (uint64_t)height >= MAX_IMAGE_AREA) {
+      return VP8_STATUS_BITSTREAM_ERROR;  // image is too large
+    }
+
+    if (flags_ptr != NULL) *flags_ptr = flags;
+    if (width_ptr != NULL) *width_ptr = width;
+    if (height_ptr != NULL) *height_ptr = height;
+    // Skip over VP8X header bytes.
+    *data += vp8x_size;
+    *data_size -= vp8x_size;
+    *found_vp8x = 1;
+  }
+  return VP8_STATUS_OK;
+}
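
As parsed above, the VP8X chunk stores the canvas dimensions minus one as 24-bit
little-endian values. A tiny standalone round-trip check (not part of the
patch):

  #include <assert.h>
  #include <stdint.h>
  #include <stdio.h>

  static uint32_t le24(const uint8_t* p) {
    return p[0] | (p[1] << 8) | (p[2] << 16);
  }

  int main(void) {
    /* The VP8X chunk stores (width - 1) and (height - 1) as 24-bit
     * little-endian values, so each canvas side can be at most 2^24 pixels. */
    const int width = 16384;
    uint8_t enc[3];
    enc[0] = (width - 1) & 0xff;
    enc[1] = ((width - 1) >> 8) & 0xff;
    enc[2] = ((width - 1) >> 16) & 0xff;
    assert(1 + le24(enc) == (uint32_t)width);
    printf("max canvas side: %u\n", 1u << 24);
    return 0;
  }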
+
+// Skips to the next VP8/VP8L chunk header in the data given the size of the
+// RIFF chunk 'riff_size'.
+// Returns VP8_STATUS_BITSTREAM_ERROR if any invalid chunk size is encountered,
+//         VP8_STATUS_NOT_ENOUGH_DATA in case of insufficient data, and
+//         VP8_STATUS_OK otherwise.
+// If an alpha chunk is found, *alpha_data and *alpha_size are set
+// appropriately.
+static VP8StatusCode ParseOptionalChunks(const uint8_t** const data,
+                                         size_t* const data_size,
+                                         size_t const riff_size,
+                                         const uint8_t** const alpha_data,
+                                         size_t* const alpha_size) {
+  const uint8_t* buf;
+  size_t buf_size;
+  uint32_t total_size = TAG_SIZE +           // "WEBP".
+                        CHUNK_HEADER_SIZE +  // "VP8Xnnnn".
+                        VP8X_CHUNK_SIZE;     // data.
+  assert(data != NULL);
+  assert(data_size != NULL);
+  buf = *data;
+  buf_size = *data_size;
+
+  assert(alpha_data != NULL);
+  assert(alpha_size != NULL);
+  *alpha_data = NULL;
+  *alpha_size = 0;
+
+  while (1) {
+    uint32_t chunk_size;
+    uint32_t disk_chunk_size;   // chunk_size with padding
+
+    *data = buf;
+    *data_size = buf_size;
+
+    if (buf_size < CHUNK_HEADER_SIZE) {  // Insufficient data.
+      return VP8_STATUS_NOT_ENOUGH_DATA;
+    }
+
+    chunk_size = get_le32(buf + TAG_SIZE);
+    // For odd-sized chunk-payload, there's one byte padding at the end.
+    disk_chunk_size = (CHUNK_HEADER_SIZE + chunk_size + 1) & ~1;
+    total_size += disk_chunk_size;
+
+    // Check that total bytes skipped so far does not exceed riff_size.
+    if (riff_size > 0 && (total_size > riff_size)) {
+      return VP8_STATUS_BITSTREAM_ERROR;          // Not a valid chunk size.
+    }
+
+    if (buf_size < disk_chunk_size) {             // Insufficient data.
+      return VP8_STATUS_NOT_ENOUGH_DATA;
+    }
+
+    if (!memcmp(buf, "ALPH", TAG_SIZE)) {         // A valid ALPH header.
+      *alpha_data = buf + CHUNK_HEADER_SIZE;
+      *alpha_size = chunk_size;
+    } else if (!memcmp(buf, "VP8 ", TAG_SIZE) ||
+               !memcmp(buf, "VP8L", TAG_SIZE)) {  // A valid VP8/VP8L header.
+      return VP8_STATUS_OK;  // Found.
+    }
+
+    // We have a full and valid chunk; skip it.
+    buf += disk_chunk_size;
+    buf_size -= disk_chunk_size;
+  }
+}
+
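
As a worked example of the even-alignment rule applied above: an ALPH chunk
with a 13-byte payload occupies (8 + 13 + 1) & ~1 = 22 bytes on disk (one pad
byte), while a 14-byte payload also gives (8 + 14 + 1) & ~1 = 22 bytes (no
padding). The same computation as a standalone illustrative helper:

  // Sketch of the RIFF padding rule used by ParseOptionalChunks():
  // odd-sized payloads are followed by a single pad byte on disk.
  static uint32_t DiskChunkSize(uint32_t payload_size) {
    return (CHUNK_HEADER_SIZE + payload_size + 1) & ~1u;
  }
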
+// Validates the VP8/VP8L Header ("VP8 nnnn" or "VP8L nnnn") and skips over it.
+// Returns VP8_STATUS_BITSTREAM_ERROR for an invalid VP8/VP8L header
+//         (i.e. a chunk too large to fit within riff_size),
+//         VP8_STATUS_NOT_ENOUGH_DATA in case of insufficient data, and
+//         VP8_STATUS_OK otherwise.
+// If a VP8/VP8L chunk is found, *chunk_size is set to the total number of bytes
+// extracted from the VP8/VP8L chunk header.
+// The flag '*is_lossless' is set to 1 in case of VP8L chunk / raw VP8L data.
+static VP8StatusCode ParseVP8Header(const uint8_t** const data_ptr,
+                                    size_t* const data_size,
+                                    size_t riff_size,
+                                    size_t* const chunk_size,
+                                    int* const is_lossless) {
+  const uint8_t* const data = *data_ptr;
+  const int is_vp8 = !memcmp(data, "VP8 ", TAG_SIZE);
+  const int is_vp8l = !memcmp(data, "VP8L", TAG_SIZE);
+  const uint32_t minimal_size =
+      TAG_SIZE + CHUNK_HEADER_SIZE;  // "WEBP" + "VP8 nnnn" OR
+                                     // "WEBP" + "VP8Lnnnn"
+  assert(data != NULL);
+  assert(data_size != NULL);
+  assert(chunk_size != NULL);
+  assert(is_lossless != NULL);
+
+  if (*data_size < CHUNK_HEADER_SIZE) {
+    return VP8_STATUS_NOT_ENOUGH_DATA;  // Insufficient data.
+  }
+
+  if (is_vp8 || is_vp8l) {
+    // Bitstream contains VP8/VP8L header.
+    const uint32_t size = get_le32(data + TAG_SIZE);
+    if ((riff_size >= minimal_size) && (size > riff_size - minimal_size)) {
+      return VP8_STATUS_BITSTREAM_ERROR;  // Inconsistent size information.
+    }
+    // Skip over CHUNK_HEADER_SIZE bytes from VP8/VP8L Header.
+    *chunk_size = size;
+    *data_ptr += CHUNK_HEADER_SIZE;
+    *data_size -= CHUNK_HEADER_SIZE;
+    *is_lossless = is_vp8l;
+  } else {
+    // Raw VP8/VP8L bitstream (no header).
+    *is_lossless = VP8LCheckSignature(data, *data_size);
+    *chunk_size = *data_size;
+  }
+
+  return VP8_STATUS_OK;
+}
+
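
In other words, for a RIFF payload of riff_size bytes, the VP8/VP8L payload
announced in "VP8 nnnn"/"VP8Lnnnn" may not exceed riff_size - 12 (the 4-byte
"WEBP" tag plus the 8-byte chunk header); anything larger is rejected as
VP8_STATUS_BITSTREAM_ERROR. A quick numeric illustration of the bound checked
above:

  // minimal_size = TAG_SIZE (4) + CHUNK_HEADER_SIZE (8) = 12, so with
  // riff_size = 1000 any announced payload size > 988 fails the check.
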
+//------------------------------------------------------------------------------
+
+// Fetch '*width', '*height', '*has_alpha' and fill out 'headers' based on
+// 'data'. All the output parameters may be NULL. If 'headers' is NULL only the
+// minimal amount of data will be read to fetch the remaining parameters.
+// If 'headers' is non-NULL this function will attempt to locate both alpha
+// data (with or without a VP8X chunk) and the bitstream chunk (VP8/VP8L).
+// Note: The following chunk sequences (before the raw VP8/VP8L data) are
+// considered valid by this function:
+// RIFF + VP8(L)
+// RIFF + VP8X + (optional chunks) + VP8(L)
+// ALPH + VP8 <-- Not a valid WebP format: only allowed for internal purposes.
+// VP8(L)     <-- Not a valid WebP format: only allowed for internal purposes.
+static VP8StatusCode ParseHeadersInternal(const uint8_t* data,
+                                          size_t data_size,
+                                          int* const width,
+                                          int* const height,
+                                          int* const has_alpha,
+                                          WebPHeaderStructure* const headers) {
+  int found_riff = 0;
+  int found_vp8x = 0;
+  VP8StatusCode status;
+  WebPHeaderStructure hdrs;
+
+  if (data == NULL || data_size < RIFF_HEADER_SIZE) {
+    return VP8_STATUS_NOT_ENOUGH_DATA;
+  }
+  memset(&hdrs, 0, sizeof(hdrs));
+  hdrs.data = data;
+  hdrs.data_size = data_size;
+
+  // Skip over RIFF header.
+  status = ParseRIFF(&data, &data_size, &hdrs.riff_size);
+  if (status != VP8_STATUS_OK) {
+    return status;   // Wrong RIFF header / insufficient data.
+  }
+  found_riff = (hdrs.riff_size > 0);
+
+  // Skip over VP8X.
+  {
+    uint32_t flags = 0;
+    status = ParseVP8X(&data, &data_size, &found_vp8x, width, height, &flags);
+    if (status != VP8_STATUS_OK) {
+      return status;  // Wrong VP8X / insufficient data.
+    }
+    if (!found_riff && found_vp8x) {
+      // Note: This restriction may be removed in the future, if it becomes
+      // necessary to send VP8X chunk to the decoder.
+      return VP8_STATUS_BITSTREAM_ERROR;
+    }
+    if (has_alpha != NULL) *has_alpha = !!(flags & ALPHA_FLAG_BIT);
+    if (found_vp8x && headers == NULL) {
+      return VP8_STATUS_OK;  // Return features from VP8X header.
+    }
+  }
+
+  if (data_size < TAG_SIZE) return VP8_STATUS_NOT_ENOUGH_DATA;
+
+  // Skip over optional chunks if data started with "RIFF + VP8X" or "ALPH".
+  if ((found_riff && found_vp8x) ||
+      (!found_riff && !found_vp8x && !memcmp(data, "ALPH", TAG_SIZE))) {
+    status = ParseOptionalChunks(&data, &data_size, hdrs.riff_size,
+                                 &hdrs.alpha_data, &hdrs.alpha_data_size);
+    if (status != VP8_STATUS_OK) {
+      return status;  // Found an invalid chunk size / insufficient data.
+    }
+  }
+
+  // Skip over VP8/VP8L header.
+  status = ParseVP8Header(&data, &data_size, hdrs.riff_size,
+                          &hdrs.compressed_size, &hdrs.is_lossless);
+  if (status != VP8_STATUS_OK) {
+    return status;  // Wrong VP8/VP8L chunk-header / insufficient data.
+  }
+  if (hdrs.compressed_size > MAX_CHUNK_PAYLOAD) {
+    return VP8_STATUS_BITSTREAM_ERROR;
+  }
+
+  if (!hdrs.is_lossless) {
+    if (data_size < VP8_FRAME_HEADER_SIZE) {
+      return VP8_STATUS_NOT_ENOUGH_DATA;
+    }
+    // Validates raw VP8 data.
+    if (!VP8GetInfo(data, data_size,
+                    (uint32_t)hdrs.compressed_size, width, height)) {
+      return VP8_STATUS_BITSTREAM_ERROR;
+    }
+  } else {
+    if (data_size < VP8L_FRAME_HEADER_SIZE) {
+      return VP8_STATUS_NOT_ENOUGH_DATA;
+    }
+    // Validates raw VP8L data.
+    if (!VP8LGetInfo(data, data_size, width, height, has_alpha)) {
+      return VP8_STATUS_BITSTREAM_ERROR;
+    }
+  }
+
+  if (has_alpha != NULL) {
+    // If the data did not contain a VP8X/VP8L chunk the only definitive way
+    // to set this is by looking for alpha data (from an ALPH chunk).
+    *has_alpha |= (hdrs.alpha_data != NULL);
+  }
+  if (headers != NULL) {
+    *headers = hdrs;
+    headers->offset = data - headers->data;
+    assert((uint64_t)(data - headers->data) < MAX_CHUNK_PAYLOAD);
+    assert(headers->offset == headers->data_size - data_size);
+  }
+  return VP8_STATUS_OK;  // Return features from VP8 header.
+}
+
+VP8StatusCode WebPParseHeaders(WebPHeaderStructure* const headers) {
+  assert(headers != NULL);
+  // fill out headers, ignore width/height/has_alpha.
+  return ParseHeadersInternal(headers->data, headers->data_size,
+                              NULL, NULL, NULL, headers);
+}
+
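
A minimal sketch of how an internal caller is expected to drive
WebPParseHeaders(); this mirrors DecodeInto() below, and the error handling is
illustrative only:

  WebPHeaderStructure headers;
  memset(&headers, 0, sizeof(headers));
  headers.data = data;                 // start of the WebP payload
  headers.data_size = data_size;
  if (WebPParseHeaders(&headers) == VP8_STATUS_OK) {
    // headers.offset now points at the VP8/VP8L bitstream,
    // headers.is_lossless selects the VP8 vs VP8L decoder, and
    // headers.alpha_data / headers.alpha_data_size describe any ALPH chunk.
  }
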
+//------------------------------------------------------------------------------
 // WebPDecParams
 
 void WebPResetDecParams(WebPDecParams* const params) {
@@ -72,41 +379,76 @@
   }
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // "Into" decoding variants
 
 // Main flow
-static VP8StatusCode DecodeInto(const uint8_t* data, uint32_t data_size,
+static VP8StatusCode DecodeInto(const uint8_t* const data, size_t data_size,
                                 WebPDecParams* const params) {
-  VP8Decoder* dec = VP8New();
-  VP8StatusCode status = VP8_STATUS_OK;
+  VP8StatusCode status;
   VP8Io io;
+  WebPHeaderStructure headers;
 
-  assert(params);
-  if (dec == NULL) {
-    return VP8_STATUS_INVALID_PARAM;
+  headers.data = data;
+  headers.data_size = data_size;
+  status = WebPParseHeaders(&headers);   // Process Pre-VP8 chunks.
+  if (status != VP8_STATUS_OK) {
+    return status;
   }
 
+  assert(params != NULL);
   VP8InitIo(&io);
-  io.data = data;
-  io.data_size = data_size;
+  io.data = headers.data + headers.offset;
+  io.data_size = headers.data_size - headers.offset;
   WebPInitCustomIo(params, &io);  // Plug the I/O functions.
 
-  // Decode bitstream header, update io->width/io->height.
-  if (!VP8GetHeaders(dec, &io)) {
-    status = VP8_STATUS_BITSTREAM_ERROR;
-  } else {
-    // Allocate/check output buffers.
-    status = WebPAllocateDecBuffer(io.width, io.height, params->options,
-                                   params->output);
-    if (status == VP8_STATUS_OK) {
-      // Decode
-      if (!VP8Decode(dec, &io)) {
-        status = dec->status_;
+  if (!headers.is_lossless) {
+    VP8Decoder* const dec = VP8New();
+    if (dec == NULL) {
+      return VP8_STATUS_OUT_OF_MEMORY;
+    }
+#ifdef WEBP_USE_THREAD
+    dec->use_threads_ = params->options && (params->options->use_threads > 0);
+#else
+    dec->use_threads_ = 0;
+#endif
+    dec->alpha_data_ = headers.alpha_data;
+    dec->alpha_data_size_ = headers.alpha_data_size;
+
+    // Decode bitstream header, update io->width/io->height.
+    if (!VP8GetHeaders(dec, &io)) {
+      status = dec->status_;   // An error occurred. Grab error status.
+    } else {
+      // Allocate/check output buffers.
+      status = WebPAllocateDecBuffer(io.width, io.height, params->options,
+                                     params->output);
+      if (status == VP8_STATUS_OK) {  // Decode
+        if (!VP8Decode(dec, &io)) {
+          status = dec->status_;
+        }
       }
     }
+    VP8Delete(dec);
+  } else {
+    VP8LDecoder* const dec = VP8LNew();
+    if (dec == NULL) {
+      return VP8_STATUS_OUT_OF_MEMORY;
+    }
+    if (!VP8LDecodeHeader(dec, &io)) {
+      status = dec->status_;   // An error occurred. Grab error status.
+    } else {
+      // Allocate/check output buffers.
+      status = WebPAllocateDecBuffer(io.width, io.height, params->options,
+                                     params->output);
+      if (status == VP8_STATUS_OK) {  // Decode
+        if (!VP8LDecodeImage(dec)) {
+          status = dec->status_;
+        }
+      }
+    }
+    VP8LDelete(dec);
   }
-  VP8Delete(dec);
+
   if (status != VP8_STATUS_OK) {
     WebPFreeDecBuffer(params->output);
   }
@@ -115,8 +457,10 @@
 
 // Helpers
 static uint8_t* DecodeIntoRGBABuffer(WEBP_CSP_MODE colorspace,
-                                     const uint8_t* data, uint32_t data_size,
-                                     uint8_t* rgba, int stride, int size) {
+                                     const uint8_t* const data,
+                                     size_t data_size,
+                                     uint8_t* const rgba,
+                                     int stride, size_t size) {
   WebPDecParams params;
   WebPDecBuffer buf;
   if (rgba == NULL) {
@@ -136,35 +480,35 @@
   return rgba;
 }
 
-uint8_t* WebPDecodeRGBInto(const uint8_t* data, uint32_t data_size,
-                           uint8_t* output, int size, int stride) {
+uint8_t* WebPDecodeRGBInto(const uint8_t* data, size_t data_size,
+                           uint8_t* output, size_t size, int stride) {
   return DecodeIntoRGBABuffer(MODE_RGB, data, data_size, output, stride, size);
 }
 
-uint8_t* WebPDecodeRGBAInto(const uint8_t* data, uint32_t data_size,
-                            uint8_t* output, int size, int stride) {
+uint8_t* WebPDecodeRGBAInto(const uint8_t* data, size_t data_size,
+                            uint8_t* output, size_t size, int stride) {
   return DecodeIntoRGBABuffer(MODE_RGBA, data, data_size, output, stride, size);
 }
 
-uint8_t* WebPDecodeARGBInto(const uint8_t* data, uint32_t data_size,
-                            uint8_t* output, int size, int stride) {
+uint8_t* WebPDecodeARGBInto(const uint8_t* data, size_t data_size,
+                            uint8_t* output, size_t size, int stride) {
   return DecodeIntoRGBABuffer(MODE_ARGB, data, data_size, output, stride, size);
 }
 
-uint8_t* WebPDecodeBGRInto(const uint8_t* data, uint32_t data_size,
-                           uint8_t* output, int size, int stride) {
+uint8_t* WebPDecodeBGRInto(const uint8_t* data, size_t data_size,
+                           uint8_t* output, size_t size, int stride) {
   return DecodeIntoRGBABuffer(MODE_BGR, data, data_size, output, stride, size);
 }
 
-uint8_t* WebPDecodeBGRAInto(const uint8_t* data, uint32_t data_size,
-                            uint8_t* output, int size, int stride) {
+uint8_t* WebPDecodeBGRAInto(const uint8_t* data, size_t data_size,
+                            uint8_t* output, size_t size, int stride) {
   return DecodeIntoRGBABuffer(MODE_BGRA, data, data_size, output, stride, size);
 }
 
-uint8_t* WebPDecodeYUVInto(const uint8_t* data, uint32_t data_size,
-                           uint8_t* luma, int luma_size, int luma_stride,
-                           uint8_t* u, int u_size, int u_stride,
-                           uint8_t* v, int v_size, int v_stride) {
+uint8_t* WebPDecodeYUVInto(const uint8_t* data, size_t data_size,
+                           uint8_t* luma, size_t luma_size, int luma_stride,
+                           uint8_t* u, size_t u_size, int u_stride,
+                           uint8_t* v, size_t v_size, int v_stride) {
   WebPDecParams params;
   WebPDecBuffer output;
   if (luma == NULL) return NULL;
@@ -188,11 +532,11 @@
   return luma;
 }
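
For context, an illustrative use of the pre-allocated-buffer ('Into') variants
whose size arguments are widened to size_t above; this sketch assumes
<stdlib.h> and webp/decode.h are included:

  // Decodes 'data' into a caller-owned RGBA buffer (4 bytes per pixel).
  static uint8_t* DecodeToRGBA(const uint8_t* data, size_t data_size) {
    int width, height;
    if (!WebPGetInfo(data, data_size, &width, &height)) return NULL;
    {
      const int stride = width * 4;
      const size_t size = (size_t)stride * height;
      uint8_t* const rgba = (uint8_t*)malloc(size);
      if (rgba == NULL) return NULL;
      if (WebPDecodeRGBAInto(data, data_size, rgba, size, stride) == NULL) {
        free(rgba);        // bitstream error or buffer too small
        return NULL;
      }
      return rgba;         // caller takes ownership
    }
  }
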
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 
-static uint8_t* Decode(WEBP_CSP_MODE mode, const uint8_t* data,
-                       uint32_t data_size, int* width, int* height,
-                       WebPDecBuffer* keep_info) {
+static uint8_t* Decode(WEBP_CSP_MODE mode, const uint8_t* const data,
+                       size_t data_size, int* const width, int* const height,
+                       WebPDecBuffer* const keep_info) {
   WebPDecParams params;
   WebPDecBuffer output;
 
@@ -205,53 +549,53 @@
   if (!WebPGetInfo(data, data_size, &output.width, &output.height)) {
     return NULL;
   }
-  if (width) *width = output.width;
-  if (height) *height = output.height;
+  if (width != NULL) *width = output.width;
+  if (height != NULL) *height = output.height;
 
   // Decode
   if (DecodeInto(data, data_size, &params) != VP8_STATUS_OK) {
     return NULL;
   }
-  if (keep_info) {    // keep track of the side-info
+  if (keep_info != NULL) {    // keep track of the side-info
     WebPCopyDecBuffer(&output, keep_info);
   }
   // return decoded samples (don't clear 'output'!)
-  return (mode >= MODE_YUV) ? output.u.YUVA.y : output.u.RGBA.rgba;
+  return WebPIsRGBMode(mode) ? output.u.RGBA.rgba : output.u.YUVA.y;
 }
 
-uint8_t* WebPDecodeRGB(const uint8_t* data, uint32_t data_size,
+uint8_t* WebPDecodeRGB(const uint8_t* data, size_t data_size,
                        int* width, int* height) {
   return Decode(MODE_RGB, data, data_size, width, height, NULL);
 }
 
-uint8_t* WebPDecodeRGBA(const uint8_t* data, uint32_t data_size,
+uint8_t* WebPDecodeRGBA(const uint8_t* data, size_t data_size,
                         int* width, int* height) {
   return Decode(MODE_RGBA, data, data_size, width, height, NULL);
 }
 
-uint8_t* WebPDecodeARGB(const uint8_t* data, uint32_t data_size,
+uint8_t* WebPDecodeARGB(const uint8_t* data, size_t data_size,
                         int* width, int* height) {
   return Decode(MODE_ARGB, data, data_size, width, height, NULL);
 }
 
-uint8_t* WebPDecodeBGR(const uint8_t* data, uint32_t data_size,
+uint8_t* WebPDecodeBGR(const uint8_t* data, size_t data_size,
                        int* width, int* height) {
   return Decode(MODE_BGR, data, data_size, width, height, NULL);
 }
 
-uint8_t* WebPDecodeBGRA(const uint8_t* data, uint32_t data_size,
+uint8_t* WebPDecodeBGRA(const uint8_t* data, size_t data_size,
                         int* width, int* height) {
   return Decode(MODE_BGRA, data, data_size, width, height, NULL);
 }
 
-uint8_t* WebPDecodeYUV(const uint8_t* data, uint32_t data_size,
+uint8_t* WebPDecodeYUV(const uint8_t* data, size_t data_size,
                        int* width, int* height, uint8_t** u, uint8_t** v,
                        int* stride, int* uv_stride) {
   WebPDecBuffer output;   // only to preserve the side-infos
   uint8_t* const out = Decode(MODE_YUV, data, data_size,
                               width, height, &output);
 
-  if (out) {
+  if (out != NULL) {
     const WebPYUVABuffer* const buf = &output.u.YUVA;
     *u = buf->u;
     *v = buf->v;
@@ -262,52 +606,52 @@
   return out;
 }
 
-//-----------------------------------------------------------------------------
-// WebPGetInfo()
-
-int WebPGetInfo(const uint8_t* data, uint32_t data_size,
-                int* width, int* height) {
-  const uint32_t chunk_size = WebPCheckRIFFHeader(&data, &data_size);
-  if (!chunk_size) {
-    return 0;         // unsupported RIFF header
-  }
-  // Validate raw video data
-  return VP8GetInfo(data, data_size, chunk_size, width, height, NULL);
-}
-
 static void DefaultFeatures(WebPBitstreamFeatures* const features) {
-  assert(features);
+  assert(features != NULL);
   memset(features, 0, sizeof(*features));
   features->bitstream_version = 0;
 }
 
-static VP8StatusCode GetFeatures(const uint8_t** data, uint32_t* data_size,
+static VP8StatusCode GetFeatures(const uint8_t* const data, size_t data_size,
                                  WebPBitstreamFeatures* const features) {
-  uint32_t chunk_size;
-  if (features == NULL) {
+  if (features == NULL || data == NULL) {
     return VP8_STATUS_INVALID_PARAM;
   }
   DefaultFeatures(features);
-  if (data == NULL || *data == NULL || data_size == 0) {
-    return VP8_STATUS_INVALID_PARAM;
-  }
-  chunk_size = WebPCheckRIFFHeader(data, data_size);
-  if (chunk_size == 0) {
-    return VP8_STATUS_BITSTREAM_ERROR;   // unsupported RIFF header
-  }
-  if (!VP8GetInfo(*data, *data_size, chunk_size,
-                  &features->width, &features->height, &features->has_alpha)) {
-    return VP8_STATUS_BITSTREAM_ERROR;
-  }
-  return VP8_STATUS_OK;
+
+  // Only parse enough of the data to retrieve width/height/has_alpha.
+  return ParseHeadersInternal(data, data_size,
+                              &features->width, &features->height,
+                              &features->has_alpha, NULL);
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
+// WebPGetInfo()
+
+int WebPGetInfo(const uint8_t* data, size_t data_size,
+                int* width, int* height) {
+  WebPBitstreamFeatures features;
+
+  if (GetFeatures(data, data_size, &features) != VP8_STATUS_OK) {
+    return 0;
+  }
+
+  if (width != NULL) {
+    *width  = features.width;
+  }
+  if (height != NULL) {
+    *height = features.height;
+  }
+
+  return 1;
+}
+
+//------------------------------------------------------------------------------
 // Advanced decoding API
 
-int WebPInitDecoderConfigInternal(WebPDecoderConfig* const config,
+int WebPInitDecoderConfigInternal(WebPDecoderConfig* config,
                                   int version) {
-  if (version != WEBP_DECODER_ABI_VERSION) {
+  if (WEBP_ABI_IS_INCOMPATIBLE(version, WEBP_DECODER_ABI_VERSION)) {
     return 0;   // version mismatch
   }
   if (config == NULL) {
@@ -319,29 +663,38 @@
   return 1;
 }
 
-VP8StatusCode WebPGetFeaturesInternal(const uint8_t* data, uint32_t data_size,
-                                      WebPBitstreamFeatures* const features,
-                            int version) {
-  if (version != WEBP_DECODER_ABI_VERSION) {
+VP8StatusCode WebPGetFeaturesInternal(const uint8_t* data, size_t data_size,
+                                      WebPBitstreamFeatures* features,
+                                      int version) {
+  VP8StatusCode status;
+  if (WEBP_ABI_IS_INCOMPATIBLE(version, WEBP_DECODER_ABI_VERSION)) {
     return VP8_STATUS_INVALID_PARAM;   // version mismatch
   }
   if (features == NULL) {
     return VP8_STATUS_INVALID_PARAM;
   }
-  return GetFeatures(&data, &data_size, features);
+
+  status = GetFeatures(data, data_size, features);
+  if (status == VP8_STATUS_NOT_ENOUGH_DATA) {
+    return VP8_STATUS_BITSTREAM_ERROR;  // Not-enough-data treated as error.
+  }
+  return status;
 }
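
Callers normally reach WebPGetFeaturesInternal() through the WebPGetFeatures()
wrapper in webp/decode.h, which supplies WEBP_DECODER_ABI_VERSION for them. An
illustrative use, e.g. to pick an output mode based on the alpha flag:

  WebPBitstreamFeatures features;
  if (WebPGetFeatures(data, data_size, &features) == VP8_STATUS_OK) {
    const WEBP_CSP_MODE mode = features.has_alpha ? MODE_RGBA : MODE_RGB;
    // features.width / features.height give the canvas dimensions.
    (void)mode;   // illustrative only
  }
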
 
-VP8StatusCode WebPDecode(const uint8_t* data, uint32_t data_size,
-                         WebPDecoderConfig* const config) {
+VP8StatusCode WebPDecode(const uint8_t* data, size_t data_size,
+                         WebPDecoderConfig* config) {
   WebPDecParams params;
   VP8StatusCode status;
 
-  if (!config) {
+  if (config == NULL) {
     return VP8_STATUS_INVALID_PARAM;
   }
 
-  status = GetFeatures(&data, &data_size, &config->input);
+  status = GetFeatures(data, data_size, &config->input);
   if (status != VP8_STATUS_OK) {
+    if (status == VP8_STATUS_NOT_ENOUGH_DATA) {
+      return VP8_STATUS_BITSTREAM_ERROR;  // Not-enough-data treated as error.
+    }
     return status;
   }
 
@@ -353,6 +706,66 @@
   return status;
 }
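
A minimal sketch of the advanced-API call sequence that ends in WebPDecode()
above; WebPInitDecoderConfig() and WebPFreeDecBuffer() are the public helpers
declared in webp/decode.h:

  WebPDecoderConfig config;
  if (WebPInitDecoderConfig(&config)) {         // checks ABI compatibility
    config.output.colorspace = MODE_BGRA;       // request BGRA samples
    if (WebPDecode(data, data_size, &config) == VP8_STATUS_OK) {
      // config.input holds the parsed features; config.output.u.RGBA holds
      // the decoded pixels (rgba pointer, stride, size).
      WebPFreeDecBuffer(&config.output);
    }
  }
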
 
+//------------------------------------------------------------------------------
+// Cropping and rescaling.
+
+int WebPIoInitFromOptions(const WebPDecoderOptions* const options,
+                          VP8Io* const io, WEBP_CSP_MODE src_colorspace) {
+  const int W = io->width;
+  const int H = io->height;
+  int x = 0, y = 0, w = W, h = H;
+
+  // Cropping
+  io->use_cropping = (options != NULL) && (options->use_cropping > 0);
+  if (io->use_cropping) {
+    w = options->crop_width;
+    h = options->crop_height;
+    x = options->crop_left;
+    y = options->crop_top;
+    if (!WebPIsRGBMode(src_colorspace)) {   // only snap for YUV420 or YUV422
+      x &= ~1;
+      y &= ~1;    // TODO(later): only for YUV420, not YUV422.
+    }
+    if (x < 0 || y < 0 || w <= 0 || h <= 0 || x + w > W || y + h > H) {
+      return 0;  // out of frame boundary error
+    }
+  }
+  io->crop_left   = x;
+  io->crop_top    = y;
+  io->crop_right  = x + w;
+  io->crop_bottom = y + h;
+  io->mb_w = w;
+  io->mb_h = h;
+
+  // Scaling
+  io->use_scaling = (options != NULL) && (options->use_scaling > 0);
+  if (io->use_scaling) {
+    if (options->scaled_width <= 0 || options->scaled_height <= 0) {
+      return 0;
+    }
+    io->scaled_width = options->scaled_width;
+    io->scaled_height = options->scaled_height;
+  }
+
+  // Filter
+  io->bypass_filtering = options && options->bypass_filtering;
+
+  // Fancy upsampler
+#ifdef FANCY_UPSAMPLING
+  io->fancy_upsampling = (options == NULL) || (!options->no_fancy_upsampling);
+#endif
+
+  if (io->use_scaling) {
+    // disable filter (only for large downscaling ratio).
+    io->bypass_filtering = (io->scaled_width < W * 3 / 4) &&
+                           (io->scaled_height < H * 3 / 4);
+    io->fancy_upsampling = 0;
+  }
+  return 1;
+}
+
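
The options consumed by WebPIoInitFromOptions() are typically filled in through
WebPDecoderConfig.options; an illustrative setup with arbitrary values:

  WebPDecoderConfig config;
  if (WebPInitDecoderConfig(&config)) {
    config.options.use_cropping = 1;
    config.options.crop_left   = 10;    // snapped down to even for YUV sources
    config.options.crop_top    = 10;
    config.options.crop_width  = 100;
    config.options.crop_height = 100;
    config.options.use_scaling = 1;
    config.options.scaled_width  = 50;  // a large downscale also bypasses the
    config.options.scaled_height = 50;  // in-loop filter, per the code above
    // ...then call WebPDecode(data, data_size, &config) as usual.
  }
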
+//------------------------------------------------------------------------------
+
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
diff --git a/src/dec/webpi.h b/src/dec/webpi.h
index 6f6a72f..44e5744 100644
--- a/src/dec/webpi.h
+++ b/src/dec/webpi.h
@@ -1,4 +1,4 @@
-// Copyright 2011 Google Inc.
+// Copyright 2011 Google Inc. All Rights Reserved.
 //
 // This code is licensed under the same terms as WebM:
 //  Software License Agreement:  http://www.webmproject.org/license/software/
@@ -9,36 +9,22 @@
 //
 // Author: somnath@google.com (Somnath Banerjee)
 
-#ifndef WEBP_DEC_WEBPI_H
-#define WEBP_DEC_WEBPI_H
+#ifndef WEBP_DEC_WEBPI_H_
+#define WEBP_DEC_WEBPI_H_
 
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 
-#include "webp/decode_vp8.h"
+#include "../utils/rescaler.h"
+#include "./decode_vp8.h"
 
 //------------------------------------------------------------------------------
-// WebPDecParams: Decoding output parameters. Transcient internal object.
+// WebPDecParams: Decoding output parameters. Transient internal object.
 
 typedef struct WebPDecParams WebPDecParams;
 typedef int (*OutputFunc)(const VP8Io* const io, WebPDecParams* const p);
-
-// Structure use for on-the-fly rescaling
-typedef struct {
-  int x_expand;               // true if we're expanding in the x direction
-  int fy_scale, fx_scale;     // fixed-point scaling factor
-  int64_t fxy_scale;          // ''
-  // we need hpel-precise add/sub increments, for the downsampled U/V planes.
-  int y_accum;                // vertical accumulator
-  int y_add, y_sub;           // vertical increments (add ~= src, sub ~= dst)
-  int x_add, x_sub;           // horizontal increments (add ~= src, sub ~= dst)
-  int src_width, src_height;  // source dimensions
-  int dst_width, dst_height;  // destination dimensions
-  uint8_t* dst;
-  int dst_stride;
-  int32_t* irow, *frow;       // work buffer
-} WebPRescaler;
+typedef int (*OutputRowFunc)(WebPDecParams* const p, int y_pos);
 
 struct WebPDecParams {
   WebPDecBuffer* output;             // output buffer.
@@ -49,42 +35,51 @@
   const WebPDecoderOptions* options;  // if not NULL, use alt decoding features
   // rescalers
   WebPRescaler scaler_y, scaler_u, scaler_v, scaler_a;
-  void* memory;               // overall scratch memory for the output work.
-  OutputFunc emit;            // output RGB or YUV samples
-  OutputFunc emit_alpha;      // output alpha channel
+  void* memory;                  // overall scratch memory for the output work.
+
+  OutputFunc emit;               // output RGB or YUV samples
+  OutputFunc emit_alpha;         // output alpha channel
+  OutputRowFunc emit_alpha_row;  // output one line of rescaled alpha values
 };
 
 // Should be called first, before any use of the WebPDecParams object.
 void WebPResetDecParams(WebPDecParams* const params);
 
 //------------------------------------------------------------------------------
-// Upsampler function to overwrite fancy upsampler.
+// Header parsing helpers
 
-typedef void (*WebPUpsampleLinePairFunc)(
-  const uint8_t* top_y, const uint8_t* bottom_y,
-  const uint8_t* top_u, const uint8_t* top_v,
-  const uint8_t* cur_u, const uint8_t* cur_v,
-  uint8_t* top_dst, uint8_t* bottom_dst, int len);
+// Structure storing a description of the RIFF headers.
+typedef struct {
+  const uint8_t* data;         // input buffer
+  size_t data_size;            // input buffer size
+  size_t offset;               // offset to main data chunk (VP8 or VP8L)
+  const uint8_t* alpha_data;   // points to alpha chunk (if present)
+  size_t alpha_data_size;      // alpha chunk size
+  size_t compressed_size;      // VP8/VP8L compressed data size
+  size_t riff_size;            // size of the riff payload (or 0 if absent)
+  int is_lossless;             // true if a VP8L chunk is present
+} WebPHeaderStructure;
 
-// Upsampler functions to be used to convert YUV to RGB(A) modes
-extern WebPUpsampleLinePairFunc WebPUpsamplers[MODE_LAST];
-extern WebPUpsampleLinePairFunc WebPUpsamplersKeepAlpha[MODE_LAST];
-
-// Initializes SSE2 version of the fancy upsamplers.
-void WebPInitUpsamplersSSE2(void);
+// Skips over all valid chunks prior to the first VP8/VP8L frame header.
+// Returns VP8_STATUS_OK on success,
+//         VP8_STATUS_BITSTREAM_ERROR if an invalid header/chunk is found, and
+//         VP8_STATUS_NOT_ENOUGH_DATA in case of insufficient data.
+// In 'headers', the compressed_size, offset, alpha_data, alpha_data_size and
+// is_lossless fields are updated appropriately upon success.
+VP8StatusCode WebPParseHeaders(WebPHeaderStructure* const headers);
 
 //------------------------------------------------------------------------------
 // Misc utils
 
-// If a RIFF container is detected, validate it and skip over it. Returns
-// VP8 bit-stream size if RIFF header is valid else returns 0
-uint32_t WebPCheckRIFFHeader(const uint8_t** data_ptr,
-                             uint32_t* data_size_ptr);
-
 // Initializes VP8Io with custom setup, io and teardown functions. The default
 // hooks will use the supplied 'params' as io->opaque handle.
 void WebPInitCustomIo(WebPDecParams* const params, VP8Io* const io);
 
+// Set up crop_xxx fields, mb_w and mb_h in io. 'src_colorspace' refers
+// to the *compressed* format, not the output one.
+int WebPIoInitFromOptions(const WebPDecoderOptions* const options,
+                          VP8Io* const io, WEBP_CSP_MODE src_colorspace);
+
 //------------------------------------------------------------------------------
 // Internal functions regarding WebPDecBuffer memory (in buffer.c).
 // Don't really need to be externally visible for now.
@@ -108,10 +103,12 @@
 // Copy and transfer ownership from src to dst (beware of parameter order!)
 void WebPGrabDecBuffer(WebPDecBuffer* const src, WebPDecBuffer* const dst);
 
+
+
 //------------------------------------------------------------------------------
 
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
 
-#endif  // WEBP_DEC_WEBPI_H
+#endif  /* WEBP_DEC_WEBPI_H_ */
diff --git a/src/dec/yuv.h b/src/dec/yuv.h
deleted file mode 100644
index 5f16ee6..0000000
--- a/src/dec/yuv.h
+++ /dev/null
@@ -1,133 +0,0 @@
-// Copyright 2010 Google Inc.
-//
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
-// -----------------------------------------------------------------------------
-//
-// inline YUV->RGB conversion function
-//
-// Author: Skal (pascal.massimino@gmail.com)
-
-#ifndef WEBP_DEC_YUV_H_
-#define WEBP_DEC_YUV_H_
-
-#include "webp/decode_vp8.h"
-
-/*
- * Define ANDROID_WEBP_RGB to enable specific optimizations for Android
- * RGBA_4444 & RGB_565 color support.
- *
- */
-
-#define ANDROID_WEBP_RGB
-
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
-enum { YUV_FIX = 16,                // fixed-point precision
-       YUV_RANGE_MIN = -227,        // min value of r/g/b output
-       YUV_RANGE_MAX = 256 + 226    // max value of r/g/b output
-};
-extern int16_t VP8kVToR[256], VP8kUToB[256];
-extern int32_t VP8kVToG[256], VP8kUToG[256];
-extern uint8_t VP8kClip[YUV_RANGE_MAX - YUV_RANGE_MIN];
-extern uint8_t VP8kClip4Bits[YUV_RANGE_MAX - YUV_RANGE_MIN];
-
-static inline void VP8YuvToRgb(uint8_t y, uint8_t u, uint8_t v,
-                               uint8_t* const rgb) {
-  const int r_off = VP8kVToR[v];
-  const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
-  const int b_off = VP8kUToB[u];
-  rgb[0] = VP8kClip[y + r_off - YUV_RANGE_MIN];
-  rgb[1] = VP8kClip[y + g_off - YUV_RANGE_MIN];
-  rgb[2] = VP8kClip[y + b_off - YUV_RANGE_MIN];
-}
-
-static inline void VP8YuvToRgb565(uint8_t y, uint8_t u, uint8_t v,
-                                  uint8_t* const rgb) {
-  const int r_off = VP8kVToR[v];
-  const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
-  const int b_off = VP8kUToB[u];
-#ifdef ANDROID_WEBP_RGB
-  rgb[1] = ((VP8kClip[y + r_off - YUV_RANGE_MIN] & 0xf8) |
-            (VP8kClip[y + g_off - YUV_RANGE_MIN] >> 5));
-  rgb[0] = (((VP8kClip[y + g_off - YUV_RANGE_MIN] << 3) & 0xe0) |
-            (VP8kClip[y + b_off - YUV_RANGE_MIN] >> 3));
-#else
-  rgb[0] = ((VP8kClip[y + r_off - YUV_RANGE_MIN] & 0xf8) |
-            (VP8kClip[y + g_off - YUV_RANGE_MIN] >> 5));
-  rgb[1] = (((VP8kClip[y + g_off - YUV_RANGE_MIN] << 3) & 0xe0) |
-            (VP8kClip[y + b_off - YUV_RANGE_MIN] >> 3));
-#endif
-}
-
-static inline void VP8YuvToArgbKeepA(uint8_t y, uint8_t u, uint8_t v,
-                                     uint8_t* const argb) {
-  // Don't update Aplha (argb[0])
-  VP8YuvToRgb(y, u, v, argb + 1);
-}
-
-static inline void VP8YuvToArgb(uint8_t y, uint8_t u, uint8_t v,
-                                uint8_t* const argb) {
-  argb[0] = 0xff;
-  VP8YuvToArgbKeepA(y, u, v, argb);
-}
-
-static inline void VP8YuvToRgba4444KeepA(uint8_t y, uint8_t u, uint8_t v,
-                                         uint8_t* const argb) {
-  const int r_off = VP8kVToR[v];
-  const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
-  const int b_off = VP8kUToB[u];
-#ifdef ANDROID_WEBP_RGB
-  argb[1] = ((VP8kClip4Bits[y + r_off - YUV_RANGE_MIN] << 4) |
-             VP8kClip4Bits[y + g_off - YUV_RANGE_MIN]);
-  argb[0] = (argb[0] & 0x0f) | (VP8kClip4Bits[y + b_off - YUV_RANGE_MIN] << 4);
-#else
-  argb[0] = ((VP8kClip4Bits[y + r_off - YUV_RANGE_MIN] << 4) |
-             VP8kClip4Bits[y + g_off - YUV_RANGE_MIN]);
-  argb[1] = (argb[1] & 0x0f) | (VP8kClip4Bits[y + b_off - YUV_RANGE_MIN] << 4);
-#endif
-}
-
-static inline void VP8YuvToRgba4444(uint8_t y, uint8_t u, uint8_t v,
-                                    uint8_t* const argb) {
-#ifdef ANDROID_WEBP_RGB
-  argb[0] = 0x0f;
-#else
-  argb[1] = 0x0f;
-#endif
-  VP8YuvToRgba4444KeepA(y, u, v, argb);
-}
-
-static inline void VP8YuvToBgr(uint8_t y, uint8_t u, uint8_t v,
-                               uint8_t* const bgr) {
-  const int r_off = VP8kVToR[v];
-  const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
-  const int b_off = VP8kUToB[u];
-  bgr[0] = VP8kClip[y + b_off - YUV_RANGE_MIN];
-  bgr[1] = VP8kClip[y + g_off - YUV_RANGE_MIN];
-  bgr[2] = VP8kClip[y + r_off - YUV_RANGE_MIN];
-}
-
-static inline void VP8YuvToBgra(uint8_t y, uint8_t u, uint8_t v,
-                                uint8_t* const bgra) {
-  VP8YuvToBgr(y, u, v, bgra);
-  bgra[3] = 0xff;
-}
-
-static inline void VP8YuvToRgba(uint8_t y, uint8_t u, uint8_t v,
-                                uint8_t* const rgba) {
-  VP8YuvToRgb(y, u, v, rgba);
-  rgba[3] = 0xff;
-}
-
-// Must be called before everything, to initialize the tables.
-void VP8YUVInit(void);
-
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
-
-#endif  // WEBP_DEC_YUV_H_
diff --git a/src/dsp/cpu.c b/src/dsp/cpu.c
new file mode 100644
index 0000000..10e0936
--- /dev/null
+++ b/src/dsp/cpu.c
@@ -0,0 +1,86 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// CPU detection
+//
+// Author: Christian Duvivier (cduvivier@google.com)
+
+#include "./dsp.h"
+
+//#if defined(__ANDROID__)
+//#include <cpu-features.h>
+//#endif
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+//------------------------------------------------------------------------------
+// SSE2 detection.
+//
+
+// apple/darwin gcc-4.0.1 defines __PIC__, but not __pic__ with -fPIC.
+#if (defined(__pic__) || defined(__PIC__)) && defined(__i386__)
+static WEBP_INLINE void GetCPUInfo(int cpu_info[4], int info_type) {
+  __asm__ volatile (
+    "mov %%ebx, %%edi\n"
+    "cpuid\n"
+    "xchg %%edi, %%ebx\n"
+    : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
+    : "a"(info_type));
+}
+#elif defined(__i386__) || defined(__x86_64__)
+static WEBP_INLINE void GetCPUInfo(int cpu_info[4], int info_type) {
+  __asm__ volatile (
+    "cpuid\n"
+    : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
+    : "a"(info_type));
+}
+#elif defined(WEBP_MSC_SSE2)
+#define GetCPUInfo __cpuid
+#endif
+
+#if defined(__i386__) || defined(__x86_64__) || defined(WEBP_MSC_SSE2)
+static int x86CPUInfo(CPUFeature feature) {
+  int cpu_info[4];
+  GetCPUInfo(cpu_info, 1);
+  if (feature == kSSE2) {
+    return 0 != (cpu_info[3] & 0x04000000);
+  }
+  if (feature == kSSE3) {
+    return 0 != (cpu_info[2] & 0x00000001);
+  }
+  return 0;
+}
+VP8CPUInfo VP8GetCPUInfo = x86CPUInfo;
+#elif defined(WEBP_ANDROID_NEON)
+static int AndroidCPUInfo(CPUFeature feature) {
+//  const AndroidCpuFamily cpu_family = android_getCpuFamily();
+//  const uint64_t cpu_features = android_getCpuFeatures();
+//  if (feature == kNEON) {
+//    return (cpu_family == ANDROID_CPU_FAMILY_ARM &&
+//            0 != (cpu_features & ANDROID_CPU_ARM_FEATURE_NEON));
+//  }
+//  return 0;
+  return 1;
+}
+VP8CPUInfo VP8GetCPUInfo = AndroidCPUInfo;
+#elif defined(__ARM_NEON__)
+// Define a dummy function to enable turning off NEON at runtime by setting
+// VP8GetCPUInfo = NULL.
+static int armCPUInfo(CPUFeature feature) {
+  (void)feature;
+  return 1;
+}
+VP8CPUInfo VP8GetCPUInfo = armCPUInfo;
+#else
+VP8CPUInfo VP8GetCPUInfo = NULL;
+#endif
+
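
As the comment above suggests, VP8GetCPUInfo is a plain function pointer, so
runtime SIMD detection can be disabled before the dsp initialization runs; an
illustrative use:

  // Force the plain-C code paths: clear the hook before VP8DspInit() is called.
  VP8GetCPUInfo = NULL;   // VP8DspInit() then skips the kSSE2/kNEON probes
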
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/src/dec/dsp.c b/src/dsp/dec.c
similarity index 73%
rename from src/dec/dsp.c
rename to src/dsp/dec.c
index 0dea42a..9ae7b6f 100644
--- a/src/dec/dsp.c
+++ b/src/dsp/dec.c
@@ -1,21 +1,22 @@
-// Copyright 2010 Google Inc.
+// Copyright 2010 Google Inc. All Rights Reserved.
 //
 // This code is licensed under the same terms as WebM:
 //  Software License Agreement:  http://www.webmproject.org/license/software/
 //  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
-// speed-critical functions.
+// Speed-critical decoding functions.
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include "vp8i.h"
+#include "./dsp.h"
+#include "../dec/vp8i.h"
 
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // run-time tables (~4k)
 
 static uint8_t abs0[255 + 255 + 1];     // abs(i)
@@ -28,7 +29,7 @@
 // and make sure it's set to true _last_ (so as to be thread-safe)
 static volatile int tables_ok = 0;
 
-void VP8DspInitTables(void) {
+static void DspInitTables(void) {
   if (!tables_ok) {
     int i;
     for (i = -255; i <= 255; ++i) {
@@ -48,11 +49,11 @@
   }
 }
 
-static inline uint8_t clip_8b(int v) {
+static WEBP_INLINE uint8_t clip_8b(int v) {
   return (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255;
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Transforms (Paragraph 14.4)
 
 #define STORE(x, y, v) \
@@ -133,13 +134,7 @@
 
 #undef STORE
 
-// default C implementations:
-VP8Idct2 VP8Transform = TransformTwo;
-VP8Idct VP8TransformUV = TransformUV;
-VP8Idct VP8TransformDC = TransformDC;
-VP8Idct VP8TransformDCUV = TransformDCUV;
-
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Paragraph 14.3
 
 static void TransformWHT(const int16_t* in, int16_t* out) {
@@ -171,12 +166,12 @@
 
 void (*VP8TransformWHT)(const int16_t* in, int16_t* out) = TransformWHT;
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Intra predictions
 
-#define OUT(x, y) dst[(x) + (y) * BPS]
+#define DST(x, y) dst[(x) + (y) * BPS]
 
-static inline void TrueMotion(uint8_t *dst, int size) {
+static WEBP_INLINE void TrueMotion(uint8_t *dst, int size) {
   const uint8_t* top = dst - BPS;
   const uint8_t* const clip0 = clip1 + 255 - top[-1];
   int y;
@@ -193,7 +188,7 @@
 static void TM8uv(uint8_t *dst) { TrueMotion(dst, 8); }
 static void TM16(uint8_t *dst)  { TrueMotion(dst, 16); }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // 16x16
 
 static void VE16(uint8_t *dst) {     // vertical
@@ -211,7 +206,7 @@
   }
 }
 
-static inline void Put16(int v, uint8_t* dst) {
+static WEBP_INLINE void Put16(int v, uint8_t* dst) {
   int j;
   for (j = 0; j < 16; ++j) {
     memset(dst + j * BPS, v, 16);
@@ -249,7 +244,7 @@
   Put16(0x80, dst);
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // 4x4
 
 #define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
@@ -299,13 +294,13 @@
   const int B = dst[1 - BPS];
   const int C = dst[2 - BPS];
   const int D = dst[3 - BPS];
-  OUT(0, 3)                                     = AVG3(J, K, L);
-  OUT(0, 2) = OUT(1, 3)                         = AVG3(I, J, K);
-  OUT(0, 1) = OUT(1, 2) = OUT(2, 3)             = AVG3(X, I, J);
-  OUT(0, 0) = OUT(1, 1) = OUT(2, 2) = OUT(3, 3) = AVG3(A, X, I);
-  OUT(1, 0) = OUT(2, 1) = OUT(3, 2)             = AVG3(B, A, X);
-  OUT(2, 0) = OUT(3, 1)                         = AVG3(C, B, A);
-  OUT(3, 0)                                     = AVG3(D, C, B);
+  DST(0, 3)                                     = AVG3(J, K, L);
+  DST(0, 2) = DST(1, 3)                         = AVG3(I, J, K);
+  DST(0, 1) = DST(1, 2) = DST(2, 3)             = AVG3(X, I, J);
+  DST(0, 0) = DST(1, 1) = DST(2, 2) = DST(3, 3) = AVG3(A, X, I);
+  DST(1, 0) = DST(2, 1) = DST(3, 2)             = AVG3(B, A, X);
+  DST(2, 0) = DST(3, 1)                         = AVG3(C, B, A);
+  DST(3, 0)                                     = AVG3(D, C, B);
 }
 
 static void LD4(uint8_t *dst) {   // Down-Left
@@ -317,13 +312,13 @@
   const int F = dst[5 - BPS];
   const int G = dst[6 - BPS];
   const int H = dst[7 - BPS];
-  OUT(0, 0)                                     = AVG3(A, B, C);
-  OUT(1, 0) = OUT(0, 1)                         = AVG3(B, C, D);
-  OUT(2, 0) = OUT(1, 1) = OUT(0, 2)             = AVG3(C, D, E);
-  OUT(3, 0) = OUT(2, 1) = OUT(1, 2) = OUT(0, 3) = AVG3(D, E, F);
-  OUT(3, 1) = OUT(2, 2) = OUT(1, 3)             = AVG3(E, F, G);
-  OUT(3, 2) = OUT(2, 3)                         = AVG3(F, G, H);
-  OUT(3, 3)                                     = AVG3(G, H, H);
+  DST(0, 0)                                     = AVG3(A, B, C);
+  DST(1, 0) = DST(0, 1)                         = AVG3(B, C, D);
+  DST(2, 0) = DST(1, 1) = DST(0, 2)             = AVG3(C, D, E);
+  DST(3, 0) = DST(2, 1) = DST(1, 2) = DST(0, 3) = AVG3(D, E, F);
+  DST(3, 1) = DST(2, 2) = DST(1, 3)             = AVG3(E, F, G);
+  DST(3, 2) = DST(2, 3)                         = AVG3(F, G, H);
+  DST(3, 3)                                     = AVG3(G, H, H);
 }
 
 static void VR4(uint8_t *dst) {   // Vertical-Right
@@ -335,17 +330,17 @@
   const int B = dst[1 - BPS];
   const int C = dst[2 - BPS];
   const int D = dst[3 - BPS];
-  OUT(0, 0) = OUT(1, 2) = AVG2(X, A);
-  OUT(1, 0) = OUT(2, 2) = AVG2(A, B);
-  OUT(2, 0) = OUT(3, 2) = AVG2(B, C);
-  OUT(3, 0)             = AVG2(C, D);
+  DST(0, 0) = DST(1, 2) = AVG2(X, A);
+  DST(1, 0) = DST(2, 2) = AVG2(A, B);
+  DST(2, 0) = DST(3, 2) = AVG2(B, C);
+  DST(3, 0)             = AVG2(C, D);
 
-  OUT(0, 3) =             AVG3(K, J, I);
-  OUT(0, 2) =             AVG3(J, I, X);
-  OUT(0, 1) = OUT(1, 3) = AVG3(I, X, A);
-  OUT(1, 1) = OUT(2, 3) = AVG3(X, A, B);
-  OUT(2, 1) = OUT(3, 3) = AVG3(A, B, C);
-  OUT(3, 1) =             AVG3(B, C, D);
+  DST(0, 3) =             AVG3(K, J, I);
+  DST(0, 2) =             AVG3(J, I, X);
+  DST(0, 1) = DST(1, 3) = AVG3(I, X, A);
+  DST(1, 1) = DST(2, 3) = AVG3(X, A, B);
+  DST(2, 1) = DST(3, 3) = AVG3(A, B, C);
+  DST(3, 1) =             AVG3(B, C, D);
 }
 
 static void VL4(uint8_t *dst) {   // Vertical-Left
@@ -357,17 +352,17 @@
   const int F = dst[5 - BPS];
   const int G = dst[6 - BPS];
   const int H = dst[7 - BPS];
-  OUT(0, 0) =             AVG2(A, B);
-  OUT(1, 0) = OUT(0, 2) = AVG2(B, C);
-  OUT(2, 0) = OUT(1, 2) = AVG2(C, D);
-  OUT(3, 0) = OUT(2, 2) = AVG2(D, E);
+  DST(0, 0) =             AVG2(A, B);
+  DST(1, 0) = DST(0, 2) = AVG2(B, C);
+  DST(2, 0) = DST(1, 2) = AVG2(C, D);
+  DST(3, 0) = DST(2, 2) = AVG2(D, E);
 
-  OUT(0, 1) =             AVG3(A, B, C);
-  OUT(1, 1) = OUT(0, 3) = AVG3(B, C, D);
-  OUT(2, 1) = OUT(1, 3) = AVG3(C, D, E);
-  OUT(3, 1) = OUT(2, 3) = AVG3(D, E, F);
-              OUT(3, 2) = AVG3(E, F, G);
-              OUT(3, 3) = AVG3(F, G, H);
+  DST(0, 1) =             AVG3(A, B, C);
+  DST(1, 1) = DST(0, 3) = AVG3(B, C, D);
+  DST(2, 1) = DST(1, 3) = AVG3(C, D, E);
+  DST(3, 1) = DST(2, 3) = AVG3(D, E, F);
+              DST(3, 2) = AVG3(E, F, G);
+              DST(3, 3) = AVG3(F, G, H);
 }
 
 static void HU4(uint8_t *dst) {   // Horizontal-Up
@@ -375,14 +370,14 @@
   const int J = dst[-1 + 1 * BPS];
   const int K = dst[-1 + 2 * BPS];
   const int L = dst[-1 + 3 * BPS];
-  OUT(0, 0) =             AVG2(I, J);
-  OUT(2, 0) = OUT(0, 1) = AVG2(J, K);
-  OUT(2, 1) = OUT(0, 2) = AVG2(K, L);
-  OUT(1, 0) =             AVG3(I, J, K);
-  OUT(3, 0) = OUT(1, 1) = AVG3(J, K, L);
-  OUT(3, 1) = OUT(1, 2) = AVG3(K, L, L);
-  OUT(3, 2) = OUT(2, 2) =
-    OUT(0, 3) = OUT(1, 3) = OUT(2, 3) = OUT(3, 3) = L;
+  DST(0, 0) =             AVG2(I, J);
+  DST(2, 0) = DST(0, 1) = AVG2(J, K);
+  DST(2, 1) = DST(0, 2) = AVG2(K, L);
+  DST(1, 0) =             AVG3(I, J, K);
+  DST(3, 0) = DST(1, 1) = AVG3(J, K, L);
+  DST(3, 1) = DST(1, 2) = AVG3(K, L, L);
+  DST(3, 2) = DST(2, 2) =
+    DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
 }
 
 static void HD4(uint8_t *dst) {  // Horizontal-Down
@@ -395,23 +390,24 @@
   const int B = dst[1 - BPS];
   const int C = dst[2 - BPS];
 
-  OUT(0, 0) = OUT(2, 1) = AVG2(I, X);
-  OUT(0, 1) = OUT(2, 2) = AVG2(J, I);
-  OUT(0, 2) = OUT(2, 3) = AVG2(K, J);
-  OUT(0, 3)             = AVG2(L, K);
+  DST(0, 0) = DST(2, 1) = AVG2(I, X);
+  DST(0, 1) = DST(2, 2) = AVG2(J, I);
+  DST(0, 2) = DST(2, 3) = AVG2(K, J);
+  DST(0, 3)             = AVG2(L, K);
 
-  OUT(3, 0)             = AVG3(A, B, C);
-  OUT(2, 0)             = AVG3(X, A, B);
-  OUT(1, 0) = OUT(3, 1) = AVG3(I, X, A);
-  OUT(1, 1) = OUT(3, 2) = AVG3(J, I, X);
-  OUT(1, 2) = OUT(3, 3) = AVG3(K, J, I);
-  OUT(1, 3)             = AVG3(L, K, J);
+  DST(3, 0)             = AVG3(A, B, C);
+  DST(2, 0)             = AVG3(X, A, B);
+  DST(1, 0) = DST(3, 1) = AVG3(I, X, A);
+  DST(1, 1) = DST(3, 2) = AVG3(J, I, X);
+  DST(1, 2) = DST(3, 3) = AVG3(K, J, I);
+  DST(1, 3)             = AVG3(L, K, J);
 }
 
+#undef DST
 #undef AVG3
 #undef AVG2
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Chroma
 
 static void VE8uv(uint8_t *dst) {    // vertical
@@ -430,7 +426,7 @@
 }
 
 // helper for chroma-DC predictions
-static inline void Put8x8uv(uint64_t v, uint8_t* dst) {
+static WEBP_INLINE void Put8x8uv(uint64_t v, uint8_t* dst) {
   int j;
   for (j = 0; j < 8; ++j) {
     *(uint64_t*)(dst + j * BPS) = v;
@@ -468,28 +464,28 @@
   Put8x8uv(0x8080808080808080ULL, dst);
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // default C implementations
 
-VP8PredFunc VP8PredLuma4[NUM_BMODES] = {
+const VP8PredFunc VP8PredLuma4[NUM_BMODES] = {
   DC4, TM4, VE4, HE4, RD4, VR4, LD4, VL4, HD4, HU4
 };
 
-VP8PredFunc VP8PredLuma16[NUM_B_DC_MODES] = {
+const VP8PredFunc VP8PredLuma16[NUM_B_DC_MODES] = {
   DC16, TM16, VE16, HE16,
   DC16NoTop, DC16NoLeft, DC16NoTopLeft
 };
 
-VP8PredFunc VP8PredChroma8[NUM_B_DC_MODES] = {
+const VP8PredFunc VP8PredChroma8[NUM_B_DC_MODES] = {
   DC8uv, TM8uv, VE8uv, HE8uv,
   DC8uvNoTop, DC8uvNoLeft, DC8uvNoTopLeft
 };
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Edge filtering functions
 
 // 4 pixels in, 2 pixels out
-static inline void do_filter2(uint8_t* p, int step) {
+static WEBP_INLINE void do_filter2(uint8_t* p, int step) {
   const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
   const int a = 3 * (q0 - p0) + sclip1[1020 + p1 - q1];
   const int a1 = sclip2[112 + ((a + 4) >> 3)];
@@ -499,7 +495,7 @@
 }
 
 // 4 pixels in, 4 pixels out
-static inline void do_filter4(uint8_t* p, int step) {
+static WEBP_INLINE void do_filter4(uint8_t* p, int step) {
   const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
   const int a = 3 * (q0 - p0);
   const int a1 = sclip2[112 + ((a + 4) >> 3)];
@@ -512,7 +508,7 @@
 }
 
 // 6 pixels in, 6 pixels out
-static inline void do_filter6(uint8_t* p, int step) {
+static WEBP_INLINE void do_filter6(uint8_t* p, int step) {
   const int p2 = p[-3*step], p1 = p[-2*step], p0 = p[-step];
   const int q0 = p[0], q1 = p[step], q2 = p[2*step];
   const int a = sclip1[1020 + 3 * (q0 - p0) + sclip1[1020 + p1 - q1]];
@@ -527,17 +523,18 @@
   p[ 2*step] = clip1[255 + q2 - a3];
 }
 
-static inline int hev(const uint8_t* p, int step, int thresh) {
+static WEBP_INLINE int hev(const uint8_t* p, int step, int thresh) {
   const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
   return (abs0[255 + p1 - p0] > thresh) || (abs0[255 + q1 - q0] > thresh);
 }
 
-static inline int needs_filter(const uint8_t* p, int step, int thresh) {
+static WEBP_INLINE int needs_filter(const uint8_t* p, int step, int thresh) {
   const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
   return (2 * abs0[255 + p0 - q0] + abs1[255 + p1 - q1]) <= thresh;
 }
 
-static inline int needs_filter2(const uint8_t* p, int step, int t, int it) {
+static WEBP_INLINE int needs_filter2(const uint8_t* p,
+                                     int step, int t, int it) {
   const int p3 = p[-4*step], p2 = p[-3*step], p1 = p[-2*step], p0 = p[-step];
   const int q0 = p[0], q1 = p[step], q2 = p[2*step], q3 = p[3*step];
   if ((2 * abs0[255 + p0 - q0] + abs1[255 + p1 - q1]) > t)
@@ -547,7 +544,7 @@
          abs0[255 + q2 - q1] <= it && abs0[255 + q1 - q0] <= it;
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Simple In-loop filtering (Paragraph 15.2)
 
 static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
@@ -584,11 +581,12 @@
   }
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Complex In-loop filtering (Paragraph 15.3)
 
-static inline void FilterLoop26(uint8_t* p, int hstride, int vstride, int size,
-                                int thresh, int ithresh, int hev_thresh) {
+static WEBP_INLINE void FilterLoop26(uint8_t* p,
+                                     int hstride, int vstride, int size,
+                                     int thresh, int ithresh, int hev_thresh) {
   while (size-- > 0) {
     if (needs_filter2(p, hstride, thresh, ithresh)) {
       if (hev(p, hstride, hev_thresh)) {
@@ -601,8 +599,9 @@
   }
 }
 
-static inline void FilterLoop24(uint8_t* p, int hstride, int vstride, int size,
-                                int thresh, int ithresh, int hev_thresh) {
+static WEBP_INLINE void FilterLoop24(uint8_t* p,
+                                     int hstride, int vstride, int size,
+                                     int thresh, int ithresh, int hev_thresh) {
   while (size-- > 0) {
     if (needs_filter2(p, hstride, thresh, ithresh)) {
       if (hev(p, hstride, hev_thresh)) {
@@ -670,78 +669,61 @@
   FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 
-void (*VP8VFilter16)(uint8_t*, int, int, int, int) = VFilter16;
-void (*VP8HFilter16)(uint8_t*, int, int, int, int) = HFilter16;
-void (*VP8VFilter8)(uint8_t*, uint8_t*, int, int, int, int) = VFilter8;
-void (*VP8HFilter8)(uint8_t*, uint8_t*, int, int, int, int) = HFilter8;
-void (*VP8VFilter16i)(uint8_t*, int, int, int, int) = VFilter16i;
-void (*VP8HFilter16i)(uint8_t*, int, int, int, int) = HFilter16i;
-void (*VP8VFilter8i)(uint8_t*, uint8_t*, int, int, int, int) = VFilter8i;
-void (*VP8HFilter8i)(uint8_t*, uint8_t*, int, int, int, int) = HFilter8i;
+VP8DecIdct2 VP8Transform;
+VP8DecIdct VP8TransformUV;
+VP8DecIdct VP8TransformDC;
+VP8DecIdct VP8TransformDCUV;
 
-void (*VP8SimpleVFilter16)(uint8_t*, int, int) = SimpleVFilter16;
-void (*VP8SimpleHFilter16)(uint8_t*, int, int) = SimpleHFilter16;
-void (*VP8SimpleVFilter16i)(uint8_t*, int, int) = SimpleVFilter16i;
-void (*VP8SimpleHFilter16i)(uint8_t*, int, int) = SimpleHFilter16i;
-
-//-----------------------------------------------------------------------------
-// SSE2 detection.
-//
-
-#if defined(__pic__) && defined(__i386__)
-static inline void GetCPUInfo(int cpu_info[4], int info_type) {
-  __asm__ volatile (
-    "mov %%ebx, %%edi\n"
-    "cpuid\n"
-    "xchg %%edi, %%ebx\n"
-    : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
-    : "a"(info_type));
-}
-#elif defined(__i386__) || defined(__x86_64__)
-static inline void GetCPUInfo(int cpu_info[4], int info_type) {
-  __asm__ volatile (
-    "cpuid\n"
-    : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
-    : "a"(info_type));
-}
-#elif defined(_MSC_VER)  // Visual C++
-#define GetCPUInfo __cpuid
-#endif
-
-#if defined(__i386__) || defined(__x86_64__) || defined(_MSC_VER)
-static int x86CPUInfo(CPUFeature feature) {
-  int cpu_info[4];
-  GetCPUInfo(cpu_info, 1);
-  if (feature == kSSE2) {
-    return 0 != (cpu_info[3] & 0x04000000);
-  }
-  if (feature == kSSE3) {
-    return 0 != (cpu_info[2] & 0x00000001);
-  }
-  return 0;
-}
-VP8CPUInfo VP8DecGetCPUInfo = x86CPUInfo;
-#else
-VP8CPUInfo VP8DecGetCPUInfo = NULL;
-#endif
-
-//-----------------------------------------------------------------------------
+VP8LumaFilterFunc VP8VFilter16;
+VP8LumaFilterFunc VP8HFilter16;
+VP8ChromaFilterFunc VP8VFilter8;
+VP8ChromaFilterFunc VP8HFilter8;
+VP8LumaFilterFunc VP8VFilter16i;
+VP8LumaFilterFunc VP8HFilter16i;
+VP8ChromaFilterFunc VP8VFilter8i;
+VP8ChromaFilterFunc VP8HFilter8i;
+VP8SimpleFilterFunc VP8SimpleVFilter16;
+VP8SimpleFilterFunc VP8SimpleHFilter16;
+VP8SimpleFilterFunc VP8SimpleVFilter16i;
+VP8SimpleFilterFunc VP8SimpleHFilter16i;
 
 extern void VP8DspInitSSE2(void);
+extern void VP8DspInitNEON(void);
 
 void VP8DspInit(void) {
+  DspInitTables();
+
+  VP8Transform = TransformTwo;
+  VP8TransformUV = TransformUV;
+  VP8TransformDC = TransformDC;
+  VP8TransformDCUV = TransformDCUV;
+
+  VP8VFilter16 = VFilter16;
+  VP8HFilter16 = HFilter16;
+  VP8VFilter8 = VFilter8;
+  VP8HFilter8 = HFilter8;
+  VP8VFilter16i = VFilter16i;
+  VP8HFilter16i = HFilter16i;
+  VP8VFilter8i = VFilter8i;
+  VP8HFilter8i = HFilter8i;
+  VP8SimpleVFilter16 = SimpleVFilter16;
+  VP8SimpleHFilter16 = SimpleHFilter16;
+  VP8SimpleVFilter16i = SimpleVFilter16i;
+  VP8SimpleHFilter16i = SimpleHFilter16i;
+
   // If defined, use CPUInfo() to overwrite some pointers with faster versions.
-  if (VP8DecGetCPUInfo) {
-    if (VP8DecGetCPUInfo(kSSE2)) {
-#if defined(__SSE2__) || defined(_MSC_VER)
+  if (VP8GetCPUInfo) {
+#if defined(WEBP_USE_SSE2)
+    if (VP8GetCPUInfo(kSSE2)) {
       VP8DspInitSSE2();
+    }
+#elif defined(WEBP_USE_NEON)
+    if (VP8GetCPUInfo(kNEON)) {
+      VP8DspInitNEON();
+    }
 #endif
-    }
-    if (VP8DecGetCPUInfo(kSSE3)) {
-      // later we'll plug some SSE3 variant here
-    }
   }
 }
 
diff --git a/src/dsp/dec_neon.c b/src/dsp/dec_neon.c
new file mode 100644
index 0000000..ec824b7
--- /dev/null
+++ b/src/dsp/dec_neon.c
@@ -0,0 +1,329 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// ARM NEON version of dsp functions and loop filtering.
+//
+// Authors: Somnath Banerjee (somnath@google.com)
+//          Johann Koenig (johannkoenig@google.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_NEON)
+
+#include "../dec/vp8i.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+#define QRegs "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",                  \
+              "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+
+#define FLIP_SIGN_BIT2(a, b, s)                                                \
+  "veor     " #a "," #a "," #s "               \n"                             \
+  "veor     " #b "," #b "," #s "               \n"                             \
+
+#define FLIP_SIGN_BIT4(a, b, c, d, s)                                          \
+  FLIP_SIGN_BIT2(a, b, s)                                                      \
+  FLIP_SIGN_BIT2(c, d, s)                                                      \
+
+#define NEEDS_FILTER(p1, p0, q0, q1, thresh, mask)                             \
+  "vabd.u8    q15," #p0 "," #q0 "         \n"  /* abs(p0 - q0) */              \
+  "vabd.u8    q14," #p1 "," #q1 "         \n"  /* abs(p1 - q1) */              \
+  "vqadd.u8   q15, q15, q15               \n"  /* abs(p0 - q0) * 2 */          \
+  "vshr.u8    q14, q14, #1                \n"  /* abs(p1 - q1) / 2 */          \
+  "vqadd.u8   q15, q15, q14     \n"  /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \
+  "vdup.8     q14, " #thresh "            \n"                                  \
+  "vcge.u8   " #mask ", q14, q15          \n"  /* mask <= thresh */
+
+#define GET_BASE_DELTA(p1, p0, q0, q1, o)                                      \
+  "vqsub.s8   q15," #q0 "," #p0 "         \n"  /* (q0 - p0) */                 \
+  "vqsub.s8  " #o "," #p1 "," #q1 "       \n"  /* (p1 - q1) */                 \
+  "vqadd.s8  " #o "," #o ", q15           \n"  /* (p1 - q1) + 1 * (p0 - q0) */ \
+  "vqadd.s8  " #o "," #o ", q15           \n"  /* (p1 - q1) + 2 * (p0 - q0) */ \
+  "vqadd.s8  " #o "," #o ", q15           \n"  /* (p1 - q1) + 3 * (p0 - q0) */
+
+#define DO_SIMPLE_FILTER(p0, q0, fl)                                           \
+  "vmov.i8    q15, #0x03                  \n"                                  \
+  "vqadd.s8   q15, q15, " #fl "           \n"  /* filter1 = filter + 3 */      \
+  "vshr.s8    q15, q15, #3                \n"  /* filter1 >> 3 */              \
+  "vqadd.s8  " #p0 "," #p0 ", q15         \n"  /* p0 += filter1 */             \
+                                                                               \
+  "vmov.i8    q15, #0x04                  \n"                                  \
+  "vqadd.s8   q15, q15, " #fl "           \n"  /* filter1 = filter + 4 */      \
+  "vshr.s8    q15, q15, #3                \n"  /* filter2 >> 3 */              \
+  "vqsub.s8  " #q0 "," #q0 ", q15         \n"  /* q0 -= filter2 */
+
+// Applies filter on 2 pixels (p0 and q0)
+#define DO_FILTER2(p1, p0, q0, q1, thresh)                                     \
+  NEEDS_FILTER(p1, p0, q0, q1, thresh, q9)     /* filter mask in q9 */         \
+  "vmov.i8    q10, #0x80                  \n"  /* sign bit */                  \
+  FLIP_SIGN_BIT4(p1, p0, q0, q1, q10)          /* convert to signed value */   \
+  GET_BASE_DELTA(p1, p0, q0, q1, q11)          /* get filter level  */         \
+  "vand       q9, q9, q11                 \n"  /* apply filter mask */         \
+  DO_SIMPLE_FILTER(p0, q0, q9)                 /* apply filter */              \
+  FLIP_SIGN_BIT2(p0, q0, q10)
+
+// Load/Store vertical edge
+#define LOAD8x4(c1, c2, c3, c4, b1, b2, stride)                                \
+  "vld4.8   {" #c1"[0], " #c2"[0], " #c3"[0], " #c4"[0]}," #b1 "," #stride"\n" \
+  "vld4.8   {" #c1"[1], " #c2"[1], " #c3"[1], " #c4"[1]}," #b2 "," #stride"\n" \
+  "vld4.8   {" #c1"[2], " #c2"[2], " #c3"[2], " #c4"[2]}," #b1 "," #stride"\n" \
+  "vld4.8   {" #c1"[3], " #c2"[3], " #c3"[3], " #c4"[3]}," #b2 "," #stride"\n" \
+  "vld4.8   {" #c1"[4], " #c2"[4], " #c3"[4], " #c4"[4]}," #b1 "," #stride"\n" \
+  "vld4.8   {" #c1"[5], " #c2"[5], " #c3"[5], " #c4"[5]}," #b2 "," #stride"\n" \
+  "vld4.8   {" #c1"[6], " #c2"[6], " #c3"[6], " #c4"[6]}," #b1 "," #stride"\n" \
+  "vld4.8   {" #c1"[7], " #c2"[7], " #c3"[7], " #c4"[7]}," #b2 "," #stride"\n"
+
+#define STORE8x2(c1, c2, p,stride)                                             \
+  "vst2.8   {" #c1"[0], " #c2"[0]}," #p "," #stride " \n"                      \
+  "vst2.8   {" #c1"[1], " #c2"[1]}," #p "," #stride " \n"                      \
+  "vst2.8   {" #c1"[2], " #c2"[2]}," #p "," #stride " \n"                      \
+  "vst2.8   {" #c1"[3], " #c2"[3]}," #p "," #stride " \n"                      \
+  "vst2.8   {" #c1"[4], " #c2"[4]}," #p "," #stride " \n"                      \
+  "vst2.8   {" #c1"[5], " #c2"[5]}," #p "," #stride " \n"                      \
+  "vst2.8   {" #c1"[6], " #c2"[6]}," #p "," #stride " \n"                      \
+  "vst2.8   {" #c1"[7], " #c2"[7]}," #p "," #stride " \n"
+
+//-----------------------------------------------------------------------------
+// Simple In-loop filtering (Paragraph 15.2)
+
+static void SimpleVFilter16NEON(uint8_t* p, int stride, int thresh) {
+  __asm__ volatile (
+    "sub        %[p], %[p], %[stride], lsl #1  \n"  // p -= 2 * stride
+
+    "vld1.u8    {q1}, [%[p]], %[stride]        \n"  // p1
+    "vld1.u8    {q2}, [%[p]], %[stride]        \n"  // p0
+    "vld1.u8    {q3}, [%[p]], %[stride]        \n"  // q0
+    "vld1.u8    {q4}, [%[p]]                   \n"  // q1
+
+    DO_FILTER2(q1, q2, q3, q4, %[thresh])
+
+    "sub        %[p], %[p], %[stride], lsl #1  \n"  // p -= 2 * stride
+
+    "vst1.u8    {q2}, [%[p]], %[stride]        \n"  // store op0
+    "vst1.u8    {q3}, [%[p]]                   \n"  // store oq0
+    : [p] "+r"(p)
+    : [stride] "r"(stride), [thresh] "r"(thresh)
+    : "memory", QRegs
+  );
+}
+
+static void SimpleHFilter16NEON(uint8_t* p, int stride, int thresh) {
+  __asm__ volatile (
+    "sub        r4, %[p], #2                   \n"  // base1 = p - 2
+    "lsl        r6, %[stride], #1              \n"  // r6 = 2 * stride
+    "add        r5, r4, %[stride]              \n"  // base2 = base1 + stride
+
+    LOAD8x4(d2, d3, d4, d5, [r4], [r5], r6)
+    LOAD8x4(d6, d7, d8, d9, [r4], [r5], r6)
+    "vswp       d3, d6                         \n"  // p1:q1 p0:q3
+    "vswp       d5, d8                         \n"  // q0:q2 q1:q4
+    "vswp       q2, q3                         \n"  // p1:q1 p0:q2 q0:q3 q1:q4
+
+    DO_FILTER2(q1, q2, q3, q4, %[thresh])
+
+    "sub        %[p], %[p], #1                 \n"  // p - 1
+
+    "vswp        d5, d6                        \n"
+    STORE8x2(d4, d5, [%[p]], %[stride])
+    STORE8x2(d6, d7, [%[p]], %[stride])
+
+    : [p] "+r"(p)
+    : [stride] "r"(stride), [thresh] "r"(thresh)
+    : "memory", "r4", "r5", "r6", QRegs
+  );
+}
+
+static void SimpleVFilter16iNEON(uint8_t* p, int stride, int thresh) {
+  int k;
+  for (k = 3; k > 0; --k) {
+    p += 4 * stride;
+    SimpleVFilter16NEON(p, stride, thresh);
+  }
+}
+
+static void SimpleHFilter16iNEON(uint8_t* p, int stride, int thresh) {
+  int k;
+  for (k = 3; k > 0; --k) {
+    p += 4;
+    SimpleHFilter16NEON(p, stride, thresh);
+  }
+}
+
+static void TransformOneNEON(const int16_t *in, uint8_t *dst) {
+  const int kBPS = BPS;
+  const int16_t constants[] = {20091, 17734, 0, 0};
+  /* kC1, kC2. Padded because vld1.16 loads 8 bytes
+   * Technically these are unsigned but vqdmulh is only available in signed.
+   * vqdmulh returns high half (effectively >> 16) but also doubles the value,
+   * changing the >> 16 to >> 15 and requiring an additional >> 1.
+   * We use this to our advantage with kC2. The canonical value is 35468.
+   * However, the high bit is set so treating it as signed will give incorrect
+   * results. We avoid this by down shifting by 1 here to clear the highest bit.
+   * Combined with the doubling effect of vqdmulh we get >> 16.
+   * This cannot be applied to kC1 because the lowest bit is set. Down shifting
+   * the constant would reduce precision.
+   */
+
+  /* libwebp uses a trick to avoid some extra addition that libvpx does.
+   * Instead of:
+   * temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
+   * libwebp adds 1 << 16 to cospi8sqrt2minus1 (kC1). However, this causes the
+   * same issue with kC1 and vqdmulh that we work around by down shifting kC2
+   */
+
+  /* Adapted from libvpx: vp8/common/arm/neon/shortidct4x4llm_neon.asm */
+  __asm__ volatile (
+    "vld1.16         {q1, q2}, [%[in]]           \n"
+    "vld1.16         {d0}, [%[constants]]        \n"
+
+    /* d2: in[0]
+     * d3: in[8]
+     * d4: in[4]
+     * d5: in[12]
+     */
+    "vswp            d3, d4                      \n"
+
+    /* q8 = {in[4], in[12]} * kC1 * 2 >> 16
+     * q9 = {in[4], in[12]} * kC2 >> 16
+     */
+    "vqdmulh.s16     q8, q2, d0[0]               \n"
+    "vqdmulh.s16     q9, q2, d0[1]               \n"
+
+    /* d22 = a = in[0] + in[8]
+     * d23 = b = in[0] - in[8]
+     */
+    "vqadd.s16       d22, d2, d3                 \n"
+    "vqsub.s16       d23, d2, d3                 \n"
+
+    /* The multiplication should be x * kC1 >> 16
+     * However, with vqdmulh we get x * kC1 * 2 >> 16
+     * (multiply, double, return high half)
+     * We avoided this in kC2 by pre-shifting the constant.
+     * q8 = in[4]/[12] * kC1 >> 16
+     */
+    "vshr.s16        q8, q8, #1                  \n"
+
+    /* Add {in[4], in[12]} back after the multiplication. This is handled by
+     * adding 1 << 16 to kC1 in the libwebp C code.
+     */
+    "vqadd.s16       q8, q2, q8                  \n"
+
+    /* d20 = c = in[4]*kC2 - in[12]*kC1
+     * d21 = d = in[4]*kC1 + in[12]*kC2
+     */
+    "vqsub.s16       d20, d18, d17               \n"
+    "vqadd.s16       d21, d19, d16               \n"
+
+    /* d2 = tmp[0] = a + d
+     * d3 = tmp[1] = b + c
+     * d4 = tmp[2] = b - c
+     * d5 = tmp[3] = a - d
+     */
+    "vqadd.s16       d2, d22, d21                \n"
+    "vqadd.s16       d3, d23, d20                \n"
+    "vqsub.s16       d4, d23, d20                \n"
+    "vqsub.s16       d5, d22, d21                \n"
+
+    "vzip.16         q1, q2                      \n"
+    "vzip.16         q1, q2                      \n"
+
+    "vswp            d3, d4                      \n"
+
+    /* q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16
+     * q9 = {tmp[4], tmp[12]} * kC2 >> 16
+     */
+    "vqdmulh.s16     q8, q2, d0[0]               \n"
+    "vqdmulh.s16     q9, q2, d0[1]               \n"
+
+    /* d22 = a = tmp[0] + tmp[8]
+     * d23 = b = tmp[0] - tmp[8]
+     */
+    "vqadd.s16       d22, d2, d3                 \n"
+    "vqsub.s16       d23, d2, d3                 \n"
+
+    /* See the long-winded explanation above */
+    "vshr.s16        q8, q8, #1                  \n"
+    "vqadd.s16       q8, q2, q8                  \n"
+
+    /* d20 = c = tmp[4]*kC2 - tmp[12]*kC1
+     * d21 = d = tmp[4]*kC1 + tmp[12]*kC2
+     */
+    "vqsub.s16       d20, d18, d17               \n"
+    "vqadd.s16       d21, d19, d16               \n"
+
+    /* d2 = tmp[0] = a + d
+     * d3 = tmp[1] = b + c
+     * d4 = tmp[2] = b - c
+     * d5 = tmp[3] = a - d
+     */
+    "vqadd.s16       d2, d22, d21                \n"
+    "vqadd.s16       d3, d23, d20                \n"
+    "vqsub.s16       d4, d23, d20                \n"
+    "vqsub.s16       d5, d22, d21                \n"
+
+    "vld1.32         d6[0], [%[dst]], %[kBPS]    \n"
+    "vld1.32         d6[1], [%[dst]], %[kBPS]    \n"
+    "vld1.32         d7[0], [%[dst]], %[kBPS]    \n"
+    "vld1.32         d7[1], [%[dst]], %[kBPS]    \n"
+
+    "sub         %[dst], %[dst], %[kBPS], lsl #2 \n"
+
+    /* ((val) + 4) >> 3 */
+    "vrshr.s16       d2, d2, #3                  \n"
+    "vrshr.s16       d3, d3, #3                  \n"
+    "vrshr.s16       d4, d4, #3                  \n"
+    "vrshr.s16       d5, d5, #3                  \n"
+
+    "vzip.16         q1, q2                      \n"
+    "vzip.16         q1, q2                      \n"
+
+    /* Must accumulate before saturating */
+    "vmovl.u8        q8, d6                      \n"
+    "vmovl.u8        q9, d7                      \n"
+
+    "vqadd.s16       q1, q1, q8                  \n"
+    "vqadd.s16       q2, q2, q9                  \n"
+
+    "vqmovun.s16     d0, q1                      \n"
+    "vqmovun.s16     d1, q2                      \n"
+
+    "vst1.32         d0[0], [%[dst]], %[kBPS]    \n"
+    "vst1.32         d0[1], [%[dst]], %[kBPS]    \n"
+    "vst1.32         d1[0], [%[dst]], %[kBPS]    \n"
+    "vst1.32         d1[1], [%[dst]]             \n"
+
+    : [in] "+r"(in), [dst] "+r"(dst)  /* modified registers */
+    : [kBPS] "r"(kBPS), [constants] "r"(constants)  /* constants */
+    : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11"  /* clobbered */
+  );
+}
+
+static void TransformTwoNEON(const int16_t* in, uint8_t* dst, int do_two) {
+  TransformOneNEON(in, dst);
+  if (do_two) {
+    TransformOneNEON(in + 16, dst + 4);
+  }
+}
+
+extern void VP8DspInitNEON(void);
+
+void VP8DspInitNEON(void) {
+  VP8Transform = TransformTwoNEON;
+
+  VP8SimpleVFilter16 = SimpleVFilter16NEON;
+  VP8SimpleHFilter16 = SimpleHFilter16NEON;
+  VP8SimpleVFilter16i = SimpleVFilter16iNEON;
+  VP8SimpleHFilter16i = SimpleHFilter16iNEON;
+}
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
+
+#endif   // WEBP_USE_NEON
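
The long comment block in TransformOneNEON() about kC1/kC2 and vqdmulh can be checked with plain integer arithmetic: vqdmulh.s16 returns the high half of the doubled product, i.e. roughly (x * c * 2) >> 16. The scalar sketch below (using a simplified, non-saturating model of the instruction; illustrative only) shows why pre-shifting kC2 by 1 and post-shifting the kC1 product by 1 both reduce to the reference (x * k) >> 16 used by the C transform:

  #include <stdio.h>

  /* Simplified model of vqdmulh.s16: high half of the doubled product.
   * (The real instruction also saturates, which does not matter here.) */
  static int vqdmulh_s16(int x, int c) {
    return (x * c * 2) >> 16;
  }

  int main(void) {
    const int kC1 = 20091;  /* low bit set: pre-shifting would lose precision */
    const int kC2 = 35468;  /* high bit set when viewed as a signed 16-bit value */
    const int x = 1234;     /* an arbitrary sample coefficient */

    /* kC2: load the constant pre-shifted by 1; the doubling restores >> 16. */
    printf("kC2: %d == %d\n", (x * kC2) >> 16, vqdmulh_s16(x, kC2 >> 1));

    /* kC1: multiply as-is, then undo the doubling with an extra >> 1. */
    printf("kC1: %d == %d\n", (x * kC1) >> 16, vqdmulh_s16(x, kC1) >> 1);
    return 0;
  }
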
diff --git a/src/dec/dsp_sse2.c b/src/dsp/dec_sse2.c
similarity index 95%
rename from src/dec/dsp_sse2.c
rename to src/dsp/dec_sse2.c
index 785f02e..472b68e 100644
--- a/src/dec/dsp_sse2.c
+++ b/src/dsp/dec_sse2.c
@@ -1,25 +1,27 @@
-// Copyright 2011 Google Inc.
+// Copyright 2011 Google Inc. All Rights Reserved.
 //
 // This code is licensed under the same terms as WebM:
 //  Software License Agreement:  http://www.webmproject.org/license/software/
 //  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
-// SSE2 version of dsp functions and loop filtering.
+// SSE2 version of some decoding functions (idct, loop filtering).
 //
 // Author: somnath@google.com (Somnath Banerjee)
 //         cduvivier@google.com (Christian Duvivier)
 
-#if defined(__SSE2__) || defined(_MSC_VER)
+#include "./dsp.h"
+
+#if defined(WEBP_USE_SSE2)
 
 #include <emmintrin.h>
-#include "vp8i.h"
+#include "../dec/vp8i.h"
 
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Transforms (Paragraph 14.4)
 
 static void TransformSSE2(const int16_t* in, uint8_t* dst, int do_two) {
@@ -240,7 +242,7 @@
   }
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Loop Filter (Paragraph 15)
 
 // Compute abs(p - q) = subs(p - q) OR subs(q - p)
@@ -337,12 +339,12 @@
   *mask = _mm_cmpeq_epi8(*mask, _mm_setzero_si128());
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Edge filtering functions
 
 // Applies filter on 2 pixels (p0 and q0)
-static inline void DoFilter2(const __m128i* p1, __m128i* p0, __m128i* q0,
-                             const __m128i* q1, int thresh) {
+static WEBP_INLINE void DoFilter2(const __m128i* p1, __m128i* p0, __m128i* q0,
+                                  const __m128i* q1, int thresh) {
   __m128i a, mask;
   const __m128i sign_bit = _mm_set1_epi8(0x80);
   const __m128i p1s = _mm_xor_si128(*p1, sign_bit);
@@ -362,8 +364,9 @@
 }
 
 // Applies filter on 4 pixels (p1, p0, q0 and q1)
-static inline void DoFilter4(__m128i* p1, __m128i *p0, __m128i* q0, __m128i* q1,
-                             const __m128i* mask, int hev_thresh) {
+static WEBP_INLINE void DoFilter4(__m128i* p1, __m128i *p0,
+                                  __m128i* q0, __m128i* q1,
+                                  const __m128i* mask, int hev_thresh) {
   __m128i not_hev;
   __m128i t1, t2, t3;
   const __m128i sign_bit = _mm_set1_epi8(0x80);
@@ -408,9 +411,9 @@
 }
 
 // Applies filter on 6 pixels (p2, p1, p0, q0, q1 and q2)
-static inline void DoFilter6(__m128i *p2, __m128i* p1, __m128i *p0,
-                             __m128i* q0, __m128i* q1, __m128i *q2,
-                             const __m128i* mask, int hev_thresh) {
+static WEBP_INLINE void DoFilter6(__m128i *p2, __m128i* p1, __m128i *p0,
+                                  __m128i* q0, __m128i* q1, __m128i *q2,
+                                  const __m128i* mask, int hev_thresh) {
   __m128i a, not_hev;
   const __m128i sign_bit = _mm_set1_epi8(0x80);
 
@@ -466,8 +469,8 @@
 //
 // TODO(somnath): Investigate _mm_shuffle* also see if it can be broken into
 // two Load4x4() to avoid code duplication.
-static inline void Load8x4(const uint8_t* b, int stride,
-                           __m128i* p, __m128i* q) {
+static WEBP_INLINE void Load8x4(const uint8_t* b, int stride,
+                                __m128i* p, __m128i* q) {
   __m128i t1, t2;
 
   // Load 0th, 1st, 4th and 5th rows
@@ -506,9 +509,10 @@
   *q = _mm_unpackhi_epi32(t1, t2);
 }
 
-static inline void Load16x4(const uint8_t* r0, const uint8_t* r8, int stride,
-                            __m128i* p1, __m128i* p0,
-                            __m128i* q0, __m128i* q1) {
+static WEBP_INLINE void Load16x4(const uint8_t* r0, const uint8_t* r8,
+                                 int stride,
+                                 __m128i* p1, __m128i* p0,
+                                 __m128i* q0, __m128i* q1) {
   __m128i t1, t2;
   // Assume the pixels around the edge (|) are numbered as follows
   //                00 01 | 02 03
@@ -540,7 +544,7 @@
   *q1 = _mm_unpackhi_epi64(t2, *q1);
 }
 
-static inline void Store4x4(__m128i* x, uint8_t* dst, int stride) {
+static WEBP_INLINE void Store4x4(__m128i* x, uint8_t* dst, int stride) {
   int i;
   for (i = 0; i < 4; ++i, dst += stride) {
     *((int32_t*)dst) = _mm_cvtsi128_si32(*x);
@@ -549,8 +553,9 @@
 }
 
 // Transpose back and store
-static inline void Store16x4(uint8_t* r0, uint8_t* r8, int stride, __m128i* p1,
-                             __m128i* p0, __m128i* q0, __m128i* q1) {
+static WEBP_INLINE void Store16x4(uint8_t* r0, uint8_t* r8, int stride,
+                                  __m128i* p1, __m128i* p0,
+                                  __m128i* q0, __m128i* q1) {
   __m128i t1;
 
   // p0 = 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
@@ -586,7 +591,7 @@
   Store4x4(q1, r8, stride);
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Simple In-loop filtering (Paragraph 15.2)
 
 static void SimpleVFilter16SSE2(uint8_t* p, int stride, int thresh) {
@@ -629,7 +634,7 @@
   }
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Complex In-loop filtering (Paragraph 15.3)
 
 #define MAX_DIFF1(p3, p2, p1, p0, m) {                                         \
@@ -895,4 +900,4 @@
 }    // extern "C"
 #endif
 
-#endif   //__SSE2__ || _MSC_VER
+#endif   // WEBP_USE_SSE2
diff --git a/src/dsp/dsp.h b/src/dsp/dsp.h
new file mode 100644
index 0000000..c4061aa
--- /dev/null
+++ b/src/dsp/dsp.h
@@ -0,0 +1,210 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+//   Speed-critical functions.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#ifndef WEBP_DSP_DSP_H_
+#define WEBP_DSP_DSP_H_
+
+#include "webp/types.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+//------------------------------------------------------------------------------
+// CPU detection
+
+#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
+#define WEBP_MSC_SSE2  // Visual C++ SSE2 targets
+#endif
+
+#if defined(__SSE2__) || defined(WEBP_MSC_SSE2)
+#define WEBP_USE_SSE2
+#endif
+
+#if defined(__ANDROID__) && defined(__ARM_ARCH_7A__) && defined(__ARM_NEON__)
+#define WEBP_ANDROID_NEON  // Android targets that might support NEON
+#endif
+
+#if defined(__ARM_NEON__) || defined(WEBP_ANDROID_NEON)
+#define WEBP_USE_NEON
+#endif
+
+typedef enum {
+  kSSE2,
+  kSSE3,
+  kNEON
+} CPUFeature;
+// returns true if the CPU supports the feature.
+typedef int (*VP8CPUInfo)(CPUFeature feature);
+extern VP8CPUInfo VP8GetCPUInfo;
+
+//------------------------------------------------------------------------------
+// Encoding
+
+int VP8GetAlpha(const int histo[]);
+
+// Transforms
+// VP8Idct: Does one of two inverse transforms. If do_two is set, the transforms
+//          will be done for (ref, in, dst) and (ref + 4, in + 16, dst + 4).
+typedef void (*VP8Idct)(const uint8_t* ref, const int16_t* in, uint8_t* dst,
+                        int do_two);
+typedef void (*VP8Fdct)(const uint8_t* src, const uint8_t* ref, int16_t* out);
+typedef void (*VP8WHT)(const int16_t* in, int16_t* out);
+extern VP8Idct VP8ITransform;
+extern VP8Fdct VP8FTransform;
+extern VP8WHT VP8ITransformWHT;
+extern VP8WHT VP8FTransformWHT;
+// Predictions
+// *dst is the destination block. *top and *left can be NULL.
+typedef void (*VP8IntraPreds)(uint8_t *dst, const uint8_t* left,
+                              const uint8_t* top);
+typedef void (*VP8Intra4Preds)(uint8_t *dst, const uint8_t* top);
+extern VP8Intra4Preds VP8EncPredLuma4;
+extern VP8IntraPreds VP8EncPredLuma16;
+extern VP8IntraPreds VP8EncPredChroma8;
+
+typedef int (*VP8Metric)(const uint8_t* pix, const uint8_t* ref);
+extern VP8Metric VP8SSE16x16, VP8SSE16x8, VP8SSE8x8, VP8SSE4x4;
+typedef int (*VP8WMetric)(const uint8_t* pix, const uint8_t* ref,
+                          const uint16_t* const weights);
+extern VP8WMetric VP8TDisto4x4, VP8TDisto16x16;
+
+typedef void (*VP8BlockCopy)(const uint8_t* src, uint8_t* dst);
+extern VP8BlockCopy VP8Copy4x4;
+// Quantization
+struct VP8Matrix;   // forward declaration
+typedef int (*VP8QuantizeBlock)(int16_t in[16], int16_t out[16],
+                                int n, const struct VP8Matrix* const mtx);
+extern VP8QuantizeBlock VP8EncQuantizeBlock;
+
+// Compute susceptibility based on DCT-coeff histograms:
+// the higher, the "easier" the macroblock is to compress.
+typedef int (*VP8CHisto)(const uint8_t* ref, const uint8_t* pred,
+                         int start_block, int end_block);
+extern const int VP8DspScan[16 + 4 + 4];
+extern VP8CHisto VP8CollectHistogram;
+
+void VP8EncDspInit(void);   // must be called before using any of the above
+
+//------------------------------------------------------------------------------
+// Decoding
+
+typedef void (*VP8DecIdct)(const int16_t* coeffs, uint8_t* dst);
+// when doing two transforms, coeffs is actually int16_t[2][16].
+typedef void (*VP8DecIdct2)(const int16_t* coeffs, uint8_t* dst, int do_two);
+extern VP8DecIdct2 VP8Transform;
+extern VP8DecIdct VP8TransformUV;
+extern VP8DecIdct VP8TransformDC;
+extern VP8DecIdct VP8TransformDCUV;
+extern void (*VP8TransformWHT)(const int16_t* in, int16_t* out);
+
+// *dst is the destination block, with stride BPS. Boundary samples are
+// assumed accessible when needed.
+typedef void (*VP8PredFunc)(uint8_t* dst);
+extern const VP8PredFunc VP8PredLuma16[/* NUM_B_DC_MODES */];
+extern const VP8PredFunc VP8PredChroma8[/* NUM_B_DC_MODES */];
+extern const VP8PredFunc VP8PredLuma4[/* NUM_BMODES */];
+
+// simple filter (only for luma)
+typedef void (*VP8SimpleFilterFunc)(uint8_t* p, int stride, int thresh);
+extern VP8SimpleFilterFunc VP8SimpleVFilter16;
+extern VP8SimpleFilterFunc VP8SimpleHFilter16;
+extern VP8SimpleFilterFunc VP8SimpleVFilter16i;  // filter 3 inner edges
+extern VP8SimpleFilterFunc VP8SimpleHFilter16i;
+
+// regular filter (on both macroblock edges and inner edges)
+typedef void (*VP8LumaFilterFunc)(uint8_t* luma, int stride,
+                                  int thresh, int ithresh, int hev_t);
+typedef void (*VP8ChromaFilterFunc)(uint8_t* u, uint8_t* v, int stride,
+                                    int thresh, int ithresh, int hev_t);
+// on outer edge
+extern VP8LumaFilterFunc VP8VFilter16;
+extern VP8LumaFilterFunc VP8HFilter16;
+extern VP8ChromaFilterFunc VP8VFilter8;
+extern VP8ChromaFilterFunc VP8HFilter8;
+
+// on inner edge
+extern VP8LumaFilterFunc VP8VFilter16i;   // filtering 3 inner edges altogether
+extern VP8LumaFilterFunc VP8HFilter16i;
+extern VP8ChromaFilterFunc VP8VFilter8i;  // filtering u and v altogether
+extern VP8ChromaFilterFunc VP8HFilter8i;
+
+// must be called before anything using the above
+void VP8DspInit(void);
+
+//------------------------------------------------------------------------------
+// WebP I/O
+
+#define FANCY_UPSAMPLING   // undefined to remove fancy upsampling support
+
+typedef void (*WebPUpsampleLinePairFunc)(
+    const uint8_t* top_y, const uint8_t* bottom_y,
+    const uint8_t* top_u, const uint8_t* top_v,
+    const uint8_t* cur_u, const uint8_t* cur_v,
+    uint8_t* top_dst, uint8_t* bottom_dst, int len);
+
+#ifdef FANCY_UPSAMPLING
+
+// Fancy upsampling functions to convert YUV to RGB(A) modes
+extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
+
+// Initializes SSE2 version of the fancy upsamplers.
+void WebPInitUpsamplersSSE2(void);
+
+#endif    // FANCY_UPSAMPLING
+
+// Point-sampling methods.
+typedef void (*WebPSampleLinePairFunc)(
+    const uint8_t* top_y, const uint8_t* bottom_y,
+    const uint8_t* u, const uint8_t* v,
+    uint8_t* top_dst, uint8_t* bottom_dst, int len);
+
+extern const WebPSampleLinePairFunc WebPSamplers[/* MODE_LAST */];
+
+// General function for converting two lines of ARGB or RGBA.
+// 'alpha_is_last' should be true if 0xff000000 is stored in memory
+// as 0x00, 0x00, 0x00, 0xff (little endian).
+WebPUpsampleLinePairFunc WebPGetLinePairConverter(int alpha_is_last);
+
+// YUV444->RGB converters
+typedef void (*WebPYUV444Converter)(const uint8_t* y,
+                                    const uint8_t* u, const uint8_t* v,
+                                    uint8_t* dst, int len);
+
+extern const WebPYUV444Converter WebPYUV444Converters[/* MODE_LAST */];
+
+// Main function to be called
+void WebPInitUpsamplers(void);
+
+//------------------------------------------------------------------------------
+// Pre-multiply planes with alpha values
+
+// Apply alpha pre-multiply on an rgba, bgra or argb plane of size w * h.
+// alpha_first should be 0 for argb, 1 for rgba or bgra (where alpha is last).
+extern void (*WebPApplyAlphaMultiply)(
+    uint8_t* rgba, int alpha_first, int w, int h, int stride);
+
+// Same, but specifically for RGBA4444 format
+extern void (*WebPApplyAlphaMultiply4444)(
+    uint8_t* rgba4444, int w, int h, int stride);
+
+// To be called first before using the above.
+void WebPInitPremultiply(void);
+
+void WebPInitPremultiplySSE2(void);   // should not be called directly.
+
+//------------------------------------------------------------------------------
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
+
+#endif  /* WEBP_DSP_DSP_H_ */
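
VP8GetCPUInfo is only declared here; the per-module x86 detection that this change removes from the old dec/dsp.c and enc/dsp.c shows what a concrete implementation looks like (cpuid leaf 1, SSE2 reported in EDX bit 26, SSE3 in ECX bit 0). A sketch along those lines, omitting the PIC-safe %ebx-preserving variant the original code also carries for 32-bit builds:

  #include "./dsp.h"

  #if defined(__i386__) || defined(__x86_64__)
  static void GetCPUInfo(int cpu_info[4], int info_type) {
    __asm__ volatile (
      "cpuid\n"
      : "=a"(cpu_info[0]), "=b"(cpu_info[1]),
        "=c"(cpu_info[2]), "=d"(cpu_info[3])
      : "a"(info_type));
  }

  static int x86CPUInfo(CPUFeature feature) {
    int cpu_info[4];
    GetCPUInfo(cpu_info, 1);                  /* leaf 1: standard feature flags */
    if (feature == kSSE2) return !!(cpu_info[3] & (1 << 26));
    if (feature == kSSE3) return !!(cpu_info[2] & (1 << 0));
    return 0;                                 /* kNEON is not an x86 feature */
  }

  VP8CPUInfo VP8GetCPUInfo = x86CPUInfo;
  #endif
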
diff --git a/src/enc/dsp.c b/src/dsp/enc.c
similarity index 76%
rename from src/enc/dsp.c
rename to src/dsp/enc.c
index 25a9bf9..0223456 100644
--- a/src/enc/dsp.c
+++ b/src/dsp/enc.c
@@ -1,22 +1,23 @@
-// Copyright 2011 Google Inc.
+// Copyright 2011 Google Inc. All Rights Reserved.
 //
 // This code is licensed under the same terms as WebM:
 //  Software License Agreement:  http://www.webmproject.org/license/software/
 //  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
-// speed-critical functions.
+// Speed-critical encoding functions.
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include <assert.h>
-#include "vp8enci.h"
+#include <stdlib.h>  // for abs()
+#include "./dsp.h"
+#include "../enc/vp8enci.h"
 
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Compute susceptibility based on DCT-coeff histograms:
 // the higher, the "easier" the macroblock is to compress.
 
@@ -41,13 +42,24 @@
   return ClipAlpha(alpha);
 }
 
+const int VP8DspScan[16 + 4 + 4] = {
+  // Luma
+  0 +  0 * BPS,  4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS,
+  0 +  4 * BPS,  4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS,
+  0 +  8 * BPS,  4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS,
+  0 + 12 * BPS,  4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,
+
+  0 + 0 * BPS,   4 + 0 * BPS, 0 + 4 * BPS,  4 + 4 * BPS,    // U
+  8 + 0 * BPS,  12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS     // V
+};
+
 static int CollectHistogram(const uint8_t* ref, const uint8_t* pred,
                             int start_block, int end_block) {
   int histo[MAX_COEFF_THRESH + 1] = { 0 };
   int16_t out[16];
   int j, k;
   for (j = start_block; j < end_block; ++j) {
-    VP8FTransform(ref + VP8Scan[j], pred + VP8Scan[j], out);
+    VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
 
     // Convert coefficients to bin (within out[]).
     for (k = 0; k < 16; ++k) {
@@ -64,7 +76,7 @@
   return VP8GetAlpha(histo);
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // run-time tables (~4k)
 
 static uint8_t clip1[255 + 510 + 1];    // clips [-255,510] to [0,255]
@@ -83,11 +95,11 @@
   }
 }
 
-static inline uint8_t clip_8b(int v) {
+static WEBP_INLINE uint8_t clip_8b(int v) {
   return (!(v & ~0xff)) ? v : v < 0 ? 0 : 255;
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Transforms (Paragraph 14.4)
 
 #define STORE(x, y, v) \
@@ -97,8 +109,8 @@
 static const int kC2 = 35468;
 #define MUL(a, b) (((a) * (b)) >> 16)
 
-static inline void ITransformOne(const uint8_t* ref, const int16_t* in,
-                                 uint8_t* dst) {
+static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
+                                      uint8_t* dst) {
   int C[4 * 4], *tmp;
   int i;
   tmp = C;
@@ -226,19 +238,20 @@
 #undef MUL
 #undef STORE
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Intra predictions
 
-#define OUT(x, y) dst[(x) + (y) * BPS]
+#define DST(x, y) dst[(x) + (y) * BPS]
 
-static inline void Fill(uint8_t* dst, int value, int size) {
+static WEBP_INLINE void Fill(uint8_t* dst, int value, int size) {
   int j;
   for (j = 0; j < size; ++j) {
     memset(dst + j * BPS, value, size);
   }
 }
 
-static inline void VerticalPred(uint8_t* dst, const uint8_t* top, int size) {
+static WEBP_INLINE void VerticalPred(uint8_t* dst,
+                                     const uint8_t* top, int size) {
   int j;
   if (top) {
     for (j = 0; j < size; ++j) memcpy(dst + j * BPS, top, size);
@@ -247,7 +260,8 @@
   }
 }
 
-static inline void HorizontalPred(uint8_t* dst, const uint8_t* left, int size) {
+static WEBP_INLINE void HorizontalPred(uint8_t* dst,
+                                       const uint8_t* left, int size) {
   if (left) {
     int j;
     for (j = 0; j < size; ++j) {
@@ -258,8 +272,8 @@
   }
 }
 
-static inline void TrueMotion(uint8_t* dst, const uint8_t* left,
-                              const uint8_t* top, int size) {
+static WEBP_INLINE void TrueMotion(uint8_t* dst, const uint8_t* left,
+                                   const uint8_t* top, int size) {
   int y;
   if (left) {
     if (top) {
@@ -288,9 +302,9 @@
   }
 }
 
-static inline void DCMode(uint8_t* dst, const uint8_t* left,
-                          const uint8_t* top,
-                          int size, int round, int shift) {
+static WEBP_INLINE void DCMode(uint8_t* dst, const uint8_t* left,
+                               const uint8_t* top,
+                               int size, int round, int shift) {
   int DC = 0;
   int j;
   if (top) {
@@ -311,7 +325,7 @@
   Fill(dst, DC, size);
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Chroma 8x8 prediction (paragraph 12.2)
 
 static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
@@ -331,7 +345,7 @@
   TrueMotion(C8TM8 + dst, left, top, 8);
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // luma 16x16 prediction (paragraph 12.3)
 
 static void Intra16Preds(uint8_t* dst,
@@ -342,7 +356,7 @@
   TrueMotion(I16TM16 + dst, left, top, 16);
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // luma 4x4 prediction
 
 #define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
@@ -390,13 +404,13 @@
   const int B = top[1];
   const int C = top[2];
   const int D = top[3];
-  OUT(0, 3)                                     = AVG3(J, K, L);
-  OUT(0, 2) = OUT(1, 3)                         = AVG3(I, J, K);
-  OUT(0, 1) = OUT(1, 2) = OUT(2, 3)             = AVG3(X, I, J);
-  OUT(0, 0) = OUT(1, 1) = OUT(2, 2) = OUT(3, 3) = AVG3(A, X, I);
-  OUT(1, 0) = OUT(2, 1) = OUT(3, 2)             = AVG3(B, A, X);
-  OUT(2, 0) = OUT(3, 1)                         = AVG3(C, B, A);
-  OUT(3, 0)                                     = AVG3(D, C, B);
+  DST(0, 3)                                     = AVG3(J, K, L);
+  DST(0, 2) = DST(1, 3)                         = AVG3(I, J, K);
+  DST(0, 1) = DST(1, 2) = DST(2, 3)             = AVG3(X, I, J);
+  DST(0, 0) = DST(1, 1) = DST(2, 2) = DST(3, 3) = AVG3(A, X, I);
+  DST(1, 0) = DST(2, 1) = DST(3, 2)             = AVG3(B, A, X);
+  DST(2, 0) = DST(3, 1)                         = AVG3(C, B, A);
+  DST(3, 0)                                     = AVG3(D, C, B);
 }
 
 static void LD4(uint8_t* dst, const uint8_t* top) {
@@ -408,13 +422,13 @@
   const int F = top[5];
   const int G = top[6];
   const int H = top[7];
-  OUT(0, 0)                                     = AVG3(A, B, C);
-  OUT(1, 0) = OUT(0, 1)                         = AVG3(B, C, D);
-  OUT(2, 0) = OUT(1, 1) = OUT(0, 2)             = AVG3(C, D, E);
-  OUT(3, 0) = OUT(2, 1) = OUT(1, 2) = OUT(0, 3) = AVG3(D, E, F);
-  OUT(3, 1) = OUT(2, 2) = OUT(1, 3)             = AVG3(E, F, G);
-  OUT(3, 2) = OUT(2, 3)                         = AVG3(F, G, H);
-  OUT(3, 3)                                     = AVG3(G, H, H);
+  DST(0, 0)                                     = AVG3(A, B, C);
+  DST(1, 0) = DST(0, 1)                         = AVG3(B, C, D);
+  DST(2, 0) = DST(1, 1) = DST(0, 2)             = AVG3(C, D, E);
+  DST(3, 0) = DST(2, 1) = DST(1, 2) = DST(0, 3) = AVG3(D, E, F);
+  DST(3, 1) = DST(2, 2) = DST(1, 3)             = AVG3(E, F, G);
+  DST(3, 2) = DST(2, 3)                         = AVG3(F, G, H);
+  DST(3, 3)                                     = AVG3(G, H, H);
 }
 
 static void VR4(uint8_t* dst, const uint8_t* top) {
@@ -426,17 +440,17 @@
   const int B = top[1];
   const int C = top[2];
   const int D = top[3];
-  OUT(0, 0) = OUT(1, 2) = AVG2(X, A);
-  OUT(1, 0) = OUT(2, 2) = AVG2(A, B);
-  OUT(2, 0) = OUT(3, 2) = AVG2(B, C);
-  OUT(3, 0)             = AVG2(C, D);
+  DST(0, 0) = DST(1, 2) = AVG2(X, A);
+  DST(1, 0) = DST(2, 2) = AVG2(A, B);
+  DST(2, 0) = DST(3, 2) = AVG2(B, C);
+  DST(3, 0)             = AVG2(C, D);
 
-  OUT(0, 3) =             AVG3(K, J, I);
-  OUT(0, 2) =             AVG3(J, I, X);
-  OUT(0, 1) = OUT(1, 3) = AVG3(I, X, A);
-  OUT(1, 1) = OUT(2, 3) = AVG3(X, A, B);
-  OUT(2, 1) = OUT(3, 3) = AVG3(A, B, C);
-  OUT(3, 1) =             AVG3(B, C, D);
+  DST(0, 3) =             AVG3(K, J, I);
+  DST(0, 2) =             AVG3(J, I, X);
+  DST(0, 1) = DST(1, 3) = AVG3(I, X, A);
+  DST(1, 1) = DST(2, 3) = AVG3(X, A, B);
+  DST(2, 1) = DST(3, 3) = AVG3(A, B, C);
+  DST(3, 1) =             AVG3(B, C, D);
 }
 
 static void VL4(uint8_t* dst, const uint8_t* top) {
@@ -448,17 +462,17 @@
   const int F = top[5];
   const int G = top[6];
   const int H = top[7];
-  OUT(0, 0) =             AVG2(A, B);
-  OUT(1, 0) = OUT(0, 2) = AVG2(B, C);
-  OUT(2, 0) = OUT(1, 2) = AVG2(C, D);
-  OUT(3, 0) = OUT(2, 2) = AVG2(D, E);
+  DST(0, 0) =             AVG2(A, B);
+  DST(1, 0) = DST(0, 2) = AVG2(B, C);
+  DST(2, 0) = DST(1, 2) = AVG2(C, D);
+  DST(3, 0) = DST(2, 2) = AVG2(D, E);
 
-  OUT(0, 1) =             AVG3(A, B, C);
-  OUT(1, 1) = OUT(0, 3) = AVG3(B, C, D);
-  OUT(2, 1) = OUT(1, 3) = AVG3(C, D, E);
-  OUT(3, 1) = OUT(2, 3) = AVG3(D, E, F);
-              OUT(3, 2) = AVG3(E, F, G);
-              OUT(3, 3) = AVG3(F, G, H);
+  DST(0, 1) =             AVG3(A, B, C);
+  DST(1, 1) = DST(0, 3) = AVG3(B, C, D);
+  DST(2, 1) = DST(1, 3) = AVG3(C, D, E);
+  DST(3, 1) = DST(2, 3) = AVG3(D, E, F);
+              DST(3, 2) = AVG3(E, F, G);
+              DST(3, 3) = AVG3(F, G, H);
 }
 
 static void HU4(uint8_t* dst, const uint8_t* top) {
@@ -466,14 +480,14 @@
   const int J = top[-3];
   const int K = top[-4];
   const int L = top[-5];
-  OUT(0, 0) =             AVG2(I, J);
-  OUT(2, 0) = OUT(0, 1) = AVG2(J, K);
-  OUT(2, 1) = OUT(0, 2) = AVG2(K, L);
-  OUT(1, 0) =             AVG3(I, J, K);
-  OUT(3, 0) = OUT(1, 1) = AVG3(J, K, L);
-  OUT(3, 1) = OUT(1, 2) = AVG3(K, L, L);
-  OUT(3, 2) = OUT(2, 2) =
-  OUT(0, 3) = OUT(1, 3) = OUT(2, 3) = OUT(3, 3) = L;
+  DST(0, 0) =             AVG2(I, J);
+  DST(2, 0) = DST(0, 1) = AVG2(J, K);
+  DST(2, 1) = DST(0, 2) = AVG2(K, L);
+  DST(1, 0) =             AVG3(I, J, K);
+  DST(3, 0) = DST(1, 1) = AVG3(J, K, L);
+  DST(3, 1) = DST(1, 2) = AVG3(K, L, L);
+  DST(3, 2) = DST(2, 2) =
+  DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
 }
 
 static void HD4(uint8_t* dst, const uint8_t* top) {
@@ -486,17 +500,17 @@
   const int B = top[1];
   const int C = top[2];
 
-  OUT(0, 0) = OUT(2, 1) = AVG2(I, X);
-  OUT(0, 1) = OUT(2, 2) = AVG2(J, I);
-  OUT(0, 2) = OUT(2, 3) = AVG2(K, J);
-  OUT(0, 3)             = AVG2(L, K);
+  DST(0, 0) = DST(2, 1) = AVG2(I, X);
+  DST(0, 1) = DST(2, 2) = AVG2(J, I);
+  DST(0, 2) = DST(2, 3) = AVG2(K, J);
+  DST(0, 3)             = AVG2(L, K);
 
-  OUT(3, 0)             = AVG3(A, B, C);
-  OUT(2, 0)             = AVG3(X, A, B);
-  OUT(1, 0) = OUT(3, 1) = AVG3(I, X, A);
-  OUT(1, 1) = OUT(3, 2) = AVG3(J, I, X);
-  OUT(1, 2) = OUT(3, 3) = AVG3(K, J, I);
-  OUT(1, 3)             = AVG3(L, K, J);
+  DST(3, 0)             = AVG3(A, B, C);
+  DST(2, 0)             = AVG3(X, A, B);
+  DST(1, 0) = DST(3, 1) = AVG3(I, X, A);
+  DST(1, 1) = DST(3, 2) = AVG3(J, I, X);
+  DST(1, 2) = DST(3, 3) = AVG3(K, J, I);
+  DST(1, 3)             = AVG3(L, K, J);
 }
 
 static void TM4(uint8_t* dst, const uint8_t* top) {
@@ -511,6 +525,7 @@
   }
 }
 
+#undef DST
 #undef AVG3
 #undef AVG2
 
@@ -529,10 +544,11 @@
   HU4(I4HU4 + dst, top);
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Metric
 
-static inline int GetSSE(const uint8_t* a, const uint8_t* b, int w, int h) {
+static WEBP_INLINE int GetSSE(const uint8_t* a, const uint8_t* b,
+                              int w, int h) {
   int count = 0;
   int y, x;
   for (y = 0; y < h; ++y) {
@@ -559,7 +575,7 @@
   return GetSSE(a, b, 4, 4);
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Texture distortion
 //
 // We try to match the spectral content (weighted) between source and
@@ -620,16 +636,20 @@
   return D;
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Quantization
 //
 
+static const uint8_t kZigzag[16] = {
+  0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
+};
+
 // Simple quantization
 static int QuantizeBlock(int16_t in[16], int16_t out[16],
                          int n, const VP8Matrix* const mtx) {
   int last = -1;
   for (; n < 16; ++n) {
-    const int j = VP8Zigzag[n];
+    const int j = kZigzag[n];
     const int sign = (in[j] < 0);
     int coeff = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
     if (coeff > 2047) coeff = 2047;
@@ -649,10 +669,10 @@
   return (last >= 0);
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Block copy
 
-static inline void Copy(const uint8_t* src, uint8_t* dst, int size) {
+static WEBP_INLINE void Copy(const uint8_t* src, uint8_t* dst, int size) {
   int y;
   for (y = 0; y < size; ++y) {
     memcpy(dst, src, size);
@@ -662,49 +682,9 @@
 }
 
 static void Copy4x4(const uint8_t* src, uint8_t* dst) { Copy(src, dst, 4); }
-static void Copy8x8(const uint8_t* src, uint8_t* dst) { Copy(src, dst, 8); }
-static void Copy16x16(const uint8_t* src, uint8_t* dst) { Copy(src, dst, 16); }
 
-//-----------------------------------------------------------------------------
-// SSE2 detection.
-//
-
-#if defined(__pic__) && defined(__i386__)
-static inline void GetCPUInfo(int cpu_info[4], int info_type) {
-  __asm__ volatile (
-    "mov %%ebx, %%edi\n"
-    "cpuid\n"
-    "xchg %%edi, %%ebx\n"
-    : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
-    : "a"(info_type));
-}
-#elif defined(__i386__) || defined(__x86_64__)
-static inline void GetCPUInfo(int cpu_info[4], int info_type) {
-  __asm__ volatile (
-    "cpuid\n"
-    : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
-    : "a"(info_type));
-}
-#elif defined(_MSC_VER)  // Visual C++
-#define GetCPUInfo __cpuid
-#endif
-
-#if defined(__i386__) || defined(__x86_64__) || defined(_MSC_VER)
-static int x86CPUInfo(CPUFeature feature) {
-  int cpu_info[4];
-  GetCPUInfo(cpu_info, 1);
-  if (feature == kSSE2) {
-    return 0 != (cpu_info[3] & 0x04000000);
-  }
-  if (feature == kSSE3) {
-    return 0 != (cpu_info[2] & 0x00000001);
-  }
-  return 0;
-}
-VP8CPUInfo VP8EncGetCPUInfo = x86CPUInfo;
-#else
-VP8CPUInfo VP8EncGetCPUInfo = NULL;
-#endif
+//------------------------------------------------------------------------------
+// Initialization
 
 // Speed-critical function pointers. We have to initialize them to the default
 // implementations within VP8EncDspInit().
@@ -724,8 +704,6 @@
 VP8WMetric VP8TDisto16x16;
 VP8QuantizeBlock VP8EncQuantizeBlock;
 VP8BlockCopy VP8Copy4x4;
-VP8BlockCopy VP8Copy8x8;
-VP8BlockCopy VP8Copy16x16;
 
 extern void VP8EncDspInitSSE2(void);
 
@@ -749,19 +727,14 @@
   VP8TDisto16x16 = Disto16x16;
   VP8EncQuantizeBlock = QuantizeBlock;
   VP8Copy4x4 = Copy4x4;
-  VP8Copy8x8 = Copy8x8;
-  VP8Copy16x16 = Copy16x16;
 
   // If defined, use CPUInfo() to overwrite some pointers with faster versions.
-  if (VP8EncGetCPUInfo) {
-    if (VP8EncGetCPUInfo(kSSE2)) {
-#if defined(__SSE2__) || defined(_MSC_VER)
+  if (VP8GetCPUInfo) {
+#if defined(WEBP_USE_SSE2)
+    if (VP8GetCPUInfo(kSSE2)) {
       VP8EncDspInitSSE2();
+    }
 #endif
-    }
-    if (VP8EncGetCPUInfo(kSSE3)) {
-      // later we'll plug some SSE3 variant here
-    }
   }
 }
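
QuantizeBlock() above now walks the coefficients through the file-local kZigzag table instead of the shared VP8Zigzag: scan position n maps to a raster index j inside the 4x4 block, so low-frequency coefficients are visited first. A tiny standalone illustration of the mapping (not part of the patch):

  #include <stdio.h>

  static const unsigned char kZigzag[16] = {
    0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
  };

  int main(void) {
    int n;
    for (n = 0; n < 16; ++n) {
      const int j = kZigzag[n];               /* raster index within the block */
      printf("scan %2d -> row %d, col %d\n", n, j / 4, j % 4);
    }
    return 0;
  }
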
 
diff --git a/src/enc/dsp_sse2.c b/src/dsp/enc_sse2.c
similarity index 98%
rename from src/enc/dsp_sse2.c
rename to src/dsp/enc_sse2.c
index db20e64..b046761 100644
--- a/src/enc/dsp_sse2.c
+++ b/src/dsp/enc_sse2.c
@@ -1,24 +1,27 @@
-// Copyright 2011 Google Inc.
+// Copyright 2011 Google Inc. All Rights Reserved.
 //
 // This code is licensed under the same terms as WebM:
 //  Software License Agreement:  http://www.webmproject.org/license/software/
 //  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
-// SSE2 version of speed-critical functions.
+// SSE2 version of speed-critical encoding functions.
 //
 // Author: Christian Duvivier (cduvivier@google.com)
 
-#if defined(__SSE2__) || defined(_MSC_VER)
+#include "./dsp.h"
+
+#if defined(WEBP_USE_SSE2)
+#include <stdlib.h>  // for abs()
 #include <emmintrin.h>
 
-#include "vp8enci.h"
+#include "../enc/vp8enci.h"
 
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Compute susceptibility based on DCT-coeff histograms:
 // the higher, the "easier" the macroblock is to compress.
 
@@ -29,7 +32,7 @@
   int j, k;
   const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
   for (j = start_block; j < end_block; ++j) {
-    VP8FTransform(ref + VP8Scan[j], pred + VP8Scan[j], out);
+    VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
 
     // Convert coefficients to bin (within out[]).
     {
@@ -64,7 +67,7 @@
   return VP8GetAlpha(histo);
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Transforms (Paragraph 14.4)
 
 // Does one or two inverse transforms.
@@ -436,7 +439,7 @@
   }
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Metric
 
 static int SSE4x4SSE2(const uint8_t* a, const uint8_t* b) {
@@ -485,7 +488,7 @@
   return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Texture distortion
 //
 // We try to match the spectral content (weighted) between source and
@@ -679,7 +682,7 @@
 }
 
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Quantization
 //
 
@@ -831,4 +834,4 @@
 }    // extern "C"
 #endif
 
-#endif   //__SSE2__
+#endif   // WEBP_USE_SSE2
diff --git a/src/dsp/lossless.c b/src/dsp/lossless.c
new file mode 100644
index 0000000..6d3094f
--- /dev/null
+++ b/src/dsp/lossless.c
@@ -0,0 +1,1150 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// Image transforms and color space conversion methods for lossless decoder.
+//
+// Authors: Vikas Arora (vikaas.arora@gmail.com)
+//          Jyrki Alakuijala (jyrki@google.com)
+//          Urvang Joshi (urvang@google.com)
+
+#define ANDROID_WEBP_RGB
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+#include <math.h>
+#include <stdlib.h>
+#include "./lossless.h"
+#include "../dec/vp8li.h"
+#include "../dsp/yuv.h"
+#include "../dsp/dsp.h"
+#include "../enc/histogram.h"
+
+#define MAX_DIFF_COST (1e30f)
+
+// lookup table for small values of log2(int)
+#define APPROX_LOG_MAX  4096
+#define LOG_2_RECIPROCAL 1.44269504088896338700465094007086
+#define LOG_LOOKUP_IDX_MAX 256
+static const float kLog2Table[LOG_LOOKUP_IDX_MAX] = {
+  0.0000000000000000f, 0.0000000000000000f,
+  1.0000000000000000f, 1.5849625007211560f,
+  2.0000000000000000f, 2.3219280948873621f,
+  2.5849625007211560f, 2.8073549220576041f,
+  3.0000000000000000f, 3.1699250014423121f,
+  3.3219280948873621f, 3.4594316186372973f,
+  3.5849625007211560f, 3.7004397181410921f,
+  3.8073549220576041f, 3.9068905956085187f,
+  4.0000000000000000f, 4.0874628412503390f,
+  4.1699250014423121f, 4.2479275134435852f,
+  4.3219280948873626f, 4.3923174227787606f,
+  4.4594316186372973f, 4.5235619560570130f,
+  4.5849625007211560f, 4.6438561897747243f,
+  4.7004397181410917f, 4.7548875021634682f,
+  4.8073549220576037f, 4.8579809951275718f,
+  4.9068905956085187f, 4.9541963103868749f,
+  5.0000000000000000f, 5.0443941193584533f,
+  5.0874628412503390f, 5.1292830169449663f,
+  5.1699250014423121f, 5.2094533656289501f,
+  5.2479275134435852f, 5.2854022188622487f,
+  5.3219280948873626f, 5.3575520046180837f,
+  5.3923174227787606f, 5.4262647547020979f,
+  5.4594316186372973f, 5.4918530963296747f,
+  5.5235619560570130f, 5.5545888516776376f,
+  5.5849625007211560f, 5.6147098441152083f,
+  5.6438561897747243f, 5.6724253419714951f,
+  5.7004397181410917f, 5.7279204545631987f,
+  5.7548875021634682f, 5.7813597135246599f,
+  5.8073549220576037f, 5.8328900141647412f,
+  5.8579809951275718f, 5.8826430493618415f,
+  5.9068905956085187f, 5.9307373375628866f,
+  5.9541963103868749f, 5.9772799234999167f,
+  6.0000000000000000f, 6.0223678130284543f,
+  6.0443941193584533f, 6.0660891904577720f,
+  6.0874628412503390f, 6.1085244567781691f,
+  6.1292830169449663f, 6.1497471195046822f,
+  6.1699250014423121f, 6.1898245588800175f,
+  6.2094533656289501f, 6.2288186904958804f,
+  6.2479275134435852f, 6.2667865406949010f,
+  6.2854022188622487f, 6.3037807481771030f,
+  6.3219280948873626f, 6.3398500028846243f,
+  6.3575520046180837f, 6.3750394313469245f,
+  6.3923174227787606f, 6.4093909361377017f,
+  6.4262647547020979f, 6.4429434958487279f,
+  6.4594316186372973f, 6.4757334309663976f,
+  6.4918530963296747f, 6.5077946401986963f,
+  6.5235619560570130f, 6.5391588111080309f,
+  6.5545888516776376f, 6.5698556083309478f,
+  6.5849625007211560f, 6.5999128421871278f,
+  6.6147098441152083f, 6.6293566200796094f,
+  6.6438561897747243f, 6.6582114827517946f,
+  6.6724253419714951f, 6.6865005271832185f,
+  6.7004397181410917f, 6.7142455176661224f,
+  6.7279204545631987f, 6.7414669864011464f,
+  6.7548875021634682f, 6.7681843247769259f,
+  6.7813597135246599f, 6.7944158663501061f,
+  6.8073549220576037f, 6.8201789624151878f,
+  6.8328900141647412f, 6.8454900509443747f,
+  6.8579809951275718f, 6.8703647195834047f,
+  6.8826430493618415f, 6.8948177633079437f,
+  6.9068905956085187f, 6.9188632372745946f,
+  6.9307373375628866f, 6.9425145053392398f,
+  6.9541963103868749f, 6.9657842846620869f,
+  6.9772799234999167f, 6.9886846867721654f,
+  7.0000000000000000f, 7.0112272554232539f,
+  7.0223678130284543f, 7.0334230015374501f,
+  7.0443941193584533f, 7.0552824355011898f,
+  7.0660891904577720f, 7.0768155970508308f,
+  7.0874628412503390f, 7.0980320829605263f,
+  7.1085244567781691f, 7.1189410727235076f,
+  7.1292830169449663f, 7.1395513523987936f,
+  7.1497471195046822f, 7.1598713367783890f,
+  7.1699250014423121f, 7.1799090900149344f,
+  7.1898245588800175f, 7.1996723448363644f,
+  7.2094533656289501f, 7.2191685204621611f,
+  7.2288186904958804f, 7.2384047393250785f,
+  7.2479275134435852f, 7.2573878426926521f,
+  7.2667865406949010f, 7.2761244052742375f,
+  7.2854022188622487f, 7.2946207488916270f,
+  7.3037807481771030f, 7.3128829552843557f,
+  7.3219280948873626f, 7.3309168781146167f,
+  7.3398500028846243f, 7.3487281542310771f,
+  7.3575520046180837f, 7.3663222142458160f,
+  7.3750394313469245f, 7.3837042924740519f,
+  7.3923174227787606f, 7.4008794362821843f,
+  7.4093909361377017f, 7.4178525148858982f,
+  7.4262647547020979f, 7.4346282276367245f,
+  7.4429434958487279f, 7.4512111118323289f,
+  7.4594316186372973f, 7.4676055500829976f,
+  7.4757334309663976f, 7.4838157772642563f,
+  7.4918530963296747f, 7.4998458870832056f,
+  7.5077946401986963f, 7.5156998382840427f,
+  7.5235619560570130f, 7.5313814605163118f,
+  7.5391588111080309f, 7.5468944598876364f,
+  7.5545888516776376f, 7.5622424242210728f,
+  7.5698556083309478f, 7.5774288280357486f,
+  7.5849625007211560f, 7.5924570372680806f,
+  7.5999128421871278f, 7.6073303137496104f,
+  7.6147098441152083f, 7.6220518194563764f,
+  7.6293566200796094f, 7.6366246205436487f,
+  7.6438561897747243f, 7.6510516911789281f,
+  7.6582114827517946f, 7.6653359171851764f,
+  7.6724253419714951f, 7.6794800995054464f,
+  7.6865005271832185f, 7.6934869574993252f,
+  7.7004397181410917f, 7.7073591320808825f,
+  7.7142455176661224f, 7.7210991887071855f,
+  7.7279204545631987f, 7.7347096202258383f,
+  7.7414669864011464f, 7.7481928495894605f,
+  7.7548875021634682f, 7.7615512324444795f,
+  7.7681843247769259f, 7.7747870596011736f,
+  7.7813597135246599f, 7.7879025593914317f,
+  7.7944158663501061f, 7.8008998999203047f,
+  7.8073549220576037f, 7.8137811912170374f,
+  7.8201789624151878f, 7.8265484872909150f,
+  7.8328900141647412f, 7.8392037880969436f,
+  7.8454900509443747f, 7.8517490414160571f,
+  7.8579809951275718f, 7.8641861446542797f,
+  7.8703647195834047f, 7.8765169465649993f,
+  7.8826430493618415f, 7.8887432488982591f,
+  7.8948177633079437f, 7.9008668079807486f,
+  7.9068905956085187f, 7.9128893362299619f,
+  7.9188632372745946f, 7.9248125036057812f,
+  7.9307373375628866f, 7.9366379390025709f,
+  7.9425145053392398f, 7.9483672315846778f,
+  7.9541963103868749f, 7.9600019320680805f,
+  7.9657842846620869f, 7.9715435539507719f,
+  7.9772799234999167f, 7.9829935746943103f,
+  7.9886846867721654f, 7.9943534368588577f
+};
+
+float VP8LFastLog2(int v) {
+  if (v < LOG_LOOKUP_IDX_MAX) {
+    return kLog2Table[v];
+  } else if (v < APPROX_LOG_MAX) {
+    int log_cnt = 0;
+    while (v >= LOG_LOOKUP_IDX_MAX) {
+      ++log_cnt;
+      v = v >> 1;
+    }
+    return kLog2Table[v] + (float)log_cnt;
+  } else {
+    return (float)(LOG_2_RECIPROCAL * log((double)v));
+  }
+}
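
For arguments between LOG_LOOKUP_IDX_MAX and APPROX_LOG_MAX, VP8LFastLog2() shifts the value down into table range and adds the shift count: v = 1001 is shifted twice down to 250, giving kLog2Table[250] + 2 ~ 7.966 + 2 = 9.966, versus the exact log2(1001) ~ 9.967; the small error comes from the discarded low bits. A minimal check of this, assuming the function above is linked in:

  #include <math.h>
  #include <stdio.h>

  float VP8LFastLog2(int v);    /* provided by lossless.c above */

  int main(void) {
    /* Prints roughly 9.966 (fast) versus 9.967 (exact) for v = 1001. */
    printf("fast: %f  exact: %f\n",
           VP8LFastLog2(1001), log(1001.0) / log(2.0));
    return 0;
  }
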
+
+//------------------------------------------------------------------------------
+// Image transforms.
+
+// In-place sum of each component with mod 256.
+static WEBP_INLINE void AddPixelsEq(uint32_t* a, uint32_t b) {
+  const uint32_t alpha_and_green = (*a & 0xff00ff00u) + (b & 0xff00ff00u);
+  const uint32_t red_and_blue = (*a & 0x00ff00ffu) + (b & 0x00ff00ffu);
+  *a = (alpha_and_green & 0xff00ff00u) | (red_and_blue & 0x00ff00ffu);
+}
+
+static WEBP_INLINE uint32_t Average2(uint32_t a0, uint32_t a1) {
+  return (((a0 ^ a1) & 0xfefefefeL) >> 1) + (a0 & a1);
+}
+
+static WEBP_INLINE uint32_t Average3(uint32_t a0, uint32_t a1, uint32_t a2) {
+  return Average2(Average2(a0, a2), a1);
+}
+
+static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1,
+                                     uint32_t a2, uint32_t a3) {
+  return Average2(Average2(a0, a1), Average2(a2, a3));
+}
+
+static WEBP_INLINE uint32_t Clip255(uint32_t a) {
+  if (a < 256) {
+    return a;
+  }
+  // return 0, when a is a negative integer.
+  // return 255, when a is positive.
+  return ~a >> 24;
+}
+
+static WEBP_INLINE int AddSubtractComponentFull(int a, int b, int c) {
+  return Clip255(a + b - c);
+}
+
+static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1,
+                                                   uint32_t c2) {
+  const int a = AddSubtractComponentFull(c0 >> 24, c1 >> 24, c2 >> 24);
+  const int r = AddSubtractComponentFull((c0 >> 16) & 0xff,
+                                         (c1 >> 16) & 0xff,
+                                         (c2 >> 16) & 0xff);
+  const int g = AddSubtractComponentFull((c0 >> 8) & 0xff,
+                                         (c1 >> 8) & 0xff,
+                                         (c2 >> 8) & 0xff);
+  const int b = AddSubtractComponentFull(c0 & 0xff, c1 & 0xff, c2 & 0xff);
+  return (a << 24) | (r << 16) | (g << 8) | b;
+}
+
+static WEBP_INLINE int AddSubtractComponentHalf(int a, int b) {
+  return Clip255(a + (a - b) / 2);
+}
+
+static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
+                                                   uint32_t c2) {
+  const uint32_t ave = Average2(c0, c1);
+  const int a = AddSubtractComponentHalf(ave >> 24, c2 >> 24);
+  const int r = AddSubtractComponentHalf((ave >> 16) & 0xff, (c2 >> 16) & 0xff);
+  const int g = AddSubtractComponentHalf((ave >> 8) & 0xff, (c2 >> 8) & 0xff);
+  const int b = AddSubtractComponentHalf((ave >> 0) & 0xff, (c2 >> 0) & 0xff);
+  return (a << 24) | (r << 16) | (g << 8) | b;
+}
+
+static WEBP_INLINE int Sub3(int a, int b, int c) {
+  const int pa = b - c;
+  const int pb = a - c;
+  return abs(pa) - abs(pb);
+}
+
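+// Picks whichever of 'a' (top) or 'b' (left) is closer to the gradient
+// estimate b + a - c: the distance from that estimate to 'a' is |b - c| and to
+// 'b' is |a - c|, so 'a' is returned when Sum|b - c| <= Sum|a - c| over the
+// four channels (Sub3() supplies the per-channel |b - c| - |a - c| term).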
+static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
+  const int pa_minus_pb =
+      Sub3((a >> 24)       , (b >> 24)       , (c >> 24)       ) +
+      Sub3((a >> 16) & 0xff, (b >> 16) & 0xff, (c >> 16) & 0xff) +
+      Sub3((a >>  8) & 0xff, (b >>  8) & 0xff, (c >>  8) & 0xff) +
+      Sub3((a      ) & 0xff, (b      ) & 0xff, (c      ) & 0xff);
+
+  return (pa_minus_pb <= 0) ? a : b;
+}
+
+//------------------------------------------------------------------------------
+// Predictors
+
+static uint32_t Predictor0(uint32_t left, const uint32_t* const top) {
+  (void)top;
+  (void)left;
+  return ARGB_BLACK;
+}
+static uint32_t Predictor1(uint32_t left, const uint32_t* const top) {
+  (void)top;
+  return left;
+}
+static uint32_t Predictor2(uint32_t left, const uint32_t* const top) {
+  (void)left;
+  return top[0];
+}
+static uint32_t Predictor3(uint32_t left, const uint32_t* const top) {
+  (void)left;
+  return top[1];
+}
+static uint32_t Predictor4(uint32_t left, const uint32_t* const top) {
+  (void)left;
+  return top[-1];
+}
+static uint32_t Predictor5(uint32_t left, const uint32_t* const top) {
+  const uint32_t pred = Average3(left, top[0], top[1]);
+  return pred;
+}
+static uint32_t Predictor6(uint32_t left, const uint32_t* const top) {
+  const uint32_t pred = Average2(left, top[-1]);
+  return pred;
+}
+static uint32_t Predictor7(uint32_t left, const uint32_t* const top) {
+  const uint32_t pred = Average2(left, top[0]);
+  return pred;
+}
+static uint32_t Predictor8(uint32_t left, const uint32_t* const top) {
+  const uint32_t pred = Average2(top[-1], top[0]);
+  (void)left;
+  return pred;
+}
+static uint32_t Predictor9(uint32_t left, const uint32_t* const top) {
+  const uint32_t pred = Average2(top[0], top[1]);
+  (void)left;
+  return pred;
+}
+static uint32_t Predictor10(uint32_t left, const uint32_t* const top) {
+  const uint32_t pred = Average4(left, top[-1], top[0], top[1]);
+  return pred;
+}
+static uint32_t Predictor11(uint32_t left, const uint32_t* const top) {
+  const uint32_t pred = Select(top[0], left, top[-1]);
+  return pred;
+}
+static uint32_t Predictor12(uint32_t left, const uint32_t* const top) {
+  const uint32_t pred = ClampedAddSubtractFull(left, top[0], top[-1]);
+  return pred;
+}
+static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
+  const uint32_t pred = ClampedAddSubtractHalf(left, top[0], top[-1]);
+  return pred;
+}
+
+typedef uint32_t (*PredictorFunc)(uint32_t left, const uint32_t* const top);
+static const PredictorFunc kPredictors[16] = {
+  Predictor0, Predictor1, Predictor2, Predictor3,
+  Predictor4, Predictor5, Predictor6, Predictor7,
+  Predictor8, Predictor9, Predictor10, Predictor11,
+  Predictor12, Predictor13,
+  Predictor0, Predictor0    // <- padding security sentinels
+};
+
+// TODO(vikasa): Replace 256 etc with defines.
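+// Returns a negative cost bonus that favors residual histograms concentrated
+// around zero: counts[0] is weighted by 'weight_0', and the first 16 symbols
+// on either side of zero (counts[i] and counts[256 - i]) receive exponentially
+// decaying weights starting from 'exp_val'.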
+static float PredictionCostSpatial(const int* counts,
+                                   int weight_0, double exp_val) {
+  const int significant_symbols = 16;
+  const double exp_decay_factor = 0.6;
+  double bits = weight_0 * counts[0];
+  int i;
+  for (i = 1; i < significant_symbols; ++i) {
+    bits += exp_val * (counts[i] + counts[256 - i]);
+    exp_val *= exp_decay_factor;
+  }
+  return (float)(-0.1 * bits);
+}
+
+// Compute Shannon's entropy of the counts: -Sum(p * log2(p)), scaled by the
+// total count (i.e. the number of bits needed to code the distribution).
+static float ShannonEntropy(const int* const array, int n) {
+  int i;
+  float retval = 0.f;
+  int sum = 0;
+  for (i = 0; i < n; ++i) {
+    if (array[i] != 0) {
+      sum += array[i];
+      retval -= VP8LFastSLog2(array[i]);
+    }
+  }
+  retval += VP8LFastSLog2(sum);
+  return retval;
+}
+
+static float PredictionCostSpatialHistogram(int accumulated[4][256],
+                                            int tile[4][256]) {
+  int i;
+  int k;
+  int combo[256];
+  double retval = 0;
+  for (i = 0; i < 4; ++i) {
+    const double exp_val = 0.94;
+    retval += PredictionCostSpatial(&tile[i][0], 1, exp_val);
+    retval += ShannonEntropy(&tile[i][0], 256);
+    for (k = 0; k < 256; ++k) {
+      combo[k] = accumulated[i][k] + tile[i][k];
+    }
+    retval += ShannonEntropy(&combo[0], 256);
+  }
+  return (float)retval;
+}
+
+static int GetBestPredictorForTile(int width, int height,
+                                   int tile_x, int tile_y, int bits,
+                                   int accumulated[4][256],
+                                   const uint32_t* const argb_scratch) {
+  const int kNumPredModes = 14;
+  const int col_start = tile_x << bits;
+  const int row_start = tile_y << bits;
+  const int tile_size = 1 << bits;
+  const int ymax = (tile_size <= height - row_start) ?
+      tile_size : height - row_start;
+  const int xmax = (tile_size <= width - col_start) ?
+      tile_size : width - col_start;
+  int histo[4][256];
+  float best_diff = MAX_DIFF_COST;
+  int best_mode = 0;
+
+  int mode;
+  for (mode = 0; mode < kNumPredModes; ++mode) {
+    const uint32_t* current_row = argb_scratch;
+    const PredictorFunc pred_func = kPredictors[mode];
+    float cur_diff;
+    int y;
+    memset(&histo[0][0], 0, sizeof(histo));
+    for (y = 0; y < ymax; ++y) {
+      int x;
+      const int row = row_start + y;
+      const uint32_t* const upper_row = current_row;
+      current_row = upper_row + width;
+      for (x = 0; x < xmax; ++x) {
+        const int col = col_start + x;
+        uint32_t predict;
+        uint32_t predict_diff;
+        if (row == 0) {
+          predict = (col == 0) ? ARGB_BLACK : current_row[col - 1];  // Left.
+        } else if (col == 0) {
+          predict = upper_row[col];  // Top.
+        } else {
+          predict = pred_func(current_row[col - 1], upper_row + col);
+        }
+        predict_diff = VP8LSubPixels(current_row[col], predict);
+        ++histo[0][predict_diff >> 24];
+        ++histo[1][((predict_diff >> 16) & 0xff)];
+        ++histo[2][((predict_diff >> 8) & 0xff)];
+        ++histo[3][(predict_diff & 0xff)];
+      }
+    }
+    cur_diff = PredictionCostSpatialHistogram(accumulated, histo);
+    if (cur_diff < best_diff) {
+      best_diff = cur_diff;
+      best_mode = mode;
+    }
+  }
+
+  return best_mode;
+}
+
+static void CopyTileWithPrediction(int width, int height,
+                                   int tile_x, int tile_y, int bits, int mode,
+                                   const uint32_t* const argb_scratch,
+                                   uint32_t* const argb) {
+  const int col_start = tile_x << bits;
+  const int row_start = tile_y << bits;
+  const int tile_size = 1 << bits;
+  const int ymax = (tile_size <= height - row_start) ?
+      tile_size : height - row_start;
+  const int xmax = (tile_size <= width - col_start) ?
+      tile_size : width - col_start;
+  const PredictorFunc pred_func = kPredictors[mode];
+  const uint32_t* current_row = argb_scratch;
+
+  int y;
+  for (y = 0; y < ymax; ++y) {
+    int x;
+    const int row = row_start + y;
+    const uint32_t* const upper_row = current_row;
+    current_row = upper_row + width;
+    for (x = 0; x < xmax; ++x) {
+      const int col = col_start + x;
+      const int pix = row * width + col;
+      uint32_t predict;
+      if (row == 0) {
+        predict = (col == 0) ? ARGB_BLACK : current_row[col - 1];  // Left.
+      } else if (col == 0) {
+        predict = upper_row[col];  // Top.
+      } else {
+        predict = pred_func(current_row[col - 1], upper_row + col);
+      }
+      argb[pix] = VP8LSubPixels(current_row[col], predict);
+    }
+  }
+}
+
+void VP8LResidualImage(int width, int height, int bits,
+                       uint32_t* const argb, uint32_t* const argb_scratch,
+                       uint32_t* const image) {
+  const int max_tile_size = 1 << bits;
+  const int tiles_per_row = VP8LSubSampleSize(width, bits);
+  const int tiles_per_col = VP8LSubSampleSize(height, bits);
+  uint32_t* const upper_row = argb_scratch;
+  uint32_t* const current_tile_rows = argb_scratch + width;
+  int tile_y;
+  int histo[4][256];
+  memset(histo, 0, sizeof(histo));
+  for (tile_y = 0; tile_y < tiles_per_col; ++tile_y) {
+    const int tile_y_offset = tile_y * max_tile_size;
+    const int this_tile_height =
+        (tile_y < tiles_per_col - 1) ? max_tile_size : height - tile_y_offset;
+    int tile_x;
+    if (tile_y > 0) {
+      memcpy(upper_row, current_tile_rows + (max_tile_size - 1) * width,
+             width * sizeof(*upper_row));
+    }
+    memcpy(current_tile_rows, &argb[tile_y_offset * width],
+           this_tile_height * width * sizeof(*current_tile_rows));
+    for (tile_x = 0; tile_x < tiles_per_row; ++tile_x) {
+      int pred;
+      int y;
+      const int tile_x_offset = tile_x * max_tile_size;
+      int all_x_max = tile_x_offset + max_tile_size;
+      if (all_x_max > width) {
+        all_x_max = width;
+      }
+      pred = GetBestPredictorForTile(width, height, tile_x, tile_y, bits, histo,
+                                     argb_scratch);
+      image[tile_y * tiles_per_row + tile_x] = 0xff000000u | (pred << 8);
+      CopyTileWithPrediction(width, height, tile_x, tile_y, bits, pred,
+                             argb_scratch, argb);
+      for (y = 0; y < max_tile_size; ++y) {
+        int ix;
+        int all_x;
+        int all_y = tile_y_offset + y;
+        if (all_y >= height) {
+          break;
+        }
+        ix = all_y * width + tile_x_offset;
+        for (all_x = tile_x_offset; all_x < all_x_max; ++all_x, ++ix) {
+          const uint32_t a = argb[ix];
+          ++histo[0][a >> 24];
+          ++histo[1][((a >> 16) & 0xff)];
+          ++histo[2][((a >> 8) & 0xff)];
+          ++histo[3][(a & 0xff)];
+        }
+      }
+    }
+  }
+}
+
+// Inverse prediction.
+static void PredictorInverseTransform(const VP8LTransform* const transform,
+                                      int y_start, int y_end, uint32_t* data) {
+  const int width = transform->xsize_;
+  if (y_start == 0) {  // First Row follows the L (mode=1) mode.
+    int x;
+    const uint32_t pred0 = Predictor0(data[-1], NULL);
+    AddPixelsEq(data, pred0);
+    for (x = 1; x < width; ++x) {
+      const uint32_t pred1 = Predictor1(data[x - 1], NULL);
+      AddPixelsEq(data + x, pred1);
+    }
+    data += width;
+    ++y_start;
+  }
+
+  {
+    int y = y_start;
+    const int mask = (1 << transform->bits_) - 1;
+    const int tiles_per_row = VP8LSubSampleSize(width, transform->bits_);
+    const uint32_t* pred_mode_base =
+        transform->data_ + (y >> transform->bits_) * tiles_per_row;
+
+    while (y < y_end) {
+      int x;
+      const uint32_t pred2 = Predictor2(data[-1], data - width);
+      const uint32_t* pred_mode_src = pred_mode_base;
+      PredictorFunc pred_func;
+
+      // First pixel follows the T (mode=2) mode.
+      AddPixelsEq(data, pred2);
+
+      // .. the rest:
+      pred_func = kPredictors[((*pred_mode_src++) >> 8) & 0xf];
+      for (x = 1; x < width; ++x) {
+        uint32_t pred;
+        if ((x & mask) == 0) {    // start of tile. Read predictor function.
+          pred_func = kPredictors[((*pred_mode_src++) >> 8) & 0xf];
+        }
+        pred = pred_func(data[x - 1], data + x - width);
+        AddPixelsEq(data + x, pred);
+      }
+      data += width;
+      ++y;
+      if ((y & mask) == 0) {   // Use the same mask, since tiles are squares.
+        pred_mode_base += tiles_per_row;
+      }
+    }
+  }
+}
+
+void VP8LSubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixs) {
+  int i;
+  for (i = 0; i < num_pixs; ++i) {
+    const uint32_t argb = argb_data[i];
+    const uint32_t green = (argb >> 8) & 0xff;
+    const uint32_t new_r = (((argb >> 16) & 0xff) - green) & 0xff;
+    const uint32_t new_b = ((argb & 0xff) - green) & 0xff;
+    argb_data[i] = (argb & 0xff00ff00) | (new_r << 16) | new_b;
+  }
+}
+
+// Add green to blue and red channels (i.e. perform the inverse transform of
+// 'subtract green').
+static void AddGreenToBlueAndRed(const VP8LTransform* const transform,
+                                 int y_start, int y_end, uint32_t* data) {
+  const int width = transform->xsize_;
+  const uint32_t* const data_end = data + (y_end - y_start) * width;
+  while (data < data_end) {
+    const uint32_t argb = *data;
+    // Note: (green << 16) | green is the same as green * 0x00010001u.
+    const uint32_t green = ((argb >> 8) & 0xff);
+    uint32_t red_blue = (argb & 0x00ff00ffu);
+    red_blue += (green << 16) | green;
+    red_blue &= 0x00ff00ffu;
+    *data++ = (argb & 0xff00ff00u) | red_blue;
+  }
+}
+
+typedef struct {
+  // Note: the members are uint8_t, so that any negative values are
+  // automatically converted to "mod 256" values.
+  uint8_t green_to_red_;
+  uint8_t green_to_blue_;
+  uint8_t red_to_blue_;
+} Multipliers;
+
+static WEBP_INLINE void MultipliersClear(Multipliers* m) {
+  m->green_to_red_ = 0;
+  m->green_to_blue_ = 0;
+  m->red_to_blue_ = 0;
+}
+
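+// The multipliers are signed 8-bit values acting, in effect, as fixed-point
+// numbers with 5 fractional bits; the product is scaled back with '>> 5'.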
+static WEBP_INLINE uint32_t ColorTransformDelta(int8_t color_pred,
+                                                int8_t color) {
+  return (uint32_t)((int)(color_pred) * color) >> 5;
+}
+
+static WEBP_INLINE void ColorCodeToMultipliers(uint32_t color_code,
+                                               Multipliers* const m) {
+  m->green_to_red_  = (color_code >>  0) & 0xff;
+  m->green_to_blue_ = (color_code >>  8) & 0xff;
+  m->red_to_blue_   = (color_code >> 16) & 0xff;
+}
+
+static WEBP_INLINE uint32_t MultipliersToColorCode(Multipliers* const m) {
+  return 0xff000000u |
+         ((uint32_t)(m->red_to_blue_) << 16) |
+         ((uint32_t)(m->green_to_blue_) << 8) |
+         m->green_to_red_;
+}
+
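+// Applies (inverse == 0) or undoes (inverse == 1) the cross-color transform on
+// one ARGB pixel: green adjusts red, and both green and red adjust blue. Note
+// that the inverse path must use the already-restored red value when adding
+// back the red-to-blue contribution.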
+static WEBP_INLINE uint32_t TransformColor(const Multipliers* const m,
+                                           uint32_t argb, int inverse) {
+  const uint32_t green = argb >> 8;
+  const uint32_t red = argb >> 16;
+  uint32_t new_red = red;
+  uint32_t new_blue = argb;
+
+  if (inverse) {
+    new_red += ColorTransformDelta(m->green_to_red_, green);
+    new_red &= 0xff;
+    new_blue += ColorTransformDelta(m->green_to_blue_, green);
+    new_blue += ColorTransformDelta(m->red_to_blue_, new_red);
+    new_blue &= 0xff;
+  } else {
+    new_red -= ColorTransformDelta(m->green_to_red_, green);
+    new_red &= 0xff;
+    new_blue -= ColorTransformDelta(m->green_to_blue_, green);
+    new_blue -= ColorTransformDelta(m->red_to_blue_, red);
+    new_blue &= 0xff;
+  }
+  return (argb & 0xff00ff00u) | (new_red << 16) | (new_blue);
+}
+
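+// Heuristic for histogram gathering: returns true for pixels that simply
+// repeat the row above or the three previous pixels. Such runs are cheap to
+// code with backward references, so they should not bias the transform
+// selection.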
+static WEBP_INLINE int SkipRepeatedPixels(const uint32_t* const argb,
+                                          int ix, int xsize) {
+  const uint32_t v = argb[ix];
+  if (ix >= xsize + 3) {
+    if (v == argb[ix - xsize] &&
+        argb[ix - 1] == argb[ix - xsize - 1] &&
+        argb[ix - 2] == argb[ix - xsize - 2] &&
+        argb[ix - 3] == argb[ix - xsize - 3]) {
+      return 1;
+    }
+    return v == argb[ix - 3] && v == argb[ix - 2] && v == argb[ix - 1];
+  } else if (ix >= 3) {
+    return v == argb[ix - 3] && v == argb[ix - 2] && v == argb[ix - 1];
+  }
+  return 0;
+}
+
+static float PredictionCostCrossColor(const int accumulated[256],
+                                      const int counts[256]) {
+  // Favor low entropy, locally and globally.
+  int i;
+  int combo[256];
+  for (i = 0; i < 256; ++i) {
+    combo[i] = accumulated[i] + counts[i];
+  }
+  return ShannonEntropy(combo, 256) +
+         ShannonEntropy(counts, 256) +
+         PredictionCostSpatial(counts, 3, 2.4);  // Favor small absolute values.
+}
+
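+// Two-pass search: green_to_red_ is first chosen over [-64, 64] (in half
+// steps) by minimizing the red-channel prediction cost; green_to_blue_ and
+// red_to_blue_ are then chosen jointly over [-32, 32] using the blue-channel
+// cost. Small bonuses favor multipliers that are zero or that match the left
+// (prevX) and top (prevY) tiles.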
+static Multipliers GetBestColorTransformForTile(
+    int tile_x, int tile_y, int bits,
+    Multipliers prevX,
+    Multipliers prevY,
+    int step, int xsize, int ysize,
+    int* accumulated_red_histo,
+    int* accumulated_blue_histo,
+    const uint32_t* const argb) {
+  float best_diff = MAX_DIFF_COST;
+  float cur_diff;
+  const int halfstep = step / 2;
+  const int max_tile_size = 1 << bits;
+  const int tile_y_offset = tile_y * max_tile_size;
+  const int tile_x_offset = tile_x * max_tile_size;
+  int green_to_red;
+  int green_to_blue;
+  int red_to_blue;
+  int all_x_max = tile_x_offset + max_tile_size;
+  int all_y_max = tile_y_offset + max_tile_size;
+  Multipliers best_tx;
+  MultipliersClear(&best_tx);
+  if (all_x_max > xsize) {
+    all_x_max = xsize;
+  }
+  if (all_y_max > ysize) {
+    all_y_max = ysize;
+  }
+  for (green_to_red = -64; green_to_red <= 64; green_to_red += halfstep) {
+    int histo[256] = { 0 };
+    int all_y;
+    Multipliers tx;
+    MultipliersClear(&tx);
+    tx.green_to_red_ = green_to_red & 0xff;
+
+    for (all_y = tile_y_offset; all_y < all_y_max; ++all_y) {
+      uint32_t predict;
+      int ix = all_y * xsize + tile_x_offset;
+      int all_x;
+      for (all_x = tile_x_offset; all_x < all_x_max; ++all_x, ++ix) {
+        if (SkipRepeatedPixels(argb, ix, xsize)) {
+          continue;
+        }
+        predict = TransformColor(&tx, argb[ix], 0);
+        ++histo[(predict >> 16) & 0xff];  // red.
+      }
+    }
+    cur_diff = PredictionCostCrossColor(&accumulated_red_histo[0], &histo[0]);
+    if (tx.green_to_red_ == prevX.green_to_red_) {
+      cur_diff -= 3;  // favor keeping the areas locally similar
+    }
+    if (tx.green_to_red_ == prevY.green_to_red_) {
+      cur_diff -= 3;  // favor keeping the areas locally similar
+    }
+    if (tx.green_to_red_ == 0) {
+      cur_diff -= 3;
+    }
+    if (cur_diff < best_diff) {
+      best_diff = cur_diff;
+      best_tx = tx;
+    }
+  }
+  best_diff = MAX_DIFF_COST;
+  green_to_red = best_tx.green_to_red_;
+  for (green_to_blue = -32; green_to_blue <= 32; green_to_blue += step) {
+    for (red_to_blue = -32; red_to_blue <= 32; red_to_blue += step) {
+      int all_y;
+      int histo[256] = { 0 };
+      Multipliers tx;
+      tx.green_to_red_ = green_to_red;
+      tx.green_to_blue_ = green_to_blue;
+      tx.red_to_blue_ = red_to_blue;
+      for (all_y = tile_y_offset; all_y < all_y_max; ++all_y) {
+        uint32_t predict;
+        int all_x;
+        int ix = all_y * xsize + tile_x_offset;
+        for (all_x = tile_x_offset; all_x < all_x_max; ++all_x, ++ix) {
+          if (SkipRepeatedPixels(argb, ix, xsize)) {
+            continue;
+          }
+          predict = TransformColor(&tx, argb[ix], 0);
+          ++histo[predict & 0xff];  // blue.
+        }
+      }
+      cur_diff =
+        PredictionCostCrossColor(&accumulated_blue_histo[0], &histo[0]);
+      if (tx.green_to_blue_ == prevX.green_to_blue_) {
+        cur_diff -= 3;  // favor keeping the areas locally similar
+      }
+      if (tx.green_to_blue_ == prevY.green_to_blue_) {
+        cur_diff -= 3;  // favor keeping the areas locally similar
+      }
+      if (tx.red_to_blue_ == prevX.red_to_blue_) {
+        cur_diff -= 3;  // favor keeping the areas locally similar
+      }
+      if (tx.red_to_blue_ == prevY.red_to_blue_) {
+        cur_diff -= 3;  // favor keeping the areas locally similar
+      }
+      if (tx.green_to_blue_ == 0) {
+        cur_diff -= 3;
+      }
+      if (tx.red_to_blue_ == 0) {
+        cur_diff -= 3;
+      }
+      if (cur_diff < best_diff) {
+        best_diff = cur_diff;
+        best_tx = tx;
+      }
+    }
+  }
+  return best_tx;
+}
+
+static void CopyTileWithColorTransform(int xsize, int ysize,
+                                       int tile_x, int tile_y, int bits,
+                                       Multipliers color_transform,
+                                       uint32_t* const argb) {
+  int y;
+  int xscan = 1 << bits;
+  int yscan = 1 << bits;
+  tile_x <<= bits;
+  tile_y <<= bits;
+  if (xscan > xsize - tile_x) {
+    xscan = xsize - tile_x;
+  }
+  if (yscan > ysize - tile_y) {
+    yscan = ysize - tile_y;
+  }
+  yscan += tile_y;
+  for (y = tile_y; y < yscan; ++y) {
+    int ix = y * xsize + tile_x;
+    const int end_ix = ix + xscan;
+    for (; ix < end_ix; ++ix) {
+      argb[ix] = TransformColor(&color_transform, argb[ix], 0);
+    }
+  }
+}
+
+void VP8LColorSpaceTransform(int width, int height, int bits, int step,
+                             uint32_t* const argb, uint32_t* image) {
+  const int max_tile_size = 1 << bits;
+  int tile_xsize = VP8LSubSampleSize(width, bits);
+  int tile_ysize = VP8LSubSampleSize(height, bits);
+  int accumulated_red_histo[256] = { 0 };
+  int accumulated_blue_histo[256] = { 0 };
+  int tile_y;
+  int tile_x;
+  Multipliers prevX;
+  Multipliers prevY;
+  MultipliersClear(&prevY);
+  MultipliersClear(&prevX);
+  for (tile_y = 0; tile_y < tile_ysize; ++tile_y) {
+    for (tile_x = 0; tile_x < tile_xsize; ++tile_x) {
+      Multipliers color_transform;
+      int all_x_max;
+      int y;
+      const int tile_y_offset = tile_y * max_tile_size;
+      const int tile_x_offset = tile_x * max_tile_size;
+      if (tile_y != 0) {
+        ColorCodeToMultipliers(image[tile_y * tile_xsize + tile_x - 1], &prevX);
+        ColorCodeToMultipliers(image[(tile_y - 1) * tile_xsize + tile_x],
+                               &prevY);
+      } else if (tile_x != 0) {
+        ColorCodeToMultipliers(image[tile_y * tile_xsize + tile_x - 1], &prevX);
+      }
+      color_transform =
+          GetBestColorTransformForTile(tile_x, tile_y, bits,
+                                       prevX, prevY,
+                                       step, width, height,
+                                       &accumulated_red_histo[0],
+                                       &accumulated_blue_histo[0],
+                                       argb);
+      image[tile_y * tile_xsize + tile_x] =
+          MultipliersToColorCode(&color_transform);
+      CopyTileWithColorTransform(width, height, tile_x, tile_y, bits,
+                                 color_transform, argb);
+
+      // Gather accumulated histogram data.
+      all_x_max = tile_x_offset + max_tile_size;
+      if (all_x_max > width) {
+        all_x_max = width;
+      }
+      for (y = 0; y < max_tile_size; ++y) {
+        int ix;
+        int all_x;
+        int all_y = tile_y_offset + y;
+        if (all_y >= height) {
+          break;
+        }
+        ix = all_y * width + tile_x_offset;
+        for (all_x = tile_x_offset; all_x < all_x_max; ++all_x, ++ix) {
+          if (ix >= 2 &&
+              argb[ix] == argb[ix - 2] &&
+              argb[ix] == argb[ix - 1]) {
+            continue;  // repeated pixels are handled by backward references
+          }
+          if (ix >= width + 2 &&
+              argb[ix - 2] == argb[ix - width - 2] &&
+              argb[ix - 1] == argb[ix - width - 1] &&
+              argb[ix] == argb[ix - width]) {
+            continue;  // repeated pixels are handled by backward references
+          }
+          ++accumulated_red_histo[(argb[ix] >> 16) & 0xff];
+          ++accumulated_blue_histo[argb[ix] & 0xff];
+        }
+      }
+    }
+  }
+}
+
+// Color space inverse transform.
+static void ColorSpaceInverseTransform(const VP8LTransform* const transform,
+                                       int y_start, int y_end, uint32_t* data) {
+  const int width = transform->xsize_;
+  const int mask = (1 << transform->bits_) - 1;
+  const int tiles_per_row = VP8LSubSampleSize(width, transform->bits_);
+  int y = y_start;
+  const uint32_t* pred_row =
+      transform->data_ + (y >> transform->bits_) * tiles_per_row;
+
+  while (y < y_end) {
+    const uint32_t* pred = pred_row;
+    Multipliers m = { 0, 0, 0 };
+    int x;
+
+    for (x = 0; x < width; ++x) {
+      if ((x & mask) == 0) ColorCodeToMultipliers(*pred++, &m);
+      data[x] = TransformColor(&m, data[x], 1);
+    }
+    data += width;
+    ++y;
+    if ((y & mask) == 0) pred_row += tiles_per_row;
+  }
+}
+
+// Separate out pixels packed together using pixel-bundling.
+static void ColorIndexInverseTransform(
+    const VP8LTransform* const transform,
+    int y_start, int y_end, const uint32_t* src, uint32_t* dst) {
+  int y;
+  const int bits_per_pixel = 8 >> transform->bits_;
+  const int width = transform->xsize_;
+  const uint32_t* const color_map = transform->data_;
+  if (bits_per_pixel < 8) {
+    const int pixels_per_byte = 1 << transform->bits_;
+    const int count_mask = pixels_per_byte - 1;
+    const uint32_t bit_mask = (1 << bits_per_pixel) - 1;
+    for (y = y_start; y < y_end; ++y) {
+      uint32_t packed_pixels = 0;
+      int x;
+      for (x = 0; x < width; ++x) {
+        // We need to load fresh 'packed_pixels' once every 'pixels_per_byte'
+        // increments of x. Fortunately, pixels_per_byte is a power of 2, so
+        // we can just use a mask instead of decrementing a counter.
+        if ((x & count_mask) == 0) packed_pixels = ((*src++) >> 8) & 0xff;
+        *dst++ = color_map[packed_pixels & bit_mask];
+        packed_pixels >>= bits_per_pixel;
+      }
+    }
+  } else {
+    for (y = y_start; y < y_end; ++y) {
+      int x;
+      for (x = 0; x < width; ++x) {
+        *dst++ = color_map[((*src++) >> 8) & 0xff];
+      }
+    }
+  }
+}
+
+void VP8LInverseTransform(const VP8LTransform* const transform,
+                          int row_start, int row_end,
+                          const uint32_t* const in, uint32_t* const out) {
+  assert(row_start < row_end);
+  assert(row_end <= transform->ysize_);
+  switch (transform->type_) {
+    case SUBTRACT_GREEN:
+      AddGreenToBlueAndRed(transform, row_start, row_end, out);
+      break;
+    case PREDICTOR_TRANSFORM:
+      PredictorInverseTransform(transform, row_start, row_end, out);
+      if (row_end != transform->ysize_) {
+        // The last predicted row in this iteration will be the top-pred row
+        // for the first row in next iteration.
+        const int width = transform->xsize_;
+        memcpy(out - width, out + (row_end - row_start - 1) * width,
+               width * sizeof(*out));
+      }
+      break;
+    case CROSS_COLOR_TRANSFORM:
+      ColorSpaceInverseTransform(transform, row_start, row_end, out);
+      break;
+    case COLOR_INDEXING_TRANSFORM:
+      if (in == out && transform->bits_ > 0) {
+        // Move packed pixels to the end of the unpacked region, so that
+        // unpacking can occur seamlessly.
+        // Also note that this is the only transform that operates on a
+        // reduced row width of VP8LSubSampleSize(xsize_, bits_); all other
+        // transforms work on the full width xsize_.
+        const int out_stride = (row_end - row_start) * transform->xsize_;
+        const int in_stride = (row_end - row_start) *
+            VP8LSubSampleSize(transform->xsize_, transform->bits_);
+        uint32_t* const src = out + out_stride - in_stride;
+        memmove(src, out, in_stride * sizeof(*src));
+        ColorIndexInverseTransform(transform, row_start, row_end, src, out);
+      } else {
+        ColorIndexInverseTransform(transform, row_start, row_end, in, out);
+      }
+      break;
+  }
+}
+
+//------------------------------------------------------------------------------
+// Color space conversion.
+
+static int is_big_endian(void) {
+  static const union {
+    uint16_t w;
+    uint8_t b[2];
+  } tmp = { 1 };
+  return (tmp.b[0] != 1);
+}
+
+static void ConvertBGRAToRGB(const uint32_t* src,
+                             int num_pixels, uint8_t* dst) {
+  const uint32_t* const src_end = src + num_pixels;
+  while (src < src_end) {
+    const uint32_t argb = *src++;
+    *dst++ = (argb >> 16) & 0xff;
+    *dst++ = (argb >>  8) & 0xff;
+    *dst++ = (argb >>  0) & 0xff;
+  }
+}
+
+static void ConvertBGRAToRGBA(const uint32_t* src,
+                              int num_pixels, uint8_t* dst) {
+  const uint32_t* const src_end = src + num_pixels;
+  while (src < src_end) {
+    const uint32_t argb = *src++;
+    *dst++ = (argb >> 16) & 0xff;
+    *dst++ = (argb >>  8) & 0xff;
+    *dst++ = (argb >>  0) & 0xff;
+    *dst++ = (argb >> 24) & 0xff;
+  }
+}
+
+static void ConvertBGRAToRGBA4444(const uint32_t* src,
+                                  int num_pixels, uint8_t* dst) {
+  const uint32_t* const src_end = src + num_pixels;
+  while (src < src_end) {
+    const uint32_t argb = *src++;
+#ifdef ANDROID_WEBP_RGB
+    *dst++ = ((argb >>  0) & 0xf0) | ((argb >> 28) & 0xf);
+    *dst++ = ((argb >> 16) & 0xf0) | ((argb >> 12) & 0xf);
+#else
+    *dst++ = ((argb >> 16) & 0xf0) | ((argb >> 12) & 0xf);
+    *dst++ = ((argb >>  0) & 0xf0) | ((argb >> 28) & 0xf);
+#endif
+  }
+}
+
+static void ConvertBGRAToRGB565(const uint32_t* src,
+                                int num_pixels, uint8_t* dst) {
+  const uint32_t* const src_end = src + num_pixels;
+  while (src < src_end) {
+    const uint32_t argb = *src++;
+#ifdef ANDROID_WEBP_RGB
+    *dst++ = ((argb >>  5) & 0xe0) | ((argb >>  3) & 0x1f);
+    *dst++ = ((argb >> 16) & 0xf8) | ((argb >> 13) & 0x7);
+#else
+    *dst++ = ((argb >> 16) & 0xf8) | ((argb >> 13) & 0x7);
+    *dst++ = ((argb >>  5) & 0xe0) | ((argb >>  3) & 0x1f);
+#endif
+  }
+}
+
+static void ConvertBGRAToBGR(const uint32_t* src,
+                             int num_pixels, uint8_t* dst) {
+  const uint32_t* const src_end = src + num_pixels;
+  while (src < src_end) {
+    const uint32_t argb = *src++;
+    *dst++ = (argb >>  0) & 0xff;
+    *dst++ = (argb >>  8) & 0xff;
+    *dst++ = (argb >> 16) & 0xff;
+  }
+}
+
+static void CopyOrSwap(const uint32_t* src, int num_pixels, uint8_t* dst,
+                       int swap_on_big_endian) {
+  if (is_big_endian() == swap_on_big_endian) {
+    const uint32_t* const src_end = src + num_pixels;
+    while (src < src_end) {
+      uint32_t argb = *src++;
+#if !defined(__BIG_ENDIAN__) && (defined(__i386__) || defined(__x86_64__))
+      __asm__ volatile("bswap %0" : "=r"(argb) : "0"(argb));
+      *(uint32_t*)dst = argb;
+      dst += sizeof(argb);
+#elif !defined(__BIG_ENDIAN__) && defined(_MSC_VER)
+      argb = _byteswap_ulong(argb);
+      *(uint32_t*)dst = argb;
+      dst += sizeof(argb);
+#else
+      *dst++ = (argb >> 24) & 0xff;
+      *dst++ = (argb >> 16) & 0xff;
+      *dst++ = (argb >>  8) & 0xff;
+      *dst++ = (argb >>  0) & 0xff;
+#endif
+    }
+  } else {
+    memcpy(dst, src, num_pixels * sizeof(*src));
+  }
+}
+
+void VP8LConvertFromBGRA(const uint32_t* const in_data, int num_pixels,
+                         WEBP_CSP_MODE out_colorspace, uint8_t* const rgba) {
+  switch (out_colorspace) {
+    case MODE_RGB:
+      ConvertBGRAToRGB(in_data, num_pixels, rgba);
+      break;
+    case MODE_RGBA:
+      ConvertBGRAToRGBA(in_data, num_pixels, rgba);
+      break;
+    case MODE_rgbA:
+      ConvertBGRAToRGBA(in_data, num_pixels, rgba);
+      WebPApplyAlphaMultiply(rgba, 0, num_pixels, 1, 0);
+      break;
+    case MODE_BGR:
+      ConvertBGRAToBGR(in_data, num_pixels, rgba);
+      break;
+    case MODE_BGRA:
+      CopyOrSwap(in_data, num_pixels, rgba, 1);
+      break;
+    case MODE_bgrA:
+      CopyOrSwap(in_data, num_pixels, rgba, 1);
+      WebPApplyAlphaMultiply(rgba, 0, num_pixels, 1, 0);
+      break;
+    case MODE_ARGB:
+      CopyOrSwap(in_data, num_pixels, rgba, 0);
+      break;
+    case MODE_Argb:
+      CopyOrSwap(in_data, num_pixels, rgba, 0);
+      WebPApplyAlphaMultiply(rgba, 1, num_pixels, 1, 0);
+      break;
+    case MODE_RGBA_4444:
+      ConvertBGRAToRGBA4444(in_data, num_pixels, rgba);
+      break;
+    case MODE_rgbA_4444:
+      ConvertBGRAToRGBA4444(in_data, num_pixels, rgba);
+      WebPApplyAlphaMultiply4444(rgba, num_pixels, 1, 0);
+      break;
+    case MODE_RGB_565:
+      ConvertBGRAToRGB565(in_data, num_pixels, rgba);
+      break;
+    default:
+      assert(0);          // Code flow should not reach here.
+  }
+}
+
+//------------------------------------------------------------------------------
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/src/dsp/lossless.h b/src/dsp/lossless.h
new file mode 100644
index 0000000..22a91cb
--- /dev/null
+++ b/src/dsp/lossless.h
@@ -0,0 +1,82 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// Image transforms and color space conversion methods for lossless decoder.
+//
+// Authors: Vikas Arora (vikaas.arora@gmail.com)
+//          Jyrki Alakuijala (jyrki@google.com)
+
+#ifndef WEBP_DSP_LOSSLESS_H_
+#define WEBP_DSP_LOSSLESS_H_
+
+#include "webp/types.h"
+#include "webp/decode.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+//------------------------------------------------------------------------------
+// Image transforms.
+
+struct VP8LTransform;  // Defined in dec/vp8li.h.
+
+// Performs the inverse transform of the data, given the transform information
+// and the start and end rows. The transform is applied to rows
+// [row_start, row_end). The 'in' and 'out' pointers refer to the source and
+// destination data, both corresponding to the first row to process
+// (row_start).
+void VP8LInverseTransform(const struct VP8LTransform* const transform,
+                          int row_start, int row_end,
+                          const uint32_t* const in, uint32_t* const out);
+
+// Subtracts green from blue and red channels.
+void VP8LSubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixs);
+
+void VP8LResidualImage(int width, int height, int bits,
+                       uint32_t* const argb, uint32_t* const argb_scratch,
+                       uint32_t* const image);
+
+void VP8LColorSpaceTransform(int width, int height, int bits, int step,
+                             uint32_t* const argb, uint32_t* image);
+
+//------------------------------------------------------------------------------
+// Color space conversion.
+
+// Converts from BGRA to other color spaces.
+void VP8LConvertFromBGRA(const uint32_t* const in_data, int num_pixels,
+                         WEBP_CSP_MODE out_colorspace, uint8_t* const rgba);
+
+//------------------------------------------------------------------------------
+// Misc methods.
+
+// Computes the sub-sampled size of 'size' when sampling with 'sampling_bits',
+// i.e. ceil(size / 2^sampling_bits).
+static WEBP_INLINE uint32_t VP8LSubSampleSize(uint32_t size,
+                                              uint32_t sampling_bits) {
+  return (size + (1 << sampling_bits) - 1) >> sampling_bits;
+}
+
+// Fast approximate log2 for integers, with the convention that log2(0) == 0.
+float VP8LFastLog2(int v);
+// Fast calculation of v * log2(v) for integer input.
+static WEBP_INLINE float VP8LFastSLog2(int v) { return VP8LFastLog2(v) * v; }
+
+// Difference of each component, modulo 256 (the result is returned, it is not
+// computed in place). The 0xff00ff00u / 0x00ff00ffu biases place 0xff bytes
+// between the channels so that borrows are absorbed there instead of
+// propagating into the next channel; the final masks discard them.
+static WEBP_INLINE uint32_t VP8LSubPixels(uint32_t a, uint32_t b) {
+  const uint32_t alpha_and_green =
+      0x00ff00ffu + (a & 0xff00ff00u) - (b & 0xff00ff00u);
+  const uint32_t red_and_blue =
+      0xff00ff00u + (a & 0x00ff00ffu) - (b & 0x00ff00ffu);
+  return (alpha_and_green & 0xff00ff00u) | (red_and_blue & 0x00ff00ffu);
+}
+
+//------------------------------------------------------------------------------
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
+
+#endif  // WEBP_DSP_LOSSLESS_H_
diff --git a/src/dsp/upsampling.c b/src/dsp/upsampling.c
new file mode 100644
index 0000000..4855eb1
--- /dev/null
+++ b/src/dsp/upsampling.c
@@ -0,0 +1,357 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// YUV to RGB upsampling functions.
+//
+// Author: somnath@google.com (Somnath Banerjee)
+
+#include "./dsp.h"
+#include "./yuv.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+//------------------------------------------------------------------------------
+// Fancy upsampler
+
+#ifdef FANCY_UPSAMPLING
+
+// Fancy upsampling functions to convert YUV to RGB
+WebPUpsampleLinePairFunc WebPUpsamplers[MODE_LAST];
+
+// Given samples laid out in a square as:
+//  [a b]
+//  [c d]
+// we interpolate u/v as:
+//  ([9*a + 3*b + 3*c +   d    3*a + 9*b + 3*c +   d] + [8 8]) / 16
+//  ([3*a +   b + 9*c + 3*d      a + 3*b + 3*c + 9*d] + [8 8]) / 16
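+// i.e. the chroma pair is bilinearly interpolated at each of the four output
+// positions; the +8 term rounds before the division by 16.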
+
+// We process u and v together stashed into 32bit (16bit each).
+#define LOAD_UV(u,v) ((u) | ((v) << 16))
+
+#define UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP)                                  \
+static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
+                      const uint8_t* top_u, const uint8_t* top_v,              \
+                      const uint8_t* cur_u, const uint8_t* cur_v,              \
+                      uint8_t* top_dst, uint8_t* bottom_dst, int len) {        \
+  int x;                                                                       \
+  const int last_pixel_pair = (len - 1) >> 1;                                  \
+  uint32_t tl_uv = LOAD_UV(top_u[0], top_v[0]);   /* top-left sample */        \
+  uint32_t l_uv  = LOAD_UV(cur_u[0], cur_v[0]);   /* left-sample */            \
+  if (top_y) {                                                                 \
+    const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2;                \
+    FUNC(top_y[0], uv0 & 0xff, (uv0 >> 16), top_dst);                          \
+  }                                                                            \
+  if (bottom_y) {                                                              \
+    const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2;                \
+    FUNC(bottom_y[0], uv0 & 0xff, (uv0 >> 16), bottom_dst);                    \
+  }                                                                            \
+  for (x = 1; x <= last_pixel_pair; ++x) {                                     \
+    const uint32_t t_uv = LOAD_UV(top_u[x], top_v[x]);  /* top sample */       \
+    const uint32_t uv   = LOAD_UV(cur_u[x], cur_v[x]);  /* sample */           \
+    /* precompute invariant values associated with first and second diagonals*/\
+    const uint32_t avg = tl_uv + t_uv + l_uv + uv + 0x00080008u;               \
+    const uint32_t diag_12 = (avg + 2 * (t_uv + l_uv)) >> 3;                   \
+    const uint32_t diag_03 = (avg + 2 * (tl_uv + uv)) >> 3;                    \
+    if (top_y) {                                                               \
+      const uint32_t uv0 = (diag_12 + tl_uv) >> 1;                             \
+      const uint32_t uv1 = (diag_03 + t_uv) >> 1;                              \
+      FUNC(top_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16),                          \
+           top_dst + (2 * x - 1) * XSTEP);                                     \
+      FUNC(top_y[2 * x - 0], uv1 & 0xff, (uv1 >> 16),                          \
+           top_dst + (2 * x - 0) * XSTEP);                                     \
+    }                                                                          \
+    if (bottom_y) {                                                            \
+      const uint32_t uv0 = (diag_03 + l_uv) >> 1;                              \
+      const uint32_t uv1 = (diag_12 + uv) >> 1;                                \
+      FUNC(bottom_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16),                       \
+           bottom_dst + (2 * x - 1) * XSTEP);                                  \
+      FUNC(bottom_y[2 * x + 0], uv1 & 0xff, (uv1 >> 16),                       \
+           bottom_dst + (2 * x + 0) * XSTEP);                                  \
+    }                                                                          \
+    tl_uv = t_uv;                                                              \
+    l_uv = uv;                                                                 \
+  }                                                                            \
+  if (!(len & 1)) {                                                            \
+    if (top_y) {                                                               \
+      const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2;              \
+      FUNC(top_y[len - 1], uv0 & 0xff, (uv0 >> 16),                            \
+           top_dst + (len - 1) * XSTEP);                                       \
+    }                                                                          \
+    if (bottom_y) {                                                            \
+      const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2;              \
+      FUNC(bottom_y[len - 1], uv0 & 0xff, (uv0 >> 16),                         \
+           bottom_dst + (len - 1) * XSTEP);                                    \
+    }                                                                          \
+  }                                                                            \
+}
+
+// All variants implemented.
+UPSAMPLE_FUNC(UpsampleRgbLinePair,  VP8YuvToRgb,  3)
+UPSAMPLE_FUNC(UpsampleBgrLinePair,  VP8YuvToBgr,  3)
+UPSAMPLE_FUNC(UpsampleRgbaLinePair, VP8YuvToRgba, 4)
+UPSAMPLE_FUNC(UpsampleBgraLinePair, VP8YuvToBgra, 4)
+UPSAMPLE_FUNC(UpsampleArgbLinePair, VP8YuvToArgb, 4)
+UPSAMPLE_FUNC(UpsampleRgba4444LinePair, VP8YuvToRgba4444, 2)
+UPSAMPLE_FUNC(UpsampleRgb565LinePair,  VP8YuvToRgb565,  2)
+
+#undef LOAD_UV
+#undef UPSAMPLE_FUNC
+
+#endif  // FANCY_UPSAMPLING
+
+//------------------------------------------------------------------------------
+// simple point-sampling
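+// Each pair of output pixels on a row reuses the same (u, v) chroma sample;
+// no interpolation is performed.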
+
+#define SAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP)                                    \
+static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
+                      const uint8_t* u, const uint8_t* v,                      \
+                      uint8_t* top_dst, uint8_t* bottom_dst, int len) {        \
+  int i;                                                                       \
+  for (i = 0; i < len - 1; i += 2) {                                           \
+    FUNC(top_y[0], u[0], v[0], top_dst);                                       \
+    FUNC(top_y[1], u[0], v[0], top_dst + XSTEP);                               \
+    FUNC(bottom_y[0], u[0], v[0], bottom_dst);                                 \
+    FUNC(bottom_y[1], u[0], v[0], bottom_dst + XSTEP);                         \
+    top_y += 2;                                                                \
+    bottom_y += 2;                                                             \
+    u++;                                                                       \
+    v++;                                                                       \
+    top_dst += 2 * XSTEP;                                                      \
+    bottom_dst += 2 * XSTEP;                                                   \
+  }                                                                            \
+  if (i == len - 1) {    /* last one */                                        \
+    FUNC(top_y[0], u[0], v[0], top_dst);                                       \
+    FUNC(bottom_y[0], u[0], v[0], bottom_dst);                                 \
+  }                                                                            \
+}
+
+// All variants implemented.
+SAMPLE_FUNC(SampleRgbLinePair,      VP8YuvToRgb,  3)
+SAMPLE_FUNC(SampleBgrLinePair,      VP8YuvToBgr,  3)
+SAMPLE_FUNC(SampleRgbaLinePair,     VP8YuvToRgba, 4)
+SAMPLE_FUNC(SampleBgraLinePair,     VP8YuvToBgra, 4)
+SAMPLE_FUNC(SampleArgbLinePair,     VP8YuvToArgb, 4)
+SAMPLE_FUNC(SampleRgba4444LinePair, VP8YuvToRgba4444, 2)
+SAMPLE_FUNC(SampleRgb565LinePair,   VP8YuvToRgb565, 2)
+
+#undef SAMPLE_FUNC
+
+const WebPSampleLinePairFunc WebPSamplers[MODE_LAST] = {
+  SampleRgbLinePair,       // MODE_RGB
+  SampleRgbaLinePair,      // MODE_RGBA
+  SampleBgrLinePair,       // MODE_BGR
+  SampleBgraLinePair,      // MODE_BGRA
+  SampleArgbLinePair,      // MODE_ARGB
+  SampleRgba4444LinePair,  // MODE_RGBA_4444
+  SampleRgb565LinePair,    // MODE_RGB_565
+  SampleRgbaLinePair,      // MODE_rgbA
+  SampleBgraLinePair,      // MODE_bgrA
+  SampleArgbLinePair,      // MODE_Argb
+  SampleRgba4444LinePair   // MODE_rgbA_4444
+};
+
+//------------------------------------------------------------------------------
+
+#if !defined(FANCY_UPSAMPLING)
+#define DUAL_SAMPLE_FUNC(FUNC_NAME, FUNC)                                      \
+static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bot_y,              \
+                      const uint8_t* top_u, const uint8_t* top_v,              \
+                      const uint8_t* bot_u, const uint8_t* bot_v,              \
+                      uint8_t* top_dst, uint8_t* bot_dst, int len) {           \
+  const int half_len = len >> 1;                                               \
+  int x;                                                                       \
+  if (top_dst != NULL) {                                                       \
+    for (x = 0; x < half_len; ++x) {                                           \
+      FUNC(top_y[2 * x + 0], top_u[x], top_v[x], top_dst + 8 * x + 0);         \
+      FUNC(top_y[2 * x + 1], top_u[x], top_v[x], top_dst + 8 * x + 4);         \
+    }                                                                          \
+    if (len & 1) FUNC(top_y[2 * x + 0], top_u[x], top_v[x], top_dst + 8 * x);  \
+  }                                                                            \
+  if (bot_dst != NULL) {                                                       \
+    for (x = 0; x < half_len; ++x) {                                           \
+      FUNC(bot_y[2 * x + 0], bot_u[x], bot_v[x], bot_dst + 8 * x + 0);         \
+      FUNC(bot_y[2 * x + 1], bot_u[x], bot_v[x], bot_dst + 8 * x + 4);         \
+    }                                                                          \
+    if (len & 1) FUNC(bot_y[2 * x + 0], bot_u[x], bot_v[x], bot_dst + 8 * x);  \
+  }                                                                            \
+}
+
+DUAL_SAMPLE_FUNC(DualLineSamplerBGRA, VP8YuvToBgra)
+DUAL_SAMPLE_FUNC(DualLineSamplerARGB, VP8YuvToArgb)
+#undef DUAL_SAMPLE_FUNC
+
+#endif  // !FANCY_UPSAMPLING
+
+WebPUpsampleLinePairFunc WebPGetLinePairConverter(int alpha_is_last) {
+  WebPInitUpsamplers();
+  VP8YUVInit();
+#ifdef FANCY_UPSAMPLING
+  return WebPUpsamplers[alpha_is_last ? MODE_BGRA : MODE_ARGB];
+#else
+  return (alpha_is_last ? DualLineSamplerBGRA : DualLineSamplerARGB);
+#endif
+}
+
+//------------------------------------------------------------------------------
+// YUV444 converter
+
+#define YUV444_FUNC(FUNC_NAME, FUNC, XSTEP)                                    \
+static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v,    \
+                      uint8_t* dst, int len) {                                 \
+  int i;                                                                       \
+  for (i = 0; i < len; ++i) FUNC(y[i], u[i], v[i], &dst[i * XSTEP]);           \
+}
+
+YUV444_FUNC(Yuv444ToRgb,      VP8YuvToRgb,  3)
+YUV444_FUNC(Yuv444ToBgr,      VP8YuvToBgr,  3)
+YUV444_FUNC(Yuv444ToRgba,     VP8YuvToRgba, 4)
+YUV444_FUNC(Yuv444ToBgra,     VP8YuvToBgra, 4)
+YUV444_FUNC(Yuv444ToArgb,     VP8YuvToArgb, 4)
+YUV444_FUNC(Yuv444ToRgba4444, VP8YuvToRgba4444, 2)
+YUV444_FUNC(Yuv444ToRgb565,   VP8YuvToRgb565, 2)
+
+#undef YUV444_FUNC
+
+const WebPYUV444Converter WebPYUV444Converters[MODE_LAST] = {
+  Yuv444ToRgb,       // MODE_RGB
+  Yuv444ToRgba,      // MODE_RGBA
+  Yuv444ToBgr,       // MODE_BGR
+  Yuv444ToBgra,      // MODE_BGRA
+  Yuv444ToArgb,      // MODE_ARGB
+  Yuv444ToRgba4444,  // MODE_RGBA_4444
+  Yuv444ToRgb565,    // MODE_RGB_565
+  Yuv444ToRgba,      // MODE_rgbA
+  Yuv444ToBgra,      // MODE_bgrA
+  Yuv444ToArgb,      // MODE_Argb
+  Yuv444ToRgba4444   // MODE_rgbA_4444
+};
+
+//------------------------------------------------------------------------------
+// Premultiplied modes
+
+// non-dithered modes
+
+// (x * a * 32897) >> 23 is bit-wise equivalent to (int)(x * a / 255.)
+// for all 8bit x or a. For bit-wise equivalence to (int)(x * a / 255. + .5),
+// one can use instead: (x * a * 65793 + (1 << 23)) >> 24
+#if 1     // (int)(x * a / 255.)
+#define MULTIPLIER(a)   ((a) * 32897UL)
+#define PREMULTIPLY(x, m) (((x) * (m)) >> 23)
+#else     // (int)(x * a / 255. + .5)
+#define MULTIPLIER(a) ((a) * 65793UL)
+#define PREMULTIPLY(x, m) (((x) * (m) + (1UL << 23)) >> 24)
+#endif
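+// Worked example: x = 200, a = 128 -> (200 * 128 * 32897) >> 23 = 100,
+// matching (int)(200 * 128 / 255.) = 100.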
+
+static void ApplyAlphaMultiply(uint8_t* rgba, int alpha_first,
+                               int w, int h, int stride) {
+  while (h-- > 0) {
+    uint8_t* const rgb = rgba + (alpha_first ? 1 : 0);
+    const uint8_t* const alpha = rgba + (alpha_first ? 0 : 3);
+    int i;
+    for (i = 0; i < w; ++i) {
+      const uint32_t a = alpha[4 * i];
+      if (a != 0xff) {
+        const uint32_t mult = MULTIPLIER(a);
+        rgb[4 * i + 0] = PREMULTIPLY(rgb[4 * i + 0], mult);
+        rgb[4 * i + 1] = PREMULTIPLY(rgb[4 * i + 1], mult);
+        rgb[4 * i + 2] = PREMULTIPLY(rgb[4 * i + 2], mult);
+      }
+    }
+    rgba += stride;
+  }
+}
+#undef MULTIPLIER
+#undef PREMULTIPLY
+
+// rgbA4444
+
+#define MULTIPLIER(a)  ((a) * 0x1111)    // 0x1111 ~= (1 << 16) / 15
+
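+// dither_hi()/dither_lo() expand the 4-bit value stored in the high (resp.
+// low) nibble to 8 bits by replicating the nibble, so that multiply() below
+// operates at full 8-bit precision.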
+static WEBP_INLINE uint8_t dither_hi(uint8_t x) {
+  return (x & 0xf0) | (x >> 4);
+}
+
+static WEBP_INLINE uint8_t dither_lo(uint8_t x) {
+  return (x & 0x0f) | (x << 4);
+}
+
+static WEBP_INLINE uint8_t multiply(uint8_t x, uint32_t m) {
+  return (x * m) >> 16;
+}
+
+static void ApplyAlphaMultiply4444(uint8_t* rgba4444,
+                                   int w, int h, int stride) {
+  while (h-- > 0) {
+    int i;
+    for (i = 0; i < w; ++i) {
+      const uint8_t a = (rgba4444[2 * i + 1] & 0x0f);
+      const uint32_t mult = MULTIPLIER(a);
+      const uint8_t r = multiply(dither_hi(rgba4444[2 * i + 0]), mult);
+      const uint8_t g = multiply(dither_lo(rgba4444[2 * i + 0]), mult);
+      const uint8_t b = multiply(dither_hi(rgba4444[2 * i + 1]), mult);
+      rgba4444[2 * i + 0] = (r & 0xf0) | ((g >> 4) & 0x0f);
+      rgba4444[2 * i + 1] = (b & 0xf0) | a;
+    }
+    rgba4444 += stride;
+  }
+}
+#undef MULTIPLIER
+
+void (*WebPApplyAlphaMultiply)(uint8_t*, int, int, int, int)
+    = ApplyAlphaMultiply;
+void (*WebPApplyAlphaMultiply4444)(uint8_t*, int, int, int)
+    = ApplyAlphaMultiply4444;
+
+//------------------------------------------------------------------------------
+// Main call
+
+void WebPInitUpsamplers(void) {
+#ifdef FANCY_UPSAMPLING
+  WebPUpsamplers[MODE_RGB]       = UpsampleRgbLinePair;
+  WebPUpsamplers[MODE_RGBA]      = UpsampleRgbaLinePair;
+  WebPUpsamplers[MODE_BGR]       = UpsampleBgrLinePair;
+  WebPUpsamplers[MODE_BGRA]      = UpsampleBgraLinePair;
+  WebPUpsamplers[MODE_ARGB]      = UpsampleArgbLinePair;
+  WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair;
+  WebPUpsamplers[MODE_RGB_565]   = UpsampleRgb565LinePair;
+
+  // If available, use VP8GetCPUInfo() to detect CPU features and plug in
+  // faster (e.g. SSE2) versions of the upsamplers.
+  if (VP8GetCPUInfo != NULL) {
+#if defined(WEBP_USE_SSE2)
+    if (VP8GetCPUInfo(kSSE2)) {
+      WebPInitUpsamplersSSE2();
+    }
+#endif
+  }
+#endif  // FANCY_UPSAMPLING
+}
+
+void WebPInitPremultiply(void) {
+  WebPApplyAlphaMultiply = ApplyAlphaMultiply;
+  WebPApplyAlphaMultiply4444 = ApplyAlphaMultiply4444;
+
+#ifdef FANCY_UPSAMPLING
+  WebPUpsamplers[MODE_rgbA]      = UpsampleRgbaLinePair;
+  WebPUpsamplers[MODE_bgrA]      = UpsampleBgraLinePair;
+  WebPUpsamplers[MODE_Argb]      = UpsampleArgbLinePair;
+  WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair;
+
+  if (VP8GetCPUInfo != NULL) {
+#if defined(WEBP_USE_SSE2)
+    if (VP8GetCPUInfo(kSSE2)) {
+      WebPInitPremultiplySSE2();
+    }
+#endif
+  }
+#endif  // FANCY_UPSAMPLING
+}
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/src/dec/io_sse2.c b/src/dsp/upsampling_sse2.c
similarity index 92%
rename from src/dec/io_sse2.c
rename to src/dsp/upsampling_sse2.c
index 0f42350..8cb275a 100644
--- a/src/dec/io_sse2.c
+++ b/src/dsp/upsampling_sse2.c
@@ -1,4 +1,4 @@
-// Copyright 2011 Google Inc.
+// Copyright 2011 Google Inc. All Rights Reserved.
 //
 // This code is licensed under the same terms as WebM:
 //  Software License Agreement:  http://www.webmproject.org/license/software/
@@ -9,18 +9,21 @@
 //
 // Author: somnath@google.com (Somnath Banerjee)
 
-#if defined(__SSE2__) || defined(_MSC_VER)
+#include "./dsp.h"
+
+#if defined(WEBP_USE_SSE2)
 
 #include <assert.h>
 #include <emmintrin.h>
 #include <string.h>
-#include "webpi.h"
-#include "yuv.h"
+#include "./yuv.h"
 
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 
+#ifdef FANCY_UPSAMPLING
+
 // We compute (9*a + 3*b + 3*c + d + 8) / 16 as follows
 // u = (9*a + 3*b + 3*c + d + 8) / 16
 //   = (a + (a + 3*b + 3*c + d) / 8 + 1) / 2
@@ -173,9 +176,6 @@
 SSE2_UPSAMPLE_FUNC(UpsampleBgrLinePairSSE2,  VP8YuvToBgr,  3)
 SSE2_UPSAMPLE_FUNC(UpsampleRgbaLinePairSSE2, VP8YuvToRgba, 4)
 SSE2_UPSAMPLE_FUNC(UpsampleBgraLinePairSSE2, VP8YuvToBgra, 4)
-// These two don't erase the alpha value
-SSE2_UPSAMPLE_FUNC(UpsampleRgbKeepAlphaLinePairSSE2, VP8YuvToRgb, 4)
-SSE2_UPSAMPLE_FUNC(UpsampleBgrKeepAlphaLinePairSSE2, VP8YuvToBgr, 4)
 
 #undef GET_M
 #undef PACK_AND_STORE
@@ -184,22 +184,26 @@
 #undef CONVERT2RGB
 #undef SSE2_UPSAMPLE_FUNC
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
+
+extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
 
 void WebPInitUpsamplersSSE2(void) {
-  WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePairSSE2;
+  WebPUpsamplers[MODE_RGB]  = UpsampleRgbLinePairSSE2;
   WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePairSSE2;
-  WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePairSSE2;
-  WebPUpsamplers[MODE_BGRA] =  UpsampleBgraLinePairSSE2;
-
-  WebPUpsamplersKeepAlpha[MODE_RGB] = UpsampleRgbLinePairSSE2;
-  WebPUpsamplersKeepAlpha[MODE_RGBA] = UpsampleRgbKeepAlphaLinePairSSE2;
-  WebPUpsamplersKeepAlpha[MODE_BGR] =  UpsampleBgrLinePairSSE2;
-  WebPUpsamplersKeepAlpha[MODE_BGRA] =  UpsampleBgrKeepAlphaLinePairSSE2;
+  WebPUpsamplers[MODE_BGR]  = UpsampleBgrLinePairSSE2;
+  WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePairSSE2;
 }
 
+void WebPInitPremultiplySSE2(void) {
+  WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePairSSE2;
+  WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePairSSE2;
+}
+
+#endif  // FANCY_UPSAMPLING
+
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
 
-#endif   //__SSE2__ || _MSC_VER
+#endif   // WEBP_USE_SSE2
diff --git a/src/dec/yuv.c b/src/dsp/yuv.c
similarity index 91%
rename from src/dec/yuv.c
rename to src/dsp/yuv.c
index 2b203f7..7f05f9a 100644
--- a/src/dec/yuv.c
+++ b/src/dsp/yuv.c
@@ -1,4 +1,4 @@
-// Copyright 2010 Google Inc.
+// Copyright 2010 Google Inc. All Rights Reserved.
 //
 // This code is licensed under the same terms as WebM:
 //  Software License Agreement:  http://www.webmproject.org/license/software/
@@ -9,7 +9,7 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include "yuv.h"
+#include "./yuv.h"
 
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
@@ -24,7 +24,7 @@
 
 static int done = 0;
 
-static inline uint8_t clip(int v, int max_value) {
+static WEBP_INLINE uint8_t clip(int v, int max_value) {
   return v < 0 ? 0 : v > max_value ? max_value : v;
 }
 
diff --git a/src/dsp/yuv.h b/src/dsp/yuv.h
new file mode 100644
index 0000000..ee3587e
--- /dev/null
+++ b/src/dsp/yuv.h
@@ -0,0 +1,148 @@
+// Copyright 2010 Google Inc. All Rights Reserved.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// inline YUV<->RGB conversion function
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#ifndef WEBP_DSP_YUV_H_
+#define WEBP_DSP_YUV_H_
+
+#include "../dec/decode_vp8.h"
+
+/*
+ * Define ANDROID_WEBP_RGB to enable specific optimizations for Android
+ * RGBA_4444 & RGB_565 color support.
+ */
+
+#define ANDROID_WEBP_RGB
+
+//------------------------------------------------------------------------------
+// YUV -> RGB conversion
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+enum { YUV_FIX = 16,                // fixed-point precision
+       YUV_RANGE_MIN = -227,        // min value of r/g/b output
+       YUV_RANGE_MAX = 256 + 226    // max value of r/g/b output
+};
+extern int16_t VP8kVToR[256], VP8kUToB[256];
+extern int32_t VP8kVToG[256], VP8kUToG[256];
+extern uint8_t VP8kClip[YUV_RANGE_MAX - YUV_RANGE_MIN];
+extern uint8_t VP8kClip4Bits[YUV_RANGE_MAX - YUV_RANGE_MIN];
+
+static WEBP_INLINE void VP8YuvToRgb(uint8_t y, uint8_t u, uint8_t v,
+                                    uint8_t* const rgb) {
+  const int r_off = VP8kVToR[v];
+  const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
+  const int b_off = VP8kUToB[u];
+  rgb[0] = VP8kClip[y + r_off - YUV_RANGE_MIN];
+  rgb[1] = VP8kClip[y + g_off - YUV_RANGE_MIN];
+  rgb[2] = VP8kClip[y + b_off - YUV_RANGE_MIN];
+}
+
+static WEBP_INLINE void VP8YuvToRgb565(uint8_t y, uint8_t u, uint8_t v,
+                                       uint8_t* const rgb) {
+  const int r_off = VP8kVToR[v];
+  const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
+  const int b_off = VP8kUToB[u];
+#ifdef ANDROID_WEBP_RGB
+  rgb[1] = ((VP8kClip[y + r_off - YUV_RANGE_MIN] & 0xf8) |
+            (VP8kClip[y + g_off - YUV_RANGE_MIN] >> 5));
+  rgb[0] = (((VP8kClip[y + g_off - YUV_RANGE_MIN] << 3) & 0xe0) |
+            (VP8kClip[y + b_off - YUV_RANGE_MIN] >> 3));
+#else
+  rgb[0] = ((VP8kClip[y + r_off - YUV_RANGE_MIN] & 0xf8) |
+            (VP8kClip[y + g_off - YUV_RANGE_MIN] >> 5));
+  rgb[1] = (((VP8kClip[y + g_off - YUV_RANGE_MIN] << 3) & 0xe0) |
+            (VP8kClip[y + b_off - YUV_RANGE_MIN] >> 3));
+#endif
+}
+
+static WEBP_INLINE void VP8YuvToArgb(uint8_t y, uint8_t u, uint8_t v,
+                                     uint8_t* const argb) {
+  argb[0] = 0xff;
+  VP8YuvToRgb(y, u, v, argb + 1);
+}
+
+static WEBP_INLINE void VP8YuvToRgba4444(uint8_t y, uint8_t u, uint8_t v,
+                                         uint8_t* const argb) {
+  const int r_off = VP8kVToR[v];
+  const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
+  const int b_off = VP8kUToB[u];
+#ifdef ANDROID_WEBP_RGB
+  argb[1] = ((VP8kClip4Bits[y + r_off - YUV_RANGE_MIN] << 4) |
+             VP8kClip4Bits[y + g_off - YUV_RANGE_MIN]);
+  argb[0] = 0x0f | (VP8kClip4Bits[y + b_off - YUV_RANGE_MIN] << 4);
+#else
+  argb[0] = ((VP8kClip4Bits[y + r_off - YUV_RANGE_MIN] << 4) |
+             VP8kClip4Bits[y + g_off - YUV_RANGE_MIN]);
+  argb[1] = 0x0f | (VP8kClip4Bits[y + b_off - YUV_RANGE_MIN] << 4);
+#endif
+}
+
+static WEBP_INLINE void VP8YuvToBgr(uint8_t y, uint8_t u, uint8_t v,
+                                    uint8_t* const bgr) {
+  const int r_off = VP8kVToR[v];
+  const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
+  const int b_off = VP8kUToB[u];
+  bgr[0] = VP8kClip[y + b_off - YUV_RANGE_MIN];
+  bgr[1] = VP8kClip[y + g_off - YUV_RANGE_MIN];
+  bgr[2] = VP8kClip[y + r_off - YUV_RANGE_MIN];
+}
+
+static WEBP_INLINE void VP8YuvToBgra(uint8_t y, uint8_t u, uint8_t v,
+                                     uint8_t* const bgra) {
+  VP8YuvToBgr(y, u, v, bgra);
+  bgra[3] = 0xff;
+}
+
+static WEBP_INLINE void VP8YuvToRgba(uint8_t y, uint8_t u, uint8_t v,
+                                     uint8_t* const rgba) {
+  VP8YuvToRgb(y, u, v, rgba);
+  rgba[3] = 0xff;
+}
+
+// Must be called before everything, to initialize the tables.
+void VP8YUVInit(void);
+
+//------------------------------------------------------------------------------
+// RGB -> YUV conversion
+// The exact naming is Y'CbCr, following the ITU-R BT.601 standard.
+// More information at: http://en.wikipedia.org/wiki/YCbCr
+// Y = 0.2569 * R + 0.5044 * G + 0.0979 * B + 16
+// U = -0.1483 * R - 0.2911 * G + 0.4394 * B + 128
+// V = 0.4394 * R - 0.3679 * G - 0.0715 * B + 128
+// We use 16bit fixed point operations.
+
+static WEBP_INLINE int VP8ClipUV(int v) {
+   v = (v + (257 << (YUV_FIX + 2 - 1))) >> (YUV_FIX + 2);
+   return ((v & ~0xff) == 0) ? v : (v < 0) ? 0 : 255;
+}
+
+static WEBP_INLINE int VP8RGBToY(int r, int g, int b) {
+  const int kRound = (1 << (YUV_FIX - 1)) + (16 << YUV_FIX);
+  const int luma = 16839 * r + 33059 * g + 6420 * b;
+  return (luma + kRound) >> YUV_FIX;  // no need to clip
+}
+
+static WEBP_INLINE int VP8RGBToU(int r, int g, int b) {
+  return VP8ClipUV(-9719 * r - 19081 * g + 28800 * b);
+}
+
+static WEBP_INLINE int VP8RGBToV(int r, int g, int b) {
+  return VP8ClipUV(+28800 * r - 24116 * g - 4684 * b);
+}
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
+
+#endif  /* WEBP_DSP_YUV_H_ */
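
The RGB->YUV helpers above implement the BT.601 matrix in 16-bit fixed point: each coefficient is pre-scaled by 2^YUV_FIX, the products are summed together with a rounding term and the +16 offset, and the total is shifted back down. As a quick standalone check (the constants are copied from the header above, but the little program itself is not part of libwebp), pure black comes out as Y = 16 and pure white as Y = 235, the limited-range bounds.

    /* Standalone check of the fixed-point luma computation from yuv.h above. */
    /* The constants are copied from the header; the program is illustrative  */
    /* only and not part of libwebp.                                          */
    #include <stdio.h>

    #define YUV_FIX 16  /* fixed-point precision, as in yuv.h */

    static int RGBToY(int r, int g, int b) {
      const int kRound = (1 << (YUV_FIX - 1)) + (16 << YUV_FIX);  /* rounding + the +16 offset */
      const int luma = 16839 * r + 33059 * g + 6420 * b;
      return (luma + kRound) >> YUV_FIX;
    }

    int main(void) {
      printf("black -> Y=%d\n", RGBToY(0, 0, 0));        /* 16  (BT.601 lower bound) */
      printf("white -> Y=%d\n", RGBToY(255, 255, 255));  /* 235 (BT.601 upper bound) */
      return 0;
    }
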
diff --git a/src/enc/Android.mk b/src/enc/Android.mk
index 64f6743..7f38d40 100644
--- a/src/enc/Android.mk
+++ b/src/enc/Android.mk
@@ -16,28 +16,48 @@
 
 include $(CLEAR_VARS)
 LOCAL_SRC_FILES := \
-	alpha.c \
-	analysis.c \
-	bit_writer.c \
-	config.c \
-	cost.c \
-	dsp.c \
-	dsp_sse2.c \
-	filter.c \
-	frame.c \
-	iterator.c \
-	layer.c \
-	picture.c \
-	quant.c \
-	syntax.c \
-	tree.c \
-	webpenc.c
+        alpha.c \
+        analysis.c \
+        backward_references.c \
+        config.c \
+        cost.c \
+        filter.c \
+        frame.c\
+        histogram.c \
+        iterator.c \
+        layer.c \
+        picture.c \
+        quant.c \
+        syntax.c \
+        tree.c \
+        vp8l.c \
+        webpenc.c \
+        ../dsp/cpu.c \
+        ../dsp/dec.c \
+        ../dsp/dec_neon.c \
+        ../dsp/dec_sse2.c \
+        ../dsp/enc.c \
+        ../dsp/enc_sse2.c \
+        ../dsp/lossless.c \
+        ../dsp/upsampling.c \
+        ../dsp/upsampling_sse2.c \
+        ../dsp/yuv.c \
+        ../utils/bit_reader.c \
+        ../utils/bit_writer.c \
+        ../utils/color_cache.c \
+        ../utils/filters.c \
+        ../utils/huffman.c \
+        ../utils/huffman_encode.c \
+        ../utils/quant_levels.c \
+        ../utils/rescaler.c \
+        ../utils/thread.c \
+        ../utils/utils.c
 
 LOCAL_CFLAGS := -DANDROID
 
 LOCAL_C_INCLUDES += \
-	$(LOCAL_PATH) \
-	$(LOCAL_PATH)/../../include
+        $(LOCAL_PATH) \
+        $(LOCAL_PATH)/../../include
 
 LOCAL_MODULE:= libwebp-encode
 
diff --git a/src/enc/alpha.c b/src/enc/alpha.c
index d5d4f88..376f786 100644
--- a/src/enc/alpha.c
+++ b/src/enc/alpha.c
@@ -1,4 +1,4 @@
-// Copyright 2011 Google Inc.
+// Copyright 2011 Google Inc. All Rights Reserved.
 //
 // This code is licensed under the same terms as WebM:
 //  Software License Agreement:  http://www.webmproject.org/license/software/
@@ -11,98 +11,314 @@
 
 #include <assert.h>
 #include <stdlib.h>
-#include "vp8enci.h"
 
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-#include "zlib.h"
-#endif
+#include "./vp8enci.h"
+#include "../utils/filters.h"
+#include "../utils/quant_levels.h"
+#include "webp/format_constants.h"
 
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 
-#ifdef WEBP_EXPERIMENTAL_FEATURES
+// -----------------------------------------------------------------------------
+// Encodes the given alpha data via specified compression method 'method'.
+// The pre-processing (quantization) is performed if 'quality' is less than 100.
+// For such cases, the encoding is lossy. The valid range is [0, 100] for
+// 'quality' and [0, 1] for 'method':
+//   'method = 0' - No compression;
+//   'method = 1' - Use lossless coder on the alpha plane only
+// 'filter' values [0, 4] correspond to prediction modes none, horizontal,
+// vertical & gradient filters. The prediction mode 4 will try all the
+// prediction modes 0 to 3 and pick the best one.
+// 'effort_level': specifies how much effort must be spent to try and reduce
+//  the compressed output size. In range 0 (quick) to 6 (slow).
+//
+// 'output' corresponds to the buffer containing compressed alpha data.
+//          This buffer is allocated by this method and caller should call
+//          free(*output) when done.
+// 'output_size' corresponds to size of this compressed alpha buffer.
+//
+// Returns 1 on successfully encoding the alpha and
+//         0 if either:
+//           invalid quality or method, or
+//           memory allocation for the compressed data fails.
 
-#define CHUNK_SIZE 8192
+#include "../enc/vp8li.h"
 
-//-----------------------------------------------------------------------------
+static int EncodeLossless(const uint8_t* const data, int width, int height,
+                          int effort_level,  // in [0..6] range
+                          VP8BitWriter* const bw,
+                          WebPAuxStats* const stats) {
+  int ok = 0;
+  WebPConfig config;
+  WebPPicture picture;
+  VP8LBitWriter tmp_bw;
 
-static int CompressAlpha(const uint8_t* data, size_t data_size,
-                         uint8_t** output, size_t* output_size,
-                         int algo) {
-  int ret = Z_OK;
-  z_stream strm;
-  unsigned char chunk[CHUNK_SIZE];
+  WebPPictureInit(&picture);
+  picture.width = width;
+  picture.height = height;
+  picture.use_argb = 1;
+  picture.stats = stats;
+  if (!WebPPictureAlloc(&picture)) return 0;
 
-  *output = NULL;
-  *output_size = 0;
-  memset(&strm, 0, sizeof(strm));
-  if (deflateInit(&strm, algo ? Z_BEST_SPEED : Z_BEST_COMPRESSION) != Z_OK) {
-    return 0;
-  }
-  strm.next_in = (unsigned char*)data;
-  strm.avail_in = data_size;
-  do {
-    size_t size_out;
-
-    strm.next_out = chunk;
-    strm.avail_out = CHUNK_SIZE;
-    ret = deflate(&strm, Z_FINISH);
-    if (ret == Z_STREAM_ERROR) {
-      break;
-    }
-    size_out = CHUNK_SIZE - strm.avail_out;
-    if (size_out) {
-      size_t new_size = *output_size + size_out;
-      uint8_t* new_output = realloc(*output, new_size);
-      if (new_output == NULL) {
-        ret = Z_MEM_ERROR;
-        break;
+  // Transfer the alpha values to the green channel.
+  {
+    int i, j;
+    uint32_t* dst = picture.argb;
+    const uint8_t* src = data;
+    for (j = 0; j < picture.height; ++j) {
+      for (i = 0; i < picture.width; ++i) {
+        dst[i] = (src[i] << 8) | 0xff000000u;
       }
-      memcpy(new_output + *output_size, chunk, size_out);
-      *output_size = new_size;
-      *output = new_output;
+      src += width;
+      dst += picture.argb_stride;
     }
-  } while (ret != Z_STREAM_END || strm.avail_out == 0);
-
-  deflateEnd(&strm);
-  if (ret != Z_STREAM_END) {
-    free(*output);
-    output_size = 0;
-    return 0;
   }
-  return 1;
+
+  WebPConfigInit(&config);
+  config.lossless = 1;
+  config.method = effort_level;  // impact is very small
+  // Set moderate default quality setting for alpha. Higher qualities (80 and
+  // above) could be very slow.
+  config.quality = 10.f + 15.f * effort_level;
+  if (config.quality > 100.f) config.quality = 100.f;
+
+  ok = VP8LBitWriterInit(&tmp_bw, (width * height) >> 3);
+  ok = ok && (VP8LEncodeStream(&config, &picture, &tmp_bw) == VP8_ENC_OK);
+  WebPPictureFree(&picture);
+  if (ok) {
+    const uint8_t* const data = VP8LBitWriterFinish(&tmp_bw);
+    const size_t data_size = VP8LBitWriterNumBytes(&tmp_bw);
+    VP8BitWriterAppend(bw, data, data_size);
+  }
+  VP8LBitWriterDestroy(&tmp_bw);
+  return ok && !bw->error_;
 }
 
-#endif    /* WEBP_EXPERIMENTAL_FEATURES */
+// -----------------------------------------------------------------------------
 
-void VP8EncInitAlpha(VP8Encoder* enc) {
-  enc->has_alpha_ = (enc->pic_->a != NULL);
+static int EncodeAlphaInternal(const uint8_t* const data, int width, int height,
+                               int method, int filter, int reduce_levels,
+                               int effort_level,  // in [0..6] range
+                               uint8_t* const tmp_alpha,
+                               VP8BitWriter* const bw,
+                               WebPAuxStats* const stats) {
+  int ok = 0;
+  const uint8_t* alpha_src;
+  WebPFilterFunc filter_func;
+  uint8_t header;
+  size_t expected_size;
+  const size_t data_size = width * height;
+
+  assert((uint64_t)data_size == (uint64_t)width * height);  // as per spec
+  assert(filter >= 0 && filter < WEBP_FILTER_LAST);
+  assert(method >= ALPHA_NO_COMPRESSION);
+  assert(method <= ALPHA_LOSSLESS_COMPRESSION);
+  assert(sizeof(header) == ALPHA_HEADER_LEN);
+  // TODO(skal): have a common function and #define's to validate alpha params.
+
+  expected_size =
+      (method == ALPHA_NO_COMPRESSION) ? (ALPHA_HEADER_LEN + data_size)
+                                       : (data_size >> 5);
+  header = method | (filter << 2);
+  if (reduce_levels) header |= ALPHA_PREPROCESSED_LEVELS << 4;
+
+  VP8BitWriterInit(bw, expected_size);
+  VP8BitWriterAppend(bw, &header, ALPHA_HEADER_LEN);
+
+  filter_func = WebPFilters[filter];
+  if (filter_func) {
+    filter_func(data, width, height, 1, width, tmp_alpha);
+    alpha_src = tmp_alpha;
+  }  else {
+    alpha_src = data;
+  }
+
+  if (method == ALPHA_NO_COMPRESSION) {
+    ok = VP8BitWriterAppend(bw, alpha_src, width * height);
+    ok = ok && !bw->error_;
+  } else {
+    ok = EncodeLossless(alpha_src, width, height, effort_level, bw, stats);
+    VP8BitWriterFinish(bw);
+  }
+  return ok;
+}
+
+// -----------------------------------------------------------------------------
+
+// TODO(skal): move to dsp/ ?
+static void CopyPlane(const uint8_t* src, int src_stride,
+                      uint8_t* dst, int dst_stride, int width, int height) {
+  while (height-- > 0) {
+    memcpy(dst, src, width);
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+static int EncodeAlpha(VP8Encoder* const enc,
+                       int quality, int method, int filter,
+                       int effort_level,
+                       uint8_t** const output, size_t* const output_size) {
+  const WebPPicture* const pic = enc->pic_;
+  const int width = pic->width;
+  const int height = pic->height;
+
+  uint8_t* quant_alpha = NULL;
+  const size_t data_size = width * height;
+  uint64_t sse = 0;
+  int ok = 1;
+  const int reduce_levels = (quality < 100);
+
+  // quick sanity checks
+  assert((uint64_t)data_size == (uint64_t)width * height);  // as per spec
+  assert(enc != NULL && pic != NULL && pic->a != NULL);
+  assert(output != NULL && output_size != NULL);
+  assert(width > 0 && height > 0);
+  assert(pic->a_stride >= width);
+  assert(filter >= WEBP_FILTER_NONE && filter <= WEBP_FILTER_FAST);
+
+  if (quality < 0 || quality > 100) {
+    return 0;
+  }
+
+  if (method < ALPHA_NO_COMPRESSION || method > ALPHA_LOSSLESS_COMPRESSION) {
+    return 0;
+  }
+
+  quant_alpha = (uint8_t*)malloc(data_size);
+  if (quant_alpha == NULL) {
+    return 0;
+  }
+
+  // Extract alpha data (width x height) from raw_data (stride x height).
+  CopyPlane(pic->a, pic->a_stride, quant_alpha, width, width, height);
+
+  if (reduce_levels) {  // No Quantization required for 'quality = 100'.
+    // 16 alpha levels gives quite a low MSE w.r.t original alpha plane hence
+    // mapped to moderate quality 70. Hence Quality:[0, 70] -> Levels:[2, 16]
+    // and Quality:]70, 100] -> Levels:]16, 256].
+    const int alpha_levels = (quality <= 70) ? (2 + quality / 5)
+                                             : (16 + (quality - 70) * 8);
+    ok = QuantizeLevels(quant_alpha, width, height, alpha_levels, &sse);
+  }
+
+  if (ok) {
+    VP8BitWriter bw;
+    int test_filter;
+    uint8_t* filtered_alpha = NULL;
+
+    // We always test WEBP_FILTER_NONE first.
+    ok = EncodeAlphaInternal(quant_alpha, width, height,
+                             method, WEBP_FILTER_NONE, reduce_levels,
+                             effort_level, NULL, &bw, pic->stats);
+    if (!ok) {
+      VP8BitWriterWipeOut(&bw);
+      goto End;
+    }
+
+    if (filter == WEBP_FILTER_FAST) {  // Quick estimate of a second candidate?
+      filter = EstimateBestFilter(quant_alpha, width, height, width);
+    }
+    // Stop?
+    if (filter == WEBP_FILTER_NONE) {
+      goto Ok;
+    }
+
+    filtered_alpha = (uint8_t*)malloc(data_size);
+    ok = (filtered_alpha != NULL);
+    if (!ok) {
+      goto End;
+    }
+
+    // Try the other mode(s).
+    {
+      WebPAuxStats best_stats;
+      size_t best_score = VP8BitWriterSize(&bw);
+
+      memset(&best_stats, 0, sizeof(best_stats));  // prevent spurious warning
+      if (pic->stats != NULL) best_stats = *pic->stats;
+      for (test_filter = WEBP_FILTER_HORIZONTAL;
+           ok && (test_filter <= WEBP_FILTER_GRADIENT);
+           ++test_filter) {
+        VP8BitWriter tmp_bw;
+        if (filter != WEBP_FILTER_BEST && test_filter != filter) {
+          continue;
+        }
+        ok = EncodeAlphaInternal(quant_alpha, width, height,
+                                 method, test_filter, reduce_levels,
+                                 effort_level, filtered_alpha, &tmp_bw,
+                                 pic->stats);
+        if (ok) {
+          const size_t score = VP8BitWriterSize(&tmp_bw);
+          if (score < best_score) {
+            // swap bitwriter objects.
+            VP8BitWriter tmp = tmp_bw;
+            tmp_bw = bw;
+            bw = tmp;
+            best_score = score;
+            if (pic->stats != NULL) best_stats = *pic->stats;
+          }
+        } else {
+          VP8BitWriterWipeOut(&bw);
+        }
+        VP8BitWriterWipeOut(&tmp_bw);
+      }
+      if (pic->stats != NULL) *pic->stats = best_stats;
+    }
+ Ok:
+    if (ok) {
+      *output_size = VP8BitWriterSize(&bw);
+      *output = VP8BitWriterBuf(&bw);
+      if (pic->stats != NULL) {         // need stats?
+        pic->stats->coded_size += (int)(*output_size);
+        enc->sse_[3] = sse;
+      }
+    }
+    free(filtered_alpha);
+  }
+ End:
+  free(quant_alpha);
+  return ok;
+}
+
+
+//------------------------------------------------------------------------------
+// Main calls
+
+void VP8EncInitAlpha(VP8Encoder* const enc) {
+  enc->has_alpha_ = WebPPictureHasTransparency(enc->pic_);
   enc->alpha_data_ = NULL;
   enc->alpha_data_size_ = 0;
 }
 
-void VP8EncCodeAlphaBlock(VP8EncIterator* it) {
-  (void)it;
-  // Nothing for now. We just ZLIB-compress in the end.
-}
-
-int VP8EncFinishAlpha(VP8Encoder* enc) {
+int VP8EncFinishAlpha(VP8Encoder* const enc) {
   if (enc->has_alpha_) {
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-    const WebPPicture* pic = enc->pic_;
-    assert(pic->a);
-    if (!CompressAlpha(pic->a, pic->width * pic->height,
-                       &enc->alpha_data_, &enc->alpha_data_size_,
-                       enc->config_->alpha_compression)) {
+    const WebPConfig* config = enc->config_;
+    uint8_t* tmp_data = NULL;
+    size_t tmp_size = 0;
+    const int effort_level = config->method;  // maps to [0..6]
+    const WEBP_FILTER_TYPE filter =
+        (config->alpha_filtering == 0) ? WEBP_FILTER_NONE :
+        (config->alpha_filtering == 1) ? WEBP_FILTER_FAST :
+                                         WEBP_FILTER_BEST;
+
+    if (!EncodeAlpha(enc, config->alpha_quality, config->alpha_compression,
+                     filter, effort_level, &tmp_data, &tmp_size)) {
       return 0;
     }
-#endif
+    if (tmp_size != (uint32_t)tmp_size) {  // Sanity check.
+      free(tmp_data);
+      return 0;
+    }
+    enc->alpha_data_size_ = (uint32_t)tmp_size;
+    enc->alpha_data_ = tmp_data;
   }
-  return 1;
+  return WebPReportProgress(enc->pic_, enc->percent_ + 20, &enc->percent_);
 }
 
-void VP8EncDeleteAlpha(VP8Encoder* enc) {
+void VP8EncDeleteAlpha(VP8Encoder* const enc) {
   free(enc->alpha_data_);
   enc->alpha_data_ = NULL;
   enc->alpha_data_size_ = 0;
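
EncodeAlphaInternal() above packs the whole alpha-chunk preamble into a single ALPHA_HEADER_LEN byte: the compression method in the low two bits, the filter in the next two, and the pre-processing flag shifted in above that (header = method | (filter << 2), plus ALPHA_PREPROCESSED_LEVELS << 4 when levels were reduced). The sketch below only mirrors that bit layout; the field values and names are local stand-ins, not the library's constants.

    /* Illustrative packing/unpacking of the 1-byte alpha header built in     */
    /* EncodeAlphaInternal() above. Field positions follow the expression     */
    /* header = method | (filter << 2) | (preprocessing << 4); the 2-bit      */
    /* field widths and the sample values in main() are assumptions.          */
    #include <stdint.h>
    #include <stdio.h>

    static uint8_t PackAlphaHeader(int method, int filter, int preprocessing) {
      return (uint8_t)((method & 3) | ((filter & 3) << 2) | ((preprocessing & 3) << 4));
    }

    int main(void) {
      const uint8_t h = PackAlphaHeader(1 /* lossless */, 2 /* vertical filter */, 1);
      printf("header=0x%02x  method=%d  filter=%d  pre-processing=%d\n",
             h, h & 3, (h >> 2) & 3, (h >> 4) & 3);
      return 0;
    }
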
diff --git a/src/enc/analysis.c b/src/enc/analysis.c
index 8f84bd5..22cfb49 100644
--- a/src/enc/analysis.c
+++ b/src/enc/analysis.c
@@ -1,4 +1,4 @@
-// Copyright 2011 Google Inc.
+// Copyright 2011 Google Inc. All Rights Reserved.
 //
 // This code is licensed under the same terms as WebM:
 //  Software License Agreement:  http://www.webmproject.org/license/software/
@@ -13,8 +13,9 @@
 #include <string.h>
 #include <assert.h>
 
-#include "vp8enci.h"
-#include "cost.h"
+#include "./vp8enci.h"
+#include "./cost.h"
+#include "../utils/utils.h"
 
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
@@ -26,7 +27,7 @@
   return alpha < 0 ? 0 : alpha > 255 ? 255 : alpha;
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Smooth the segment map by replacing isolated block by the majority of its
 // neighbours.
 
@@ -35,7 +36,8 @@
   const int w = enc->mb_w_;
   const int h = enc->mb_h_;
   const int majority_cnt_3_x_3_grid = 5;
-  uint8_t* tmp = (uint8_t*)malloc(w * h * sizeof(uint8_t));
+  uint8_t* const tmp = (uint8_t*)WebPSafeMalloc((uint64_t)w * h, sizeof(*tmp));
+  assert((uint64_t)(w * h) == (uint64_t)w * h);   // no overflow, as per spec
 
   if (tmp == NULL) return;
   for (y = 1; y < h - 1; ++y) {
@@ -69,7 +71,7 @@
   free(tmp);
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Finalize Segment probability based on the coding tree
 
 static int GetProba(int a, int b) {
@@ -112,7 +114,7 @@
   }
 }
 
-static inline int clip(int v, int m, int M) {
+static WEBP_INLINE int clip(int v, int m, int M) {
   return v < m ? m : v > M ? M : v;
 }
 
@@ -139,13 +141,13 @@
   }
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Simplified k-Means, to assign Nb segments based on alpha-histogram
 
 static void AssignSegments(VP8Encoder* const enc, const int alphas[256]) {
   const int nb = enc->segment_hdr_.num_segments_;
   int centers[NUM_MB_SEGMENTS];
-  int weighted_average;
+  int weighted_average = 0;
   int map[256];
   int a, n, k;
   int min_a = 0, max_a = 255, range_a;
@@ -206,9 +208,9 @@
   // Map each original value to the closest centroid
   for (n = 0; n < enc->mb_w_ * enc->mb_h_; ++n) {
     VP8MBInfo* const mb = &enc->mb_info_[n];
-    const int a = mb->alpha_;
-    mb->segment_ = map[a];
-    mb->alpha_ = centers[map[a]];     // just for the record.
+    const int alpha = mb->alpha_;
+    mb->segment_ = map[alpha];
+    mb->alpha_ = centers[map[alpha]];     // just for the record.
   }
 
   if (nb > 1) {
@@ -220,7 +222,7 @@
   SetSegmentAlphas(enc, centers, weighted_average);  // pick some alphas.
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Macroblock analysis: collect histogram for each mode, deduce the maximal
 // susceptibility and set best modes for this macroblock.
 // Segment assignment is done later.
@@ -253,7 +255,7 @@
 
 static int MBAnalyzeBestIntra4Mode(VP8EncIterator* const it,
                                    int best_alpha) {
-  int modes[16];
+  uint8_t modes[16];
   const int max_mode = (it->enc_->method_ >= 3) ? MAX_INTRA4_MODE : NUM_BMODES;
   int i4_alpha = 0;
   VP8IteratorStartI4(it);
@@ -328,7 +330,7 @@
   it->mb_->alpha_ = best_alpha;   // Informative only.
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Main analysis loop:
 // Collect all susceptibilities for each macroblock and record their
 // distribution in alphas[]. Segments is assigned a-posteriori, based on
@@ -339,6 +341,7 @@
 // this stage.
 
 int VP8EncAnalyze(VP8Encoder* const enc) {
+  int ok = 1;
   int alphas[256] = { 0 };
   VP8EncIterator it;
 
@@ -347,12 +350,13 @@
   do {
     VP8IteratorImport(&it);
     MBAnalyze(&it, alphas, &enc->uv_alpha_);
+    ok = VP8IteratorProgress(&it, 20);
     // Let's pretend we have perfect lossless reconstruction.
-  } while (VP8IteratorNext(&it, it.yuv_in_));
+  } while (ok && VP8IteratorNext(&it, it.yuv_in_));
   enc->uv_alpha_ /= enc->mb_w_ * enc->mb_h_;
-  AssignSegments(enc, alphas);
+  if (ok) AssignSegments(enc, alphas);
 
-  return 1;
+  return ok;
 }
 
 #if defined(__cplusplus) || defined(c_plusplus)
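
The AssignSegments() changes above sit inside a simplified one-dimensional k-means over the [0, 255] alpha histogram: each alpha value is mapped to its closest center and the centers are then recomputed as histogram-weighted means. The sketch below only models that idea; the initialization, iteration count and toy data are assumptions, not the encoder's actual logic.

    /* Rough 1-D k-means over a 256-bin histogram, as a model of the          */
    /* "simplified k-Means" mentioned above. Initialization, iteration count  */
    /* and the toy histogram are assumptions, not taken from the encoder.     */
    #include <stdio.h>
    #include <stdlib.h>

    #define NUM_BINS 256
    #define MAX_SEGMENTS 4   /* assumed small, in the spirit of NUM_MB_SEGMENTS */

    static void KMeans1D(const int histo[NUM_BINS], int centers[], int nb) {
      int iter, a, k;
      for (iter = 0; iter < 6; ++iter) {
        long long sum[MAX_SEGMENTS] = { 0 };
        long long cnt[MAX_SEGMENTS] = { 0 };
        for (a = 0; a < NUM_BINS; ++a) {        /* assign each bin to the closest center */
          int best = 0;
          for (k = 1; k < nb; ++k) {
            if (abs(a - centers[k]) < abs(a - centers[best])) best = k;
          }
          sum[best] += (long long)a * histo[a];
          cnt[best] += histo[a];
        }
        for (k = 0; k < nb; ++k) {              /* move centers to the weighted means */
          if (cnt[k] > 0) centers[k] = (int)(sum[k] / cnt[k]);
        }
      }
    }

    int main(void) {
      int histo[NUM_BINS];
      int centers[MAX_SEGMENTS] = { 16, 64, 128, 224 };
      int i;
      for (i = 0; i < NUM_BINS; ++i) histo[i] = (i < 128) ? 3 : 1;  /* toy data */
      KMeans1D(histo, centers, MAX_SEGMENTS);
      for (i = 0; i < MAX_SEGMENTS; ++i) printf("center[%d] = %d\n", i, centers[i]);
      return 0;
    }
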
diff --git a/src/enc/backward_references.c b/src/enc/backward_references.c
new file mode 100644
index 0000000..b8c8ece
--- /dev/null
+++ b/src/enc/backward_references.c
@@ -0,0 +1,874 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// Author: Jyrki Alakuijala (jyrki@google.com)
+//
+
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "./backward_references.h"
+#include "./histogram.h"
+#include "../dsp/lossless.h"
+#include "../utils/color_cache.h"
+#include "../utils/utils.h"
+
+#define VALUES_IN_BYTE 256
+
+#define HASH_BITS 18
+#define HASH_SIZE (1 << HASH_BITS)
+#define HASH_MULTIPLIER (0xc6a4a7935bd1e995ULL)
+
+// 1M window (4M bytes) minus 120 special codes for short distances.
+#define WINDOW_SIZE ((1 << 20) - 120)
+
+// Bounds for the match length.
+#define MIN_LENGTH 2
+#define MAX_LENGTH 4096
+
+typedef struct {
+  // Stores the most recently added position with the given hash value.
+  int32_t hash_to_first_index_[HASH_SIZE];
+  // chain_[pos] stores the previous position with the same hash value
+  // for every pixel in the image.
+  int32_t* chain_;
+} HashChain;
+
+// -----------------------------------------------------------------------------
+
+static const uint8_t plane_to_code_lut[128] = {
+ 96,   73,  55,  39,  23,  13,   5,  1,  255, 255, 255, 255, 255, 255, 255, 255,
+ 101,  78,  58,  42,  26,  16,   8,  2,    0,   3,  9,   17,  27,  43,  59,  79,
+ 102,  86,  62,  46,  32,  20,  10,  6,    4,   7,  11,  21,  33,  47,  63,  87,
+ 105,  90,  70,  52,  37,  28,  18,  14,  12,  15,  19,  29,  38,  53,  71,  91,
+ 110,  99,  82,  66,  48,  35,  30,  24,  22,  25,  31,  36,  49,  67,  83, 100,
+ 115, 108,  94,  76,  64,  50,  44,  40,  34,  41,  45,  51,  65,  77,  95, 109,
+ 118, 113, 103,  92,  80,  68,  60,  56,  54,  57,  61,  69,  81,  93, 104, 114,
+ 119, 116, 111, 106,  97,  88,  84,  74,  72,  75,  85,  89,  98, 107, 112, 117
+};
+
+static int DistanceToPlaneCode(int xsize, int dist) {
+  const int yoffset = dist / xsize;
+  const int xoffset = dist - yoffset * xsize;
+  if (xoffset <= 8 && yoffset < 8) {
+    return plane_to_code_lut[yoffset * 16 + 8 - xoffset] + 1;
+  } else if (xoffset > xsize - 8 && yoffset < 7) {
+    return plane_to_code_lut[(yoffset + 1) * 16 + 8 + (xsize - xoffset)] + 1;
+  }
+  return dist + 120;
+}
+
+static WEBP_INLINE int FindMatchLength(const uint32_t* const array1,
+                                       const uint32_t* const array2,
+                                       const int max_limit) {
+  int match_len = 0;
+  while (match_len < max_limit && array1[match_len] == array2[match_len]) {
+    ++match_len;
+  }
+  return match_len;
+}
+
+// -----------------------------------------------------------------------------
+//  VP8LBackwardRefs
+
+void VP8LInitBackwardRefs(VP8LBackwardRefs* const refs) {
+  if (refs != NULL) {
+    refs->refs = NULL;
+    refs->size = 0;
+    refs->max_size = 0;
+  }
+}
+
+void VP8LClearBackwardRefs(VP8LBackwardRefs* const refs) {
+  if (refs != NULL) {
+    free(refs->refs);
+    VP8LInitBackwardRefs(refs);
+  }
+}
+
+int VP8LBackwardRefsAlloc(VP8LBackwardRefs* const refs, int max_size) {
+  assert(refs != NULL);
+  refs->size = 0;
+  refs->max_size = 0;
+  refs->refs = (PixOrCopy*)WebPSafeMalloc((uint64_t)max_size,
+                                          sizeof(*refs->refs));
+  if (refs->refs == NULL) return 0;
+  refs->max_size = max_size;
+  return 1;
+}
+
+// -----------------------------------------------------------------------------
+// Hash chains
+
+static WEBP_INLINE uint64_t GetPixPairHash64(const uint32_t* const argb) {
+  uint64_t key = ((uint64_t)(argb[1]) << 32) | argb[0];
+  key = (key * HASH_MULTIPLIER) >> (64 - HASH_BITS);
+  return key;
+}
+
+static int HashChainInit(HashChain* const p, int size) {
+  int i;
+  p->chain_ = (int*)WebPSafeMalloc((uint64_t)size, sizeof(*p->chain_));
+  if (p->chain_ == NULL) {
+    return 0;
+  }
+  for (i = 0; i < size; ++i) {
+    p->chain_[i] = -1;
+  }
+  for (i = 0; i < HASH_SIZE; ++i) {
+    p->hash_to_first_index_[i] = -1;
+  }
+  return 1;
+}
+
+static void HashChainDelete(HashChain* const p) {
+  if (p != NULL) {
+    free(p->chain_);
+    free(p);
+  }
+}
+
+// Insertion of two pixels at a time.
+static void HashChainInsert(HashChain* const p,
+                            const uint32_t* const argb, int pos) {
+  const uint64_t hash_code = GetPixPairHash64(argb);
+  p->chain_[pos] = p->hash_to_first_index_[hash_code];
+  p->hash_to_first_index_[hash_code] = pos;
+}
+
+static int HashChainFindCopy(const HashChain* const p,
+                             int quality, int index, int xsize,
+                             const uint32_t* const argb, int maxlen,
+                             int* const distance_ptr,
+                             int* const length_ptr) {
+  const uint64_t hash_code = GetPixPairHash64(&argb[index]);
+  int prev_length = 0;
+  int64_t best_val = 0;
+  int best_length = 0;
+  int best_distance = 0;
+  const uint32_t* const argb_start = argb + index;
+  const int iter_min_mult = (quality < 50) ? 2 : (quality < 75) ? 4 : 8;
+  const int iter_min = -quality * iter_min_mult;
+  int iter_cnt = 10 + (quality >> 1);
+  const int min_pos = (index > WINDOW_SIZE) ? index - WINDOW_SIZE : 0;
+  int pos;
+
+  assert(xsize > 0);
+  for (pos = p->hash_to_first_index_[hash_code];
+       pos >= min_pos;
+       pos = p->chain_[pos]) {
+    int64_t val;
+    int curr_length;
+    if (iter_cnt < 0) {
+      if (iter_cnt < iter_min || best_val >= 0xff0000) {
+        break;
+      }
+    }
+    --iter_cnt;
+    if (best_length != 0 &&
+        argb[pos + best_length - 1] != argb_start[best_length - 1]) {
+      continue;
+    }
+    curr_length = FindMatchLength(argb + pos, argb_start, maxlen);
+    if (curr_length < prev_length) {
+      continue;
+    }
+    val = 65536 * curr_length;
+    // Favoring 2d locality here gives savings for certain images.
+    if (index - pos < 9 * xsize) {
+      const int y = (index - pos) / xsize;
+      int x = (index - pos) % xsize;
+      if (x > xsize / 2) {
+        x = xsize - x;
+      }
+      if (x <= 7 && x >= -8) {
+        val -= y * y + x * x;
+      } else {
+        val -= 9 * 9 + 9 * 9;
+      }
+    } else {
+      val -= 9 * 9 + 9 * 9;
+    }
+    if (best_val < val) {
+      prev_length = curr_length;
+      best_val = val;
+      best_length = curr_length;
+      best_distance = index - pos;
+      if (curr_length >= MAX_LENGTH) {
+        break;
+      }
+      if ((best_distance == 1 || best_distance == xsize) &&
+          best_length >= 128) {
+        break;
+      }
+    }
+  }
+  *distance_ptr = best_distance;
+  *length_ptr = best_length;
+  return (best_length >= MIN_LENGTH);
+}
+
+static WEBP_INLINE void PushBackCopy(VP8LBackwardRefs* const refs, int length) {
+  int size = refs->size;
+  while (length >= MAX_LENGTH) {
+    refs->refs[size++] = PixOrCopyCreateCopy(1, MAX_LENGTH);
+    length -= MAX_LENGTH;
+  }
+  if (length > 0) {
+    refs->refs[size++] = PixOrCopyCreateCopy(1, length);
+  }
+  refs->size = size;
+}
+
+static void BackwardReferencesRle(int xsize, int ysize,
+                                  const uint32_t* const argb,
+                                  VP8LBackwardRefs* const refs) {
+  const int pix_count = xsize * ysize;
+  int match_len = 0;
+  int i;
+  refs->size = 0;
+  PushBackCopy(refs, match_len);    // i=0 case
+  refs->refs[refs->size++] = PixOrCopyCreateLiteral(argb[0]);
+  for (i = 1; i < pix_count; ++i) {
+    if (argb[i] == argb[i - 1]) {
+      ++match_len;
+    } else {
+      PushBackCopy(refs, match_len);
+      match_len = 0;
+      refs->refs[refs->size++] = PixOrCopyCreateLiteral(argb[i]);
+    }
+  }
+  PushBackCopy(refs, match_len);
+}
+
+static int BackwardReferencesHashChain(int xsize, int ysize,
+                                       const uint32_t* const argb,
+                                       int cache_bits, int quality,
+                                       VP8LBackwardRefs* const refs) {
+  int i;
+  int ok = 0;
+  int cc_init = 0;
+  const int use_color_cache = (cache_bits > 0);
+  const int pix_count = xsize * ysize;
+  HashChain* const hash_chain = (HashChain*)malloc(sizeof(*hash_chain));
+  VP8LColorCache hashers;
+
+  if (hash_chain == NULL) return 0;
+  if (use_color_cache) {
+    cc_init = VP8LColorCacheInit(&hashers, cache_bits);
+    if (!cc_init) goto Error;
+  }
+
+  if (!HashChainInit(hash_chain, pix_count)) goto Error;
+
+  refs->size = 0;
+  for (i = 0; i < pix_count; ) {
+    // Alternative#1: Code the pixels starting at 'i' using backward reference.
+    int offset = 0;
+    int len = 0;
+    if (i < pix_count - 1) {  // FindCopy(i,..) reads pixels at [i] and [i + 1].
+      int maxlen = pix_count - i;
+      if (maxlen > MAX_LENGTH) {
+        maxlen = MAX_LENGTH;
+      }
+      HashChainFindCopy(hash_chain, quality, i, xsize, argb, maxlen,
+                        &offset, &len);
+    }
+    if (len >= MIN_LENGTH) {
+      // Alternative#2: Insert the pixel at 'i' as literal, and code the
+      // pixels starting at 'i + 1' using backward reference.
+      int offset2 = 0;
+      int len2 = 0;
+      int k;
+      HashChainInsert(hash_chain, &argb[i], i);
+      if (i < pix_count - 2) {  // FindCopy(i+1,..) reads [i + 1] and [i + 2].
+        int maxlen = pix_count - (i + 1);
+        if (maxlen > MAX_LENGTH) {
+          maxlen = MAX_LENGTH;
+        }
+        HashChainFindCopy(hash_chain, quality,
+                          i + 1, xsize, argb, maxlen, &offset2, &len2);
+        if (len2 > len + 1) {
+          const uint32_t pixel = argb[i];
+          // Alternative#2 is a better match. So push pixel at 'i' as literal.
+          if (use_color_cache && VP8LColorCacheContains(&hashers, pixel)) {
+            const int ix = VP8LColorCacheGetIndex(&hashers, pixel);
+            refs->refs[refs->size] = PixOrCopyCreateCacheIdx(ix);
+          } else {
+            refs->refs[refs->size] = PixOrCopyCreateLiteral(pixel);
+          }
+          ++refs->size;
+          if (use_color_cache) VP8LColorCacheInsert(&hashers, pixel);
+          i++;  // Backward reference to be done for next pixel.
+          len = len2;
+          offset = offset2;
+        }
+      }
+      if (len >= MAX_LENGTH) {
+        len = MAX_LENGTH - 1;
+      }
+      refs->refs[refs->size++] = PixOrCopyCreateCopy(offset, len);
+      if (use_color_cache) {
+        for (k = 0; k < len; ++k) {
+          VP8LColorCacheInsert(&hashers, argb[i + k]);
+        }
+      }
+      // Add to the hash_chain (but cannot add the last pixel).
+      {
+        const int last = (len < pix_count - 1 - i) ? len : pix_count - 1 - i;
+        for (k = 1; k < last; ++k) {
+          HashChainInsert(hash_chain, &argb[i + k], i + k);
+        }
+      }
+      i += len;
+    } else {
+      const uint32_t pixel = argb[i];
+      if (use_color_cache && VP8LColorCacheContains(&hashers, pixel)) {
+        // push pixel as a PixOrCopyCreateCacheIdx pixel
+        const int ix = VP8LColorCacheGetIndex(&hashers, pixel);
+        refs->refs[refs->size] = PixOrCopyCreateCacheIdx(ix);
+      } else {
+        refs->refs[refs->size] = PixOrCopyCreateLiteral(pixel);
+      }
+      ++refs->size;
+      if (use_color_cache) VP8LColorCacheInsert(&hashers, pixel);
+      if (i + 1 < pix_count) {
+        HashChainInsert(hash_chain, &argb[i], i);
+      }
+      ++i;
+    }
+  }
+  ok = 1;
+Error:
+  if (cc_init) VP8LColorCacheClear(&hashers);
+  HashChainDelete(hash_chain);
+  return ok;
+}
+
+// -----------------------------------------------------------------------------
+
+typedef struct {
+  double alpha_[VALUES_IN_BYTE];
+  double red_[VALUES_IN_BYTE];
+  double literal_[PIX_OR_COPY_CODES_MAX];
+  double blue_[VALUES_IN_BYTE];
+  double distance_[NUM_DISTANCE_CODES];
+} CostModel;
+
+static int BackwardReferencesTraceBackwards(
+    int xsize, int ysize, int recursive_cost_model,
+    const uint32_t* const argb, int cache_bits, VP8LBackwardRefs* const refs);
+
+static void ConvertPopulationCountTableToBitEstimates(
+    int num_symbols, const int population_counts[], double output[]) {
+  int sum = 0;
+  int nonzeros = 0;
+  int i;
+  for (i = 0; i < num_symbols; ++i) {
+    sum += population_counts[i];
+    if (population_counts[i] > 0) {
+      ++nonzeros;
+    }
+  }
+  if (nonzeros <= 1) {
+    memset(output, 0, num_symbols * sizeof(*output));
+  } else {
+    const double logsum = VP8LFastLog2(sum);
+    for (i = 0; i < num_symbols; ++i) {
+      output[i] = logsum - VP8LFastLog2(population_counts[i]);
+    }
+  }
+}
+
+static int CostModelBuild(CostModel* const m, int xsize, int ysize,
+                          int recursion_level, const uint32_t* const argb,
+                          int cache_bits) {
+  int ok = 0;
+  VP8LHistogram histo;
+  VP8LBackwardRefs refs;
+  const int quality = 100;
+
+  if (!VP8LBackwardRefsAlloc(&refs, xsize * ysize)) goto Error;
+
+  if (recursion_level > 0) {
+    if (!BackwardReferencesTraceBackwards(xsize, ysize, recursion_level - 1,
+                                          argb, cache_bits, &refs)) {
+      goto Error;
+    }
+  } else {
+    if (!BackwardReferencesHashChain(xsize, ysize, argb, cache_bits, quality,
+                                     &refs)) {
+      goto Error;
+    }
+  }
+  VP8LHistogramCreate(&histo, &refs, cache_bits);
+  ConvertPopulationCountTableToBitEstimates(
+      VP8LHistogramNumCodes(&histo), histo.literal_, m->literal_);
+  ConvertPopulationCountTableToBitEstimates(
+      VALUES_IN_BYTE, histo.red_, m->red_);
+  ConvertPopulationCountTableToBitEstimates(
+      VALUES_IN_BYTE, histo.blue_, m->blue_);
+  ConvertPopulationCountTableToBitEstimates(
+      VALUES_IN_BYTE, histo.alpha_, m->alpha_);
+  ConvertPopulationCountTableToBitEstimates(
+      NUM_DISTANCE_CODES, histo.distance_, m->distance_);
+  ok = 1;
+
+ Error:
+  VP8LClearBackwardRefs(&refs);
+  return ok;
+}
+
+static WEBP_INLINE double GetLiteralCost(const CostModel* const m, uint32_t v) {
+  return m->alpha_[v >> 24] +
+         m->red_[(v >> 16) & 0xff] +
+         m->literal_[(v >> 8) & 0xff] +
+         m->blue_[v & 0xff];
+}
+
+static WEBP_INLINE double GetCacheCost(const CostModel* const m, uint32_t idx) {
+  const int literal_idx = VALUES_IN_BYTE + NUM_LENGTH_CODES + idx;
+  return m->literal_[literal_idx];
+}
+
+static WEBP_INLINE double GetLengthCost(const CostModel* const m,
+                                        uint32_t length) {
+  int code, extra_bits_count, extra_bits_value;
+  PrefixEncode(length, &code, &extra_bits_count, &extra_bits_value);
+  return m->literal_[VALUES_IN_BYTE + code] + extra_bits_count;
+}
+
+static WEBP_INLINE double GetDistanceCost(const CostModel* const m,
+                                          uint32_t distance) {
+  int code, extra_bits_count, extra_bits_value;
+  PrefixEncode(distance, &code, &extra_bits_count, &extra_bits_value);
+  return m->distance_[code] + extra_bits_count;
+}
+
+static int BackwardReferencesHashChainDistanceOnly(
+    int xsize, int ysize, int recursive_cost_model, const uint32_t* const argb,
+    int cache_bits, uint32_t* const dist_array) {
+  int i;
+  int ok = 0;
+  int cc_init = 0;
+  const int quality = 100;
+  const int pix_count = xsize * ysize;
+  const int use_color_cache = (cache_bits > 0);
+  double* const cost =
+      (double*)WebPSafeMalloc((uint64_t)pix_count, sizeof(*cost));
+  CostModel* cost_model = (CostModel*)malloc(sizeof(*cost_model));
+  HashChain* hash_chain = (HashChain*)malloc(sizeof(*hash_chain));
+  VP8LColorCache hashers;
+  const double mul0 = (recursive_cost_model != 0) ? 1.0 : 0.68;
+  const double mul1 = (recursive_cost_model != 0) ? 1.0 : 0.82;
+
+  if (cost == NULL || cost_model == NULL || hash_chain == NULL) goto Error;
+
+  if (!HashChainInit(hash_chain, pix_count)) goto Error;
+
+  if (use_color_cache) {
+    cc_init = VP8LColorCacheInit(&hashers, cache_bits);
+    if (!cc_init) goto Error;
+  }
+
+  if (!CostModelBuild(cost_model, xsize, ysize, recursive_cost_model, argb,
+                      cache_bits)) {
+    goto Error;
+  }
+
+  for (i = 0; i < pix_count; ++i) cost[i] = 1e100;
+
+  // We loop one pixel at a time, but store all currently best points to
+  // non-processed locations from this point.
+  dist_array[0] = 0;
+  for (i = 0; i < pix_count; ++i) {
+    double prev_cost = 0.0;
+    int shortmax;
+    if (i > 0) {
+      prev_cost = cost[i - 1];
+    }
+    for (shortmax = 0; shortmax < 2; ++shortmax) {
+      int offset = 0;
+      int len = 0;
+      if (i < pix_count - 1) {  // FindCopy reads pixels at [i] and [i + 1].
+        int maxlen = shortmax ? 2 : MAX_LENGTH;
+        if (maxlen > pix_count - i) {
+          maxlen = pix_count - i;
+        }
+        HashChainFindCopy(hash_chain, quality, i, xsize, argb, maxlen,
+                          &offset, &len);
+      }
+      if (len >= MIN_LENGTH) {
+        const int code = DistanceToPlaneCode(xsize, offset);
+        const double distance_cost =
+            prev_cost + GetDistanceCost(cost_model, code);
+        int k;
+        for (k = 1; k < len; ++k) {
+          const double cost_val =
+              distance_cost + GetLengthCost(cost_model, k);
+          if (cost[i + k] > cost_val) {
+            cost[i + k] = cost_val;
+            dist_array[i + k] = k + 1;
+          }
+        }
+        // This if is for speedup only. It roughly doubles the speed, and
+        // makes compression worse by .1 %.
+        if (len >= 128 && code < 2) {
+          // Long copy for short distances, let's skip the middle
+          // lookups for better copies.
+          // 1) insert the hashes.
+          if (use_color_cache) {
+            for (k = 0; k < len; ++k) {
+              VP8LColorCacheInsert(&hashers, argb[i + k]);
+            }
+          }
+          // 2) Add to the hash_chain (but cannot add the last pixel)
+          {
+            const int last = (len < pix_count - 1 - i) ? len
+                                                       : pix_count - 1 - i;
+            for (k = 0; k < last; ++k) {
+              HashChainInsert(hash_chain, &argb[i + k], i + k);
+            }
+          }
+          // 3) jump.
+          i += len - 1;  // for loop does ++i, thus -1 here.
+          goto next_symbol;
+        }
+      }
+    }
+    if (i < pix_count - 1) {
+      HashChainInsert(hash_chain, &argb[i], i);
+    }
+    {
+      // inserting a literal pixel
+      double cost_val = prev_cost;
+      if (use_color_cache && VP8LColorCacheContains(&hashers, argb[i])) {
+        const int ix = VP8LColorCacheGetIndex(&hashers, argb[i]);
+        cost_val += GetCacheCost(cost_model, ix) * mul0;
+      } else {
+        cost_val += GetLiteralCost(cost_model, argb[i]) * mul1;
+      }
+      if (cost[i] > cost_val) {
+        cost[i] = cost_val;
+        dist_array[i] = 1;  // only one is inserted.
+      }
+      if (use_color_cache) VP8LColorCacheInsert(&hashers, argb[i]);
+    }
+ next_symbol: ;
+  }
+  // Last pixel still to do, it can only be a single step if not reached
+  // through cheaper means already.
+  ok = 1;
+Error:
+  if (cc_init) VP8LColorCacheClear(&hashers);
+  HashChainDelete(hash_chain);
+  free(cost_model);
+  free(cost);
+  return ok;
+}
+
+static int TraceBackwards(const uint32_t* const dist_array,
+                          int dist_array_size,
+                          uint32_t** const chosen_path,
+                          int* const chosen_path_size) {
+  int i;
+  // Count how many.
+  int count = 0;
+  for (i = dist_array_size - 1; i >= 0; ) {
+    int k = dist_array[i];
+    assert(k >= 1);
+    ++count;
+    i -= k;
+  }
+  // Allocate.
+  *chosen_path_size = count;
+  *chosen_path =
+      (uint32_t*)WebPSafeMalloc((uint64_t)count, sizeof(**chosen_path));
+  if (*chosen_path == NULL) return 0;
+
+  // Write in reverse order.
+  for (i = dist_array_size - 1; i >= 0; ) {
+    int k = dist_array[i];
+    assert(k >= 1);
+    (*chosen_path)[--count] = k;
+    i -= k;
+  }
+  return 1;
+}
+
+static int BackwardReferencesHashChainFollowChosenPath(
+    int xsize, int ysize, const uint32_t* const argb, int cache_bits,
+    const uint32_t* const chosen_path, int chosen_path_size,
+    VP8LBackwardRefs* const refs) {
+  const int quality = 100;
+  const int pix_count = xsize * ysize;
+  const int use_color_cache = (cache_bits > 0);
+  int size = 0;
+  int i = 0;
+  int k;
+  int ix;
+  int ok = 0;
+  int cc_init = 0;
+  HashChain* hash_chain = (HashChain*)malloc(sizeof(*hash_chain));
+  VP8LColorCache hashers;
+
+  if (hash_chain == NULL || !HashChainInit(hash_chain, pix_count)) {
+    goto Error;
+  }
+  if (use_color_cache) {
+    cc_init = VP8LColorCacheInit(&hashers, cache_bits);
+    if (!cc_init) goto Error;
+  }
+
+  refs->size = 0;
+  for (ix = 0; ix < chosen_path_size; ++ix, ++size) {
+    int offset = 0;
+    int len = 0;
+    int maxlen = chosen_path[ix];
+    if (maxlen != 1) {
+      HashChainFindCopy(hash_chain, quality,
+                        i, xsize, argb, maxlen, &offset, &len);
+      assert(len == maxlen);
+      refs->refs[size] = PixOrCopyCreateCopy(offset, len);
+      if (use_color_cache) {
+        for (k = 0; k < len; ++k) {
+          VP8LColorCacheInsert(&hashers, argb[i + k]);
+        }
+      }
+      {
+        const int last = (len < pix_count - 1 - i) ? len : pix_count - 1 - i;
+        for (k = 0; k < last; ++k) {
+          HashChainInsert(hash_chain, &argb[i + k], i + k);
+        }
+      }
+      i += len;
+    } else {
+      if (use_color_cache && VP8LColorCacheContains(&hashers, argb[i])) {
+        // push pixel as a color cache index
+        const int idx = VP8LColorCacheGetIndex(&hashers, argb[i]);
+        refs->refs[size] = PixOrCopyCreateCacheIdx(idx);
+      } else {
+        refs->refs[size] = PixOrCopyCreateLiteral(argb[i]);
+      }
+      if (use_color_cache) VP8LColorCacheInsert(&hashers, argb[i]);
+      if (i + 1 < pix_count) {
+        HashChainInsert(hash_chain, &argb[i], i);
+      }
+      ++i;
+    }
+  }
+  assert(size <= refs->max_size);
+  refs->size = size;
+  ok = 1;
+Error:
+  if (cc_init) VP8LColorCacheClear(&hashers);
+  HashChainDelete(hash_chain);
+  return ok;
+}
+
+// Returns 1 on success.
+static int BackwardReferencesTraceBackwards(int xsize, int ysize,
+                                            int recursive_cost_model,
+                                            const uint32_t* const argb,
+                                            int cache_bits,
+                                            VP8LBackwardRefs* const refs) {
+  int ok = 0;
+  const int dist_array_size = xsize * ysize;
+  uint32_t* chosen_path = NULL;
+  int chosen_path_size = 0;
+  uint32_t* dist_array =
+      (uint32_t*)WebPSafeMalloc((uint64_t)dist_array_size, sizeof(*dist_array));
+
+  if (dist_array == NULL) goto Error;
+
+  if (!BackwardReferencesHashChainDistanceOnly(
+      xsize, ysize, recursive_cost_model, argb, cache_bits, dist_array)) {
+    goto Error;
+  }
+  if (!TraceBackwards(dist_array, dist_array_size,
+                      &chosen_path, &chosen_path_size)) {
+    goto Error;
+  }
+  free(dist_array);   // no need to retain this memory any longer
+  dist_array = NULL;
+  if (!BackwardReferencesHashChainFollowChosenPath(
+      xsize, ysize, argb, cache_bits, chosen_path, chosen_path_size, refs)) {
+    goto Error;
+  }
+  ok = 1;
+ Error:
+  free(chosen_path);
+  free(dist_array);
+  return ok;
+}
+
+static void BackwardReferences2DLocality(int xsize,
+                                         VP8LBackwardRefs* const refs) {
+  int i;
+  for (i = 0; i < refs->size; ++i) {
+    if (PixOrCopyIsCopy(&refs->refs[i])) {
+      const int dist = refs->refs[i].argb_or_distance;
+      const int transformed_dist = DistanceToPlaneCode(xsize, dist);
+      refs->refs[i].argb_or_distance = transformed_dist;
+    }
+  }
+}
+
+int VP8LGetBackwardReferences(int width, int height,
+                              const uint32_t* const argb,
+                              int quality, int cache_bits, int use_2d_locality,
+                              VP8LBackwardRefs* const best) {
+  int ok = 0;
+  int lz77_is_useful;
+  VP8LBackwardRefs refs_rle, refs_lz77;
+  const int num_pix = width * height;
+
+  VP8LBackwardRefsAlloc(&refs_rle, num_pix);
+  VP8LBackwardRefsAlloc(&refs_lz77, num_pix);
+  VP8LInitBackwardRefs(best);
+  if (refs_rle.refs == NULL || refs_lz77.refs == NULL) {
+ Error1:
+    VP8LClearBackwardRefs(&refs_rle);
+    VP8LClearBackwardRefs(&refs_lz77);
+    goto End;
+  }
+
+  if (!BackwardReferencesHashChain(width, height, argb, cache_bits, quality,
+                                   &refs_lz77)) {
+    goto End;
+  }
+  // Backward Reference using RLE only.
+  BackwardReferencesRle(width, height, argb, &refs_rle);
+
+  {
+    double bit_cost_lz77, bit_cost_rle;
+    VP8LHistogram* const histo = (VP8LHistogram*)malloc(sizeof(*histo));
+    if (histo == NULL) goto Error1;
+    // Evaluate lz77 coding
+    VP8LHistogramCreate(histo, &refs_lz77, cache_bits);
+    bit_cost_lz77 = VP8LHistogramEstimateBits(histo);
+    // Evaluate RLE coding
+    VP8LHistogramCreate(histo, &refs_rle, cache_bits);
+    bit_cost_rle = VP8LHistogramEstimateBits(histo);
+    // Decide if LZ77 is useful.
+    lz77_is_useful = (bit_cost_lz77 < bit_cost_rle);
+    free(histo);
+  }
+
+  // Choose appropriate backward reference.
+  if (lz77_is_useful) {
+    // TraceBackwards is costly. Run it for higher qualities.
+    const int try_lz77_trace_backwards = (quality >= 75);
+    *best = refs_lz77;   // default guess: lz77 is better
+    VP8LClearBackwardRefs(&refs_rle);
+    if (try_lz77_trace_backwards) {
+      const int recursion_level = (num_pix < 320 * 200) ? 1 : 0;
+      VP8LBackwardRefs refs_trace;
+      if (!VP8LBackwardRefsAlloc(&refs_trace, num_pix)) {
+        goto End;
+      }
+      if (BackwardReferencesTraceBackwards(
+          width, height, recursion_level, argb, cache_bits, &refs_trace)) {
+        VP8LClearBackwardRefs(&refs_lz77);
+        *best = refs_trace;
+      }
+    }
+  } else {
+    VP8LClearBackwardRefs(&refs_lz77);
+    *best = refs_rle;
+  }
+
+  if (use_2d_locality) BackwardReferences2DLocality(width, best);
+
+  ok = 1;
+
+ End:
+  if (!ok) {
+    VP8LClearBackwardRefs(best);
+  }
+  return ok;
+}
+
+// Returns 1 on success.
+static int ComputeCacheHistogram(const uint32_t* const argb,
+                                 int xsize, int ysize,
+                                 const VP8LBackwardRefs* const refs,
+                                 int cache_bits,
+                                 VP8LHistogram* const histo) {
+  int pixel_index = 0;
+  int i;
+  uint32_t k;
+  VP8LColorCache hashers;
+  const int use_color_cache = (cache_bits > 0);
+  int cc_init = 0;
+
+  if (use_color_cache) {
+    cc_init = VP8LColorCacheInit(&hashers, cache_bits);
+    if (!cc_init) return 0;
+  }
+
+  for (i = 0; i < refs->size; ++i) {
+    const PixOrCopy* const v = &refs->refs[i];
+    if (PixOrCopyIsLiteral(v)) {
+      if (use_color_cache &&
+          VP8LColorCacheContains(&hashers, argb[pixel_index])) {
+        // push pixel as a cache index
+        const int ix = VP8LColorCacheGetIndex(&hashers, argb[pixel_index]);
+        const PixOrCopy token = PixOrCopyCreateCacheIdx(ix);
+        VP8LHistogramAddSinglePixOrCopy(histo, &token);
+      } else {
+        VP8LHistogramAddSinglePixOrCopy(histo, v);
+      }
+    } else {
+      VP8LHistogramAddSinglePixOrCopy(histo, v);
+    }
+    if (use_color_cache) {
+      for (k = 0; k < PixOrCopyLength(v); ++k) {
+        VP8LColorCacheInsert(&hashers, argb[pixel_index + k]);
+      }
+    }
+    pixel_index += PixOrCopyLength(v);
+  }
+  assert(pixel_index == xsize * ysize);
+  (void)xsize;  // xsize is not used in non-debug compilations otherwise.
+  (void)ysize;  // ysize is not used in non-debug compilations otherwise.
+  if (cc_init) VP8LColorCacheClear(&hashers);
+  return 1;
+}
+
+// Returns how many bits are to be used for a color cache.
+int VP8LCalculateEstimateForCacheSize(const uint32_t* const argb,
+                                      int xsize, int ysize,
+                                      int* const best_cache_bits) {
+  int ok = 0;
+  int cache_bits;
+  double lowest_entropy = 1e99;
+  VP8LBackwardRefs refs;
+  static const double kSmallPenaltyForLargeCache = 4.0;
+  static const int quality = 30;
+  if (!VP8LBackwardRefsAlloc(&refs, xsize * ysize) ||
+      !BackwardReferencesHashChain(xsize, ysize, argb, 0, quality, &refs)) {
+    goto Error;
+  }
+  for (cache_bits = 0; cache_bits <= MAX_COLOR_CACHE_BITS; ++cache_bits) {
+    double cur_entropy;
+    VP8LHistogram histo;
+    VP8LHistogramInit(&histo, cache_bits);
+    ComputeCacheHistogram(argb, xsize, ysize, &refs, cache_bits, &histo);
+    cur_entropy = VP8LHistogramEstimateBits(&histo) +
+        kSmallPenaltyForLargeCache * cache_bits;
+    if (cache_bits == 0 || cur_entropy < lowest_entropy) {
+      *best_cache_bits = cache_bits;
+      lowest_entropy = cur_entropy;
+    }
+  }
+  ok = 1;
+ Error:
+  VP8LClearBackwardRefs(&refs);
+  return ok;
+}
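
DistanceToPlaneCode() above rewards 2-D locality: a backward distance whose (dx, dy) offset lands in a small neighbourhood of the current pixel is folded into one of the 120 short codes reserved at the front of the distance alphabet (via plane_to_code_lut), while every other distance is simply shifted up by 120. The standalone sketch below only reproduces the near/far decomposition; the 128-entry lookup table is deliberately elided, so near distances are classified without printing their exact short code.

    /* Sketch of the distance decomposition behind DistanceToPlaneCode()      */
    /* above. The plane_to_code_lut[] mapping is elided, so "near" distances  */
    /* are only reported as such instead of being converted to their code.    */
    #include <stdio.h>

    static void Classify(int xsize, int dist) {
      const int dy = dist / xsize;
      const int dx = dist - dy * xsize;
      if ((dx <= 8 && dy < 8) || (dx > xsize - 8 && dy < 7)) {
        printf("dist=%5d -> near neighbour (dx=%d, dy=%d): short plane code\n",
               dist, dx, dy);
      } else {
        printf("dist=%5d -> far: code = dist + 120 = %d\n", dist, dist + 120);
      }
    }

    int main(void) {
      const int xsize = 100;           /* arbitrary image width */
      Classify(xsize, 1);              /* pixel just to the left        */
      Classify(xsize, xsize);          /* pixel directly above          */
      Classify(xsize, 50 * xsize);     /* 50 rows up: no locality bonus */
      return 0;
    }
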
diff --git a/src/enc/backward_references.h b/src/enc/backward_references.h
new file mode 100644
index 0000000..cda7c2b
--- /dev/null
+++ b/src/enc/backward_references.h
@@ -0,0 +1,212 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// Author: Jyrki Alakuijala (jyrki@google.com)
+//
+
+#ifndef WEBP_ENC_BACKWARD_REFERENCES_H_
+#define WEBP_ENC_BACKWARD_REFERENCES_H_
+
+#include <assert.h>
+#include <stdlib.h>
+#include "webp/types.h"
+#include "webp/format_constants.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+// The spec allows 11, we use 9 bits to reduce memory consumption in encoding.
+// Having 9 instead of 11 only removes about 0.25 % of compression density.
+#define MAX_COLOR_CACHE_BITS 9
+
+// Max ever number of codes we'll use:
+#define PIX_OR_COPY_CODES_MAX \
+    (NUM_LITERAL_CODES + NUM_LENGTH_CODES + (1 << MAX_COLOR_CACHE_BITS))
+
+// -----------------------------------------------------------------------------
+// PrefixEncode()
+
+// use GNU builtins where available.
+#if defined(__GNUC__) && \
+    ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || __GNUC__ >= 4)
+static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
+  return n == 0 ? -1 : 31 ^ __builtin_clz(n);
+}
+#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
+#include <intrin.h>
+#pragma intrinsic(_BitScanReverse)
+
+static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
+  unsigned long first_set_bit;
+  return _BitScanReverse(&first_set_bit, n) ? first_set_bit : -1;
+}
+#else
+static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
+  int log = 0;
+  uint32_t value = n;
+  int i;
+
+  if (value == 0) return -1;
+  for (i = 4; i >= 0; --i) {
+    const int shift = (1 << i);
+    const uint32_t x = value >> shift;
+    if (x != 0) {
+      value = x;
+      log += shift;
+    }
+  }
+  return log;
+}
+#endif
+
+static WEBP_INLINE int VP8LBitsLog2Ceiling(uint32_t n) {
+  const int floor = BitsLog2Floor(n);
+  if (n == (n & ~(n - 1)))  // zero or a power of two.
+    return floor;
+  else
+    return floor + 1;
+}
+
+// Splitting of distance and length codes into prefixes and
+// extra bits. The prefixes are encoded with an entropy code
+// while the extra bits are stored just as normal bits.
+static WEBP_INLINE void PrefixEncode(int distance, int* const code,
+                                     int* const extra_bits_count,
+                                     int* const extra_bits_value) {
+  // Collect the two most significant bits where the highest bit is 1.
+  const int highest_bit = BitsLog2Floor(--distance);
+  // & 0x3f is to make behavior well defined when highest_bit
+  // does not exist or is the least significant bit.
+  const int second_highest_bit =
+      (distance >> ((highest_bit - 1) & 0x3f)) & 1;
+  *extra_bits_count = (highest_bit > 0) ? (highest_bit - 1) : 0;
+  *extra_bits_value = distance & ((1 << *extra_bits_count) - 1);
+  *code = (highest_bit > 0) ? (2 * highest_bit + second_highest_bit)
+                            : (highest_bit == 0) ? 1 : 0;
+}
+
+// -----------------------------------------------------------------------------
+// PixOrCopy
+
+enum Mode {
+  kLiteral,
+  kCacheIdx,
+  kCopy,
+  kNone
+};
+
+typedef struct {
+  // mode as uint8_t to make the memory layout to be exactly 8 bytes.
+  uint8_t mode;
+  uint16_t len;
+  uint32_t argb_or_distance;
+} PixOrCopy;
+
+static WEBP_INLINE PixOrCopy PixOrCopyCreateCopy(uint32_t distance,
+                                                 uint16_t len) {
+  PixOrCopy retval;
+  retval.mode = kCopy;
+  retval.argb_or_distance = distance;
+  retval.len = len;
+  return retval;
+}
+
+static WEBP_INLINE PixOrCopy PixOrCopyCreateCacheIdx(int idx) {
+  PixOrCopy retval;
+  assert(idx >= 0);
+  assert(idx < (1 << MAX_COLOR_CACHE_BITS));
+  retval.mode = kCacheIdx;
+  retval.argb_or_distance = idx;
+  retval.len = 1;
+  return retval;
+}
+
+static WEBP_INLINE PixOrCopy PixOrCopyCreateLiteral(uint32_t argb) {
+  PixOrCopy retval;
+  retval.mode = kLiteral;
+  retval.argb_or_distance = argb;
+  retval.len = 1;
+  return retval;
+}
+
+static WEBP_INLINE int PixOrCopyIsLiteral(const PixOrCopy* const p) {
+  return (p->mode == kLiteral);
+}
+
+static WEBP_INLINE int PixOrCopyIsCacheIdx(const PixOrCopy* const p) {
+  return (p->mode == kCacheIdx);
+}
+
+static WEBP_INLINE int PixOrCopyIsCopy(const PixOrCopy* const p) {
+  return (p->mode == kCopy);
+}
+
+static WEBP_INLINE uint32_t PixOrCopyLiteral(const PixOrCopy* const p,
+                                             int component) {
+  assert(p->mode == kLiteral);
+  return (p->argb_or_distance >> (component * 8)) & 0xff;
+}
+
+static WEBP_INLINE uint32_t PixOrCopyLength(const PixOrCopy* const p) {
+  return p->len;
+}
+
+static WEBP_INLINE uint32_t PixOrCopyArgb(const PixOrCopy* const p) {
+  assert(p->mode == kLiteral);
+  return p->argb_or_distance;
+}
+
+static WEBP_INLINE uint32_t PixOrCopyCacheIdx(const PixOrCopy* const p) {
+  assert(p->mode == kCacheIdx);
+  assert(p->argb_or_distance < (1U << MAX_COLOR_CACHE_BITS));
+  return p->argb_or_distance;
+}
+
+static WEBP_INLINE uint32_t PixOrCopyDistance(const PixOrCopy* const p) {
+  assert(p->mode == kCopy);
+  return p->argb_or_distance;
+}
+
+// -----------------------------------------------------------------------------
+// VP8LBackwardRefs
+
+typedef struct {
+  PixOrCopy* refs;
+  int size;      // currently used
+  int max_size;  // maximum capacity
+} VP8LBackwardRefs;
+
+// Initialize the object. Must be called first. 'refs' can be NULL.
+void VP8LInitBackwardRefs(VP8LBackwardRefs* const refs);
+
+// Release memory and re-initialize the object. 'refs' can be NULL.
+void VP8LClearBackwardRefs(VP8LBackwardRefs* const refs);
+
+// Allocate 'max_size' references. Returns false in case of memory error.
+int VP8LBackwardRefsAlloc(VP8LBackwardRefs* const refs, int max_size);
+
+// -----------------------------------------------------------------------------
+// Main entry points
+
+// Evaluates best possible backward references for specified quality.
+// Further optimize for 2D locality if use_2d_locality flag is set.
+int VP8LGetBackwardReferences(int width, int height,
+                              const uint32_t* const argb,
+                              int quality, int cache_bits, int use_2d_locality,
+                              VP8LBackwardRefs* const best);
+
+// Produce an estimate for a good color cache size for the image.
+int VP8LCalculateEstimateForCacheSize(const uint32_t* const argb,
+                                      int xsize, int ysize,
+                                      int* const best_cache_bits);
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}
+#endif
+
+#endif  // WEBP_ENC_BACKWARD_REFERENCES_H_
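
For orientation (not part of the patch itself), the following standalone sketch
copies the portable BitsLog2Floor() fallback and PrefixEncode() above and prints
how a few arbitrary sample distances split into a prefix code plus raw extra bits:

  #include <stdio.h>
  #include <stdint.h>

  static int BitsLog2Floor(uint32_t n) {        // portable fallback from above
    int log = 0, i;
    uint32_t value = n;
    if (value == 0) return -1;
    for (i = 4; i >= 0; --i) {
      const int shift = (1 << i);
      const uint32_t x = value >> shift;
      if (x != 0) {
        value = x;
        log += shift;
      }
    }
    return log;
  }

  static void PrefixEncode(int distance, int* const code,
                           int* const extra_bits_count,
                           int* const extra_bits_value) {
    const int highest_bit = BitsLog2Floor(--distance);
    const int second_highest_bit =
        (distance >> ((highest_bit - 1) & 0x3f)) & 1;
    *extra_bits_count = (highest_bit > 0) ? (highest_bit - 1) : 0;
    *extra_bits_value = distance & ((1 << *extra_bits_count) - 1);
    *code = (highest_bit > 0) ? (2 * highest_bit + second_highest_bit)
                              : (highest_bit == 0) ? 1 : 0;
  }

  int main(void) {
    const int samples[] = { 1, 2, 5, 12, 70, 1025 };
    int i;
    for (i = 0; i < (int)(sizeof(samples) / sizeof(samples[0])); ++i) {
      int code, nbits, bits;
      PrefixEncode(samples[i], &code, &nbits, &bits);
      printf("distance %4d -> prefix code %2d, %d extra bit(s) = %d\n",
             samples[i], code, nbits, bits);
    }
    return 0;
  }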
diff --git a/src/enc/bit_writer.c b/src/enc/bit_writer.c
deleted file mode 100644
index 24bbd33..0000000
--- a/src/enc/bit_writer.c
+++ /dev/null
@@ -1,185 +0,0 @@
-// Copyright 2011 Google Inc.
-//
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
-// -----------------------------------------------------------------------------
-//
-// Bit writing and boolean coder
-//
-// Author: Skal (pascal.massimino@gmail.com)
-
-#include <assert.h>
-#include <stdlib.h>
-#include "vp8enci.h"
-
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
-//-----------------------------------------------------------------------------
-// VP8BitWriter
-
-static int BitWriterResize(VP8BitWriter* const bw, size_t extra_size) {
-  uint8_t* new_buf;
-  size_t new_size;
-  const size_t needed_size = bw->pos_ + extra_size;
-  if (needed_size <= bw->max_pos_) return 1;
-  new_size = 2 * bw->max_pos_;
-  if (new_size < needed_size)
-    new_size = needed_size;
-  if (new_size < 1024) new_size = 1024;
-  new_buf = (uint8_t*)malloc(new_size);
-  if (new_buf == NULL) {
-    bw->error_ = 1;
-    return 0;
-  }
-  if (bw->pos_ > 0) memcpy(new_buf, bw->buf_, bw->pos_);
-  free(bw->buf_);
-  bw->buf_ = new_buf;
-  bw->max_pos_ = new_size;
-  return 1;
-}
-
-static void kFlush(VP8BitWriter* const bw) {
-  const int s = 8 + bw->nb_bits_;
-  const int32_t bits = bw->value_ >> s;
-  assert(bw->nb_bits_ >= 0);
-  bw->value_ -= bits << s;
-  bw->nb_bits_ -= 8;
-  if ((bits & 0xff) != 0xff) {
-    size_t pos = bw->pos_;
-    if (pos + bw->run_ >= bw->max_pos_) {  // reallocate
-      if (!BitWriterResize(bw,  bw->run_ + 1)) {
-        return;
-      }
-    }
-    if (bits & 0x100) {  // overflow -> propagate carry over pending 0xff's
-      if (pos > 0) bw->buf_[pos - 1]++;
-    }
-    if (bw->run_ > 0) {
-      const int value = (bits & 0x100) ? 0x00 : 0xff;
-      for (; bw->run_ > 0; --bw->run_) bw->buf_[pos++] = value;
-    }
-    bw->buf_[pos++] = bits;
-    bw->pos_ = pos;
-  } else {
-    bw->run_++;   // delay writing of bytes 0xff, pending eventual carry.
-  }
-}
-
-//-----------------------------------------------------------------------------
-// renormalization
-
-static const uint8_t kNorm[128] = {  // renorm_sizes[i] = 8 - log2(i)
-     7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
-  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  0
-};
-
-// range = ((range + 1) << kVP8Log2Range[range]) - 1
-const uint8_t kNewRange[128] = {
-  127, 127, 191, 127, 159, 191, 223, 127, 143, 159, 175, 191, 207, 223, 239,
-  127, 135, 143, 151, 159, 167, 175, 183, 191, 199, 207, 215, 223, 231, 239,
-  247, 127, 131, 135, 139, 143, 147, 151, 155, 159, 163, 167, 171, 175, 179,
-  183, 187, 191, 195, 199, 203, 207, 211, 215, 219, 223, 227, 231, 235, 239,
-  243, 247, 251, 127, 129, 131, 133, 135, 137, 139, 141, 143, 145, 147, 149,
-  151, 153, 155, 157, 159, 161, 163, 165, 167, 169, 171, 173, 175, 177, 179,
-  181, 183, 185, 187, 189, 191, 193, 195, 197, 199, 201, 203, 205, 207, 209,
-  211, 213, 215, 217, 219, 221, 223, 225, 227, 229, 231, 233, 235, 237, 239,
-  241, 243, 245, 247, 249, 251, 253, 127
-};
-
-int VP8PutBit(VP8BitWriter* const bw, int bit, int prob) {
-  const int split = (bw->range_ * prob) >> 8;
-  if (bit) {
-    bw->value_ += split + 1;
-    bw->range_ -= split + 1;
-  } else {
-    bw->range_ = split;
-  }
-  if (bw->range_ < 127) {   // emit 'shift' bits out and renormalize
-    const int shift = kNorm[bw->range_];
-    bw->range_ = kNewRange[bw->range_];
-    bw->value_ <<= shift;
-    bw->nb_bits_ += shift;
-    if (bw->nb_bits_ > 0) kFlush(bw);
-  }
-  return bit;
-}
-
-int VP8PutBitUniform(VP8BitWriter* const bw, int bit) {
-  const int split = bw->range_ >> 1;
-  if (bit) {
-    bw->value_ += split + 1;
-    bw->range_ -= split + 1;
-  } else {
-    bw->range_ = split;
-  }
-  if (bw->range_ < 127) {
-    bw->range_ = kNewRange[bw->range_];
-    bw->value_ <<= 1;
-    bw->nb_bits_ += 1;
-    if (bw->nb_bits_ > 0) kFlush(bw);
-  }
-  return bit;
-}
-
-void VP8PutValue(VP8BitWriter* const bw, int value, int nb_bits) {
-  int mask;
-  for (mask = 1 << (nb_bits - 1); mask; mask >>= 1)
-    VP8PutBitUniform(bw, value & mask);
-}
-
-void VP8PutSignedValue(VP8BitWriter* const bw, int value, int nb_bits) {
-  if (!VP8PutBitUniform(bw, value != 0))
-    return;
-  if (value < 0) {
-    VP8PutValue(bw, ((-value) << 1) | 1, nb_bits + 1);
-  } else {
-    VP8PutValue(bw, value << 1, nb_bits + 1);
-  }
-}
-
-//-----------------------------------------------------------------------------
-
-int VP8BitWriterInit(VP8BitWriter* const bw, size_t expected_size) {
-  bw->range_   = 255 - 1;
-  bw->value_   = 0;
-  bw->run_     = 0;
-  bw->nb_bits_ = -8;
-  bw->pos_     = 0;
-  bw->max_pos_ = 0;
-  bw->error_   = 0;
-  bw->buf_     = NULL;
-  return (expected_size > 0) ? BitWriterResize(bw, expected_size) : 1;
-}
-
-uint8_t* VP8BitWriterFinish(VP8BitWriter* const bw) {
-  VP8PutValue(bw, 0, 9 - bw->nb_bits_);
-  bw->nb_bits_ = 0;   // pad with zeroes
-  kFlush(bw);
-  return bw->buf_;
-}
-
-int VP8BitWriterAppend(VP8BitWriter* const bw,
-                       const uint8_t* data, size_t size) {
-  assert(data);
-  if (bw->nb_bits_ != -8) return 0;   // kFlush() must have been called
-  if (!BitWriterResize(bw, size)) return 0;
-  memcpy(bw->buf_ + bw->pos_, data, size);
-  bw->pos_ += size;
-  return 1;
-}
-
-//-----------------------------------------------------------------------------
-
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
diff --git a/src/enc/bit_writer.h b/src/enc/bit_writer.h
deleted file mode 100644
index 69e247a..0000000
--- a/src/enc/bit_writer.h
+++ /dev/null
@@ -1,63 +0,0 @@
-// Copyright 2011 Google Inc.
-//
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
-// -----------------------------------------------------------------------------
-//
-// Bit writing and boolean coder
-//
-// Author: Skal (pascal.massimino@gmail.com)
-
-#ifndef WEBP_ENC_BIT_WRITER_H_
-#define WEBP_ENC_BIT_WRITER_H_
-
-#include "vp8enci.h"
-
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
-//-----------------------------------------------------------------------------
-// Bit-writing
-
-typedef struct VP8BitWriter VP8BitWriter;
-struct VP8BitWriter {
-  int32_t  range_;      // range-1
-  int32_t  value_;
-  int      run_;        // number of outstanding bits
-  int      nb_bits_;    // number of pending bits
-  uint8_t* buf_;
-  size_t   pos_;
-  size_t   max_pos_;
-  int      error_;      // true in case of error
-};
-
-int VP8BitWriterInit(VP8BitWriter* const bw, size_t expected_size);
-uint8_t* VP8BitWriterFinish(VP8BitWriter* const bw);
-int VP8PutBit(VP8BitWriter* const bw, int bit, int prob);
-int VP8PutBitUniform(VP8BitWriter* const bw, int bit);
-void VP8PutValue(VP8BitWriter* const bw, int value, int nb_bits);
-void VP8PutSignedValue(VP8BitWriter* const bw, int value, int nb_bits);
-int VP8BitWriterAppend(VP8BitWriter* const bw,
-                       const uint8_t* data, size_t size);
-
-// return approximate write position (in bits)
-static inline uint64_t VP8BitWriterPos(const VP8BitWriter* const bw) {
-  return (uint64_t)(bw->pos_ + bw->run_) * 8 + 8 + bw->nb_bits_;
-}
-
-static inline uint8_t* VP8BitWriterBuf(const VP8BitWriter* const bw) {
-  return bw->buf_;
-}
-static inline size_t VP8BitWriterSize(const VP8BitWriter* const bw) {
-  return bw->pos_;
-}
-
-//-----------------------------------------------------------------------------
-
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
-
-#endif  // WEBP_ENC_BIT_WRITER_H_
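
The removed header above is self-describing; purely as an illustration (not part
of this patch), a typical write sequence with that boolean coder looks roughly
like the sketch below, assuming the VP8BitWriter declarations are still in scope:

  #include <stdio.h>
  #include <stdlib.h>

  int WriteDemo(void) {
    VP8BitWriter bw;
    uint8_t* buf;
    if (!VP8BitWriterInit(&bw, 64)) return 0;  // pre-size the buffer (~64 bytes)
    VP8PutBitUniform(&bw, 1);                  // one bit with probability 1/2
    VP8PutValue(&bw, 0x2a, 7);                 // 7 raw bits, MSB first
    VP8PutSignedValue(&bw, -3, 4);             // zero-flag + magnitude + sign
    buf = VP8BitWriterFinish(&bw);             // pad with zeroes and flush
    if (bw.error_) {                           // allocation failed while writing
      free(bw.buf_);
      return 0;
    }
    printf("wrote %u byte(s)\n", (unsigned)VP8BitWriterSize(&bw));
    free(buf);                                 // buffer was malloc()'ed
    return 1;
  }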
diff --git a/src/enc/config.c b/src/enc/config.c
index 0a1ccbb..b05328d 100644
--- a/src/enc/config.c
+++ b/src/enc/config.c
@@ -1,4 +1,4 @@
-// Copyright 2011 Google Inc.
+// Copyright 2011 Google Inc. All Rights Reserved.
 //
 // This code is licensed under the same terms as WebM:
 //  Software License Agreement:  http://www.webmproject.org/license/software/
@@ -9,20 +9,19 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include <assert.h>
 #include "webp/encode.h"
 
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // WebPConfig
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 
-int WebPConfigInitInternal(WebPConfig* const config,
+int WebPConfigInitInternal(WebPConfig* config,
                            WebPPreset preset, float quality, int version) {
-  if (version != WEBP_ENCODER_ABI_VERSION) {
+  if (WEBP_ABI_IS_INCOMPATIBLE(version, WEBP_ENCODER_ABI_VERSION)) {
     return 0;   // caller/system version mismatch!
   }
   if (config == NULL) return 0;
@@ -41,7 +40,12 @@
   config->show_compressed = 0;
   config->preprocessing = 0;
   config->autofilter = 0;
-  config->alpha_compression = 0;
+  config->partition_limit = 0;
+  config->alpha_compression = 1;
+  config->alpha_filtering = 1;
+  config->alpha_quality = 100;
+  config->lossless = 0;
+  config->image_hint = WEBP_HINT_DEFAULT;
 
   // TODO(skal): tune.
   switch (preset) {
@@ -76,7 +80,7 @@
   return WebPValidateConfig(config);
 }
 
-int WebPValidateConfig(const WebPConfig* const config) {
+int WebPValidateConfig(const WebPConfig* config) {
   if (config == NULL) return 0;
   if (config->quality < 0 || config->quality > 100)
     return 0;
@@ -106,12 +110,22 @@
     return 0;
   if (config->partitions < 0 || config->partitions > 3)
     return 0;
+  if (config->partition_limit < 0 || config->partition_limit > 100)
+    return 0;
   if (config->alpha_compression < 0)
     return 0;
+  if (config->alpha_filtering < 0)
+    return 0;
+  if (config->alpha_quality < 0 || config->alpha_quality > 100)
+    return 0;
+  if (config->lossless < 0 || config->lossless > 1)
+    return 0;
+  if (config->image_hint >= WEBP_HINT_LAST)
+    return 0;
   return 1;
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
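
From the caller's side, the new fields are reached through the public
webp/encode.h API; a minimal, illustrative sketch, assuming the usual
WebPConfigPreset() wrapper around WebPConfigInitInternal():

  #include "webp/encode.h"

  int SetupLosslessConfig(WebPConfig* const config) {
    if (!WebPConfigPreset(config, WEBP_PRESET_PHOTO, 75.f)) {
      return 0;                        // NULL config or ABI mismatch
    }
    config->lossless = 1;              // new in this version
    config->image_hint = WEBP_HINT_PHOTO;
    config->alpha_quality = 90;        // must stay within [0..100]
    return WebPValidateConfig(config); // re-check the hand-edited fields
  }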
diff --git a/src/enc/cost.c b/src/enc/cost.c
index 0f7ee72..92e0cc7 100644
--- a/src/enc/cost.c
+++ b/src/enc/cost.c
@@ -1,4 +1,4 @@
-// Copyright 2011 Google Inc.
+// Copyright 2011 Google Inc. All Rights Reserved.
 //
 // This code is licensed under the same terms as WebM:
 //  Software License Agreement:  http://www.webmproject.org/license/software/
@@ -9,15 +9,13 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include <assert.h>
-
-#include "cost.h"
+#include "./cost.h"
 
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Boolean-cost cost table
 
 const uint16_t VP8EntropyCost[256] = {
@@ -49,12 +47,12 @@
     10,    9,    7,    6,    4,    3
 };
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Level cost tables
 
-// For each given level, the following table given the pattern of contexts
-// to use for coding it (in [][0]) as well as the bit value to use for
-// each context (in [][1]).
+// For each given level, the following table gives the pattern of contexts to
+// use for coding it (in [][0]) as well as the bit value to use for each
+// context (in [][1]).
 const uint16_t VP8LevelCodes[MAX_VARIABLE_LEVEL][2] = {
                   {0x001, 0x000}, {0x007, 0x001}, {0x00f, 0x005},
   {0x00f, 0x00d}, {0x033, 0x003}, {0x033, 0x003}, {0x033, 0x023},
@@ -351,11 +349,14 @@
   return cost;
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Pre-calc level costs once for all
 
 void VP8CalculateLevelCosts(VP8Proba* const proba) {
   int ctype, band, ctx;
+
+  if (!proba->dirty_) return;  // nothing to do.
+
   for (ctype = 0; ctype < NUM_TYPES; ++ctype) {
     for (band = 0; band < NUM_BANDS; ++band) {
       for(ctx = 0; ctx < NUM_CTX; ++ctx) {
@@ -372,14 +373,16 @@
       }
     }
   }
+  proba->dirty_ = 0;
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Mode cost tables.
 
 // These are the fixed probabilities (in the coding trees) turned into bit-cost
 // by calling VP8BitCost().
 const uint16_t VP8FixedCostsUV[4] = { 302, 984, 439, 642 };
+// note: these values include the fixed VP8BitCost(1, 145) mode selection cost.
 const uint16_t VP8FixedCostsI16[4] = { 663, 919, 872, 919 };
 const uint16_t VP8FixedCostsI4[NUM_BMODES][NUM_BMODES][NUM_BMODES] = {
   { {  251, 1362, 1934, 2085, 2314, 2230, 1839, 1988, 2437, 2348 },
@@ -484,7 +487,7 @@
     {  516, 1378, 1569, 1110, 1798, 1798, 1198, 2199, 1543,  712 } },
 };
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
diff --git a/src/enc/cost.h b/src/enc/cost.h
index 6b83c83..09b75b6 100644
--- a/src/enc/cost.h
+++ b/src/enc/cost.h
@@ -1,4 +1,4 @@
-// Copyright 2011 Google Inc.
+// Copyright 2011 Google Inc. All Rights Reserved.
 //
 // This code is licensed under the same terms as WebM:
 //  Software License Agreement:  http://www.webmproject.org/license/software/
@@ -12,7 +12,7 @@
 #ifndef WEBP_ENC_COST_H_
 #define WEBP_ENC_COST_H_
 
-#include "vp8enci.h"
+#include "./vp8enci.h"
 
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
@@ -22,22 +22,16 @@
 extern const uint16_t VP8EntropyCost[256];        // 8bit fixed-point log(p)
 
 // Cost of coding one event with probability 'proba'.
-static inline int VP8BitCost(int bit, uint8_t proba) {
+static WEBP_INLINE int VP8BitCost(int bit, uint8_t proba) {
   return !bit ? VP8EntropyCost[proba] : VP8EntropyCost[255 - proba];
 }
 
-// Cost of coding 'nb' 1's and 'total-nb' 0's using 'proba' probability.
-static inline uint64_t VP8BranchCost(uint64_t nb, uint64_t total,
-                                     uint8_t proba) {
-  return nb * VP8BitCost(1, proba) + (total - nb) * VP8BitCost(0, proba);
-}
-
 // Level cost calculations
 extern const uint16_t VP8LevelCodes[MAX_VARIABLE_LEVEL][2];
 void VP8CalculateLevelCosts(VP8Proba* const proba);
-static inline int VP8LevelCost(const uint16_t* const table, int level) {
+static WEBP_INLINE int VP8LevelCost(const uint16_t* const table, int level) {
   return VP8LevelFixedCosts[level]
-       + table[level > MAX_VARIABLE_LEVEL ? MAX_VARIABLE_LEVEL : level];
+       + table[(level > MAX_VARIABLE_LEVEL) ? MAX_VARIABLE_LEVEL : level];
 }
 
 // Mode costs
@@ -45,10 +39,10 @@
 extern const uint16_t VP8FixedCostsI16[4];
 extern const uint16_t VP8FixedCostsI4[NUM_BMODES][NUM_BMODES][NUM_BMODES];
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
 
-#endif  // WEBP_ENC_COST_H_
+#endif  /* WEBP_ENC_COST_H_ */
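
A note on units, since the tables above are easy to misread: the values returned
by VP8BitCost() and VP8LevelCost() are fixed-point costs with 256 units per bit
(which is why frame.c charges 8 * 256 for each transmitted 8-bit probability).
A throwaway, illustrative helper:

  // 256 fixed-point units correspond to one bit.
  static double CostToBits(int fixed_point_cost) {
    return fixed_point_cost / 256.;
  }
  // e.g. VP8FixedCostsI16[0] == 663 amounts to roughly 2.6 bits.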
diff --git a/src/enc/filter.c b/src/enc/filter.c
index a0a42b0..7fb78a3 100644
--- a/src/enc/filter.c
+++ b/src/enc/filter.c
@@ -1,4 +1,4 @@
-// Copyright 2011 Google Inc.
+// Copyright 2011 Google Inc. All Rights Reserved.
 //
 // This code is licensed under the same terms as WebM:
 //  Software License Agreement:  http://www.webmproject.org/license/software/
@@ -9,8 +9,7 @@
 //
 // Author: somnath@google.com (Somnath Banerjee)
 
-#include <math.h>
-#include "vp8enci.h"
+#include "./vp8enci.h"
 
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
@@ -45,11 +44,11 @@
   }
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Edge filtering functions
 
 // 4 pixels in, 2 pixels out
-static inline void do_filter2(uint8_t* p, int step) {
+static WEBP_INLINE void do_filter2(uint8_t* p, int step) {
   const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
   const int a = 3 * (q0 - p0) + sclip1[1020 + p1 - q1];
   const int a1 = sclip2[112 + ((a + 4) >> 3)];
@@ -59,7 +58,7 @@
 }
 
 // 4 pixels in, 4 pixels out
-static inline void do_filter4(uint8_t* p, int step) {
+static WEBP_INLINE void do_filter4(uint8_t* p, int step) {
   const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
   const int a = 3 * (q0 - p0);
   const int a1 = sclip2[112 + ((a + 4) >> 3)];
@@ -72,17 +71,18 @@
 }
 
 // high edge-variance
-static inline int hev(const uint8_t* p, int step, int thresh) {
+static WEBP_INLINE int hev(const uint8_t* p, int step, int thresh) {
   const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
   return (abs0[255 + p1 - p0] > thresh) || (abs0[255 + q1 - q0] > thresh);
 }
 
-static inline int needs_filter(const uint8_t* p, int step, int thresh) {
+static WEBP_INLINE int needs_filter(const uint8_t* p, int step, int thresh) {
   const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
   return (2 * abs0[255 + p0 - q0] + abs1[255 + p1 - q1]) <= thresh;
 }
 
-static inline int needs_filter2(const uint8_t* p, int step, int t, int it) {
+static WEBP_INLINE int needs_filter2(const uint8_t* p,
+                                     int step, int t, int it) {
   const int p3 = p[-4*step], p2 = p[-3*step], p1 = p[-2*step], p0 = p[-step];
   const int q0 = p[0], q1 = p[step], q2 = p[2*step], q3 = p[3*step];
   if ((2 * abs0[255 + p0 - q0] + abs1[255 + p1 - q1]) > t)
@@ -92,7 +92,7 @@
          abs0[255 + q2 - q1] <= it && abs0[255 + q1 - q0] <= it;
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Simple In-loop filtering (Paragraph 15.2)
 
 static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
@@ -129,11 +129,12 @@
   }
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Complex In-loop filtering (Paragraph 15.3)
 
-static inline void FilterLoop24(uint8_t* p, int hstride, int vstride, int size,
-                                int thresh, int ithresh, int hev_thresh) {
+static WEBP_INLINE void FilterLoop24(uint8_t* p,
+                                     int hstride, int vstride, int size,
+                                     int thresh, int ithresh, int hev_thresh) {
   while (size-- > 0) {
     if (needs_filter2(p, hstride, thresh, ithresh)) {
       if (hev(p, hstride, hev_thresh)) {
@@ -177,7 +178,7 @@
   FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 
 void (*VP8EncVFilter16i)(uint8_t*, int, int, int, int) = VFilter16i;
 void (*VP8EncHFilter16i)(uint8_t*, int, int, int, int) = HFilter16i;
@@ -187,7 +188,7 @@
 void (*VP8EncSimpleVFilter16i)(uint8_t*, int, int) = SimpleVFilter16i;
 void (*VP8EncSimpleHFilter16i)(uint8_t*, int, int) = SimpleHFilter16i;
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Paragraph 15.4: compute the inner-edge filtering strength
 
 static int GetILevel(int sharpness, int level) {
@@ -229,18 +230,25 @@
   }
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // SSIM metric
 
 enum { KERNEL = 3 };
-typedef struct {
-  double w, xm, ym, xxm, xym, yym;
-} SSIMStats;
+static const double kMinValue = 1.e-10;  // minimal threshold
 
-static void Accumulate(const uint8_t* src1, int stride1,
-                       const uint8_t* src2, int stride2,
-                       int xo, int yo, int W, int H,
-                       SSIMStats* const stats) {
+void VP8SSIMAddStats(const DistoStats* const src, DistoStats* const dst) {
+  dst->w   += src->w;
+  dst->xm  += src->xm;
+  dst->ym  += src->ym;
+  dst->xxm += src->xxm;
+  dst->xym += src->xym;
+  dst->yym += src->yym;
+}
+
+static void VP8SSIMAccumulate(const uint8_t* src1, int stride1,
+                              const uint8_t* src2, int stride2,
+                              int xo, int yo, int W, int H,
+                              DistoStats* const stats) {
   const int ymin = (yo - KERNEL < 0) ? 0 : yo - KERNEL;
   const int ymax = (yo + KERNEL > H - 1) ? H - 1 : yo + KERNEL;
   const int xmin = (xo - KERNEL < 0) ? 0 : xo - KERNEL;
@@ -262,7 +270,7 @@
   }
 }
 
-static double GetSSIM(const SSIMStats* const stats) {
+double VP8SSIMGet(const DistoStats* const stats) {
   const double xmxm = stats->xm * stats->xm;
   const double ymym = stats->ym * stats->ym;
   const double xmym = stats->xm * stats->ym;
@@ -280,29 +288,52 @@
   C2 = 58.5225 * w2;
   fnum = (2 * xmym + C1) * (2 * sxy + C2);
   fden = (xmxm + ymym + C1) * (sxx + syy + C2);
-  return (fden != 0) ? fnum / fden : 0.;
+  return (fden != 0.) ? fnum / fden : kMinValue;
+}
+
+double VP8SSIMGetSquaredError(const DistoStats* const s) {
+  if (s->w > 0.) {
+    const double iw2 = 1. / (s->w * s->w);
+    const double sxx = s->xxm * s->w - s->xm * s->xm;
+    const double syy = s->yym * s->w - s->ym * s->ym;
+    const double sxy = s->xym * s->w - s->xm * s->ym;
+    const double SSE = iw2 * (sxx + syy - 2. * sxy);
+    if (SSE > kMinValue) return SSE;
+  }
+  return kMinValue;
+}
+
+void VP8SSIMAccumulatePlane(const uint8_t* src1, int stride1,
+                            const uint8_t* src2, int stride2,
+                            int W, int H, DistoStats* const stats) {
+  int x, y;
+  for (y = 0; y < H; ++y) {
+    for (x = 0; x < W; ++x) {
+      VP8SSIMAccumulate(src1, stride1, src2, stride2, x, y, W, H, stats);
+    }
+  }
 }
 
 static double GetMBSSIM(const uint8_t* yuv1, const uint8_t* yuv2) {
   int x, y;
-  SSIMStats s = { .0, .0, .0, .0, .0, .0 };
+  DistoStats s = { .0, .0, .0, .0, .0, .0 };
 
   // compute SSIM in a 10 x 10 window
   for (x = 3; x < 13; x++) {
     for (y = 3; y < 13; y++) {
-      Accumulate(yuv1 + Y_OFF, BPS, yuv2 + Y_OFF, BPS, x, y, 16, 16, &s);
+      VP8SSIMAccumulate(yuv1 + Y_OFF, BPS, yuv2 + Y_OFF, BPS, x, y, 16, 16, &s);
     }
   }
   for (x = 1; x < 7; x++) {
     for (y = 1; y < 7; y++) {
-      Accumulate(yuv1 + U_OFF, BPS, yuv2 + U_OFF, BPS, x, y, 8, 8, &s);
-      Accumulate(yuv1 + V_OFF, BPS, yuv2 + V_OFF, BPS, x, y, 8, 8, &s);
+      VP8SSIMAccumulate(yuv1 + U_OFF, BPS, yuv2 + U_OFF, BPS, x, y, 8, 8, &s);
+      VP8SSIMAccumulate(yuv1 + V_OFF, BPS, yuv2 + V_OFF, BPS, x, y, 8, 8, &s);
     }
   }
-  return GetSSIM(&s);
+  return VP8SSIMGet(&s);
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Exposed APIs: Encoder should call the following 3 functions to adjust
 // loop filter strength
 
diff --git a/src/enc/frame.c b/src/enc/frame.c
index d0270d7..bdd3600 100644
--- a/src/enc/frame.c
+++ b/src/enc/frame.c
@@ -1,4 +1,4 @@
-// Copyright 2011 Google Inc.
+// Copyright 2011 Google Inc. All Rights Reserved.
 //
 // This code is licensed under the same terms as WebM:
 //  Software License Agreement:  http://www.webmproject.org/license/software/
@@ -9,13 +9,13 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
+#include <assert.h>
 #include <stdlib.h>
 #include <string.h>
-#include <assert.h>
 #include <math.h>
 
-#include "vp8enci.h"
-#include "cost.h"
+#include "./vp8enci.h"
+#include "./cost.h"
 
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
@@ -37,7 +37,7 @@
   CostArray*  cost;
 } VP8Residual;
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Tables for level coding
 
 const uint8_t VP8EncBands[16 + 1] = {
@@ -51,18 +51,20 @@
 static const uint8_t kCat6[] =
     { 254, 254, 243, 230, 196, 177, 153, 140, 133, 130, 129 };
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Reset the statistics about: number of skips, token proba, level cost,...
 
-static void ResetStats(VP8Encoder* const enc, int precalc_cost) {
+static void ResetStats(VP8Encoder* const enc) {
   VP8Proba* const proba = &enc->proba_;
-  if (precalc_cost) VP8CalculateLevelCosts(proba);
+  VP8CalculateLevelCosts(proba);
   proba->nb_skip_ = 0;
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Skip decision probability
 
+#define SKIP_PROBA_THRESHOLD 250  // value below which using skip_proba is OK.
+
 static int CalcSkipProba(uint64_t nb, uint64_t total) {
   return (int)(total ? (total - nb) * 255 / total : 255);
 }
@@ -74,7 +76,7 @@
   const int nb_events = proba->nb_skip_;
   int size;
   proba->skip_proba_ = CalcSkipProba(nb_events, nb_mbs);
-  proba->use_skip_proba_ = (proba->skip_proba_ < 250);
+  proba->use_skip_proba_ = (proba->skip_proba_ < SKIP_PROBA_THRESHOLD);
   size = 256;   // 'use_skip_proba' bit
   if (proba->use_skip_proba_) {
     size +=  nb_events * VP8BitCost(1, proba->skip_proba_)
@@ -84,7 +86,7 @@
   return size;
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Recording of token probabilities.
 
 static void ResetTokenStats(VP8Encoder* const enc) {
@@ -93,9 +95,14 @@
 }
 
 // Record proba context used
-static int Record(int bit, uint64_t* const stats) {
-  stats[0] += bit;
-  stats[1] += 1;
+static int Record(int bit, proba_t* const stats) {
+  proba_t p = *stats;
+  if (p >= 0xffff0000u) {               // an overflow is inbound.
+    p = ((p + 1u) >> 1) & 0x7fff7fffu;  // -> divide the stats by 2.
+  }
+  // record bit count (lower 16 bits) and increment total count (upper 16 bits).
+  p += 0x00010000u + bit;
+  *stats = p;
   return bit;
 }
 
@@ -104,33 +111,35 @@
 
 // Simulate block coding, but only record statistics.
 // Note: no need to record the fixed probas.
-static int RecordCoeffs(int ctx, VP8Residual* res) {
+static int RecordCoeffs(int ctx, const VP8Residual* const res) {
   int n = res->first;
-  uint64_t (*s)[2] = res->stats[VP8EncBands[n]][ctx];
-  if (!Record(res->last >= 0, s[0])) {
+  proba_t* s = res->stats[VP8EncBands[n]][ctx];
+  if (res->last  < 0) {
+    Record(0, s + 0);
     return 0;
   }
-
-  while (1) {
-    int v = res->coeffs[n++];
-    if (!Record(v != 0, s[1])) {
+  while (n <= res->last) {
+    int v;
+    Record(1, s + 0);
+    while ((v = res->coeffs[n++]) == 0) {
+      Record(0, s + 1);
       s = res->stats[VP8EncBands[n]][0];
-      continue;
     }
-    if (!Record(2u < (unsigned int)(v + 1), s[2])) {  // v = -1 or 1
+    Record(1, s + 1);
+    if (!Record(2u < (unsigned int)(v + 1), s + 2)) {  // v = -1 or 1
       s = res->stats[VP8EncBands[n]][1];
     } else {
       v = abs(v);
 #if !defined(USE_LEVEL_CODE_TABLE)
-      if (!Record(v > 4, s[3])) {
-        if (Record(v != 2, s[4]))
-          Record(v == 4, s[5]);
-      } else if (!Record(v > 10, s[6])) {
-        Record(v > 6, s[7]);
-      } else if (!Record((v >= 3 + (8 << 2)), s[8])) {
-        Record((v >= 3 + (8 << 1)), s[9]);
+      if (!Record(v > 4, s + 3)) {
+        if (Record(v != 2, s + 4))
+          Record(v == 4, s + 5);
+      } else if (!Record(v > 10, s + 6)) {
+        Record(v > 6, s + 7);
+      } else if (!Record((v >= 3 + (8 << 2)), s + 8)) {
+        Record((v >= 3 + (8 << 1)), s + 9);
       } else {
-        Record((v >= 3 + (8 << 3)), s[10]);
+        Record((v >= 3 + (8 << 3)), s + 10);
       }
 #else
       if (v > MAX_VARIABLE_LEVEL)
@@ -142,44 +151,54 @@
         int i;
         for (i = 0; (pattern >>= 1) != 0; ++i) {
           const int mask = 2 << i;
-          if (pattern & 1) Record(!!(bits & mask), s[3 + i]);
+          if (pattern & 1) Record(!!(bits & mask), s + 3 + i);
         }
       }
 #endif
       s = res->stats[VP8EncBands[n]][2];
     }
-    if (n == 16 || !Record(n <= res->last, s[0])) {
-      return 1;
-    }
   }
+  if (n < 16) Record(0, s + 0);
+  return 1;
 }
 
 // Collect statistics and deduce probabilities for next coding pass.
 // Return the total bit-cost for coding the probability updates.
-static int CalcTokenProba(uint64_t nb, uint64_t total) {
-  return (int)(nb ? ((total - nb) * 255 + total / 2) / total : 255);
+static int CalcTokenProba(int nb, int total) {
+  assert(nb <= total);
+  return nb ? (255 - nb * 255 / total) : 255;
+}
+
+// Cost of coding 'nb' 1's and 'total-nb' 0's using 'proba' probability.
+static int BranchCost(int nb, int total, int proba) {
+  return nb * VP8BitCost(1, proba) + (total - nb) * VP8BitCost(0, proba);
 }
 
 static int FinalizeTokenProbas(VP8Encoder* const enc) {
   VP8Proba* const proba = &enc->proba_;
+  int has_changed = 0;
   int size = 0;
   int t, b, c, p;
   for (t = 0; t < NUM_TYPES; ++t) {
     for (b = 0; b < NUM_BANDS; ++b) {
       for (c = 0; c < NUM_CTX; ++c) {
         for (p = 0; p < NUM_PROBAS; ++p) {
-          const uint64_t* const cnt = proba->stats_[t][b][c][p];
+          const proba_t stats = proba->stats_[t][b][c][p];
+          const int nb = (stats >> 0) & 0xffff;
+          const int total = (stats >> 16) & 0xffff;
           const int update_proba = VP8CoeffsUpdateProba[t][b][c][p];
           const int old_p = VP8CoeffsProba0[t][b][c][p];
-          const int new_p = CalcTokenProba(cnt[0], cnt[1]);
-          const uint64_t old_cost = VP8BranchCost(cnt[0], cnt[1], old_p)
-                                  + VP8BitCost(0, update_proba);
-          const uint64_t new_cost = VP8BranchCost(cnt[0], cnt[1], new_p)
-                                  + VP8BitCost(1, update_proba) + 8 * 256;
+          const int new_p = CalcTokenProba(nb, total);
+          const int old_cost = BranchCost(nb, total, old_p)
+                             + VP8BitCost(0, update_proba);
+          const int new_cost = BranchCost(nb, total, new_p)
+                             + VP8BitCost(1, update_proba)
+                             + 8 * 256;
           const int use_new_p = (old_cost > new_cost);
           size += VP8BitCost(use_new_p, update_proba);
           if (use_new_p) {  // only use proba that seem meaningful enough.
             proba->coeffs_[t][b][c][p] = new_p;
+            has_changed |= (new_p != old_p);
             size += 8 * 256;
           } else {
             proba->coeffs_[t][b][c][p] = old_p;
@@ -188,10 +207,11 @@
       }
     }
   }
+  proba->dirty_ = has_changed;
   return size;
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // helper functions for residuals struct VP8Residual.
 
 static void InitResidual(int first, int coeff_type,
@@ -216,49 +236,53 @@
   res->coeffs = coeffs;
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Mode costs
 
 static int GetResidualCost(int ctx, const VP8Residual* const res) {
   int n = res->first;
-  const uint8_t* p = res->prob[VP8EncBands[n]][ctx];
-  const uint16_t *t = res->cost[VP8EncBands[n]][ctx];
+  int p0 = res->prob[VP8EncBands[n]][ctx][0];
+  const uint16_t* t = res->cost[VP8EncBands[n]][ctx];
   int cost;
 
-  cost = VP8BitCost(res->last >= 0, p[0]);
   if (res->last < 0) {
-    return cost;
+    return VP8BitCost(0, p0);
   }
+  cost = 0;
   while (n <= res->last) {
-    const int v = res->coeffs[n++];
+    const int v = res->coeffs[n];
+    const int b = VP8EncBands[n + 1];
+    ++n;
     if (v == 0) {
-      cost += VP8LevelCost(t, 0);
-      p = res->prob[VP8EncBands[n]][0];
-      t = res->cost[VP8EncBands[n]][0];
+      // short-case for VP8LevelCost(t, 0) (note: VP8LevelFixedCosts[0] == 0):
+      cost += t[0];
+      t = res->cost[b][0];
       continue;
-    } else if (2u >= (unsigned int)(v + 1)) {   // v = -1 or 1
-      cost += VP8LevelCost(t, 1);
-      p = res->prob[VP8EncBands[n]][1];
-      t = res->cost[VP8EncBands[n]][1];
+    }
+    cost += VP8BitCost(1, p0);
+    if (2u >= (unsigned int)(v + 1)) {   // v = -1 or 1
+      // short-case for "VP8LevelCost(t, 1)" (256 is VP8LevelFixedCosts[1]):
+      cost += 256 + t[1];
+      p0 = res->prob[b][1][0];
+      t = res->cost[b][1];
     } else {
       cost += VP8LevelCost(t, abs(v));
-      p = res->prob[VP8EncBands[n]][2];
-      t = res->cost[VP8EncBands[n]][2];
-    }
-    if (n < 16) {
-      cost += VP8BitCost(n <= res->last, p[0]);
+      p0 = res->prob[b][2][0];
+      t = res->cost[b][2];
     }
   }
+  if (n < 16) cost += VP8BitCost(0, p0);
   return cost;
 }
 
 int VP8GetCostLuma4(VP8EncIterator* const it, const int16_t levels[16]) {
   const int x = (it->i4_ & 3), y = (it->i4_ >> 2);
   VP8Residual res;
+  VP8Encoder* const enc = it->enc_;
   int R = 0;
   int ctx;
 
-  InitResidual(0, 3, it->enc_, &res);
+  InitResidual(0, 3, enc, &res);
   ctx = it->top_nz_[x] + it->left_nz_[y];
   SetResidualCoeffs(levels, &res);
   R += GetResidualCost(ctx, &res);
@@ -267,18 +291,19 @@
 
 int VP8GetCostLuma16(VP8EncIterator* const it, const VP8ModeScore* const rd) {
   VP8Residual res;
+  VP8Encoder* const enc = it->enc_;
   int x, y;
   int R = 0;
 
   VP8IteratorNzToBytes(it);   // re-import the non-zero context
 
   // DC
-  InitResidual(0, 1, it->enc_, &res);
+  InitResidual(0, 1, enc, &res);
   SetResidualCoeffs(rd->y_dc_levels, &res);
   R += GetResidualCost(it->top_nz_[8] + it->left_nz_[8], &res);
 
   // AC
-  InitResidual(1, 0, it->enc_, &res);
+  InitResidual(1, 0, enc, &res);
   for (y = 0; y < 4; ++y) {
     for (x = 0; x < 4; ++x) {
       const int ctx = it->top_nz_[x] + it->left_nz_[y];
@@ -292,12 +317,13 @@
 
 int VP8GetCostUV(VP8EncIterator* const it, const VP8ModeScore* const rd) {
   VP8Residual res;
+  VP8Encoder* const enc = it->enc_;
   int ch, x, y;
   int R = 0;
 
   VP8IteratorNzToBytes(it);  // re-import the non-zero context
 
-  InitResidual(0, 2, it->enc_, &res);
+  InitResidual(0, 2, enc, &res);
   for (ch = 0; ch <= 2; ch += 2) {
     for (y = 0; y < 2; ++y) {
       for (x = 0; x < 2; ++x) {
@@ -311,7 +337,7 @@
   return R;
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Coefficient coding
 
 static int PutCoeffs(VP8BitWriter* const bw, int ctx, const VP8Residual* res) {
@@ -393,18 +419,19 @@
   uint64_t pos1, pos2, pos3;
   const int i16 = (it->mb_->type_ == 1);
   const int segment = it->mb_->segment_;
+  VP8Encoder* const enc = it->enc_;
 
   VP8IteratorNzToBytes(it);
 
   pos1 = VP8BitWriterPos(bw);
   if (i16) {
-    InitResidual(0, 1, it->enc_, &res);
+    InitResidual(0, 1, enc, &res);
     SetResidualCoeffs(rd->y_dc_levels, &res);
     it->top_nz_[8] = it->left_nz_[8] =
       PutCoeffs(bw, it->top_nz_[8] + it->left_nz_[8], &res);
-    InitResidual(1, 0, it->enc_, &res);
+    InitResidual(1, 0, enc, &res);
   } else {
-    InitResidual(0, 3, it->enc_, &res);
+    InitResidual(0, 3, enc, &res);
   }
 
   // luma-AC
@@ -418,7 +445,7 @@
   pos2 = VP8BitWriterPos(bw);
 
   // U/V
-  InitResidual(0, 2, it->enc_, &res);
+  InitResidual(0, 2, enc, &res);
   for (ch = 0; ch <= 2; ch += 2) {
     for (y = 0; y < 2; ++y) {
       for (x = 0; x < 2; ++x) {
@@ -443,17 +470,18 @@
                             const VP8ModeScore* const rd) {
   int x, y, ch;
   VP8Residual res;
+  VP8Encoder* const enc = it->enc_;
 
   VP8IteratorNzToBytes(it);
 
   if (it->mb_->type_ == 1) {   // i16x16
-    InitResidual(0, 1, it->enc_, &res);
+    InitResidual(0, 1, enc, &res);
     SetResidualCoeffs(rd->y_dc_levels, &res);
     it->top_nz_[8] = it->left_nz_[8] =
       RecordCoeffs(it->top_nz_[8] + it->left_nz_[8], &res);
-    InitResidual(1, 0, it->enc_, &res);
+    InitResidual(1, 0, enc, &res);
   } else {
-    InitResidual(0, 3, it->enc_, &res);
+    InitResidual(0, 3, enc, &res);
   }
 
   // luma-AC
@@ -466,7 +494,7 @@
   }
 
   // U/V
-  InitResidual(0, 2, it->enc_, &res);
+  InitResidual(0, 2, enc, &res);
   for (ch = 0; ch <= 2; ch += 2) {
     for (y = 0; y < 2; ++y) {
       for (x = 0; x < 2; ++x) {
@@ -481,7 +509,181 @@
   VP8IteratorBytesToNz(it);
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
+// Token buffer
+
+#ifdef USE_TOKEN_BUFFER
+
+void VP8TBufferInit(VP8TBuffer* const b) {
+  b->rows_ = NULL;
+  b->tokens_ = NULL;
+  b->last_ = &b->rows_;
+  b->left_ = 0;
+  b->error_ = 0;
+}
+
+int VP8TBufferNewPage(VP8TBuffer* const b) {
+  VP8Tokens* const page = b->error_ ? NULL : (VP8Tokens*)malloc(sizeof(*page));
+  if (page == NULL) {
+    b->error_ = 1;
+    return 0;
+  }
+  *b->last_ = page;
+  b->last_ = &page->next_;
+  b->left_ = MAX_NUM_TOKEN;
+  b->tokens_ = page->tokens_;
+  return 1;
+}
+
+void VP8TBufferClear(VP8TBuffer* const b) {
+  if (b != NULL) {
+    const VP8Tokens* p = b->rows_;
+    while (p != NULL) {
+      const VP8Tokens* const next = p->next_;
+      free((void*)p);
+      p = next;
+    }
+    VP8TBufferInit(b);
+  }
+}
+
+int VP8EmitTokens(const VP8TBuffer* const b, VP8BitWriter* const bw,
+                  const uint8_t* const probas) {
+  VP8Tokens* p = b->rows_;
+  if (b->error_) return 0;
+  while (p != NULL) {
+    const int N = (p->next_ == NULL) ? b->left_ : 0;
+    int n = MAX_NUM_TOKEN;
+    while (n-- > N) {
+      VP8PutBit(bw, (p->tokens_[n] >> 15) & 1, probas[p->tokens_[n] & 0x7fff]);
+    }
+    p = p->next_;
+  }
+  return 1;
+}
+
+#define TOKEN_ID(b, ctx, p) ((p) + NUM_PROBAS * ((ctx) + (b) * NUM_CTX))
+
+static int RecordCoeffTokens(int ctx, const VP8Residual* const res,
+                             VP8TBuffer* tokens) {
+  int n = res->first;
+  int b = VP8EncBands[n];
+  if (!VP8AddToken(tokens, res->last >= 0, TOKEN_ID(b, ctx, 0))) {
+    return 0;
+  }
+
+  while (n < 16) {
+    const int c = res->coeffs[n++];
+    const int sign = c < 0;
+    int v = sign ? -c : c;
+    const int base_id = TOKEN_ID(b, ctx, 0);
+    if (!VP8AddToken(tokens, v != 0, base_id + 1)) {
+      b = VP8EncBands[n];
+      ctx = 0;
+      continue;
+    }
+    if (!VP8AddToken(tokens, v > 1, base_id + 2)) {
+      b = VP8EncBands[n];
+      ctx = 1;
+    } else {
+      if (!VP8AddToken(tokens, v > 4, base_id + 3)) {
+        if (VP8AddToken(tokens, v != 2, base_id + 4))
+          VP8AddToken(tokens, v == 4, base_id + 5);
+      } else if (!VP8AddToken(tokens, v > 10, base_id + 6)) {
+        if (!VP8AddToken(tokens, v > 6, base_id + 7)) {
+//          VP8AddToken(tokens, v == 6, 159);
+        } else {
+//          VP8AddToken(tokens, v >= 9, 165);
+//          VP8AddToken(tokens, !(v & 1), 145);
+        }
+      } else {
+        int mask;
+        const uint8_t* tab;
+        if (v < 3 + (8 << 1)) {          // kCat3  (3b)
+          VP8AddToken(tokens, 0, base_id + 8);
+          VP8AddToken(tokens, 0, base_id + 9);
+          v -= 3 + (8 << 0);
+          mask = 1 << 2;
+          tab = kCat3;
+        } else if (v < 3 + (8 << 2)) {   // kCat4  (4b)
+          VP8AddToken(tokens, 0, base_id + 8);
+          VP8AddToken(tokens, 1, base_id + 9);
+          v -= 3 + (8 << 1);
+          mask = 1 << 3;
+          tab = kCat4;
+        } else if (v < 3 + (8 << 3)) {   // kCat5  (5b)
+          VP8AddToken(tokens, 1, base_id + 8);
+          VP8AddToken(tokens, 0, base_id + 10);
+          v -= 3 + (8 << 2);
+          mask = 1 << 4;
+          tab = kCat5;
+        } else {                         // kCat6 (11b)
+          VP8AddToken(tokens, 1, base_id + 8);
+          VP8AddToken(tokens, 1, base_id + 10);
+          v -= 3 + (8 << 3);
+          mask = 1 << 10;
+          tab = kCat6;
+        }
+        while (mask) {
+          // VP8AddToken(tokens, !!(v & mask), *tab++);
+          mask >>= 1;
+        }
+      }
+      ctx = 2;
+    }
+    b = VP8EncBands[n];
+    // VP8PutBitUniform(bw, sign);
+    if (n == 16 || !VP8AddToken(tokens, n <= res->last, TOKEN_ID(b, ctx, 0))) {
+      return 1;   // EOB
+    }
+  }
+  return 1;
+}
+
+static void RecordTokens(VP8EncIterator* const it,
+                         const VP8ModeScore* const rd, VP8TBuffer tokens[2]) {
+  int x, y, ch;
+  VP8Residual res;
+  VP8Encoder* const enc = it->enc_;
+
+  VP8IteratorNzToBytes(it);
+  if (it->mb_->type_ == 1) {   // i16x16
+    InitResidual(0, 1, enc, &res);
+    SetResidualCoeffs(rd->y_dc_levels, &res);
+// TODO(skal): FIX ->    it->top_nz_[8] = it->left_nz_[8] =
+      RecordCoeffTokens(it->top_nz_[8] + it->left_nz_[8], &res, &tokens[0]);
+    InitResidual(1, 0, enc, &res);
+  } else {
+    InitResidual(0, 3, enc, &res);
+  }
+
+  // luma-AC
+  for (y = 0; y < 4; ++y) {
+    for (x = 0; x < 4; ++x) {
+      const int ctx = it->top_nz_[x] + it->left_nz_[y];
+      SetResidualCoeffs(rd->y_ac_levels[x + y * 4], &res);
+      it->top_nz_[x] = it->left_nz_[y] =
+          RecordCoeffTokens(ctx, &res, &tokens[0]);
+    }
+  }
+
+  // U/V
+  InitResidual(0, 2, enc, &res);
+  for (ch = 0; ch <= 2; ch += 2) {
+    for (y = 0; y < 2; ++y) {
+      for (x = 0; x < 2; ++x) {
+        const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
+        SetResidualCoeffs(rd->uv_levels[ch * 2 + x + y * 2], &res);
+        it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] =
+            RecordCoeffTokens(ctx, &res, &tokens[1]);
+      }
+    }
+  }
+}
+
+#endif    // USE_TOKEN_BUFFER
+
+//------------------------------------------------------------------------------
 // ExtraInfo map / Debug function
 
 #if SEGMENT_VISU
@@ -515,16 +717,16 @@
   const VP8MBInfo* const mb = it->mb_;
   WebPPicture* const pic = enc->pic_;
 
-  if (pic->stats) {
+  if (pic->stats != NULL) {
     StoreSSE(it);
     enc->block_count_[0] += (mb->type_ == 0);
     enc->block_count_[1] += (mb->type_ == 1);
     enc->block_count_[2] += (mb->skip_ != 0);
   }
 
-  if (pic->extra_info) {
+  if (pic->extra_info != NULL) {
     uint8_t* const info = &pic->extra_info[it->x_ + it->y_ * enc->mb_w_];
-    switch(pic->extra_info_type) {
+    switch (pic->extra_info_type) {
       case 1: *info = mb->type_; break;
       case 2: *info = mb->segment_; break;
       case 3: *info = enc->dqm_[mb->segment_].quant_; break;
@@ -544,7 +746,7 @@
 #endif
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Main loops
 //
 //  VP8EncLoop(): does the final bitstream coding.
@@ -560,6 +762,7 @@
 
 int VP8EncLoop(VP8Encoder* const enc) {
   int i, s, p;
+  int ok = 1;
   VP8EncIterator it;
   VP8ModeScore info;
   const int dont_use_skip = !enc->proba_.use_skip_proba_;
@@ -573,7 +776,7 @@
     VP8BitWriterInit(enc->parts_ + p, bytes_per_parts);
   }
 
-  ResetStats(enc, rd_opt != 0);
+  ResetStats(enc);
   ResetSSE(enc);
 
   VP8IteratorInit(enc, &it);
@@ -588,9 +791,6 @@
       ResetAfterSkip(&it);
     }
 #ifdef WEBP_EXPERIMENTAL_FEATURES
-    if (enc->has_alpha_) {
-      VP8EncCodeAlphaBlock(&it);
-    }
     if (enc->use_layer_) {
       VP8EncCodeLayerBlock(&it);
     }
@@ -598,25 +798,34 @@
     StoreSideInfo(&it);
     VP8StoreFilterStats(&it);
     VP8IteratorExport(&it);
-  } while (VP8IteratorNext(&it, it.yuv_out_));
-  VP8AdjustFilterStrength(&it);
+    ok = VP8IteratorProgress(&it, 20);
+  } while (ok && VP8IteratorNext(&it, it.yuv_out_));
 
-  // Finalize the partitions
-  for (p = 0; p < enc->num_parts_; ++p) {
-    VP8BitWriterFinish(enc->parts_ + p);
-  }
-  // and byte counters
-  if (enc->pic_->stats) {
-    for (i = 0; i <= 2; ++i) {
-      for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
-        enc->residual_bytes_[i][s] = (int)((it.bit_count_[s][i] + 7) >> 3);
-      }
+  if (ok) {      // Finalize the partitions, check for extra errors.
+    for (p = 0; p < enc->num_parts_; ++p) {
+      VP8BitWriterFinish(enc->parts_ + p);
+      ok &= !enc->parts_[p].error_;
     }
   }
-  return 1;
+
+  if (ok) {      // All good. Finish up.
+    if (enc->pic_->stats) {           // finalize byte counters...
+      for (i = 0; i <= 2; ++i) {
+        for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
+          enc->residual_bytes_[i][s] = (int)((it.bit_count_[s][i] + 7) >> 3);
+        }
+      }
+    }
+    VP8AdjustFilterStrength(&it);     // ...and store filter stats.
+  } else {
+    // Something bad happened -> need to do some memory cleanup.
+    VP8EncFreeBitWriters(enc);
+  }
+
+  return ok;
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 //  VP8StatLoop(): only collect statistics (number of skips, token usage, ...)
 //                 This is used for deciding optimal probabilities. It also
 //                 modifies the quantizer value if some target (size, PNSR)
@@ -625,7 +834,7 @@
 #define kHeaderSizeEstimate (15 + 20 + 10)      // TODO: fix better
 
 static int OneStatPass(VP8Encoder* const enc, float q, int rd_opt, int nb_mbs,
-                       float* const PSNR) {
+                       float* const PSNR, int percent_delta) {
   VP8EncIterator it;
   uint64_t size = 0;
   uint64_t distortion = 0;
@@ -640,7 +849,7 @@
 
   VP8SetSegmentParams(enc, q);      // setup segment quantizations and filters
 
-  ResetStats(enc, rd_opt != 0);
+  ResetStats(enc);
   ResetTokenStats(enc);
 
   VP8IteratorInit(enc, &it);
@@ -654,6 +863,8 @@
     RecordResiduals(&it, &info);
     size += info.R;
     distortion += info.D;
+    if (percent_delta && !VP8IteratorProgress(&it, percent_delta))
+      return 0;
   } while (VP8IteratorNext(&it, it.yuv_out_) && --nb_mbs > 0);
   size += FinalizeSkipProba(enc);
   size += FinalizeTokenProbas(enc);
@@ -674,6 +885,10 @@
     (enc->config_->target_size > 0 || enc->config_->target_PSNR > 0);
   const int fast_probe = (enc->method_ < 2 && !do_search);
   float q = enc->config_->quality;
+  const int max_passes = enc->config_->pass;
+  const int task_percent = 20;
+  const int percent_per_pass = (task_percent + max_passes / 2) / max_passes;
+  const int final_percent = enc->percent_ + task_percent;
   int pass;
   int nb_mbs;
 
@@ -683,39 +898,41 @@
 
   // No target size: just do several pass without changing 'q'
   if (!do_search) {
-    for (pass = 0; pass < enc->config_->pass; ++pass) {
+    for (pass = 0; pass < max_passes; ++pass) {
       const int rd_opt = (enc->method_ > 2);
-      OneStatPass(enc, q, rd_opt, nb_mbs, NULL);
+      if (!OneStatPass(enc, q, rd_opt, nb_mbs, NULL, percent_per_pass)) {
+        return 0;
+      }
     }
-    return 1;
-  }
-
-  // binary search for a size close to target
-  for (pass = 0; pass < enc->config_->pass && (dqs[pass] > 0); ++pass) {
-    const int rd_opt = 1;
-    float PSNR;
-    int criterion;
-    const int size = OneStatPass(enc, q, rd_opt, nb_mbs, &PSNR);
+  } else {
+    // binary search for a size close to target
+    for (pass = 0; pass < max_passes && (dqs[pass] > 0); ++pass) {
+      const int rd_opt = 1;
+      float PSNR;
+      int criterion;
+      const int size = OneStatPass(enc, q, rd_opt, nb_mbs, &PSNR,
+                                   percent_per_pass);
 #if DEBUG_SEARCH
-    printf("#%d size=%d PSNR=%.2f q=%.2f\n", pass, size, PSNR, q);
+      printf("#%d size=%d PSNR=%.2f q=%.2f\n", pass, size, PSNR, q);
 #endif
-
-    if (enc->config_->target_PSNR > 0) {
-      criterion = (PSNR < enc->config_->target_PSNR);
-    } else {
-      criterion = (size < enc->config_->target_size);
-    }
-    // dichotomize
-    if (criterion) {
-      q += dqs[pass];
-    } else {
-      q -= dqs[pass];
+      if (!size) return 0;
+      if (enc->config_->target_PSNR > 0) {
+        criterion = (PSNR < enc->config_->target_PSNR);
+      } else {
+        criterion = (size < enc->config_->target_size);
+      }
+      // dichotomize
+      if (criterion) {
+        q += dqs[pass];
+      } else {
+        q -= dqs[pass];
+      }
     }
   }
-  return 1;
+  return WebPReportProgress(enc->pic_, final_percent, &enc->percent_);
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
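
The new packed statistics used by Record() above are worth spelling out: each
proba_t keeps the number of observed events in its upper 16 bits and the number
of '1' bits in its lower 16 bits, halving both counters when the total is about
to overflow. A standalone sketch (proba_t is assumed to be a uint32_t typedef
from vp8enci.h):

  #include <stdio.h>
  #include <stdint.h>

  typedef uint32_t proba_t;                    // assumed: 16b total | 16b ones

  static void RecordBit(int bit, proba_t* const stats) {  // mirrors Record()
    proba_t p = *stats;
    if (p >= 0xffff0000u) {                    // overflow is near
      p = ((p + 1u) >> 1) & 0x7fff7fffu;       // halve both counters
    }
    *stats = p + 0x00010000u + bit;            // ++total (hi), += bit (lo)
  }

  int main(void) {
    proba_t stats = 0;
    int i;
    for (i = 0; i < 1000; ++i) RecordBit((i % 4) == 0, &stats);   // ~25% ones
    {
      const int nb = stats & 0xffff;             // count of '1' bits
      const int total = (stats >> 16) & 0xffff;  // count of events
      // same update rule as CalcTokenProba(): probability of a '0', out of 255
      const int new_p = nb ? (255 - nb * 255 / total) : 255;
      printf("nb=%d total=%d -> proba=%d\n", nb, total, new_p);
    }
    return 0;
  }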
diff --git a/src/enc/histogram.c b/src/enc/histogram.c
new file mode 100644
index 0000000..ca838e0
--- /dev/null
+++ b/src/enc/histogram.c
@@ -0,0 +1,406 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// Author: Jyrki Alakuijala (jyrki@google.com)
+//
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <math.h>
+#include <stdio.h>
+
+#include "./backward_references.h"
+#include "./histogram.h"
+#include "../dsp/lossless.h"
+#include "../utils/utils.h"
+
+static void HistogramClear(VP8LHistogram* const p) {
+  memset(p->literal_, 0, sizeof(p->literal_));
+  memset(p->red_, 0, sizeof(p->red_));
+  memset(p->blue_, 0, sizeof(p->blue_));
+  memset(p->alpha_, 0, sizeof(p->alpha_));
+  memset(p->distance_, 0, sizeof(p->distance_));
+  p->bit_cost_ = 0;
+}
+
+void VP8LHistogramStoreRefs(const VP8LBackwardRefs* const refs,
+                            VP8LHistogram* const histo) {
+  int i;
+  for (i = 0; i < refs->size; ++i) {
+    VP8LHistogramAddSinglePixOrCopy(histo, &refs->refs[i]);
+  }
+}
+
+void VP8LHistogramCreate(VP8LHistogram* const p,
+                         const VP8LBackwardRefs* const refs,
+                         int palette_code_bits) {
+  if (palette_code_bits >= 0) {
+    p->palette_code_bits_ = palette_code_bits;
+  }
+  HistogramClear(p);
+  VP8LHistogramStoreRefs(refs, p);
+}
+
+void VP8LHistogramInit(VP8LHistogram* const p, int palette_code_bits) {
+  p->palette_code_bits_ = palette_code_bits;
+  HistogramClear(p);
+}
+
+VP8LHistogramSet* VP8LAllocateHistogramSet(int size, int cache_bits) {
+  int i;
+  VP8LHistogramSet* set;
+  VP8LHistogram* bulk;
+  const uint64_t total_size = (uint64_t)sizeof(*set)
+                            + size * sizeof(*set->histograms)
+                            + size * sizeof(**set->histograms);
+  uint8_t* memory = (uint8_t*)WebPSafeMalloc(total_size, sizeof(*memory));
+  if (memory == NULL) return NULL;
+
+  set = (VP8LHistogramSet*)memory;
+  memory += sizeof(*set);
+  set->histograms = (VP8LHistogram**)memory;
+  memory += size * sizeof(*set->histograms);
+  bulk = (VP8LHistogram*)memory;
+  set->max_size = size;
+  set->size = size;
+  for (i = 0; i < size; ++i) {
+    set->histograms[i] = bulk + i;
+    VP8LHistogramInit(set->histograms[i], cache_bits);
+  }
+  return set;
+}
+
+// -----------------------------------------------------------------------------
+
+void VP8LHistogramAddSinglePixOrCopy(VP8LHistogram* const histo,
+                                     const PixOrCopy* const v) {
+  if (PixOrCopyIsLiteral(v)) {
+    ++histo->alpha_[PixOrCopyLiteral(v, 3)];
+    ++histo->red_[PixOrCopyLiteral(v, 2)];
+    ++histo->literal_[PixOrCopyLiteral(v, 1)];
+    ++histo->blue_[PixOrCopyLiteral(v, 0)];
+  } else if (PixOrCopyIsCacheIdx(v)) {
+    int literal_ix = 256 + NUM_LENGTH_CODES + PixOrCopyCacheIdx(v);
+    ++histo->literal_[literal_ix];
+  } else {
+    int code, extra_bits_count, extra_bits_value;
+    PrefixEncode(PixOrCopyLength(v),
+                 &code, &extra_bits_count, &extra_bits_value);
+    ++histo->literal_[256 + code];
+    PrefixEncode(PixOrCopyDistance(v),
+                 &code, &extra_bits_count, &extra_bits_value);
+    ++histo->distance_[code];
+  }
+}
+
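+// Entropy estimate: since VP8LFastSLog2(x) approximates x * log2(x), the
+// accumulation below amounts to sum(a[i] * log2(sum / a[i])), i.e. roughly
+// the number of bits needed to code all 'sum' occurrences. The 'mix'
+// heuristics then put a floor on this estimate for sparse distributions.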
+static double BitsEntropy(const int* const array, int n) {
+  double retval = 0.;
+  int sum = 0;
+  int nonzeros = 0;
+  int max_val = 0;
+  int i;
+  double mix;
+  for (i = 0; i < n; ++i) {
+    if (array[i] != 0) {
+      sum += array[i];
+      ++nonzeros;
+      retval -= VP8LFastSLog2(array[i]);
+      if (max_val < array[i]) {
+        max_val = array[i];
+      }
+    }
+  }
+  retval += VP8LFastSLog2(sum);
+
+  if (nonzeros < 5) {
+    if (nonzeros <= 1) {
+      return 0;
+    }
+    // Two symbols: they will be coded as 0 and 1 in a Huffman code.
+    // Let's mix in a bit of entropy to favor good clustering when
+    // distributions of these are combined.
+    if (nonzeros == 2) {
+      return 0.99 * sum + 0.01 * retval;
+    }
+    // No matter what the entropy says, we cannot be better than min_limit
+    // with Huffman coding. I am mixing a bit of entropy into the
+    // min_limit since it produces much better (~0.5 %) compression results
+    // perhaps because of better entropy clustering.
+    if (nonzeros == 3) {
+      mix = 0.95;
+    } else {
+      mix = 0.7;  // nonzeros == 4.
+    }
+  } else {
+    mix = 0.627;
+  }
+
+  {
+    double min_limit = 2 * sum - max_val;
+    min_limit = mix * min_limit + (1.0 - mix) * retval;
+    return (retval < min_limit) ? min_limit : retval;
+  }
+}
+
+double VP8LHistogramEstimateBitsBulk(const VP8LHistogram* const p) {
+  double retval = BitsEntropy(&p->literal_[0], VP8LHistogramNumCodes(p))
+                + BitsEntropy(&p->red_[0], 256)
+                + BitsEntropy(&p->blue_[0], 256)
+                + BitsEntropy(&p->alpha_[0], 256)
+                + BitsEntropy(&p->distance_[0], NUM_DISTANCE_CODES);
+  // Compute the extra bits cost.
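+  // (Length/distance prefix code 'c' >= 4 carries ((c - 2) >> 1) extra bits;
+  // the (i >> 1) factor below, with c == i + 2, accounts for those bits.)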
+  int i;
+  for (i = 2; i < NUM_LENGTH_CODES - 2; ++i) {
+    retval +=
+        (i >> 1) * p->literal_[256 + i + 2];
+  }
+  for (i = 2; i < NUM_DISTANCE_CODES - 2; ++i) {
+    retval += (i >> 1) * p->distance_[i + 2];
+  }
+  return retval;
+}
+
+// Returns the cost to encode the rle-encoded entropy code.
+// The constants in this function are experimental.
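+// The idea: long streaks of identical code lengths (especially zeros) are
+// cheap to transmit thanks to the run-length codes of the code-length coding,
+// so they are charged a lower per-symbol cost below.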
+static double HuffmanCost(const int* const population, int length) {
+  // Small bias because the Huffman code lengths are typically not stored
+  // in full length.
+  static const int kHuffmanCodeOfHuffmanCodeSize = CODE_LENGTH_CODES * 3;
+  static const double kSmallBias = 9.1;
+  double retval = kHuffmanCodeOfHuffmanCodeSize - kSmallBias;
+  int streak = 0;
+  int i = 0;
+  for (; i < length - 1; ++i) {
+    ++streak;
+    if (population[i] == population[i + 1]) {
+      continue;
+    }
+ last_streak_hack:
+    // population[i] now points to the last value of the current streak.
+    if (streak > 3) {
+      if (population[i] == 0) {
+        retval += 1.5625 + 0.234375 * streak;
+      } else {
+        retval += 2.578125 + 0.703125 * streak;
+      }
+    } else {
+      if (population[i] == 0) {
+        retval += 1.796875 * streak;
+      } else {
+        retval += 3.28125 * streak;
+      }
+    }
+    streak = 0;
+  }
+  if (i == length - 1) {
+    ++streak;
+    goto last_streak_hack;
+  }
+  return retval;
+}
+
+// Estimates the Huffman dictionary + other block overhead size.
+static double HistogramEstimateBitsHeader(const VP8LHistogram* const p) {
+  return HuffmanCost(&p->alpha_[0], 256) +
+         HuffmanCost(&p->red_[0], 256) +
+         HuffmanCost(&p->literal_[0], VP8LHistogramNumCodes(p)) +
+         HuffmanCost(&p->blue_[0], 256) +
+         HuffmanCost(&p->distance_[0], NUM_DISTANCE_CODES);
+}
+
+double VP8LHistogramEstimateBits(const VP8LHistogram* const p) {
+  return HistogramEstimateBitsHeader(p) + VP8LHistogramEstimateBitsBulk(p);
+}
+
+static void HistogramBuildImage(int xsize, int histo_bits,
+                                const VP8LBackwardRefs* const backward_refs,
+                                VP8LHistogramSet* const image) {
+  int i;
+  int x = 0, y = 0;
+  const int histo_xsize = VP8LSubSampleSize(xsize, histo_bits);
+  VP8LHistogram** const histograms = image->histograms;
+  assert(histo_bits > 0);
+  for (i = 0; i < backward_refs->size; ++i) {
+    const PixOrCopy* const v = &backward_refs->refs[i];
+    const int ix = (y >> histo_bits) * histo_xsize + (x >> histo_bits);
+    VP8LHistogramAddSinglePixOrCopy(histograms[ix], v);
+    x += PixOrCopyLength(v);
+    while (x >= xsize) {
+      x -= xsize;
+      ++y;
+    }
+  }
+}
+
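+// Cheap pseudo-random generator (multiplicative congruential generator with
+// multiplier 16807, wrapping modulo 2^32), only used to pick histogram pairs.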
+static uint32_t MyRand(uint32_t *seed) {
+  *seed *= 16807U;
+  if (*seed == 0) {
+    *seed = 1;
+  }
+  return *seed;
+}
+
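+// Greedy stochastic clustering: at each outer iteration, 'num_pairs' random
+// pairs of histograms are tried and the merge giving the largest bit-cost
+// saving (if any) is applied. The search stops early after roughly 50
+// consecutive iterations without an improving merge.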
+static int HistogramCombine(const VP8LHistogramSet* const in,
+                            VP8LHistogramSet* const out, int num_pairs) {
+  int ok = 0;
+  int i, iter;
+  uint32_t seed = 0;
+  int tries_with_no_success = 0;
+  const int min_cluster_size = 2;
+  int out_size = in->size;
+  const int outer_iters = in->size * 3;
+  VP8LHistogram* const histos = (VP8LHistogram*)malloc(2 * sizeof(*histos));
+  VP8LHistogram* cur_combo = histos + 0;    // trial merged histogram
+  VP8LHistogram* best_combo = histos + 1;   // best merged histogram so far
+  if (histos == NULL) goto End;
+
+  // Copy histograms from in[] to out[].
+  assert(in->size <= out->size);
+  for (i = 0; i < in->size; ++i) {
+    in->histograms[i]->bit_cost_ = VP8LHistogramEstimateBits(in->histograms[i]);
+    *out->histograms[i] = *in->histograms[i];
+  }
+
+  // Collapse similar histograms in 'out'.
+  for (iter = 0; iter < outer_iters && out_size >= min_cluster_size; ++iter) {
+    // We pick the best pair to be combined out of 'num_pairs' random pairs.
+    double best_cost_diff = 0.;
+    int best_idx1 = 0, best_idx2 = 1;
+    int j;
+    seed += iter;
+    for (j = 0; j < num_pairs; ++j) {
+      double curr_cost_diff;
+      // Choose two histograms at random and try to combine them.
+      const uint32_t idx1 = MyRand(&seed) % out_size;
+      const uint32_t tmp = ((j & 7) + 1) % (out_size - 1);
+      const uint32_t diff = (tmp < 3) ? tmp : MyRand(&seed) % (out_size - 1);
+      const uint32_t idx2 = (idx1 + diff + 1) % out_size;
+      if (idx1 == idx2) {
+        continue;
+      }
+      *cur_combo = *out->histograms[idx1];
+      VP8LHistogramAdd(cur_combo, out->histograms[idx2]);
+      cur_combo->bit_cost_ = VP8LHistogramEstimateBits(cur_combo);
+      // Calculate cost reduction on combining.
+      curr_cost_diff = cur_combo->bit_cost_
+                     - out->histograms[idx1]->bit_cost_
+                     - out->histograms[idx2]->bit_cost_;
+      if (best_cost_diff > curr_cost_diff) {    // found a better pair?
+        {     // swap cur/best combo histograms
+          VP8LHistogram* const tmp_histo = cur_combo;
+          cur_combo = best_combo;
+          best_combo = tmp_histo;
+        }
+        best_cost_diff = curr_cost_diff;
+        best_idx1 = idx1;
+        best_idx2 = idx2;
+      }
+    }
+
+    if (best_cost_diff < 0.0) {
+      *out->histograms[best_idx1] = *best_combo;
+      // swap best_idx2 slot with last one (which is now unused)
+      --out_size;
+      if (best_idx2 != out_size) {
+        out->histograms[best_idx2] = out->histograms[out_size];
+        out->histograms[out_size] = NULL;   // just for sanity checking.
+      }
+      tries_with_no_success = 0;
+    }
+    if (++tries_with_no_success >= 50) {
+      break;
+    }
+  }
+  out->size = out_size;
+  ok = 1;
+
+ End:
+  free(histos);
+  return ok;
+}
+
+// -----------------------------------------------------------------------------
+// Histogram refinement
+
+// Returns the increase in bit cost of adding 'square_histogram' to the
+// 'candidate' histogram, i.e. the cost of moving it from its current symbol
+// to the candidate's.
+// TODO(skal): we don't really need to copy the histogram and Add(). Instead
+// we just need a VP8LDualHistogramEstimateBits(A, B) estimation function.
+static double HistogramDistance(const VP8LHistogram* const square_histogram,
+                                const VP8LHistogram* const candidate) {
+  const double previous_bit_cost = candidate->bit_cost_;
+  double new_bit_cost;
+  VP8LHistogram modified_histo;
+  modified_histo = *candidate;
+  VP8LHistogramAdd(&modified_histo, square_histogram);
+  new_bit_cost = VP8LHistogramEstimateBits(&modified_histo);
+
+  return new_bit_cost - previous_bit_cost;
+}
+
+// Find the best 'out' histogram for each of the 'in' histograms.
+// Note: we assume that out[]->bit_cost_ is already up-to-date.
+static void HistogramRemap(const VP8LHistogramSet* const in,
+                           const VP8LHistogramSet* const out,
+                           uint16_t* const symbols) {
+  int i;
+  for (i = 0; i < in->size; ++i) {
+    int best_out = 0;
+    double best_bits = HistogramDistance(in->histograms[i], out->histograms[0]);
+    int k;
+    for (k = 1; k < out->size; ++k) {
+      const double cur_bits =
+          HistogramDistance(in->histograms[i], out->histograms[k]);
+      if (cur_bits < best_bits) {
+        best_bits = cur_bits;
+        best_out = k;
+      }
+    }
+    symbols[i] = best_out;
+  }
+
+  // Recompute each 'out' histogram from the 'in' histograms and 'symbols'.
+  for (i = 0; i < out->size; ++i) {
+    HistogramClear(out->histograms[i]);
+  }
+  for (i = 0; i < in->size; ++i) {
+    VP8LHistogramAdd(out->histograms[symbols[i]], in->histograms[i]);
+  }
+}
+
+int VP8LGetHistoImageSymbols(int xsize, int ysize,
+                             const VP8LBackwardRefs* const refs,
+                             int quality, int histo_bits, int cache_bits,
+                             VP8LHistogramSet* const image_in,
+                             uint16_t* const histogram_symbols) {
+  int ok = 0;
+  const int histo_xsize = histo_bits ? VP8LSubSampleSize(xsize, histo_bits) : 1;
+  const int histo_ysize = histo_bits ? VP8LSubSampleSize(ysize, histo_bits) : 1;
+  const int num_histo_pairs = 10 + quality / 2;  // For HistogramCombine().
+  const int histo_image_raw_size = histo_xsize * histo_ysize;
+  VP8LHistogramSet* const image_out =
+      VP8LAllocateHistogramSet(histo_image_raw_size, cache_bits);
+  if (image_out == NULL) return 0;
+
+  // Build histogram image.
+  HistogramBuildImage(xsize, histo_bits, refs, image_out);
+  // Collapse similar histograms.
+  if (!HistogramCombine(image_out, image_in, num_histo_pairs)) {
+    goto Error;
+  }
+  // Find the optimal map from original histograms to the final ones.
+  HistogramRemap(image_out, image_in, histogram_symbols);
+  ok = 1;
+
+Error:
+  free(image_out);
+  return ok;
+}
diff --git a/src/enc/histogram.h b/src/enc/histogram.h
new file mode 100644
index 0000000..b99b758
--- /dev/null
+++ b/src/enc/histogram.h
@@ -0,0 +1,115 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// Author: Jyrki Alakuijala (jyrki@google.com)
+//
+// Models the histograms of literal and distance codes.
+
+#ifndef WEBP_ENC_HISTOGRAM_H_
+#define WEBP_ENC_HISTOGRAM_H_
+
+#include <assert.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "./backward_references.h"
+#include "webp/format_constants.h"
+#include "webp/types.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+// A simple container for histograms of data.
+typedef struct {
+  // literal_ holds the combined histogram of green literals, palette codes
+  // and copy-length prefix codes.
+  int literal_[PIX_OR_COPY_CODES_MAX];
+  int red_[256];
+  int blue_[256];
+  int alpha_[256];
+  // Backward reference prefix-code histogram.
+  int distance_[NUM_DISTANCE_CODES];
+  int palette_code_bits_;
+  double bit_cost_;   // cached value of VP8LHistogramEstimateBits(this)
+} VP8LHistogram;
+
+// Collection of histograms with fixed capacity, allocated as one
+// big memory chunk. Can be destroyed by simply calling 'free()'.
+typedef struct {
+  int size;         // number of slots currently in use
+  int max_size;     // maximum capacity
+  VP8LHistogram** histograms;
+} VP8LHistogramSet;
+
+// Create the histogram.
+//
+// The input data is the PixOrCopy data, which models the literals, stop
+// codes and backward references (both distances and lengths).  Also: if
+// palette_code_bits is >= 0, initialize the histogram with this value.
+void VP8LHistogramCreate(VP8LHistogram* const p,
+                         const VP8LBackwardRefs* const refs,
+                         int palette_code_bits);
+
+// Set the palette_code_bits and reset the stats.
+void VP8LHistogramInit(VP8LHistogram* const p, int palette_code_bits);
+
+// Collect all the references into a histogram (without reset)
+void VP8LHistogramStoreRefs(const VP8LBackwardRefs* const refs,
+                            VP8LHistogram* const histo);
+
+// Allocate a set of histograms (and their storage), each initialized with
+// 'cache_bits'. Return NULL in case of memory error.
+VP8LHistogramSet* VP8LAllocateHistogramSet(int size, int cache_bits);
+
+// Accumulate a token 'v' into a histogram.
+void VP8LHistogramAddSinglePixOrCopy(VP8LHistogram* const histo,
+                                     const PixOrCopy* const v);
+
+// Estimate (approximately) how many bits are needed to code the combined
+// entropy of literals and distances.
+double VP8LHistogramEstimateBits(const VP8LHistogram* const p);
+
+// This function estimates the cost in bits excluding the bits needed to
+// represent the entropy code itself.
+double VP8LHistogramEstimateBitsBulk(const VP8LHistogram* const p);
+
+static WEBP_INLINE void VP8LHistogramAdd(VP8LHistogram* const p,
+                                         const VP8LHistogram* const a) {
+  int i;
+  for (i = 0; i < PIX_OR_COPY_CODES_MAX; ++i) {
+    p->literal_[i] += a->literal_[i];
+  }
+  for (i = 0; i < NUM_DISTANCE_CODES; ++i) {
+    p->distance_[i] += a->distance_[i];
+  }
+  for (i = 0; i < 256; ++i) {
+    p->red_[i] += a->red_[i];
+    p->blue_[i] += a->blue_[i];
+    p->alpha_[i] += a->alpha_[i];
+  }
+}
+
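+// Total number of codes in the 'literal_' histogram: 256 green/literal values,
+// NUM_LENGTH_CODES length-prefix codes, plus one code per color cache entry
+// (1 << palette_code_bits_) when a color cache is used.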
+static WEBP_INLINE int VP8LHistogramNumCodes(const VP8LHistogram* const p) {
+  return 256 + NUM_LENGTH_CODES +
+      ((p->palette_code_bits_ > 0) ? (1 << p->palette_code_bits_) : 0);
+}
+
+// Builds the histogram image.
+int VP8LGetHistoImageSymbols(int xsize, int ysize,
+                             const VP8LBackwardRefs* const refs,
+                             int quality, int histogram_bits, int cache_bits,
+                             VP8LHistogramSet* const image_in,
+                             uint16_t* const histogram_symbols);
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}
+#endif
+
+#endif  // WEBP_ENC_HISTOGRAM_H_
diff --git a/src/enc/iterator.c b/src/enc/iterator.c
index 3a8ad04..86e473b 100644
--- a/src/enc/iterator.c
+++ b/src/enc/iterator.c
@@ -1,4 +1,4 @@
-// Copyright 2011 Google Inc.
+// Copyright 2011 Google Inc. All Rights Reserved.
 //
 // This code is licensed under the same terms as WebM:
 //  Software License Agreement:  http://www.webmproject.org/license/software/
@@ -9,22 +9,22 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include <stdlib.h>
 #include <string.h>
-#include "vp8enci.h"
+
+#include "./vp8enci.h"
 
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // VP8Iterator
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 
 static void InitLeft(VP8EncIterator* const it) {
   const VP8Encoder* const enc = it->enc_;
   enc->y_left_[-1] = enc->u_left_[-1] = enc->v_left_[-1] =
-      (it->y_) > 0 ? 129 : 127;
+      (it->y_ > 0) ? 129 : 127;
   memset(enc->y_left_, 129, 16);
   memset(enc->u_left_, 129, 8);
   memset(enc->v_left_, 129, 8);
@@ -33,7 +33,7 @@
 
 static void InitTop(VP8EncIterator* const it) {
   const VP8Encoder* const enc = it->enc_;
-  const int top_size = enc->mb_w_ * 16;
+  const size_t top_size = enc->mb_w_ * 16;
   memset(enc->y_top_, 127, 2 * top_size);
   memset(enc->nz_, 0, enc->mb_w_ * sizeof(*enc->nz_));
 }
@@ -65,66 +65,81 @@
   it->yuv_out2_ = enc->yuv_out2_;
   it->yuv_p_    = enc->yuv_p_;
   it->lf_stats_ = enc->lf_stats_;
+  it->percent0_ = enc->percent_;
   VP8IteratorReset(it);
 }
 
-//-----------------------------------------------------------------------------
+int VP8IteratorProgress(const VP8EncIterator* const it, int delta) {
+  VP8Encoder* const enc = it->enc_;
+  if (delta && enc->pic_->progress_hook) {
+    const int percent = (enc->mb_h_ <= 1)
+                      ? it->percent0_
+                      : it->percent0_ + delta * it->y_ / (enc->mb_h_ - 1);
+    return WebPReportProgress(enc->pic_, percent, &enc->percent_);
+  }
+  return 1;
+}
+
+//------------------------------------------------------------------------------
 // Import the source samples into the cache. Takes care of replicating
 // boundary pixels if necessary.
 
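+// Copies a w x h block from 'src' (stride 'src_stride') into the BPS-strided
+// 'dst' cache, replicating the last column and then the last row up to 'size'
+// samples when the block lies on the right/bottom picture border.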
+static void ImportBlock(const uint8_t* src, int src_stride,
+                        uint8_t* dst, int w, int h, int size) {
+  int i;
+  for (i = 0; i < h; ++i) {
+    memcpy(dst, src, w);
+    if (w < size) {
+      memset(dst + w, dst[w - 1], size - w);
+    }
+    dst += BPS;
+    src += src_stride;
+  }
+  for (i = h; i < size; ++i) {
+    memcpy(dst, dst - BPS, size);
+    dst += BPS;
+  }
+}
+
 void VP8IteratorImport(const VP8EncIterator* const it) {
   const VP8Encoder* const enc = it->enc_;
   const int x = it->x_, y = it->y_;
   const WebPPicture* const pic = enc->pic_;
-  const uint8_t* ysrc = pic->y + (y * pic->y_stride + x) * 16;
-  const uint8_t* usrc = pic->u + (y * pic->uv_stride + x) * 8;
-  const uint8_t* vsrc = pic->v + (y * pic->uv_stride + x) * 8;
-  uint8_t* ydst = it->yuv_in_ + Y_OFF;
-  uint8_t* udst = it->yuv_in_ + U_OFF;
-  uint8_t* vdst = it->yuv_in_ + V_OFF;
+  const uint8_t* const ysrc = pic->y + (y * pic->y_stride + x) * 16;
+  const uint8_t* const usrc = pic->u + (y * pic->uv_stride + x) * 8;
+  const uint8_t* const vsrc = pic->v + (y * pic->uv_stride + x) * 8;
+  uint8_t* const ydst = it->yuv_in_ + Y_OFF;
+  uint8_t* const udst = it->yuv_in_ + U_OFF;
+  uint8_t* const vdst = it->yuv_in_ + V_OFF;
   int w = (pic->width - x * 16);
   int h = (pic->height - y * 16);
-  int i;
 
   if (w > 16) w = 16;
   if (h > 16) h = 16;
+
   // Luma plane
-  for (i = 0; i < h; ++i) {
-    memcpy(ydst, ysrc, w);
-    if (w < 16) memset(ydst + w, ydst[w - 1], 16 - w);
-    ydst += BPS;
-    ysrc += pic->y_stride;
-  }
-  for (i = h; i < 16; ++i) {
-    memcpy(ydst, ydst - BPS, 16);
-    ydst += BPS;
-  }
-  // U/V plane
-  w = (w + 1) / 2;
-  h = (h + 1) / 2;
-  for (i = 0; i < h; ++i) {
-    memcpy(udst, usrc, w);
-    memcpy(vdst, vsrc, w);
-    if (w < 8) {
-      memset(udst + w, udst[w - 1], 8 - w);
-      memset(vdst + w, vdst[w - 1], 8 - w);
-    }
-    udst += BPS;
-    vdst += BPS;
-    usrc += pic->uv_stride;
-    vsrc += pic->uv_stride;
-  }
-  for (i = h; i < 8; ++i) {
-    memcpy(udst, udst - BPS, 8);
-    memcpy(vdst, vdst - BPS, 8);
-    udst += BPS;
-    vdst += BPS;
+  ImportBlock(ysrc, pic->y_stride, ydst, w, h, 16);
+
+  {   // U/V planes
+    const int uv_w = (w + 1) >> 1;
+    const int uv_h = (h + 1) >> 1;
+    ImportBlock(usrc, pic->uv_stride, udst, uv_w, uv_h, 8);
+    ImportBlock(vsrc, pic->uv_stride, vdst, uv_w, uv_h, 8);
   }
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Copy back the compressed samples into user space if requested.
 
+static void ExportBlock(const uint8_t* src, uint8_t* dst, int dst_stride,
+                        int w, int h) {
+  while (h-- > 0) {
+    memcpy(dst, src, w);
+    dst += dst_stride;
+    src += BPS;
+  }
+}
+
 void VP8IteratorExport(const VP8EncIterator* const it) {
   const VP8Encoder* const enc = it->enc_;
   if (enc->config_->show_compressed) {
@@ -133,33 +148,28 @@
     const uint8_t* const usrc = it->yuv_out_ + U_OFF;
     const uint8_t* const vsrc = it->yuv_out_ + V_OFF;
     const WebPPicture* const pic = enc->pic_;
-    uint8_t* ydst = pic->y + (y * pic->y_stride + x) * 16;
-    uint8_t* udst = pic->u + (y * pic->uv_stride + x) * 8;
-    uint8_t* vdst = pic->v + (y * pic->uv_stride + x) * 8;
+    uint8_t* const ydst = pic->y + (y * pic->y_stride + x) * 16;
+    uint8_t* const udst = pic->u + (y * pic->uv_stride + x) * 8;
+    uint8_t* const vdst = pic->v + (y * pic->uv_stride + x) * 8;
     int w = (pic->width - x * 16);
     int h = (pic->height - y * 16);
-    int i;
 
     if (w > 16) w = 16;
     if (h > 16) h = 16;
 
     // Luma plane
-    for (i = 0; i < h; ++i) {
-      memcpy(ydst + i * pic->y_stride, ysrc + i * BPS, w);
-    }
-    // U/V plane
-    {
-      const int uv_w = (w + 1) / 2;
-      const int uv_h = (h + 1) / 2;
-      for (i = 0; i < uv_h; ++i) {
-        memcpy(udst + i * pic->uv_stride, usrc + i * BPS, uv_w);
-        memcpy(vdst + i * pic->uv_stride, vsrc + i * BPS, uv_w);
-      }
+    ExportBlock(ysrc, ydst, pic->y_stride, w, h);
+
+    {   // U/V planes
+      const int uv_w = (w + 1) >> 1;
+      const int uv_h = (h + 1) >> 1;
+      ExportBlock(usrc, udst, pic->uv_stride, uv_w, uv_h);
+      ExportBlock(vsrc, vdst, pic->uv_stride, uv_w, uv_h);
     }
   }
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Non-zero contexts setup/teardown
 
 // Nz bits:
@@ -178,54 +188,58 @@
 
 void VP8IteratorNzToBytes(VP8EncIterator* const it) {
   const int tnz = it->nz_[0], lnz = it->nz_[-1];
+  int* const top_nz = it->top_nz_;
+  int* const left_nz = it->left_nz_;
 
   // Top-Y
-  it->top_nz_[0] = BIT(tnz, 12);
-  it->top_nz_[1] = BIT(tnz, 13);
-  it->top_nz_[2] = BIT(tnz, 14);
-  it->top_nz_[3] = BIT(tnz, 15);
+  top_nz[0] = BIT(tnz, 12);
+  top_nz[1] = BIT(tnz, 13);
+  top_nz[2] = BIT(tnz, 14);
+  top_nz[3] = BIT(tnz, 15);
   // Top-U
-  it->top_nz_[4] = BIT(tnz, 18);
-  it->top_nz_[5] = BIT(tnz, 19);
+  top_nz[4] = BIT(tnz, 18);
+  top_nz[5] = BIT(tnz, 19);
   // Top-V
-  it->top_nz_[6] = BIT(tnz, 22);
-  it->top_nz_[7] = BIT(tnz, 23);
+  top_nz[6] = BIT(tnz, 22);
+  top_nz[7] = BIT(tnz, 23);
   // DC
-  it->top_nz_[8] = BIT(tnz, 24);
+  top_nz[8] = BIT(tnz, 24);
 
   // left-Y
-  it->left_nz_[0] = BIT(lnz,  3);
-  it->left_nz_[1] = BIT(lnz,  7);
-  it->left_nz_[2] = BIT(lnz, 11);
-  it->left_nz_[3] = BIT(lnz, 15);
+  left_nz[0] = BIT(lnz,  3);
+  left_nz[1] = BIT(lnz,  7);
+  left_nz[2] = BIT(lnz, 11);
+  left_nz[3] = BIT(lnz, 15);
   // left-U
-  it->left_nz_[4] = BIT(lnz, 17);
-  it->left_nz_[5] = BIT(lnz, 19);
+  left_nz[4] = BIT(lnz, 17);
+  left_nz[5] = BIT(lnz, 19);
   // left-V
-  it->left_nz_[6] = BIT(lnz, 21);
-  it->left_nz_[7] = BIT(lnz, 23);
+  left_nz[6] = BIT(lnz, 21);
+  left_nz[7] = BIT(lnz, 23);
   // left-DC is special, iterated separately
 }
 
 void VP8IteratorBytesToNz(VP8EncIterator* const it) {
   uint32_t nz = 0;
+  const int* const top_nz = it->top_nz_;
+  const int* const left_nz = it->left_nz_;
   // top
-  nz |= (it->top_nz_[0] << 12) | (it->top_nz_[1] << 13);
-  nz |= (it->top_nz_[2] << 14) | (it->top_nz_[3] << 15);
-  nz |= (it->top_nz_[4] << 18) | (it->top_nz_[5] << 19);
-  nz |= (it->top_nz_[6] << 22) | (it->top_nz_[7] << 23);
-  nz |= (it->top_nz_[8] << 24);  // we propagate the _top_ bit, esp. for intra4
+  nz |= (top_nz[0] << 12) | (top_nz[1] << 13);
+  nz |= (top_nz[2] << 14) | (top_nz[3] << 15);
+  nz |= (top_nz[4] << 18) | (top_nz[5] << 19);
+  nz |= (top_nz[6] << 22) | (top_nz[7] << 23);
+  nz |= (top_nz[8] << 24);  // we propagate the _top_ bit, esp. for intra4
   // left
-  nz |= (it->left_nz_[0] << 3) | (it->left_nz_[1] << 7);
-  nz |= (it->left_nz_[2] << 11);
-  nz |= (it->left_nz_[4] << 17) | (it->left_nz_[6] << 21);
+  nz |= (left_nz[0] << 3) | (left_nz[1] << 7);
+  nz |= (left_nz[2] << 11);
+  nz |= (left_nz[4] << 17) | (left_nz[6] << 21);
 
   *it->nz_ = nz;
 }
 
 #undef BIT
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Advance to the next position, doing the bookkeeping.
 
 int VP8IteratorNext(VP8EncIterator* const it,
@@ -270,12 +284,12 @@
   return (0 < --it->done_);
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Helper function to set mode properties
 
 void VP8SetIntra16Mode(const VP8EncIterator* const it, int mode) {
-  int y;
   uint8_t* preds = it->preds_;
+  int y;
   for (y = 0; y < 4; ++y) {
     memset(preds, mode, 4);
     preds += it->enc_->preds_w_;
@@ -283,14 +297,13 @@
   it->mb_->type_ = 1;
 }
 
-void VP8SetIntra4Mode(const VP8EncIterator* const it, int modes[16]) {
-  int x, y;
+void VP8SetIntra4Mode(const VP8EncIterator* const it, const uint8_t* modes) {
   uint8_t* preds = it->preds_;
-  for (y = 0; y < 4; ++y) {
-    for (x = 0; x < 4; ++x) {
-      preds[x] = modes[x + y * 4];
-    }
+  int y;
+  for (y = 4; y > 0; --y) {
+    memcpy(preds, modes, 4 * sizeof(*modes));
     preds += it->enc_->preds_w_;
+    modes += 4;
   }
   it->mb_->type_ = 0;
 }
@@ -307,7 +320,7 @@
   it->mb_->segment_ = segment;
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Intra4x4 sub-blocks iteration
 //
 //  We store and update the boundary samples into an array of 37 pixels. They
@@ -347,7 +360,7 @@
 };
 
 void VP8IteratorStartI4(VP8EncIterator* const it) {
-  VP8Encoder* const enc = it->enc_;
+  const VP8Encoder* const enc = it->enc_;
   int i;
 
   it->i4_ = 0;    // first 4x4 sub-block
@@ -393,7 +406,7 @@
     }
   }
   // move pointers to next sub-block
-  it->i4_++;
+  ++it->i4_;
   if (it->i4_ == 16) {    // we're done
     return 0;
   }
@@ -402,7 +415,7 @@
   return 1;
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
diff --git a/src/enc/layer.c b/src/enc/layer.c
index ec4dc87..423127d 100644
--- a/src/enc/layer.c
+++ b/src/enc/layer.c
@@ -1,4 +1,4 @@
-// Copyright 2011 Google Inc.
+// Copyright 2011 Google Inc. All Rights Reserved.
 //
 // This code is licensed under the same terms as WebM:
 //  Software License Agreement:  http://www.webmproject.org/license/software/
@@ -9,19 +9,15 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include <assert.h>
 #include <stdlib.h>
-#include "vp8enci.h"
+
+#include "./vp8enci.h"
 
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-
-#endif    /* WEBP_EXPERIMENTAL_FEATURES */
-
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 
 void VP8EncInitLayer(VP8Encoder* const enc) {
   enc->use_layer_ = (enc->pic_->u0 != NULL);
@@ -34,8 +30,6 @@
 
 void VP8EncCodeLayerBlock(VP8EncIterator* it) {
   (void)it;   // remove a warning
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-#endif    /* WEBP_EXPERIMENTAL_FEATURES */
 }
 
 int VP8EncFinishLayer(VP8Encoder* const enc) {
diff --git a/src/enc/picture.c b/src/enc/picture.c
index b644662..44eed06 100644
--- a/src/enc/picture.c
+++ b/src/enc/picture.c
@@ -1,4 +1,4 @@
-// Copyright 2011 Google Inc.
+// Copyright 2011 Google Inc. All Rights Reserved.
 //
 // This code is licensed under the same terms as WebM:
 //  Software License Agreement:  http://www.webmproject.org/license/software/
@@ -11,302 +11,384 @@
 
 #include <assert.h>
 #include <stdlib.h>
-#include "vp8enci.h"
+#include <math.h>
+
+#include "./vp8enci.h"
+#include "../utils/rescaler.h"
+#include "../utils/utils.h"
+#include "../dsp/dsp.h"
+#include "../dsp/yuv.h"
 
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 
-//-----------------------------------------------------------------------------
-// WebPPicture
-//-----------------------------------------------------------------------------
+#define HALVE(x) (((x) + 1) >> 1)
+#define IS_YUV_CSP(csp, YUV_CSP) (((csp) & WEBP_CSP_UV_MASK) == (YUV_CSP))
 
-int WebPPictureAlloc(WebPPicture* const picture) {
-  if (picture) {
+static const union {
+  uint32_t argb;
+  uint8_t  bytes[4];
+} test_endian = { 0xff000000u };
+#define ALPHA_IS_LAST (test_endian.bytes[3] == 0xff)
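+// (ALPHA_IS_LAST is true on little-endian targets, where the alpha byte of a
+// 0xAARRGGBB word is stored at the highest address.)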
+
+//------------------------------------------------------------------------------
+// WebPPicture
+//------------------------------------------------------------------------------
+
+int WebPPictureAlloc(WebPPicture* picture) {
+  if (picture != NULL) {
     const WebPEncCSP uv_csp = picture->colorspace & WEBP_CSP_UV_MASK;
     const int has_alpha = picture->colorspace & WEBP_CSP_ALPHA_BIT;
     const int width = picture->width;
     const int height = picture->height;
-    const int y_stride = width;
-    const int uv_width = (width + 1) / 2;
-    const int uv_height = (height + 1) / 2;
-    const int uv_stride = uv_width;
-    int uv0_stride = 0;
-    int a_width, a_stride;
-    uint64_t y_size, uv_size, uv0_size, a_size, total_size;
-    uint8_t* mem;
 
-    // U/V
-    switch (uv_csp) {
-      case WEBP_YUV420:
-        break;
+    if (!picture->use_argb) {
+      const int y_stride = width;
+      const int uv_width = HALVE(width);
+      const int uv_height = HALVE(height);
+      const int uv_stride = uv_width;
+      int uv0_stride = 0;
+      int a_width, a_stride;
+      uint64_t y_size, uv_size, uv0_size, a_size, total_size;
+      uint8_t* mem;
+
+      // U/V
+      switch (uv_csp) {
+        case WEBP_YUV420:
+          break;
 #ifdef WEBP_EXPERIMENTAL_FEATURES
-      case WEBP_YUV400:    // for now, we'll just reset the U/V samples
-        break;
-      case WEBP_YUV422:
-        uv0_stride = uv_width;
-        break;
-      case WEBP_YUV444:
-        uv0_stride = width;
-        break;
+        case WEBP_YUV400:    // for now, we'll just reset the U/V samples
+          break;
+        case WEBP_YUV422:
+          uv0_stride = uv_width;
+          break;
+        case WEBP_YUV444:
+          uv0_stride = width;
+          break;
 #endif
-      default:
+        default:
+          return 0;
+      }
+      uv0_size = height * uv0_stride;
+
+      // alpha
+      a_width = has_alpha ? width : 0;
+      a_stride = a_width;
+      y_size = (uint64_t)y_stride * height;
+      uv_size = (uint64_t)uv_stride * uv_height;
+      a_size =  (uint64_t)a_stride * height;
+
+      total_size = y_size + a_size + 2 * uv_size + 2 * uv0_size;
+
+      // Security and validation checks
+      if (width <= 0 || height <= 0 ||         // luma/alpha param error
+          uv_width < 0 || uv_height < 0) {     // u/v param error
         return 0;
-    }
-    uv0_size = height * uv0_stride;
+      }
+      // Clear previous buffer and allocate a new one.
+      WebPPictureFree(picture);   // erase previous buffer
+      mem = (uint8_t*)WebPSafeMalloc(total_size, sizeof(*mem));
+      if (mem == NULL) return 0;
 
-    // alpha
-    a_width = has_alpha ? width : 0;
-    a_stride = a_width;
-    y_size = (uint64_t)y_stride * height;
-    uv_size = (uint64_t)uv_stride * uv_height;
-    a_size =  (uint64_t)a_stride * height;
+      // From now on, we're in the clear, we can no longer fail...
+      picture->memory_ = (void*)mem;
+      picture->y_stride  = y_stride;
+      picture->uv_stride = uv_stride;
+      picture->a_stride  = a_stride;
+      picture->uv0_stride = uv0_stride;
+      // TODO(skal): we could align the y/u/v planes and adjust stride.
+      picture->y = mem;
+      mem += y_size;
 
-    total_size = y_size + a_size + 2 * uv_size + 2 * uv0_size;
+      picture->u = mem;
+      mem += uv_size;
+      picture->v = mem;
+      mem += uv_size;
 
-    // Security and validation checks
-    if (width <= 0 || height <= 0 ||       // check for luma/alpha param error
-        uv_width < 0 || uv_height < 0 ||   // check for u/v param error
-        y_size >= (1ULL << 40) ||            // check for reasonable global size
-        (size_t)total_size != total_size) {  // check for overflow on 32bit
-      return 0;
-    }
-    picture->y_stride  = y_stride;
-    picture->uv_stride = uv_stride;
-    picture->a_stride  = a_stride;
-    picture->uv0_stride  = uv0_stride;
-    WebPPictureFree(picture);   // erase previous buffer
-    mem = (uint8_t*)malloc((size_t)total_size);
-    if (mem == NULL) return 0;
+      if (a_size) {
+        picture->a = mem;
+        mem += a_size;
+      }
+      if (uv0_size) {
+        picture->u0 = mem;
+        mem += uv0_size;
+        picture->v0 = mem;
+        mem += uv0_size;
+      }
+    } else {
+      void* memory;
+      const uint64_t argb_size = (uint64_t)width * height;
+      if (width <= 0 || height <= 0) {
+        return 0;
+      }
+      // Clear previous buffer and allocate a new one.
+      WebPPictureFree(picture);   // erase previous buffer
+      memory = WebPSafeMalloc(argb_size, sizeof(*picture->argb));
+      if (memory == NULL) return 0;
 
-    picture->y = mem;
-    mem += y_size;
-
-    picture->u = mem;
-    mem += uv_size;
-    picture->v = mem;
-    mem += uv_size;
-
-    if (a_size) {
-      picture->a = mem;
-      mem += a_size;
-    }
-    if (uv0_size) {
-      picture->u0 = mem;
-      mem += uv0_size;
-      picture->v0 = mem;
-      mem += uv0_size;
+      // TODO(skal): align plane to cache line?
+      picture->memory_argb_ = memory;
+      picture->argb = (uint32_t*)memory;
+      picture->argb_stride = width;
     }
   }
   return 1;
 }
 
-// Grab the 'specs' (writer, *opaque, width, height...) from 'src' and copy them
-// into 'dst'. Mark 'dst' as not owning any memory. 'src' can be NULL.
-static void WebPPictureGrabSpecs(const WebPPicture* const src,
-                                 WebPPicture* const dst) {
-  if (src) *dst = *src;
-  dst->y = dst->u = dst->v = NULL;
-  dst->u0 = dst->v0 = NULL;
-  dst->a = NULL;
+// Remove reference to the ARGB buffer (doesn't free anything).
+static void PictureResetARGB(WebPPicture* const picture) {
+  picture->memory_argb_ = NULL;
+  picture->argb = NULL;
+  picture->argb_stride = 0;
 }
 
-// Release memory owned by 'picture'.
-void WebPPictureFree(WebPPicture* const picture) {
-  if (picture) {
-    free(picture->y);
-    WebPPictureGrabSpecs(NULL, picture);
+// Remove reference to the YUVA buffer (doesn't free anything).
+static void PictureResetYUVA(WebPPicture* const picture) {
+  picture->memory_ = NULL;
+  picture->y = picture->u = picture->v = picture->a = NULL;
+  picture->u0 = picture->v0 = NULL;
+  picture->y_stride = picture->uv_stride = 0;
+  picture->a_stride = 0;
+  picture->uv0_stride = 0;
+}
+
+// Grab the 'specs' (writer, *opaque, width, height...) from 'src' and copy them
+// into 'dst'. Mark 'dst' as not owning any memory.
+static void WebPPictureGrabSpecs(const WebPPicture* const src,
+                                 WebPPicture* const dst) {
+  assert(src != NULL && dst != NULL);
+  *dst = *src;
+  PictureResetYUVA(dst);
+  PictureResetARGB(dst);
+}
+
+// Allocate a new argb buffer, discarding any existing one and preserving
+// the other YUV(A) buffer.
+static int PictureAllocARGB(WebPPicture* const picture) {
+  WebPPicture tmp;
+  free(picture->memory_argb_);
+  PictureResetARGB(picture);
+  picture->use_argb = 1;
+  WebPPictureGrabSpecs(picture, &tmp);
+  if (!WebPPictureAlloc(&tmp)) {
+    return WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
+  }
+  picture->memory_argb_ = tmp.memory_argb_;
+  picture->argb = tmp.argb;
+  picture->argb_stride = tmp.argb_stride;
+  return 1;
+}
+
+// Release memory owned by 'picture' (both YUV and ARGB buffers).
+void WebPPictureFree(WebPPicture* picture) {
+  if (picture != NULL) {
+    free(picture->memory_);
+    free(picture->memory_argb_);
+    PictureResetYUVA(picture);
+    PictureResetARGB(picture);
   }
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Picture copying
 
-int WebPPictureCopy(const WebPPicture* const src, WebPPicture* const dst) {
-  int y;
+// Not worth moving to dsp/enc.c (only used here).
+static void CopyPlane(const uint8_t* src, int src_stride,
+                      uint8_t* dst, int dst_stride, int width, int height) {
+  while (height-- > 0) {
+    memcpy(dst, src, width);
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+// Adjust top-left corner to chroma sample position.
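+// (For YUV420 both coordinates are snapped to even values so that the crop
+// origin falls on a chroma sample; for YUV422 only the horizontal one is.)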
+static void SnapTopLeftPosition(const WebPPicture* const pic,
+                                int* const left, int* const top) {
+  if (!pic->use_argb) {
+    const int is_yuv422 = IS_YUV_CSP(pic->colorspace, WEBP_YUV422);
+    if (IS_YUV_CSP(pic->colorspace, WEBP_YUV420) || is_yuv422) {
+      *left &= ~1;
+      if (!is_yuv422) *top &= ~1;
+    }
+  }
+}
+
+// Adjust top-left corner and verify that the sub-rectangle is valid.
+static int AdjustAndCheckRectangle(const WebPPicture* const pic,
+                                   int* const left, int* const top,
+                                   int width, int height) {
+  SnapTopLeftPosition(pic, left, top);
+  if ((*left) < 0 || (*top) < 0) return 0;
+  if (width <= 0 || height <= 0) return 0;
+  if ((*left) + width > pic->width) return 0;
+  if ((*top) + height > pic->height) return 0;
+  return 1;
+}
+
+int WebPPictureCopy(const WebPPicture* src, WebPPicture* dst) {
   if (src == NULL || dst == NULL) return 0;
   if (src == dst) return 1;
 
   WebPPictureGrabSpecs(src, dst);
   if (!WebPPictureAlloc(dst)) return 0;
 
-  for (y = 0; y < dst->height; ++y) {
-    memcpy(dst->y + y * dst->y_stride,
-           src->y + y * src->y_stride, src->width);
-  }
-  for (y = 0; y < (dst->height + 1) / 2; ++y) {
-    memcpy(dst->u + y * dst->uv_stride,
-           src->u + y * src->uv_stride, (src->width + 1) / 2);
-    memcpy(dst->v + y * dst->uv_stride,
-           src->v + y * src->uv_stride, (src->width + 1) / 2);
-  }
+  if (!src->use_argb) {
+    CopyPlane(src->y, src->y_stride,
+              dst->y, dst->y_stride, dst->width, dst->height);
+    CopyPlane(src->u, src->uv_stride,
+              dst->u, dst->uv_stride, HALVE(dst->width), HALVE(dst->height));
+    CopyPlane(src->v, src->uv_stride,
+              dst->v, dst->uv_stride, HALVE(dst->width), HALVE(dst->height));
+    if (dst->a != NULL)  {
+      CopyPlane(src->a, src->a_stride,
+                dst->a, dst->a_stride, dst->width, dst->height);
+    }
 #ifdef WEBP_EXPERIMENTAL_FEATURES
-  if (dst->a != NULL)  {
-    for (y = 0; y < dst->height; ++y) {
-      memcpy(dst->a + y * dst->a_stride,
-             src->a + y * src->a_stride, src->width);
+    if (dst->u0 != NULL)  {
+      int uv0_width = src->width;
+      if (IS_YUV_CSP(dst->colorspace, WEBP_YUV422)) {
+        uv0_width = HALVE(uv0_width);
+      }
+      CopyPlane(src->u0, src->uv0_stride,
+                dst->u0, dst->uv0_stride, uv0_width, dst->height);
+      CopyPlane(src->v0, src->uv0_stride,
+                dst->v0, dst->uv0_stride, uv0_width, dst->height);
     }
-  }
-  if (dst->u0 != NULL)  {
-    int uv0_width = src->width;
-    if ((dst->colorspace & WEBP_CSP_UV_MASK) == WEBP_YUV422) {
-      uv0_width = (uv0_width + 1) / 2;
-    }
-    for (y = 0; y < dst->height; ++y) {
-      memcpy(dst->u0 + y * dst->uv0_stride,
-             src->u0 + y * src->uv0_stride, uv0_width);
-      memcpy(dst->v0 + y * dst->uv0_stride,
-             src->v0 + y * src->uv0_stride, uv0_width);
-    }
-  }
 #endif
+  } else {
+    CopyPlane((const uint8_t*)src->argb, 4 * src->argb_stride,
+              (uint8_t*)dst->argb, 4 * dst->argb_stride,
+              4 * dst->width, dst->height);
+  }
   return 1;
 }
 
-//-----------------------------------------------------------------------------
+int WebPPictureIsView(const WebPPicture* picture) {
+  if (picture == NULL) return 0;
+  if (picture->use_argb) {
+    return (picture->memory_argb_ == NULL);
+  }
+  return (picture->memory_ == NULL);
+}
+
+int WebPPictureView(const WebPPicture* src,
+                    int left, int top, int width, int height,
+                    WebPPicture* dst) {
+  if (src == NULL || dst == NULL) return 0;
+
+  // verify rectangle position.
+  if (!AdjustAndCheckRectangle(src, &left, &top, width, height)) return 0;
+
+  if (src != dst) {  // beware of aliasing! We don't want to leak 'memory_'.
+    WebPPictureGrabSpecs(src, dst);
+  }
+  dst->width = width;
+  dst->height = height;
+  if (!src->use_argb) {
+    dst->y = src->y + top * src->y_stride + left;
+    dst->u = src->u + (top >> 1) * src->uv_stride + (left >> 1);
+    dst->v = src->v + (top >> 1) * src->uv_stride + (left >> 1);
+    if (src->a != NULL) {
+      dst->a = src->a + top * src->a_stride + left;
+    }
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+    if (src->u0 != NULL) {
+      const int left_pos =
+          IS_YUV_CSP(dst->colorspace, WEBP_YUV422) ? (left >> 1) : left;
+      dst->u0 = src->u0 + top * src->uv0_stride + left_pos;
+      dst->v0 = src->v0 + top * src->uv0_stride + left_pos;
+    }
+#endif
+  } else {
+    dst->argb = src->argb + top * src->argb_stride + left;
+  }
+  return 1;
+}
+
+//------------------------------------------------------------------------------
 // Picture cropping
 
-int WebPPictureCrop(WebPPicture* const pic,
+int WebPPictureCrop(WebPPicture* pic,
                     int left, int top, int width, int height) {
   WebPPicture tmp;
-  int y;
 
   if (pic == NULL) return 0;
-  if (width <= 0 || height <= 0) return 0;
-  if (left < 0 || ((left + width + 1) & ~1) > pic->width) return 0;
-  if (top < 0 || ((top + height + 1) & ~1) > pic->height) return 0;
+  if (!AdjustAndCheckRectangle(pic, &left, &top, width, height)) return 0;
 
   WebPPictureGrabSpecs(pic, &tmp);
   tmp.width = width;
   tmp.height = height;
   if (!WebPPictureAlloc(&tmp)) return 0;
 
-  for (y = 0; y < height; ++y) {
-    memcpy(tmp.y + y * tmp.y_stride,
-           pic->y + (top + y) * pic->y_stride + left, width);
-  }
-  for (y = 0; y < (height + 1) / 2; ++y) {
-    const int offset = (y + top / 2) * pic->uv_stride + left / 2;
-    memcpy(tmp.u + y * tmp.uv_stride, pic->u + offset, (width + 1) / 2);
-    memcpy(tmp.v + y * tmp.uv_stride, pic->v + offset, (width + 1) / 2);
-  }
+  if (!pic->use_argb) {
+    const int y_offset = top * pic->y_stride + left;
+    const int uv_offset = (top / 2) * pic->uv_stride + left / 2;
+    CopyPlane(pic->y + y_offset, pic->y_stride,
+              tmp.y, tmp.y_stride, width, height);
+    CopyPlane(pic->u + uv_offset, pic->uv_stride,
+              tmp.u, tmp.uv_stride, HALVE(width), HALVE(height));
+    CopyPlane(pic->v + uv_offset, pic->uv_stride,
+              tmp.v, tmp.uv_stride, HALVE(width), HALVE(height));
 
+    if (tmp.a != NULL) {
+      const int a_offset = top * pic->a_stride + left;
+      CopyPlane(pic->a + a_offset, pic->a_stride,
+                tmp.a, tmp.a_stride, width, height);
+    }
 #ifdef WEBP_EXPERIMENTAL_FEATURES
-  if (tmp.a) {
-    for (y = 0; y < height; ++y) {
-      memcpy(tmp.a + y * tmp.a_stride,
-           pic->a + (top + y) * pic->a_stride + left, width);
+    if (tmp.u0 != NULL) {
+      int w = width;
+      int left_pos = left;
+      if (IS_YUV_CSP(tmp.colorspace, WEBP_YUV422)) {
+        w = HALVE(w);
+        left_pos = HALVE(left_pos);
+      }
+      CopyPlane(pic->u0 + top * pic->uv0_stride + left_pos, pic->uv0_stride,
+                tmp.u0, tmp.uv0_stride, w, height);
+      CopyPlane(pic->v0 + top * pic->uv0_stride + left_pos, pic->uv0_stride,
+                tmp.v0, tmp.uv0_stride, w, height);
     }
-  }
-  if (tmp.u0) {
-    int w = width;
-    int l = left;
-    if (tmp.colorspace == WEBP_YUV422) {
-      w = (w + 1) / 2;
-      l = (l + 1) / 2;
-    }
-    for (y = 0; y < height; ++y) {
-      memcpy(tmp.u0 + y * tmp.uv0_stride,
-             pic->u0 + (top + y) * pic->uv0_stride + l, w);
-      memcpy(tmp.v0 + y * tmp.uv0_stride,
-             pic->v0 + (top + y) * pic->uv0_stride + l, w);
-    }
-  }
 #endif
-
+  } else {
+    const uint8_t* const src =
+        (const uint8_t*)(pic->argb + top * pic->argb_stride + left);
+    CopyPlane(src, pic->argb_stride * 4,
+              (uint8_t*)tmp.argb, tmp.argb_stride * 4,
+              width * 4, height);
+  }
   WebPPictureFree(pic);
   *pic = tmp;
   return 1;
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Simple picture rescaler
 
-#define RFIX 30
-#define MULT(x,y) (((int64_t)(x) * (y) + (1 << (RFIX - 1))) >> RFIX)
-static inline void ImportRow(const uint8_t* src, int src_width,
-                             int32_t* frow, int32_t* irow, int dst_width) {
-  const int x_expand = (src_width < dst_width);
-  const int fx_scale = (1 << RFIX) / dst_width;
-  int x_in = 0;
-  int x_out;
-  int x_accum = 0;
-  if (!x_expand) {
-    int sum = 0;
-    for (x_out = 0; x_out < dst_width; ++x_out) {
-      x_accum += src_width - dst_width;
-      for (; x_accum > 0; x_accum -= dst_width) {
-        sum += src[x_in++];
-      }
-      {        // Emit next horizontal pixel.
-        const int32_t base = src[x_in++];
-        const int32_t frac = base * (-x_accum);
-        frow[x_out] = (sum + base) * dst_width - frac;
-        sum = MULT(frac, fx_scale);    // fresh fractional start for next pixel
-      }
-    }
-  } else {        // simple bilinear interpolation
-    int left = src[0], right = src[0];
-    for (x_out = 0; x_out < dst_width; ++x_out) {
-      if (x_accum < 0) {
-        left = right;
-        right = src[++x_in];
-        x_accum += dst_width - 1;
-      }
-      frow[x_out] = right * (dst_width - 1) + (left - right) * x_accum;
-      x_accum -= src_width - 1;
-    }
-  }
-  // Accumulate the new row's contribution
-  for (x_out = 0; x_out < dst_width; ++x_out) {
-    irow[x_out] += frow[x_out];
-  }
-}
-
-static void ExportRow(int32_t* frow, int32_t* irow, uint8_t* dst, int dst_width,
-                      const int yscale, const int64_t fxy_scale) {
-  int x_out;
-  for (x_out = 0; x_out < dst_width; ++x_out) {
-    const int frac = MULT(frow[x_out], yscale);
-    const int v = MULT(irow[x_out] - frac, fxy_scale);
-    dst[x_out] = (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255;
-    irow[x_out] = frac;   // new fractional start
-  }
-}
-
 static void RescalePlane(const uint8_t* src,
                          int src_width, int src_height, int src_stride,
                          uint8_t* dst,
                          int dst_width, int dst_height, int dst_stride,
-                         int32_t* const work) {
-  const int x_expand = (src_width < dst_width);
-  const int fy_scale = (1 << RFIX) / dst_height;
-  const int64_t fxy_scale = x_expand ?
-      ((int64_t)dst_height << RFIX) / (dst_width * src_height) :
-      ((int64_t)dst_height << RFIX) / (src_width * src_height);
-  int y_accum = src_height;
-  int y;
-  int32_t* irow = work;              // integral contribution
-  int32_t* frow = work + dst_width;  // fractional contribution
-
-  memset(work, 0, 2 * dst_width * sizeof(*work));
-  for (y = 0; y < src_height; ++y) {
-    // import new contribution of one source row.
-    ImportRow(src, src_width, frow, irow, dst_width);
-    src += src_stride;
-    // emit output row(s)
-    y_accum -= dst_height;
-    for (; y_accum <= 0; y_accum += src_height) {
-      const int yscale = fy_scale * (-y_accum);
-      ExportRow(frow, irow, dst, dst_width, yscale, fxy_scale);
-      dst += dst_stride;
-    }
+                         int32_t* const work,
+                         int num_channels) {
+  WebPRescaler rescaler;
+  int y = 0;
+  WebPRescalerInit(&rescaler, src_width, src_height,
+                   dst, dst_width, dst_height, dst_stride,
+                   num_channels,
+                   src_width, dst_width,
+                   src_height, dst_height,
+                   work);
+  memset(work, 0, 2 * dst_width * num_channels * sizeof(*work));
+  while (y < src_height) {
+    y += WebPRescalerImport(&rescaler, src_height - y,
+                            src + y * src_stride, src_stride);
+    WebPRescalerExport(&rescaler);
   }
 }
-#undef MULT
-#undef RFIX
 
-int WebPPictureRescale(WebPPicture* const pic, int width, int height) {
+int WebPPictureRescale(WebPPicture* pic, int width, int height) {
   WebPPicture tmp;
   int prev_width, prev_height;
   int32_t* work;
@@ -330,123 +412,139 @@
   tmp.height = height;
   if (!WebPPictureAlloc(&tmp)) return 0;
 
-  work = malloc(2 * width * sizeof(int32_t));
-  if (work == NULL) {
-    WebPPictureFree(&tmp);
-    return 0;
-  }
-
-  RescalePlane(pic->y, prev_width, prev_height, pic->y_stride,
-               tmp.y, width, height, tmp.y_stride, work);
-  RescalePlane(pic->u,
-               (prev_width + 1) / 2, (prev_height + 1) / 2, pic->uv_stride,
-               tmp.u,
-               (width + 1) / 2, (height + 1) / 2, tmp.uv_stride, work);
-  RescalePlane(pic->v,
-               (prev_width + 1) / 2, (prev_height + 1) / 2, pic->uv_stride,
-               tmp.v,
-               (width + 1) / 2, (height + 1) / 2, tmp.uv_stride, work);
-
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-  if (tmp.a) {
-    RescalePlane(pic->a, prev_width, prev_height, pic->a_stride,
-                 tmp.a, width, height, tmp.a_stride, work);
-  }
-  if (tmp.u0) {
-    int s = 1;
-    if ((tmp.colorspace & WEBP_CSP_UV_MASK) == WEBP_YUV422) {
-      s = 2;
+  if (!pic->use_argb) {
+    work = (int32_t*)WebPSafeMalloc(2ULL * width, sizeof(*work));
+    if (work == NULL) {
+      WebPPictureFree(&tmp);
+      return 0;
     }
-    RescalePlane(
-        pic->u0, (prev_width + s / 2) / s, prev_height, pic->uv0_stride,
-        tmp.u0, (width + s / 2) / s, height, tmp.uv0_stride, work);
-    RescalePlane(
-        pic->v0, (prev_width + s / 2) / s, prev_height, pic->uv0_stride,
-        tmp.v0, (width + s / 2) / s, height, tmp.uv0_stride, work);
-  }
-#endif
 
+    RescalePlane(pic->y, prev_width, prev_height, pic->y_stride,
+                 tmp.y, width, height, tmp.y_stride, work, 1);
+    RescalePlane(pic->u,
+                 HALVE(prev_width), HALVE(prev_height), pic->uv_stride,
+                 tmp.u,
+                 HALVE(width), HALVE(height), tmp.uv_stride, work, 1);
+    RescalePlane(pic->v,
+                 HALVE(prev_width), HALVE(prev_height), pic->uv_stride,
+                 tmp.v,
+                 HALVE(width), HALVE(height), tmp.uv_stride, work, 1);
+
+    if (tmp.a != NULL) {
+      RescalePlane(pic->a, prev_width, prev_height, pic->a_stride,
+                   tmp.a, width, height, tmp.a_stride, work, 1);
+    }
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+    if (tmp.u0 != NULL) {
+      const int s = IS_YUV_CSP(tmp.colorspace, WEBP_YUV422) ? 2 : 1;
+      RescalePlane(
+          pic->u0, (prev_width + s / 2) / s, prev_height, pic->uv0_stride,
+          tmp.u0, (width + s / 2) / s, height, tmp.uv0_stride, work, 1);
+      RescalePlane(
+          pic->v0, (prev_width + s / 2) / s, prev_height, pic->uv0_stride,
+          tmp.v0, (width + s / 2) / s, height, tmp.uv0_stride, work, 1);
+    }
+#endif
+  } else {
+    work = (int32_t*)WebPSafeMalloc(2ULL * width * 4, sizeof(*work));
+    if (work == NULL) {
+      WebPPictureFree(&tmp);
+      return 0;
+    }
+
+    RescalePlane((const uint8_t*)pic->argb, prev_width, prev_height,
+                 pic->argb_stride * 4,
+                 (uint8_t*)tmp.argb, width, height,
+                 tmp.argb_stride * 4,
+                 work, 4);
+
+  }
   WebPPictureFree(pic);
   free(work);
   *pic = tmp;
   return 1;
 }
 
-//-----------------------------------------------------------------------------
-// Write-to-memory
+//------------------------------------------------------------------------------
+// WebPMemoryWriter: Write-to-memory
 
-typedef struct {
-  uint8_t** mem;
-  size_t    max_size;
-  size_t*   size;
-} WebPMemoryWriter;
-
-static void InitMemoryWriter(WebPMemoryWriter* const writer) {
-  *writer->mem = NULL;
-  *writer->size = 0;
+void WebPMemoryWriterInit(WebPMemoryWriter* writer) {
+  writer->mem = NULL;
+  writer->size = 0;
   writer->max_size = 0;
 }
 
-static int WebPMemoryWrite(const uint8_t* data, size_t data_size,
-                           const WebPPicture* const picture) {
+int WebPMemoryWrite(const uint8_t* data, size_t data_size,
+                    const WebPPicture* picture) {
   WebPMemoryWriter* const w = (WebPMemoryWriter*)picture->custom_ptr;
-  size_t next_size;
+  uint64_t next_size;
   if (w == NULL) {
     return 1;
   }
-  next_size = (*w->size) + data_size;
+  next_size = (uint64_t)w->size + data_size;
   if (next_size > w->max_size) {
     uint8_t* new_mem;
-    size_t next_max_size = w->max_size * 2;
+    uint64_t next_max_size = 2ULL * w->max_size;
     if (next_max_size < next_size) next_max_size = next_size;
-    if (next_max_size < 8192) next_max_size = 8192;
-    new_mem = (uint8_t*)malloc(next_max_size);
+    if (next_max_size < 8192ULL) next_max_size = 8192ULL;
+    new_mem = (uint8_t*)WebPSafeMalloc(next_max_size, 1);
     if (new_mem == NULL) {
       return 0;
     }
-    if ((*w->size) > 0) {
-      memcpy(new_mem, *w->mem, *w->size);
+    if (w->size > 0) {
+      memcpy(new_mem, w->mem, w->size);
     }
-    free(*w->mem);
-    *w->mem = new_mem;
-    w->max_size = next_max_size;
+    free(w->mem);
+    w->mem = new_mem;
+    // down-cast is ok, thanks to WebPSafeMalloc
+    w->max_size = (size_t)next_max_size;
   }
-  if (data_size) {
-    memcpy((*w->mem) + (*w->size), data, data_size);
-    *w->size += data_size;
+  if (data_size > 0) {
+    memcpy(w->mem + w->size, data, data_size);
+    w->size += data_size;
   }
   return 1;
 }
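+
+// Typical use of the memory writer (illustrative sketch only, using the
+// standard WebPConfig/WebPPicture/WebPEncode flow; not part of this change):
+//   WebPMemoryWriter wrt;
+//   WebPMemoryWriterInit(&wrt);
+//   pic.writer = WebPMemoryWrite;
+//   pic.custom_ptr = &wrt;
+//   ok = WebPEncode(&config, &pic);   // compressed data in wrt.mem / wrt.size
+//   ...
+//   free(wrt.mem);                    // the caller owns the buffer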
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
+// Detection of non-trivial transparency
+
+// Returns true if alpha[] has non-0xff values.
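+// 'x_step' is the byte step between consecutive alpha samples within a row
+// (e.g. 4 for interleaved RGBA, 1 for a plain alpha plane) and 'y_step' is
+// the byte stride from one row to the next.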
+static int CheckNonOpaque(const uint8_t* alpha, int width, int height,
+                          int x_step, int y_step) {
+  if (alpha == NULL) return 0;
+  while (height-- > 0) {
+    int x;
+    for (x = 0; x < width * x_step; x += x_step) {
+      if (alpha[x] != 0xff) return 1;  // TODO(skal): check 4/8 bytes at a time.
+    }
+    alpha += y_step;
+  }
+  return 0;
+}
+
+// Check for the presence of non-opaque alpha values.
+int WebPPictureHasTransparency(const WebPPicture* picture) {
+  if (picture == NULL) return 0;
+  if (!picture->use_argb) {
+    return CheckNonOpaque(picture->a, picture->width, picture->height,
+                          1, picture->a_stride);
+  } else {
+    int x, y;
+    const uint32_t* argb = picture->argb;
+    if (argb == NULL) return 0;
+    for (y = 0; y < picture->height; ++y) {
+      for (x = 0; x < picture->width; ++x) {
+        if (argb[x] < 0xff000000u) return 1;   // test any alpha values != 0xff
+      }
+      argb += picture->argb_stride;
+    }
+  }
+  return 0;
+}
+
+//------------------------------------------------------------------------------
 // RGB -> YUV conversion
-// The exact naming is Y'CbCr, following the ITU-R BT.601 standard.
-// More information at: http://en.wikipedia.org/wiki/YCbCr
-// Y = 0.2569 * R + 0.5044 * G + 0.0979 * B + 16
-// U = -0.1483 * R - 0.2911 * G + 0.4394 * B + 128
-// V = 0.4394 * R - 0.3679 * G - 0.0715 * B + 128
-// We use 16bit fixed point operations.
-
-enum { YUV_FRAC = 16 };
-
-static inline int clip_uv(int v) {
-   v = (v + (257 << (YUV_FRAC + 2 - 1))) >> (YUV_FRAC + 2);
-   return ((v & ~0xff) == 0) ? v : (v < 0) ? 0 : 255;
-}
-
-static inline int rgb_to_y(int r, int g, int b) {
-  const int kRound = (1 << (YUV_FRAC - 1)) + (16 << YUV_FRAC);
-  const int luma = 16839 * r + 33059 * g + 6420 * b;
-  return (luma + kRound) >> YUV_FRAC;  // no need to clip
-}
-
-static inline int rgb_to_u(int r, int g, int b) {
-  return clip_uv(-9719 * r - 19081 * g + 28800 * b);
-}
-
-static inline int rgb_to_v(int r, int g, int b) {
-  return clip_uv(+28800 * r - 24116 * g - 4684 * b);
-}
 
 // TODO: we can do better than simply 2x2 averaging on U/V samples.
 #define SUM4(ptr) ((ptr)[0] + (ptr)[step] + \
@@ -460,8 +558,8 @@
   const int r = SUM(r_ptr + src);                        \
   const int g = SUM(g_ptr + src);                        \
   const int b = SUM(b_ptr + src);                        \
-  picture->u[dst] = rgb_to_u(r, g, b);                   \
-  picture->v[dst] = rgb_to_v(r, g, b);                   \
+  picture->u[dst] = VP8RGBToU(r, g, b);                  \
+  picture->v[dst] = VP8RGBToV(r, g, b);                  \
 }
 
 #define RGB_TO_UV0(x_in, x_out, y, SUM) {                \
@@ -470,36 +568,46 @@
   const int r = SUM(r_ptr + src);                        \
   const int g = SUM(g_ptr + src);                        \
   const int b = SUM(b_ptr + src);                        \
-  picture->u0[dst] = rgb_to_u(r, g, b);                  \
-  picture->v0[dst] = rgb_to_v(r, g, b);                  \
+  picture->u0[dst] = VP8RGBToU(r, g, b);                 \
+  picture->v0[dst] = VP8RGBToV(r, g, b);                 \
 }
 
 static void MakeGray(WebPPicture* const picture) {
   int y;
-  const int uv_width =  (picture->width + 1) >> 1;
-  for (y = 0; y < ((picture->height + 1) >> 1); ++y) {
+  const int uv_width = HALVE(picture->width);
+  const int uv_height = HALVE(picture->height);
+  for (y = 0; y < uv_height; ++y) {
     memset(picture->u + y * picture->uv_stride, 128, uv_width);
     memset(picture->v + y * picture->uv_stride, 128, uv_width);
   }
 }
 
-static int Import(WebPPicture* const picture,
-                  const uint8_t* const rgb, int rgb_stride,
-                  int step, int swap_rb, int import_alpha) {
+static int ImportYUVAFromRGBA(const uint8_t* const r_ptr,
+                              const uint8_t* const g_ptr,
+                              const uint8_t* const b_ptr,
+                              const uint8_t* const a_ptr,
+                              int step,         // bytes per pixel
+                              int rgb_stride,   // bytes per scanline
+                              WebPPicture* const picture) {
   const WebPEncCSP uv_csp = picture->colorspace & WEBP_CSP_UV_MASK;
   int x, y;
-  const uint8_t* const r_ptr = rgb + (swap_rb ? 2 : 0);
-  const uint8_t* const g_ptr = rgb + 1;
-  const uint8_t* const b_ptr = rgb + (swap_rb ? 0 : 2);
   const int width = picture->width;
   const int height = picture->height;
+  const int has_alpha = CheckNonOpaque(a_ptr, width, height, step, rgb_stride);
+
+  picture->colorspace = uv_csp;
+  picture->use_argb = 0;
+  if (has_alpha) {
+    picture->colorspace |= WEBP_CSP_ALPHA_BIT;
+  }
+  if (!WebPPictureAlloc(picture)) return 0;
 
   // Import luma plane
   for (y = 0; y < height; ++y) {
     for (x = 0; x < width; ++x) {
       const int offset = step * x + y * rgb_stride;
       picture->y[x + y * picture->y_stride] =
-        rgb_to_y(r_ptr[offset], g_ptr[offset], b_ptr[offset]);
+          VP8RGBToY(r_ptr[offset], g_ptr[offset], b_ptr[offset]);
     }
   }
 
@@ -509,7 +617,7 @@
       for (x = 0; x < (width >> 1); ++x) {
         RGB_TO_UV(x, y, SUM4);
       }
-      if (picture->width & 1) {
+      if (width & 1) {
         RGB_TO_UV(x, y, SUM2V);
       }
     }
@@ -545,17 +653,65 @@
     MakeGray(picture);
   }
 
-  if (import_alpha) {
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-    const uint8_t* const a_ptr = rgb + 3;
+  if (has_alpha) {
     assert(step >= 4);
     for (y = 0; y < height; ++y) {
       for (x = 0; x < width; ++x) {
         picture->a[x + y * picture->a_stride] =
-          a_ptr[step * x + y * rgb_stride];
+            a_ptr[step * x + y * rgb_stride];
       }
     }
-#endif
+  }
+  return 1;
+}
+
+static int Import(WebPPicture* const picture,
+                  const uint8_t* const rgb, int rgb_stride,
+                  int step, int swap_rb, int import_alpha) {
+  const uint8_t* const r_ptr = rgb + (swap_rb ? 2 : 0);
+  const uint8_t* const g_ptr = rgb + 1;
+  const uint8_t* const b_ptr = rgb + (swap_rb ? 0 : 2);
+  const uint8_t* const a_ptr = import_alpha ? rgb + 3 : NULL;
+  const int width = picture->width;
+  const int height = picture->height;
+
+  if (!picture->use_argb) {
+    return ImportYUVAFromRGBA(r_ptr, g_ptr, b_ptr, a_ptr, step, rgb_stride,
+                              picture);
+  }
+  if (import_alpha) {
+    picture->colorspace |= WEBP_CSP_ALPHA_BIT;
+  } else {
+    picture->colorspace &= ~WEBP_CSP_ALPHA_BIT;
+  }
+  if (!WebPPictureAlloc(picture)) return 0;
+
+  if (!import_alpha) {
+    int x, y;
+    for (y = 0; y < height; ++y) {
+      for (x = 0; x < width; ++x) {
+        const int offset = step * x + y * rgb_stride;
+        const uint32_t argb =
+            0xff000000u |
+            (r_ptr[offset] << 16) |
+            (g_ptr[offset] <<  8) |
+            (b_ptr[offset]);
+        picture->argb[x + y * picture->argb_stride] = argb;
+      }
+    }
+  } else {
+    int x, y;
+    assert(step >= 4);
+    for (y = 0; y < height; ++y) {
+      for (x = 0; x < width; ++x) {
+        const int offset = step * x + y * rgb_stride;
+        const uint32_t argb = (a_ptr[offset] << 24) |
+                              (r_ptr[offset] << 16) |
+                              (g_ptr[offset] <<  8) |
+                              (b_ptr[offset]);
+        picture->argb[x + y * picture->argb_stride] = argb;
+      }
+    }
   }
   return 1;
 }
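For readability, the pixel layout assumed by the packing above is A in bits 31..24, R in 23..16, G in 15..8 and B in 7..0. A standalone sketch with hypothetical helper names:

  static uint32_t PackARGB(uint8_t a, uint8_t r, uint8_t g, uint8_t b) {
    return ((uint32_t)a << 24) | ((uint32_t)r << 16) | ((uint32_t)g << 8) | b;
  }
  static int IsOpaque(uint32_t argb) {
    /* equivalent to !(argb < 0xff000000u) used by WebPPictureHasTransparency() */
    return (argb >> 24) == 0xff;
  }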
@@ -565,42 +721,264 @@
 #undef SUM1
 #undef RGB_TO_UV
 
-int WebPPictureImportRGB(WebPPicture* const picture,
-                         const uint8_t* const rgb, int rgb_stride) {
-  picture->colorspace &= ~WEBP_CSP_ALPHA_BIT;
-  if (!WebPPictureAlloc(picture)) return 0;
+int WebPPictureImportRGB(WebPPicture* picture,
+                         const uint8_t* rgb, int rgb_stride) {
   return Import(picture, rgb, rgb_stride, 3, 0, 0);
 }
 
-int WebPPictureImportBGR(WebPPicture* const picture,
-                         const uint8_t* const rgb, int rgb_stride) {
-  picture->colorspace &= ~WEBP_CSP_ALPHA_BIT;
-  if (!WebPPictureAlloc(picture)) return 0;
+int WebPPictureImportBGR(WebPPicture* picture,
+                         const uint8_t* rgb, int rgb_stride) {
   return Import(picture, rgb, rgb_stride, 3, 1, 0);
 }
 
-int WebPPictureImportRGBA(WebPPicture* const picture,
-                          const uint8_t* const rgba, int rgba_stride) {
-  picture->colorspace |= WEBP_CSP_ALPHA_BIT;
-  if (!WebPPictureAlloc(picture)) return 0;
+int WebPPictureImportRGBA(WebPPicture* picture,
+                          const uint8_t* rgba, int rgba_stride) {
   return Import(picture, rgba, rgba_stride, 4, 0, 1);
 }
 
-int WebPPictureImportBGRA(WebPPicture* const picture,
-                          const uint8_t* const rgba, int rgba_stride) {
-  picture->colorspace |= WEBP_CSP_ALPHA_BIT;
-  if (!WebPPictureAlloc(picture)) return 0;
+int WebPPictureImportBGRA(WebPPicture* picture,
+                          const uint8_t* rgba, int rgba_stride) {
   return Import(picture, rgba, rgba_stride, 4, 1, 1);
 }
 
-//-----------------------------------------------------------------------------
-// Simplest call:
+int WebPPictureImportRGBX(WebPPicture* picture,
+                          const uint8_t* rgba, int rgba_stride) {
+  return Import(picture, rgba, rgba_stride, 4, 0, 0);
+}
+
+int WebPPictureImportBGRX(WebPPicture* picture,
+                          const uint8_t* rgba, int rgba_stride) {
+  return Import(picture, rgba, rgba_stride, 4, 1, 0);
+}
+
+//------------------------------------------------------------------------------
+// Automatic YUV <-> ARGB conversions.
+
+int WebPPictureYUVAToARGB(WebPPicture* picture) {
+  if (picture == NULL) return 0;
+  if (picture->memory_ == NULL || picture->y == NULL ||
+      picture->u == NULL || picture->v == NULL) {
+    return WebPEncodingSetError(picture, VP8_ENC_ERROR_NULL_PARAMETER);
+  }
+  if ((picture->colorspace & WEBP_CSP_ALPHA_BIT) && picture->a == NULL) {
+    return WebPEncodingSetError(picture, VP8_ENC_ERROR_NULL_PARAMETER);
+  }
+  if ((picture->colorspace & WEBP_CSP_UV_MASK) != WEBP_YUV420) {
+    return WebPEncodingSetError(picture, VP8_ENC_ERROR_INVALID_CONFIGURATION);
+  }
+  // Allocate a new argb buffer (discarding the previous one).
+  if (!PictureAllocARGB(picture)) return 0;
+
+  // Convert
+  {
+    int y;
+    const int width = picture->width;
+    const int height = picture->height;
+    const int argb_stride = 4 * picture->argb_stride;
+    uint8_t* dst = (uint8_t*)picture->argb;
+    const uint8_t *cur_u = picture->u, *cur_v = picture->v, *cur_y = picture->y;
+    WebPUpsampleLinePairFunc upsample = WebPGetLinePairConverter(ALPHA_IS_LAST);
+
+    // First row, with replicated top samples.
+    upsample(NULL, cur_y, cur_u, cur_v, cur_u, cur_v, NULL, dst, width);
+    cur_y += picture->y_stride;
+    dst += argb_stride;
+    // Center rows.
+    for (y = 1; y + 1 < height; y += 2) {
+      const uint8_t* const top_u = cur_u;
+      const uint8_t* const top_v = cur_v;
+      cur_u += picture->uv_stride;
+      cur_v += picture->uv_stride;
+      upsample(cur_y, cur_y + picture->y_stride, top_u, top_v, cur_u, cur_v,
+               dst, dst + argb_stride, width);
+      cur_y += 2 * picture->y_stride;
+      dst += 2 * argb_stride;
+    }
+    // Last row (if needed), with replicated bottom samples.
+    if (height > 1 && !(height & 1)) {
+      upsample(cur_y, NULL, cur_u, cur_v, cur_u, cur_v, dst, NULL, width);
+    }
+    // Insert alpha values if needed, replacing the default 0xff ones.
+    if (picture->colorspace & WEBP_CSP_ALPHA_BIT) {
+      for (y = 0; y < height; ++y) {
+        uint32_t* const dst = picture->argb + y * picture->argb_stride;
+        const uint8_t* const src = picture->a + y * picture->a_stride;
+        int x;
+        for (x = 0; x < width; ++x) {
+          dst[x] = (dst[x] & 0x00ffffffu) | (src[x] << 24);
+        }
+      }
+    }
+  }
+  return 1;
+}
+
+int WebPPictureARGBToYUVA(WebPPicture* picture, WebPEncCSP colorspace) {
+  if (picture == NULL) return 0;
+  if (picture->argb == NULL) {
+    return WebPEncodingSetError(picture, VP8_ENC_ERROR_NULL_PARAMETER);
+  } else {
+    const uint8_t* const argb = (const uint8_t*)picture->argb;
+    const uint8_t* const r = ALPHA_IS_LAST ? argb + 2 : argb + 1;
+    const uint8_t* const g = ALPHA_IS_LAST ? argb + 1 : argb + 2;
+    const uint8_t* const b = ALPHA_IS_LAST ? argb + 0 : argb + 3;
+    const uint8_t* const a = ALPHA_IS_LAST ? argb + 3 : argb + 0;
+    // We work on a tmp copy of 'picture', because ImportYUVAFromRGBA()
+    // would call WebPPictureFree(picture) otherwise.
+    WebPPicture tmp = *picture;
+    PictureResetARGB(&tmp);  // reset ARGB buffer so that it's not free()'d.
+    tmp.use_argb = 0;
+    tmp.colorspace = colorspace & WEBP_CSP_UV_MASK;
+    if (!ImportYUVAFromRGBA(r, g, b, a, 4, 4 * picture->argb_stride, &tmp)) {
+      return WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
+    }
+    // Copy back the YUV specs into 'picture'.
+    tmp.argb = picture->argb;
+    tmp.argb_stride = picture->argb_stride;
+    tmp.memory_argb_ = picture->memory_argb_;
+    *picture = tmp;
+  }
+  return 1;
+}
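A short usage sketch for the two new conversion entry points, assuming a picture that currently holds ARGB samples (use_argb == 1):

  if (!WebPPictureARGBToYUVA(&pic, WEBP_YUV420)) {
    /* failed; WebPEncodingSetError() was called on 'pic' */
  }
  /* ... operate on pic.y / pic.u / pic.v (and pic.a if present) ... */
  if (!WebPPictureYUVAToARGB(&pic)) {
    /* conversion back failed */
  }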
+
+//------------------------------------------------------------------------------
+// Helper: clean up fully transparent area to help compressibility.
+
+#define SIZE 8
+#define SIZE2 (SIZE / 2)
+static int is_transparent_area(const uint8_t* ptr, int stride, int size) {
+  int y, x;
+  for (y = 0; y < size; ++y) {
+    for (x = 0; x < size; ++x) {
+      if (ptr[x]) {
+        return 0;
+      }
+    }
+    ptr += stride;
+  }
+  return 1;
+}
+
+static WEBP_INLINE void flatten(uint8_t* ptr, int v, int stride, int size) {
+  int y;
+  for (y = 0; y < size; ++y) {
+    memset(ptr, v, size);
+    ptr += stride;
+  }
+}
+
+void WebPCleanupTransparentArea(WebPPicture* pic) {
+  int x, y, w, h;
+  const uint8_t* a_ptr;
+  int values[3] = { 0 };
+
+  if (pic == NULL) return;
+
+  a_ptr = pic->a;
+  if (a_ptr == NULL) return;    // nothing to do
+
+  w = pic->width / SIZE;
+  h = pic->height / SIZE;
+  for (y = 0; y < h; ++y) {
+    int need_reset = 1;
+    for (x = 0; x < w; ++x) {
+      const int off_a = (y * pic->a_stride + x) * SIZE;
+      const int off_y = (y * pic->y_stride + x) * SIZE;
+      const int off_uv = (y * pic->uv_stride + x) * SIZE2;
+      if (is_transparent_area(a_ptr + off_a, pic->a_stride, SIZE)) {
+        if (need_reset) {
+          values[0] = pic->y[off_y];
+          values[1] = pic->u[off_uv];
+          values[2] = pic->v[off_uv];
+          need_reset = 0;
+        }
+        flatten(pic->y + off_y, values[0], pic->y_stride, SIZE);
+        flatten(pic->u + off_uv, values[1], pic->uv_stride, SIZE2);
+        flatten(pic->v + off_uv, values[2], pic->uv_stride, SIZE2);
+      } else {
+        need_reset = 1;
+      }
+    }
+    // ignore the left-overs on right/bottom
+  }
+}
+
+#undef SIZE
+#undef SIZE2
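Usage note (sketch): the cleanup is an optional pre-pass on a YUVA picture, typically run after import and before encoding. It only rewrites blocks whose alpha is entirely zero, so the visible result is unchanged while the planes compress better ('config' assumed set up elsewhere):

  if (WebPPictureHasTransparency(&pic)) {
    WebPCleanupTransparentArea(&pic);   /* flatten fully transparent 8x8 blocks */
  }
  ok = WebPEncode(&config, &pic);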
+
+
+//------------------------------------------------------------------------------
+// Distortion
+
+// Max value returned in case of exact similarity.
+static const double kMinDistortion_dB = 99.;
+
+int WebPPictureDistortion(const WebPPicture* pic1, const WebPPicture* pic2,
+                          int type, float result[5]) {
+  int c;
+  DistoStats stats[5];
+  int has_alpha;
+
+  if (pic1 == NULL || pic2 == NULL ||
+      pic1->width != pic2->width || pic1->height != pic2->height ||
+      pic1->y == NULL || pic2->y == NULL ||
+      pic1->u == NULL || pic2->u == NULL ||
+      pic1->v == NULL || pic2->v == NULL ||
+      result == NULL) {
+    return 0;
+  }
+  // TODO(skal): provide distortion for ARGB too.
+  if (pic1->use_argb == 1 || pic1->use_argb != pic2->use_argb) {
+    return 0;
+  }
+
+  has_alpha = !!(pic1->colorspace & WEBP_CSP_ALPHA_BIT);
+  if (has_alpha != !!(pic2->colorspace & WEBP_CSP_ALPHA_BIT) ||
+      (has_alpha && (pic1->a == NULL || pic2->a == NULL))) {
+    return 0;
+  }
+
+  memset(stats, 0, sizeof(stats));
+  VP8SSIMAccumulatePlane(pic1->y, pic1->y_stride,
+                         pic2->y, pic2->y_stride,
+                         pic1->width, pic1->height, &stats[0]);
+  VP8SSIMAccumulatePlane(pic1->u, pic1->uv_stride,
+                         pic2->u, pic2->uv_stride,
+                         (pic1->width + 1) >> 1, (pic1->height + 1) >> 1,
+                         &stats[1]);
+  VP8SSIMAccumulatePlane(pic1->v, pic1->uv_stride,
+                         pic2->v, pic2->uv_stride,
+                         (pic1->width + 1) >> 1, (pic1->height + 1) >> 1,
+                         &stats[2]);
+  if (has_alpha) {
+    VP8SSIMAccumulatePlane(pic1->a, pic1->a_stride,
+                           pic2->a, pic2->a_stride,
+                           pic1->width, pic1->height, &stats[3]);
+  }
+  for (c = 0; c <= 4; ++c) {
+    if (type == 1) {
+      const double v = VP8SSIMGet(&stats[c]);
+      result[c] = (float)((v < 1.) ? -10.0 * log10(1. - v)
+                                   : kMinDistortion_dB);
+    } else {
+      const double v = VP8SSIMGetSquaredError(&stats[c]);
+      result[c] = (float)((v > 0.) ? -4.3429448 * log(v / (255 * 255.))
+                                   : kMinDistortion_dB);
+    }
+    // Accumulate forward
+    if (c < 4) VP8SSIMAddStats(&stats[c], &stats[4]);
+  }
+  return 1;
+}
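Caller-side sketch for the new metric. Both pictures must be YUV(A) (use_argb == 0) with identical dimensions; indices 0..3 are the Y/U/V/A planes and index 4 the accumulated total:

  float disto[5];
  /* type: 1 selects SSIM, any other value selects PSNR (both reported in dB) */
  if (WebPPictureDistortion(&pic1, &pic2, 0, disto)) {
    /* disto[0..3]: Y, U, V, A; disto[4]: all planes combined */
  }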
+
+//------------------------------------------------------------------------------
+// Simplest high-level calls:
 
 typedef int (*Importer)(WebPPicture* const, const uint8_t* const, int);
 
 static size_t Encode(const uint8_t* rgba, int width, int height, int stride,
-                     Importer import, float quality_factor, uint8_t** output) {
-  size_t output_size = 0;
+                     Importer import, float quality_factor, int lossless,
+                     uint8_t** output) {
   WebPPicture pic;
   WebPConfig config;
   WebPMemoryWriter wrt;
@@ -611,29 +989,29 @@
     return 0;  // shouldn't happen, except if system installation is broken
   }
 
+  config.lossless = !!lossless;
+  pic.use_argb = !!lossless;
   pic.width = width;
   pic.height = height;
   pic.writer = WebPMemoryWrite;
   pic.custom_ptr = &wrt;
-
-  wrt.mem = output;
-  wrt.size = &output_size;
-  InitMemoryWriter(&wrt);
+  WebPMemoryWriterInit(&wrt);
 
   ok = import(&pic, rgba, stride) && WebPEncode(&config, &pic);
   WebPPictureFree(&pic);
   if (!ok) {
-    free(*output);
+    free(wrt.mem);
     *output = NULL;
     return 0;
   }
-  return output_size;
+  *output = wrt.mem;
+  return wrt.size;
 }
 
-#define ENCODE_FUNC(NAME, IMPORTER) \
-size_t NAME(const uint8_t* in, int w, int h, int bps, float q, \
-            uint8_t** out) { \
-  return Encode(in, w, h, bps, IMPORTER, q, out);  \
+#define ENCODE_FUNC(NAME, IMPORTER)                                     \
+size_t NAME(const uint8_t* in, int w, int h, int bps, float q,          \
+            uint8_t** out) {                                            \
+  return Encode(in, w, h, bps, IMPORTER, q, 0, out);                    \
 }
 
 ENCODE_FUNC(WebPEncodeRGB, WebPPictureImportRGB);
@@ -643,7 +1021,20 @@
 
 #undef ENCODE_FUNC
 
-//-----------------------------------------------------------------------------
+#define LOSSLESS_DEFAULT_QUALITY 70.
+#define LOSSLESS_ENCODE_FUNC(NAME, IMPORTER)                                 \
+size_t NAME(const uint8_t* in, int w, int h, int bps, uint8_t** out) {       \
+  return Encode(in, w, h, bps, IMPORTER, LOSSLESS_DEFAULT_QUALITY, 1, out);  \
+}
+
+LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessRGB, WebPPictureImportRGB);
+LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessBGR, WebPPictureImportBGR);
+LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessRGBA, WebPPictureImportRGBA);
+LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessBGRA, WebPPictureImportBGRA);
+
+#undef LOSSLESS_ENCODE_FUNC
+
+//------------------------------------------------------------------------------
 
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
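End-to-end sketch of the simplest calls after this change, including the new lossless variants. The output buffer is malloc'd by the library in this version, so a plain free() releases it; 'rgb', 'rgba', 'width', 'height' and 'stride' are assumed inputs:

  uint8_t* out = NULL;
  size_t size;

  /* lossy, quality 75, 'stride' bytes per input scanline */
  size = WebPEncodeRGB(rgb, width, height, stride, 75.f, &out);
  if (size > 0) {
    /* write out[0..size-1] somewhere */
  }
  free(out);

  /* lossless, packed RGBA input */
  size = WebPEncodeLosslessRGBA(rgba, width, height, stride, &out);
  free(out);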
diff --git a/src/enc/quant.c b/src/enc/quant.c
index 31ec814..ea15384 100644
--- a/src/enc/quant.c
+++ b/src/enc/quant.c
@@ -1,4 +1,4 @@
-// Copyright 2011 Google Inc.
+// Copyright 2011 Google Inc. All Rights Reserved.
 //
 // This code is licensed under the same terms as WebM:
 //  Software License Agreement:  http://www.webmproject.org/license/software/
@@ -12,8 +12,8 @@
 #include <assert.h>
 #include <math.h>
 
-#include "vp8enci.h"
-#include "cost.h"
+#include "./vp8enci.h"
+#include "./cost.h"
 
 #define DO_TRELLIS_I4  1
 #define DO_TRELLIS_I16 1   // not a huge gain, but ok at low bitrate.
@@ -33,13 +33,13 @@
 extern "C" {
 #endif
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 
-static inline int clip(int v, int m, int M) {
+static WEBP_INLINE int clip(int v, int m, int M) {
   return v < m ? m : v > M ? M : v;
 }
 
-const uint8_t VP8Zigzag[16] = {
+static const uint8_t kZigzag[16] = {
   0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
 };
 
@@ -132,7 +132,7 @@
   90, 90, 90, 90
 };
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Initialize quantization parameters in VP8Matrix
 
 // Returns the average quantizer
@@ -143,7 +143,7 @@
     m->q_[i] = m->q_[1];
   }
   for (i = 0; i < 16; ++i) {
-    const int j = VP8Zigzag[i];
+    const int j = kZigzag[i];
     const int bias = kBiasMatrices[type][j];
     m->iq_[j] = (1 << QFIX) / m->q_[j];
     m->bias_[j] = BIAS(bias);
@@ -192,7 +192,7 @@
   }
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Initialize filtering parameters
 
 // Very small filter-strength values have close to no visual effect. So we can
@@ -214,7 +214,7 @@
   enc->filter_hdr_.sharpness_ = enc->config_->filter_sharpness;
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 
 // Note: if you change the values below, remember that the max range
 // allowed by the syntax for DQ_UV is [-16,16].
@@ -286,7 +286,7 @@
   SetupFilterStrength(enc);   // initialize segments' filtering, eventually
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Form the predictions in cache
 
 // Must be ordered using {DC_PRED, TM_PRED, V_PRED, H_PRED} as index
@@ -299,16 +299,16 @@
 };
 
 void VP8MakeLuma16Preds(const VP8EncIterator* const it) {
-  VP8Encoder* const enc = it->enc_;
-  const uint8_t* left = it->x_ ? enc->y_left_ : NULL;
-  const uint8_t* top = it->y_ ? enc->y_top_ + it->x_ * 16 : NULL;
+  const VP8Encoder* const enc = it->enc_;
+  const uint8_t* const left = it->x_ ? enc->y_left_ : NULL;
+  const uint8_t* const top = it->y_ ? enc->y_top_ + it->x_ * 16 : NULL;
   VP8EncPredLuma16(it->yuv_p_, left, top);
 }
 
 void VP8MakeChroma8Preds(const VP8EncIterator* const it) {
-  VP8Encoder* const enc = it->enc_;
-  const uint8_t* left = it->x_ ? enc->u_left_ : NULL;
-  const uint8_t* top = it->y_ ? enc->uv_top_ + it->x_ * 16 : NULL;
+  const VP8Encoder* const enc = it->enc_;
+  const uint8_t* const left = it->x_ ? enc->u_left_ : NULL;
+  const uint8_t* const top = it->y_ ? enc->uv_top_ + it->x_ * 16 : NULL;
   VP8EncPredChroma8(it->yuv_p_, left, top);
 }
 
@@ -316,7 +316,7 @@
   VP8EncPredLuma4(it->yuv_p_, it->i4_top_);
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Quantize
 
 // Layout:
@@ -341,7 +341,7 @@
   8 + 0 * BPS,  12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS     // V
 };
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Distortion measurement
 
 static const uint16_t kWeightY[16] = {
@@ -384,7 +384,7 @@
   dst->score += src->score;
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Performs trellis-optimized quantization.
 
 // Trellis
@@ -406,13 +406,13 @@
 #define NUM_NODES (MIN_DELTA + 1 + MAX_DELTA)
 #define NODE(n, l) (nodes[(n) + 1][(l) + MIN_DELTA])
 
-static inline void SetRDScore(int lambda, VP8ModeScore* const rd) {
+static WEBP_INLINE void SetRDScore(int lambda, VP8ModeScore* const rd) {
   // TODO: incorporate the "* 256" in the tables?
   rd->score = rd->R * lambda + 256 * (rd->D + rd->SD);
 }
 
-static inline score_t RDScoreTrellis(int lambda, score_t rate,
-                                     score_t distortion) {
+static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate,
+                                          score_t distortion) {
   return rate * lambda + 256 * distortion;
 }
 
@@ -440,7 +440,7 @@
     // compute maximal distortion.
     max_error = 0;
     for (n = first; n < 16; ++n) {
-      const int j  = VP8Zigzag[n];
+      const int j  = kZigzag[n];
       const int err = in[j] * in[j];
       max_error += kWeightTrellis[j] * err;
       if (err > thresh) last = n;
@@ -464,7 +464,7 @@
 
   // traverse trellis.
   for (n = first; n <= last; ++n) {
-    const int j  = VP8Zigzag[n];
+    const int j  = kZigzag[n];
     const int Q  = mtx->q_[j];
     const int iQ = mtx->iq_[j];
     const int B = BIAS(0x00);     // neutral bias
@@ -560,7 +560,7 @@
 
   for (; n >= first; --n) {
     const Node* const node = &NODE(n, best_node);
-    const int j = VP8Zigzag[n];
+    const int j = kZigzag[n];
     out[n] = node->sign ? -node->level : node->level;
     nz |= (node->level != 0);
     in[j] = out[n] * mtx->q_[j];
@@ -571,7 +571,7 @@
 
 #undef NODE
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Performs: difference, transform, quantize, back-transform, add
 // all at once. Output is the reconstructed block in *yuv_out, and the
 // quantized levels in *levels.
@@ -685,7 +685,7 @@
   return (nz << 16);
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // RD-opt decision. Reconstruct each mode, evaluate distortion and bit-cost.
 // Pick the mode with the lower RD-cost = Rate + lambda * Distortion.
 
@@ -700,7 +700,7 @@
 }
 
 static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* const rd) {
-  VP8Encoder* const enc = it->enc_;
+  const VP8Encoder* const enc = it->enc_;
   const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
   const int lambda = dqm->lambda_i16_;
   const int tlambda = dqm->tlambda_;
@@ -738,11 +738,11 @@
   VP8SetIntra16Mode(it, rd->mode_i16);
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 
 // return the cost array corresponding to the surrounding prediction modes.
 static const uint16_t* GetCostModeI4(VP8EncIterator* const it,
-                                     const int modes[16]) {
+                                     const uint8_t modes[16]) {
   const int preds_w = it->enc_->preds_w_;
   const int x = (it->i4_ & 3), y = it->i4_ >> 2;
   const int left = (x == 0) ? it->preds_[y * preds_w - 1] : modes[it->i4_ - 1];
@@ -751,16 +751,21 @@
 }
 
 static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
-  VP8Encoder* const enc = it->enc_;
+  const VP8Encoder* const enc = it->enc_;
   const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
   const int lambda = dqm->lambda_i4_;
   const int tlambda = dqm->tlambda_;
   const uint8_t* const src0 = it->yuv_in_ + Y_OFF;
   uint8_t* const best_blocks = it->yuv_out2_ + Y_OFF;
+  int total_header_bits = 0;
   VP8ModeScore rd_best;
 
+  if (enc->max_i4_header_bits_ == 0) {
+    return 0;
+  }
+
   InitScore(&rd_best);
-  rd_best.score = 0;
+  rd_best.score = 211;  // '211' is the value of VP8BitCost(0, 145)
   VP8IteratorStartI4(it);
   do {
     VP8ModeScore rd_i4;
@@ -799,7 +804,9 @@
     }
     SetRDScore(dqm->lambda_mode_, &rd_i4);
     AddScore(&rd_best, &rd_i4);
-    if (rd_best.score >= rd->score) {
+    total_header_bits += mode_costs[best_mode];
+    if (rd_best.score >= rd->score ||
+        total_header_bits > enc->max_i4_header_bits_) {
       return 0;
     }
     // Copy selected samples if not in the right place already.
@@ -817,10 +824,10 @@
   return 1;   // select intra4x4 over intra16x16
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 
 static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
-  VP8Encoder* const enc = it->enc_;
+  const VP8Encoder* const enc = it->enc_;
   const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
   const int lambda = dqm->lambda_uv_;
   const uint8_t* const src = it->yuv_in_ + U_OFF;
@@ -855,7 +862,7 @@
   AddScore(rd, &rd_best);
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Final reconstruction and quantization.
 
 static void SimpleQuantize(VP8EncIterator* const it, VP8ModeScore* const rd) {
@@ -882,7 +889,7 @@
   rd->nz = nz;
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Entry point
 
 int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd, int rd_opt) {
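For reviewers unfamiliar with the scoring used throughout this file: modes are ranked by a fixed-point rate-distortion cost, mirroring RDScoreTrellis()/SetRDScore() above. A tiny standalone illustration with made-up numbers:

  static int RDCost(int lambda, int rate, int distortion) {
    return rate * lambda + 256 * distortion;   /* same weighting as above */
  }
  /* RDCost(50, 1000, 400) == 50000 + 102400 == 152400; the lowest score wins,
     so a larger lambda biases the decision towards cheaper (lower-rate) modes. */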
diff --git a/src/enc/syntax.c b/src/enc/syntax.c
index 77df727..99c21fe 100644
--- a/src/enc/syntax.c
+++ b/src/enc/syntax.c
@@ -1,4 +1,4 @@
-// Copyright 2011 Google Inc.
+// Copyright 2011 Google Inc. All Rights Reserved.
 //
 // This code is licensed under the same terms as WebM:
 //  Software License Agreement:  http://www.webmproject.org/license/software/
@@ -10,70 +10,190 @@
 // Author: Skal (pascal.massimino@gmail.com)
 
 #include <assert.h>
-#include <math.h>
 
-#include "vp8enci.h"
+#include "./vp8enci.h"
+#include "webp/format_constants.h"
 
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 
-#define KSIGNATURE 0x9d012a
-#define KHEADER_SIZE 10
-#define KRIFF_SIZE 20
-#define KSIZE_OFFSET (KRIFF_SIZE - 8)
+//------------------------------------------------------------------------------
+// Helper functions
 
-#define MAX_PARTITION0_SIZE (1 << 19)   // max size of mode partition
-#define MAX_PARTITION_SIZE  (1 << 24)   // max size for token partition
-
-//-----------------------------------------------------------------------------
-// Writers for header's various pieces (in order of appearance)
-
-// Main keyframe header
-
-static void PutLE32(uint8_t* const data, uint32_t val) {
+// TODO(later): Move to webp/format_constants.h?
+static void PutLE24(uint8_t* const data, uint32_t val) {
   data[0] = (val >>  0) & 0xff;
   data[1] = (val >>  8) & 0xff;
   data[2] = (val >> 16) & 0xff;
+}
+
+static void PutLE32(uint8_t* const data, uint32_t val) {
+  PutLE24(data, val);
   data[3] = (val >> 24) & 0xff;
 }
 
-static int PutHeader(int profile, size_t size0, size_t total_size,
-                     WebPPicture* const pic) {
-  uint8_t buf[KHEADER_SIZE];
-  uint8_t RIFF[KRIFF_SIZE] = {
-    'R', 'I', 'F', 'F', 0, 0, 0, 0, 'W', 'E', 'B', 'P', 'V', 'P', '8', ' '
+static int IsVP8XNeeded(const VP8Encoder* const enc) {
+  return !!enc->has_alpha_;  // Currently the only case when VP8X is needed.
+                             // This could change in the future.
+}
+
+static int PutPaddingByte(const WebPPicture* const pic) {
+
+  const uint8_t pad_byte[1] = { 0 };
+  return !!pic->writer(pad_byte, 1, pic);
+}
+
+//------------------------------------------------------------------------------
+// Writers for header's various pieces (in order of appearance)
+
+static WebPEncodingError PutRIFFHeader(const VP8Encoder* const enc,
+                                       size_t riff_size) {
+  const WebPPicture* const pic = enc->pic_;
+  uint8_t riff[RIFF_HEADER_SIZE] = {
+    'R', 'I', 'F', 'F', 0, 0, 0, 0, 'W', 'E', 'B', 'P'
   };
+  assert(riff_size == (uint32_t)riff_size);
+  PutLE32(riff + TAG_SIZE, (uint32_t)riff_size);
+  if (!pic->writer(riff, sizeof(riff), pic)) {
+    return VP8_ENC_ERROR_BAD_WRITE;
+  }
+  return VP8_ENC_OK;
+}
+
+static WebPEncodingError PutVP8XHeader(const VP8Encoder* const enc) {
+  const WebPPicture* const pic = enc->pic_;
+  uint8_t vp8x[CHUNK_HEADER_SIZE + VP8X_CHUNK_SIZE] = {
+    'V', 'P', '8', 'X'
+  };
+  uint32_t flags = 0;
+
+  assert(IsVP8XNeeded(enc));
+  assert(pic->width >= 1 && pic->height >= 1);
+  assert(pic->width <= MAX_CANVAS_SIZE && pic->height <= MAX_CANVAS_SIZE);
+
+  if (enc->has_alpha_) {
+    flags |= ALPHA_FLAG_BIT;
+  }
+
+  PutLE32(vp8x + TAG_SIZE,              VP8X_CHUNK_SIZE);
+  PutLE32(vp8x + CHUNK_HEADER_SIZE,     flags);
+  PutLE24(vp8x + CHUNK_HEADER_SIZE + 4, pic->width - 1);
+  PutLE24(vp8x + CHUNK_HEADER_SIZE + 7, pic->height - 1);
+  if (!pic->writer(vp8x, sizeof(vp8x), pic)) {
+    return VP8_ENC_ERROR_BAD_WRITE;
+  }
+  return VP8_ENC_OK;
+}
+
+static WebPEncodingError PutAlphaChunk(const VP8Encoder* const enc) {
+  const WebPPicture* const pic = enc->pic_;
+  uint8_t alpha_chunk_hdr[CHUNK_HEADER_SIZE] = {
+    'A', 'L', 'P', 'H'
+  };
+
+  assert(enc->has_alpha_);
+
+  // Alpha chunk header.
+  PutLE32(alpha_chunk_hdr + TAG_SIZE, enc->alpha_data_size_);
+  if (!pic->writer(alpha_chunk_hdr, sizeof(alpha_chunk_hdr), pic)) {
+    return VP8_ENC_ERROR_BAD_WRITE;
+  }
+
+  // Alpha chunk data.
+  if (!pic->writer(enc->alpha_data_, enc->alpha_data_size_, pic)) {
+    return VP8_ENC_ERROR_BAD_WRITE;
+  }
+
+  // Padding.
+  if ((enc->alpha_data_size_ & 1) && !PutPaddingByte(pic)) {
+    return VP8_ENC_ERROR_BAD_WRITE;
+  }
+  return VP8_ENC_OK;
+}
+
+static WebPEncodingError PutVP8Header(const WebPPicture* const pic,
+                                      size_t vp8_size) {
+  uint8_t vp8_chunk_hdr[CHUNK_HEADER_SIZE] = {
+    'V', 'P', '8', ' '
+  };
+  assert(vp8_size == (uint32_t)vp8_size);
+  PutLE32(vp8_chunk_hdr + TAG_SIZE, (uint32_t)vp8_size);
+  if (!pic->writer(vp8_chunk_hdr, sizeof(vp8_chunk_hdr), pic)) {
+    return VP8_ENC_ERROR_BAD_WRITE;
+  }
+  return VP8_ENC_OK;
+}
+
+static WebPEncodingError PutVP8FrameHeader(const WebPPicture* const pic,
+                                           int profile, size_t size0) {
+  uint8_t vp8_frm_hdr[VP8_FRAME_HEADER_SIZE];
   uint32_t bits;
 
-  if (size0 >= MAX_PARTITION0_SIZE) {  // partition #0 is too big to fit
-    return WebPEncodingSetError(pic, VP8_ENC_ERROR_PARTITION0_OVERFLOW);
+  if (size0 >= VP8_MAX_PARTITION0_SIZE) {  // partition #0 is too big to fit
+    return VP8_ENC_ERROR_PARTITION0_OVERFLOW;
   }
 
-  PutLE32(RIFF + 4, total_size + KSIZE_OFFSET);
-  PutLE32(RIFF + 16, total_size);
-  if (!pic->writer(RIFF, sizeof(RIFF), pic)) {
-    return WebPEncodingSetError(pic, VP8_ENC_ERROR_BAD_WRITE);
-  }
-
-  bits = 0               // keyframe (1b)
-       | (profile << 1)  // profile (3b)
-       | (1 << 4)        // visible (1b)
-       | (size0 << 5);   // partition length (19b)
-  buf[0] = bits & 0xff;
-  buf[1] = (bits >> 8) & 0xff;
-  buf[2] = (bits >> 16) & 0xff;
+  // Paragraph 9.1.
+  bits = 0                         // keyframe (1b)
+       | (profile << 1)            // profile (3b)
+       | (1 << 4)                  // visible (1b)
+       | ((uint32_t)size0 << 5);   // partition length (19b)
+  vp8_frm_hdr[0] = (bits >>  0) & 0xff;
+  vp8_frm_hdr[1] = (bits >>  8) & 0xff;
+  vp8_frm_hdr[2] = (bits >> 16) & 0xff;
   // signature
-  buf[3] = (KSIGNATURE >> 16) & 0xff;
-  buf[4] = (KSIGNATURE >> 8) & 0xff;
-  buf[5] = (KSIGNATURE >> 0) & 0xff;
+  vp8_frm_hdr[3] = (VP8_SIGNATURE >> 16) & 0xff;
+  vp8_frm_hdr[4] = (VP8_SIGNATURE >>  8) & 0xff;
+  vp8_frm_hdr[5] = (VP8_SIGNATURE >>  0) & 0xff;
   // dimensions
-  buf[6] = pic->width & 0xff;
-  buf[7] = pic->width >> 8;
-  buf[8] = pic->height & 0xff;
-  buf[9] = pic->height >> 8;
+  vp8_frm_hdr[6] = pic->width & 0xff;
+  vp8_frm_hdr[7] = pic->width >> 8;
+  vp8_frm_hdr[8] = pic->height & 0xff;
+  vp8_frm_hdr[9] = pic->height >> 8;
 
-  return pic->writer(buf, sizeof(buf), pic);
+  if (!pic->writer(vp8_frm_hdr, sizeof(vp8_frm_hdr), pic)) {
+    return VP8_ENC_ERROR_BAD_WRITE;
+  }
+  return VP8_ENC_OK;
+}
+
+// WebP Headers.
+static int PutWebPHeaders(const VP8Encoder* const enc, size_t size0,
+                          size_t vp8_size, size_t riff_size) {
+  WebPPicture* const pic = enc->pic_;
+  WebPEncodingError err = VP8_ENC_OK;
+
+  // RIFF header.
+  err = PutRIFFHeader(enc, riff_size);
+  if (err != VP8_ENC_OK) goto Error;
+
+  // VP8X.
+  if (IsVP8XNeeded(enc)) {
+    err = PutVP8XHeader(enc);
+    if (err != VP8_ENC_OK) goto Error;
+  }
+
+  // Alpha.
+  if (enc->has_alpha_) {
+    err = PutAlphaChunk(enc);
+    if (err != VP8_ENC_OK) goto Error;
+  }
+
+  // VP8 header.
+  err = PutVP8Header(pic, vp8_size);
+  if (err != VP8_ENC_OK) goto Error;
+
+  // VP8 frame header.
+  err = PutVP8FrameHeader(pic, enc->profile_, size0);
+  if (err != VP8_ENC_OK) goto Error;
+
+  // All OK.
+  return 1;
+
+  // Error.
+ Error:
+  return WebPEncodingSetError(pic, err);
 }
 
 // Segmentation header
@@ -144,7 +264,7 @@
   int p;
   for (p = 0; p < enc->num_parts_ - 1; ++p) {
     const size_t part_size = VP8BitWriterSize(enc->parts_ + p);
-    if (part_size >= MAX_PARTITION_SIZE) {
+    if (part_size >= VP8_MAX_PARTITION_SIZE) {
       return WebPEncodingSetError(pic, VP8_ENC_ERROR_PARTITION_OVERFLOW);
     }
     buf[3 * p + 0] = (part_size >>  0) & 0xff;
@@ -154,18 +274,12 @@
   return p ? pic->writer(buf, 3 * p, pic) : 1;
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 
 #ifdef WEBP_EXPERIMENTAL_FEATURES
 
 #define KTRAILER_SIZE 8
 
-static void PutLE24(uint8_t* buf, size_t value) {
-  buf[0] = (value >>  0) & 0xff;
-  buf[1] = (value >>  8) & 0xff;
-  buf[2] = (value >> 16) & 0xff;
-}
-
 static int WriteExtensions(VP8Encoder* const enc) {
   uint8_t buffer[KTRAILER_SIZE];
   VP8BitWriter* const bw = &enc->bw_;
@@ -182,14 +296,6 @@
       return WebPEncodingSetError(pic, VP8_ENC_ERROR_BITSTREAM_OUT_OF_MEMORY);
     }
   }
-  // Alpha (bytes 4..6)
-  PutLE24(buffer + 4, enc->alpha_data_size_);
-  if (enc->alpha_data_size_ > 0) {
-    assert(enc->has_alpha_);
-    if (!VP8BitWriterAppend(bw, enc->alpha_data_, enc->alpha_data_size_)) {
-      return WebPEncodingSetError(pic, VP8_ENC_ERROR_BITSTREAM_OUT_OF_MEMORY);
-    }
-  }
 
   buffer[KTRAILER_SIZE - 1] = 0x01;  // marker
   if (!VP8BitWriterAppend(bw, buffer, KTRAILER_SIZE)) {
@@ -200,14 +306,14 @@
 
 #endif    /* WEBP_EXPERIMENTAL_FEATURES */
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 
 static size_t GeneratePartition0(VP8Encoder* const enc) {
   VP8BitWriter* const bw = &enc->bw_;
   const int mb_size = enc->mb_w_ * enc->mb_h_;
   uint64_t pos1, pos2, pos3;
 #ifdef WEBP_EXPERIMENTAL_FEATURES
-  const int need_extensions = enc->has_alpha_ || enc->use_layer_;
+  const int need_extensions = enc->use_layer_;
 #endif
 
   pos1 = VP8BitWriterPos(bw);
@@ -240,38 +346,67 @@
   if (enc->pic_->stats) {
     enc->pic_->stats->header_bytes[0] = (int)((pos2 - pos1 + 7) >> 3);
     enc->pic_->stats->header_bytes[1] = (int)((pos3 - pos2 + 7) >> 3);
-    enc->pic_->stats->alpha_data_size = enc->alpha_data_size_;
-    enc->pic_->stats->layer_data_size = enc->layer_data_size_;
+    enc->pic_->stats->alpha_data_size = (int)enc->alpha_data_size_;
+    enc->pic_->stats->layer_data_size = (int)enc->layer_data_size_;
   }
   return !bw->error_;
 }
 
+void VP8EncFreeBitWriters(VP8Encoder* const enc) {
+  int p;
+  VP8BitWriterWipeOut(&enc->bw_);
+  for (p = 0; p < enc->num_parts_; ++p) {
+    VP8BitWriterWipeOut(enc->parts_ + p);
+  }
+}
+
 int VP8EncWrite(VP8Encoder* const enc) {
   WebPPicture* const pic = enc->pic_;
   VP8BitWriter* const bw = &enc->bw_;
+  const int task_percent = 19;
+  const int percent_per_part = task_percent / enc->num_parts_;
+  const int final_percent = enc->percent_ + task_percent;
   int ok = 0;
-  size_t coded_size, pad;
+  size_t vp8_size, pad, riff_size;
   int p;
 
   // Partition #0 with header and partition sizes
-  ok = GeneratePartition0(enc);
+  ok = !!GeneratePartition0(enc);
 
-  // Compute total size (for the RIFF header)
-  coded_size = KHEADER_SIZE + VP8BitWriterSize(bw) + 3 * (enc->num_parts_ - 1);
+  // Compute VP8 size
+  vp8_size = VP8_FRAME_HEADER_SIZE +
+             VP8BitWriterSize(bw) +
+             3 * (enc->num_parts_ - 1);
   for (p = 0; p < enc->num_parts_; ++p) {
-    coded_size += VP8BitWriterSize(enc->parts_ + p);
+    vp8_size += VP8BitWriterSize(enc->parts_ + p);
   }
-  pad = coded_size & 1;
-  coded_size += pad;
+  pad = vp8_size & 1;
+  vp8_size += pad;
+
+  // Compute RIFF size
+  // At the minimum it is: "WEBP" + "VP8 " tag + 4-byte size + VP8 data.
+  riff_size = TAG_SIZE + CHUNK_HEADER_SIZE + vp8_size;
+  if (IsVP8XNeeded(enc)) {  // Add size for: VP8X header + data.
+    riff_size += CHUNK_HEADER_SIZE + VP8X_CHUNK_SIZE;
+  }
+  if (enc->has_alpha_) {  // Add size for: ALPH header + data.
+    const uint32_t padded_alpha_size = enc->alpha_data_size_ +
+                                       (enc->alpha_data_size_ & 1);
+    riff_size += CHUNK_HEADER_SIZE + padded_alpha_size;
+  }
+  // Sanity check.
+  if (riff_size > 0xfffffffeU) {
+    return WebPEncodingSetError(pic, VP8_ENC_ERROR_FILE_TOO_BIG);
+  }
 
   // Emit headers and partition #0
   {
     const uint8_t* const part0 = VP8BitWriterBuf(bw);
     const size_t size0 = VP8BitWriterSize(bw);
-    ok = ok && PutHeader(enc->profile_, size0, coded_size, pic)
+    ok = ok && PutWebPHeaders(enc, size0, vp8_size, riff_size)
             && pic->writer(part0, size0, pic)
             && EmitPartitionsSize(enc, pic);
-    free((void*)part0);
+    VP8BitWriterWipeOut(bw);    // will free the internal buffer.
   }
 
   // Token partitions
@@ -280,20 +415,22 @@
     const size_t size = VP8BitWriterSize(enc->parts_ + p);
     if (size)
       ok = ok && pic->writer(buf, size, pic);
-    free((void*)buf);
+    VP8BitWriterWipeOut(enc->parts_ + p);    // will free the internal buffer.
+    ok = ok && WebPReportProgress(pic, enc->percent_ + percent_per_part,
+                                  &enc->percent_);
   }
 
   // Padding byte
   if (ok && pad) {
-    const uint8_t pad_byte[1] = { 0 };
-    ok = pic->writer(pad_byte, 1, pic);
+    ok = PutPaddingByte(pic);
   }
 
-  enc->coded_size_ = coded_size + KRIFF_SIZE;
+  enc->coded_size_ = (int)(CHUNK_HEADER_SIZE + riff_size);
+  ok = ok && WebPReportProgress(pic, final_percent, &enc->percent_);
   return ok;
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
diff --git a/src/enc/tree.c b/src/enc/tree.c
index b1a9aa4..8b25e5e 100644
--- a/src/enc/tree.c
+++ b/src/enc/tree.c
@@ -1,4 +1,4 @@
-// Copyright 2011 Google Inc.
+// Copyright 2011 Google Inc. All Rights Reserved.
 //
 // This code is licensed under the same terms as WebM:
 //  Software License Agreement:  http://www.webmproject.org/license/software/
@@ -9,13 +9,13 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include "vp8enci.h"
+#include "./vp8enci.h"
 
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Default probabilities
 
 // Paragraph 13.5
@@ -158,9 +158,12 @@
 
 void VP8DefaultProbas(VP8Encoder* const enc) {
   VP8Proba* const probas = &enc->proba_;
+  probas->use_skip_proba_ = 0;
   memset(probas->segments_, 255u, sizeof(probas->segments_));
   memcpy(probas->coeffs_, VP8CoeffsProba0, sizeof(VP8CoeffsProba0));
-  probas->use_skip_proba_ = 0;
+  // Note: we could hard-code the level_costs_ corresponding to VP8CoeffsProba0,
+  // but that's ~11k of static data. Better call VP8CalculateLevelCosts() later.
+  probas->dirty_ = 1;
 }
 
 // Paragraph 11.5.  900bytes.
@@ -343,7 +346,7 @@
   } while (VP8IteratorNext(&it, 0));
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Paragraph 13
 
 const uint8_t
diff --git a/src/enc/vp8enci.h b/src/enc/vp8enci.h
index 2be079e..a0d9001 100644
--- a/src/enc/vp8enci.h
+++ b/src/enc/vp8enci.h
@@ -1,4 +1,4 @@
-// Copyright 2011 Google Inc.
+// Copyright 2011 Google Inc. All Rights Reserved.
 //
 // This code is licensed under the same terms as WebM:
 //  Software License Agreement:  http://www.webmproject.org/license/software/
@@ -12,21 +12,22 @@
 #ifndef WEBP_ENC_VP8ENCI_H_
 #define WEBP_ENC_VP8ENCI_H_
 
-#include "string.h"     // for memcpy()
+#include <string.h>     // for memcpy()
+#include "../dsp/dsp.h"
+#include "../utils/bit_writer.h"
 #include "webp/encode.h"
-#include "bit_writer.h"
 
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Various defines and enums
 
 // version numbers
 #define ENC_MAJ_VERSION 0
-#define ENC_MIN_VERSION 1
-#define ENC_REV_VERSION 2
+#define ENC_MIN_VERSION 2
+#define ENC_REV_VERSION 0
 
 // size of histogram used by CollectHistogram.
 #define MAX_COEFF_THRESH   64
@@ -156,16 +157,17 @@
 #define BIAS(b)  ((b) << (QFIX - 8))
 // Fun fact: this is the _only_ line where we're actually being lossy and
 // discarding bits.
-static inline int QUANTDIV(int n, int iQ, int B) {
+static WEBP_INLINE int QUANTDIV(int n, int iQ, int B) {
   return (n * iQ + B) >> QFIX;
 }
 extern const uint8_t VP8Zigzag[16];
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Headers
 
+typedef uint32_t proba_t;   // 16b + 16b
 typedef uint8_t ProbaArray[NUM_CTX][NUM_PROBAS];
-typedef uint64_t StatsArray[NUM_CTX][NUM_PROBAS][2];
+typedef proba_t StatsArray[NUM_CTX][NUM_PROBAS];
 typedef uint16_t CostArray[NUM_CTX][MAX_VARIABLE_LEVEL + 1];
 typedef double LFStats[NUM_MB_SEGMENTS][MAX_LF_LEVELS];  // filter stats
 
@@ -184,8 +186,9 @@
   uint8_t segments_[3];     // probabilities for segment tree
   uint8_t skip_proba_;      // final probability of being skipped.
   ProbaArray coeffs_[NUM_TYPES][NUM_BANDS];      // 924 bytes
-  StatsArray stats_[NUM_TYPES][NUM_BANDS];       // 7.4k
+  StatsArray stats_[NUM_TYPES][NUM_BANDS];       // 4224 bytes
   CostArray level_cost_[NUM_TYPES][NUM_BANDS];   // 11.4k
+  int dirty_;               // if true, need to call VP8CalculateLevelCosts()
   int use_skip_proba_;      // Note: we always use skip_proba for now.
   int nb_skip_;             // number of skipped blocks
 } VP8Proba;
@@ -199,19 +202,19 @@
   int i4x4_lf_delta_;      // delta filter level for i4x4 relative to i16x16
 } VP8FilterHeader;
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Information about the macroblocks.
 
 typedef struct {
   // block type
-  uint8_t type_:2;     // 0=i4x4, 1=i16x16
-  uint8_t uv_mode_:2;
-  uint8_t skip_:1;
-  uint8_t segment_:2;
+  unsigned int type_:2;     // 0=i4x4, 1=i16x16
+  unsigned int uv_mode_:2;
+  unsigned int skip_:1;
+  unsigned int segment_:2;
   uint8_t alpha_;      // quantization-susceptibility
 } VP8MBInfo;
 
-typedef struct {
+typedef struct VP8Matrix {
   uint16_t q_[16];        // quantizer steps
   uint16_t iq_[16];       // reciprocals, fixed point.
   uint16_t bias_[16];     // rounding bias
@@ -240,7 +243,7 @@
   int16_t y_ac_levels[16][16];
   int16_t uv_levels[4 + 4][16];
   int mode_i16;               // mode number for intra16 prediction
-  int modes_i4[16];           // mode numbers for intra4 predictions
+  uint8_t modes_i4[16];       // mode numbers for intra4 predictions
   int mode_uv;                // mode number of chroma prediction
   uint32_t nz;                // non-zero blocks
 } VP8ModeScore;
@@ -271,6 +274,7 @@
   LFStats*      lf_stats_;         // filter stats (borrowed from enc_)
   int           do_trellis_;       // if true, perform extra level optimisation
   int           done_;             // true when scan is finished
+  int           percent0_;         // saved initial progress percent
 } VP8EncIterator;
 
   // in iterator.c
@@ -287,6 +291,9 @@
 // it->yuv_out_ or it->yuv_in_.
 int VP8IteratorNext(VP8EncIterator* const it,
                     const uint8_t* const block_to_save);
+// Report progress based on macroblock rows. Returns 0 on user-abort request.
+int VP8IteratorProgress(const VP8EncIterator* const it,
+                        int final_delta_percent);
 // Intra4x4 iterations
 void VP8IteratorStartI4(VP8EncIterator* const it);
 // returns true if not done.
@@ -299,13 +306,54 @@
 
 // Helper functions to set mode properties
 void VP8SetIntra16Mode(const VP8EncIterator* const it, int mode);
-void VP8SetIntra4Mode(const VP8EncIterator* const it, int modes[16]);
+void VP8SetIntra4Mode(const VP8EncIterator* const it, const uint8_t* modes);
 void VP8SetIntraUVMode(const VP8EncIterator* const it, int mode);
 void VP8SetSkip(const VP8EncIterator* const it, int skip);
 void VP8SetSegment(const VP8EncIterator* const it, int segment);
-void VP8IteratorResetCosts(VP8EncIterator* const it);
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
+// Paginated token buffer
+
+// WIP: #define USE_TOKEN_BUFFER
+
+#ifdef USE_TOKEN_BUFFER
+
+#define MAX_NUM_TOKEN 2048
+
+typedef struct VP8Tokens VP8Tokens;
+struct VP8Tokens {
+  uint16_t tokens_[MAX_NUM_TOKEN];  // bit#15: bit, bits 0..14: slot
+  int left_;
+  VP8Tokens* next_;
+};
+
+typedef struct {
+  VP8Tokens* rows_;
+  uint16_t* tokens_;    // set to (*last_)->tokens_
+  VP8Tokens** last_;
+  int left_;
+  int error_;  // true in case of malloc error
+} VP8TBuffer;
+
+void VP8TBufferInit(VP8TBuffer* const b);    // initialize an empty buffer
+int VP8TBufferNewPage(VP8TBuffer* const b);  // allocate a new page
+void VP8TBufferClear(VP8TBuffer* const b);   // de-allocate memory
+
+int VP8EmitTokens(const VP8TBuffer* const b, VP8BitWriter* const bw,
+                  const uint8_t* const probas);
+
+static WEBP_INLINE int VP8AddToken(VP8TBuffer* const b,
+                                   int bit, int proba_idx) {
+  if (b->left_ > 0 || VP8TBufferNewPage(b)) {
+    const int slot = --b->left_;
+    b->tokens_[slot] = (bit << 15) | proba_idx;
+  }
+  return bit;
+}
+
+#endif  // USE_TOKEN_BUFFER
+
+//------------------------------------------------------------------------------
 // VP8Encoder
 
 struct VP8Encoder {
@@ -329,10 +377,12 @@
   VP8BitWriter bw_;                         // part0
   VP8BitWriter parts_[MAX_NUM_PARTITIONS];  // token partitions
 
+  int percent_;                             // for progress
+
   // transparency blob
   int has_alpha_;
   uint8_t* alpha_data_;       // non-NULL if transparency is present
-  size_t alpha_data_size_;
+  uint32_t alpha_data_size_;
 
   // enhancement layer
   int use_layer_;
@@ -352,15 +402,16 @@
 
   // probabilities and statistics
   VP8Proba proba_;
-  uint64_t sse_[3];        // sum of Y/U/V squared errors for all macroblocks
+  uint64_t sse_[4];        // sum of Y/U/V/A squared errors for all macroblocks
   uint64_t sse_count_;     // pixel count for the sse_[] stats
   int      coded_size_;
   int      residual_bytes_[3][4];
   int      block_count_[3];
 
   // quality/speed settings
-  int method_;             // 0=fastest, 6=best/slowest.
-  int rd_opt_level_;       // Deduced from method_.
+  int method_;              // 0=fastest, 6=best/slowest.
+  int rd_opt_level_;        // Deduced from method_.
+  int max_i4_header_bits_;  // partition #0 safeness factor
 
   // Memory
   VP8MBInfo* mb_info_;   // contextual macroblock infos (mb_w_ + 1)
@@ -380,7 +431,7 @@
   LFStats   *lf_stats_;  // autofilter stats (if NULL, autofilter is off)
 };
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // internal functions. Not public.
 
   // in tree.c
@@ -399,6 +450,8 @@
 // and appending an assembly of all the pre-coded token partitions.
 // Return true if everything is ok.
 int VP8EncWrite(VP8Encoder* const enc);
+// Release memory allocated for bit-writing in VP8EncLoop & seq.
+void VP8EncFreeBitWriters(VP8Encoder* const enc);
 
   // in frame.c
 extern const uint8_t VP8EncBands[16 + 1];
@@ -419,13 +472,11 @@
 
   // in webpenc.c
 // Assign an error code to a picture. Return false for convenience.
-int WebPEncodingSetError(WebPPicture* const pic, WebPEncodingError error);
+int WebPEncodingSetError(const WebPPicture* const pic, WebPEncodingError error);
+int WebPReportProgress(const WebPPicture* const pic,
+                       int percent, int* const percent_store);
+
   // in analysis.c
-// Compute susceptibility based on DCT-coeff histograms:
-// the higher, the "easier" the macroblock is to compress.
-typedef int (*VP8CHisto)(const uint8_t* ref, const uint8_t* pred,
-                         int start_block, int end_block);
-extern VP8CHisto VP8CollectHistogram;
 // Main analysis loop. Decides the segmentations and complexity.
 // Assigns a first guess for Intra16 and uvmode_ prediction modes.
 int VP8EncAnalyze(VP8Encoder* const enc);
@@ -437,10 +488,9 @@
 int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd, int rd_opt);
 
   // in alpha.c
-void VP8EncInitAlpha(VP8Encoder* enc);           // initialize alpha compression
-void VP8EncCodeAlphaBlock(VP8EncIterator* it);   // analyze or code a macroblock
-int VP8EncFinishAlpha(VP8Encoder* enc);          // finalize compressed data
-void VP8EncDeleteAlpha(VP8Encoder* enc);         // delete compressed data
+void VP8EncInitAlpha(VP8Encoder* const enc);    // initialize alpha compression
+int VP8EncFinishAlpha(VP8Encoder* const enc);   // finalize compressed data
+void VP8EncDeleteAlpha(VP8Encoder* const enc);  // delete compressed data
 
   // in layer.c
 void VP8EncInitLayer(VP8Encoder* const enc);     // init everything
@@ -448,63 +498,28 @@
 int VP8EncFinishLayer(VP8Encoder* const enc);    // finalize coding
 void VP8EncDeleteLayer(VP8Encoder* enc);         // reclaim memory
 
-  // in dsp.c
-int VP8GetAlpha(const int histo[MAX_COEFF_THRESH + 1]);
-
-// Transforms
-// VP8Idct: Does one of two inverse transforms. If do_two is set, the transforms
-//          will be done for (ref, in, dst) and (ref + 4, in + 16, dst + 4).
-typedef void (*VP8Idct)(const uint8_t* ref, const int16_t* in, uint8_t* dst,
-                        int do_two);
-typedef void (*VP8Fdct)(const uint8_t* src, const uint8_t* ref, int16_t* out);
-typedef void (*VP8WHT)(const int16_t* in, int16_t* out);
-extern VP8Idct VP8ITransform;
-extern VP8Fdct VP8FTransform;
-extern VP8WHT VP8ITransformWHT;
-extern VP8WHT VP8FTransformWHT;
-// Predictions
-// *dst is the destination block. *top, *top_right and *left can be NULL.
-typedef void (*VP8IntraPreds)(uint8_t *dst, const uint8_t* left,
-                              const uint8_t* top);
-typedef void (*VP8Intra4Preds)(uint8_t *dst, const uint8_t* top);
-extern VP8Intra4Preds VP8EncPredLuma4;
-extern VP8IntraPreds VP8EncPredLuma16;
-extern VP8IntraPreds VP8EncPredChroma8;
-
-typedef int (*VP8Metric)(const uint8_t* pix, const uint8_t* ref);
-extern VP8Metric VP8SSE16x16, VP8SSE16x8, VP8SSE8x8, VP8SSE4x4;
-typedef int (*VP8WMetric)(const uint8_t* pix, const uint8_t* ref,
-                          const uint16_t* const weights);
-extern VP8WMetric VP8TDisto4x4, VP8TDisto16x16;
-
-typedef void (*VP8BlockCopy)(const uint8_t* src, uint8_t* dst);
-extern VP8BlockCopy VP8Copy4x4;
-extern VP8BlockCopy VP8Copy8x8;
-extern VP8BlockCopy VP8Copy16x16;
-// Quantization
-typedef int (*VP8QuantizeBlock)(int16_t in[16], int16_t out[16],
-                                int n, const VP8Matrix* const mtx);
-extern VP8QuantizeBlock VP8EncQuantizeBlock;
-
-typedef enum {
-  kSSE2,
-  kSSE3
-} CPUFeature;
-// returns true if the CPU supports the feature.
-typedef int (*VP8CPUInfo)(CPUFeature feature);
-extern VP8CPUInfo VP8EncGetCPUInfo;
-
-void VP8EncDspInit(void);   // must be called before using any of the above
-
   // in filter.c
-extern void VP8InitFilter(VP8EncIterator* const it);
-extern void VP8StoreFilterStats(VP8EncIterator* const it);
-extern void VP8AdjustFilterStrength(VP8EncIterator* const it);
 
-//-----------------------------------------------------------------------------
+// SSIM utils
+typedef struct {
+  double w, xm, ym, xxm, xym, yym;
+} DistoStats;
+void VP8SSIMAddStats(const DistoStats* const src, DistoStats* const dst);
+void VP8SSIMAccumulatePlane(const uint8_t* src1, int stride1,
+                            const uint8_t* src2, int stride2,
+                            int W, int H, DistoStats* const stats);
+double VP8SSIMGet(const DistoStats* const stats);
+double VP8SSIMGetSquaredError(const DistoStats* const stats);
+
+// autofilter
+void VP8InitFilter(VP8EncIterator* const it);
+void VP8StoreFilterStats(VP8EncIterator* const it);
+void VP8AdjustFilterStrength(VP8EncIterator* const it);
+
+//------------------------------------------------------------------------------
 
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
 
-#endif  // WEBP_ENC_VP8ENCI_H_
+#endif  /* WEBP_ENC_VP8ENCI_H_ */
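The paginated token buffer declared above is still work-in-progress (USE_TOKEN_BUFFER stays commented out); a hedged sketch of the intended flow, based only on the declarations in this header ('bit', 'proba_idx', 'bw' and 'probas' are assumed to come from the coding loop):

  #ifdef USE_TOKEN_BUFFER
    VP8TBuffer tokens;
    VP8TBufferInit(&tokens);
    /* during coding: record each (bit, probability index) pair */
    VP8AddToken(&tokens, bit, proba_idx);
    /* ... */
    /* afterwards: replay the recorded tokens into the bit writer */
    VP8EmitTokens(&tokens, &bw, probas);
    VP8TBufferClear(&tokens);
  #endif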
diff --git a/src/enc/vp8l.c b/src/enc/vp8l.c
new file mode 100644
index 0000000..41aa62b
--- /dev/null
+++ b/src/enc/vp8l.c
@@ -0,0 +1,1150 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// main entry for the lossless encoder.
+//
+// Author: Vikas Arora (vikaas.arora@gmail.com)
+//
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "./backward_references.h"
+#include "./vp8enci.h"
+#include "./vp8li.h"
+#include "../dsp/lossless.h"
+#include "../utils/bit_writer.h"
+#include "../utils/huffman_encode.h"
+#include "../utils/utils.h"
+#include "webp/format_constants.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+#define PALETTE_KEY_RIGHT_SHIFT   22  // Key for 1K buffer.
+#define MAX_HUFF_IMAGE_SIZE       (16 * 1024 * 1024)
+#define MAX_COLORS_FOR_GRAPH      64
+
+// -----------------------------------------------------------------------------
+// Palette
+
+static int CompareColors(const void* p1, const void* p2) {
+  const uint32_t a = *(const uint32_t*)p1;
+  const uint32_t b = *(const uint32_t*)p2;
+  return (a < b) ? -1 : (a > b) ? 1 : 0;
+}
+
+// If the number of colors in the image is less than or equal to
+// MAX_PALETTE_SIZE, creates a palette and returns true; else returns false.
+static int AnalyzeAndCreatePalette(const WebPPicture* const pic,
+                                   uint32_t palette[MAX_PALETTE_SIZE],
+                                   int* const palette_size) {
+  int i, x, y, key;
+  int num_colors = 0;
+  uint8_t in_use[MAX_PALETTE_SIZE * 4] = { 0 };
+  uint32_t colors[MAX_PALETTE_SIZE * 4];
+  static const uint32_t kHashMul = 0x1e35a7bd;
+  const uint32_t* argb = pic->argb;
+  const int width = pic->width;
+  const int height = pic->height;
+  uint32_t last_pix = ~argb[0];   // so we're sure that last_pix != argb[0]
+
+  for (y = 0; y < height; ++y) {
+    for (x = 0; x < width; ++x) {
+      if (argb[x] == last_pix) {
+        continue;
+      }
+      last_pix = argb[x];
+      key = (kHashMul * last_pix) >> PALETTE_KEY_RIGHT_SHIFT;
+      while (1) {
+        if (!in_use[key]) {
+          colors[key] = last_pix;
+          in_use[key] = 1;
+          ++num_colors;
+          if (num_colors > MAX_PALETTE_SIZE) {
+            return 0;
+          }
+          break;
+        } else if (colors[key] == last_pix) {
+          // The color is already there.
+          break;
+        } else {
+          // Some other color sits there.
+          // Do linear conflict resolution.
+          ++key;
+          key &= (MAX_PALETTE_SIZE * 4 - 1);  // key mask for 1K buffer.
+        }
+      }
+    }
+    argb += pic->argb_stride;
+  }
+
+  // TODO(skal): could we reuse in_use[] to speed up ApplyPalette()?
+  num_colors = 0;
+  for (i = 0; i < (int)(sizeof(in_use) / sizeof(in_use[0])); ++i) {
+    if (in_use[i]) {
+      palette[num_colors] = colors[i];
+      ++num_colors;
+    }
+  }
+
+  qsort(palette, num_colors, sizeof(*palette), CompareColors);
+  *palette_size = num_colors;
+  return 1;
+}
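
The color count above is an open-addressing hash: each new pixel value is hashed with the 0x1e35a7bd multiplier, the 22-bit right shift keeps a 10-bit key into a table of MAX_PALETTE_SIZE * 4 slots, and collisions fall back to linear probing. A minimal standalone sketch of the same counting idea; the function name is illustrative and the 1024-entry table simply restates the constants used above:

    #include <stdint.h>
    #include <string.h>

    #define TABLE_SIZE 1024         // MAX_PALETTE_SIZE * 4, as in the code above
    #define HASH_MUL   0x1e35a7bdu
    #define HASH_SHIFT 22           // keeps the top 10 bits -> keys in 0..1023

    // Returns the number of distinct pixel values, or -1 once 'limit' is exceeded.
    static int CountDistinctColors(const uint32_t* argb, int num_pixels, int limit) {
      uint32_t colors[TABLE_SIZE];
      uint8_t in_use[TABLE_SIZE];
      int count = 0, i;
      memset(in_use, 0, sizeof(in_use));
      for (i = 0; i < num_pixels; ++i) {
        uint32_t key = (HASH_MUL * argb[i]) >> HASH_SHIFT;
        while (1) {
          if (!in_use[key]) {                   // empty slot: record a new color
            colors[key] = argb[i];
            in_use[key] = 1;
            if (++count > limit) return -1;
            break;
          } else if (colors[key] == argb[i]) {  // color already counted
            break;
          } else {                              // occupied by another color:
            key = (key + 1) & (TABLE_SIZE - 1); // linear probing
          }
        }
      }
      return count;
    }
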
+
+static int AnalyzeEntropy(const uint32_t* argb,
+                          int width, int height, int argb_stride,
+                          double* const nonpredicted_bits,
+                          double* const predicted_bits) {
+  int x, y;
+  const uint32_t* last_line = NULL;
+  uint32_t last_pix = argb[0];    // so we're sure that pix_diff == 0
+
+  VP8LHistogram* nonpredicted = NULL;
+  VP8LHistogram* predicted =
+      (VP8LHistogram*)malloc(2 * sizeof(*predicted));
+  if (predicted == NULL) return 0;
+  nonpredicted = predicted + 1;
+
+  VP8LHistogramInit(predicted, 0);
+  VP8LHistogramInit(nonpredicted, 0);
+  for (y = 0; y < height; ++y) {
+    for (x = 0; x < width; ++x) {
+      const uint32_t pix = argb[x];
+      const uint32_t pix_diff = VP8LSubPixels(pix, last_pix);
+      if (pix_diff == 0) continue;
+      if (last_line != NULL && pix == last_line[x]) {
+        continue;
+      }
+      last_pix = pix;
+      {
+        const PixOrCopy pix_token = PixOrCopyCreateLiteral(pix);
+        const PixOrCopy pix_diff_token = PixOrCopyCreateLiteral(pix_diff);
+        VP8LHistogramAddSinglePixOrCopy(nonpredicted, &pix_token);
+        VP8LHistogramAddSinglePixOrCopy(predicted, &pix_diff_token);
+      }
+    }
+    last_line = argb;
+    argb += argb_stride;
+  }
+  *nonpredicted_bits = VP8LHistogramEstimateBitsBulk(nonpredicted);
+  *predicted_bits = VP8LHistogramEstimateBitsBulk(predicted);
+  free(predicted);
+  return 1;
+}
+
+static int VP8LEncAnalyze(VP8LEncoder* const enc, WebPImageHint image_hint) {
+  const WebPPicture* const pic = enc->pic_;
+  assert(pic != NULL && pic->argb != NULL);
+
+  enc->use_palette_ =
+      AnalyzeAndCreatePalette(pic, enc->palette_, &enc->palette_size_);
+
+  if (image_hint == WEBP_HINT_GRAPH) {
+    if (enc->use_palette_ && enc->palette_size_ < MAX_COLORS_FOR_GRAPH) {
+      enc->use_palette_ = 0;
+    }
+  }
+
+  if (!enc->use_palette_) {
+    if (image_hint == WEBP_HINT_PHOTO) {
+      enc->use_predict_ = 1;
+      enc->use_cross_color_ = 1;
+    } else {
+      double non_pred_entropy, pred_entropy;
+      if (!AnalyzeEntropy(pic->argb, pic->width, pic->height, pic->argb_stride,
+                          &non_pred_entropy, &pred_entropy)) {
+        return 0;
+      }
+      if (pred_entropy < 0.95 * non_pred_entropy) {
+        enc->use_predict_ = 1;
+        // TODO(vikasa): Observed some correlation of cross_color transform with
+        // predict. Need to investigate this further and add separate heuristic
+        // for setting use_cross_color flag.
+        enc->use_cross_color_ = 1;
+      }
+    }
+  }
+
+  return 1;
+}
+
+static int GetHuffBitLengthsAndCodes(
+    const VP8LHistogramSet* const histogram_image,
+    HuffmanTreeCode* const huffman_codes) {
+  int i, k;
+  int ok = 1;
+  uint64_t total_length_size = 0;
+  uint8_t* mem_buf = NULL;
+  const int histogram_image_size = histogram_image->size;
+
+  // Iterate over all histograms and get the aggregate number of codes used.
+  for (i = 0; i < histogram_image_size; ++i) {
+    const VP8LHistogram* const histo = histogram_image->histograms[i];
+    HuffmanTreeCode* const codes = &huffman_codes[5 * i];
+    for (k = 0; k < 5; ++k) {
+      const int num_symbols = (k == 0) ? VP8LHistogramNumCodes(histo)
+                            : (k == 4) ? NUM_DISTANCE_CODES
+                            : 256;
+      codes[k].num_symbols = num_symbols;
+      total_length_size += num_symbols;
+    }
+  }
+
+  // Allocate and set the Huffman codes.
+  {
+    uint16_t* codes;
+    uint8_t* lengths;
+    mem_buf = (uint8_t*)WebPSafeCalloc(total_length_size,
+                                       sizeof(*lengths) + sizeof(*codes));
+    if (mem_buf == NULL) {
+      ok = 0;
+      goto End;
+    }
+    codes = (uint16_t*)mem_buf;
+    lengths = (uint8_t*)&codes[total_length_size];
+    for (i = 0; i < 5 * histogram_image_size; ++i) {
+      const int bit_length = huffman_codes[i].num_symbols;
+      huffman_codes[i].codes = codes;
+      huffman_codes[i].code_lengths = lengths;
+      codes += bit_length;
+      lengths += bit_length;
+    }
+  }
+
+  // Create Huffman trees.
+  for (i = 0; i < histogram_image_size; ++i) {
+    HuffmanTreeCode* const codes = &huffman_codes[5 * i];
+    VP8LHistogram* const histo = histogram_image->histograms[i];
+    ok = ok && VP8LCreateHuffmanTree(histo->literal_, 15, codes + 0);
+    ok = ok && VP8LCreateHuffmanTree(histo->red_, 15, codes + 1);
+    ok = ok && VP8LCreateHuffmanTree(histo->blue_, 15, codes + 2);
+    ok = ok && VP8LCreateHuffmanTree(histo->alpha_, 15, codes + 3);
+    ok = ok && VP8LCreateHuffmanTree(histo->distance_, 15, codes + 4);
+  }
+
+ End:
+  if (!ok) free(mem_buf);
+  return ok;
+}
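
Each meta-histogram therefore carries five prefix codes, indexed 0..4 in the loop above. A short recap of the alphabet each one covers, matching the symbol counts chosen above; the enum names are illustrative, while NUM_LENGTH_CODES and NUM_DISTANCE_CODES come from webp/format_constants.h:

    // Alphabet sizes of the five codes attached to each meta-histogram
    // (k = 0..4 in the loop above); names are illustrative.
    enum {
      kGreenCode = 0,  // 256 literals + NUM_LENGTH_CODES + (1 << cache_bits) symbols
      kRedCode   = 1,  // 256 symbols
      kBlueCode  = 2,  // 256 symbols
      kAlphaCode = 3,  // 256 symbols
      kDistCode  = 4   // NUM_DISTANCE_CODES symbols
    };
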
+
+static void StoreHuffmanTreeOfHuffmanTreeToBitMask(
+    VP8LBitWriter* const bw, const uint8_t* code_length_bitdepth) {
+  // RFC 1951 will calm you down if you are worried about this funny sequence.
+  // This sequence is tuned from that, but more weighted for lower symbol count,
+  // and more spiking histograms.
+  static const uint8_t kStorageOrder[CODE_LENGTH_CODES] = {
+    17, 18, 0, 1, 2, 3, 4, 5, 16, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+  };
+  int i;
+  // Throw away trailing zeros:
+  int codes_to_store = CODE_LENGTH_CODES;
+  for (; codes_to_store > 4; --codes_to_store) {
+    if (code_length_bitdepth[kStorageOrder[codes_to_store - 1]] != 0) {
+      break;
+    }
+  }
+  VP8LWriteBits(bw, 4, codes_to_store - 4);
+  for (i = 0; i < codes_to_store; ++i) {
+    VP8LWriteBits(bw, 3, code_length_bitdepth[kStorageOrder[i]]);
+  }
+}
+
+static void ClearHuffmanTreeIfOnlyOneSymbol(
+    HuffmanTreeCode* const huffman_code) {
+  int k;
+  int count = 0;
+  for (k = 0; k < huffman_code->num_symbols; ++k) {
+    if (huffman_code->code_lengths[k] != 0) {
+      ++count;
+      if (count > 1) return;
+    }
+  }
+  for (k = 0; k < huffman_code->num_symbols; ++k) {
+    huffman_code->code_lengths[k] = 0;
+    huffman_code->codes[k] = 0;
+  }
+}
+
+static void StoreHuffmanTreeToBitMask(
+    VP8LBitWriter* const bw,
+    const HuffmanTreeToken* const tokens, const int num_tokens,
+    const HuffmanTreeCode* const huffman_code) {
+  int i;
+  for (i = 0; i < num_tokens; ++i) {
+    const int ix = tokens[i].code;
+    const int extra_bits = tokens[i].extra_bits;
+    VP8LWriteBits(bw, huffman_code->code_lengths[ix], huffman_code->codes[ix]);
+    switch (ix) {
+      case 16:
+        VP8LWriteBits(bw, 2, extra_bits);
+        break;
+      case 17:
+        VP8LWriteBits(bw, 3, extra_bits);
+        break;
+      case 18:
+        VP8LWriteBits(bw, 7, extra_bits);
+        break;
+    }
+  }
+}
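
The extra-bit widths in the switch above correspond to the DEFLATE-style run-length tokens used for serializing code lengths: token 16 repeats the previous length, tokens 17 and 18 emit runs of zeros. A small illustrative table; the struct and the repeat bases reflect that convention rather than anything declared in this file:

    // Run-length tokens used when serializing code lengths (cf. the switch above),
    // following the DEFLATE-style convention of the lossless bitstream.
    typedef struct {
      int token;        // symbol emitted with the code-length prefix code
      int extra_bits;   // raw bits appended after the symbol
      int base_repeat;  // repeat count = base_repeat + value of the extra bits
    } CodeLengthToken;

    static const CodeLengthToken kRepeatTokens[] = {
      { 16, 2,  3 },    // repeat the previous code length 3..6 times
      { 17, 3,  3 },    // emit a run of 3..10 zero code lengths
      { 18, 7, 11 },    // emit a run of 11..138 zero code lengths
    };
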
+
+static int StoreFullHuffmanCode(VP8LBitWriter* const bw,
+                                const HuffmanTreeCode* const tree) {
+  int ok = 0;
+  uint8_t code_length_bitdepth[CODE_LENGTH_CODES] = { 0 };
+  uint16_t code_length_bitdepth_symbols[CODE_LENGTH_CODES] = { 0 };
+  const int max_tokens = tree->num_symbols;
+  int num_tokens;
+  HuffmanTreeCode huffman_code;
+  HuffmanTreeToken* const tokens =
+      (HuffmanTreeToken*)WebPSafeMalloc((uint64_t)max_tokens, sizeof(*tokens));
+  if (tokens == NULL) return 0;
+
+  huffman_code.num_symbols = CODE_LENGTH_CODES;
+  huffman_code.code_lengths = code_length_bitdepth;
+  huffman_code.codes = code_length_bitdepth_symbols;
+
+  VP8LWriteBits(bw, 1, 0);
+  num_tokens = VP8LCreateCompressedHuffmanTree(tree, tokens, max_tokens);
+  {
+    int histogram[CODE_LENGTH_CODES] = { 0 };
+    int i;
+    for (i = 0; i < num_tokens; ++i) {
+      ++histogram[tokens[i].code];
+    }
+
+    if (!VP8LCreateHuffmanTree(histogram, 7, &huffman_code)) {
+      goto End;
+    }
+  }
+
+  StoreHuffmanTreeOfHuffmanTreeToBitMask(bw, code_length_bitdepth);
+  ClearHuffmanTreeIfOnlyOneSymbol(&huffman_code);
+  {
+    int trailing_zero_bits = 0;
+    int trimmed_length = num_tokens;
+    int write_trimmed_length;
+    int length;
+    int i = num_tokens;
+    while (i-- > 0) {
+      const int ix = tokens[i].code;
+      if (ix == 0 || ix == 17 || ix == 18) {
+        --trimmed_length;   // discount trailing zeros
+        trailing_zero_bits += code_length_bitdepth[ix];
+        if (ix == 17) {
+          trailing_zero_bits += 3;
+        } else if (ix == 18) {
+          trailing_zero_bits += 7;
+        }
+      } else {
+        break;
+      }
+    }
+    write_trimmed_length = (trimmed_length > 1 && trailing_zero_bits > 12);
+    length = write_trimmed_length ? trimmed_length : num_tokens;
+    VP8LWriteBits(bw, 1, write_trimmed_length);
+    if (write_trimmed_length) {
+      const int nbits = VP8LBitsLog2Ceiling(trimmed_length - 1);
+      const int nbitpairs = (nbits == 0) ? 1 : (nbits + 1) / 2;
+      VP8LWriteBits(bw, 3, nbitpairs - 1);
+      assert(trimmed_length >= 2);
+      VP8LWriteBits(bw, nbitpairs * 2, trimmed_length - 2);
+    }
+    StoreHuffmanTreeToBitMask(bw, tokens, length, &huffman_code);
+  }
+  ok = 1;
+ End:
+  free(tokens);
+  return ok;
+}
+
+static int StoreHuffmanCode(VP8LBitWriter* const bw,
+                            const HuffmanTreeCode* const huffman_code) {
+  int i;
+  int count = 0;
+  int symbols[2] = { 0, 0 };
+  const int kMaxBits = 8;
+  const int kMaxSymbol = 1 << kMaxBits;
+
+  // Check whether it's a small tree.
+  for (i = 0; i < huffman_code->num_symbols && count < 3; ++i) {
+    if (huffman_code->code_lengths[i] != 0) {
+      if (count < 2) symbols[count] = i;
+      ++count;
+    }
+  }
+
+  if (count == 0) {   // emit minimal tree for empty cases
+    // bits: small tree marker: 1, count-1: 0, large 8-bit code: 0, code: 0
+    VP8LWriteBits(bw, 4, 0x01);
+    return 1;
+  } else if (count <= 2 && symbols[0] < kMaxSymbol && symbols[1] < kMaxSymbol) {
+    VP8LWriteBits(bw, 1, 1);  // Small tree marker to encode 1 or 2 symbols.
+    VP8LWriteBits(bw, 1, count - 1);
+    if (symbols[0] <= 1) {
+      VP8LWriteBits(bw, 1, 0);  // Code bit for small (1 bit) symbol value.
+      VP8LWriteBits(bw, 1, symbols[0]);
+    } else {
+      VP8LWriteBits(bw, 1, 1);
+      VP8LWriteBits(bw, 8, symbols[0]);
+    }
+    if (count == 2) {
+      VP8LWriteBits(bw, 8, symbols[1]);
+    }
+    return 1;
+  } else {
+    return StoreFullHuffmanCode(bw, huffman_code);
+  }
+}
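
For trees with at most two used symbols the function takes the compact path above instead of StoreFullHuffmanCode(). A rough bit-accounting sketch of that path, mirroring the writes above; this is an illustrative helper, not library code:

    // Rough bit count of the compact ('simple code') header path above;
    // 'count' is the number of used symbols (0, 1 or 2).
    static int SimpleCodeHeaderBits(int count, int first_symbol) {
      if (count == 0) return 4;   // marker + count-1 + 8-bit flag + 1-bit code
      // 1 marker bit + 1 count bit + 1 bit selecting a 1-bit or 8-bit first
      // symbol, + the symbol itself, + 8 bits for the optional second symbol.
      return 2 + 1 + ((first_symbol <= 1) ? 1 : 8) + ((count == 2) ? 8 : 0);
    }
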
+
+static void WriteHuffmanCode(VP8LBitWriter* const bw,
+                             const HuffmanTreeCode* const code, int index) {
+  const int depth = code->code_lengths[index];
+  const int symbol = code->codes[index];
+  VP8LWriteBits(bw, depth, symbol);
+}
+
+static void StoreImageToBitMask(
+    VP8LBitWriter* const bw, int width, int histo_bits,
+    const VP8LBackwardRefs* const refs,
+    const uint16_t* histogram_symbols,
+    const HuffmanTreeCode* const huffman_codes) {
+  // x and y trace the position in the image.
+  int x = 0;
+  int y = 0;
+  const int histo_xsize = histo_bits ? VP8LSubSampleSize(width, histo_bits) : 1;
+  int i;
+  for (i = 0; i < refs->size; ++i) {
+    const PixOrCopy* const v = &refs->refs[i];
+    const int histogram_ix = histogram_symbols[histo_bits ?
+                                               (y >> histo_bits) * histo_xsize +
+                                               (x >> histo_bits) : 0];
+    const HuffmanTreeCode* const codes = huffman_codes + 5 * histogram_ix;
+    if (PixOrCopyIsCacheIdx(v)) {
+      const int code = PixOrCopyCacheIdx(v);
+      const int literal_ix = 256 + NUM_LENGTH_CODES + code;
+      WriteHuffmanCode(bw, codes, literal_ix);
+    } else if (PixOrCopyIsLiteral(v)) {
+      static const int order[] = { 1, 2, 0, 3 };
+      int k;
+      for (k = 0; k < 4; ++k) {
+        const int code = PixOrCopyLiteral(v, order[k]);
+        WriteHuffmanCode(bw, codes + k, code);
+      }
+    } else {
+      int bits, n_bits;
+      int code, distance;
+
+      PrefixEncode(v->len, &code, &n_bits, &bits);
+      WriteHuffmanCode(bw, codes, 256 + code);
+      VP8LWriteBits(bw, n_bits, bits);
+
+      distance = PixOrCopyDistance(v);
+      PrefixEncode(distance, &code, &n_bits, &bits);
+      WriteHuffmanCode(bw, codes + 4, code);
+      VP8LWriteBits(bw, n_bits, bits);
+    }
+    x += PixOrCopyLength(v);
+    while (x >= width) {
+      x -= width;
+      ++y;
+    }
+  }
+}
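
When histo_bits is non-zero, every tile of (1 << histo_bits) pixels picks its own set of five codes through histogram_symbols[]; with histo_bits == 0 (as in EncodeImageNoHuffman()) a single entry is used. A sketch of the tile-to-histogram mapping, assuming the usual rounded-up subsampling that VP8LSubSampleSize() provides; helper names are illustrative:

    #include <stdint.h>

    // Rounded-up subsampling, mirroring how VP8LSubSampleSize() is used above.
    static int SubSampleSizeCeil(int size, int sampling_bits) {
      return (size + (1 << sampling_bits) - 1) >> sampling_bits;
    }

    // Which meta-histogram (set of five codes) handles pixel (x, y)?
    static int HistogramIndexAt(const uint16_t* histogram_symbols, int width,
                                int histo_bits, int x, int y) {
      const int tiles_per_row = SubSampleSizeCeil(width, histo_bits);
      return histogram_symbols[(y >> histo_bits) * tiles_per_row +
                               (x >> histo_bits)];
    }
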
+
+// Special case of EncodeImageInternal(): no color cache (cache-bits = 0) and
+// a single histogram, i.e. no meta-Huffman image.
+static int EncodeImageNoHuffman(VP8LBitWriter* const bw,
+                                const uint32_t* const argb,
+                                int width, int height, int quality) {
+  int i;
+  int ok = 0;
+  VP8LBackwardRefs refs;
+  HuffmanTreeCode huffman_codes[5] = { { 0, NULL, NULL } };
+  const uint16_t histogram_symbols[1] = { 0 };    // only one tree, one symbol
+  VP8LHistogramSet* const histogram_image = VP8LAllocateHistogramSet(1, 0);
+  if (histogram_image == NULL) return 0;
+
+  // Calculate backward references from ARGB image.
+  if (!VP8LGetBackwardReferences(width, height, argb, quality, 0, 1, &refs)) {
+    goto Error;
+  }
+  // Build histogram image and symbols from backward references.
+  VP8LHistogramStoreRefs(&refs, histogram_image->histograms[0]);
+
+  // Create Huffman bit lengths and codes for each histogram image.
+  assert(histogram_image->size == 1);
+  if (!GetHuffBitLengthsAndCodes(histogram_image, huffman_codes)) {
+    goto Error;
+  }
+
+  // No color cache, no Huffman image.
+  VP8LWriteBits(bw, 1, 0);
+
+  // Store Huffman codes.
+  for (i = 0; i < 5; ++i) {
+    HuffmanTreeCode* const codes = &huffman_codes[i];
+    if (!StoreHuffmanCode(bw, codes)) {
+      goto Error;
+    }
+    ClearHuffmanTreeIfOnlyOneSymbol(codes);
+  }
+
+  // Store actual literals.
+  StoreImageToBitMask(bw, width, 0, &refs, histogram_symbols, huffman_codes);
+  ok = 1;
+
+ Error:
+  free(histogram_image);
+  VP8LClearBackwardRefs(&refs);
+  free(huffman_codes[0].codes);
+  return ok;
+}
+
+static int EncodeImageInternal(VP8LBitWriter* const bw,
+                               const uint32_t* const argb,
+                               int width, int height, int quality,
+                               int cache_bits, int histogram_bits) {
+  int ok = 0;
+  const int use_2d_locality = 1;
+  const int use_color_cache = (cache_bits > 0);
+  const uint32_t histogram_image_xysize =
+      VP8LSubSampleSize(width, histogram_bits) *
+      VP8LSubSampleSize(height, histogram_bits);
+  VP8LHistogramSet* histogram_image =
+      VP8LAllocateHistogramSet(histogram_image_xysize, 0);
+  int histogram_image_size = 0;
+  size_t bit_array_size = 0;
+  HuffmanTreeCode* huffman_codes = NULL;
+  VP8LBackwardRefs refs;
+  uint16_t* const histogram_symbols =
+      (uint16_t*)WebPSafeMalloc((uint64_t)histogram_image_xysize,
+                                sizeof(*histogram_symbols));
+  assert(histogram_bits >= MIN_HUFFMAN_BITS);
+  assert(histogram_bits <= MAX_HUFFMAN_BITS);
+  if (histogram_image == NULL || histogram_symbols == NULL) goto Error;
+
+  // Calculate backward references from ARGB image.
+  if (!VP8LGetBackwardReferences(width, height, argb, quality, cache_bits,
+                                 use_2d_locality, &refs)) {
+    goto Error;
+  }
+  // Build histogram image and symbols from backward references.
+  if (!VP8LGetHistoImageSymbols(width, height, &refs,
+                                quality, histogram_bits, cache_bits,
+                                histogram_image,
+                                histogram_symbols)) {
+    goto Error;
+  }
+  // Create Huffman bit lengths and codes for each histogram image.
+  histogram_image_size = histogram_image->size;
+  bit_array_size = 5 * histogram_image_size;
+  huffman_codes = (HuffmanTreeCode*)WebPSafeCalloc(bit_array_size,
+                                                   sizeof(*huffman_codes));
+  if (huffman_codes == NULL ||
+      !GetHuffBitLengthsAndCodes(histogram_image, huffman_codes)) {
+    goto Error;
+  }
+
+  // Color Cache parameters.
+  VP8LWriteBits(bw, 1, use_color_cache);
+  if (use_color_cache) {
+    VP8LWriteBits(bw, 4, cache_bits);
+  }
+
+  // Huffman image + meta huffman.
+  {
+    const int write_histogram_image = (histogram_image_size > 1);
+    VP8LWriteBits(bw, 1, write_histogram_image);
+    if (write_histogram_image) {
+      uint32_t* const histogram_argb =
+          (uint32_t*)WebPSafeMalloc((uint64_t)histogram_image_xysize,
+                                    sizeof(*histogram_argb));
+      int max_index = 0;
+      uint32_t i;
+      if (histogram_argb == NULL) goto Error;
+      for (i = 0; i < histogram_image_xysize; ++i) {
+        const int index = histogram_symbols[i] & 0xffff;
+        histogram_argb[i] = 0xff000000 | (index << 8);
+        if (index >= max_index) {
+          max_index = index + 1;
+        }
+      }
+      histogram_image_size = max_index;
+
+      VP8LWriteBits(bw, 3, histogram_bits - 2);
+      ok = EncodeImageNoHuffman(bw, histogram_argb,
+                                VP8LSubSampleSize(width, histogram_bits),
+                                VP8LSubSampleSize(height, histogram_bits),
+                                quality);
+      free(histogram_argb);
+      if (!ok) goto Error;
+    }
+  }
+
+  // Store Huffman codes.
+  {
+    int i;
+    for (i = 0; i < 5 * histogram_image_size; ++i) {
+      HuffmanTreeCode* const codes = &huffman_codes[i];
+      if (!StoreHuffmanCode(bw, codes)) goto Error;
+      ClearHuffmanTreeIfOnlyOneSymbol(codes);
+    }
+  }
+  // Free combined histograms.
+  free(histogram_image);
+  histogram_image = NULL;
+
+  // Store actual literals.
+  StoreImageToBitMask(bw, width, histogram_bits, &refs,
+                      histogram_symbols, huffman_codes);
+  ok = 1;
+
+ Error:
+  if (!ok) free(histogram_image);
+
+  VP8LClearBackwardRefs(&refs);
+  if (huffman_codes != NULL) {
+    free(huffman_codes->codes);
+    free(huffman_codes);
+  }
+  free(histogram_symbols);
+  return ok;
+}
+
+// -----------------------------------------------------------------------------
+// Transforms
+
+// Check if it would be a good idea to subtract green from red and blue. We
+// only impact entropy in red/blue components, don't bother to look at others.
+static int EvalAndApplySubtractGreen(VP8LEncoder* const enc,
+                                     int width, int height,
+                                     VP8LBitWriter* const bw) {
+  if (!enc->use_palette_) {
+    int i;
+    const uint32_t* const argb = enc->argb_;
+    double bit_cost_before, bit_cost_after;
+    VP8LHistogram* const histo = (VP8LHistogram*)malloc(sizeof(*histo));
+    if (histo == NULL) return 0;
+
+    VP8LHistogramInit(histo, 1);
+    for (i = 0; i < width * height; ++i) {
+      const uint32_t c = argb[i];
+      ++histo->red_[(c >> 16) & 0xff];
+      ++histo->blue_[(c >> 0) & 0xff];
+    }
+    bit_cost_before = VP8LHistogramEstimateBits(histo);
+
+    VP8LHistogramInit(histo, 1);
+    for (i = 0; i < width * height; ++i) {
+      const uint32_t c = argb[i];
+      const int green = (c >> 8) & 0xff;
+      ++histo->red_[((c >> 16) - green) & 0xff];
+      ++histo->blue_[((c >> 0) - green) & 0xff];
+    }
+    bit_cost_after = VP8LHistogramEstimateBits(histo);
+    free(histo);
+
+    // Check if subtracting green yields low entropy.
+    enc->use_subtract_green_ = (bit_cost_after < bit_cost_before);
+    if (enc->use_subtract_green_) {
+      VP8LWriteBits(bw, 1, TRANSFORM_PRESENT);
+      VP8LWriteBits(bw, 2, SUBTRACT_GREEN);
+      VP8LSubtractGreenFromBlueAndRed(enc->argb_, width * height);
+    }
+  }
+  return 1;
+}
+
+static int ApplyPredictFilter(const VP8LEncoder* const enc,
+                              int width, int height, int quality,
+                              VP8LBitWriter* const bw) {
+  const int pred_bits = enc->transform_bits_;
+  const int transform_width = VP8LSubSampleSize(width, pred_bits);
+  const int transform_height = VP8LSubSampleSize(height, pred_bits);
+
+  VP8LResidualImage(width, height, pred_bits, enc->argb_, enc->argb_scratch_,
+                    enc->transform_data_);
+  VP8LWriteBits(bw, 1, TRANSFORM_PRESENT);
+  VP8LWriteBits(bw, 2, PREDICTOR_TRANSFORM);
+  assert(pred_bits >= 2);
+  VP8LWriteBits(bw, 3, pred_bits - 2);
+  if (!EncodeImageNoHuffman(bw, enc->transform_data_,
+                            transform_width, transform_height, quality)) {
+    return 0;
+  }
+  return 1;
+}
+
+static int ApplyCrossColorFilter(const VP8LEncoder* const enc,
+                                 int width, int height, int quality,
+                                 VP8LBitWriter* const bw) {
+  const int ccolor_transform_bits = enc->transform_bits_;
+  const int transform_width = VP8LSubSampleSize(width, ccolor_transform_bits);
+  const int transform_height = VP8LSubSampleSize(height, ccolor_transform_bits);
+  const int step = (quality == 0) ? 32 : 8;
+
+  VP8LColorSpaceTransform(width, height, ccolor_transform_bits, step,
+                          enc->argb_, enc->transform_data_);
+  VP8LWriteBits(bw, 1, TRANSFORM_PRESENT);
+  VP8LWriteBits(bw, 2, CROSS_COLOR_TRANSFORM);
+  assert(ccolor_transform_bits >= 2);
+  VP8LWriteBits(bw, 3, ccolor_transform_bits - 2);
+  if (!EncodeImageNoHuffman(bw, enc->transform_data_,
+                            transform_width, transform_height, quality)) {
+    return 0;
+  }
+  return 1;
+}
+
+// -----------------------------------------------------------------------------
+
+static void PutLE32(uint8_t* const data, uint32_t val) {
+  data[0] = (val >>  0) & 0xff;
+  data[1] = (val >>  8) & 0xff;
+  data[2] = (val >> 16) & 0xff;
+  data[3] = (val >> 24) & 0xff;
+}
+
+static WebPEncodingError WriteRiffHeader(const WebPPicture* const pic,
+                                         size_t riff_size, size_t vp8l_size) {
+  uint8_t riff[RIFF_HEADER_SIZE + CHUNK_HEADER_SIZE + VP8L_SIGNATURE_SIZE] = {
+    'R', 'I', 'F', 'F', 0, 0, 0, 0, 'W', 'E', 'B', 'P',
+    'V', 'P', '8', 'L', 0, 0, 0, 0, VP8L_MAGIC_BYTE,
+  };
+  PutLE32(riff + TAG_SIZE, (uint32_t)riff_size);
+  PutLE32(riff + RIFF_HEADER_SIZE + TAG_SIZE, (uint32_t)vp8l_size);
+  if (!pic->writer(riff, sizeof(riff), pic)) {
+    return VP8_ENC_ERROR_BAD_WRITE;
+  }
+  return VP8_ENC_OK;
+}
+
+static int WriteImageSize(const WebPPicture* const pic,
+                          VP8LBitWriter* const bw) {
+  const int width = pic->width - 1;
+  const int height = pic->height - 1;
+  assert(width < WEBP_MAX_DIMENSION && height < WEBP_MAX_DIMENSION);
+
+  VP8LWriteBits(bw, VP8L_IMAGE_SIZE_BITS, width);
+  VP8LWriteBits(bw, VP8L_IMAGE_SIZE_BITS, height);
+  return !bw->error_;
+}
+
+static int WriteRealAlphaAndVersion(VP8LBitWriter* const bw, int has_alpha) {
+  VP8LWriteBits(bw, 1, has_alpha);
+  VP8LWriteBits(bw, VP8L_VERSION_BITS, VP8L_VERSION);
+  return !bw->error_;
+}
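
Together with the one-byte signature emitted by WriteRiffHeader(), the two helpers above write exactly 32 header bits: a 14-bit width-1, a 14-bit height-1, the alpha hint and a 3-bit version, assuming the field widths defined in webp/format_constants.h and the LSB-first order used by VP8LWriteBits(). An illustrative packing of those fields:

    #include <stdint.h>

    // Header bits following the one-byte VP8L signature, in the LSB-first order
    // produced by VP8LWriteBits(); field widths as in webp/format_constants.h.
    static uint32_t PackVP8LHeader(int width, int height, int has_alpha) {
      uint32_t bits = 0;
      bits |= (uint32_t)(width - 1);            // bits  0..13 : width  - 1
      bits |= (uint32_t)(height - 1) << 14;     // bits 14..27 : height - 1
      bits |= (uint32_t)(has_alpha & 1) << 28;  // bit  28     : alpha hint
                                                // bits 29..31 : version (0)
      return bits;
    }
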
+
+static WebPEncodingError WriteImage(const WebPPicture* const pic,
+                                    VP8LBitWriter* const bw,
+                                    size_t* const coded_size) {
+  WebPEncodingError err = VP8_ENC_OK;
+  const uint8_t* const webpll_data = VP8LBitWriterFinish(bw);
+  const size_t webpll_size = VP8LBitWriterNumBytes(bw);
+  const size_t vp8l_size = VP8L_SIGNATURE_SIZE + webpll_size;
+  const size_t pad = vp8l_size & 1;
+  const size_t riff_size = TAG_SIZE + CHUNK_HEADER_SIZE + vp8l_size + pad;
+
+  err = WriteRiffHeader(pic, riff_size, vp8l_size);
+  if (err != VP8_ENC_OK) goto Error;
+
+  if (!pic->writer(webpll_data, webpll_size, pic)) {
+    err = VP8_ENC_ERROR_BAD_WRITE;
+    goto Error;
+  }
+
+  if (pad) {
+    const uint8_t pad_byte[1] = { 0 };
+    if (!pic->writer(pad_byte, 1, pic)) {
+      err = VP8_ENC_ERROR_BAD_WRITE;
+      goto Error;
+    }
+  }
+  *coded_size = CHUNK_HEADER_SIZE + riff_size;
+  return VP8_ENC_OK;
+
+ Error:
+  return err;
+}
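
The size bookkeeping above follows the RIFF layout written by WriteRiffHeader(): the VP8L chunk holds the signature byte plus the bitstream, an odd-sized chunk gets one byte of padding, and the RIFF size covers the 'WEBP' tag plus the whole VP8L chunk. A small sketch of the arithmetic, assuming the usual 4-byte tags and 8-byte chunk headers; not library code:

    #include <stddef.h>

    // Total file size for a lossless bitstream of 'payload_bytes', mirroring the
    // arithmetic in WriteImage() above (4-byte tags, 8-byte chunk headers,
    // 1-byte VP8L signature).
    static size_t TotalWebPFileSize(size_t payload_bytes) {
      const size_t vp8l_size = 1 + payload_bytes;        // signature + bitstream
      const size_t pad       = vp8l_size & 1;            // chunks are even-sized
      const size_t riff_size = 4 + 8 + vp8l_size + pad;  // 'WEBP' + VP8L chunk
      return 8 + riff_size;                              // 'RIFF' + 32-bit size
    }
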
+
+// -----------------------------------------------------------------------------
+
+// Allocates the memory for argb (W x H) buffer, 2 rows of context for
+// prediction and transform data.
+static WebPEncodingError AllocateTransformBuffer(VP8LEncoder* const enc,
+                                                 int width, int height) {
+  WebPEncodingError err = VP8_ENC_OK;
+  const int tile_size = 1 << enc->transform_bits_;
+  const uint64_t image_size = width * height;
+  const uint64_t argb_scratch_size = tile_size * width + width;
+  const uint64_t transform_data_size =
+      (uint64_t)VP8LSubSampleSize(width, enc->transform_bits_) *
+      (uint64_t)VP8LSubSampleSize(height, enc->transform_bits_);
+  const uint64_t total_size =
+      image_size + argb_scratch_size + transform_data_size;
+  uint32_t* mem = (uint32_t*)WebPSafeMalloc(total_size, sizeof(*mem));
+  if (mem == NULL) {
+    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+    goto Error;
+  }
+  enc->argb_ = mem;
+  mem += image_size;
+  enc->argb_scratch_ = mem;
+  mem += argb_scratch_size;
+  enc->transform_data_ = mem;
+  enc->current_width_ = width;
+
+ Error:
+  return err;
+}
+
+// Bundles multiple (2, 4 or 8) pixels (palette indices) into a single pixel.
+// 'xs' is the width of the packed (bundled) output image.
+static void BundleColorMap(const WebPPicture* const pic,
+                           int xbits, uint32_t* bundled_argb, int xs) {
+  int y;
+  const int bit_depth = 1 << (3 - xbits);
+  uint32_t code = 0;
+  const uint32_t* argb = pic->argb;
+  const int width = pic->width;
+  const int height = pic->height;
+
+  for (y = 0; y < height; ++y) {
+    int x;
+    for (x = 0; x < width; ++x) {
+      const int mask = (1 << xbits) - 1;
+      const int xsub = x & mask;
+      if (xsub == 0) {
+        code = 0;
+      }
+      // TODO(vikasa): simplify the bundling logic.
+      code |= (argb[x] & 0xff00) << (bit_depth * xsub);
+      bundled_argb[y * xs + (x >> xbits)] = 0xff000000 | code;
+    }
+    argb += pic->argb_stride;
+  }
+}
+
+// Note: Expects "enc->palette_" to be set properly.
+// Also, "enc->palette_" is modified in place by this call (delta-coded before
+// being written out) and should not be used afterwards.
+static WebPEncodingError ApplyPalette(VP8LBitWriter* const bw,
+                                      VP8LEncoder* const enc, int quality) {
+  WebPEncodingError err = VP8_ENC_OK;
+  int i, x, y;
+  const WebPPicture* const pic = enc->pic_;
+  uint32_t* argb = pic->argb;
+  const int width = pic->width;
+  const int height = pic->height;
+  uint32_t* const palette = enc->palette_;
+  const int palette_size = enc->palette_size_;
+
+  // Replace each input pixel by corresponding palette index.
+  for (y = 0; y < height; ++y) {
+    for (x = 0; x < width; ++x) {
+      const uint32_t pix = argb[x];
+      for (i = 0; i < palette_size; ++i) {
+        if (pix == palette[i]) {
+          argb[x] = 0xff000000u | (i << 8);
+          break;
+        }
+      }
+    }
+    argb += pic->argb_stride;
+  }
+
+  // Save palette to bitstream.
+  VP8LWriteBits(bw, 1, TRANSFORM_PRESENT);
+  VP8LWriteBits(bw, 2, COLOR_INDEXING_TRANSFORM);
+  assert(palette_size >= 1);
+  VP8LWriteBits(bw, 8, palette_size - 1);
+  for (i = palette_size - 1; i >= 1; --i) {
+    palette[i] = VP8LSubPixels(palette[i], palette[i - 1]);
+  }
+  if (!EncodeImageNoHuffman(bw, palette, palette_size, 1, quality)) {
+    err = VP8_ENC_ERROR_INVALID_CONFIGURATION;
+    goto Error;
+  }
+
+  if (palette_size <= 16) {
+    // Image can be packed (multiple pixels per uint32_t).
+    int xbits = 1;
+    if (palette_size <= 2) {
+      xbits = 3;
+    } else if (palette_size <= 4) {
+      xbits = 2;
+    }
+    err = AllocateTransformBuffer(enc, VP8LSubSampleSize(width, xbits), height);
+    if (err != VP8_ENC_OK) goto Error;
+    BundleColorMap(pic, xbits, enc->argb_, enc->current_width_);
+  }
+
+ Error:
+  return err;
+}
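
For palettes of at most 16 colors the indices are bundled: 2^xbits indices per packed pixel, each index occupying 1 << (3 - xbits) bits of the green channel, and the packed row width becomes VP8LSubSampleSize(width, xbits). A recap of that geometry; the helper is illustrative and the rounded-up division mirrors how VP8LSubSampleSize() is used here:

    // Packing geometry for palettes of at most 16 colors, mirroring the xbits
    // selection in ApplyPalette() above. All bundled indices end up in the
    // green channel of the packed pixel.
    static void PackedGeometry(int palette_size, int width,
                               int* xbits, int* bits_per_index,
                               int* packed_width) {
      *xbits = (palette_size <= 2) ? 3 : (palette_size <= 4) ? 2 : 1;
      *bits_per_index = 1 << (3 - *xbits);                    // 1, 2 or 4 bits
      *packed_width = (width + (1 << *xbits) - 1) >> *xbits;  // ceil(w / 2^xbits)
    }
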
+
+// -----------------------------------------------------------------------------
+
+static int GetHistoBits(const WebPConfig* const config,
+                        const WebPPicture* const pic) {
+  const int width = pic->width;
+  const int height = pic->height;
+  const size_t hist_size = sizeof(VP8LHistogram);
+  // Make tile size a function of encoding method (Range: 0 to 6).
+  int histo_bits = 7 - config->method;
+  while (1) {
+    const size_t huff_image_size = VP8LSubSampleSize(width, histo_bits) *
+                                   VP8LSubSampleSize(height, histo_bits) *
+                                   hist_size;
+    if (huff_image_size <= MAX_HUFF_IMAGE_SIZE) break;
+    ++histo_bits;
+  }
+  return (histo_bits < MIN_HUFFMAN_BITS) ? MIN_HUFFMAN_BITS :
+         (histo_bits > MAX_HUFFMAN_BITS) ? MAX_HUFFMAN_BITS : histo_bits;
+}
+
+static void InitEncParams(VP8LEncoder* const enc) {
+  const WebPConfig* const config = enc->config_;
+  const WebPPicture* const picture = enc->pic_;
+  const int method = config->method;
+  const float quality = config->quality;
+  enc->transform_bits_ = (method < 4) ? 5 : (method > 4) ? 3 : 4;
+  enc->histo_bits_ = GetHistoBits(config, picture);
+  enc->cache_bits_ = (quality <= 25.f) ? 0 : 7;
+}
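
InitEncParams() is where quality and method translate into the tile and cache sizes used by the rest of the lossless encoder; with the library defaults (method 4, quality 75) this gives 2^4-pixel transform tiles, a 2^7-entry color cache and an initial 2^3-pixel histogram tile, possibly coarsened by GetHistoBits() so the histogram image stays under MAX_HUFF_IMAGE_SIZE. A small recap of the mapping, illustrative only:

    // Quality/method -> lossless parameters, as in InitEncParams() above.
    // 'histo_bits0' is the starting value before GetHistoBits() clamps it.
    static void RecapEncParams(int method, float quality,
                               int* transform_bits, int* cache_bits,
                               int* histo_bits0) {
      *transform_bits = (method < 4) ? 5 : (method > 4) ? 3 : 4;
      *cache_bits     = (quality <= 25.f) ? 0 : 7;
      *histo_bits0    = 7 - method;
    }
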
+
+// -----------------------------------------------------------------------------
+// VP8LEncoder
+
+static VP8LEncoder* VP8LEncoderNew(const WebPConfig* const config,
+                                   const WebPPicture* const picture) {
+  VP8LEncoder* const enc = (VP8LEncoder*)calloc(1, sizeof(*enc));
+  if (enc == NULL) {
+    WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
+    return NULL;
+  }
+  enc->config_ = config;
+  enc->pic_ = picture;
+  return enc;
+}
+
+static void VP8LEncoderDelete(VP8LEncoder* enc) {
+  free(enc->argb_);
+  free(enc);
+}
+
+// -----------------------------------------------------------------------------
+// Main call
+
+WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
+                                   const WebPPicture* const picture,
+                                   VP8LBitWriter* const bw) {
+  WebPEncodingError err = VP8_ENC_OK;
+  const int quality = (int)config->quality;
+  const int width = picture->width;
+  const int height = picture->height;
+  VP8LEncoder* const enc = VP8LEncoderNew(config, picture);
+  const size_t byte_position = VP8LBitWriterNumBytes(bw);
+
+  if (enc == NULL) {
+    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+    goto Error;
+  }
+
+  InitEncParams(enc);
+
+  // ---------------------------------------------------------------------------
+  // Analyze image (entropy, num_palettes etc)
+
+  if (!VP8LEncAnalyze(enc, config->image_hint)) {
+    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+    goto Error;
+  }
+
+  if (enc->use_palette_) {
+    err = ApplyPalette(bw, enc, quality);
+    if (err != VP8_ENC_OK) goto Error;
+    // Color cache is disabled for palette.
+    enc->cache_bits_ = 0;
+  }
+
+  // If the image was not packed above, copy it into the transform buffer.
+  if (enc->argb_ == NULL) {
+    int y;
+    err = AllocateTransformBuffer(enc, width, height);
+    if (err != VP8_ENC_OK) goto Error;
+    for (y = 0; y < height; ++y) {
+      memcpy(enc->argb_ + y * width,
+             picture->argb + y * picture->argb_stride,
+             width * sizeof(*enc->argb_));
+    }
+    enc->current_width_ = width;
+  }
+
+  // ---------------------------------------------------------------------------
+  // Apply transforms and write transform data.
+
+  if (!EvalAndApplySubtractGreen(enc, enc->current_width_, height, bw)) {
+    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+    goto Error;
+  }
+
+  if (enc->use_predict_) {
+    if (!ApplyPredictFilter(enc, enc->current_width_, height, quality, bw)) {
+      err = VP8_ENC_ERROR_INVALID_CONFIGURATION;
+      goto Error;
+    }
+  }
+
+  if (enc->use_cross_color_) {
+    if (!ApplyCrossColorFilter(enc, enc->current_width_, height, quality, bw)) {
+      err = VP8_ENC_ERROR_INVALID_CONFIGURATION;
+      goto Error;
+    }
+  }
+
+  VP8LWriteBits(bw, 1, !TRANSFORM_PRESENT);  // No more transforms.
+
+  // ---------------------------------------------------------------------------
+  // Estimate the color cache size.
+
+  if (enc->cache_bits_ > 0) {
+    if (!VP8LCalculateEstimateForCacheSize(enc->argb_, enc->current_width_,
+                                           height, &enc->cache_bits_)) {
+      err = VP8_ENC_ERROR_INVALID_CONFIGURATION;
+      goto Error;
+    }
+  }
+
+  // ---------------------------------------------------------------------------
+  // Encode and write the transformed image.
+
+  if (!EncodeImageInternal(bw, enc->argb_, enc->current_width_, height,
+                           quality, enc->cache_bits_, enc->histo_bits_)) {
+    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+    goto Error;
+  }
+
+  if (picture->stats != NULL) {
+    WebPAuxStats* const stats = picture->stats;
+    stats->lossless_features = 0;
+    if (enc->use_predict_) stats->lossless_features |= 1;
+    if (enc->use_cross_color_) stats->lossless_features |= 2;
+    if (enc->use_subtract_green_) stats->lossless_features |= 4;
+    if (enc->use_palette_) stats->lossless_features |= 8;
+    stats->histogram_bits = enc->histo_bits_;
+    stats->transform_bits = enc->transform_bits_;
+    stats->cache_bits = enc->cache_bits_;
+    stats->palette_size = enc->palette_size_;
+    stats->lossless_size = (int)(VP8LBitWriterNumBytes(bw) - byte_position);
+  }
+
+ Error:
+  VP8LEncoderDelete(enc);
+  return err;
+}
+
+int VP8LEncodeImage(const WebPConfig* const config,
+                    const WebPPicture* const picture) {
+  int width, height;
+  int has_alpha;
+  size_t coded_size;
+  int percent = 0;
+  WebPEncodingError err = VP8_ENC_OK;
+  VP8LBitWriter bw;
+
+  if (picture == NULL) return 0;
+
+  if (config == NULL || picture->argb == NULL) {
+    err = VP8_ENC_ERROR_NULL_PARAMETER;
+    WebPEncodingSetError(picture, err);
+    return 0;
+  }
+
+  width = picture->width;
+  height = picture->height;
+  if (!VP8LBitWriterInit(&bw, (width * height) >> 1)) {
+    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+    goto Error;
+  }
+
+  if (!WebPReportProgress(picture, 1, &percent)) {
+ UserAbort:
+    err = VP8_ENC_ERROR_USER_ABORT;
+    goto Error;
+  }
+  // Reset stats (for pure lossless coding)
+  if (picture->stats != NULL) {
+    WebPAuxStats* const stats = picture->stats;
+    memset(stats, 0, sizeof(*stats));
+    stats->PSNR[0] = 99.f;
+    stats->PSNR[1] = 99.f;
+    stats->PSNR[2] = 99.f;
+    stats->PSNR[3] = 99.f;
+    stats->PSNR[4] = 99.f;
+  }
+
+  // Write image size.
+  if (!WriteImageSize(picture, &bw)) {
+    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+    goto Error;
+  }
+
+  has_alpha = WebPPictureHasTransparency(picture);
+  // Write the non-trivial Alpha flag and lossless version.
+  if (!WriteRealAlphaAndVersion(&bw, has_alpha)) {
+    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+    goto Error;
+  }
+
+  if (!WebPReportProgress(picture, 5, &percent)) goto UserAbort;
+
+  // Encode main image stream.
+  err = VP8LEncodeStream(config, picture, &bw);
+  if (err != VP8_ENC_OK) goto Error;
+
+  // TODO(skal): have a fine-grained progress report in VP8LEncodeStream().
+  if (!WebPReportProgress(picture, 90, &percent)) goto UserAbort;
+
+  // Finish the RIFF chunk.
+  err = WriteImage(picture, &bw, &coded_size);
+  if (err != VP8_ENC_OK) goto Error;
+
+  if (!WebPReportProgress(picture, 100, &percent)) goto UserAbort;
+
+  // Save size.
+  if (picture->stats != NULL) {
+    picture->stats->coded_size += (int)coded_size;
+    picture->stats->lossless_size = (int)coded_size;
+  }
+
+  if (picture->extra_info != NULL) {
+    const int mb_w = (width + 15) >> 4;
+    const int mb_h = (height + 15) >> 4;
+    memset(picture->extra_info, 0, mb_w * mb_h * sizeof(*picture->extra_info));
+  }
+
+ Error:
+  if (bw.error_) err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+  VP8LBitWriterDestroy(&bw);
+  if (err != VP8_ENC_OK) {
+    WebPEncodingSetError(picture, err);
+    return 0;
+  }
+  return 1;
+}
+
+//------------------------------------------------------------------------------
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/src/enc/vp8li.h b/src/enc/vp8li.h
new file mode 100644
index 0000000..5f3665a
--- /dev/null
+++ b/src/enc/vp8li.h
@@ -0,0 +1,68 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// Lossless encoder: internal header.
+//
+// Author: Vikas Arora (vikaas.arora@gmail.com)
+
+#ifndef WEBP_ENC_VP8LI_H_
+#define WEBP_ENC_VP8LI_H_
+
+#include "./histogram.h"
+#include "../utils/bit_writer.h"
+#include "webp/encode.h"
+#include "webp/format_constants.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+typedef struct {
+  const WebPConfig* config_;    // user configuration and parameters
+  const WebPPicture* pic_;      // input picture.
+
+  uint32_t* argb_;              // Transformed argb image data.
+  uint32_t* argb_scratch_;      // Scratch memory for argb rows
+                                // (used for prediction).
+  uint32_t* transform_data_;    // Scratch memory for transform data.
+  int       current_width_;     // Corresponds to packed image width.
+
+  // Encoding parameters derived from quality parameter.
+  int histo_bits_;
+  int transform_bits_;
+  int cache_bits_;        // If equal to 0, don't use color cache.
+
+  // Encoding parameters derived from image characteristics.
+  int use_cross_color_;
+  int use_subtract_green_;
+  int use_predict_;
+  int use_palette_;
+  int palette_size_;
+  uint32_t palette_[MAX_PALETTE_SIZE];
+} VP8LEncoder;
+
+//------------------------------------------------------------------------------
+// internal functions. Not public.
+
+// Encodes the picture.
+// Returns 0 if config or picture is NULL or picture doesn't have valid argb
+// input.
+int VP8LEncodeImage(const WebPConfig* const config,
+                    const WebPPicture* const picture);
+
+// Encodes the main image stream using the supplied bit writer.
+WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
+                                   const WebPPicture* const picture,
+                                   VP8LBitWriter* const bw);
+
+//------------------------------------------------------------------------------
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
+
+#endif  /* WEBP_ENC_VP8LI_H_ */
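
VP8LEncodeImage() is internal; applications reach the lossless path through the public WebPEncode() entry point with config.lossless set, as the webpenc.c hunk below shows. A minimal caller sketch against the public API, using the memory writer from webp/encode.h; the function name and the bare-bones error handling are illustrative:

    #include "webp/encode.h"

    // Minimal lossless encode through the public API; WebPEncode() routes to
    // VP8LEncodeImage() when config.lossless is set (see webpenc.c below).
    // The output bytes accumulate in wrt->mem (to be freed by the caller).
    static int EncodeLossless(const uint8_t* rgba, int width, int height,
                              WebPMemoryWriter* const wrt) {
      WebPConfig config;
      WebPPicture pic;
      int ok;
      if (!WebPConfigInit(&config) || !WebPPictureInit(&pic)) return 0;
      config.lossless = 1;
      pic.use_argb = 1;                // the lossless path works on ARGB input
      pic.width = width;
      pic.height = height;
      if (!WebPPictureImportRGBA(&pic, rgba, width * 4)) return 0;
      WebPMemoryWriterInit(wrt);
      pic.writer = WebPMemoryWrite;
      pic.custom_ptr = wrt;
      ok = WebPEncode(&config, &pic);
      WebPPictureFree(&pic);
      return ok;
    }
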
diff --git a/src/enc/webpenc.c b/src/enc/webpenc.c
index 819dd63..3c27558 100644
--- a/src/enc/webpenc.c
+++ b/src/enc/webpenc.c
@@ -1,4 +1,4 @@
-// Copyright 2011 Google Inc.
+// Copyright 2011 Google Inc. All Rights Reserved.
 //
 // This code is licensed under the same terms as WebM:
 //  Software License Agreement:  http://www.webmproject.org/license/software/
@@ -14,7 +14,9 @@
 #include <string.h>
 #include <math.h>
 
-#include "vp8enci.h"
+#include "./vp8enci.h"
+#include "./vp8li.h"
+#include "../utils/utils.h"
 
 // #define PRINT_MEMORY_INFO
 
@@ -26,17 +28,15 @@
 #include <stdio.h>
 #endif
 
-#define MAX_DIMENSION 16384   // maximum width/height allowed by the spec
-
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 
 int WebPGetEncoderVersion(void) {
   return (ENC_MAJ_VERSION << 16) | (ENC_MIN_VERSION << 8) | ENC_REV_VERSION;
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // WebPPicture
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 
 static int DummyWriter(const uint8_t* data, size_t data_size,
                        const WebPPicture* const picture) {
@@ -47,11 +47,11 @@
   return 1;
 }
 
-int WebPPictureInitInternal(WebPPicture* const picture, int version) {
-  if (version != WEBP_ENCODER_ABI_VERSION) {
+int WebPPictureInitInternal(WebPPicture* picture, int version) {
+  if (WEBP_ABI_IS_INCOMPATIBLE(version, WEBP_ENCODER_ABI_VERSION)) {
     return 0;   // caller/system version mismatch!
   }
-  if (picture) {
+  if (picture != NULL) {
     memset(picture, 0, sizeof(*picture));
     picture->writer = DummyWriter;
     WebPEncodingSetError(picture, VP8_ENC_OK);
@@ -59,9 +59,9 @@
   return 1;
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // VP8Encoder
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 
 static void ResetSegmentHeader(VP8Encoder* const enc) {
   VP8SegmentHeader* const hdr = &enc->segment_hdr_;
@@ -112,11 +112,15 @@
 
 static void MapConfigToTools(VP8Encoder* const enc) {
   const int method = enc->config_->method;
+  const int limit = 100 - enc->config_->partition_limit;
   enc->method_ = method;
   enc->rd_opt_level_ = (method >= 6) ? 3
                      : (method >= 5) ? 2
                      : (method >= 3) ? 1
                      : 0;
+  enc->max_i4_header_bits_ =
+      256 * 16 * 16 *                 // upper bound: up to 16bit per 4x4 block
+      (limit * limit) / (100 * 100);  // ... modulated with a quadratic curve.
 }
 
 // Memory scaling with dimensions:
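
The partition_limit mapping a few lines above scales the intra4 mode-header bit budget (max_i4_header_bits_) quadratically with (100 - partition_limit). A couple of sample values computed with the same formula; the helper is illustrative:

    // partition_limit -> intra4 mode-header bit budget, as set above.
    // Sample values:  0 -> 65536 bits,  50 -> 16384 bits,  100 -> 0 bits.
    static int I4HeaderBitBudget(int partition_limit) {
      const int limit = 100 - partition_limit;
      return 256 * 16 * 16 * (limit * limit) / (100 * 100);
    }
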
@@ -140,8 +144,8 @@
 //              LFStats: 2048
 // Picture size (yuv): 589824
 
-static VP8Encoder* InitEncoder(const WebPConfig* const config,
-                               WebPPicture* const picture) {
+static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
+                                  WebPPicture* const picture) {
   const int use_filter =
       (config->filter_strength > 0) || (config->autofilter > 0);
   const int mb_w = (picture->width + 15) >> 4;
@@ -161,13 +165,14 @@
       config->autofilter ? sizeof(LFStats) + ALIGN_CST : 0;
   VP8Encoder* enc;
   uint8_t* mem;
-  size_t size = sizeof(VP8Encoder) + ALIGN_CST  // main struct
-              + cache_size                      // working caches
-              + info_size                       // modes info
-              + preds_size                      // prediction modes
-              + samples_size                    // top/left samples
-              + nz_size                         // coeff context bits
-              + lf_stats_size;                  // autofilter stats
+  const uint64_t size = (uint64_t)sizeof(VP8Encoder)   // main struct
+                      + ALIGN_CST                      // cache alignment
+                      + cache_size                     // working caches
+                      + info_size                      // modes info
+                      + preds_size                     // prediction modes
+                      + samples_size                   // top/left samples
+                      + nz_size                        // coeff context bits
+                      + lf_stats_size;                 // autofilter stats
 
 #ifdef PRINT_MEMORY_INFO
   printf("===================================\n");
@@ -195,7 +200,7 @@
          mb_w * mb_h * 384 * sizeof(uint8_t));
   printf("===================================\n");
 #endif
-  mem = (uint8_t*)malloc(size);
+  mem = (uint8_t*)WebPSafeMalloc(size, sizeof(*mem));
   if (mem == NULL) {
     WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
     return NULL;
@@ -240,6 +245,7 @@
   enc->config_ = config;
   enc->profile_ = use_filter ? ((config->filter_type == 1) ? 0 : 1) : 2;
   enc->pic_ = picture;
+  enc->percent_ = 0;
 
   MapConfigToTools(enc);
   VP8EncDspInit();
@@ -248,25 +254,25 @@
   ResetFilterHeader(enc);
   ResetBoundaryPredictions(enc);
 
-#ifdef WEBP_EXPERIMENTAL_FEATURES
   VP8EncInitAlpha(enc);
+#ifdef WEBP_EXPERIMENTAL_FEATURES
   VP8EncInitLayer(enc);
 #endif
 
   return enc;
 }
 
-static void DeleteEncoder(VP8Encoder* enc) {
-  if (enc) {
-#ifdef WEBP_EXPERIMENTAL_FEATURES
+static void DeleteVP8Encoder(VP8Encoder* enc) {
+  if (enc != NULL) {
     VP8EncDeleteAlpha(enc);
+#ifdef WEBP_EXPERIMENTAL_FEATURES
     VP8EncDeleteLayer(enc);
 #endif
     free(enc);
   }
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 
 static double GetPSNR(uint64_t err, uint64_t size) {
   return err ? 10. * log10(255. * 255. * size / err) : 99.;
@@ -280,11 +286,12 @@
   stats->PSNR[1] = (float)GetPSNR(sse[1], size / 4);
   stats->PSNR[2] = (float)GetPSNR(sse[2], size / 4);
   stats->PSNR[3] = (float)GetPSNR(sse[0] + sse[1] + sse[2], size * 3 / 2);
+  stats->PSNR[4] = (float)GetPSNR(sse[3], size);
 }
 
 static void StoreStats(VP8Encoder* const enc) {
   WebPAuxStats* const stats = enc->pic_->stats;
-  if (stats) {
+  if (stats != NULL) {
     int i, s;
     for (i = 0; i < NUM_MB_SEGMENTS; ++i) {
       stats->segment_level[i] = enc->dqm_[i].fstrength_;
@@ -299,19 +306,32 @@
       stats->block_count[i] = enc->block_count_[i];
     }
   }
+  WebPReportProgress(enc->pic_, 100, &enc->percent_);  // done!
 }
 
-int WebPEncodingSetError(WebPPicture* const pic, WebPEncodingError error) {
-  assert((int)error <= VP8_ENC_ERROR_BAD_WRITE);
+int WebPEncodingSetError(const WebPPicture* const pic,
+                         WebPEncodingError error) {
+  assert((int)error < VP8_ENC_ERROR_LAST);
   assert((int)error >= VP8_ENC_OK);
-  pic->error_code = error;
+  ((WebPPicture*)pic)->error_code = error;
   return 0;
 }
 
-//-----------------------------------------------------------------------------
+int WebPReportProgress(const WebPPicture* const pic,
+                       int percent, int* const percent_store) {
+  if (percent_store != NULL && percent != *percent_store) {
+    *percent_store = percent;
+    if (pic->progress_hook && !pic->progress_hook(percent, pic)) {
+      // user abort requested
+      WebPEncodingSetError(pic, VP8_ENC_ERROR_USER_ABORT);
+      return 0;
+    }
+  }
+  return 1;  // ok
+}
+//------------------------------------------------------------------------------
 
-int WebPEncode(const WebPConfig* const config, WebPPicture* const pic) {
-  VP8Encoder* enc;
+int WebPEncode(const WebPConfig* config, WebPPicture* pic) {
   int ok;
 
   if (pic == NULL)
@@ -323,23 +343,43 @@
     return WebPEncodingSetError(pic, VP8_ENC_ERROR_INVALID_CONFIGURATION);
   if (pic->width <= 0 || pic->height <= 0)
     return WebPEncodingSetError(pic, VP8_ENC_ERROR_BAD_DIMENSION);
-  if (pic->y == NULL || pic->u == NULL || pic->v == NULL)
-    return WebPEncodingSetError(pic, VP8_ENC_ERROR_NULL_PARAMETER);
-  if (pic->width >= MAX_DIMENSION || pic->height >= MAX_DIMENSION)
+  if (pic->width > WEBP_MAX_DIMENSION || pic->height > WEBP_MAX_DIMENSION)
     return WebPEncodingSetError(pic, VP8_ENC_ERROR_BAD_DIMENSION);
 
-  enc = InitEncoder(config, pic);
-  if (enc == NULL) return 0;  // pic->error is already set.
-  ok = VP8EncAnalyze(enc)
-    && VP8StatLoop(enc)
-    && VP8EncLoop(enc)
+  if (pic->stats != NULL) memset(pic->stats, 0, sizeof(*pic->stats));
+
+  if (!config->lossless) {
+    VP8Encoder* enc = NULL;
+    if (pic->y == NULL || pic->u == NULL || pic->v == NULL) {
+      if (pic->argb != NULL) {
+        if (!WebPPictureARGBToYUVA(pic, WEBP_YUV420)) return 0;
+      } else {
+        return WebPEncodingSetError(pic, VP8_ENC_ERROR_NULL_PARAMETER);
+      }
+    }
+
+    enc = InitVP8Encoder(config, pic);
+    if (enc == NULL) return 0;  // pic->error is already set.
+    // Note: each of the tasks below accounts for 20% of the progress.
+    ok = VP8EncAnalyze(enc)
+      && VP8StatLoop(enc)
+      && VP8EncLoop(enc)
+      && VP8EncFinishAlpha(enc)
 #ifdef WEBP_EXPERIMENTAL_FEATURES
-    && VP8EncFinishAlpha(enc)
-    && VP8EncFinishLayer(enc)
+      && VP8EncFinishLayer(enc)
 #endif
-    && VP8EncWrite(enc);
-  StoreStats(enc);
-  DeleteEncoder(enc);
+      && VP8EncWrite(enc);
+    StoreStats(enc);
+    if (!ok) {
+      VP8EncFreeBitWriters(enc);
+    }
+    DeleteVP8Encoder(enc);
+  } else {
+    if (pic->argb == NULL)
+      return WebPEncodingSetError(pic, VP8_ENC_ERROR_NULL_PARAMETER);
+
+    ok = VP8LEncodeImage(config, pic);  // Sets pic->error in case of problem.
+  }
 
   return ok;
 }
diff --git a/src/utils/bit_reader.c b/src/utils/bit_reader.c
new file mode 100644
index 0000000..1afb1db
--- /dev/null
+++ b/src/utils/bit_reader.c
@@ -0,0 +1,229 @@
+// Copyright 2010 Google Inc. All Rights Reserved.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// Boolean decoder
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "./bit_reader.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+#define MK(X) (((bit_t)(X) << (BITS)) | (MASK))
+
+//------------------------------------------------------------------------------
+// VP8BitReader
+
+void VP8InitBitReader(VP8BitReader* const br,
+                      const uint8_t* const start, const uint8_t* const end) {
+  assert(br != NULL);
+  assert(start != NULL);
+  assert(start <= end);
+  br->range_   = MK(255 - 1);
+  br->buf_     = start;
+  br->buf_end_ = end;
+  br->value_   = 0;
+  br->missing_ = 8;   // to load the very first 8bits
+  br->eof_     = 0;
+}
+
+const uint8_t kVP8Log2Range[128] = {
+     7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
+  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  0
+};
+
+// range = (range << kVP8Log2Range[range]) + trailing 1's
+const bit_t kVP8NewRange[128] = {
+  MK(127), MK(127), MK(191), MK(127), MK(159), MK(191), MK(223), MK(127),
+  MK(143), MK(159), MK(175), MK(191), MK(207), MK(223), MK(239), MK(127),
+  MK(135), MK(143), MK(151), MK(159), MK(167), MK(175), MK(183), MK(191),
+  MK(199), MK(207), MK(215), MK(223), MK(231), MK(239), MK(247), MK(127),
+  MK(131), MK(135), MK(139), MK(143), MK(147), MK(151), MK(155), MK(159),
+  MK(163), MK(167), MK(171), MK(175), MK(179), MK(183), MK(187), MK(191),
+  MK(195), MK(199), MK(203), MK(207), MK(211), MK(215), MK(219), MK(223),
+  MK(227), MK(231), MK(235), MK(239), MK(243), MK(247), MK(251), MK(127),
+  MK(129), MK(131), MK(133), MK(135), MK(137), MK(139), MK(141), MK(143),
+  MK(145), MK(147), MK(149), MK(151), MK(153), MK(155), MK(157), MK(159),
+  MK(161), MK(163), MK(165), MK(167), MK(169), MK(171), MK(173), MK(175),
+  MK(177), MK(179), MK(181), MK(183), MK(185), MK(187), MK(189), MK(191),
+  MK(193), MK(195), MK(197), MK(199), MK(201), MK(203), MK(205), MK(207),
+  MK(209), MK(211), MK(213), MK(215), MK(217), MK(219), MK(221), MK(223),
+  MK(225), MK(227), MK(229), MK(231), MK(233), MK(235), MK(237), MK(239),
+  MK(241), MK(243), MK(245), MK(247), MK(249), MK(251), MK(253), MK(127)
+};
+
+#undef MK
+
+void VP8LoadFinalBytes(VP8BitReader* const br) {
+  assert(br != NULL && br->buf_ != NULL);
+  // Only read 8bits at a time
+  if (br->buf_ < br->buf_end_) {
+    br->value_ |= (bit_t)(*br->buf_++) << ((BITS) - 8 + br->missing_);
+    br->missing_ -= 8;
+  } else {
+    br->eof_ = 1;
+  }
+}
+
+//------------------------------------------------------------------------------
+// Higher-level calls
+
+uint32_t VP8GetValue(VP8BitReader* const br, int bits) {
+  uint32_t v = 0;
+  while (bits-- > 0) {
+    v |= VP8GetBit(br, 0x80) << bits;
+  }
+  return v;
+}
+
+int32_t VP8GetSignedValue(VP8BitReader* const br, int bits) {
+  const int value = VP8GetValue(br, bits);
+  return VP8Get(br) ? -value : value;
+}
+
+//------------------------------------------------------------------------------
+// VP8LBitReader
+
+#define MAX_NUM_BIT_READ 25
+
+static const uint32_t kBitMask[MAX_NUM_BIT_READ] = {
+  0, 1, 3, 7, 15, 31, 63, 127, 255, 511, 1023, 2047, 4095, 8191, 16383, 32767,
+  65535, 131071, 262143, 524287, 1048575, 2097151, 4194303, 8388607, 16777215
+};
+
+void VP8LInitBitReader(VP8LBitReader* const br,
+                       const uint8_t* const start,
+                       size_t length) {
+  size_t i;
+  assert(br != NULL);
+  assert(start != NULL);
+  assert(length < 0xfffffff8u);   // can't happen with a RIFF chunk.
+
+  br->buf_ = start;
+  br->len_ = length;
+  br->val_ = 0;
+  br->pos_ = 0;
+  br->bit_pos_ = 0;
+  br->eos_ = 0;
+  br->error_ = 0;
+  for (i = 0; i < sizeof(br->val_) && i < br->len_; ++i) {
+    br->val_ |= ((uint64_t)br->buf_[br->pos_]) << (8 * i);
+    ++br->pos_;
+  }
+}
+
+void VP8LBitReaderSetBuffer(VP8LBitReader* const br,
+                            const uint8_t* const buf, size_t len) {
+  assert(br != NULL);
+  assert(buf != NULL);
+  assert(len < 0xfffffff8u);   // can't happen with a RIFF chunk.
+  br->eos_ = (br->pos_ >= len);
+  br->buf_ = buf;
+  br->len_ = len;
+}
+
+static void ShiftBytes(VP8LBitReader* const br) {
+  while (br->bit_pos_ >= 8 && br->pos_ < br->len_) {
+    br->val_ >>= 8;
+    br->val_ |= ((uint64_t)br->buf_[br->pos_]) << 56;
+    ++br->pos_;
+    br->bit_pos_ -= 8;
+  }
+}
+
+void VP8LFillBitWindow(VP8LBitReader* const br) {
+  if (br->bit_pos_ >= 32) {
+#if defined(__x86_64__) || defined(_M_X64)
+    if (br->pos_ + 8 < br->len_) {
+      br->val_ >>= 32;
+      // The expression below needs a little-endian arch to work correctly.
+      // This yields a large decoding speedup.
+      br->val_ |= *(const uint64_t *)(br->buf_ + br->pos_) << 32;
+      br->pos_ += 4;
+      br->bit_pos_ -= 32;
+    } else {
+      // Slow path.
+      ShiftBytes(br);
+    }
+#else
+    // Always the slow path.
+    ShiftBytes(br);
+#endif
+  }
+  if (br->pos_ == br->len_ && br->bit_pos_ == 64) {
+    br->eos_ = 1;
+  }
+}
+
+uint32_t VP8LReadOneBit(VP8LBitReader* const br) {
+  const uint32_t val = (br->val_ >> br->bit_pos_) & 1;
+  // Flag an error at end_of_stream.
+  if (!br->eos_) {
+    ++br->bit_pos_;
+    if (br->bit_pos_ >= 32) {
+      ShiftBytes(br);
+    }
+    // After this last bit is read, check if eos needs to be flagged.
+    if (br->pos_ == br->len_ && br->bit_pos_ == 64) {
+      br->eos_ = 1;
+    }
+  } else {
+    br->error_ = 1;
+  }
+  return val;
+}
+
+uint32_t VP8LReadBits(VP8LBitReader* const br, int n_bits) {
+  uint32_t val = 0;
+  assert(n_bits >= 0);
+  // Flag an error at end_of_stream or if n_bits exceeds the allowed limit.
+  if (!br->eos_ && n_bits < MAX_NUM_BIT_READ) {
+    // If this read is going to cross the read buffer, set the eos flag.
+    if (br->pos_ == br->len_) {
+      if ((br->bit_pos_ + n_bits) >= 64) {
+        br->eos_ = 1;
+        if ((br->bit_pos_ + n_bits) > 64) return val;
+      }
+    }
+    val = (br->val_ >> br->bit_pos_) & kBitMask[n_bits];
+    br->bit_pos_ += n_bits;
+    if (br->bit_pos_ >= 40) {
+      if (br->pos_ + 5 < br->len_) {
+        br->val_ >>= 40;
+        br->val_ |=
+            (((uint64_t)br->buf_[br->pos_ + 0]) << 24) |
+            (((uint64_t)br->buf_[br->pos_ + 1]) << 32) |
+            (((uint64_t)br->buf_[br->pos_ + 2]) << 40) |
+            (((uint64_t)br->buf_[br->pos_ + 3]) << 48) |
+            (((uint64_t)br->buf_[br->pos_ + 4]) << 56);
+        br->pos_ += 5;
+        br->bit_pos_ -= 40;
+      }
+      if (br->bit_pos_ >= 8) {
+        ShiftBytes(br);
+      }
+    }
+  } else {
+    br->error_ = 1;
+  }
+  return val;
+}
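
VP8LReadBits() hands back up to 24 bits at a time, least-significant-bit first out of the 64-bit window, and sets error_ when asked for more at once. A usage sketch, assuming an already-initialized reader; the two field widths are only an example:

  #include "./bit_reader.h"   // path as laid out by this change

  // Sketch: pull two small fields out of an initialized VP8LBitReader.
  static void ReadToyHeader(VP8LBitReader* const br,
                            uint32_t* const first, uint32_t* const flag) {
    *first = VP8LReadBits(br, 14);   // 14 low-order bits of the window
    *flag  = VP8LReadBits(br, 1);    // then a single bit
  }
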
+
+//------------------------------------------------------------------------------
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/src/utils/bit_reader.h b/src/utils/bit_reader.h
new file mode 100644
index 0000000..11a40a5
--- /dev/null
+++ b/src/utils/bit_reader.h
@@ -0,0 +1,197 @@
+// Copyright 2010 Google Inc. All Rights Reserved.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// Boolean decoder
+//
+// Author: Skal (pascal.massimino@gmail.com)
+//         Vikas Arora (vikaas.arora@gmail.com)
+
+#ifndef WEBP_UTILS_BIT_READER_H_
+#define WEBP_UTILS_BIT_READER_H_
+
+#include <assert.h>
+#ifdef _MSC_VER
+#include <stdlib.h>  // _byteswap_ulong
+#endif
+#include <string.h>  // For memcpy
+#include "webp/types.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+#define BITS 32     // can be 32, 16 or 8
+#define MASK ((((bit_t)1) << (BITS)) - 1)
+#if (BITS == 32)
+typedef uint64_t bit_t;   // natural register type
+typedef uint32_t lbit_t;  // natural type for memory I/O
+#elif (BITS == 16)
+typedef uint32_t bit_t;
+typedef uint16_t lbit_t;
+#else
+typedef uint32_t bit_t;
+typedef uint8_t lbit_t;
+#endif
+
+//------------------------------------------------------------------------------
+// Bitreader and code-tree reader
+
+typedef struct VP8BitReader VP8BitReader;
+struct VP8BitReader {
+  const uint8_t* buf_;        // next byte to be read
+  const uint8_t* buf_end_;    // end of read buffer
+  int eof_;                   // true if input is exhausted
+
+  // boolean decoder
+  bit_t range_;            // current range minus 1. In [127, 254] interval.
+  bit_t value_;            // current value
+  int missing_;            // number of missing bits in value_ (8bit)
+};
+
+// Initialize the bit reader and the boolean decoder.
+void VP8InitBitReader(VP8BitReader* const br,
+                      const uint8_t* const start, const uint8_t* const end);
+
+// return the next value made of 'num_bits' bits
+uint32_t VP8GetValue(VP8BitReader* const br, int num_bits);
+static WEBP_INLINE uint32_t VP8Get(VP8BitReader* const br) {
+  return VP8GetValue(br, 1);
+}
+
+// return the next value with sign-extension.
+int32_t VP8GetSignedValue(VP8BitReader* const br, int num_bits);
+
+// Read a bit with proba 'prob'. Speed-critical function!
+extern const uint8_t kVP8Log2Range[128];
+extern const bit_t kVP8NewRange[128];
+
+void VP8LoadFinalBytes(VP8BitReader* const br);    // special case for the tail
+
+static WEBP_INLINE void VP8LoadNewBytes(VP8BitReader* const br) {
+  assert(br && br->buf_);
+  // Read 'BITS' bits at a time if possible.
+  if (br->buf_ + sizeof(lbit_t) <= br->buf_end_) {
+    // convert memory type to register type (with some zero'ing!)
+    bit_t bits;
+    lbit_t in_bits = *(lbit_t*)br->buf_;
+    br->buf_ += (BITS) >> 3;
+#if !defined(__BIG_ENDIAN__)
+#if (BITS == 32)
+#if defined(__i386__) || defined(__x86_64__)
+    __asm__ volatile("bswap %k0" : "=r"(in_bits) : "0"(in_bits));
+    bits = (bit_t)in_bits;   // 32b -> 64b zero-extension
+#elif defined(_MSC_VER)
+    bits = _byteswap_ulong(in_bits);
+#else
+    bits = (bit_t)(in_bits >> 24) | ((in_bits >> 8) & 0xff00)
+         | ((in_bits << 8) & 0xff0000)  | (in_bits << 24);
+#endif  // x86
+#elif (BITS == 16)
+    // gcc will recognize a 'rorw $8, ...' here:
+    bits = (bit_t)(in_bits >> 8) | ((in_bits & 0xff) << 8);
+#endif
+#else    // LITTLE_ENDIAN
+    bits = (bit_t)in_bits;
+#endif
+    br->value_ |= bits << br->missing_;
+    br->missing_ -= (BITS);
+  } else {
+    VP8LoadFinalBytes(br);    // no need to be inlined
+  }
+}
+
+static WEBP_INLINE int VP8BitUpdate(VP8BitReader* const br, bit_t split) {
+  const bit_t value_split = split | (MASK);
+  if (br->missing_ > 0) {  // Make sure we have at least BITS bits in 'value_'
+    VP8LoadNewBytes(br);
+  }
+  if (br->value_ > value_split) {
+    br->range_ -= value_split + 1;
+    br->value_ -= value_split + 1;
+    return 1;
+  } else {
+    br->range_ = value_split;
+    return 0;
+  }
+}
+
+static WEBP_INLINE void VP8Shift(VP8BitReader* const br) {
+  // range_ is in [0..127] interval here.
+  const int idx = br->range_ >> (BITS);
+  const int shift = kVP8Log2Range[idx];
+  br->range_ = kVP8NewRange[idx];
+  br->value_ <<= shift;
+  br->missing_ += shift;
+}
+
+static WEBP_INLINE int VP8GetBit(VP8BitReader* const br, int prob) {
+  // It's important to avoid generating a 64bit x 64bit multiply here.
+  // We just need an 8b x 8b after all.
+  const bit_t split =
+      (bit_t)((uint32_t)(br->range_ >> (BITS)) * prob) << ((BITS) - 8);
+  const int bit = VP8BitUpdate(br, split);
+  if (br->range_ <= (((bit_t)0x7e << (BITS)) | (MASK))) {
+    VP8Shift(br);
+  }
+  return bit;
+}
+
+static WEBP_INLINE int VP8GetSigned(VP8BitReader* const br, int v) {
+  const bit_t split = (br->range_ >> 1);
+  const int bit = VP8BitUpdate(br, split);
+  VP8Shift(br);
+  return bit ? -v : v;
+}
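
The comment above VP8GetBit() is the key trick: only the top byte of range_ matters for the split, so an 8-bit by 8-bit multiply shifted back into place equals the full-width product. A standalone check of that identity for BITS == 32 (illustrative only):

  #include <assert.h>
  #include <stdint.h>

  int main(void) {
    const uint32_t r8 = 203;   // top byte of range_, always in [127, 254]
    const int prob = 145;      // 8-bit probability
    const uint64_t narrow = (uint64_t)(r8 * prob) << 24;        // 8b x 8b
    const uint64_t wide = (((uint64_t)r8 << 32) * prob) >> 8;   // 64b x 64b
    assert(narrow == wide);
    return 0;
  }
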
+
+
+// -----------------------------------------------------------------------------
+// Bitreader
+
+typedef struct {
+  uint64_t       val_;
+  const uint8_t* buf_;
+  size_t         len_;
+  size_t         pos_;
+  int            bit_pos_;
+  int            eos_;
+  int            error_;
+} VP8LBitReader;
+
+void VP8LInitBitReader(VP8LBitReader* const br,
+                       const uint8_t* const start,
+                       size_t length);
+
+//  Sets a new data buffer.
+void VP8LBitReaderSetBuffer(VP8LBitReader* const br,
+                            const uint8_t* const buffer, size_t length);
+
+// Reads the specified number of bits from the read buffer.
+// Flags an error if called at end_of_stream or if n_bits exceeds the allowed
+// limit. Flags eos if this read attempt is going to cross the read buffer.
+uint32_t VP8LReadBits(VP8LBitReader* const br, int n_bits);
+
+// Reads one bit from the read buffer. Flags an error if called at
+// end_of_stream. Flags eos after reading the last bit from the buffer.
+uint32_t VP8LReadOneBit(VP8LBitReader* const br);
+
+// VP8LReadOneBitUnsafe is faster than VP8LReadOneBit, but it can be called only
+// 32 times after the last VP8LFillBitWindow. Any subsequent calls
+// (without VP8LFillBitWindow) will return invalid data.
+static WEBP_INLINE uint32_t VP8LReadOneBitUnsafe(VP8LBitReader* const br) {
+  const uint32_t val = (br->val_ >> br->bit_pos_) & 1;
+  ++br->bit_pos_;
+  return val;
+}
+
+// Advances the read buffer by 4 bytes to make room for the next 32 bits.
+void VP8LFillBitWindow(VP8LBitReader* const br);
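
VP8LReadOneBitUnsafe() skips the end-of-stream bookkeeping, so the contract above matters: refill before every run of at most 32 unsafe reads. One way to honor it, as a sketch rather than code from this change:

  // Sketch: gather up to 32 bits, LSB first, using only unsafe reads.
  static uint32_t ReadUpTo32Bits(VP8LBitReader* const br, int n_bits) {
    uint32_t v = 0;
    int i;
    VP8LFillBitWindow(br);                  // guarantees 32 readable bits
    for (i = 0; i < n_bits && i < 32; ++i) {
      v |= VP8LReadOneBitUnsafe(br) << i;   // no bounds checks inside
    }
    return v;
  }
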
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
+
+#endif  /* WEBP_UTILS_BIT_READER_H_ */
diff --git a/src/utils/bit_writer.c b/src/utils/bit_writer.c
new file mode 100644
index 0000000..671159c
--- /dev/null
+++ b/src/utils/bit_writer.c
@@ -0,0 +1,284 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// Bit writing and boolean coder
+//
+// Author: Skal (pascal.massimino@gmail.com)
+//         Vikas Arora (vikaas.arora@gmail.com)
+
+#include <assert.h>
+#include <string.h>   // for memcpy()
+#include <stdlib.h>
+#include "./bit_writer.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+//------------------------------------------------------------------------------
+// VP8BitWriter
+
+static int BitWriterResize(VP8BitWriter* const bw, size_t extra_size) {
+  uint8_t* new_buf;
+  size_t new_size;
+  const uint64_t needed_size_64b = (uint64_t)bw->pos_ + extra_size;
+  const size_t needed_size = (size_t)needed_size_64b;
+  if (needed_size_64b != needed_size) {
+    bw->error_ = 1;
+    return 0;
+  }
+  if (needed_size <= bw->max_pos_) return 1;
+  // If the following line wraps over 32bit, the test just after will catch it.
+  new_size = 2 * bw->max_pos_;
+  if (new_size < needed_size) new_size = needed_size;
+  if (new_size < 1024) new_size = 1024;
+  new_buf = (uint8_t*)malloc(new_size);
+  if (new_buf == NULL) {
+    bw->error_ = 1;
+    return 0;
+  }
+  memcpy(new_buf, bw->buf_, bw->pos_);
+  free(bw->buf_);
+  bw->buf_ = new_buf;
+  bw->max_pos_ = new_size;
+  return 1;
+}
+
+static void kFlush(VP8BitWriter* const bw) {
+  const int s = 8 + bw->nb_bits_;
+  const int32_t bits = bw->value_ >> s;
+  assert(bw->nb_bits_ >= 0);
+  bw->value_ -= bits << s;
+  bw->nb_bits_ -= 8;
+  if ((bits & 0xff) != 0xff) {
+    size_t pos = bw->pos_;
+    if (!BitWriterResize(bw, bw->run_ + 1)) {
+      return;
+    }
+    if (bits & 0x100) {  // overflow -> propagate carry over pending 0xff's
+      if (pos > 0) bw->buf_[pos - 1]++;
+    }
+    if (bw->run_ > 0) {
+      const int value = (bits & 0x100) ? 0x00 : 0xff;
+      for (; bw->run_ > 0; --bw->run_) bw->buf_[pos++] = value;
+    }
+    bw->buf_[pos++] = bits;
+    bw->pos_ = pos;
+  } else {
+    bw->run_++;   // delay writing of bytes 0xff, pending eventual carry.
+  }
+}
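
kFlush() keeps a run of pending 0xff bytes precisely because a later carry must be able to flip them to 0x00 and bump the byte written just before the run. The rule in isolation, as a standalone toy (not the writer itself):

  #include <assert.h>
  #include <stdint.h>

  // Toy: apply one carry to a buffer whose tail is a run of 0xff bytes.
  static void PropagateCarry(uint8_t* const buf, int len) {
    int i = len - 1;
    while (i >= 0 && buf[i] == 0xff) buf[i--] = 0x00;
    if (i >= 0) ++buf[i];
  }

  int main(void) {
    uint8_t buf[4] = { 0x12, 0xff, 0xff, 0xff };
    PropagateCarry(buf, 4);
    assert(buf[0] == 0x13 && buf[1] == 0x00 && buf[2] == 0x00 && buf[3] == 0x00);
    return 0;
  }
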
+
+//------------------------------------------------------------------------------
+// renormalization
+
+static const uint8_t kNorm[128] = {  // renorm_sizes[i] = 8 - log2(i)
+     7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
+  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  0
+};
+
+// range = ((range + 1) << kVP8Log2Range[range]) - 1
+static const uint8_t kNewRange[128] = {
+  127, 127, 191, 127, 159, 191, 223, 127, 143, 159, 175, 191, 207, 223, 239,
+  127, 135, 143, 151, 159, 167, 175, 183, 191, 199, 207, 215, 223, 231, 239,
+  247, 127, 131, 135, 139, 143, 147, 151, 155, 159, 163, 167, 171, 175, 179,
+  183, 187, 191, 195, 199, 203, 207, 211, 215, 219, 223, 227, 231, 235, 239,
+  243, 247, 251, 127, 129, 131, 133, 135, 137, 139, 141, 143, 145, 147, 149,
+  151, 153, 155, 157, 159, 161, 163, 165, 167, 169, 171, 173, 175, 177, 179,
+  181, 183, 185, 187, 189, 191, 193, 195, 197, 199, 201, 203, 205, 207, 209,
+  211, 213, 215, 217, 219, 221, 223, 225, 227, 229, 231, 233, 235, 237, 239,
+  241, 243, 245, 247, 249, 251, 253, 127
+};
+
+int VP8PutBit(VP8BitWriter* const bw, int bit, int prob) {
+  const int split = (bw->range_ * prob) >> 8;
+  if (bit) {
+    bw->value_ += split + 1;
+    bw->range_ -= split + 1;
+  } else {
+    bw->range_ = split;
+  }
+  if (bw->range_ < 127) {   // emit 'shift' bits out and renormalize
+    const int shift = kNorm[bw->range_];
+    bw->range_ = kNewRange[bw->range_];
+    bw->value_ <<= shift;
+    bw->nb_bits_ += shift;
+    if (bw->nb_bits_ > 0) kFlush(bw);
+  }
+  return bit;
+}
+
+int VP8PutBitUniform(VP8BitWriter* const bw, int bit) {
+  const int split = bw->range_ >> 1;
+  if (bit) {
+    bw->value_ += split + 1;
+    bw->range_ -= split + 1;
+  } else {
+    bw->range_ = split;
+  }
+  if (bw->range_ < 127) {
+    bw->range_ = kNewRange[bw->range_];
+    bw->value_ <<= 1;
+    bw->nb_bits_ += 1;
+    if (bw->nb_bits_ > 0) kFlush(bw);
+  }
+  return bit;
+}
+
+void VP8PutValue(VP8BitWriter* const bw, int value, int nb_bits) {
+  int mask;
+  for (mask = 1 << (nb_bits - 1); mask; mask >>= 1)
+    VP8PutBitUniform(bw, value & mask);
+}
+
+void VP8PutSignedValue(VP8BitWriter* const bw, int value, int nb_bits) {
+  if (!VP8PutBitUniform(bw, value != 0))
+    return;
+  if (value < 0) {
+    VP8PutValue(bw, ((-value) << 1) | 1, nb_bits + 1);
+  } else {
+    VP8PutValue(bw, value << 1, nb_bits + 1);
+  }
+}
+
+//------------------------------------------------------------------------------
+
+int VP8BitWriterInit(VP8BitWriter* const bw, size_t expected_size) {
+  bw->range_   = 255 - 1;
+  bw->value_   = 0;
+  bw->run_     = 0;
+  bw->nb_bits_ = -8;
+  bw->pos_     = 0;
+  bw->max_pos_ = 0;
+  bw->error_   = 0;
+  bw->buf_     = NULL;
+  return (expected_size > 0) ? BitWriterResize(bw, expected_size) : 1;
+}
+
+uint8_t* VP8BitWriterFinish(VP8BitWriter* const bw) {
+  VP8PutValue(bw, 0, 9 - bw->nb_bits_);
+  bw->nb_bits_ = 0;   // pad with zeroes
+  kFlush(bw);
+  return bw->buf_;
+}
+
+int VP8BitWriterAppend(VP8BitWriter* const bw,
+                       const uint8_t* data, size_t size) {
+  assert(data);
+  if (bw->nb_bits_ != -8) return 0;   // kFlush() must have been called
+  if (!BitWriterResize(bw, size)) return 0;
+  memcpy(bw->buf_ + bw->pos_, data, size);
+  bw->pos_ += size;
+  return 1;
+}
+
+void VP8BitWriterWipeOut(VP8BitWriter* const bw) {
+  if (bw) {
+    free(bw->buf_);
+    memset(bw, 0, sizeof(*bw));
+  }
+}
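
Taken together, the calls above follow a simple lifecycle: init, emit bits, finish, and eventually wipe out. A usage sketch with made-up values (illustrative only; error handling and buffer ownership stay with the caller):

  #include "./bit_writer.h"   // path as laid out by this change

  // Sketch: write a 4-bit value and a boolean-coded bit, then finalize.
  static size_t ToyEncode(VP8BitWriter* const bw) {
    if (!VP8BitWriterInit(bw, 16)) return 0;
    VP8PutValue(bw, 3, 4);          // emitted MSB first: 0, 0, 1, 1
    VP8PutBit(bw, 1, 153);          // 8-bit probability that the bit is 0
    VP8BitWriterFinish(bw);         // pads and flushes
    return VP8BitWriterSize(bw);    // byte count; data stays in bw->buf_
  }
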
+
+//------------------------------------------------------------------------------
+// VP8LBitWriter
+
+// Returns 1 on success.
+static int VP8LBitWriterResize(VP8LBitWriter* const bw, size_t extra_size) {
+  uint8_t* allocated_buf;
+  size_t allocated_size;
+  const size_t current_size = VP8LBitWriterNumBytes(bw);
+  const uint64_t size_required_64b = (uint64_t)current_size + extra_size;
+  const size_t size_required = (size_t)size_required_64b;
+  if (size_required != size_required_64b) {
+    bw->error_ = 1;
+    return 0;
+  }
+  if (bw->max_bytes_ > 0 && size_required <= bw->max_bytes_) return 1;
+  allocated_size = (3 * bw->max_bytes_) >> 1;
+  if (allocated_size < size_required) allocated_size = size_required;
+  // make allocated size multiple of 1k
+  allocated_size = (((allocated_size >> 10) + 1) << 10);
+  allocated_buf = (uint8_t*)malloc(allocated_size);
+  if (allocated_buf == NULL) {
+    bw->error_ = 1;
+    return 0;
+  }
+  memcpy(allocated_buf, bw->buf_, current_size);
+  free(bw->buf_);
+  bw->buf_ = allocated_buf;
+  bw->max_bytes_ = allocated_size;
+  memset(allocated_buf + current_size, 0, allocated_size - current_size);
+  return 1;
+}
+
+int VP8LBitWriterInit(VP8LBitWriter* const bw, size_t expected_size) {
+  memset(bw, 0, sizeof(*bw));
+  return VP8LBitWriterResize(bw, expected_size);
+}
+
+void VP8LBitWriterDestroy(VP8LBitWriter* const bw) {
+  if (bw != NULL) {
+    free(bw->buf_);
+    memset(bw, 0, sizeof(*bw));
+  }
+}
+
+void VP8LWriteBits(VP8LBitWriter* const bw, int n_bits, uint32_t bits) {
+  if (n_bits < 1) return;
+#if !defined(__BIG_ENDIAN__)
+  // Technically, this branch of the code can write up to 25 bits at a time,
+  // but in prefix encoding, the maximum number of bits written is 18 at a time.
+  {
+    uint8_t* const p = &bw->buf_[bw->bit_pos_ >> 3];
+    uint32_t v = *(const uint32_t*)p;
+    v |= bits << (bw->bit_pos_ & 7);
+    *(uint32_t*)p = v;
+    bw->bit_pos_ += n_bits;
+  }
+#else  // BIG_ENDIAN
+  {
+    uint8_t* p = &bw->buf_[bw->bit_pos_ >> 3];
+    const int bits_reserved_in_first_byte = bw->bit_pos_ & 7;
+    const int bits_left_to_write = n_bits - 8 + bits_reserved_in_first_byte;
+    // implicit & 0xff is assumed for uint8_t arithmetic
+    *p++ |= bits << bits_reserved_in_first_byte;
+    bits >>= 8 - bits_reserved_in_first_byte;
+    if (bits_left_to_write >= 1) {
+      *p++ = bits;
+      bits >>= 8;
+      if (bits_left_to_write >= 9) {
+        *p++ = bits;
+        bits >>= 8;
+      }
+    }
+    assert(n_bits <= 25);
+    *p = bits;
+    bw->bit_pos_ += n_bits;
+  }
+#endif
+  if ((bw->bit_pos_ >> 3) > (bw->max_bytes_ - 8)) {
+    const uint64_t extra_size = 32768ULL + bw->max_bytes_;
+    if (extra_size != (size_t)extra_size ||
+        !VP8LBitWriterResize(bw, (size_t)extra_size)) {
+      bw->bit_pos_ = 0;
+      bw->error_ = 1;
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/src/utils/bit_writer.h b/src/utils/bit_writer.h
new file mode 100644
index 0000000..30f71a8
--- /dev/null
+++ b/src/utils/bit_writer.h
@@ -0,0 +1,123 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// Bit writing and boolean coder
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#ifndef WEBP_UTILS_BIT_WRITER_H_
+#define WEBP_UTILS_BIT_WRITER_H_
+
+#include "webp/types.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+//------------------------------------------------------------------------------
+// Bit-writing
+
+typedef struct VP8BitWriter VP8BitWriter;
+struct VP8BitWriter {
+  int32_t  range_;      // range-1
+  int32_t  value_;
+  int      run_;        // number of outstanding bits
+  int      nb_bits_;    // number of pending bits
+  uint8_t* buf_;        // internal buffer. Re-allocated regularly. Not owned.
+  size_t   pos_;
+  size_t   max_pos_;
+  int      error_;      // true in case of error
+};
+
+// Initialize the object. Allocates some initial memory based on expected_size.
+int VP8BitWriterInit(VP8BitWriter* const bw, size_t expected_size);
+// Finalize the bitstream coding. Returns a pointer to the internal buffer.
+uint8_t* VP8BitWriterFinish(VP8BitWriter* const bw);
+// Releases any pending memory and zeroes the object. Not a mandatory call.
+// Only useful in case of error, when the internal buffer hasn't been grabbed!
+void VP8BitWriterWipeOut(VP8BitWriter* const bw);
+
+int VP8PutBit(VP8BitWriter* const bw, int bit, int prob);
+int VP8PutBitUniform(VP8BitWriter* const bw, int bit);
+void VP8PutValue(VP8BitWriter* const bw, int value, int nb_bits);
+void VP8PutSignedValue(VP8BitWriter* const bw, int value, int nb_bits);
+
+// Appends some bytes to the internal buffer. Data is copied.
+int VP8BitWriterAppend(VP8BitWriter* const bw,
+                       const uint8_t* data, size_t size);
+
+// return approximate write position (in bits)
+static WEBP_INLINE uint64_t VP8BitWriterPos(const VP8BitWriter* const bw) {
+  return (uint64_t)(bw->pos_ + bw->run_) * 8 + 8 + bw->nb_bits_;
+}
+
+// Returns a pointer to the internal buffer.
+static WEBP_INLINE uint8_t* VP8BitWriterBuf(const VP8BitWriter* const bw) {
+  return bw->buf_;
+}
+// Returns the size of the internal buffer.
+static WEBP_INLINE size_t VP8BitWriterSize(const VP8BitWriter* const bw) {
+  return bw->pos_;
+}
+
+//------------------------------------------------------------------------------
+// VP8LBitWriter
+// TODO(vikasa): VP8LBitWriter is copied as-is from the lossless code. There's
+// scope for re-using VP8BitWriter. Will evaluate once the basic lossless
+// encoder is implemented.
+
+typedef struct {
+  uint8_t* buf_;
+  size_t bit_pos_;
+  size_t max_bytes_;
+
+  // After all bits are written, the caller must observe the state of
+  // error_. A value of 1 indicates that a memory allocation failure
+  // has happened during bit writing. A value of 0 indicates successful
+  // writing of bits.
+  int error_;
+} VP8LBitWriter;
+
+static WEBP_INLINE size_t VP8LBitWriterNumBytes(VP8LBitWriter* const bw) {
+  return (bw->bit_pos_ + 7) >> 3;
+}
+
+static WEBP_INLINE uint8_t* VP8LBitWriterFinish(VP8LBitWriter* const bw) {
+  return bw->buf_;
+}
+
+// Returns 0 in case of memory allocation error.
+int VP8LBitWriterInit(VP8LBitWriter* const bw, size_t expected_size);
+
+void VP8LBitWriterDestroy(VP8LBitWriter* const bw);
+
+// This function writes bits into bytes in increasing addresses, and within
+// a byte least-significant-bit first.
+//
+// The function can write up to 16 bits in one go with WriteBits
+// Example: let's assume that 3 bits (Rs below) have been written already:
+//
+// BYTE-0     BYTE+1       BYTE+2
+//
+// 0000 0RRR    0000 0000    0000 0000
+//
+// Now, we could write 5 or fewer bits in MSB by just shifting by 3
+// and OR'ing to BYTE-0.
+//
+// For n bits, we take the last 5 bits, OR that with the high bits in BYTE-0,
+// and locate the rest in BYTE+1 and BYTE+2.
+//
+// VP8LBitWriter's error_ flag is set in case of memory allocation error.
+void VP8LWriteBits(VP8LBitWriter* const bw, int n_bits, uint32_t bits);
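
The layout comment above is easiest to check with a tiny LSB-first writer over a plain byte array: with three bits already used, the next five land in the high bits of BYTE-0. A standalone toy, independent of VP8LBitWriter:

  #include <assert.h>
  #include <stddef.h>
  #include <stdint.h>

  // Toy LSB-first bit writer following the byte layout described above.
  static void ToyWriteBits(uint8_t* const buf, size_t* const bit_pos,
                           int n_bits, uint32_t bits) {
    int i;
    for (i = 0; i < n_bits; ++i, ++*bit_pos) {
      buf[*bit_pos >> 3] |= (uint8_t)(((bits >> i) & 1) << (*bit_pos & 7));
    }
  }

  int main(void) {
    uint8_t buf[3] = { 0x07, 0x00, 0x00 };   // three bits (RRR) already written
    size_t bit_pos = 3;
    ToyWriteBits(buf, &bit_pos, 5, 0x15);    // 10101b, least-significant first
    assert(buf[0] == 0xaf && bit_pos == 8);  // 0x07 | (0x15 << 3)
    return 0;
  }
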
+
+//------------------------------------------------------------------------------
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
+
+#endif  /* WEBP_UTILS_BIT_WRITER_H_ */
diff --git a/src/utils/color_cache.c b/src/utils/color_cache.c
new file mode 100644
index 0000000..560f81d
--- /dev/null
+++ b/src/utils/color_cache.c
@@ -0,0 +1,44 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// Color Cache for WebP Lossless
+//
+// Author: Jyrki Alakuijala (jyrki@google.com)
+
+#include <assert.h>
+#include <stdlib.h>
+#include "./color_cache.h"
+#include "../utils/utils.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+//------------------------------------------------------------------------------
+// VP8LColorCache.
+
+int VP8LColorCacheInit(VP8LColorCache* const cc, int hash_bits) {
+  const int hash_size = 1 << hash_bits;
+  assert(cc != NULL);
+  assert(hash_bits > 0);
+  cc->colors_ = (uint32_t*)WebPSafeCalloc((uint64_t)hash_size,
+                                          sizeof(*cc->colors_));
+  if (cc->colors_ == NULL) return 0;
+  cc->hash_shift_ = 32 - hash_bits;
+  return 1;
+}
+
+void VP8LColorCacheClear(VP8LColorCache* const cc) {
+  if (cc != NULL) {
+    free(cc->colors_);
+    cc->colors_ = NULL;
+  }
+}
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}
+#endif
diff --git a/src/utils/color_cache.h b/src/utils/color_cache.h
new file mode 100644
index 0000000..a587531
--- /dev/null
+++ b/src/utils/color_cache.h
@@ -0,0 +1,68 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// Color Cache for WebP Lossless
+//
+// Authors: Jyrki Alakuijala (jyrki@google.com)
+//          Urvang Joshi (urvang@google.com)
+
+#ifndef WEBP_UTILS_COLOR_CACHE_H_
+#define WEBP_UTILS_COLOR_CACHE_H_
+
+#include "webp/types.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+// Main color cache struct.
+typedef struct {
+  uint32_t *colors_;  // color entries
+  int hash_shift_;    // Hash shift: 32 - hash_bits.
+} VP8LColorCache;
+
+static const uint32_t kHashMul = 0x1e35a7bd;
+
+static WEBP_INLINE uint32_t VP8LColorCacheLookup(
+    const VP8LColorCache* const cc, uint32_t key) {
+  assert(key <= (~0U >> cc->hash_shift_));
+  return cc->colors_[key];
+}
+
+static WEBP_INLINE void VP8LColorCacheInsert(const VP8LColorCache* const cc,
+                                             uint32_t argb) {
+  const uint32_t key = (kHashMul * argb) >> cc->hash_shift_;
+  cc->colors_[key] = argb;
+}
+
+static WEBP_INLINE int VP8LColorCacheGetIndex(const VP8LColorCache* const cc,
+                                              uint32_t argb) {
+  return (kHashMul * argb) >> cc->hash_shift_;
+}
+
+static WEBP_INLINE int VP8LColorCacheContains(const VP8LColorCache* const cc,
+                                              uint32_t argb) {
+  const uint32_t key = (kHashMul * argb) >> cc->hash_shift_;
+  return cc->colors_[key] == argb;
+}
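
The cache is a plain hash-addressed array: kHashMul spreads the ARGB value and the top hash_bits bits select the slot, so entries can collide and simply overwrite each other. A small usage sketch with a hypothetical 10-bit cache:

  #include "./color_cache.h"   // path as laid out by this change

  // Sketch: insert one pixel and check that it is found again.
  static int ToyCacheDemo(void) {
    VP8LColorCache cc;
    int hit = 0;
    if (!VP8LColorCacheInit(&cc, 10)) return 0;       // 1024 slots
    VP8LColorCacheInsert(&cc, 0xff00ff00u);
    hit = VP8LColorCacheContains(&cc, 0xff00ff00u);   // 1: same key, same color
    VP8LColorCacheClear(&cc);
    return hit;
  }
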
+
+//------------------------------------------------------------------------------
+
+// Initializes the color cache with 'hash_bits' bits for the keys.
+// Returns false in case of memory error.
+int VP8LColorCacheInit(VP8LColorCache* const color_cache, int hash_bits);
+
+// Deletes the memory associated with the color cache.
+void VP8LColorCacheClear(VP8LColorCache* const color_cache);
+
+//------------------------------------------------------------------------------
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}
+#endif
+
+#endif  // WEBP_UTILS_COLOR_CACHE_H_
diff --git a/src/utils/filters.c b/src/utils/filters.c
new file mode 100644
index 0000000..08f52a3
--- /dev/null
+++ b/src/utils/filters.c
@@ -0,0 +1,229 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// Spatial prediction using various filters
+//
+// Author: Urvang (urvang@google.com)
+
+#include "./filters.h"
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+//------------------------------------------------------------------------------
+// Helpful macro.
+
+# define SANITY_CHECK(in, out)                              \
+  assert(in != NULL);                                       \
+  assert(out != NULL);                                      \
+  assert(width > 0);                                        \
+  assert(height > 0);                                       \
+  assert(bpp > 0);                                          \
+  assert(stride >= width * bpp);
+
+static WEBP_INLINE void PredictLine(const uint8_t* src, const uint8_t* pred,
+                                    uint8_t* dst, int length, int inverse) {
+  int i;
+  if (inverse) {
+    for (i = 0; i < length; ++i) dst[i] = src[i] + pred[i];
+  } else {
+    for (i = 0; i < length; ++i) dst[i] = src[i] - pred[i];
+  }
+}
+
+//------------------------------------------------------------------------------
+// Horizontal filter.
+
+static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
+    int width, int height, int bpp, int stride, int inverse, uint8_t* out) {
+  int h;
+  const uint8_t* preds = (inverse ? out : in);
+  SANITY_CHECK(in, out);
+
+  // Filter line-by-line.
+  for (h = 0; h < height; ++h) {
+    // Leftmost pixel is predicted from above (except for topmost scanline).
+    if (h == 0) {
+      memcpy((void*)out, (const void*)in, bpp);
+    } else {
+      PredictLine(in, preds - stride, out, bpp, inverse);
+    }
+    PredictLine(in + bpp, preds, out + bpp, bpp * (width - 1), inverse);
+    preds += stride;
+    in += stride;
+    out += stride;
+  }
+}
+
+static void HorizontalFilter(const uint8_t* data, int width, int height,
+                             int bpp, int stride, uint8_t* filtered_data) {
+  DoHorizontalFilter(data, width, height, bpp, stride, 0, filtered_data);
+}
+
+static void HorizontalUnfilter(const uint8_t* data, int width, int height,
+                               int bpp, int stride, uint8_t* recon_data) {
+  DoHorizontalFilter(data, width, height, bpp, stride, 1, recon_data);
+}
+
+//------------------------------------------------------------------------------
+// Vertical filter.
+
+static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
+    int width, int height, int bpp, int stride, int inverse, uint8_t* out) {
+  int h;
+  const uint8_t* preds = (inverse ? out : in);
+  SANITY_CHECK(in, out);
+
+  // Very first top-left pixel is copied.
+  memcpy((void*)out, (const void*)in, bpp);
+  // Rest of top scan-line is left-predicted.
+  PredictLine(in + bpp, preds, out + bpp, bpp * (width - 1), inverse);
+
+  // Filter line-by-line.
+  for (h = 1; h < height; ++h) {
+    in += stride;
+    out += stride;
+    PredictLine(in, preds, out, bpp * width, inverse);
+    preds += stride;
+  }
+}
+
+static void VerticalFilter(const uint8_t* data, int width, int height,
+                           int bpp, int stride, uint8_t* filtered_data) {
+  DoVerticalFilter(data, width, height, bpp, stride, 0, filtered_data);
+}
+
+static void VerticalUnfilter(const uint8_t* data, int width, int height,
+                             int bpp, int stride, uint8_t* recon_data) {
+  DoVerticalFilter(data, width, height, bpp, stride, 1, recon_data);
+}
+
+//------------------------------------------------------------------------------
+// Gradient filter.
+
+static WEBP_INLINE int GradientPredictor(uint8_t a, uint8_t b, uint8_t c) {
+  const int g = a + b - c;
+  return (g < 0) ? 0 : (g > 255) ? 255 : g;
+}
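
GradientPredictor() is the usual a + b - c estimate clamped to the byte range; two of the three checks below exercise the clamping. A standalone restatement of the same rule (not the library function itself):

  #include <assert.h>
  #include <stdint.h>

  // Same clamped a + b - c rule as GradientPredictor(), restated standalone.
  static int ClampedGradient(uint8_t a, uint8_t b, uint8_t c) {
    const int g = a + b - c;
    return (g < 0) ? 0 : (g > 255) ? 255 : g;
  }

  int main(void) {
    assert(ClampedGradient(200, 100, 20) == 255);   // 280 clamps down to 255
    assert(ClampedGradient(10, 5, 200) == 0);       // -185 clamps up to 0
    assert(ClampedGradient(90, 80, 70) == 100);     // in range: plain a + b - c
    return 0;
  }
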
+
+static WEBP_INLINE
+void DoGradientFilter(const uint8_t* in, int width, int height,
+                      int bpp, int stride, int inverse, uint8_t* out) {
+  const uint8_t* preds = (inverse ? out : in);
+  int h;
+  SANITY_CHECK(in, out);
+
+  // left prediction for top scan-line
+  memcpy((void*)out, (const void*)in, bpp);
+  PredictLine(in + bpp, preds, out + bpp, bpp * (width - 1), inverse);
+
+  // Filter line-by-line.
+  for (h = 1; h < height; ++h) {
+    int w;
+    preds += stride;
+    in += stride;
+    out += stride;
+    // leftmost pixel: predict from above.
+    PredictLine(in, preds - stride, out, bpp, inverse);
+    for (w = bpp; w < width * bpp; ++w) {
+      const int pred = GradientPredictor(preds[w - bpp],
+                                         preds[w - stride],
+                                         preds[w - stride - bpp]);
+      out[w] = in[w] + (inverse ? pred : -pred);
+    }
+  }
+}
+
+static void GradientFilter(const uint8_t* data, int width, int height,
+                           int bpp, int stride, uint8_t* filtered_data) {
+  DoGradientFilter(data, width, height, bpp, stride, 0, filtered_data);
+}
+
+static void GradientUnfilter(const uint8_t* data, int width, int height,
+                             int bpp, int stride, uint8_t* recon_data) {
+  DoGradientFilter(data, width, height, bpp, stride, 1, recon_data);
+}
+
+#undef SANITY_CHECK
+
+// -----------------------------------------------------------------------------
+// Quick estimate of a potentially interesting filter mode to try, in addition
+// to the default NONE.
+
+#define SMAX 16
+#define SDIFF(a, b) (abs((a) - (b)) >> 4)   // Scoring diff, in [0..SMAX)
+
+WEBP_FILTER_TYPE EstimateBestFilter(const uint8_t* data,
+                                    int width, int height, int stride) {
+  int i, j;
+  int bins[WEBP_FILTER_LAST][SMAX];
+  memset(bins, 0, sizeof(bins));
+  // We only sample every other pixel. That's enough.
+  for (j = 2; j < height - 1; j += 2) {
+    const uint8_t* const p = data + j * stride;
+    int mean = p[0];
+    for (i = 2; i < width - 1; i += 2) {
+      const int diff0 = SDIFF(p[i], mean);
+      const int diff1 = SDIFF(p[i], p[i - 1]);
+      const int diff2 = SDIFF(p[i], p[i - width]);
+      const int grad_pred =
+          GradientPredictor(p[i - 1], p[i - width], p[i - width - 1]);
+      const int diff3 = SDIFF(p[i], grad_pred);
+      bins[WEBP_FILTER_NONE][diff0] = 1;
+      bins[WEBP_FILTER_HORIZONTAL][diff1] = 1;
+      bins[WEBP_FILTER_VERTICAL][diff2] = 1;
+      bins[WEBP_FILTER_GRADIENT][diff3] = 1;
+      mean = (3 * mean + p[i] + 2) >> 2;
+    }
+  }
+  {
+    WEBP_FILTER_TYPE filter, best_filter = WEBP_FILTER_NONE;
+    int best_score = 0x7fffffff;
+    for (filter = WEBP_FILTER_NONE; filter < WEBP_FILTER_LAST; ++filter) {
+      int score = 0;
+      for (i = 0; i < SMAX; ++i) {
+        if (bins[filter][i] > 0) {
+          score += i;
+        }
+      }
+      if (score < best_score) {
+        best_score = score;
+        best_filter = filter;
+      }
+    }
+    return best_filter;
+  }
+}
+
+#undef SMAX
+#undef SDIFF
+
+//------------------------------------------------------------------------------
+
+const WebPFilterFunc WebPFilters[WEBP_FILTER_LAST] = {
+  NULL,              // WEBP_FILTER_NONE
+  HorizontalFilter,  // WEBP_FILTER_HORIZONTAL
+  VerticalFilter,    // WEBP_FILTER_VERTICAL
+  GradientFilter     // WEBP_FILTER_GRADIENT
+};
+
+const WebPFilterFunc WebPUnfilters[WEBP_FILTER_LAST] = {
+  NULL,                // WEBP_FILTER_NONE
+  HorizontalUnfilter,  // WEBP_FILTER_HORIZONTAL
+  VerticalUnfilter,    // WEBP_FILTER_VERTICAL
+  GradientUnfilter     // WEBP_FILTER_GRADIENT
+};
+
+//------------------------------------------------------------------------------
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/src/utils/filters.h b/src/utils/filters.h
new file mode 100644
index 0000000..4989bb7
--- /dev/null
+++ b/src/utils/filters.h
@@ -0,0 +1,54 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// Spatial prediction using various filters
+//
+// Author: Urvang (urvang@google.com)
+
+#ifndef WEBP_UTILS_FILTERS_H_
+#define WEBP_UTILS_FILTERS_H_
+
+#include "webp/types.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+// Filters.
+typedef enum {
+  WEBP_FILTER_NONE = 0,
+  WEBP_FILTER_HORIZONTAL,
+  WEBP_FILTER_VERTICAL,
+  WEBP_FILTER_GRADIENT,
+  WEBP_FILTER_LAST = WEBP_FILTER_GRADIENT + 1,  // end marker
+  WEBP_FILTER_BEST,
+  WEBP_FILTER_FAST
+} WEBP_FILTER_TYPE;
+
+typedef void (*WebPFilterFunc)(const uint8_t* in, int width, int height,
+                               int bpp, int stride, uint8_t* out);
+
+// Filter the given data using the given predictor.
+// 'in' corresponds to a 2-dimensional pixel array of size (stride * height)
+// in raster order.
+// 'bpp' is the number of bytes per pixel, and
+// 'stride' is the number of bytes per scan line (with possible padding).
+// 'out' should be pre-allocated.
+extern const WebPFilterFunc WebPFilters[WEBP_FILTER_LAST];
+
+// Reconstruct the original data from the given filtered data.
+extern const WebPFilterFunc WebPUnfilters[WEBP_FILTER_LAST];
+
+// Fast estimate of a potentially good filter.
+extern WEBP_FILTER_TYPE EstimateBestFilter(const uint8_t* data,
+                                           int width, int height, int stride);
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
+
+#endif  /* WEBP_UTILS_FILTERS_H_ */
diff --git a/src/utils/huffman.c b/src/utils/huffman.c
new file mode 100644
index 0000000..54d85f9
--- /dev/null
+++ b/src/utils/huffman.c
@@ -0,0 +1,238 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// Utilities for building and looking up Huffman trees.
+//
+// Author: Urvang Joshi (urvang@google.com)
+
+#include <assert.h>
+#include <stdlib.h>
+#include "./huffman.h"
+#include "../utils/utils.h"
+#include "webp/format_constants.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+#define NON_EXISTENT_SYMBOL (-1)
+
+static void TreeNodeInit(HuffmanTreeNode* const node) {
+  node->children_ = -1;   // means: 'unassigned so far'
+}
+
+static int NodeIsEmpty(const HuffmanTreeNode* const node) {
+  return (node->children_ < 0);
+}
+
+static int IsFull(const HuffmanTree* const tree) {
+  return (tree->num_nodes_ == tree->max_nodes_);
+}
+
+static void AssignChildren(HuffmanTree* const tree,
+                           HuffmanTreeNode* const node) {
+  HuffmanTreeNode* const children = tree->root_ + tree->num_nodes_;
+  node->children_ = (int)(children - node);
+  assert(children - node == (int)(children - node));
+  tree->num_nodes_ += 2;
+  TreeNodeInit(children + 0);
+  TreeNodeInit(children + 1);
+}
+
+static int TreeInit(HuffmanTree* const tree, int num_leaves) {
+  assert(tree != NULL);
+  if (num_leaves == 0) return 0;
+  // We allocate maximum possible nodes in the tree at once.
+  // Note that a Huffman tree is a full binary tree; and in a full binary tree
+  // with L leaves, the total number of nodes N = 2 * L - 1.
+  tree->max_nodes_ = 2 * num_leaves - 1;
+  tree->root_ = (HuffmanTreeNode*)WebPSafeMalloc((uint64_t)tree->max_nodes_,
+                                                 sizeof(*tree->root_));
+  if (tree->root_ == NULL) return 0;
+  TreeNodeInit(tree->root_);  // Initialize root.
+  tree->num_nodes_ = 1;
+  return 1;
+}
+
+void HuffmanTreeRelease(HuffmanTree* const tree) {
+  if (tree != NULL) {
+    free(tree->root_);
+    tree->root_ = NULL;
+    tree->max_nodes_ = 0;
+    tree->num_nodes_ = 0;
+  }
+}
+
+int HuffmanCodeLengthsToCodes(const int* const code_lengths,
+                              int code_lengths_size, int* const huff_codes) {
+  int symbol;
+  int code_len;
+  int code_length_hist[MAX_ALLOWED_CODE_LENGTH + 1] = { 0 };
+  int curr_code;
+  int next_codes[MAX_ALLOWED_CODE_LENGTH + 1] = { 0 };
+  int max_code_length = 0;
+
+  assert(code_lengths != NULL);
+  assert(code_lengths_size > 0);
+  assert(huff_codes != NULL);
+
+  // Calculate max code length.
+  for (symbol = 0; symbol < code_lengths_size; ++symbol) {
+    if (code_lengths[symbol] > max_code_length) {
+      max_code_length = code_lengths[symbol];
+    }
+  }
+  if (max_code_length > MAX_ALLOWED_CODE_LENGTH) return 0;
+
+  // Calculate code length histogram.
+  for (symbol = 0; symbol < code_lengths_size; ++symbol) {
+    ++code_length_hist[code_lengths[symbol]];
+  }
+  code_length_hist[0] = 0;
+
+  // Calculate the initial values of 'next_codes' for each code length.
+  // next_codes[code_len] denotes the code to be assigned to the next symbol
+  // of code length 'code_len'.
+  curr_code = 0;
+  next_codes[0] = -1;  // Unused, as code length = 0 implies code doesn't exist.
+  for (code_len = 1; code_len <= max_code_length; ++code_len) {
+    curr_code = (curr_code + code_length_hist[code_len - 1]) << 1;
+    next_codes[code_len] = curr_code;
+  }
+
+  // Get symbols.
+  for (symbol = 0; symbol < code_lengths_size; ++symbol) {
+    if (code_lengths[symbol] > 0) {
+      huff_codes[symbol] = next_codes[code_lengths[symbol]]++;
+    } else {
+      huff_codes[symbol] = NON_EXISTENT_SYMBOL;
+    }
+  }
+  return 1;
+}
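
HuffmanCodeLengthsToCodes() implements the canonical-code construction: shorter codes get the numerically smaller values, and codes of equal length are assigned in symbol order. A worked sketch; the expected values in the comment follow directly from that rule:

  #include "./huffman.h"   // path as laid out by this change

  // Sketch: canonical codes for the code lengths {2, 1, 3, 3}.
  static void ToyCanonicalCodes(void) {
    const int lengths[4] = { 2, 1, 3, 3 };
    int codes[4] = { 0, 0, 0, 0 };
    const int ok = HuffmanCodeLengthsToCodes(lengths, 4, codes);
    // On success codes[] is { 2, 0, 6, 7 }, i.e. 10b, 0b, 110b and 111b.
    (void)ok;
  }
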
+
+static int TreeAddSymbol(HuffmanTree* const tree,
+                         int symbol, int code, int code_length) {
+  HuffmanTreeNode* node = tree->root_;
+  const HuffmanTreeNode* const max_node = tree->root_ + tree->max_nodes_;
+  while (code_length-- > 0) {
+    if (node >= max_node) {
+      return 0;
+    }
+    if (NodeIsEmpty(node)) {
+      if (IsFull(tree)) return 0;    // error: too many symbols.
+      AssignChildren(tree, node);
+    } else if (HuffmanTreeNodeIsLeaf(node)) {
+      return 0;  // leaf is already occupied.
+    }
+    node += node->children_ + ((code >> code_length) & 1);
+  }
+  if (NodeIsEmpty(node)) {
+    node->children_ = 0;      // turn newly created node into a leaf.
+  } else if (!HuffmanTreeNodeIsLeaf(node)) {
+    return 0;   // trying to assign a symbol to already used code.
+  }
+  node->symbol_ = symbol;  // Add symbol in this node.
+  return 1;
+}
+
+int HuffmanTreeBuildImplicit(HuffmanTree* const tree,
+                             const int* const code_lengths,
+                             int code_lengths_size) {
+  int symbol;
+  int num_symbols = 0;
+  int root_symbol = 0;
+
+  assert(tree != NULL);
+  assert(code_lengths != NULL);
+
+  // Find out number of symbols and the root symbol.
+  for (symbol = 0; symbol < code_lengths_size; ++symbol) {
+    if (code_lengths[symbol] > 0) {
+      // Note: code length = 0 indicates non-existent symbol.
+      ++num_symbols;
+      root_symbol = symbol;
+    }
+  }
+
+  // Initialize the tree. Will fail for num_symbols = 0
+  if (!TreeInit(tree, num_symbols)) return 0;
+
+  // Build tree.
+  if (num_symbols == 1) {  // Trivial case.
+    const int max_symbol = code_lengths_size;
+    if (root_symbol < 0 || root_symbol >= max_symbol) {
+      HuffmanTreeRelease(tree);
+      return 0;
+    }
+    return TreeAddSymbol(tree, root_symbol, 0, 0);
+  } else {  // Normal case.
+    int ok = 0;
+
+    // Get Huffman codes from the code lengths.
+    int* const codes =
+        (int*)WebPSafeMalloc((uint64_t)code_lengths_size, sizeof(*codes));
+    if (codes == NULL) goto End;
+
+    if (!HuffmanCodeLengthsToCodes(code_lengths, code_lengths_size, codes)) {
+      goto End;
+    }
+
+    // Add symbols one-by-one.
+    for (symbol = 0; symbol < code_lengths_size; ++symbol) {
+      if (code_lengths[symbol] > 0) {
+        if (!TreeAddSymbol(tree, symbol, codes[symbol], code_lengths[symbol])) {
+          goto End;
+        }
+      }
+    }
+    ok = 1;
+ End:
+    free(codes);
+    ok = ok && IsFull(tree);
+    if (!ok) HuffmanTreeRelease(tree);
+    return ok;
+  }
+}
+
+int HuffmanTreeBuildExplicit(HuffmanTree* const tree,
+                             const int* const code_lengths,
+                             const int* const codes,
+                             const int* const symbols, int max_symbol,
+                             int num_symbols) {
+  int ok = 0;
+  int i;
+
+  assert(tree != NULL);
+  assert(code_lengths != NULL);
+  assert(codes != NULL);
+  assert(symbols != NULL);
+
+  // Initialize the tree. Will fail if num_symbols = 0.
+  if (!TreeInit(tree, num_symbols)) return 0;
+
+  // Add symbols one-by-one.
+  for (i = 0; i < num_symbols; ++i) {
+    if (codes[i] != NON_EXISTENT_SYMBOL) {
+      if (symbols[i] < 0 || symbols[i] >= max_symbol) {
+        goto End;
+      }
+      if (!TreeAddSymbol(tree, symbols[i], codes[i], code_lengths[i])) {
+        goto End;
+      }
+    }
+  }
+  ok = 1;
+ End:
+  ok = ok && IsFull(tree);
+  if (!ok) HuffmanTreeRelease(tree);
+  return ok;
+}
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/src/utils/huffman.h b/src/utils/huffman.h
new file mode 100644
index 0000000..19f50ec
--- /dev/null
+++ b/src/utils/huffman.h
@@ -0,0 +1,78 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// Utilities for building and looking up Huffman trees.
+//
+// Author: Urvang Joshi (urvang@google.com)
+
+#ifndef WEBP_UTILS_HUFFMAN_H_
+#define WEBP_UTILS_HUFFMAN_H_
+
+#include <assert.h>
+#include "webp/types.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+// A node of a Huffman tree.
+typedef struct {
+  int symbol_;
+  int children_;  // delta offset to both children (contiguous) or 0 if leaf.
+} HuffmanTreeNode;
+
+// Huffman Tree.
+typedef struct HuffmanTree HuffmanTree;
+struct HuffmanTree {
+  HuffmanTreeNode* root_;   // all the nodes, starting at root.
+  int max_nodes_;           // max number of nodes
+  int num_nodes_;           // number of currently occupied nodes
+};
+
+// Returns true if the given node is a leaf of the Huffman tree.
+static WEBP_INLINE int HuffmanTreeNodeIsLeaf(
+    const HuffmanTreeNode* const node) {
+  return (node->children_ == 0);
+}
+
+// Go down one level. Most critical function. 'right_child' must be 0 or 1.
+static WEBP_INLINE const HuffmanTreeNode* HuffmanTreeNextNode(
+    const HuffmanTreeNode* node, int right_child) {
+  return node + node->children_ + right_child;
+}
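
HuffmanTreeNextNode() is the whole decoding step: start at the root and follow one child per input bit until a leaf is reached. A sketch of that loop driven by the VP8L bit reader; the real decoder batches its bit reads, so treat this as illustration only:

  #include "./bit_reader.h"   // for VP8LBitReader / VP8LReadOneBit
  #include "./huffman.h"

  // Sketch: decode a single symbol by walking the tree bit by bit.
  static int ToyDecodeSymbol(const HuffmanTree* const tree,
                             VP8LBitReader* const br) {
    const HuffmanTreeNode* node = tree->root_;
    while (!HuffmanTreeNodeIsLeaf(node)) {
      node = HuffmanTreeNextNode(node, (int)VP8LReadOneBit(br));
    }
    return node->symbol_;
  }
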
+
+// Releases the nodes of the Huffman tree.
+// Note: It does NOT free 'tree' itself.
+void HuffmanTreeRelease(HuffmanTree* const tree);
+
+// Builds Huffman tree assuming code lengths are implicitly in symbol order.
+// Returns false in case of error (invalid tree or memory error).
+int HuffmanTreeBuildImplicit(HuffmanTree* const tree,
+                             const int* const code_lengths,
+                             int code_lengths_size);
+
+// Build a Huffman tree with explicitly given lists of code lengths, codes
+// and symbols. Verifies that all symbols added are smaller than max_symbol.
+// Returns false in case of an invalid symbol, invalid tree or memory error.
+int HuffmanTreeBuildExplicit(HuffmanTree* const tree,
+                             const int* const code_lengths,
+                             const int* const codes,
+                             const int* const symbols, int max_symbol,
+                             int num_symbols);
+
+// Utility: converts Huffman code lengths to corresponding Huffman codes.
+// 'huff_codes' should be pre-allocated.
+// Returns false in case of error (memory allocation, invalid codes).
+int HuffmanCodeLengthsToCodes(const int* const code_lengths,
+                              int code_lengths_size, int* const huff_codes);
+
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
+
+#endif  // WEBP_UTILS_HUFFMAN_H_
diff --git a/src/utils/huffman_encode.c b/src/utils/huffman_encode.c
new file mode 100644
index 0000000..2686c66
--- /dev/null
+++ b/src/utils/huffman_encode.c
@@ -0,0 +1,439 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// Author: Jyrki Alakuijala (jyrki@google.com)
+//
+// Entropy encoding (Huffman) for webp lossless.
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+#include "./huffman_encode.h"
+#include "../utils/utils.h"
+#include "webp/format_constants.h"
+
+// -----------------------------------------------------------------------------
+// Util function to optimize the symbol map for RLE coding
+
+// Heuristics for selecting the stride ranges to collapse.
+static int ValuesShouldBeCollapsedToStrideAverage(int a, int b) {
+  return abs(a - b) < 4;
+}
+
+// Change the population counts in a way that the consequent
+// Huffman tree compression, especially its RLE part, gives smaller output.
+static int OptimizeHuffmanForRle(int length, int* const counts) {
+  uint8_t* good_for_rle;
+  // 1) Let's make the Huffman code more compatible with rle encoding.
+  int i;
+  for (; length >= 0; --length) {
+    if (length == 0) {
+      return 1;  // All zeros.
+    }
+    if (counts[length - 1] != 0) {
+      // Now counts[0..length - 1] does not have trailing zeros.
+      break;
+    }
+  }
+  // 2) Let's mark all population counts that already can be encoded
+  // with an rle code.
+  good_for_rle = (uint8_t*)calloc(length, 1);
+  if (good_for_rle == NULL) {
+    return 0;
+  }
+  {
+    // Let's not spoil any of the existing good rle codes.
+    // Mark any seq of 0's that is longer than 5 as a good_for_rle.
+    // Mark any seq of non-0's that is longer than 7 as a good_for_rle.
+    int symbol = counts[0];
+    int stride = 0;
+    for (i = 0; i < length + 1; ++i) {
+      if (i == length || counts[i] != symbol) {
+        if ((symbol == 0 && stride >= 5) ||
+            (symbol != 0 && stride >= 7)) {
+          int k;
+          for (k = 0; k < stride; ++k) {
+            good_for_rle[i - k - 1] = 1;
+          }
+        }
+        stride = 1;
+        if (i != length) {
+          symbol = counts[i];
+        }
+      } else {
+        ++stride;
+      }
+    }
+  }
+  // 3) Let's replace those population counts that lead to more rle codes.
+  {
+    int stride = 0;
+    int limit = counts[0];
+    int sum = 0;
+    for (i = 0; i < length + 1; ++i) {
+      if (i == length || good_for_rle[i] ||
+          (i != 0 && good_for_rle[i - 1]) ||
+          !ValuesShouldBeCollapsedToStrideAverage(counts[i], limit)) {
+        if (stride >= 4 || (stride >= 3 && sum == 0)) {
+          int k;
+          // The stride must end, collapse what we have, if we have enough (4).
+          int count = (sum + stride / 2) / stride;
+          if (count < 1) {
+            count = 1;
+          }
+          if (sum == 0) {
+            // Don't let an all-zeros stride be upgraded to ones.
+            count = 0;
+          }
+          for (k = 0; k < stride; ++k) {
+            // We don't want to change the value at counts[i],
+            // which already belongs to the next stride. Hence the - 1.
+            counts[i - k - 1] = count;
+          }
+        }
+        stride = 0;
+        sum = 0;
+        if (i < length - 3) {
+          // All interesting strides have a count of at least 4,
+          // at least when they are non-zero.
+          limit = (counts[i] + counts[i + 1] +
+                   counts[i + 2] + counts[i + 3] + 2) / 4;
+        } else if (i < length) {
+          limit = counts[i];
+        } else {
+          limit = 0;
+        }
+      }
+      ++stride;
+      if (i != length) {
+        sum += counts[i];
+        if (stride >= 4) {
+          limit = (sum + stride / 2) / stride;
+        }
+      }
+    }
+  }
+  free(good_for_rle);
+  return 1;
+}
+
+typedef struct {
+  int total_count_;
+  int value_;
+  int pool_index_left_;
+  int pool_index_right_;
+} HuffmanTree;
+
+// A comparator for two Huffman trees: sorts first by 'total count'
+// (more comes first), and then by 'value' (smaller comes first).
+static int CompareHuffmanTrees(const void* ptr1, const void* ptr2) {
+  const HuffmanTree* const t1 = (const HuffmanTree*)ptr1;
+  const HuffmanTree* const t2 = (const HuffmanTree*)ptr2;
+  if (t1->total_count_ > t2->total_count_) {
+    return -1;
+  } else if (t1->total_count_ < t2->total_count_) {
+    return 1;
+  } else {
+    if (t1->value_ < t2->value_) {
+      return -1;
+    }
+    if (t1->value_ > t2->value_) {
+      return 1;
+    }
+    return 0;
+  }
+}
+
+static void SetBitDepths(const HuffmanTree* const tree,
+                         const HuffmanTree* const pool,
+                         uint8_t* const bit_depths, int level) {
+  if (tree->pool_index_left_ >= 0) {
+    SetBitDepths(&pool[tree->pool_index_left_], pool, bit_depths, level + 1);
+    SetBitDepths(&pool[tree->pool_index_right_], pool, bit_depths, level + 1);
+  } else {
+    bit_depths[tree->value_] = level;
+  }
+}
+
+// Create an optimal Huffman tree.
+//
+// (histogram, histogram_size): population counts.
+// tree_depth_limit: maximum bit depth (inclusive) of the codes.
+// bit_depths[]: how many bits are used for each symbol.
+//
+// Returns 0 when an error has occurred.
+//
+// The catch here is that the tree cannot be arbitrarily deep.
+//
+// count_min is the value that is faked as the minimum population count,
+// and this minimum is raised until the tree meets the maximum depth
+// requirement.
+//
+// This algorithm does not perform well for very long data blocks,
+// especially when population counts are larger than 2**tree_depth_limit, but
+// we are not planning to use it with extremely long blocks.
+//
+// See http://en.wikipedia.org/wiki/Huffman_coding
+static int GenerateOptimalTree(const int* const histogram, int histogram_size,
+                               int tree_depth_limit,
+                               uint8_t* const bit_depths) {
+  int count_min;
+  HuffmanTree* tree_pool;
+  HuffmanTree* tree;
+  int tree_size_orig = 0;
+  int i;
+
+  for (i = 0; i < histogram_size; ++i) {
+    if (histogram[i] != 0) {
+      ++tree_size_orig;
+    }
+  }
+
+  // 3 * tree_size is enough to cover all the nodes representing a
+  // population and all the inserted nodes combining two existing nodes.
+  // The tree pool needs 2 * (tree_size_orig - 1) entities, and the
+  // tree needs exactly tree_size_orig entities.
+  tree = (HuffmanTree*)WebPSafeMalloc(3ULL * tree_size_orig, sizeof(*tree));
+  if (tree == NULL) return 0;
+  tree_pool = tree + tree_size_orig;
+
+  // For block sizes with less than 64k symbols we never need to do a
+  // second iteration of this loop.
+  // If we actually start running inside this loop a lot, we would perhaps
+  // be better off with the Katajainen algorithm.
+  assert(tree_size_orig <= (1 << (tree_depth_limit - 1)));
+  for (count_min = 1; ; count_min *= 2) {
+    int tree_size = tree_size_orig;
+    // We need to pack the Huffman tree in tree_depth_limit bits.
+    // So, we try by faking histogram entries to be at least 'count_min'.
+    int idx = 0;
+    int j;
+    for (j = 0; j < histogram_size; ++j) {
+      if (histogram[j] != 0) {
+        const int count =
+            (histogram[j] < count_min) ? count_min : histogram[j];
+        tree[idx].total_count_ = count;
+        tree[idx].value_ = j;
+        tree[idx].pool_index_left_ = -1;
+        tree[idx].pool_index_right_ = -1;
+        ++idx;
+      }
+    }
+
+    // Build the Huffman tree.
+    qsort(tree, tree_size, sizeof(*tree), CompareHuffmanTrees);
+
+    if (tree_size > 1) {  // Normal case.
+      int tree_pool_size = 0;
+      while (tree_size > 1) {  // Finish when we have only one root.
+        int count;
+        tree_pool[tree_pool_size++] = tree[tree_size - 1];
+        tree_pool[tree_pool_size++] = tree[tree_size - 2];
+        count = tree_pool[tree_pool_size - 1].total_count_ +
+            tree_pool[tree_pool_size - 2].total_count_;
+        tree_size -= 2;
+        {
+          // Search for the insertion point.
+          int k;
+          for (k = 0; k < tree_size; ++k) {
+            if (tree[k].total_count_ <= count) {
+              break;
+            }
+          }
+          memmove(tree + (k + 1), tree + k, (tree_size - k) * sizeof(*tree));
+          tree[k].total_count_ = count;
+          tree[k].value_ = -1;
+
+          tree[k].pool_index_left_ = tree_pool_size - 1;
+          tree[k].pool_index_right_ = tree_pool_size - 2;
+          tree_size = tree_size + 1;
+        }
+      }
+      SetBitDepths(&tree[0], tree_pool, bit_depths, 0);
+    } else if (tree_size == 1) {  // Trivial case: only one element.
+      bit_depths[tree[0].value_] = 1;
+    }
+
+    {
+      // Test if this Huffman tree satisfies our 'tree_depth_limit' criteria.
+      int max_depth = bit_depths[0];
+      for (j = 1; j < histogram_size; ++j) {
+        if (max_depth < bit_depths[j]) {
+          max_depth = bit_depths[j];
+        }
+      }
+      if (max_depth <= tree_depth_limit) {
+        break;
+      }
+    }
+  }
+  free(tree);
+  return 1;
+}
+
+// -----------------------------------------------------------------------------
+// Coding of the Huffman tree values
+
+static HuffmanTreeToken* CodeRepeatedValues(int repetitions,
+                                            HuffmanTreeToken* tokens,
+                                            int value, int prev_value) {
+  assert(value <= MAX_ALLOWED_CODE_LENGTH);
+  if (value != prev_value) {
+    tokens->code = value;
+    tokens->extra_bits = 0;
+    ++tokens;
+    --repetitions;
+  }
+  while (repetitions >= 1) {
+    if (repetitions < 3) {
+      int i;
+      for (i = 0; i < repetitions; ++i) {
+        tokens->code = value;
+        tokens->extra_bits = 0;
+        ++tokens;
+      }
+      break;
+    } else if (repetitions < 7) {
+      tokens->code = 16;
+      tokens->extra_bits = repetitions - 3;
+      ++tokens;
+      break;
+    } else {
+      tokens->code = 16;
+      tokens->extra_bits = 3;
+      ++tokens;
+      repetitions -= 6;
+    }
+  }
+  return tokens;
+}
+
+static HuffmanTreeToken* CodeRepeatedZeros(int repetitions,
+                                           HuffmanTreeToken* tokens) {
+  while (repetitions >= 1) {
+    if (repetitions < 3) {
+      int i;
+      for (i = 0; i < repetitions; ++i) {
+        tokens->code = 0;   // 0-value
+        tokens->extra_bits = 0;
+        ++tokens;
+      }
+      break;
+    } else if (repetitions < 11) {
+      tokens->code = 17;
+      tokens->extra_bits = repetitions - 3;
+      ++tokens;
+      break;
+    } else if (repetitions < 139) {
+      tokens->code = 18;
+      tokens->extra_bits = repetitions - 11;
+      ++tokens;
+      break;
+    } else {
+      tokens->code = 18;
+      tokens->extra_bits = 0x7f;  // 138 repeated 0s
+      ++tokens;
+      repetitions -= 138;
+    }
+  }
+  return tokens;
+}
+
+int VP8LCreateCompressedHuffmanTree(const HuffmanTreeCode* const tree,
+                                    HuffmanTreeToken* tokens, int max_tokens) {
+  HuffmanTreeToken* const starting_token = tokens;
+  HuffmanTreeToken* const ending_token = tokens + max_tokens;
+  const int depth_size = tree->num_symbols;
+  int prev_value = 8;  // 8 is the initial value for rle.
+  int i = 0;
+  assert(tokens != NULL);
+  while (i < depth_size) {
+    const int value = tree->code_lengths[i];
+    int k = i + 1;
+    int runs;
+    while (k < depth_size && tree->code_lengths[k] == value) ++k;
+    runs = k - i;
+    if (value == 0) {
+      tokens = CodeRepeatedZeros(runs, tokens);
+    } else {
+      tokens = CodeRepeatedValues(runs, tokens, value, prev_value);
+      prev_value = value;
+    }
+    i += runs;
+    assert(tokens <= ending_token);
+  }
+  (void)ending_token;    // suppress 'unused variable' warning
+  return (int)(tokens - starting_token);
+}
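To make the RLE scheme concrete, here is a standalone sketch (editorial, not part of this patch) that runs a short sequence of code lengths through VP8LCreateCompressedHuffmanTree(). Given the rules implemented by CodeRepeatedValues() and CodeRepeatedZeros() above, five lengths of 3 followed by twelve zeros should reduce to three tokens; the 'codes' field can stay NULL because the tokenizer only reads 'code_lengths'.

  #include <stdio.h>
  #include "./huffman_encode.h"

  int main(void) {
    // Five code lengths of 3; the remaining 12 entries default to 0.
    uint8_t lengths[17] = { 3, 3, 3, 3, 3 };
    HuffmanTreeToken tokens[17];
    HuffmanTreeCode code;
    int i, num_tokens;
    code.num_symbols = 17;
    code.code_lengths = lengths;
    code.codes = NULL;                 // not read by the tokenizer
    num_tokens = VP8LCreateCompressedHuffmanTree(&code, tokens, 17);
    // Expected: {3,0}, {16,1} (repeat the previous length 4 more times),
    // {18,1} (a run of 11 + 1 = 12 zeros) -- three tokens in total.
    for (i = 0; i < num_tokens; ++i) {
      printf("code=%d extra_bits=%d\n", tokens[i].code, tokens[i].extra_bits);
    }
    return 0;
  }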
+
+// -----------------------------------------------------------------------------
+
+// Pre-reversed 4-bit values.
+static const uint8_t kReversedBits[16] = {
+  0x0, 0x8, 0x4, 0xc, 0x2, 0xa, 0x6, 0xe,
+  0x1, 0x9, 0x5, 0xd, 0x3, 0xb, 0x7, 0xf
+};
+
+static uint32_t ReverseBits(int num_bits, uint32_t bits) {
+  uint32_t retval = 0;
+  int i = 0;
+  while (i < num_bits) {
+    i += 4;
+    retval |= kReversedBits[bits & 0xf] << (MAX_ALLOWED_CODE_LENGTH + 1 - i);
+    bits >>= 4;
+  }
+  retval >>= (MAX_ALLOWED_CODE_LENGTH + 1 - num_bits);
+  return retval;
+}
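ReverseBits() is needed because the canonical codes computed below are built MSB-first while the VP8L bitstream consumes Huffman codes LSB-first. As a quick standalone sanity sketch (assuming MAX_ALLOWED_CODE_LENGTH is 15, as elsewhere in this file), a naive bit-by-bit reversal agrees with it on, e.g., the 5-bit code 0b00110:

  #include <assert.h>
  #include <stdint.h>

  // Naive reference: reverse the lowest 'num_bits' bits of 'bits'.
  static uint32_t NaiveReverse(int num_bits, uint32_t bits) {
    uint32_t out = 0;
    int i;
    for (i = 0; i < num_bits; ++i) {
      out = (out << 1) | ((bits >> i) & 1);
    }
    return out;
  }

  int main(void) {
    assert(NaiveReverse(5, 0x06) == 0x0c);   // 00110 -> 01100
    return 0;
  }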
+
+// Get the actual bit values for a tree of bit depths.
+static void ConvertBitDepthsToSymbols(HuffmanTreeCode* const tree) {
+  // 0 bit-depth means that the symbol does not exist.
+  int i;
+  int len;
+  uint32_t next_code[MAX_ALLOWED_CODE_LENGTH + 1];
+  int depth_count[MAX_ALLOWED_CODE_LENGTH + 1] = { 0 };
+
+  assert(tree != NULL);
+  len = tree->num_symbols;
+  for (i = 0; i < len; ++i) {
+    const int code_length = tree->code_lengths[i];
+    assert(code_length <= MAX_ALLOWED_CODE_LENGTH);
+    ++depth_count[code_length];
+  }
+  depth_count[0] = 0;  // ignore unused symbol
+  next_code[0] = 0;
+  {
+    uint32_t code = 0;
+    for (i = 1; i <= MAX_ALLOWED_CODE_LENGTH; ++i) {
+      code = (code + depth_count[i - 1]) << 1;
+      next_code[i] = code;
+    }
+  }
+  for (i = 0; i < len; ++i) {
+    const int code_length = tree->code_lengths[i];
+    tree->codes[i] = ReverseBits(code_length, next_code[code_length]++);
+  }
+}
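The loop above is the standard canonical-code construction: codes of each length are numbered consecutively, starting just after the codes of the previous length. A minimal standalone sketch of the same computation (before the LSB-first bit reversal), using four hypothetical code lengths:

  #include <stdio.h>

  int main(void) {
    const int lengths[4] = { 2, 1, 3, 3 };    // hypothetical code lengths
    int depth_count[8] = { 0 };
    int next_code[8] = { 0 };
    int i, code = 0;
    for (i = 0; i < 4; ++i) ++depth_count[lengths[i]];
    depth_count[0] = 0;                       // depth 0 = unused symbol
    for (i = 1; i < 8; ++i) {
      code = (code + depth_count[i - 1]) << 1;
      next_code[i] = code;
    }
    // Prints 2, 0, 6, 7 -- i.e. 10, 0, 110, 111 in binary: a prefix-free set.
    for (i = 0; i < 4; ++i) {
      printf("symbol %d: length %d, code %d\n",
             i, lengths[i], next_code[lengths[i]]++);
    }
    return 0;
  }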
+
+// -----------------------------------------------------------------------------
+// Main entry point
+
+int VP8LCreateHuffmanTree(int* const histogram, int tree_depth_limit,
+                          HuffmanTreeCode* const tree) {
+  const int num_symbols = tree->num_symbols;
+  if (!OptimizeHuffmanForRle(num_symbols, histogram)) {
+    return 0;
+  }
+  if (!GenerateOptimalTree(histogram, num_symbols,
+                           tree_depth_limit, tree->code_lengths)) {
+    return 0;
+  }
+  // Create the actual bit codes for the bit lengths.
+  ConvertBitDepthsToSymbols(tree);
+  return 1;
+}
diff --git a/src/utils/huffman_encode.h b/src/utils/huffman_encode.h
new file mode 100644
index 0000000..ada59d7
--- /dev/null
+++ b/src/utils/huffman_encode.h
@@ -0,0 +1,47 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// Author: Jyrki Alakuijala (jyrki@google.com)
+//
+// Entropy encoding (Huffman) for webp lossless
+
+#ifndef WEBP_UTILS_HUFFMAN_ENCODE_H_
+#define WEBP_UTILS_HUFFMAN_ENCODE_H_
+
+#include "webp/types.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+// Struct for holding the tree header in coded form.
+typedef struct {
+  uint8_t code;         // value (0..15) or escape code (16,17,18)
+  uint8_t extra_bits;   // extra bits for escape codes
+} HuffmanTreeToken;
+
+// Struct to represent the tree codes (depth and bits array).
+typedef struct {
+  int       num_symbols;   // Number of symbols.
+  uint8_t*  code_lengths;  // Code lengths of the symbols.
+  uint16_t* codes;         // Symbol Codes.
+} HuffmanTreeCode;
+
+// Turn the Huffman tree into a token sequence.
+// Returns the number of tokens used.
+int VP8LCreateCompressedHuffmanTree(const HuffmanTreeCode* const tree,
+                                    HuffmanTreeToken* tokens, int max_tokens);
+
+// Create an optimized tree (code lengths and bits) for the given histogram.
+int VP8LCreateHuffmanTree(int* const histogram, int tree_depth_limit,
+                          HuffmanTreeCode* const tree);
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}
+#endif
+
+#endif  // WEBP_UTILS_HUFFMAN_ENCODE_H_
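A possible usage sketch for the two entry points declared above; the 256-symbol histogram, the depth limit of 15 and the helper name are illustrative assumptions, not requirements of this header. Note that VP8LCreateHuffmanTree() takes a non-const histogram and may adjust it.

  #include <string.h>
  #include "./huffman_encode.h"

  // Build a depth-limited code over 256 symbols, then tokenize its lengths.
  // Returns the number of tokens, or -1 on error.
  static int BuildAndTokenize(int histogram[256], uint8_t lengths[256],
                              uint16_t codes[256],
                              HuffmanTreeToken tokens[256]) {
    HuffmanTreeCode code;
    memset(lengths, 0, 256 * sizeof(lengths[0]));
    code.num_symbols = 256;
    code.code_lengths = lengths;
    code.codes = codes;
    if (!VP8LCreateHuffmanTree(histogram, 15, &code)) return -1;
    return VP8LCreateCompressedHuffmanTree(&code, tokens, 256);
  }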
diff --git a/src/utils/quant_levels.c b/src/utils/quant_levels.c
new file mode 100644
index 0000000..f688439
--- /dev/null
+++ b/src/utils/quant_levels.c
@@ -0,0 +1,154 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// Quantize the levels to a specified number of quantization levels [2, 256].
+// Min and max values are preserved (usually 0 and 255 for the alpha plane).
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <assert.h>
+
+#include "./quant_levels.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+#define NUM_SYMBOLS     256
+
+#define MAX_ITER  6             // Maximum number of convergence steps.
+#define ERROR_THRESHOLD 1e-4    // MSE stopping criterion.
+
+// -----------------------------------------------------------------------------
+// Quantize levels.
+
+int QuantizeLevels(uint8_t* const data, int width, int height,
+                   int num_levels, uint64_t* const sse) {
+  int freq[NUM_SYMBOLS] = { 0 };
+  int q_level[NUM_SYMBOLS] = { 0 };
+  double inv_q_level[NUM_SYMBOLS] = { 0 };
+  int min_s = 255, max_s = 0;
+  const size_t data_size = height * width;
+  int i, num_levels_in, iter;
+  double last_err = 1.e38, err = 0.;
+  const double err_threshold = ERROR_THRESHOLD * data_size;
+
+  if (data == NULL) {
+    return 0;
+  }
+
+  if (width <= 0 || height <= 0) {
+    return 0;
+  }
+
+  if (num_levels < 2 || num_levels > 256) {
+    return 0;
+  }
+
+  {
+    size_t n;
+    num_levels_in = 0;
+    for (n = 0; n < data_size; ++n) {
+      num_levels_in += (freq[data[n]] == 0);
+      if (min_s > data[n]) min_s = data[n];
+      if (max_s < data[n]) max_s = data[n];
+      ++freq[data[n]];
+    }
+  }
+
+  if (num_levels_in <= num_levels) goto End;  // nothing to do!
+
+  // Start with uniformly spread centroids.
+  for (i = 0; i < num_levels; ++i) {
+    inv_q_level[i] = min_s + (double)(max_s - min_s) * i / (num_levels - 1);
+  }
+
+  // Fixed values. Won't be changed.
+  q_level[min_s] = 0;
+  q_level[max_s] = num_levels - 1;
+  assert(inv_q_level[0] == min_s);
+  assert(inv_q_level[num_levels - 1] == max_s);
+
+  // k-Means iterations.
+  for (iter = 0; iter < MAX_ITER; ++iter) {
+    double q_sum[NUM_SYMBOLS] = { 0 };
+    double q_count[NUM_SYMBOLS] = { 0 };
+    int s, slot = 0;
+
+    // Assign classes to representatives.
+    for (s = min_s; s <= max_s; ++s) {
+      // Keep track of the nearest neighbour 'slot'
+      while (slot < num_levels - 1 &&
+             2 * s > inv_q_level[slot] + inv_q_level[slot + 1]) {
+        ++slot;
+      }
+      if (freq[s] > 0) {
+        q_sum[slot] += s * freq[s];
+        q_count[slot] += freq[s];
+      }
+      q_level[s] = slot;
+    }
+
+    // Assign new representatives to classes.
+    if (num_levels > 2) {
+      for (slot = 1; slot < num_levels - 1; ++slot) {
+        const double count = q_count[slot];
+        if (count > 0.) {
+          inv_q_level[slot] = q_sum[slot] / count;
+        }
+      }
+    }
+
+    // Compute convergence error.
+    err = 0.;
+    for (s = min_s; s <= max_s; ++s) {
+      const double error = s - inv_q_level[q_level[s]];
+      err += freq[s] * error * error;
+    }
+
+    // Check for convergence: we stop as soon as the error improvement
+    // drops below the threshold.
+    if (last_err - err < err_threshold) break;
+    last_err = err;
+  }
+
+  // Remap the alpha plane to quantized values.
+  {
+    // The double->int rounding operation can be costly, so we do it
+    // once and for all before remapping. While at it, we also fold in the
+    // data[] -> slot mapping (this avoids one indirection in the final loop).
+    uint8_t map[NUM_SYMBOLS];
+    int s;
+    size_t n;
+    for (s = min_s; s <= max_s; ++s) {
+      const int slot = q_level[s];
+      map[s] = (uint8_t)(inv_q_level[slot] + .5);
+    }
+    // Final pass.
+    for (n = 0; n < data_size; ++n) {
+      data[n] = map[data[n]];
+    }
+  }
+ End:
+  // Store sum of squared error if needed.
+  if (sse != NULL) *sse = (uint64_t)err;
+
+  return 1;
+}
+
+int DequantizeLevels(uint8_t* const data, int width, int height) {
+  if (data == NULL || width <= 0 || height <= 0) return 0;
+  // TODO(skal): implement gradient smoothing.
+  (void)data;
+  (void)width;
+  (void)height;
+  return 1;
+}
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/src/utils/quant_levels.h b/src/utils/quant_levels.h
new file mode 100644
index 0000000..8dd3afe
--- /dev/null
+++ b/src/utils/quant_levels.h
@@ -0,0 +1,39 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// Alpha plane quantization utility
+//
+// Author:  Vikas Arora (vikasa@google.com)
+
+#ifndef WEBP_UTILS_QUANT_LEVELS_H_
+#define WEBP_UTILS_QUANT_LEVELS_H_
+
+#include <stdlib.h>
+
+#include "webp/types.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+// Replace the input 'data' of size 'width'x'height' with 'num_levels'
+// quantized values. If not NULL, 'sse' will contain the sum of squared error.
+// Valid range for 'num_levels' is [2, 256].
+// Returns false in case of error (data is NULL, or parameters are invalid).
+int QuantizeLevels(uint8_t* const data, int width, int height, int num_levels,
+                   uint64_t* const sse);
+
+// Apply post-processing to input 'data' of size 'width'x'height' assuming
+// that the source was quantized to a reduced number of levels.
+// Returns false in case of error (data is NULL, invalid parameters, ...).
+int DequantizeLevels(uint8_t* const data, int width, int height);
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
+
+#endif  /* WEBP_UTILS_QUANT_LEVELS_H_ */
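A minimal usage sketch for QuantizeLevels(); the 16-level target and the helper name are illustrative, not mandated by the API.

  #include <stdio.h>
  #include "./quant_levels.h"

  // Reduce an 8-bit alpha plane to 16 distinct levels and report the MSE.
  static int ReduceAlpha(uint8_t* const alpha, int width, int height) {
    uint64_t sse = 0;
    if (!QuantizeLevels(alpha, width, height, 16, &sse)) return 0;
    printf("mean squared error: %.3f\n",
           (double)sse / ((double)width * height));
    return 1;
  }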
diff --git a/src/utils/rescaler.c b/src/utils/rescaler.c
new file mode 100644
index 0000000..9825dcb
--- /dev/null
+++ b/src/utils/rescaler.c
@@ -0,0 +1,152 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// Rescaling functions
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <assert.h>
+#include <stdlib.h>
+#include "./rescaler.h"
+
+//------------------------------------------------------------------------------
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+#define RFIX 30
+#define MULT_FIX(x,y) (((int64_t)(x) * (y) + (1 << (RFIX - 1))) >> RFIX)
+
+void WebPRescalerInit(WebPRescaler* const wrk, int src_width, int src_height,
+                      uint8_t* const dst, int dst_width, int dst_height,
+                      int dst_stride, int num_channels, int x_add, int x_sub,
+                      int y_add, int y_sub, int32_t* const work) {
+  wrk->x_expand = (src_width < dst_width);
+  wrk->src_width = src_width;
+  wrk->src_height = src_height;
+  wrk->dst_width = dst_width;
+  wrk->dst_height = dst_height;
+  wrk->dst = dst;
+  wrk->dst_stride = dst_stride;
+  wrk->num_channels = num_channels;
+  // for 'x_expand', we use bilinear interpolation
+  wrk->x_add = wrk->x_expand ? (x_sub - 1) : x_add - x_sub;
+  wrk->x_sub = wrk->x_expand ? (x_add - 1) : x_sub;
+  wrk->y_accum = y_add;
+  wrk->y_add = y_add;
+  wrk->y_sub = y_sub;
+  wrk->fx_scale = (1 << RFIX) / x_sub;
+  wrk->fy_scale = (1 << RFIX) / y_sub;
+  wrk->fxy_scale = wrk->x_expand ?
+      ((int64_t)dst_height << RFIX) / (x_sub * src_height) :
+      ((int64_t)dst_height << RFIX) / (x_add * src_height);
+  wrk->irow = work;
+  wrk->frow = work + num_channels * dst_width;
+}
+
+void WebPRescalerImportRow(WebPRescaler* const wrk,
+                           const uint8_t* const src, int channel) {
+  const int x_stride = wrk->num_channels;
+  const int x_out_max = wrk->dst_width * wrk->num_channels;
+  int x_in = channel;
+  int x_out;
+  int accum = 0;
+  if (!wrk->x_expand) {
+    int sum = 0;
+    for (x_out = channel; x_out < x_out_max; x_out += x_stride) {
+      accum += wrk->x_add;
+      for (; accum > 0; accum -= wrk->x_sub) {
+        sum += src[x_in];
+        x_in += x_stride;
+      }
+      {        // Emit next horizontal pixel.
+        const int32_t base = src[x_in];
+        const int32_t frac = base * (-accum);
+        x_in += x_stride;
+        wrk->frow[x_out] = (sum + base) * wrk->x_sub - frac;
+        // fresh fractional start for next pixel
+        sum = (int)MULT_FIX(frac, wrk->fx_scale);
+      }
+    }
+  } else {        // simple bilinear interpolation
+    int left = src[channel], right = src[channel];
+    for (x_out = channel; x_out < x_out_max; x_out += x_stride) {
+      if (accum < 0) {
+        left = right;
+        x_in += x_stride;
+        right = src[x_in];
+        accum += wrk->x_add;
+      }
+      wrk->frow[x_out] = right * wrk->x_add + (left - right) * accum;
+      accum -= wrk->x_sub;
+    }
+  }
+  // Accumulate the new row's contribution
+  for (x_out = channel; x_out < x_out_max; x_out += x_stride) {
+    wrk->irow[x_out] += wrk->frow[x_out];
+  }
+}
+
+uint8_t* WebPRescalerExportRow(WebPRescaler* const wrk) {
+  if (wrk->y_accum <= 0) {
+    int x_out;
+    uint8_t* const dst = wrk->dst;
+    int32_t* const irow = wrk->irow;
+    const int32_t* const frow = wrk->frow;
+    const int yscale = wrk->fy_scale * (-wrk->y_accum);
+    const int x_out_max = wrk->dst_width * wrk->num_channels;
+
+    for (x_out = 0; x_out < x_out_max; ++x_out) {
+      const int frac = (int)MULT_FIX(frow[x_out], yscale);
+      const int v = (int)MULT_FIX(irow[x_out] - frac, wrk->fxy_scale);
+      dst[x_out] = (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255;
+      irow[x_out] = frac;   // new fractional start
+    }
+    wrk->y_accum += wrk->y_add;
+    wrk->dst += wrk->dst_stride;
+    return dst;
+  } else {
+    return NULL;
+  }
+}
+
+#undef MULT_FIX
+#undef RFIX
+
+//------------------------------------------------------------------------------
+// all-in-one calls
+
+int WebPRescalerImport(WebPRescaler* const wrk, int num_lines,
+                       const uint8_t* src, int src_stride) {
+  int total_imported = 0;
+  while (total_imported < num_lines && wrk->y_accum > 0) {
+    int channel;
+    for (channel = 0; channel < wrk->num_channels; ++channel) {
+      WebPRescalerImportRow(wrk, src, channel);
+    }
+    src += src_stride;
+    ++total_imported;
+    wrk->y_accum -= wrk->y_sub;
+  }
+  return total_imported;
+}
+
+int WebPRescalerExport(WebPRescaler* const rescaler) {
+  int total_exported = 0;
+  while (WebPRescalerHasPendingOutput(rescaler)) {
+    WebPRescalerExportRow(rescaler);
+    ++total_exported;
+  }
+  return total_exported;
+}
+
+//------------------------------------------------------------------------------
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/src/utils/rescaler.h b/src/utils/rescaler.h
new file mode 100644
index 0000000..88ac1b7
--- /dev/null
+++ b/src/utils/rescaler.h
@@ -0,0 +1,76 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// Rescaling functions
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#ifndef WEBP_UTILS_RESCALER_H_
+#define WEBP_UTILS_RESCALER_H_
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+#include "webp/types.h"
+
+// Structure used for on-the-fly rescaling
+typedef struct {
+  int x_expand;               // true if we're expanding in the x direction
+  int num_channels;           // bytes to jump between pixels
+  int fy_scale, fx_scale;     // fixed-point scaling factor
+  int64_t fxy_scale;          // ''
+  // we need hpel-precise add/sub increments, for the downsampled U/V planes.
+  int y_accum;                // vertical accumulator
+  int y_add, y_sub;           // vertical increments (add ~= src, sub ~= dst)
+  int x_add, x_sub;           // horizontal increments (add ~= src, sub ~= dst)
+  int src_width, src_height;  // source dimensions
+  int dst_width, dst_height;  // destination dimensions
+  uint8_t* dst;
+  int dst_stride;
+  int32_t* irow, *frow;       // work buffer
+} WebPRescaler;
+
+// Initialize a rescaler given scratch area 'work' and dimensions of src & dst.
+void WebPRescalerInit(WebPRescaler* const wrk, int src_width, int src_height,
+                      uint8_t* const dst,
+                      int dst_width, int dst_height, int dst_stride,
+                      int num_channels,
+                      int x_add, int x_sub,
+                      int y_add, int y_sub,
+                      int32_t* const work);
+
+// Import a row of data and save its contribution in the rescaler.
+// 'channel' denotes the channel number to be imported.
+void WebPRescalerImportRow(WebPRescaler* const rescaler,
+                           const uint8_t* const src, int channel);
+
+// Import multiple rows over all channels, until at least one row is ready to
+// be exported. Returns the actual number of lines that were imported.
+int WebPRescalerImport(WebPRescaler* const rescaler, int num_rows,
+                       const uint8_t* src, int src_stride);
+
+// Returns true if there are pending output rows ready to be exported.
+static WEBP_INLINE
+int WebPRescalerHasPendingOutput(const WebPRescaler* const rescaler) {
+  return (rescaler->y_accum <= 0);
+}
+
+// Export one row from rescaler. Returns the pointer where output was written,
+// or NULL if no row was pending.
+uint8_t* WebPRescalerExportRow(WebPRescaler* const wrk);
+
+// Export as many rows as possible. Returns the number of rows written.
+int WebPRescalerExport(WebPRescaler* const wrk);
+
+//------------------------------------------------------------------------------
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
+
+#endif  /* WEBP_UTILS_RESCALER_H_ */
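A hedged sketch of the import/export loop for a single-channel plane, based on the declarations and comments above ('add ~= src, sub ~= dst'). The increment choices and the helper name are assumptions for illustration; callers may use different fixed-point setups.

  #include <stdlib.h>
  #include "./rescaler.h"

  // Rescale a src_w x src_h single-channel plane into dst (illustrative).
  static int RescalePlane(const uint8_t* src, int src_w, int src_h,
                          int src_stride,
                          uint8_t* dst, int dst_w, int dst_h, int dst_stride) {
    WebPRescaler rescaler;
    int y = 0;
    // Work area: one 'irow' and one 'frow' of dst_w samples each.
    int32_t* const work = (int32_t*)calloc(2 * dst_w, sizeof(*work));
    if (work == NULL) return 0;
    WebPRescalerInit(&rescaler, src_w, src_h, dst, dst_w, dst_h, dst_stride,
                     1 /* num_channels */, src_w, dst_w, src_h, dst_h, work);
    while (y < src_h) {
      // Feed source rows until output rows are ready, then flush them.
      y += WebPRescalerImport(&rescaler, src_h - y, src + y * src_stride,
                              src_stride);
      WebPRescalerExport(&rescaler);
    }
    free(work);
    return 1;
  }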
diff --git a/src/utils/thread.c b/src/utils/thread.c
new file mode 100644
index 0000000..ce89cf9
--- /dev/null
+++ b/src/utils/thread.c
@@ -0,0 +1,247 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// Multi-threaded worker
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <assert.h>
+#include <string.h>   // for memset()
+#include "./thread.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+#ifdef WEBP_USE_THREAD
+
+#if defined(_WIN32)
+
+//------------------------------------------------------------------------------
+// simplistic pthread emulation layer
+
+#include <process.h>
+
+// _beginthreadex requires __stdcall
+#define THREADFN unsigned int __stdcall
+#define THREAD_RETURN(val) (unsigned int)((DWORD_PTR)val)
+
+static int pthread_create(pthread_t* const thread, const void* attr,
+                          unsigned int (__stdcall *start)(void*), void* arg) {
+  (void)attr;
+  *thread = (pthread_t)_beginthreadex(NULL,   /* void *security */
+                                      0,      /* unsigned stack_size */
+                                      start,
+                                      arg,
+                                      0,      /* unsigned initflag */
+                                      NULL);  /* unsigned *thrdaddr */
+  if (*thread == NULL) return 1;
+  SetThreadPriority(*thread, THREAD_PRIORITY_ABOVE_NORMAL);
+  return 0;
+}
+
+static int pthread_join(pthread_t thread, void** value_ptr) {
+  (void)value_ptr;
+  return (WaitForSingleObject(thread, INFINITE) != WAIT_OBJECT_0 ||
+          CloseHandle(thread) == 0);
+}
+
+// Mutex
+static int pthread_mutex_init(pthread_mutex_t* const mutex, void* mutexattr) {
+  (void)mutexattr;
+  InitializeCriticalSection(mutex);
+  return 0;
+}
+
+static int pthread_mutex_lock(pthread_mutex_t* const mutex) {
+  EnterCriticalSection(mutex);
+  return 0;
+}
+
+static int pthread_mutex_unlock(pthread_mutex_t* const mutex) {
+  LeaveCriticalSection(mutex);
+  return 0;
+}
+
+static int pthread_mutex_destroy(pthread_mutex_t* const mutex) {
+  DeleteCriticalSection(mutex);
+  return 0;
+}
+
+// Condition
+static int pthread_cond_destroy(pthread_cond_t* const condition) {
+  int ok = 1;
+  ok &= (CloseHandle(condition->waiting_sem_) != 0);
+  ok &= (CloseHandle(condition->received_sem_) != 0);
+  ok &= (CloseHandle(condition->signal_event_) != 0);
+  return !ok;
+}
+
+static int pthread_cond_init(pthread_cond_t* const condition, void* cond_attr) {
+  (void)cond_attr;
+  condition->waiting_sem_ = CreateSemaphore(NULL, 0, 1, NULL);
+  condition->received_sem_ = CreateSemaphore(NULL, 0, 1, NULL);
+  condition->signal_event_ = CreateEvent(NULL, FALSE, FALSE, NULL);
+  if (condition->waiting_sem_ == NULL ||
+      condition->received_sem_ == NULL ||
+      condition->signal_event_ == NULL) {
+    pthread_cond_destroy(condition);
+    return 1;
+  }
+  return 0;
+}
+
+static int pthread_cond_signal(pthread_cond_t* const condition) {
+  int ok = 1;
+  if (WaitForSingleObject(condition->waiting_sem_, 0) == WAIT_OBJECT_0) {
+    // a thread is waiting in pthread_cond_wait: allow it to be notified
+    ok = SetEvent(condition->signal_event_);
+    // wait until the event is consumed so the signaler cannot consume
+    // the event via its own pthread_cond_wait.
+    ok &= (WaitForSingleObject(condition->received_sem_, INFINITE) !=
+           WAIT_OBJECT_0);
+  }
+  return !ok;
+}
+
+static int pthread_cond_wait(pthread_cond_t* const condition,
+                             pthread_mutex_t* const mutex) {
+  int ok;
+  // note that there is a consumer available so the signal isn't dropped in
+  // pthread_cond_signal
+  if (!ReleaseSemaphore(condition->waiting_sem_, 1, NULL))
+    return 1;
+  // now unlock the mutex so pthread_cond_signal may be issued
+  pthread_mutex_unlock(mutex);
+  ok = (WaitForSingleObject(condition->signal_event_, INFINITE) ==
+        WAIT_OBJECT_0);
+  ok &= ReleaseSemaphore(condition->received_sem_, 1, NULL);
+  pthread_mutex_lock(mutex);
+  return !ok;
+}
+
+#else  // _WIN32
+# define THREADFN void*
+# define THREAD_RETURN(val) val
+#endif
+
+//------------------------------------------------------------------------------
+
+static THREADFN WebPWorkerThreadLoop(void *ptr) {    // thread loop
+  WebPWorker* const worker = (WebPWorker*)ptr;
+  int done = 0;
+  while (!done) {
+    pthread_mutex_lock(&worker->mutex_);
+    while (worker->status_ == OK) {   // wait in idling mode
+      pthread_cond_wait(&worker->condition_, &worker->mutex_);
+    }
+    if (worker->status_ == WORK) {
+      if (worker->hook) {
+        worker->had_error |= !worker->hook(worker->data1, worker->data2);
+      }
+      worker->status_ = OK;
+    } else if (worker->status_ == NOT_OK) {   // finish the worker
+      done = 1;
+    }
+    // signal to the main thread that we're done (for Sync())
+    pthread_cond_signal(&worker->condition_);
+    pthread_mutex_unlock(&worker->mutex_);
+  }
+  return THREAD_RETURN(NULL);    // Thread is finished
+}
+
+// main thread state control
+static void WebPWorkerChangeState(WebPWorker* const worker,
+                                  WebPWorkerStatus new_status) {
+  // no-op when attempting to change state on a thread that didn't come up
+  if (worker->status_ < OK) return;
+
+  pthread_mutex_lock(&worker->mutex_);
+  // wait for the worker to finish
+  while (worker->status_ != OK) {
+    pthread_cond_wait(&worker->condition_, &worker->mutex_);
+  }
+  // assign new status and release the working thread if needed
+  if (new_status != OK) {
+    worker->status_ = new_status;
+    pthread_cond_signal(&worker->condition_);
+  }
+  pthread_mutex_unlock(&worker->mutex_);
+}
+
+#endif
+
+//------------------------------------------------------------------------------
+
+void WebPWorkerInit(WebPWorker* const worker) {
+  memset(worker, 0, sizeof(*worker));
+  worker->status_ = NOT_OK;
+}
+
+int WebPWorkerSync(WebPWorker* const worker) {
+#ifdef WEBP_USE_THREAD
+  WebPWorkerChangeState(worker, OK);
+#endif
+  assert(worker->status_ <= OK);
+  return !worker->had_error;
+}
+
+int WebPWorkerReset(WebPWorker* const worker) {
+  int ok = 1;
+  worker->had_error = 0;
+  if (worker->status_ < OK) {
+#ifdef WEBP_USE_THREAD
+    if (pthread_mutex_init(&worker->mutex_, NULL) ||
+        pthread_cond_init(&worker->condition_, NULL)) {
+      return 0;
+    }
+    pthread_mutex_lock(&worker->mutex_);
+    ok = !pthread_create(&worker->thread_, NULL, WebPWorkerThreadLoop, worker);
+    if (ok) worker->status_ = OK;
+    pthread_mutex_unlock(&worker->mutex_);
+#else
+    worker->status_ = OK;
+#endif
+  } else if (worker->status_ > OK) {
+    ok = WebPWorkerSync(worker);
+  }
+  assert(!ok || (worker->status_ == OK));
+  return ok;
+}
+
+void WebPWorkerLaunch(WebPWorker* const worker) {
+#ifdef WEBP_USE_THREAD
+  WebPWorkerChangeState(worker, WORK);
+#else
+  if (worker->hook)
+    worker->had_error |= !worker->hook(worker->data1, worker->data2);
+#endif
+}
+
+void WebPWorkerEnd(WebPWorker* const worker) {
+  if (worker->status_ >= OK) {
+#ifdef WEBP_USE_THREAD
+    WebPWorkerChangeState(worker, NOT_OK);
+    pthread_join(worker->thread_, NULL);
+    pthread_mutex_destroy(&worker->mutex_);
+    pthread_cond_destroy(&worker->condition_);
+#else
+    worker->status_ = NOT_OK;
+#endif
+  }
+  assert(worker->status_ == NOT_OK);
+}
+
+//------------------------------------------------------------------------------
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/src/utils/thread.h b/src/utils/thread.h
new file mode 100644
index 0000000..3191890
--- /dev/null
+++ b/src/utils/thread.h
@@ -0,0 +1,86 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// Multi-threaded worker
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#ifndef WEBP_UTILS_THREAD_H_
+#define WEBP_UTILS_THREAD_H_
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+#ifdef WEBP_USE_THREAD
+
+#if defined(_WIN32)
+
+#include <windows.h>
+typedef HANDLE pthread_t;
+typedef CRITICAL_SECTION pthread_mutex_t;
+typedef struct {
+  HANDLE waiting_sem_;
+  HANDLE received_sem_;
+  HANDLE signal_event_;
+} pthread_cond_t;
+
+#else
+
+#include <pthread.h>
+
+#endif    /* _WIN32 */
+#endif    /* WEBP_USE_THREAD */
+
+// State of the worker thread object
+typedef enum {
+  NOT_OK = 0,   // object is unusable
+  OK,           // ready to work
+  WORK          // busy finishing the current task
+} WebPWorkerStatus;
+
+// Function to be called by the worker thread. Takes two opaque pointers as
+// arguments (data1 and data2), and should return false in case of error.
+typedef int (*WebPWorkerHook)(void*, void*);
+
+// Synchronization object used to launch a job in the worker thread.
+typedef struct {
+#ifdef WEBP_USE_THREAD
+  pthread_mutex_t mutex_;
+  pthread_cond_t  condition_;
+  pthread_t       thread_;
+#endif
+  WebPWorkerStatus status_;
+  WebPWorkerHook hook;    // hook to call
+  void* data1;            // first argument passed to 'hook'
+  void* data2;            // second argument passed to 'hook'
+  int had_error;          // return value of the last call to 'hook'
+} WebPWorker;
+
+// Must be called first, before any other method.
+void WebPWorkerInit(WebPWorker* const worker);
+// Must be called to initialize the object and spawn the thread. Re-entrant.
+// Will potentially launch the thread. Returns false in case of error.
+int WebPWorkerReset(WebPWorker* const worker);
+// Make sure the previous work is finished. Returns true if worker->had_error
+// was not set and no error condition was triggered by the working thread.
+int WebPWorkerSync(WebPWorker* const worker);
+// Trigger the thread to call hook() with the data1 and data2 arguments.
+// hook/data1/data2 can be changed at any time before calling this function,
+// but must not be changed afterward until the next call to WebPWorkerSync().
+void WebPWorkerLaunch(WebPWorker* const worker);
+// Kill the thread and terminate the object. To use the object again, one
+// must call WebPWorkerReset() again.
+void WebPWorkerEnd(WebPWorker* const worker);
+
+//------------------------------------------------------------------------------
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
+
+#endif  /* WEBP_UTILS_THREAD_H_ */
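A possible usage sketch of the worker life cycle declared above (Init, Reset, Launch, Sync, End); the hook and its payloads are placeholders, not code from libwebp.

  #include "./thread.h"

  static int MyJob(void* input, void* output) {   // placeholder hook
    (void)input;
    (void)output;
    return 1;                          // non-zero return means success
  }

  static int RunOneJob(void* in, void* out) {
    WebPWorker worker;
    int ok;
    WebPWorkerInit(&worker);           // must be called first
    worker.hook = MyJob;
    worker.data1 = in;
    worker.data2 = out;
    ok = WebPWorkerReset(&worker);     // spawns the thread if threading is on
    if (ok) {
      WebPWorkerLaunch(&worker);       // asynchronous call to MyJob()
      ok = WebPWorkerSync(&worker);    // wait for completion, check for errors
    }
    WebPWorkerEnd(&worker);            // joins and tears the thread down
    return ok;
  }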
diff --git a/src/utils/utils.c b/src/utils/utils.c
new file mode 100644
index 0000000..673b7e2
--- /dev/null
+++ b/src/utils/utils.c
@@ -0,0 +1,44 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// Misc. common utility functions
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <stdlib.h>
+#include "./utils.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+//------------------------------------------------------------------------------
+// Checked memory allocation
+
+static int CheckSizeArguments(uint64_t nmemb, size_t size) {
+  const uint64_t total_size = nmemb * size;
+  if (nmemb == 0) return 1;
+  if ((uint64_t)size > WEBP_MAX_ALLOCABLE_MEMORY / nmemb) return 0;
+  if (total_size != (size_t)total_size) return 0;
+  return 1;
+}
+
+void* WebPSafeMalloc(uint64_t nmemb, size_t size) {
+  if (!CheckSizeArguments(nmemb, size)) return NULL;
+  return malloc((size_t)(nmemb * size));
+}
+
+void* WebPSafeCalloc(uint64_t nmemb, size_t size) {
+  if (!CheckSizeArguments(nmemb, size)) return NULL;
+  return calloc((size_t)nmemb, size);
+}
+
+//------------------------------------------------------------------------------
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/src/utils/utils.h b/src/utils/utils.h
new file mode 100644
index 0000000..aa44569
--- /dev/null
+++ b/src/utils/utils.h
@@ -0,0 +1,44 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// Misc. common utility functions
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#ifndef WEBP_UTILS_UTILS_H_
+#define WEBP_UTILS_UTILS_H_
+
+#include "webp/types.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+//------------------------------------------------------------------------------
+// Memory allocation
+
+// This is the maximum memory amount that libwebp will ever try to allocate.
+#define WEBP_MAX_ALLOCABLE_MEMORY (1ULL << 40)
+
+// size-checking safe malloc/calloc: verify that the requested size is not too
+// large, or return NULL. You don't need to call these for constructs like
+// malloc(sizeof(foo)), but only if there's picture-dependent size involved
+// somewhere (like: malloc(num_pixels * sizeof(*something))). That's why this
+// safe malloc() borrows the signature from calloc(), pointing at the dangerous
+// underlying multiply involved.
+void* WebPSafeMalloc(uint64_t nmemb, size_t size);
+// Note that WebPSafeCalloc() expects the second argument type to be 'size_t'
+// in order to favor the "calloc(num_foo, sizeof(foo))" pattern.
+void* WebPSafeCalloc(uint64_t nmemb, size_t size);
+
+//------------------------------------------------------------------------------
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
+
+#endif  /* WEBP_UTILS_UTILS_H_ */
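A small sketch of the intended use: any allocation whose size depends on picture dimensions goes through the checked allocator instead of a bare malloc(). The helper name is illustrative.

  #include "./utils.h"

  // Allocate a width x height ARGB buffer; returns NULL if the product would
  // overflow or exceed WEBP_MAX_ALLOCABLE_MEMORY.
  static uint32_t* AllocARGBPlane(int width, int height) {
    return (uint32_t*)WebPSafeMalloc((uint64_t)width * height,
                                     sizeof(uint32_t));
  }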