Merge "Upgrade to flac 1.3.2 (2017-01-01)."
diff --git a/README.version b/README.version
index 0f07f97..7f4074e 100644
--- a/README.version
+++ b/README.version
@@ -1,3 +1,3 @@
-URL: http://downloads.xiph.org/releases/flac/flac-1.2.1.tar.gz
-Version: 1.2.1
+URL: https://ftp.osuosl.org/pub/xiph/releases/flac/flac-1.3.2.tar.xz
+Version: 1.3.2
 BugComponent: 42195
diff --git a/config.h b/config.h
index 856866e..b0d0707 100644
--- a/config.h
+++ b/config.h
@@ -1,50 +1,83 @@
-/* config.h.  Generated by configure.  */
-/* config.h.in.  Generated from configure.in by autoheader.  */
+/* config.h.  Generated from config.h.in by configure.  */
+/* config.h.in.  Generated from configure.ac by autoheader.  */
+
+/* Define if building universal (internal helper macro) */
+/* #undef AC_APPLE_UNIVERSAL_BUILD */
+
+/* Target processor is big endian. */
+#define CPU_IS_BIG_ENDIAN 0
+
+/* Target processor is little endian. */
+#define CPU_IS_LITTLE_ENDIAN 1
+
+/* Set FLAC__BYTES_PER_WORD to 8 (4 is the default) */
+#define ENABLE_64_BIT_WORDS 0
 
 /* define to align allocated memory on 32-byte boundaries */
 #define FLAC__ALIGN_MALLOC_DATA 1
 
 /* define if building for ia32/i386 */
-/* #define FLAC__CPU_IA32 1 */
+/* #undef FLAC__CPU_IA32 */
 
 /* define if building for PowerPC */
 /* #undef FLAC__CPU_PPC */
 
+/* define if building for PowerPC with SPE ABI */
+/* #undef FLAC__CPU_PPC_SPE */
+
 /* define if building for SPARC */
 /* #undef FLAC__CPU_SPARC */
 
-/* define if you are compiling for PowerPC and have the 'as' assembler */
-/* #undef FLAC__HAS_AS */
+/* define if building for x86_64 */
+/* #undef FLAC__CPU_X86_64 */
 
 /* define if you have docbook-to-man or docbook2man */
-#define FLAC__HAS_DOCBOOK_TO_MAN 1
-
-/* define if you are compiling for PowerPC and have the 'gas' assembler */
-/* #define FLAC__HAS_GAS 0 */
+/* #undef FLAC__HAS_DOCBOOK_TO_MAN */
 
 /* define if you are compiling for x86 and have the NASM assembler */
-/* #define FLAC__HAS_NASM 0 */
+/* #undef FLAC__HAS_NASM */
 
 /* define if you have the ogg library */
-/* #undef FLAC__HAS_OGG */
+#define FLAC__HAS_OGG 0
+
+/* Set to 1 if <x86intrin.h> is available. */
+#define FLAC__HAS_X86INTRIN 1
 
 /* define to disable use of assembly code */
 #define FLAC__NO_ASM 1
 
-/* define if your operating system supports SSE instructions */
-/* #undef FLAC__SSE_OS */
-
 /* define if building for Darwin / MacOS X */
 /* #undef FLAC__SYS_DARWIN */
 
 /* define if building for Linux */
-/* #undef FLAC__SYS_LINUX */
-
-/* define to enable use of 3Dnow! instructions */
-/* #define FLAC__USE_3DNOW 0 */
+#define FLAC__SYS_LINUX 1
 
 /* define to enable use of Altivec instructions */
-/* #define FLAC__USE_ALTIVEC 0 */
+/* #define FLAC__USE_ALTIVEC 1 */
+
+/* define to enable use of AVX instructions */
+/* #define FLAC__USE_AVX 1 */
+
+/* Compiler has the __builtin_bswap16 intrinsic */
+#define HAVE_BSWAP16 1
+
+/* Compiler has the __builtin_bswap32 intrinsic */
+#define HAVE_BSWAP32 1
+
+/* Define to 1 if you have the <byteswap.h> header file. */
+#define HAVE_BYTESWAP_H 1
+
+/* define if you have clock_gettime */
+#define HAVE_CLOCK_GETTIME 1
+
+/* Define to 1 if you have the <cpuid.h> header file. */
+/* #undef HAVE_CPUID_H */
+
+/* Define to 1 if C++ supports variable-length arrays. */
+#define HAVE_CXX_VARARRAYS 1
+
+/* Define to 1 if C supports variable-length arrays. */
+#define HAVE_C_VARARRAYS 1
 
 /* Define to 1 if you have the <dlfcn.h> header file. */
 #define HAVE_DLFCN_H 1
@@ -55,14 +88,17 @@
 /* Define to 1 if you have the `getopt_long' function. */
 #define HAVE_GETOPT_LONG 1
 
-/* Define if you have the iconv() function. */
+/* Define if you have the iconv() function and it works. */
 #define HAVE_ICONV 1
 
 /* Define to 1 if you have the <inttypes.h> header file. */
 #define HAVE_INTTYPES_H 1
 
 /* Define if you have <langinfo.h> and nl_langinfo(CODESET). */
-/* #define HAVE_LANGINFO_CODESET 0 */
+#define HAVE_LANGINFO_CODESET 1
+
+/* lround support */
+#define HAVE_LROUND 1
 
 /* Define to 1 if you have the <memory.h> header file. */
 #define HAVE_MEMORY_H 1
@@ -82,37 +118,65 @@
 /* Define to 1 if you have the <string.h> header file. */
 #define HAVE_STRING_H 1
 
+/* Define to 1 if you have the <sys/ioctl.h> header file. */
+#define HAVE_SYS_IOCTL_H 1
+
+/* Define to 1 if you have the <sys/param.h> header file. */
+#define HAVE_SYS_PARAM_H 1
+
 /* Define to 1 if you have the <sys/stat.h> header file. */
 #define HAVE_SYS_STAT_H 1
 
 /* Define to 1 if you have the <sys/types.h> header file. */
 #define HAVE_SYS_TYPES_H 1
 
+/* Define to 1 if you have the <termios.h> header file. */
+#define HAVE_TERMIOS_H 1
+
+/* Define to 1 if typeof works with your compiler. */
+#define HAVE_TYPEOF 1
+
 /* Define to 1 if you have the <unistd.h> header file. */
 #define HAVE_UNISTD_H 1
 
+/* Define to 1 if you have the <x86intrin.h> header file. */
+/* #undef HAVE_X86INTRIN_H */
+
 /* Define as const if the declaration of iconv() needs const. */
 #define ICONV_CONST 
 
+/* Define to the sub-directory where libtool stores uninstalled libraries. */
+#define LT_OBJDIR ".libs/"
+
 /* Name of package */
 #define PACKAGE "flac"
 
 /* Define to the address where bug reports for this package should be sent. */
-#define PACKAGE_BUGREPORT ""
+#define PACKAGE_BUGREPORT "flac-dev@xiph.org"
 
 /* Define to the full name of this package. */
-#define PACKAGE_NAME ""
+#define PACKAGE_NAME "flac"
 
 /* Define to the full name and version of this package. */
-#define PACKAGE_STRING ""
+#define PACKAGE_STRING "flac 1.3.2"
 
 /* Define to the one symbol short name of this package. */
-#define PACKAGE_TARNAME ""
+#define PACKAGE_TARNAME "flac"
+
+/* Define to the home page for this package. */
+#define PACKAGE_URL "https://www.xiph.org/flac/"
 
 /* Define to the version of this package. */
-#define PACKAGE_VERSION ""
+#define PACKAGE_VERSION "1.3.2"
 
-/* The size of a `void*', as computed by sizeof. */
+/* The size of `off_t', as computed by sizeof. */
+#if __LP64__
+#define SIZEOF_OFF_T 8
+#else
+#define SIZEOF_OFF_T 4
+#endif
+
+/* The size of `void*', as computed by sizeof. */
 #if __LP64__
 #define SIZEOF_VOIDP 8
 #else
@@ -122,18 +186,63 @@
 /* Define to 1 if you have the ANSI C header files. */
 #define STDC_HEADERS 1
 
-/* Version number of package */
-#define VERSION "1.3.1"
+/* Enable extensions on AIX 3, Interix.  */
+#ifndef _ALL_SOURCE
+# define _ALL_SOURCE 1
+#endif
+/* Enable GNU extensions on systems that have them.  */
+#ifndef _GNU_SOURCE
+# define _GNU_SOURCE 1
+#endif
+/* Enable threading extensions on Solaris.  */
+#ifndef _POSIX_PTHREAD_SEMANTICS
+# define _POSIX_PTHREAD_SEMANTICS 1
+#endif
+/* Enable extensions on HP NonStop.  */
+#ifndef _TANDEM_SOURCE
+# define _TANDEM_SOURCE 1
+#endif
+/* Enable general extensions on Solaris.  */
+#ifndef __EXTENSIONS__
+# define __EXTENSIONS__ 1
+#endif
 
-/* Define to 1 if your processor stores words with the most significant byte
-   first (like Motorola and SPARC, unlike Intel and VAX). */
-/* #undef WORDS_BIGENDIAN */
+
+/* Version number of package */
+#define VERSION "1.3.2"
+
+/* Target processor is big endian. */
+#define WORDS_BIGENDIAN 0
+
+/* Enable large inode numbers on Mac OS X 10.5.  */
+#ifndef _DARWIN_USE_64_BIT_INODE
+# define _DARWIN_USE_64_BIT_INODE 1
+#endif
 
 /* Number of bits in a file offset, on hosts where this is settable. */
-/* #define _FILE_OFFSET_BITS 64 */
+/* #undef _FILE_OFFSET_BITS */
 
 /* Define to 1 to make fseeko visible on some hosts (e.g. glibc 2.2). */
 /* #undef _LARGEFILE_SOURCE */
 
 /* Define for large files, on AIX-style hosts. */
 /* #undef _LARGE_FILES */
+
+/* Define to 1 if on MINIX. */
+/* #undef _MINIX */
+
+/* Define to 2 if the system does not provide POSIX.1 features except with
+   this defined. */
+/* #undef _POSIX_1_SOURCE */
+
+/* Define to 1 if you need to in order for `stat' and other things to work. */
+/* #undef _POSIX_SOURCE */
+
+/* Define to `__inline__' or `__inline' if that's what the C compiler
+   calls it, or to nothing if 'inline' is not supported under any name.  */
+#ifndef __cplusplus
+/* #undef inline */
+#endif
+
+/* Define to __typeof__ if your compiler spells it that way. */
+/* #undef typeof */
diff --git a/include/FLAC/all.h b/include/FLAC/all.h
new file mode 100644
index 0000000..11d47d7
--- /dev/null
+++ b/include/FLAC/all.h
@@ -0,0 +1,371 @@
+/* libFLAC - Free Lossless Audio Codec library
+ * Copyright (C) 2000-2009  Josh Coalson
+ * Copyright (C) 2011-2016  Xiph.Org Foundation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * - Neither the name of the Xiph.org Foundation nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef FLAC__ALL_H
+#define FLAC__ALL_H
+
+#include "export.h"
+
+#include "assert.h"
+#include "callback.h"
+#include "format.h"
+#include "metadata.h"
+#include "ordinals.h"
+#include "stream_decoder.h"
+#include "stream_encoder.h"
+
+/** \mainpage
+ *
+ * \section intro Introduction
+ *
+ * This is the documentation for the FLAC C and C++ APIs.  It is
+ * highly interconnected; this introduction should give you a top
+ * level idea of the structure and how to find the information you
+ * need.  As a prerequisite you should have at least a basic
+ * knowledge of the FLAC format, documented
+ * <A HREF="../format.html">here</A>.
+ *
+ * \section c_api FLAC C API
+ *
+ * The FLAC C API is the interface to libFLAC, a set of structures
+ * describing the components of FLAC streams, and functions for
+ * encoding and decoding streams, as well as manipulating FLAC
+ * metadata in files.  The public include files will be installed
+ * in your include area (for example /usr/include/FLAC/...).
+ *
+ * By writing a little code and linking against libFLAC, it is
+ * relatively easy to add FLAC support to another program.  The
+ * library is licensed under <A HREF="../license.html">Xiph's BSD license</A>.
+ * Complete source code of libFLAC as well as the command-line
+ * encoder and plugins is available and is a useful source of
+ * examples.
+ *
+ * Aside from encoders and decoders, libFLAC provides a powerful
+ * metadata interface for manipulating metadata in FLAC files.  It
+ * allows the user to add, delete, and modify FLAC metadata blocks
+ * and it can automatically take advantage of PADDING blocks to avoid
+ * rewriting the entire FLAC file when changing the size of the
+ * metadata.
+ *
+ * libFLAC usually only requires the standard C library and C math
+ * library. In particular, threading is not used so there is no
+ * dependency on a thread library. However, libFLAC does not use
+ * global variables and should be thread-safe.
+ *
+ * libFLAC also supports encoding to and decoding from Ogg FLAC.
+ * However the metadata editing interfaces currently have limited
+ * read-only support for Ogg FLAC files.
+ *
+ * \section cpp_api FLAC C++ API
+ *
+ * The FLAC C++ API is a set of classes that encapsulate the
+ * structures and functions in libFLAC.  They provide slightly more
+ * functionality with respect to metadata but are otherwise
+ * equivalent.  For the most part, they share the same usage as
+ * their counterparts in libFLAC, and the FLAC C API documentation
+ * can be used as a supplement.  The public include files
+ * for the C++ API will be installed in your include area (for
+ * example /usr/include/FLAC++/...).
+ *
+ * libFLAC++ is also licensed under
+ * <A HREF="../license.html">Xiph's BSD license</A>.
+ *
+ * \section getting_started Getting Started
+ *
+ * A good starting point for learning the API is to browse through
+ * the <A HREF="modules.html">modules</A>.  Modules are logical
+ * groupings of related functions or classes, which correspond roughly
+ * to header files or sections of header files.  Each module includes a
+ * detailed description of the general usage of its functions or
+ * classes.
+ *
+ * From there you can go on to look at the documentation of
+ * individual functions.  You can see different views of the individual
+ * functions through the links in top bar across this page.
+ *
+ * If you prefer a more hands-on approach, you can jump right to some
+ * <A HREF="../documentation_example_code.html">example code</A>.
+ *
+ * \section porting_guide Porting Guide
+ *
+ * Starting with FLAC 1.1.3 a \link porting Porting Guide \endlink
+ * has been introduced which gives detailed instructions on how to
+ * port your code to newer versions of FLAC.
+ *
+ * \section embedded_developers Embedded Developers
+ *
+ * libFLAC has grown larger over time as more functionality has been
+ * included, but much of it may be unnecessary for a particular embedded
+ * implementation.  Unused parts may be pruned by some simple editing of
+ * src/libFLAC/Makefile.am.  In general, the decoders, encoders, and
+ * metadata interface are all independent from each other.
+ *
+ * It is easiest to just describe the dependencies:
+ *
+ * - All modules depend on the \link flac_format Format \endlink module.
+ * - The decoders and encoders depend on the bitbuffer.
+ * - The decoder is independent of the encoder.  The encoder uses the
+ *   decoder because of the verify feature, but this can be removed if
+ *   not needed.
+ * - Parts of the metadata interface require the stream decoder (but not
+ *   the encoder).
+ * - Ogg support is selectable through the compile time macro
+ *   \c FLAC__HAS_OGG.
+ *
+ * For example, if your application only requires the stream decoder, no
+ * encoder, and no metadata interface, you can remove the stream encoder
+ * and the metadata interface, which will greatly reduce the size of the
+ * library.
+ *
+ * Also, there are several places in the libFLAC code with comments marked
+ * with "OPT:" where a #define can be changed to enable code that might be
+ * faster on a specific platform.  Experimenting with these can yield faster
+ * binaries.
+ */
+
+/** \defgroup porting Porting Guide for New Versions
+ *
+ * This module describes differences in the library interfaces from
+ * version to version.  It assists in the porting of code that uses
+ * the libraries to newer versions of FLAC.
+ *
+ * One simple facility for making porting easier that has been added
+ * in FLAC 1.1.3 is a set of \c #defines in \c export.h of each
+ * library's includes (e.g. \c include/FLAC/export.h).  The
+ * \c #defines mirror the libraries'
+ * <A HREF="http://www.gnu.org/software/libtool/manual/libtool.html#Libtool-versioning">libtool version numbers</A>,
+ * e.g. in libFLAC there are \c FLAC_API_VERSION_CURRENT,
+ * \c FLAC_API_VERSION_REVISION, and \c FLAC_API_VERSION_AGE.
+ * These can be used to support multiple versions of an API during the
+ * transition phase, e.g.
+ *
+ * \code
+ * #if !defined(FLAC_API_VERSION_CURRENT) || FLAC_API_VERSION_CURRENT <= 7
+ *   legacy code
+ * #else
+ *   new code
+ * #endif
+ * \endcode
+ *
+ * The source will work for multiple versions and the legacy code can
+ * easily be removed when the transition is complete.
+ *
+ * Another available symbol is FLAC_API_SUPPORTS_OGG_FLAC (defined in
+ * include/FLAC/export.h), which can be used to determine whether or not
+ * the library has been compiled with support for Ogg FLAC.  This is
+ * simpler than trying to call an Ogg init function and catching the
+ * error.
+ */
+
+/** \defgroup porting_1_1_2_to_1_1_3 Porting from FLAC 1.1.2 to 1.1.3
+ *  \ingroup porting
+ *
+ *  \brief
+ *  This module describes porting from FLAC 1.1.2 to FLAC 1.1.3.
+ *
+ * The main change between the APIs in 1.1.2 and 1.1.3 is that they have
+ * been simplified.  First, libOggFLAC has been merged into libFLAC and
+ * libOggFLAC++ has been merged into libFLAC++.  Second, both the three
+ * decoding layers and three encoding layers have been merged into a
+ * single stream decoder and stream encoder.  That is, the functionality
+ * of FLAC__SeekableStreamDecoder and FLAC__FileDecoder has been merged
+ * into FLAC__StreamDecoder, and FLAC__SeekableStreamEncoder and
+ * FLAC__FileEncoder into FLAC__StreamEncoder.  Only the
+ * FLAC__StreamDecoder and FLAC__StreamEncoder remain.  What this means
+ * is there is now a single API that can be used to encode or decode
+ * streams to/from native FLAC or Ogg FLAC and the single API can work
+ * on both seekable and non-seekable streams.
+ *
+ * Instead of creating an encoder or decoder of a certain layer, now the
+ * client will always create a FLAC__StreamEncoder or
+ * FLAC__StreamDecoder.  The old layers are now differentiated by the
+ * initialization function.  For example, for the decoder,
+ * FLAC__stream_decoder_init() has been replaced by
+ * FLAC__stream_decoder_init_stream().  This init function takes
+ * callbacks for the I/O, and the seeking callbacks are optional.  This
+ * allows the client to use the same object for seekable and
+ * non-seekable streams.  For decoding a FLAC file directly, the client
+ * can use FLAC__stream_decoder_init_file() and pass just a filename
+ * and fewer callbacks; most of the other callbacks are supplied
+ * internally.  For situations where fopen()ing by filename is not
+ * possible (e.g. Unicode filenames on Windows) the client can instead
+ * open the file itself and supply the FILE* to
+ * FLAC__stream_decoder_init_FILE().  The init functions now returns a
+ * FLAC__StreamDecoderInitStatus instead of FLAC__StreamDecoderState.
+ * Since the callbacks and client data are now passed to the init
+ * function, the FLAC__stream_decoder_set_*_callback() functions and
+ * FLAC__stream_decoder_set_client_data() are no longer needed.  The
+ * rest of the calls to the decoder are the same as before.
+ *
+ * There are counterpart init functions for Ogg FLAC, e.g.
+ * FLAC__stream_decoder_init_ogg_stream().  All the rest of the calls
+ * and callbacks are the same as for native FLAC.
+ *
+ * As an example, in FLAC 1.1.2 a seekable stream decoder would have
+ * been set up like so:
+ *
+ * \code
+ * FLAC__SeekableStreamDecoder *decoder = FLAC__seekable_stream_decoder_new();
+ * if(decoder == NULL) do_something;
+ * FLAC__seekable_stream_decoder_set_md5_checking(decoder, true);
+ * [... other settings ...]
+ * FLAC__seekable_stream_decoder_set_read_callback(decoder, my_read_callback);
+ * FLAC__seekable_stream_decoder_set_seek_callback(decoder, my_seek_callback);
+ * FLAC__seekable_stream_decoder_set_tell_callback(decoder, my_tell_callback);
+ * FLAC__seekable_stream_decoder_set_length_callback(decoder, my_length_callback);
+ * FLAC__seekable_stream_decoder_set_eof_callback(decoder, my_eof_callback);
+ * FLAC__seekable_stream_decoder_set_write_callback(decoder, my_write_callback);
+ * FLAC__seekable_stream_decoder_set_metadata_callback(decoder, my_metadata_callback);
+ * FLAC__seekable_stream_decoder_set_error_callback(decoder, my_error_callback);
+ * FLAC__seekable_stream_decoder_set_client_data(decoder, my_client_data);
+ * if(FLAC__seekable_stream_decoder_init(decoder) != FLAC__SEEKABLE_STREAM_DECODER_OK) do_something;
+ * \endcode
+ *
+ * In FLAC 1.1.3 it is like this:
+ *
+ * \code
+ * FLAC__StreamDecoder *decoder = FLAC__stream_decoder_new();
+ * if(decoder == NULL) do_something;
+ * FLAC__stream_decoder_set_md5_checking(decoder, true);
+ * [... other settings ...]
+ * if(FLAC__stream_decoder_init_stream(
+ *   decoder,
+ *   my_read_callback,
+ *   my_seek_callback,      // or NULL
+ *   my_tell_callback,      // or NULL
+ *   my_length_callback,    // or NULL
+ *   my_eof_callback,       // or NULL
+ *   my_write_callback,
+ *   my_metadata_callback,  // or NULL
+ *   my_error_callback,
+ *   my_client_data
+ * ) != FLAC__STREAM_DECODER_INIT_STATUS_OK) do_something;
+ * \endcode
+ *
+ * or you could do;
+ *
+ * \code
+ * [...]
+ * FILE *file = fopen("somefile.flac","rb");
+ * if(file == NULL) do_somthing;
+ * if(FLAC__stream_decoder_init_FILE(
+ *   decoder,
+ *   file,
+ *   my_write_callback,
+ *   my_metadata_callback,  // or NULL
+ *   my_error_callback,
+ *   my_client_data
+ * ) != FLAC__STREAM_DECODER_INIT_STATUS_OK) do_something;
+ * \endcode
+ *
+ * or just:
+ *
+ * \code
+ * [...]
+ * if(FLAC__stream_decoder_init_file(
+ *   decoder,
+ *   "somefile.flac",
+ *   my_write_callback,
+ *   my_metadata_callback,  // or NULL
+ *   my_error_callback,
+ *   my_client_data
+ * ) != FLAC__STREAM_DECODER_INIT_STATUS_OK) do_something;
+ * \endcode
+ *
+ * Another small change to the decoder is in how it handles unparseable
+ * streams.  Before, when the decoder found an unparseable stream
+ * (reserved for when the decoder encounters a stream from a future
+ * encoder that it can't parse), it changed the state to
+ * \c FLAC__STREAM_DECODER_UNPARSEABLE_STREAM.  Now the decoder instead
+ * drops sync and calls the error callback with a new error code
+ * \c FLAC__STREAM_DECODER_ERROR_STATUS_UNPARSEABLE_STREAM.  This is
+ * more robust.  If your error callback does not discriminate on the the
+ * error state, your code does not need to be changed.
+ *
+ * The encoder now has a new setting:
+ * FLAC__stream_encoder_set_apodization().  This is for setting the
+ * method used to window the data before LPC analysis.  You only need to
+ * add a call to this function if the default is not suitable.   There
+ * are also two new convenience functions that may be useful:
+ * FLAC__metadata_object_cuesheet_calculate_cddb_id() and
+ * FLAC__metadata_get_cuesheet().
+ *
+ * The \a bytes parameter to FLAC__StreamDecoderReadCallback,
+ * FLAC__StreamEncoderReadCallback, and FLAC__StreamEncoderWriteCallback
+ * is now \c size_t instead of \c unsigned.
+ */
+
+/** \defgroup porting_1_1_3_to_1_1_4 Porting from FLAC 1.1.3 to 1.1.4
+ *  \ingroup porting
+ *
+ *  \brief
+ *  This module describes porting from FLAC 1.1.3 to FLAC 1.1.4.
+ *
+ * There were no changes to any of the interfaces from 1.1.3 to 1.1.4.
+ * There was a slight change in the implementation of
+ * FLAC__stream_encoder_set_metadata(); the function now makes a copy
+ * of the \a metadata array of pointers so the client no longer needs
+ * to maintain it after the call.  The objects themselves that are
+ * pointed to by the array are still not copied though and must be
+ * maintained until the call to FLAC__stream_encoder_finish().
+ */
+
+/** \defgroup porting_1_1_4_to_1_2_0 Porting from FLAC 1.1.4 to 1.2.0
+ *  \ingroup porting
+ *
+ *  \brief
+ *  This module describes porting from FLAC 1.1.4 to FLAC 1.2.0.
+ *
+ * There were only very minor changes to the interfaces from 1.1.4 to 1.2.0.
+ * In libFLAC, \c FLAC__format_sample_rate_is_subset() was added.
+ * In libFLAC++, \c FLAC::Decoder::Stream::get_decode_position() was added.
+ *
+ * Finally, value of the constant \c FLAC__FRAME_HEADER_RESERVED_LEN
+ * has changed to reflect the conversion of one of the reserved bits
+ * into active use.  It used to be \c 2 and now is \c 1.  However the
+ * FLAC frame header length has not changed, so to skip the proper
+ * number of bits, use \c FLAC__FRAME_HEADER_RESERVED_LEN +
+ * \c FLAC__FRAME_HEADER_BLOCKING_STRATEGY_LEN
+ */
+
+/** \defgroup flac FLAC C API
+ *
+ * The FLAC C API is the interface to libFLAC, a set of structures
+ * describing the components of FLAC streams, and functions for
+ * encoding and decoding streams, as well as manipulating FLAC
+ * metadata in files.
+ *
+ * You should start with the format components as all other modules
+ * are dependent on it.
+ */
+
+#endif
diff --git a/include/FLAC/assert.h b/include/FLAC/assert.h
index dc9bcef..b546fd0 100644
--- a/include/FLAC/assert.h
+++ b/include/FLAC/assert.h
@@ -1,6 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
  * Copyright (C) 2001-2009  Josh Coalson
- * Copyright (C) 2011-2014  Xiph.Org Foundation
+ * Copyright (C) 2011-2016  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
diff --git a/include/FLAC/callback.h b/include/FLAC/callback.h
index ce8787f..f942dd2 100644
--- a/include/FLAC/callback.h
+++ b/include/FLAC/callback.h
@@ -1,6 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
  * Copyright (C) 2004-2009  Josh Coalson
- * Copyright (C) 2011-2014  Xiph.Org Foundation
+ * Copyright (C) 2011-2016  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
diff --git a/include/FLAC/export.h b/include/FLAC/export.h
index 9cc9e13..d52f0bb 100644
--- a/include/FLAC/export.h
+++ b/include/FLAC/export.h
@@ -1,6 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
  * Copyright (C) 2000-2009  Josh Coalson
- * Copyright (C) 2011-2014  Xiph.Org Foundation
+ * Copyright (C) 2011-2016  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
diff --git a/include/FLAC/format.h b/include/FLAC/format.h
index 7424565..c087d4a 100644
--- a/include/FLAC/format.h
+++ b/include/FLAC/format.h
@@ -1,6 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
  * Copyright (C) 2000-2009  Josh Coalson
- * Copyright (C) 2011-2014  Xiph.Org Foundation
+ * Copyright (C) 2011-2016  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -512,8 +512,8 @@
 	FLAC__METADATA_TYPE_UNDEFINED = 7,
 	/**< marker to denote beginning of undefined type range; this number will increase as new metadata types are added */
 
-        FLAC__MAX_METADATA_TYPE = FLAC__MAX_METADATA_TYPE_CODE,
-        /**< No type will ever be greater than this. There is not enough room in the protocol block. */
+	FLAC__MAX_METADATA_TYPE = FLAC__MAX_METADATA_TYPE_CODE,
+	/**< No type will ever be greater than this. There is not enough room in the protocol block. */
 } FLAC__MetadataType;
 
 /** Maps a FLAC__MetadataType to a C string.
diff --git a/include/FLAC/metadata.h b/include/FLAC/metadata.h
index b8ad8b4..4e18cd6 100644
--- a/include/FLAC/metadata.h
+++ b/include/FLAC/metadata.h
@@ -1,6 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
  * Copyright (C) 2001-2009  Josh Coalson
- * Copyright (C) 2011-2014  Xiph.Org Foundation
+ * Copyright (C) 2011-2016  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
diff --git a/include/FLAC/ordinals.h b/include/FLAC/ordinals.h
index b1e1acf..ea52ea6 100644
--- a/include/FLAC/ordinals.h
+++ b/include/FLAC/ordinals.h
@@ -1,6 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
  * Copyright (C) 2000-2009  Josh Coalson
- * Copyright (C) 2011-2014  Xiph.Org Foundation
+ * Copyright (C) 2011-2016  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
diff --git a/include/FLAC/stream_decoder.h b/include/FLAC/stream_decoder.h
index 9bfdd1f..39c958d 100644
--- a/include/FLAC/stream_decoder.h
+++ b/include/FLAC/stream_decoder.h
@@ -1,6 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
  * Copyright (C) 2000-2009  Josh Coalson
- * Copyright (C) 2011-2014  Xiph.Org Foundation
+ * Copyright (C) 2011-2016  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -228,7 +228,7 @@
 	 */
 
 	FLAC__STREAM_DECODER_ABORTED,
-	/**< The decoder was aborted by the read callback. */
+	/**< The decoder was aborted by the read or write callback. */
 
 	FLAC__STREAM_DECODER_MEMORY_ALLOCATION_ERROR,
 	/**< An error occurred allocating memory.  The decoder is in an invalid
diff --git a/include/FLAC/stream_encoder.h b/include/FLAC/stream_encoder.h
index efc213a..40a2fd3 100644
--- a/include/FLAC/stream_encoder.h
+++ b/include/FLAC/stream_encoder.h
@@ -1,6 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
  * Copyright (C) 2000-2009  Josh Coalson
- * Copyright (C) 2011-2014  Xiph.Org Foundation
+ * Copyright (C) 2011-2016  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
diff --git a/include/share/alloc.h b/include/share/alloc.h
index 3b70749..914de9b 100644
--- a/include/share/alloc.h
+++ b/include/share/alloc.h
@@ -1,6 +1,6 @@
 /* alloc - Convenience routines for safely allocating memory
  * Copyright (C) 2007-2009  Josh Coalson
- * Copyright (C) 2011-2014  Xiph.Org Foundation
+ * Copyright (C) 2011-2016  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -52,7 +52,7 @@
 # ifndef SIZE_T_MAX
 #  ifdef _MSC_VER
 #   ifdef _WIN64
-#    define SIZE_T_MAX 0xffffffffffffffffui64
+#    define SIZE_T_MAX FLAC__U64L(0xffffffffffffffff)
 #   else
 #    define SIZE_T_MAX 0xffffffff
 #   endif
diff --git a/include/share/compat.h b/include/share/compat.h
index 671b110..2083f3a 100644
--- a/include/share/compat.h
+++ b/include/share/compat.h
@@ -1,5 +1,5 @@
 /* libFLAC - Free Lossless Audio Codec library
- * Copyright (C) 2012-2014  Xiph.org Foundation
+ * Copyright (C) 2012-2016  Xiph.org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -117,7 +117,9 @@
 #endif
 
 #if defined _MSC_VER
-#  if _MSC_VER >= 1600
+#  if _MSC_VER >= 1800
+#    include <inttypes.h>
+#  elif _MSC_VER >= 1600
 /* Visual Studio 2010 has decent C99 support */
 #    include <stdint.h>
 #    define PRIu64 "llu"
@@ -144,23 +146,26 @@
 
 #ifdef _WIN32
 /* All char* strings are in UTF-8 format. Added to support Unicode files on Windows */
-#include "share/win_utf8_io.h"
 
+#include "share/win_utf8_io.h"
 #define flac_printf printf_utf8
 #define flac_fprintf fprintf_utf8
 #define flac_vfprintf vfprintf_utf8
-#define flac_fopen fopen_utf8
-#define flac_chmod chmod_utf8
-#define flac_utime utime_utf8
-#define flac_unlink unlink_utf8
-#define flac_rename rename_utf8
-#define flac_stat _stat64_utf8
+
+#include "share/windows_unicode_filenames.h"
+#define flac_fopen flac_internal_fopen_utf8
+#define flac_chmod flac_internal_chmod_utf8
+#define flac_utime flac_internal_utime_utf8
+#define flac_unlink flac_internal_unlink_utf8
+#define flac_rename flac_internal_rename_utf8
+#define flac_stat flac_internal_stat64_utf8
 
 #else
 
 #define flac_printf printf
 #define flac_fprintf fprintf
 #define flac_vfprintf vfprintf
+
 #define flac_fopen fopen
 #define flac_chmod chmod
 #define flac_utime utime
diff --git a/include/share/endswap.h b/include/share/endswap.h
index 4fde4c1..9088a74 100644
--- a/include/share/endswap.h
+++ b/include/share/endswap.h
@@ -1,5 +1,5 @@
 /* libFLAC - Free Lossless Audio Codec library
- * Copyright (C) 2012-2014  Xiph.org Foundation
+ * Copyright (C) 2012-2016  Xiph.org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -43,11 +43,15 @@
 
 #define	ENDSWAP_16(x)		(__builtin_bswap16 (x))
 #define	ENDSWAP_32(x)		(__builtin_bswap32 (x))
+#define	ENDSWAP_64(x)		(__builtin_bswap64 (x))
 
-#elif defined _MSC_VER		/* Windows. Apparently in <stdlib.h>. */
+#elif defined _MSC_VER		/* Windows */
+
+#include <stdlib.h>
 
 #define	ENDSWAP_16(x)		(_byteswap_ushort (x))
 #define	ENDSWAP_32(x)		(_byteswap_ulong (x))
+#define	ENDSWAP_64(x)		(_byteswap_uint64 (x))
 
 #elif defined HAVE_BYTESWAP_H		/* Linux */
 
@@ -55,16 +59,18 @@
 
 #define	ENDSWAP_16(x)		(bswap_16 (x))
 #define	ENDSWAP_32(x)		(bswap_32 (x))
+#define	ENDSWAP_64(x)		(bswap_64 (x))
 
 #else
 
 #define	ENDSWAP_16(x)		((((x) >> 8) & 0xFF) | (((x) & 0xFF) << 8))
 #define	ENDSWAP_32(x)		((((x) >> 24) & 0xFF) | (((x) >> 8) & 0xFF00) | (((x) & 0xFF00) << 8) | (((x) & 0xFF) << 24))
+#define	ENDSWAP_64(x)		((ENDSWAP_32(((x) >> 32) & 0xFFFFFFFF)) | (ENDSWAP_32((x) & 0xFFFFFFFF) << 32))
 
 #endif
 
 
-/* Host to little-endian byte swapping. */
+/* Host to little-endian byte swapping (for MD5 calculation) */
 #if CPU_IS_BIG_ENDIAN
 
 #define H2LE_16(x)		ENDSWAP_16 (x)
diff --git a/include/share/getopt.h b/include/share/getopt.h
new file mode 100644
index 0000000..66aced0
--- /dev/null
+++ b/include/share/getopt.h
@@ -0,0 +1,184 @@
+/*
+	NOTE:
+	I cannot get the vanilla getopt code to work (i.e. compile only what
+	is needed and not duplicate symbols found in the standard library)
+	on all the platforms that FLAC supports.  In particular the gating
+	of code with the ELIDE_CODE #define is not accurate enough on systems
+	that are POSIX but not glibc.  If someone has a patch that works on
+	GNU/Linux, Darwin, AND Solaris please submit it on the project page:
+		https://sourceforge.net/p/flac/patches/
+
+	In the meantime I have munged the global symbols and removed gates
+	around code, while at the same time trying to touch the original as
+	little as possible.
+*/
+/* Declarations for getopt.
+   Copyright (C) 1989,90,91,92,93,94,96,97,98 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Library General Public License as
+   published by the Free Software Foundation; either version 2 of the
+   License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Library General Public License for more details.
+
+   You should have received a copy of the GNU Library General Public
+   License along with the GNU C Library; see the file COPYING.LIB.  If not,
+   write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+   Boston, MA 02110-1301, USA.  */
+
+#ifndef SHARE__GETOPT_H
+#define SHARE__GETOPT_H
+
+/*[JEC] was:#ifndef __need_getopt*/
+/*[JEC] was:# define _GETOPT_H 1*/
+/*[JEC] was:#endif*/
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/* For communication from `share__getopt' to the caller.
+   When `share__getopt' finds an option that takes an argument,
+   the argument value is returned here.
+   Also, when `ordering' is RETURN_IN_ORDER,
+   each non-option ARGV-element is returned here.  */
+
+extern char *share__optarg;
+
+/* Index in ARGV of the next element to be scanned.
+   This is used for communication to and from the caller
+   and for communication between successive calls to `share__getopt'.
+
+   On entry to `share__getopt', zero means this is the first call; initialize.
+
+   When `share__getopt' returns -1, this is the index of the first of the
+   non-option elements that the caller should itself scan.
+
+   Otherwise, `share__optind' communicates from one call to the next
+   how much of ARGV has been scanned so far.  */
+
+extern int share__optind;
+
+/* Callers store zero here to inhibit the error message `share__getopt' prints
+   for unrecognized options.  */
+
+extern int share__opterr;
+
+/* Set to an option character which was unrecognized.  */
+
+extern int share__optopt;
+
+/*[JEC] was:#ifndef __need_getopt */
+/* Describe the long-named options requested by the application.
+   The LONG_OPTIONS argument to share__getopt_long or share__getopt_long_only is a vector
+   of `struct share__option' terminated by an element containing a name which is
+   zero.
+
+   The field `has_arg' is:
+   share__no_argument		(or 0) if the option does not take an argument,
+   share__required_argument	(or 1) if the option requires an argument,
+   share__optional_argument 	(or 2) if the option takes an optional argument.
+
+   If the field `flag' is not NULL, it points to a variable that is set
+   to the value given in the field `val' when the option is found, but
+   left unchanged if the option is not found.
+
+   To have a long-named option do something other than set an `int' to
+   a compiled-in constant, such as set a value from `share__optarg', set the
+   option's `flag' field to zero and its `val' field to a nonzero
+   value (the equivalent single-letter option character, if there is
+   one).  For long options that have a zero `flag' field, `share__getopt'
+   returns the contents of the `val' field.  */
+
+struct share__option
+{
+# if defined __STDC__ && __STDC__
+  const char *name;
+# else
+  char *name;
+# endif
+  /* has_arg can't be an enum because some compilers complain about
+     type mismatches in all the code that assumes it is an int.  */
+  int has_arg;
+  int *flag;
+  int val;
+};
+
+/* Names for the values of the `has_arg' field of `struct share__option'.  */
+
+# define share__no_argument		0
+# define share__required_argument	1
+# define share__optional_argument	2
+/*[JEC] was:#endif*/	/* need getopt */
+
+
+/* Get definitions and prototypes for functions to process the
+   arguments in ARGV (ARGC of them, minus the program name) for
+   options given in OPTS.
+
+   Return the option character from OPTS just read.  Return -1 when
+   there are no more options.  For unrecognized options, or options
+   missing arguments, `share__optopt' is set to the option letter, and '?' is
+   returned.
+
+   The OPTS string is a list of characters which are recognized option
+   letters, optionally followed by colons, specifying that that letter
+   takes an argument, to be placed in `share__optarg'.
+
+   If a letter in OPTS is followed by two colons, its argument is
+   optional.  This behavior is specific to the GNU `share__getopt'.
+
+   The argument `--' causes premature termination of argument
+   scanning, explicitly telling `share__getopt' that there are no more
+   options.
+
+   If OPTS begins with `--', then non-option arguments are treated as
+   arguments to the option '\0'.  This behavior is specific to the GNU
+   `share__getopt'.  */
+
+/*[JEC] was:#if defined __STDC__ && __STDC__*/
+/*[JEC] was:# ifdef __GNU_LIBRARY__*/
+/* Many other libraries have conflicting prototypes for getopt, with
+   differences in the consts, in stdlib.h.  To avoid compilation
+   errors, only prototype getopt for the GNU C library.  */
+extern int share__getopt (int argc, char *const *argv, const char *shortopts);
+/*[JEC] was:# else*/ /* not __GNU_LIBRARY__ */
+/*[JEC] was:extern int getopt ();*/
+/*[JEC] was:# endif*/ /* __GNU_LIBRARY__ */
+
+/*[JEC] was:# ifndef __need_getopt*/
+extern int share__getopt_long (int argc, char *const *argv, const char *shortopts,
+		        const struct share__option *longopts, int *longind);
+extern int share__getopt_long_only (int argc, char *const *argv,
+			     const char *shortopts,
+		             const struct share__option *longopts, int *longind);
+
+/* Internal only.  Users should not call this directly.  */
+extern int share___getopt_internal (int argc, char *const *argv,
+			     const char *shortopts,
+		             const struct share__option *longopts, int *longind,
+			     int long_only);
+/*[JEC] was:# endif*/
+/*[JEC] was:#else*/ /* not __STDC__ */
+/*[JEC] was:extern int getopt ();*/
+/*[JEC] was:# ifndef __need_getopt*/
+/*[JEC] was:extern int getopt_long ();*/
+/*[JEC] was:extern int getopt_long_only ();*/
+
+/*[JEC] was:extern int _getopt_internal ();*/
+/*[JEC] was:# endif*/
+/*[JEC] was:#endif*/ /* __STDC__ */
+
+#ifdef	__cplusplus
+}
+#endif
+
+/* Make sure we later can get all the definitions and declarations.  */
+/*[JEC] was:#undef __need_getopt*/
+
+#endif /* getopt.h */
diff --git a/include/share/grabbag.h b/include/share/grabbag.h
new file mode 100644
index 0000000..92ec998
--- /dev/null
+++ b/include/share/grabbag.h
@@ -0,0 +1,30 @@
+/* grabbag - Convenience lib for various routines common to several tools
+ * Copyright (C) 2002-2009  Josh Coalson
+ * Copyright (C) 2011-2016  Xiph.Org Foundation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#ifndef SHARE__GRABBAG_H
+#define SHARE__GRABBAG_H
+
+/* These can't be included by themselves, only from within grabbag.h */
+#include "grabbag/cuesheet.h"
+#include "grabbag/file.h"
+#include "grabbag/picture.h"
+#include "grabbag/replaygain.h"
+#include "grabbag/seektable.h"
+
+#endif
diff --git a/include/share/grabbag/cuesheet.h b/include/share/grabbag/cuesheet.h
new file mode 100644
index 0000000..b465ae6
--- /dev/null
+++ b/include/share/grabbag/cuesheet.h
@@ -0,0 +1,43 @@
+/* grabbag - Convenience lib for various routines common to several tools
+ * Copyright (C) 2002-2009  Josh Coalson
+ * Copyright (C) 2011-2016  Xiph.Org Foundation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+/* This .h cannot be included by itself; #include "share/grabbag.h" instead. */
+
+#ifndef GRABBAG__CUESHEET_H
+#define GRABBAG__CUESHEET_H
+
+#include <stdio.h>
+#include "FLAC/metadata.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+unsigned grabbag__cuesheet_msf_to_frame(unsigned minutes, unsigned seconds, unsigned frames);
+void grabbag__cuesheet_frame_to_msf(unsigned frame, unsigned *minutes, unsigned *seconds, unsigned *frames);
+
+FLAC__StreamMetadata *grabbag__cuesheet_parse(FILE *file, const char **error_message, unsigned *last_line_read, unsigned sample_rate, FLAC__bool is_cdda, FLAC__uint64 lead_out_offset);
+
+void grabbag__cuesheet_emit(FILE *file, const FLAC__StreamMetadata *cuesheet, const char *file_reference);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/include/share/grabbag/file.h b/include/share/grabbag/file.h
new file mode 100644
index 0000000..b3f4148
--- /dev/null
+++ b/include/share/grabbag/file.h
@@ -0,0 +1,65 @@
+/* grabbag - Convenience lib for various routines common to several tools
+ * Copyright (C) 2002-2009  Josh Coalson
+ * Copyright (C) 2011-2016  Xiph.Org Foundation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+/* Convenience routines for manipulating files */
+
+/* This .h cannot be included by itself; #include "share/grabbag.h" instead. */
+
+#ifndef GRABAG__FILE_H
+#define GRABAG__FILE_H
+
+/* needed because of off_t */
+#ifdef HAVE_CONFIG_H
+#  include <config.h>
+#endif
+
+#include <sys/types.h> /* for off_t */
+#include <stdio.h> /* for FILE */
+#include "FLAC/ordinals.h"
+#include "share/compat.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void grabbag__file_copy_metadata(const char *srcpath, const char *destpath);
+FLAC__off_t grabbag__file_get_filesize(const char *srcpath);
+const char *grabbag__file_get_basename(const char *srcpath);
+
+/* read_only == false means "make file writable by user"
+ * read_only == true means "make file read-only for everyone"
+ */
+FLAC__bool grabbag__file_change_stats(const char *filename, FLAC__bool read_only);
+
+/* returns true iff stat() succeeds for both files and they have the same device and inode. */
+/* on windows, uses GetFileInformationByHandle() to compare */
+FLAC__bool grabbag__file_are_same(const char *f1, const char *f2);
+
+/* attempts to make writable before unlinking */
+FLAC__bool grabbag__file_remove_file(const char *filename);
+
+/* these will forcibly set stdin/stdout to binary mode (for OSes that require it) */
+FILE *grabbag__file_get_binary_stdin(void);
+FILE *grabbag__file_get_binary_stdout(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/include/share/grabbag/picture.h b/include/share/grabbag/picture.h
new file mode 100644
index 0000000..ea308f1
--- /dev/null
+++ b/include/share/grabbag/picture.h
@@ -0,0 +1,54 @@
+/* grabbag - Convenience lib for various routines common to several tools
+ * Copyright (C) 2006-2009  Josh Coalson
+ * Copyright (C) 2011-2016  Xiph.Org Foundation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+/* This .h cannot be included by itself; #include "share/grabbag.h" instead. */
+
+#ifndef GRABBAG__PICTURE_H
+#define GRABBAG__PICTURE_H
+
+#include "FLAC/metadata.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* spec should be of the form "[TYPE]|MIME_TYPE|[DESCRIPTION]|[WIDTHxHEIGHTxDEPTH[/COLORS]]|FILE", e.g.
+ *   "|image/jpeg|||cover.jpg"
+ *   "4|image/jpeg||300x300x24|backcover.jpg"
+ *   "|image/png|description|300x300x24/71|cover.png"
+ *   "-->|image/gif||300x300x24/71|http://blah.blah.blah/cover.gif"
+ *
+ * empty type means default to FLAC__STREAM_METADATA_PICTURE_TYPE_FRONT_COVER
+ * empty resolution spec means to get from the file (cannot get used with "-->" linked images)
+ * spec and error_message must not be NULL
+ */
+FLAC__StreamMetadata *grabbag__picture_parse_specification(const char *spec, const char **error_message);
+
+typedef struct PictureResolution
+{	uint32_t width, height, depth, colors ;
+} PictureResolution ;
+
+FLAC__StreamMetadata *grabbag__picture_from_specification(int type, const char *mime_type, const char * description,
+		const PictureResolution * res, const char * filepath, const char **error_message);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/include/share/grabbag/replaygain.h b/include/share/grabbag/replaygain.h
new file mode 100644
index 0000000..faa3272
--- /dev/null
+++ b/include/share/grabbag/replaygain.h
@@ -0,0 +1,73 @@
+/* grabbag - Convenience lib for various routines common to several tools
+ * Copyright (C) 2002-2009  Josh Coalson
+ * Copyright (C) 2011-2016  Xiph.Org Foundation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+/*
+ * This wraps the replaygain_analysis lib, which is LGPL.  This wrapper
+ * allows analysis of different input resolutions by automatically
+ * scaling the input signal
+ */
+
+/* This .h cannot be included by itself; #include "share/grabbag.h" instead. */
+
+#ifndef GRABBAG__REPLAYGAIN_H
+#define GRABBAG__REPLAYGAIN_H
+
+#include "FLAC/metadata.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern const unsigned GRABBAG__REPLAYGAIN_MAX_TAG_SPACE_REQUIRED;
+
+extern const FLAC__byte * const GRABBAG__REPLAYGAIN_TAG_REFERENCE_LOUDNESS; /* = "REPLAYGAIN_REFERENCE_LOUDNESS" */
+extern const FLAC__byte * const GRABBAG__REPLAYGAIN_TAG_TITLE_GAIN; /* = "REPLAYGAIN_TRACK_GAIN" */
+extern const FLAC__byte * const GRABBAG__REPLAYGAIN_TAG_TITLE_PEAK; /* = "REPLAYGAIN_TRACK_PEAK" */
+extern const FLAC__byte * const GRABBAG__REPLAYGAIN_TAG_ALBUM_GAIN; /* = "REPLAYGAIN_ALBUM_GAIN" */
+extern const FLAC__byte * const GRABBAG__REPLAYGAIN_TAG_ALBUM_PEAK; /* = "REPLAYGAIN_ALBUM_PEAK" */
+
+FLAC__bool grabbag__replaygain_is_valid_sample_frequency(unsigned sample_frequency);
+
+FLAC__bool grabbag__replaygain_init(unsigned sample_frequency);
+
+/* 'bps' must be valid for FLAC, i.e. >=4 and <= 32 */
+FLAC__bool grabbag__replaygain_analyze(const FLAC__int32 * const input[], FLAC__bool is_stereo, unsigned bps, unsigned samples);
+
+void grabbag__replaygain_get_album(float *gain, float *peak);
+void grabbag__replaygain_get_title(float *gain, float *peak);
+
+/* These three functions return an error string on error, or NULL if successful */
+const char *grabbag__replaygain_analyze_file(const char *filename, float *title_gain, float *title_peak);
+const char *grabbag__replaygain_store_to_vorbiscomment(FLAC__StreamMetadata *block, float album_gain, float album_peak, float title_gain, float title_peak);
+const char *grabbag__replaygain_store_to_vorbiscomment_reference(FLAC__StreamMetadata *block);
+const char *grabbag__replaygain_store_to_vorbiscomment_album(FLAC__StreamMetadata *block, float album_gain, float album_peak);
+const char *grabbag__replaygain_store_to_vorbiscomment_title(FLAC__StreamMetadata *block, float title_gain, float title_peak);
+const char *grabbag__replaygain_store_to_file(const char *filename, float album_gain, float album_peak, float title_gain, float title_peak, FLAC__bool preserve_modtime);
+const char *grabbag__replaygain_store_to_file_reference(const char *filename, FLAC__bool preserve_modtime);
+const char *grabbag__replaygain_store_to_file_album(const char *filename, float album_gain, float album_peak, FLAC__bool preserve_modtime);
+const char *grabbag__replaygain_store_to_file_title(const char *filename, float title_gain, float title_peak, FLAC__bool preserve_modtime);
+
+FLAC__bool grabbag__replaygain_load_from_vorbiscomment(const FLAC__StreamMetadata *block, FLAC__bool album_mode, FLAC__bool strict, double *reference, double *gain, double *peak);
+double grabbag__replaygain_compute_scale_factor(double peak, double gain, double preamp, FLAC__bool prevent_clipping);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/include/share/grabbag/seektable.h b/include/share/grabbag/seektable.h
new file mode 100644
index 0000000..ac294a3
--- /dev/null
+++ b/include/share/grabbag/seektable.h
@@ -0,0 +1,39 @@
+/* grabbag - Convenience lib for various routines common to several tools
+ * Copyright (C) 2002-2009  Josh Coalson
+ * Copyright (C) 2011-2016  Xiph.Org Foundation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+/* Convenience routines for working with seek tables */
+
+/* This .h cannot be included by itself; #include "share/grabbag.h" instead. */
+
+#ifndef GRABAG__SEEKTABLE_H
+#define GRABAG__SEEKTABLE_H
+
+#include "FLAC/format.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+FLAC__bool grabbag__seektable_convert_specification_to_template(const char *spec, FLAC__bool only_explicit_placeholders, FLAC__uint64 total_samples_to_encode, unsigned sample_rate, FLAC__StreamMetadata *seektable_template, FLAC__bool *spec_has_real_points);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/include/share/macros.h b/include/share/macros.h
new file mode 100644
index 0000000..20b3ea5
--- /dev/null
+++ b/include/share/macros.h
@@ -0,0 +1,45 @@
+/* libFLAC - Free Lossless Audio Codec library
+ * Copyright (C) 2013-2016  Xiph.org Foundation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * - Neither the name of the Xiph.org Foundation nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <errno.h>
+
+/* FLAC_CHECK_RETURN : Check the return value of the provided function and
+ * print an error message if it fails (ie returns a value < 0).
+ *
+ * Ideally, a library should not print anything, but this macro is only used
+ * for things that extremely unlikely to fail, like `chown` to a previoulsy
+ * saved `uid`.
+ */
+
+#define FLAC_CHECK_RETURN(x) \
+			{	if ((x) < 0) \
+					fprintf (stderr, "%s : %s\n", #x, strerror (errno)) ; \
+			}
diff --git a/include/share/private.h b/include/share/private.h
index 3a500d3..f7e3b85 100644
--- a/include/share/private.h
+++ b/include/share/private.h
@@ -1,5 +1,5 @@
 /* libFLAC - Free Lossless Audio Codec library
- * Copyright (C) 2013-2014  Xiph.org Foundation
+ * Copyright (C) 2013-2016  Xiph.org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
diff --git a/include/share/replaygain_analysis.h b/include/share/replaygain_analysis.h
new file mode 100644
index 0000000..f06a9b2
--- /dev/null
+++ b/include/share/replaygain_analysis.h
@@ -0,0 +1,59 @@
+/*
+ *  ReplayGainAnalysis - analyzes input samples and give the recommended dB change
+ *  Copyright (C) 2001 David Robinson and Glen Sawyer
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ *
+ *  This library is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public
+ *  License along with this library; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ *  concept and filter values by David Robinson (David@Robinson.org)
+ *    -- blame him if you think the idea is flawed
+ *  coding by Glen Sawyer (glensawyer@hotmail.com) 442 N 700 E, Provo, UT 84606 USA
+ *    -- blame him if you think this runs too slowly, or the coding is otherwise flawed
+ *  minor cosmetic tweaks to integrate with FLAC by Josh Coalson
+ *
+ *  For an explanation of the concepts and the basic algorithms involved, go to:
+ *    http://www.replaygain.org/
+ */
+
+#ifndef GAIN_ANALYSIS_H
+#define GAIN_ANALYSIS_H
+
+#include <stddef.h>
+
+#define GAIN_NOT_ENOUGH_SAMPLES  -24601
+#define GAIN_ANALYSIS_ERROR           0
+#define GAIN_ANALYSIS_OK              1
+
+#define INIT_GAIN_ANALYSIS_ERROR      0
+#define INIT_GAIN_ANALYSIS_OK         1
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef float   flac_float_t;         /* Type used for filtering */
+
+extern flac_float_t ReplayGainReferenceLoudness; /* in dB SPL, currently == 89.0 */
+
+int     InitGainAnalysis ( long samplefreq );
+int     ValidGainFrequency ( long samplefreq );
+int     AnalyzeSamples   ( const flac_float_t* left_samples, const flac_float_t* right_samples, size_t num_samples, int num_channels );
+flac_float_t GetTitleGain     ( void );
+flac_float_t GetAlbumGain     ( void );
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* GAIN_ANALYSIS_H */
diff --git a/include/share/replaygain_synthesis.h b/include/share/replaygain_synthesis.h
new file mode 100644
index 0000000..5f4c3ff
--- /dev/null
+++ b/include/share/replaygain_synthesis.h
@@ -0,0 +1,52 @@
+/* replaygain_synthesis - Routines for applying ReplayGain to a signal
+ * Copyright (C) 2002-2009  Josh Coalson
+ * Copyright (C) 2011-2016  Xiph.Org Foundation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#ifndef FLAC__SHARE__REPLAYGAIN_SYNTHESIS_H
+#define FLAC__SHARE__REPLAYGAIN_SYNTHESIS_H
+
+#include <stdlib.h> /* for size_t */
+#include "FLAC/format.h"
+
+#define FLAC_SHARE__MAX_SUPPORTED_CHANNELS FLAC__MAX_CHANNELS
+
+typedef enum {
+	NOISE_SHAPING_NONE = 0,
+	NOISE_SHAPING_LOW = 1,
+	NOISE_SHAPING_MEDIUM = 2,
+	NOISE_SHAPING_HIGH = 3
+} NoiseShaping;
+
+typedef struct {
+	const float*  FilterCoeff;
+	FLAC__uint64  Mask;
+	double        Add;
+	float         Dither;
+	float         ErrorHistory     [FLAC_SHARE__MAX_SUPPORTED_CHANNELS] [16];  /* 16th order Noise shaping */
+	float         DitherHistory    [FLAC_SHARE__MAX_SUPPORTED_CHANNELS] [16];
+	int           LastRandomNumber [FLAC_SHARE__MAX_SUPPORTED_CHANNELS];
+	unsigned      LastHistoryIndex;
+	NoiseShaping  ShapingType;
+} DitherContext;
+
+void FLAC__replaygain_synthesis__init_dither_context(DitherContext *dither, int bits, int shapingtype);
+
+/* scale = (float) pow(10., (double)replaygain * 0.05); */
+size_t FLAC__replaygain_synthesis__apply_gain(FLAC__byte *data_out, FLAC__bool little_endian_data_out, FLAC__bool unsigned_data_out, const FLAC__int32 * const input[], unsigned wide_samples, unsigned channels, const unsigned source_bps, const unsigned target_bps, const double scale, const FLAC__bool hard_limit, FLAC__bool do_dithering, DitherContext *dither_context);
+
+#endif
diff --git a/include/share/safe_str.h b/include/share/safe_str.h
new file mode 100644
index 0000000..eb974c5
--- /dev/null
+++ b/include/share/safe_str.h
@@ -0,0 +1,69 @@
+/* libFLAC - Free Lossless Audio Codec library
+ * Copyright (C) 2013-2016  Xiph.org Foundation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * - Neither the name of the Xiph.org Foundation nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* Safe string handling functions to replace things like strcpy, strncpy,
+ * strcat, strncat etc.
+ * All of these functions guarantee a correctly NUL terminated string but
+ * the string may be truncated if the destination buffer was too short.
+ */
+
+#ifndef FLAC__SHARE_SAFE_STR_H
+#define FLAC__SHARE_SAFE_STR_H
+
+static inline char *
+safe_strncat(char *dest, const char *src, size_t dest_size)
+{
+	char * ret;
+
+	if (dest_size < 1)
+		return dest;
+
+	ret = strncat(dest, src, dest_size - strlen (dest));
+	dest [dest_size - 1] = 0;
+
+	return ret;
+}
+
+static inline char *
+safe_strncpy(char *dest, const char *src, size_t dest_size)
+{
+	char * ret;
+
+	if (dest_size < 1)
+		return dest;
+
+	ret = strncpy(dest, src, dest_size);
+	dest [dest_size - 1] = 0;
+
+	return ret;
+}
+
+#endif /* FLAC__SHARE_SAFE_STR_H */
diff --git a/include/share/utf8.h b/include/share/utf8.h
new file mode 100644
index 0000000..7d6650d
--- /dev/null
+++ b/include/share/utf8.h
@@ -0,0 +1,25 @@
+#ifndef SHARE__UTF8_H
+#define SHARE__UTF8_H
+
+/*
+ * Convert a string between UTF-8 and the locale's charset.
+ * Invalid bytes are replaced by '#', and characters that are
+ * not available in the target encoding are replaced by '?'.
+ *
+ * If the locale's charset is not set explicitly then it is
+ * obtained using nl_langinfo(CODESET), where available, the
+ * environment variable CHARSET, or assumed to be US-ASCII.
+ *
+ * Return value of conversion functions:
+ *
+ *  -1 : memory allocation failed
+ *   0 : data was converted exactly
+ *   1 : valid data was converted approximately (using '?')
+ *   2 : input was invalid (but still converted, using '#')
+ *   3 : unknown encoding (but still converted, using '?')
+ */
+
+int utf8_encode(const char *from, char **to);
+int utf8_decode(const char *from, char **to);
+
+#endif
diff --git a/include/share/win_utf8_io.h b/include/share/win_utf8_io.h
new file mode 100644
index 0000000..13fd118
--- /dev/null
+++ b/include/share/win_utf8_io.h
@@ -0,0 +1,58 @@
+/* libFLAC - Free Lossless Audio Codec library
+ * Copyright (C) 2013-2016  Xiph.Org Foundation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * - Neither the name of the Xiph.org Foundation nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef _WIN32
+
+#ifndef flac__win_utf8_io_h
+#define flac__win_utf8_io_h
+
+#include <stdio.h>
+#include <stdarg.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+size_t strlen_utf8(const char *str);
+int win_get_console_width(void);
+
+int get_utf8_argv(int *argc, char ***argv);
+
+int printf_utf8(const char *format, ...);
+int fprintf_utf8(FILE *stream, const char *format, ...);
+int vfprintf_utf8(FILE *stream, const char *format, va_list argptr);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif
+#endif
diff --git a/include/share/windows_unicode_filenames.h b/include/share/windows_unicode_filenames.h
new file mode 100644
index 0000000..86820ca
--- /dev/null
+++ b/include/share/windows_unicode_filenames.h
@@ -0,0 +1,67 @@
+/* libFLAC - Free Lossless Audio Codec library
+ * Copyright (C) 2013-2016  Xiph.Org Foundation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * - Neither the name of the Xiph.org Foundation nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef _WIN32
+
+#ifndef flac__windows_unicode_filenames_h
+#define flac__windows_unicode_filenames_h
+
+#include <stdio.h>
+#include <sys/stat.h>
+#include <sys/utime.h>
+#include "FLAC/ordinals.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void flac_internal_set_utf8_filenames(FLAC__bool flag);
+FLAC__bool flac_internal_get_utf8_filenames(void);
+#define flac_set_utf8_filenames flac_internal_set_utf8_filenames
+#define flac_get_utf8_filenames flac_internal_get_utf8_filenames
+
+FILE* flac_internal_fopen_utf8(const char *filename, const char *mode);
+int flac_internal_stat64_utf8(const char *path, struct __stat64 *buffer);
+int flac_internal_chmod_utf8(const char *filename, int pmode);
+int flac_internal_utime_utf8(const char *filename, struct utimbuf *times);
+int flac_internal_unlink_utf8(const char *filename);
+int flac_internal_rename_utf8(const char *oldname, const char *newname);
+
+#include <windows.h>
+HANDLE WINAPI flac_internal_CreateFile_utf8(const char *lpFileName, DWORD dwDesiredAccess, DWORD dwShareMode, LPSECURITY_ATTRIBUTES lpSecurityAttributes, DWORD dwCreationDisposition, DWORD dwFlagsAndAttributes, HANDLE hTemplateFile);
+#define CreateFile_utf8 flac_internal_CreateFile_utf8
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif
+#endif
diff --git a/libFLAC/Android.bp b/libFLAC/Android.bp
index eb6198c..d1b51f1 100644
--- a/libFLAC/Android.bp
+++ b/libFLAC/Android.bp
@@ -41,6 +41,7 @@
         "-funroll-loops",
         "-finline-functions",
         "-Werror",
+        "-Wno-unused-parameter",
     ],
 
     arch: {
diff --git a/libFLAC/bitmath.c b/libFLAC/bitmath.c
index 5b58ca9..b3d797d 100644
--- a/libFLAC/bitmath.c
+++ b/libFLAC/bitmath.c
@@ -1,6 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
  * Copyright (C) 2001-2009  Josh Coalson
- * Copyright (C) 2011-2014  Xiph.Org Foundation
+ * Copyright (C) 2011-2016  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -60,50 +60,14 @@
  * silog2(  9) = 5
  * silog2( 10) = 5
  */
-unsigned FLAC__bitmath_silog2(int v)
+unsigned FLAC__bitmath_silog2(FLAC__int64 v)
 {
-	while(1) {
-		if(v == 0) {
-			return 0;
-		}
-		else if(v > 0) {
-			unsigned l = 0;
-			while(v) {
-				l++;
-				v >>= 1;
-			}
-			return l+1;
-		}
-		else if(v == -1) {
-			return 2;
-		}
-		else {
-			v++;
-			v = -v;
-		}
-	}
-}
+	if(v == 0)
+		return 0;
 
-unsigned FLAC__bitmath_silog2_wide(FLAC__int64 v)
-{
-	while(1) {
-		if(v == 0) {
-			return 0;
-		}
-		else if(v > 0) {
-			unsigned l = 0;
-			while(v) {
-				l++;
-				v >>= 1;
-			}
-			return l+1;
-		}
-		else if(v == -1) {
-			return 2;
-		}
-		else {
-			v++;
-			v = -v;
-		}
-	}
+	if(v == -1)
+		return 2;
+
+	v = (v < 0) ? (-(v+1)) : v;
+	return FLAC__bitmath_ilog2_wide(v)+2;
 }
diff --git a/libFLAC/bitreader.c b/libFLAC/bitreader.c
index 67d0f37..ab62d41 100644
--- a/libFLAC/bitreader.c
+++ b/libFLAC/bitreader.c
@@ -1,6 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
  * Copyright (C) 2000-2009  Josh Coalson
- * Copyright (C) 2011-2014  Xiph.Org Foundation
+ * Copyright (C) 2011-2016  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -45,18 +45,43 @@
 #include "share/endswap.h"
 
 /* Things should be fastest when this matches the machine word size */
-/* WATCHOUT: if you change this you must also change the following #defines down to FLAC__clz_uint32 below to match */
-/* WATCHOUT: there are a few places where the code will not work unless uint32_t is >= 32 bits wide */
+/* WATCHOUT: if you change this you must also change the following #defines down to COUNT_ZERO_MSBS2 below to match */
+/* WATCHOUT: there are a few places where the code will not work unless brword is >= 32 bits wide */
 /*           also, some sections currently only have fast versions for 4 or 8 bytes per word */
-#define FLAC__BYTES_PER_WORD 4		/* sizeof uint32_t */
-#define FLAC__BITS_PER_WORD (8 * FLAC__BYTES_PER_WORD)
+
+#if (ENABLE_64_BIT_WORDS == 0)
+
+typedef FLAC__uint32 brword;
+#define FLAC__BYTES_PER_WORD 4		/* sizeof brword */
+#define FLAC__BITS_PER_WORD 32
 #define FLAC__WORD_ALL_ONES ((FLAC__uint32)0xffffffff)
-/* SWAP_BE_WORD_TO_HOST swaps bytes in a uint32_t (which is always big-endian) if necessary to match host byte order */
+/* SWAP_BE_WORD_TO_HOST swaps bytes in a brword (which is always big-endian) if necessary to match host byte order */
 #if WORDS_BIGENDIAN
 #define SWAP_BE_WORD_TO_HOST(x) (x)
 #else
 #define SWAP_BE_WORD_TO_HOST(x) ENDSWAP_32(x)
 #endif
+/* counts the # of zero MSBs in a word */
+#define COUNT_ZERO_MSBS(word) FLAC__clz_uint32(word)
+#define COUNT_ZERO_MSBS2(word) FLAC__clz2_uint32(word)
+
+#else
+
+typedef FLAC__uint64 brword;
+#define FLAC__BYTES_PER_WORD 8		/* sizeof brword */
+#define FLAC__BITS_PER_WORD 64
+#define FLAC__WORD_ALL_ONES ((FLAC__uint64)FLAC__U64L(0xffffffffffffffff))
+/* SWAP_BE_WORD_TO_HOST swaps bytes in a brword (which is always big-endian) if necessary to match host byte order */
+#if WORDS_BIGENDIAN
+#define SWAP_BE_WORD_TO_HOST(x) (x)
+#else
+#define SWAP_BE_WORD_TO_HOST(x) ENDSWAP_64(x)
+#endif
+/* counts the # of zero MSBs in a word */
+#define COUNT_ZERO_MSBS(word) FLAC__clz_uint64(word)
+#define COUNT_ZERO_MSBS2(word) FLAC__clz2_uint64(word)
+
+#endif
 
 /*
  * This should be at least twice as large as the largest number of words
@@ -77,7 +102,7 @@
 struct FLAC__BitReader {
 	/* any partially-consumed word at the head will stay right-justified as bits are consumed from the left */
 	/* any incomplete word at the tail will be left-justified, and bytes from the read callback are added on the right */
-	uint32_t *buffer;
+	brword *buffer;
 	unsigned capacity; /* in words */
 	unsigned words; /* # of completed words in buffer */
 	unsigned bytes; /* # of bytes in incomplete word at buffer[words] */
@@ -89,7 +114,7 @@
 	void *client_data;
 };
 
-static inline void crc16_update_word_(FLAC__BitReader *br, uint32_t word)
+static inline void crc16_update_word_(FLAC__BitReader *br, brword word)
 {
 	register unsigned crc = br->read_crc16;
 #if FLAC__BYTES_PER_WORD == 4
@@ -142,7 +167,7 @@
 		return false; /* no space left, buffer is too small; see note for FLAC__BITREADER_DEFAULT_CAPACITY  */
 	target = ((FLAC__byte*)(br->buffer+br->words)) + br->bytes;
 
-	/* before reading, if the existing reader looks like this (say uint32_t is 32 bits wide)
+	/* before reading, if the existing reader looks like this (say brword is 32 bits wide)
 	 *   bitstream :  11 22 33 44 55            br->words=1 br->bytes=1 (partial tail word is left-justified)
 	 *   buffer[BE]:  11 22 33 44 55 ?? ?? ??   (shown layed out as bytes sequentially in memory)
 	 *   buffer[LE]:  44 33 22 11 ?? ?? ?? 55   (?? being don't-care)
@@ -175,7 +200,7 @@
 	 */
 #if WORDS_BIGENDIAN
 #else
-	end = (br->words*FLAC__BYTES_PER_WORD + br->bytes + bytes + (FLAC__BYTES_PER_WORD-1)) / FLAC__BYTES_PER_WORD;
+	end = (br->words*FLAC__BYTES_PER_WORD + br->bytes + (unsigned)bytes + (FLAC__BYTES_PER_WORD-1)) / FLAC__BYTES_PER_WORD;
 	for(start = br->words; start < end; start++)
 		br->buffer[start] = SWAP_BE_WORD_TO_HOST(br->buffer[start]);
 #endif
@@ -186,7 +211,7 @@
 	 *   buffer[LE]:  44 33 22 11 88 77 66 55 CC BB AA 99 ?? FF EE DD
 	 * finally we'll update the reader values:
 	 */
-	end = br->words*FLAC__BYTES_PER_WORD + br->bytes + bytes;
+	end = br->words*FLAC__BYTES_PER_WORD + br->bytes + (unsigned)bytes;
 	br->words = end / FLAC__BYTES_PER_WORD;
 	br->bytes = end % FLAC__BYTES_PER_WORD;
 
@@ -236,7 +261,7 @@
 	br->words = br->bytes = 0;
 	br->consumed_words = br->consumed_bits = 0;
 	br->capacity = FLAC__BITREADER_DEFAULT_CAPACITY;
-	br->buffer = malloc(sizeof(uint32_t) * br->capacity);
+	br->buffer = malloc(sizeof(brword) * br->capacity);
 	if(br->buffer == 0)
 		return false;
 	br->read_callback = rcb;
@@ -281,7 +306,7 @@
 				if(i < br->consumed_words || (i == br->consumed_words && j < br->consumed_bits))
 					fprintf(out, ".");
 				else
-					fprintf(out, "%01u", br->buffer[i] & (1 << (FLAC__BITS_PER_WORD-j-1)) ? 1:0);
+					fprintf(out, "%01u", br->buffer[i] & ((brword)1 << (FLAC__BITS_PER_WORD-j-1)) ? 1:0);
 			fprintf(out, "\n");
 		}
 		if(br->bytes > 0) {
@@ -290,7 +315,7 @@
 				if(i < br->consumed_words || (i == br->consumed_words && j < br->consumed_bits))
 					fprintf(out, ".");
 				else
-					fprintf(out, "%01u", br->buffer[i] & (1 << (br->bytes*8-j-1)) ? 1:0);
+					fprintf(out, "%01u", br->buffer[i] & ((brword)1 << (br->bytes*8-j-1)) ? 1:0);
 			fprintf(out, "\n");
 		}
 	}
@@ -315,7 +340,7 @@
 
 	/* CRC any tail bytes in a partially-consumed word */
 	if(br->consumed_bits) {
-		const uint32_t tail = br->buffer[br->consumed_words];
+		const brword tail = br->buffer[br->consumed_words];
 		for( ; br->crc16_align < br->consumed_bits; br->crc16_align += 8)
 			br->read_crc16 = FLAC__CRC16_UPDATE((unsigned)((tail >> (FLAC__BITS_PER_WORD-8-br->crc16_align)) & 0xff), br->read_crc16);
 	}
@@ -363,33 +388,34 @@
 		if(br->consumed_bits) {
 			/* this also works when consumed_bits==0, it's just a little slower than necessary for that case */
 			const unsigned n = FLAC__BITS_PER_WORD - br->consumed_bits;
-			const uint32_t word = br->buffer[br->consumed_words];
+			const brword word = br->buffer[br->consumed_words];
 			if(bits < n) {
-				*val = (word & (FLAC__WORD_ALL_ONES >> br->consumed_bits)) >> (n-bits);
+				*val = (FLAC__uint32)((word & (FLAC__WORD_ALL_ONES >> br->consumed_bits)) >> (n-bits)); /* The result has <= 32 non-zero bits */
 				br->consumed_bits += bits;
 				return true;
 			}
-			*val = word & (FLAC__WORD_ALL_ONES >> br->consumed_bits);
+			/* (FLAC__BITS_PER_WORD - br->consumed_bits <= bits) ==> (FLAC__WORD_ALL_ONES >> br->consumed_bits) has no more than 'bits' non-zero bits */
+			*val = (FLAC__uint32)(word & (FLAC__WORD_ALL_ONES >> br->consumed_bits));
 			bits -= n;
 			crc16_update_word_(br, word);
 			br->consumed_words++;
 			br->consumed_bits = 0;
 			if(bits) { /* if there are still bits left to read, there have to be less than 32 so they will all be in the next word */
 				*val <<= bits;
-				*val |= (br->buffer[br->consumed_words] >> (FLAC__BITS_PER_WORD-bits));
+				*val |= (FLAC__uint32)(br->buffer[br->consumed_words] >> (FLAC__BITS_PER_WORD-bits));
 				br->consumed_bits = bits;
 			}
 			return true;
 		}
-		else {
-			const uint32_t word = br->buffer[br->consumed_words];
+		else { /* br->consumed_bits == 0 */
+			const brword word = br->buffer[br->consumed_words];
 			if(bits < FLAC__BITS_PER_WORD) {
-				*val = word >> (FLAC__BITS_PER_WORD-bits);
+				*val = (FLAC__uint32)(word >> (FLAC__BITS_PER_WORD-bits));
 				br->consumed_bits = bits;
 				return true;
 			}
-			/* at this point 'bits' must be == FLAC__BITS_PER_WORD; because of previous assertions, it can't be larger */
-			*val = word;
+			/* at this point bits == FLAC__BITS_PER_WORD == 32; because of previous assertions, it can't be larger */
+			*val = (FLAC__uint32)word;
 			crc16_update_word_(br, word);
 			br->consumed_words++;
 			return true;
@@ -404,12 +430,12 @@
 		if(br->consumed_bits) {
 			/* this also works when consumed_bits==0, it's just a little slower than necessary for that case */
 			FLAC__ASSERT(br->consumed_bits + bits <= br->bytes*8);
-			*val = (br->buffer[br->consumed_words] & (FLAC__WORD_ALL_ONES >> br->consumed_bits)) >> (FLAC__BITS_PER_WORD-br->consumed_bits-bits);
+			*val = (FLAC__uint32)((br->buffer[br->consumed_words] & (FLAC__WORD_ALL_ONES >> br->consumed_bits)) >> (FLAC__BITS_PER_WORD-br->consumed_bits-bits));
 			br->consumed_bits += bits;
 			return true;
 		}
 		else {
-			*val = br->buffer[br->consumed_words] >> (FLAC__BITS_PER_WORD-bits);
+			*val = (FLAC__uint32)(br->buffer[br->consumed_words] >> (FLAC__BITS_PER_WORD-bits));
 			br->consumed_bits += bits;
 			return true;
 		}
@@ -565,7 +591,7 @@
 	/* step 2: read whole words in chunks */
 	while(nvals >= FLAC__BYTES_PER_WORD) {
 		if(br->consumed_words < br->words) {
-			const uint32_t word = br->buffer[br->consumed_words++];
+			const brword word = br->buffer[br->consumed_words++];
 #if FLAC__BYTES_PER_WORD == 4
 			val[0] = (FLAC__byte)(word >> 24);
 			val[1] = (FLAC__byte)(word >> 16);
@@ -630,9 +656,9 @@
 	*val = 0;
 	while(1) {
 		while(br->consumed_words < br->words) { /* if we've not consumed up to a partial tail word... */
-			uint32_t b = br->buffer[br->consumed_words] << br->consumed_bits;
+			brword b = br->buffer[br->consumed_words] << br->consumed_bits;
 			if(b) {
-				i = FLAC__clz_uint32(b);
+				i = COUNT_ZERO_MSBS(b);
 				*val += i;
 				i++;
 				br->consumed_bits += i;
@@ -660,9 +686,9 @@
 		 */
 		if(br->bytes*8 > br->consumed_bits) {
 			const unsigned end = br->bytes * 8;
-			uint32_t b = (br->buffer[br->consumed_words] & (FLAC__WORD_ALL_ONES << (FLAC__BITS_PER_WORD-end))) << br->consumed_bits;
+			brword b = (br->buffer[br->consumed_words] & (FLAC__WORD_ALL_ONES << (FLAC__BITS_PER_WORD-end))) << br->consumed_bits;
 			if(b) {
-				i = FLAC__clz_uint32(b);
+				i = COUNT_ZERO_MSBS(b);
 				*val += i;
 				i++;
 				br->consumed_bits += i;
@@ -717,7 +743,7 @@
 	 * bitreader functions that use them, and before returning */
 	unsigned cwords, words, lsbs, msbs, x, y;
 	unsigned ucbits; /* keep track of the number of unconsumed bits in word */
-	uint32_t b;
+	brword b;
 	int *val, *end;
 
 	FLAC__ASSERT(0 != br);
@@ -758,7 +784,7 @@
 
 	while(val < end) {
 		/* read the unary MSBs and end bit */
-		x = y = FLAC__clz2_uint32(b);
+		x = y = COUNT_ZERO_MSBS2(b);
 		if(x == FLAC__BITS_PER_WORD) {
 			x = ucbits;
 			do {
@@ -767,7 +793,7 @@
 				if (cwords >= words)
 					goto incomplete_msbs;
 				b = br->buffer[cwords];
-				y = FLAC__clz2_uint32(b);
+				y = COUNT_ZERO_MSBS2(b);
 				x += y;
 			} while(y == FLAC__BITS_PER_WORD);
 		}
@@ -777,7 +803,7 @@
 		msbs = x;
 
 		/* read the binary LSBs */
-		x = b >> (FLAC__BITS_PER_WORD - parameter);
+		x = (FLAC__uint32)(b >> (FLAC__BITS_PER_WORD - parameter)); /* parameter < 32, so we can cast to 32-bit unsigned */
 		if(parameter <= ucbits) {
 			ucbits -= parameter;
 			b <<= parameter;
@@ -788,7 +814,7 @@
 				goto incomplete_lsbs;
 			b = br->buffer[cwords];
 			ucbits += FLAC__BITS_PER_WORD - parameter;
-			x |= b >> ucbits;
+			x |= (FLAC__uint32)(b >> ucbits);
 			b <<= FLAC__BITS_PER_WORD - ucbits;
 		}
 		lsbs = x;
diff --git a/libFLAC/bitwriter.c b/libFLAC/bitwriter.c
index dcdd93e..402b1c4 100644
--- a/libFLAC/bitwriter.c
+++ b/libFLAC/bitwriter.c
@@ -1,6 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
  * Copyright (C) 2000-2009  Josh Coalson
- * Copyright (C) 2011-2014  Xiph.Org Foundation
+ * Copyright (C) 2011-2016  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -46,33 +46,50 @@
 
 /* Things should be fastest when this matches the machine word size */
 /* WATCHOUT: if you change this you must also change the following #defines down to SWAP_BE_WORD_TO_HOST below to match */
-/* WATCHOUT: there are a few places where the code will not work unless uint32_t is >= 32 bits wide */
-#define FLAC__BYTES_PER_WORD 4
+/* WATCHOUT: there are a few places where the code will not work unless bwword is >= 32 bits wide */
+
+#if (ENABLE_64_BIT_WORDS == 0)
+
+typedef FLAC__uint32 bwword;
+#define FLAC__BYTES_PER_WORD 4		/* sizeof bwword */
 #define FLAC__BITS_PER_WORD 32
-#define FLAC__WORD_ALL_ONES ((FLAC__uint32)0xffffffff)
-/* SWAP_BE_WORD_TO_HOST swaps bytes in a uint32_t (which is always big-endian) if necessary to match host byte order */
+/* SWAP_BE_WORD_TO_HOST swaps bytes in a bwword (which is always big-endian) if necessary to match host byte order */
 #if WORDS_BIGENDIAN
 #define SWAP_BE_WORD_TO_HOST(x) (x)
 #else
 #define SWAP_BE_WORD_TO_HOST(x) ENDSWAP_32(x)
 #endif
 
+#else
+
+typedef FLAC__uint64 bwword;
+#define FLAC__BYTES_PER_WORD 8		/* sizeof bwword */
+#define FLAC__BITS_PER_WORD 64
+/* SWAP_BE_WORD_TO_HOST swaps bytes in a bwword (which is always big-endian) if necessary to match host byte order */
+#if WORDS_BIGENDIAN
+#define SWAP_BE_WORD_TO_HOST(x) (x)
+#else
+#define SWAP_BE_WORD_TO_HOST(x) ENDSWAP_64(x)
+#endif
+
+#endif
+
 /*
  * The default capacity here doesn't matter too much.  The buffer always grows
  * to hold whatever is written to it.  Usually the encoder will stop adding at
  * a frame or metadata block, then write that out and clear the buffer for the
  * next one.
  */
-static const unsigned FLAC__BITWRITER_DEFAULT_CAPACITY = 32768u / sizeof(uint32_t); /* size in words */
+static const unsigned FLAC__BITWRITER_DEFAULT_CAPACITY = 32768u / sizeof(bwword); /* size in words */
 /* When growing, increment 4K at a time */
-static const unsigned FLAC__BITWRITER_DEFAULT_INCREMENT = 4096u / sizeof(uint32_t); /* size in words */
+static const unsigned FLAC__BITWRITER_DEFAULT_INCREMENT = 4096u / sizeof(bwword); /* size in words */
 
 #define FLAC__WORDS_TO_BITS(words) ((words) * FLAC__BITS_PER_WORD)
 #define FLAC__TOTAL_BITS(bw) (FLAC__WORDS_TO_BITS((bw)->words) + (bw)->bits)
 
 struct FLAC__BitWriter {
-	uint32_t *buffer;
-	uint32_t accum; /* accumulator; bits are right-justified; when full, accum is appended to buffer */
+	bwword *buffer;
+	bwword accum; /* accumulator; bits are right-justified; when full, accum is appended to buffer */
 	unsigned capacity; /* capacity of buffer in words */
 	unsigned words; /* # of complete words in buffer */
 	unsigned bits; /* # of used bits in accum */
@@ -85,7 +102,7 @@
 FLAC__bool bitwriter_grow_(FLAC__BitWriter *bw, unsigned bits_to_add)
 {
 	unsigned new_capacity;
-	uint32_t *new_buffer;
+	bwword *new_buffer;
 
 	FLAC__ASSERT(0 != bw);
 	FLAC__ASSERT(0 != bw->buffer);
@@ -107,7 +124,7 @@
 	FLAC__ASSERT(new_capacity > bw->capacity);
 	FLAC__ASSERT(new_capacity >= bw->words + ((bw->bits + bits_to_add + FLAC__BITS_PER_WORD - 1) / FLAC__BITS_PER_WORD));
 
-	new_buffer = safe_realloc_mul_2op_(bw->buffer, sizeof(uint32_t), /*times*/new_capacity);
+	new_buffer = safe_realloc_mul_2op_(bw->buffer, sizeof(bwword), /*times*/new_capacity);
 	if(new_buffer == 0)
 		return false;
 	bw->buffer = new_buffer;
@@ -149,7 +166,7 @@
 
 	bw->words = bw->bits = 0;
 	bw->capacity = FLAC__BITWRITER_DEFAULT_CAPACITY;
-	bw->buffer = malloc(sizeof(uint32_t) * bw->capacity);
+	bw->buffer = malloc(sizeof(bwword) * bw->capacity);
 	if(bw->buffer == 0)
 		return false;
 
@@ -184,13 +201,13 @@
 		for(i = 0; i < bw->words; i++) {
 			fprintf(out, "%08X: ", i);
 			for(j = 0; j < FLAC__BITS_PER_WORD; j++)
-				fprintf(out, "%01u", bw->buffer[i] & (1 << (FLAC__BITS_PER_WORD-j-1)) ? 1:0);
+				fprintf(out, "%01u", bw->buffer[i] & ((bwword)1 << (FLAC__BITS_PER_WORD-j-1)) ? 1:0);
 			fprintf(out, "\n");
 		}
 		if(bw->bits > 0) {
 			fprintf(out, "%08X: ", i);
 			for(j = 0; j < bw->bits; j++)
-				fprintf(out, "%01u", bw->accum & (1 << (bw->bits-j-1)) ? 1:0);
+				fprintf(out, "%01u", bw->accum & ((bwword)1 << (bw->bits-j-1)) ? 1:0);
 			fprintf(out, "\n");
 		}
 	}
@@ -302,20 +319,24 @@
 	return true;
 }
 
-inline FLAC__bool FLAC__bitwriter_write_raw_uint32(FLAC__BitWriter *bw, FLAC__uint32 val, unsigned bits)
+static inline FLAC__bool FLAC__bitwriter_write_raw_uint32_nocheck(FLAC__BitWriter *bw, FLAC__uint32 val, unsigned bits)
 {
 	register unsigned left;
 
 	/* WATCHOUT: code does not work with <32bit words; we can make things much faster with this assertion */
 	FLAC__ASSERT(FLAC__BITS_PER_WORD >= 32);
 
-	FLAC__ASSERT(0 != bw);
-	FLAC__ASSERT(0 != bw->buffer);
+	if(bw == 0 || bw->buffer == 0)
+		return false;
 
-	FLAC__ASSERT(bits <= 32);
+	if (bits > 32)
+		return false;
+
 	if(bits == 0)
 		return true;
 
+	FLAC__ASSERT((bits == 32) || (val>>bits == 0));
+
 	/* slightly pessimistic size check but faster than "<= bw->words + (bw->bits+bits+FLAC__BITS_PER_WORD-1)/FLAC__BITS_PER_WORD" */
 	if(bw->capacity <= bw->words + bits && !bitwriter_grow_(bw, bits))
 		return false;
@@ -330,24 +351,31 @@
 		bw->accum <<= left;
 		bw->accum |= val >> (bw->bits = bits - left);
 		bw->buffer[bw->words++] = SWAP_BE_WORD_TO_HOST(bw->accum);
-		bw->accum = val;
+		bw->accum = val; /* unused top bits can contain garbage */
 	}
-	else {
-		bw->accum = val;
-		bw->bits = 0;
-		bw->buffer[bw->words++] = SWAP_BE_WORD_TO_HOST(val);
+	else { /* at this point bits == FLAC__BITS_PER_WORD == 32  and  bw->bits == 0 */
+		bw->buffer[bw->words++] = SWAP_BE_WORD_TO_HOST((bwword)val);
 	}
 
 	return true;
 }
 
+inline FLAC__bool FLAC__bitwriter_write_raw_uint32(FLAC__BitWriter *bw, FLAC__uint32 val, unsigned bits)
+{
+	/* check that unused bits are unset */
+	if((bits < 32) && (val>>bits != 0))
+		return false;
+
+	return FLAC__bitwriter_write_raw_uint32_nocheck(bw, val, bits);
+}
+
 inline FLAC__bool FLAC__bitwriter_write_raw_int32(FLAC__BitWriter *bw, FLAC__int32 val, unsigned bits)
 {
 	/* zero-out unused bits */
 	if(bits < 32)
 		val &= (~(0xffffffff << bits));
 
-	return FLAC__bitwriter_write_raw_uint32(bw, (FLAC__uint32)val, bits);
+	return FLAC__bitwriter_write_raw_uint32_nocheck(bw, (FLAC__uint32)val, bits);
 }
 
 inline FLAC__bool FLAC__bitwriter_write_raw_uint64(FLAC__BitWriter *bw, FLAC__uint64 val, unsigned bits)
@@ -356,7 +384,7 @@
 	if(bits > 32) {
 		return
 			FLAC__bitwriter_write_raw_uint32(bw, (FLAC__uint32)(val>>32), bits-32) &&
-			FLAC__bitwriter_write_raw_uint32(bw, (FLAC__uint32)val, 32);
+			FLAC__bitwriter_write_raw_uint32_nocheck(bw, (FLAC__uint32)val, 32);
 	}
 	else
 		return FLAC__bitwriter_write_raw_uint32(bw, (FLAC__uint32)val, bits);
@@ -366,13 +394,13 @@
 {
 	/* this doesn't need to be that fast as currently it is only used for vorbis comments */
 
-	if(!FLAC__bitwriter_write_raw_uint32(bw, val & 0xff, 8))
+	if(!FLAC__bitwriter_write_raw_uint32_nocheck(bw, val & 0xff, 8))
 		return false;
-	if(!FLAC__bitwriter_write_raw_uint32(bw, (val>>8) & 0xff, 8))
+	if(!FLAC__bitwriter_write_raw_uint32_nocheck(bw, (val>>8) & 0xff, 8))
 		return false;
-	if(!FLAC__bitwriter_write_raw_uint32(bw, (val>>16) & 0xff, 8))
+	if(!FLAC__bitwriter_write_raw_uint32_nocheck(bw, (val>>16) & 0xff, 8))
 		return false;
-	if(!FLAC__bitwriter_write_raw_uint32(bw, val>>24, 8))
+	if(!FLAC__bitwriter_write_raw_uint32_nocheck(bw, val>>24, 8))
 		return false;
 
 	return true;
@@ -384,7 +412,7 @@
 
 	/* this could be faster but currently we don't need it to be since it's only used for writing metadata */
 	for(i = 0; i < nvals; i++) {
-		if(!FLAC__bitwriter_write_raw_uint32(bw, (FLAC__uint32)(vals[i]), 8))
+		if(!FLAC__bitwriter_write_raw_uint32_nocheck(bw, (FLAC__uint32)(vals[i]), 8))
 			return false;
 	}
 
@@ -394,21 +422,23 @@
 FLAC__bool FLAC__bitwriter_write_unary_unsigned(FLAC__BitWriter *bw, unsigned val)
 {
 	if(val < 32)
-		return FLAC__bitwriter_write_raw_uint32(bw, 1, ++val);
+		return FLAC__bitwriter_write_raw_uint32_nocheck(bw, 1, ++val);
 	else
 		return
 			FLAC__bitwriter_write_zeroes(bw, val) &&
-			FLAC__bitwriter_write_raw_uint32(bw, 1, 1);
+			FLAC__bitwriter_write_raw_uint32_nocheck(bw, 1, 1);
 }
 
 unsigned FLAC__bitwriter_rice_bits(FLAC__int32 val, unsigned parameter)
 {
 	FLAC__uint32 uval;
 
-	FLAC__ASSERT(parameter < sizeof(unsigned)*8);
+	FLAC__ASSERT(parameter < 32);
 
 	/* fold signed to unsigned; actual formula is: negative(v)? -2v-1 : 2v */
-	uval = (val<<1) ^ (val>>31);
+	uval = val;
+	uval <<= 1;
+	uval ^= (val>>31);
 
 	return 1 + parameter + (uval >> parameter);
 }
@@ -484,10 +514,12 @@
 
 	FLAC__ASSERT(0 != bw);
 	FLAC__ASSERT(0 != bw->buffer);
-	FLAC__ASSERT(parameter < 8*sizeof(uval));
+	FLAC__ASSERT(parameter < 32);
 
 	/* fold signed to unsigned; actual formula is: negative(v)? -2v-1 : 2v */
-	uval = (val<<1) ^ (val>>31);
+	uval = val;
+	uval <<= 1;
+	uval ^= (val>>31);
 
 	msbs = uval >> parameter;
 	interesting_bits = 1 + parameter;
@@ -505,16 +537,16 @@
 
 FLAC__bool FLAC__bitwriter_write_rice_signed_block(FLAC__BitWriter *bw, const FLAC__int32 *vals, unsigned nvals, unsigned parameter)
 {
-	const FLAC__uint32 mask1 = FLAC__WORD_ALL_ONES << parameter; /* we val|=mask1 to set the stop bit above it... */
-	const FLAC__uint32 mask2 = FLAC__WORD_ALL_ONES >> (31-parameter); /* ...then mask off the bits above the stop bit with val&=mask2*/
+	const FLAC__uint32 mask1 = (FLAC__uint32)0xffffffff << parameter; /* we val|=mask1 to set the stop bit above it... */
+	const FLAC__uint32 mask2 = (FLAC__uint32)0xffffffff >> (31-parameter); /* ...then mask off the bits above the stop bit with val&=mask2 */
 	FLAC__uint32 uval;
 	unsigned left;
 	const unsigned lsbits = 1 + parameter;
-	unsigned msbits;
+	unsigned msbits, total_bits;
 
 	FLAC__ASSERT(0 != bw);
 	FLAC__ASSERT(0 != bw->buffer);
-	FLAC__ASSERT(parameter < 8*sizeof(uint32_t)-1);
+	FLAC__ASSERT(parameter < 31);
 	/* WATCHOUT: code does not work with <32bit words; we can make things much faster with this assertion */
 	FLAC__ASSERT(FLAC__BITS_PER_WORD >= 32);
 
@@ -525,19 +557,20 @@
 		uval ^= (*vals>>31);
 
 		msbits = uval >> parameter;
+		total_bits = lsbits + msbits;
 
-		if(bw->bits && bw->bits + msbits + lsbits < FLAC__BITS_PER_WORD) { /* i.e. if the whole thing fits in the current uint32_t */
-			/* ^^^ if bw->bits is 0 then we may have filled the buffer and have no free uint32_t to work in */
-			bw->bits = bw->bits + msbits + lsbits;
+		if(bw->bits && bw->bits + total_bits < FLAC__BITS_PER_WORD) { /* i.e. if the whole thing fits in the current bwword */
+			/* ^^^ if bw->bits is 0 then we may have filled the buffer and have no free bwword to work in */
+			bw->bits += total_bits;
 			uval |= mask1; /* set stop bit */
 			uval &= mask2; /* mask off unused top bits */
-			bw->accum <<= msbits + lsbits;
+			bw->accum <<= total_bits;
 			bw->accum |= uval;
 		}
 		else {
 			/* slightly pessimistic size check but faster than "<= bw->words + (bw->bits+msbits+lsbits+FLAC__BITS_PER_WORD-1)/FLAC__BITS_PER_WORD" */
 			/* OPT: pessimism may cause flurry of false calls to grow_ which eat up all savings before it */
-			if(bw->capacity <= bw->words + bw->bits + msbits + 1/*lsbits always fit in 1 uint32_t*/ && !bitwriter_grow_(bw, msbits+lsbits))
+			if(bw->capacity <= bw->words + bw->bits + msbits + 1 /* lsbits always fit in 1 bwword */ && !bitwriter_grow_(bw, total_bits))
 				return false;
 
 			if(msbits) {
@@ -587,7 +620,7 @@
 				bw->accum <<= left;
 				bw->accum |= uval >> (bw->bits = lsbits - left);
 				bw->buffer[bw->words++] = SWAP_BE_WORD_TO_HOST(bw->accum);
-				bw->accum = uval;
+				bw->accum = uval; /* unused top bits can contain garbage */
 			}
 		}
 		vals++;
@@ -727,40 +760,41 @@
 	FLAC__ASSERT(0 != bw);
 	FLAC__ASSERT(0 != bw->buffer);
 
-	FLAC__ASSERT(!(val & 0x80000000)); /* this version only handles 31 bits */
+	if((val & 0x80000000) != 0) /* this version only handles 31 bits */
+		return false;
 
 	if(val < 0x80) {
-		return FLAC__bitwriter_write_raw_uint32(bw, val, 8);
+		return FLAC__bitwriter_write_raw_uint32_nocheck(bw, val, 8);
 	}
 	else if(val < 0x800) {
-		ok &= FLAC__bitwriter_write_raw_uint32(bw, 0xC0 | (val>>6), 8);
-		ok &= FLAC__bitwriter_write_raw_uint32(bw, 0x80 | (val&0x3F), 8);
+		ok &= FLAC__bitwriter_write_raw_uint32_nocheck(bw, 0xC0 | (val>>6), 8);
+		ok &= FLAC__bitwriter_write_raw_uint32_nocheck(bw, 0x80 | (val&0x3F), 8);
 	}
 	else if(val < 0x10000) {
-		ok &= FLAC__bitwriter_write_raw_uint32(bw, 0xE0 | (val>>12), 8);
-		ok &= FLAC__bitwriter_write_raw_uint32(bw, 0x80 | ((val>>6)&0x3F), 8);
-		ok &= FLAC__bitwriter_write_raw_uint32(bw, 0x80 | (val&0x3F), 8);
+		ok &= FLAC__bitwriter_write_raw_uint32_nocheck(bw, 0xE0 | (val>>12), 8);
+		ok &= FLAC__bitwriter_write_raw_uint32_nocheck(bw, 0x80 | ((val>>6)&0x3F), 8);
+		ok &= FLAC__bitwriter_write_raw_uint32_nocheck(bw, 0x80 | (val&0x3F), 8);
 	}
 	else if(val < 0x200000) {
-		ok &= FLAC__bitwriter_write_raw_uint32(bw, 0xF0 | (val>>18), 8);
-		ok &= FLAC__bitwriter_write_raw_uint32(bw, 0x80 | ((val>>12)&0x3F), 8);
-		ok &= FLAC__bitwriter_write_raw_uint32(bw, 0x80 | ((val>>6)&0x3F), 8);
-		ok &= FLAC__bitwriter_write_raw_uint32(bw, 0x80 | (val&0x3F), 8);
+		ok &= FLAC__bitwriter_write_raw_uint32_nocheck(bw, 0xF0 | (val>>18), 8);
+		ok &= FLAC__bitwriter_write_raw_uint32_nocheck(bw, 0x80 | ((val>>12)&0x3F), 8);
+		ok &= FLAC__bitwriter_write_raw_uint32_nocheck(bw, 0x80 | ((val>>6)&0x3F), 8);
+		ok &= FLAC__bitwriter_write_raw_uint32_nocheck(bw, 0x80 | (val&0x3F), 8);
 	}
 	else if(val < 0x4000000) {
-		ok &= FLAC__bitwriter_write_raw_uint32(bw, 0xF8 | (val>>24), 8);
-		ok &= FLAC__bitwriter_write_raw_uint32(bw, 0x80 | ((val>>18)&0x3F), 8);
-		ok &= FLAC__bitwriter_write_raw_uint32(bw, 0x80 | ((val>>12)&0x3F), 8);
-		ok &= FLAC__bitwriter_write_raw_uint32(bw, 0x80 | ((val>>6)&0x3F), 8);
-		ok &= FLAC__bitwriter_write_raw_uint32(bw, 0x80 | (val&0x3F), 8);
+		ok &= FLAC__bitwriter_write_raw_uint32_nocheck(bw, 0xF8 | (val>>24), 8);
+		ok &= FLAC__bitwriter_write_raw_uint32_nocheck(bw, 0x80 | ((val>>18)&0x3F), 8);
+		ok &= FLAC__bitwriter_write_raw_uint32_nocheck(bw, 0x80 | ((val>>12)&0x3F), 8);
+		ok &= FLAC__bitwriter_write_raw_uint32_nocheck(bw, 0x80 | ((val>>6)&0x3F), 8);
+		ok &= FLAC__bitwriter_write_raw_uint32_nocheck(bw, 0x80 | (val&0x3F), 8);
 	}
 	else {
-		ok &= FLAC__bitwriter_write_raw_uint32(bw, 0xFC | (val>>30), 8);
-		ok &= FLAC__bitwriter_write_raw_uint32(bw, 0x80 | ((val>>24)&0x3F), 8);
-		ok &= FLAC__bitwriter_write_raw_uint32(bw, 0x80 | ((val>>18)&0x3F), 8);
-		ok &= FLAC__bitwriter_write_raw_uint32(bw, 0x80 | ((val>>12)&0x3F), 8);
-		ok &= FLAC__bitwriter_write_raw_uint32(bw, 0x80 | ((val>>6)&0x3F), 8);
-		ok &= FLAC__bitwriter_write_raw_uint32(bw, 0x80 | (val&0x3F), 8);
+		ok &= FLAC__bitwriter_write_raw_uint32_nocheck(bw, 0xFC | (val>>30), 8);
+		ok &= FLAC__bitwriter_write_raw_uint32_nocheck(bw, 0x80 | ((val>>24)&0x3F), 8);
+		ok &= FLAC__bitwriter_write_raw_uint32_nocheck(bw, 0x80 | ((val>>18)&0x3F), 8);
+		ok &= FLAC__bitwriter_write_raw_uint32_nocheck(bw, 0x80 | ((val>>12)&0x3F), 8);
+		ok &= FLAC__bitwriter_write_raw_uint32_nocheck(bw, 0x80 | ((val>>6)&0x3F), 8);
+		ok &= FLAC__bitwriter_write_raw_uint32_nocheck(bw, 0x80 | (val&0x3F), 8);
 	}
 
 	return ok;
@@ -773,49 +807,50 @@
 	FLAC__ASSERT(0 != bw);
 	FLAC__ASSERT(0 != bw->buffer);
 
-	FLAC__ASSERT(!(val & FLAC__U64L(0xFFFFFFF000000000))); /* this version only handles 36 bits */
+	if((val & FLAC__U64L(0xFFFFFFF000000000)) != 0) /* this version only handles 36 bits */
+		return false;
 
 	if(val < 0x80) {
-		return FLAC__bitwriter_write_raw_uint32(bw, (FLAC__uint32)val, 8);
+		return FLAC__bitwriter_write_raw_uint32_nocheck(bw, (FLAC__uint32)val, 8);
 	}
 	else if(val < 0x800) {
-		ok &= FLAC__bitwriter_write_raw_uint32(bw, 0xC0 | (FLAC__uint32)(val>>6), 8);
-		ok &= FLAC__bitwriter_write_raw_uint32(bw, 0x80 | (FLAC__uint32)(val&0x3F), 8);
+		ok &= FLAC__bitwriter_write_raw_uint32_nocheck(bw, 0xC0 | (FLAC__uint32)(val>>6), 8);
+		ok &= FLAC__bitwriter_write_raw_uint32_nocheck(bw, 0x80 | (FLAC__uint32)(val&0x3F), 8);
 	}
 	else if(val < 0x10000) {
-		ok &= FLAC__bitwriter_write_raw_uint32(bw, 0xE0 | (FLAC__uint32)(val>>12), 8);
-		ok &= FLAC__bitwriter_write_raw_uint32(bw, 0x80 | (FLAC__uint32)((val>>6)&0x3F), 8);
-		ok &= FLAC__bitwriter_write_raw_uint32(bw, 0x80 | (FLAC__uint32)(val&0x3F), 8);
+		ok &= FLAC__bitwriter_write_raw_uint32_nocheck(bw, 0xE0 | (FLAC__uint32)(val>>12), 8);
+		ok &= FLAC__bitwriter_write_raw_uint32_nocheck(bw, 0x80 | (FLAC__uint32)((val>>6)&0x3F), 8);
+		ok &= FLAC__bitwriter_write_raw_uint32_nocheck(bw, 0x80 | (FLAC__uint32)(val&0x3F), 8);
 	}
 	else if(val < 0x200000) {
-		ok &= FLAC__bitwriter_write_raw_uint32(bw, 0xF0 | (FLAC__uint32)(val>>18), 8);
-		ok &= FLAC__bitwriter_write_raw_uint32(bw, 0x80 | (FLAC__uint32)((val>>12)&0x3F), 8);
-		ok &= FLAC__bitwriter_write_raw_uint32(bw, 0x80 | (FLAC__uint32)((val>>6)&0x3F), 8);
-		ok &= FLAC__bitwriter_write_raw_uint32(bw, 0x80 | (FLAC__uint32)(val&0x3F), 8);
+		ok &= FLAC__bitwriter_write_raw_uint32_nocheck(bw, 0xF0 | (FLAC__uint32)(val>>18), 8);
+		ok &= FLAC__bitwriter_write_raw_uint32_nocheck(bw, 0x80 | (FLAC__uint32)((val>>12)&0x3F), 8);
+		ok &= FLAC__bitwriter_write_raw_uint32_nocheck(bw, 0x80 | (FLAC__uint32)((val>>6)&0x3F), 8);
+		ok &= FLAC__bitwriter_write_raw_uint32_nocheck(bw, 0x80 | (FLAC__uint32)(val&0x3F), 8);
 	}
 	else if(val < 0x4000000) {
-		ok &= FLAC__bitwriter_write_raw_uint32(bw, 0xF8 | (FLAC__uint32)(val>>24), 8);
-		ok &= FLAC__bitwriter_write_raw_uint32(bw, 0x80 | (FLAC__uint32)((val>>18)&0x3F), 8);
-		ok &= FLAC__bitwriter_write_raw_uint32(bw, 0x80 | (FLAC__uint32)((val>>12)&0x3F), 8);
-		ok &= FLAC__bitwriter_write_raw_uint32(bw, 0x80 | (FLAC__uint32)((val>>6)&0x3F), 8);
-		ok &= FLAC__bitwriter_write_raw_uint32(bw, 0x80 | (FLAC__uint32)(val&0x3F), 8);
+		ok &= FLAC__bitwriter_write_raw_uint32_nocheck(bw, 0xF8 | (FLAC__uint32)(val>>24), 8);
+		ok &= FLAC__bitwriter_write_raw_uint32_nocheck(bw, 0x80 | (FLAC__uint32)((val>>18)&0x3F), 8);
+		ok &= FLAC__bitwriter_write_raw_uint32_nocheck(bw, 0x80 | (FLAC__uint32)((val>>12)&0x3F), 8);
+		ok &= FLAC__bitwriter_write_raw_uint32_nocheck(bw, 0x80 | (FLAC__uint32)((val>>6)&0x3F), 8);
+		ok &= FLAC__bitwriter_write_raw_uint32_nocheck(bw, 0x80 | (FLAC__uint32)(val&0x3F), 8);
 	}
 	else if(val < 0x80000000) {
-		ok &= FLAC__bitwriter_write_raw_uint32(bw, 0xFC | (FLAC__uint32)(val>>30), 8);
-		ok &= FLAC__bitwriter_write_raw_uint32(bw, 0x80 | (FLAC__uint32)((val>>24)&0x3F), 8);
-		ok &= FLAC__bitwriter_write_raw_uint32(bw, 0x80 | (FLAC__uint32)((val>>18)&0x3F), 8);
-		ok &= FLAC__bitwriter_write_raw_uint32(bw, 0x80 | (FLAC__uint32)((val>>12)&0x3F), 8);
-		ok &= FLAC__bitwriter_write_raw_uint32(bw, 0x80 | (FLAC__uint32)((val>>6)&0x3F), 8);
-		ok &= FLAC__bitwriter_write_raw_uint32(bw, 0x80 | (FLAC__uint32)(val&0x3F), 8);
+		ok &= FLAC__bitwriter_write_raw_uint32_nocheck(bw, 0xFC | (FLAC__uint32)(val>>30), 8);
+		ok &= FLAC__bitwriter_write_raw_uint32_nocheck(bw, 0x80 | (FLAC__uint32)((val>>24)&0x3F), 8);
+		ok &= FLAC__bitwriter_write_raw_uint32_nocheck(bw, 0x80 | (FLAC__uint32)((val>>18)&0x3F), 8);
+		ok &= FLAC__bitwriter_write_raw_uint32_nocheck(bw, 0x80 | (FLAC__uint32)((val>>12)&0x3F), 8);
+		ok &= FLAC__bitwriter_write_raw_uint32_nocheck(bw, 0x80 | (FLAC__uint32)((val>>6)&0x3F), 8);
+		ok &= FLAC__bitwriter_write_raw_uint32_nocheck(bw, 0x80 | (FLAC__uint32)(val&0x3F), 8);
 	}
 	else {
-		ok &= FLAC__bitwriter_write_raw_uint32(bw, 0xFE, 8);
-		ok &= FLAC__bitwriter_write_raw_uint32(bw, 0x80 | (FLAC__uint32)((val>>30)&0x3F), 8);
-		ok &= FLAC__bitwriter_write_raw_uint32(bw, 0x80 | (FLAC__uint32)((val>>24)&0x3F), 8);
-		ok &= FLAC__bitwriter_write_raw_uint32(bw, 0x80 | (FLAC__uint32)((val>>18)&0x3F), 8);
-		ok &= FLAC__bitwriter_write_raw_uint32(bw, 0x80 | (FLAC__uint32)((val>>12)&0x3F), 8);
-		ok &= FLAC__bitwriter_write_raw_uint32(bw, 0x80 | (FLAC__uint32)((val>>6)&0x3F), 8);
-		ok &= FLAC__bitwriter_write_raw_uint32(bw, 0x80 | (FLAC__uint32)(val&0x3F), 8);
+		ok &= FLAC__bitwriter_write_raw_uint32_nocheck(bw, 0xFE, 8);
+		ok &= FLAC__bitwriter_write_raw_uint32_nocheck(bw, 0x80 | (FLAC__uint32)((val>>30)&0x3F), 8);
+		ok &= FLAC__bitwriter_write_raw_uint32_nocheck(bw, 0x80 | (FLAC__uint32)((val>>24)&0x3F), 8);
+		ok &= FLAC__bitwriter_write_raw_uint32_nocheck(bw, 0x80 | (FLAC__uint32)((val>>18)&0x3F), 8);
+		ok &= FLAC__bitwriter_write_raw_uint32_nocheck(bw, 0x80 | (FLAC__uint32)((val>>12)&0x3F), 8);
+		ok &= FLAC__bitwriter_write_raw_uint32_nocheck(bw, 0x80 | (FLAC__uint32)((val>>6)&0x3F), 8);
+		ok &= FLAC__bitwriter_write_raw_uint32_nocheck(bw, 0x80 | (FLAC__uint32)(val&0x3F), 8);
 	}
 
 	return ok;
@@ -839,6 +874,7 @@
  * fix that we add extern declarations here.
  */
 extern FLAC__bool FLAC__bitwriter_write_zeroes(FLAC__BitWriter *bw, unsigned bits);
+extern FLAC__bool FLAC__bitwriter_write_raw_uint32(FLAC__BitWriter *bw, FLAC__uint32 val, unsigned bits);
 extern FLAC__bool FLAC__bitwriter_write_raw_int32(FLAC__BitWriter *bw, FLAC__int32 val, unsigned bits);
 extern FLAC__bool FLAC__bitwriter_write_raw_uint64(FLAC__BitWriter *bw, FLAC__uint64 val, unsigned bits);
 extern FLAC__bool FLAC__bitwriter_write_raw_uint32_little_endian(FLAC__BitWriter *bw, FLAC__uint32 val);
diff --git a/libFLAC/cpu.c b/libFLAC/cpu.c
index 8249500..b9df19a 100644
--- a/libFLAC/cpu.c
+++ b/libFLAC/cpu.c
@@ -1,6 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
  * Copyright (C) 2001-2009  Josh Coalson
- * Copyright (C) 2011-2014  Xiph.Org Foundation
+ * Copyright (C) 2011-2016  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -35,73 +35,43 @@
 #endif
 
 #include "private/cpu.h"
+#include "share/compat.h"
 #include <stdlib.h>
 #include <memory.h>
-#ifdef DEBUG
-# include <stdio.h>
+
+#if defined(_MSC_VER)
+#  include <intrin.h> /* for __cpuid() and _xgetbv() */
 #endif
 
+#if defined __GNUC__ && defined HAVE_CPUID_H
+#  include <cpuid.h> /* for __get_cpuid() and __get_cpuid_max() */
+#endif
+
+#ifdef DEBUG
+#include <stdio.h>
+
+#define dfprintf fprintf
+#else
+/* This is bad practice, it should be a static void empty function */
+#define dfprintf(file, format, ...)
+#endif
+
+
 #if defined FLAC__CPU_IA32
-# include <signal.h>
-
-static void disable_sse(FLAC__CPUInfo *info)
-{
-	info->ia32.sse   = false;
-	info->ia32.sse2  = false;
-	info->ia32.sse3  = false;
-	info->ia32.ssse3 = false;
-	info->ia32.sse41 = false;
-	info->ia32.sse42 = false;
-}
-
-static void disable_avx(FLAC__CPUInfo *info)
-{
-	info->ia32.avx     = false;
-	info->ia32.avx2    = false;
-	info->ia32.fma     = false;
-}
-
-#elif defined FLAC__CPU_X86_64
-
-static void disable_avx(FLAC__CPUInfo *info)
-{
-	info->x86.avx     = false;
-	info->x86.avx2    = false;
-	info->x86.fma     = false;
-}
-#endif
-
-#if defined (__NetBSD__) || defined(__OpenBSD__)
-#include <sys/param.h>
-#include <sys/sysctl.h>
-#include <machine/cpu.h>
-#endif
-
-#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) || defined(__DragonFly__)
-#include <sys/types.h>
-#include <sys/sysctl.h>
-#endif
-
-#if defined(__APPLE__)
-/* how to get sysctlbyname()? */
-#endif
-
-#ifdef FLAC__CPU_IA32
 /* these are flags in EDX of CPUID AX=00000001 */
 static const unsigned FLAC__CPUINFO_IA32_CPUID_CMOV = 0x00008000;
 static const unsigned FLAC__CPUINFO_IA32_CPUID_MMX = 0x00800000;
-static const unsigned FLAC__CPUINFO_IA32_CPUID_FXSR = 0x01000000;
 static const unsigned FLAC__CPUINFO_IA32_CPUID_SSE = 0x02000000;
 static const unsigned FLAC__CPUINFO_IA32_CPUID_SSE2 = 0x04000000;
 #endif
 
+#if FLAC__HAS_X86INTRIN || FLAC__AVX_SUPPORTED
 /* these are flags in ECX of CPUID AX=00000001 */
 static const unsigned FLAC__CPUINFO_IA32_CPUID_SSE3 = 0x00000001;
 static const unsigned FLAC__CPUINFO_IA32_CPUID_SSSE3 = 0x00000200;
 static const unsigned FLAC__CPUINFO_IA32_CPUID_SSE41 = 0x00080000;
 static const unsigned FLAC__CPUINFO_IA32_CPUID_SSE42 = 0x00100000;
 
-#if defined FLAC__AVX_SUPPORTED
 /* these are flags in ECX of CPUID AX=00000001 */
 static const unsigned FLAC__CPUINFO_IA32_CPUID_OSXSAVE = 0x08000000;
 static const unsigned FLAC__CPUINFO_IA32_CPUID_AVX = 0x10000000;
@@ -110,384 +80,214 @@
 static const unsigned FLAC__CPUINFO_IA32_CPUID_AVX2 = 0x00000020;
 #endif
 
-/*
- * Extra stuff needed for detection of OS support for SSE on IA-32
- */
-#if defined(FLAC__CPU_IA32) && !defined FLAC__NO_ASM && (defined FLAC__HAS_NASM || defined FLAC__HAS_X86INTRIN) && !defined FLAC__NO_SSE_OS && !defined FLAC__SSE_OS
-# if defined(__linux__)
-/*
- * If the OS doesn't support SSE, we will get here with a SIGILL.  We
- * modify the return address to jump over the offending SSE instruction
- * and also the operation following it that indicates the instruction
- * executed successfully.  In this way we use no global variables and
- * stay thread-safe.
- *
- * 3 + 3 + 6:
- *   3 bytes for "xorps xmm0,xmm0"
- *   3 bytes for estimate of how long the follwing "inc var" instruction is
- *   6 bytes extra in case our estimate is wrong
- * 12 bytes puts us in the NOP "landing zone"
- */
-#   include <sys/ucontext.h>
-	static void sigill_handler_sse_os(int signal, siginfo_t *si, void *uc)
-	{
-		(void)signal, (void)si;
-		((ucontext_t*)uc)->uc_mcontext.gregs[14/*REG_EIP*/] += 3 + 3 + 6;
-	}
-# elif defined(_MSC_VER)
-#  include <windows.h>
-# endif
+#if defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64
+static uint32_t
+cpu_xgetbv_x86(void)
+{
+#if (defined _MSC_VER || defined __INTEL_COMPILER) && FLAC__HAS_X86INTRIN && FLAC__AVX_SUPPORTED
+	return (uint32_t)_xgetbv(0);
+#elif defined __GNUC__
+	uint32_t lo, hi;
+	asm volatile (".byte 0x0f, 0x01, 0xd0" : "=a"(lo), "=d"(hi) : "c" (0));
+	return lo;
+#else
+	return 0;
+#endif
+}
 #endif
 
-
-void FLAC__cpu_info(FLAC__CPUInfo *info)
+static void
+ia32_cpu_info (FLAC__CPUInfo *info)
 {
-/*
- * IA32-specific
- */
-#ifdef FLAC__CPU_IA32
-	FLAC__bool ia32_fxsr = false;
-	FLAC__bool ia32_osxsave = false;
-	(void) ia32_fxsr; (void) ia32_osxsave; /* to avoid warnings about unused variables */
-	memset(info, 0, sizeof(*info));
-	info->type = FLAC__CPUINFO_TYPE_IA32;
-#if !defined FLAC__NO_ASM && (defined FLAC__HAS_NASM || defined FLAC__HAS_X86INTRIN)
-	info->use_asm = true; /* we assume a minimum of 80386 with FLAC__CPU_IA32 */
-#ifdef FLAC__HAS_X86INTRIN
-	if(!FLAC__cpu_have_cpuid_x86())
-		return;
+#if !defined FLAC__CPU_IA32
+	(void) info;
 #else
+	FLAC__bool ia32_osxsave = false;
+	FLAC__uint32 flags_eax, flags_ebx, flags_ecx, flags_edx;
+
+#if !defined FLAC__NO_ASM && (defined FLAC__HAS_NASM || FLAC__HAS_X86INTRIN)
+	info->use_asm = true; /* we assume a minimum of 80386 with FLAC__CPU_IA32 */
+#if defined FLAC__HAS_NASM
 	if(!FLAC__cpu_have_cpuid_asm_ia32())
 		return;
 #endif
-	{
-		/* http://www.sandpile.org/x86/cpuid.htm */
-#ifdef FLAC__HAS_X86INTRIN
-		FLAC__uint32 flags_eax, flags_ebx, flags_ecx, flags_edx;
+	/* http://www.sandpile.org/x86/cpuid.htm */
+	if (FLAC__HAS_X86INTRIN) {
 		FLAC__cpu_info_x86(0, &flags_eax, &flags_ebx, &flags_ecx, &flags_edx);
-		info->ia32.intel = (flags_ebx == 0x756E6547 && flags_edx == 0x49656E69 && flags_ecx == 0x6C65746E)? true : false; /* GenuineIntel */
+		info->ia32.intel = (flags_ebx == 0x756E6547 && flags_edx == 0x49656E69 && flags_ecx == 0x6C65746E) ? true : false; /* GenuineIntel */
 		FLAC__cpu_info_x86(1, &flags_eax, &flags_ebx, &flags_ecx, &flags_edx);
-#else
-		FLAC__uint32 flags_ecx, flags_edx;
+	}
+	else {
 		FLAC__cpu_info_asm_ia32(&flags_edx, &flags_ecx);
-#endif
-		info->ia32.cmov  = (flags_edx & FLAC__CPUINFO_IA32_CPUID_CMOV )? true : false;
-		info->ia32.mmx   = (flags_edx & FLAC__CPUINFO_IA32_CPUID_MMX  )? true : false;
-		      ia32_fxsr  = (flags_edx & FLAC__CPUINFO_IA32_CPUID_FXSR )? true : false;
-		info->ia32.sse   = (flags_edx & FLAC__CPUINFO_IA32_CPUID_SSE  )? true : false;
-		info->ia32.sse2  = (flags_edx & FLAC__CPUINFO_IA32_CPUID_SSE2 )? true : false;
-		info->ia32.sse3  = (flags_ecx & FLAC__CPUINFO_IA32_CPUID_SSE3 )? true : false;
-		info->ia32.ssse3 = (flags_ecx & FLAC__CPUINFO_IA32_CPUID_SSSE3)? true : false;
-		info->ia32.sse41 = (flags_ecx & FLAC__CPUINFO_IA32_CPUID_SSE41)? true : false;
-		info->ia32.sse42 = (flags_ecx & FLAC__CPUINFO_IA32_CPUID_SSE42)? true : false;
-#if defined FLAC__HAS_X86INTRIN && defined FLAC__AVX_SUPPORTED
-		    ia32_osxsave = (flags_ecx & FLAC__CPUINFO_IA32_CPUID_OSXSAVE)? true : false;
-		info->ia32.avx   = (flags_ecx & FLAC__CPUINFO_IA32_CPUID_AVX    )? true : false;
-		info->ia32.fma   = (flags_ecx & FLAC__CPUINFO_IA32_CPUID_FMA    )? true : false;
+	}
+
+	info->ia32.cmov  = (flags_edx & FLAC__CPUINFO_IA32_CPUID_CMOV ) ? true : false;
+	info->ia32.mmx   = (flags_edx & FLAC__CPUINFO_IA32_CPUID_MMX  ) ? true : false;
+	info->ia32.sse   = (flags_edx & FLAC__CPUINFO_IA32_CPUID_SSE  ) ? true : false;
+	info->ia32.sse2  = (flags_edx & FLAC__CPUINFO_IA32_CPUID_SSE2 ) ? true : false;
+	info->ia32.sse3  = (flags_ecx & FLAC__CPUINFO_IA32_CPUID_SSE3 ) ? true : false;
+	info->ia32.ssse3 = (flags_ecx & FLAC__CPUINFO_IA32_CPUID_SSSE3) ? true : false;
+	info->ia32.sse41 = (flags_ecx & FLAC__CPUINFO_IA32_CPUID_SSE41) ? true : false;
+	info->ia32.sse42 = (flags_ecx & FLAC__CPUINFO_IA32_CPUID_SSE42) ? true : false;
+
+	if (FLAC__HAS_X86INTRIN && FLAC__AVX_SUPPORTED) {
+		ia32_osxsave = (flags_ecx & FLAC__CPUINFO_IA32_CPUID_OSXSAVE) ? true : false;
+		info->ia32.avx   = (flags_ecx & FLAC__CPUINFO_IA32_CPUID_AVX    ) ? true : false;
+		info->ia32.fma   = (flags_ecx & FLAC__CPUINFO_IA32_CPUID_FMA    ) ? true : false;
 		FLAC__cpu_info_x86(7, &flags_eax, &flags_ebx, &flags_ecx, &flags_edx);
-		info->ia32.avx2  = (flags_ebx & FLAC__CPUINFO_IA32_CPUID_AVX2   )? true : false;
-#endif
+		info->ia32.avx2  = (flags_ebx & FLAC__CPUINFO_IA32_CPUID_AVX2   ) ? true : false;
 	}
 
-#ifdef DEBUG
-	fprintf(stderr, "CPU info (IA-32):\n");
-	fprintf(stderr, "  CMOV ....... %c\n", info->ia32.cmov    ? 'Y' : 'n');
-	fprintf(stderr, "  MMX ........ %c\n", info->ia32.mmx     ? 'Y' : 'n');
-	fprintf(stderr, "  SSE ........ %c\n", info->ia32.sse     ? 'Y' : 'n');
-	fprintf(stderr, "  SSE2 ....... %c\n", info->ia32.sse2    ? 'Y' : 'n');
-	fprintf(stderr, "  SSE3 ....... %c\n", info->ia32.sse3    ? 'Y' : 'n');
-	fprintf(stderr, "  SSSE3 ...... %c\n", info->ia32.ssse3   ? 'Y' : 'n');
-	fprintf(stderr, "  SSE41 ...... %c\n", info->ia32.sse41   ? 'Y' : 'n');
-	fprintf(stderr, "  SSE42 ...... %c\n", info->ia32.sse42   ? 'Y' : 'n');
-# if defined FLAC__HAS_X86INTRIN && defined FLAC__AVX_SUPPORTED
-	fprintf(stderr, "  AVX ........ %c\n", info->ia32.avx     ? 'Y' : 'n');
-	fprintf(stderr, "  FMA ........ %c\n", info->ia32.fma     ? 'Y' : 'n');
-	fprintf(stderr, "  AVX2 ....... %c\n", info->ia32.avx2    ? 'Y' : 'n');
-# endif
-#endif
+	dfprintf(stderr, "CPU info (IA-32):\n");
+	dfprintf(stderr, "  CMOV ....... %c\n", info->ia32.cmov    ? 'Y' : 'n');
+	dfprintf(stderr, "  MMX ........ %c\n", info->ia32.mmx     ? 'Y' : 'n');
+	dfprintf(stderr, "  SSE ........ %c\n", info->ia32.sse     ? 'Y' : 'n');
+	dfprintf(stderr, "  SSE2 ....... %c\n", info->ia32.sse2    ? 'Y' : 'n');
+	dfprintf(stderr, "  SSE3 ....... %c\n", info->ia32.sse3    ? 'Y' : 'n');
+	dfprintf(stderr, "  SSSE3 ...... %c\n", info->ia32.ssse3   ? 'Y' : 'n');
+	dfprintf(stderr, "  SSE41 ...... %c\n", info->ia32.sse41   ? 'Y' : 'n');
+	dfprintf(stderr, "  SSE42 ...... %c\n", info->ia32.sse42   ? 'Y' : 'n');
 
-	/*
-	 * now have to check for OS support of SSE instructions
-	 */
-	if(info->ia32.sse) {
-#if defined FLAC__NO_SSE_OS
-		/* assume user knows better than us; turn it off */
-		disable_sse(info);
-#elif defined FLAC__SSE_OS
-		/* assume user knows better than us; leave as detected above */
-#elif defined(__FreeBSD__) || defined(__FreeBSD_kernel__) || defined(__DragonFly__) || defined(__APPLE__)
-		int sse = 0;
-		size_t len;
-		/* at least one of these must work: */
-		len = sizeof(sse); sse = sse || (sysctlbyname("hw.instruction_sse", &sse, &len, NULL, 0) == 0 && sse);
-		len = sizeof(sse); sse = sse || (sysctlbyname("hw.optional.sse"   , &sse, &len, NULL, 0) == 0 && sse); /* __APPLE__ ? */
-		if(!sse)
-			disable_sse(info);
-#elif defined(__NetBSD__) || defined (__OpenBSD__)
-# if __NetBSD_Version__ >= 105250000 || (defined __OpenBSD__)
-		int val = 0, mib[2] = { CTL_MACHDEP, CPU_SSE };
-		size_t len = sizeof(val);
-		if(sysctl(mib, 2, &val, &len, NULL, 0) < 0 || !val)
-			disable_sse(info);
-		else { /* double-check SSE2 */
-			mib[1] = CPU_SSE2;
-			len = sizeof(val);
-			if(sysctl(mib, 2, &val, &len, NULL, 0) < 0 || !val) {
-				disable_sse(info);
-				info->ia32.sse = true;
-			}
-		}
-# else
-		disable_sse(info);
-# endif
-#elif defined(__ANDROID__) || defined(ANDROID)
-		/* no need to check OS SSE support */
-#elif defined(__linux__)
-		int sse = 0;
-		struct sigaction sigill_save;
-		struct sigaction sigill_sse;
-		sigill_sse.sa_sigaction = sigill_handler_sse_os;
-		__sigemptyset(&sigill_sse.sa_mask);
-		sigill_sse.sa_flags = SA_SIGINFO | SA_RESETHAND; /* SA_RESETHAND just in case our SIGILL return jump breaks, so we don't get stuck in a loop */
-		if(0 == sigaction(SIGILL, &sigill_sse, &sigill_save))
-		{
-			/* http://www.ibiblio.org/gferg/ldp/GCC-Inline-Assembly-HOWTO.html */
-			/* see sigill_handler_sse_os() for an explanation of the following: */
-			asm volatile (
-				"xorps %%xmm0,%%xmm0\n\t" /* will cause SIGILL if unsupported by OS */
-				"incl %0\n\t"             /* SIGILL handler will jump over this */
-				/* landing zone */
-				"nop\n\t" /* SIGILL jump lands here if "inc" is 9 bytes */
-				"nop\n\t"
-				"nop\n\t"
-				"nop\n\t"
-				"nop\n\t"
-				"nop\n\t"
-				"nop\n\t" /* SIGILL jump lands here if "inc" is 3 bytes (expected) */
-				"nop\n\t"
-				"nop"     /* SIGILL jump lands here if "inc" is 1 byte */
-				: "=r"(sse)
-				: "0"(sse)
-			);
-
-			sigaction(SIGILL, &sigill_save, NULL);
-		}
-
-		if(!sse)
-			disable_sse(info);
-#elif defined(_MSC_VER)
-		__try {
-			__asm {
-				xorps xmm0,xmm0
-			}
-		}
-		__except(EXCEPTION_EXECUTE_HANDLER) {
-			if (_exception_code() == STATUS_ILLEGAL_INSTRUCTION)
-				disable_sse(info);
-		}
-#elif defined(__GNUC__) /* MinGW goes here */
-		int sse = 0;
-		/* Based on the idea described in Agner Fog's manual "Optimizing subroutines in assembly language" */
-		/* In theory, not guaranteed to detect lack of OS SSE support on some future Intel CPUs, but in practice works (see the aforementioned manual) */
-		if (ia32_fxsr) {
-			struct {
-				FLAC__uint32 buff[128];
-			} __attribute__((aligned(16))) fxsr;
-			FLAC__uint32 old_val, new_val;
-
-			asm volatile ("fxsave %0"  : "=m" (fxsr) : "m" (fxsr));
-			old_val = fxsr.buff[50];
-			fxsr.buff[50] ^= 0x0013c0de;                             /* change value in the buffer */
-			asm volatile ("fxrstor %0" : "=m" (fxsr) : "m" (fxsr));  /* try to change SSE register */
-			fxsr.buff[50] = old_val;                                 /* restore old value in the buffer */
-			asm volatile ("fxsave %0 " : "=m" (fxsr) : "m" (fxsr));  /* old value will be overwritten if SSE register was changed */
-			new_val = fxsr.buff[50];                                 /* == old_val if FXRSTOR didn't change SSE register and (old_val ^ 0x0013c0de) otherwise */
-			fxsr.buff[50] = old_val;                                 /* again restore old value in the buffer */
-			asm volatile ("fxrstor %0" : "=m" (fxsr) : "m" (fxsr));  /* restore old values of registers */
-
-			if ((old_val^new_val) == 0x0013c0de)
-				sse = 1;
-		}
-		if(!sse)
-			disable_sse(info);
-#else
-		/* no way to test, disable to be safe */
-		disable_sse(info);
-#endif
-#ifdef DEBUG
-		fprintf(stderr, "  SSE OS sup . %c\n", info->ia32.sse ? 'Y' : 'n');
-#endif
+	if (FLAC__HAS_X86INTRIN && FLAC__AVX_SUPPORTED) {
+		dfprintf(stderr, "  AVX ........ %c\n", info->ia32.avx     ? 'Y' : 'n');
+		dfprintf(stderr, "  FMA ........ %c\n", info->ia32.fma     ? 'Y' : 'n');
+		dfprintf(stderr, "  AVX2 ....... %c\n", info->ia32.avx2    ? 'Y' : 'n');
 	}
-	else /* info->ia32.sse == false */
-		disable_sse(info);
 
 	/*
 	 * now have to check for OS support of AVX instructions
 	 */
-	if(info->ia32.avx && ia32_osxsave) {
-		FLAC__uint32 ecr = FLAC__cpu_xgetbv_x86();
-		if ((ecr & 0x6) != 0x6)
-			disable_avx(info);
-#ifdef DEBUG
-		fprintf(stderr, "  AVX OS sup . %c\n", info->ia32.avx ? 'Y' : 'n');
-#endif
+	if (!FLAC__HAS_X86INTRIN || !info->ia32.avx || !ia32_osxsave || (cpu_xgetbv_x86() & 0x6) != 0x6) {
+		/* no OS AVX support */
+		info->ia32.avx     = false;
+		info->ia32.avx2    = false;
+		info->ia32.fma     = false;
 	}
-	else /* no OS AVX support*/
-		disable_avx(info);
+
+	if (FLAC__HAS_X86INTRIN && FLAC__AVX_SUPPORTED) {
+		dfprintf(stderr, "  AVX OS sup . %c\n", info->ia32.avx ? 'Y' : 'n');
+	}
 #else
 	info->use_asm = false;
 #endif
+#endif
+}
 
-/*
- * x86-64-specific
- */
-#elif defined FLAC__CPU_X86_64
+static void
+x86_64_cpu_info (FLAC__CPUInfo *info)
+{
+#if !defined FLAC__NO_ASM && FLAC__HAS_X86INTRIN
 	FLAC__bool x86_osxsave = false;
-	(void) x86_osxsave; /* to avoid warnings about unused variables */
-	memset(info, 0, sizeof(*info));
-	info->type = FLAC__CPUINFO_TYPE_X86_64;
-#if !defined FLAC__NO_ASM && defined FLAC__HAS_X86INTRIN
+	FLAC__uint32 flags_eax, flags_ebx, flags_ecx, flags_edx;
+
 	info->use_asm = true;
-	{
-		/* http://www.sandpile.org/x86/cpuid.htm */
-		FLAC__uint32 flags_eax, flags_ebx, flags_ecx, flags_edx;
-		FLAC__cpu_info_x86(0, &flags_eax, &flags_ebx, &flags_ecx, &flags_edx);
-		info->x86.intel = (flags_ebx == 0x756E6547 && flags_edx == 0x49656E69 && flags_ecx == 0x6C65746E)? true : false; /* GenuineIntel */
-		FLAC__cpu_info_x86(1, &flags_eax, &flags_ebx, &flags_ecx, &flags_edx);
-		info->x86.sse3  = (flags_ecx & FLAC__CPUINFO_IA32_CPUID_SSE3 )? true : false;
-		info->x86.ssse3 = (flags_ecx & FLAC__CPUINFO_IA32_CPUID_SSSE3)? true : false;
-		info->x86.sse41 = (flags_ecx & FLAC__CPUINFO_IA32_CPUID_SSE41)? true : false;
-		info->x86.sse42 = (flags_ecx & FLAC__CPUINFO_IA32_CPUID_SSE42)? true : false;
-#if defined FLAC__AVX_SUPPORTED
-		    x86_osxsave = (flags_ecx & FLAC__CPUINFO_IA32_CPUID_OSXSAVE)? true : false;
-		info->x86.avx   = (flags_ecx & FLAC__CPUINFO_IA32_CPUID_AVX    )? true : false;
-		info->x86.fma   = (flags_ecx & FLAC__CPUINFO_IA32_CPUID_FMA    )? true : false;
+
+	/* http://www.sandpile.org/x86/cpuid.htm */
+	FLAC__cpu_info_x86(0, &flags_eax, &flags_ebx, &flags_ecx, &flags_edx);
+	info->x86.intel = (flags_ebx == 0x756E6547 && flags_edx == 0x49656E69 && flags_ecx == 0x6C65746E) ? true : false; /* GenuineIntel */
+	FLAC__cpu_info_x86(1, &flags_eax, &flags_ebx, &flags_ecx, &flags_edx);
+	info->x86.sse3  = (flags_ecx & FLAC__CPUINFO_IA32_CPUID_SSE3 ) ? true : false;
+	info->x86.ssse3 = (flags_ecx & FLAC__CPUINFO_IA32_CPUID_SSSE3) ? true : false;
+	info->x86.sse41 = (flags_ecx & FLAC__CPUINFO_IA32_CPUID_SSE41) ? true : false;
+	info->x86.sse42 = (flags_ecx & FLAC__CPUINFO_IA32_CPUID_SSE42) ? true : false;
+
+	if (FLAC__AVX_SUPPORTED) {
+		x86_osxsave = (flags_ecx & FLAC__CPUINFO_IA32_CPUID_OSXSAVE) ? true : false;
+		info->x86.avx   = (flags_ecx & FLAC__CPUINFO_IA32_CPUID_AVX    ) ? true : false;
+		info->x86.fma   = (flags_ecx & FLAC__CPUINFO_IA32_CPUID_FMA    ) ? true : false;
 		FLAC__cpu_info_x86(7, &flags_eax, &flags_ebx, &flags_ecx, &flags_edx);
-		info->x86.avx2  = (flags_ebx & FLAC__CPUINFO_IA32_CPUID_AVX2   )? true : false;
-#endif
+		info->x86.avx2  = (flags_ebx & FLAC__CPUINFO_IA32_CPUID_AVX2   ) ? true : false;
 	}
-#ifdef DEBUG
-	fprintf(stderr, "CPU info (x86-64):\n");
-	fprintf(stderr, "  SSE3 ....... %c\n", info->x86.sse3  ? 'Y' : 'n');
-	fprintf(stderr, "  SSSE3 ...... %c\n", info->x86.ssse3 ? 'Y' : 'n');
-	fprintf(stderr, "  SSE41 ...... %c\n", info->x86.sse41 ? 'Y' : 'n');
-	fprintf(stderr, "  SSE42 ...... %c\n", info->x86.sse42 ? 'Y' : 'n');
-# if defined FLAC__AVX_SUPPORTED
-	fprintf(stderr, "  AVX ........ %c\n", info->x86.avx   ? 'Y' : 'n');
-	fprintf(stderr, "  FMA ........ %c\n", info->x86.fma   ? 'Y' : 'n');
-	fprintf(stderr, "  AVX2 ....... %c\n", info->x86.avx2  ? 'Y' : 'n');
-# endif
-#endif
+
+	dfprintf(stderr, "CPU info (x86-64):\n");
+	dfprintf(stderr, "  SSE3 ....... %c\n", info->x86.sse3  ? 'Y' : 'n');
+	dfprintf(stderr, "  SSSE3 ...... %c\n", info->x86.ssse3 ? 'Y' : 'n');
+	dfprintf(stderr, "  SSE41 ...... %c\n", info->x86.sse41 ? 'Y' : 'n');
+	dfprintf(stderr, "  SSE42 ...... %c\n", info->x86.sse42 ? 'Y' : 'n');
+
+	if (FLAC__AVX_SUPPORTED) {
+		dfprintf(stderr, "  AVX ........ %c\n", info->x86.avx   ? 'Y' : 'n');
+		dfprintf(stderr, "  FMA ........ %c\n", info->x86.fma   ? 'Y' : 'n');
+		dfprintf(stderr, "  AVX2 ....... %c\n", info->x86.avx2  ? 'Y' : 'n');
+	}
 
 	/*
 	 * now have to check for OS support of AVX instructions
 	 */
-	if(info->x86.avx && x86_osxsave) {
-		FLAC__uint32 ecr = FLAC__cpu_xgetbv_x86();
-		if ((ecr & 0x6) != 0x6)
-			disable_avx(info);
-#ifdef DEBUG
-		fprintf(stderr, "  AVX OS sup . %c\n", info->x86.avx ? 'Y' : 'n');
-#endif
+	if (!info->x86.avx || !x86_osxsave || (cpu_xgetbv_x86() & 0x6) != 0x6) {
+		/* no OS AVX support */
+		info->x86.avx     = false;
+		info->x86.avx2    = false;
+		info->x86.fma     = false;
 	}
-	else /* no OS AVX support*/
-		disable_avx(info);
-#else
-	info->use_asm = false;
-#endif
 
-/*
- * unknown CPU
- */
+	if (FLAC__AVX_SUPPORTED) {
+		dfprintf(stderr, "  AVX OS sup . %c\n", info->x86.avx ? 'Y' : 'n');
+	}
+#else
+	/* Silence compiler warnings. */
+	(void) info;
+#if defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64
+	if (0) cpu_xgetbv_x86 ();
+#endif
+#endif
+}
+
+void FLAC__cpu_info (FLAC__CPUInfo *info)
+{
+	memset(info, 0, sizeof(*info));
+
+#ifdef FLAC__CPU_IA32
+	info->type = FLAC__CPUINFO_TYPE_IA32;
+#elif defined FLAC__CPU_X86_64
+	info->type = FLAC__CPUINFO_TYPE_X86_64;
 #else
 	info->type = FLAC__CPUINFO_TYPE_UNKNOWN;
 	info->use_asm = false;
 #endif
-}
 
-#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
-
-#if defined _MSC_VER
-#include <intrin.h> /* for __cpuid() and _xgetbv() */
-#elif defined __GNUC__ && defined HAVE_CPUID_H
-#include <cpuid.h> /* for __get_cpuid() and __get_cpuid_max() */
-#endif
-
-FLAC__uint32 FLAC__cpu_have_cpuid_x86(void)
-{
-#ifdef FLAC__CPU_X86_64
-	return 1;
-#else
-# if defined _MSC_VER || defined __INTEL_COMPILER /* Do they support CPUs w/o CPUID support (or OSes that work on those CPUs)? */
-	FLAC__uint32 flags1, flags2;
-	__asm {
-		pushfd
-		pushfd
-		pop		eax
-		mov		flags1, eax
-		xor		eax, 0x200000
-		push	eax
-		popfd
-		pushfd
-		pop		eax
-		mov		flags2, eax
-		popfd
+	switch (info->type) {
+	case FLAC__CPUINFO_TYPE_IA32:
+		ia32_cpu_info (info);
+		break;
+	case FLAC__CPUINFO_TYPE_X86_64:
+		x86_64_cpu_info (info);
+		break;
+	default:
+		info->use_asm = false;
+		break;
 	}
-	if (((flags1^flags2) & 0x200000) != 0)
-		return 1;
-	else
-		return 0;
-# elif defined __GNUC__ && defined HAVE_CPUID_H
-	if (__get_cpuid_max(0, 0) != 0)
-		return 1;
-	else
-		return 0;
-# else
-	return 0;
-# endif
-#endif
 }
 
+#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN
+
 void FLAC__cpu_info_x86(FLAC__uint32 level, FLAC__uint32 *eax, FLAC__uint32 *ebx, FLAC__uint32 *ecx, FLAC__uint32 *edx)
 {
 #if defined _MSC_VER || defined __INTEL_COMPILER
 	int cpuinfo[4];
 	int ext = level & 0x80000000;
 	__cpuid(cpuinfo, ext);
-	if((unsigned)cpuinfo[0] < level) {
-		*eax = *ebx = *ecx = *edx = 0;
+	if((unsigned)cpuinfo[0] >= level) {
+#if FLAC__AVX_SUPPORTED
+		__cpuidex(cpuinfo, ext, 0); /* for AVX2 detection */
+#else
+		__cpuid(cpuinfo, ext); /* some old compilers don't support __cpuidex */
+#endif
+
+		*eax = cpuinfo[0]; *ebx = cpuinfo[1]; *ecx = cpuinfo[2]; *edx = cpuinfo[3];
+
 		return;
 	}
-#if defined FLAC__AVX_SUPPORTED
-	__cpuidex(cpuinfo, level, 0); /* for AVX2 detection */
-#else
-	__cpuid(cpuinfo, level); /* some old compilers don't support __cpuidex */
-#endif
-	*eax = cpuinfo[0]; *ebx = cpuinfo[1]; *ecx = cpuinfo[2]; *edx = cpuinfo[3];
 #elif defined __GNUC__ && defined HAVE_CPUID_H
 	FLAC__uint32 ext = level & 0x80000000;
 	__cpuid(ext, *eax, *ebx, *ecx, *edx);
-	if (*eax < level) {
-		*eax = *ebx = *ecx = *edx = 0;
+	if (*eax >= level) {
+		__cpuid_count(level, 0, *eax, *ebx, *ecx, *edx);
+
 		return;
 	}
-	__cpuid_count(level, 0, *eax, *ebx, *ecx, *edx);
-#else
+#endif
 	*eax = *ebx = *ecx = *edx = 0;
-#endif
-}
-
-FLAC__uint32 FLAC__cpu_xgetbv_x86(void)
-{
-#if (defined _MSC_VER || defined __INTEL_COMPILER) && defined FLAC__AVX_SUPPORTED
-	return (FLAC__uint32)_xgetbv(0);
-#elif defined __GNUC__
-	FLAC__uint32 lo, hi;
-	asm volatile (".byte 0x0f, 0x01, 0xd0" : "=a"(lo), "=d"(hi) : "c" (0));
-	return lo;
-#else
-	return 0;
-#endif
 }
 
 #endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
diff --git a/libFLAC/crc.c b/libFLAC/crc.c
index de2cb18..8123c3b 100644
--- a/libFLAC/crc.c
+++ b/libFLAC/crc.c
@@ -1,6 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
  * Copyright (C) 2000-2009  Josh Coalson
- * Copyright (C) 2011-2014  Xiph.Org Foundation
+ * Copyright (C) 2011-2016  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
diff --git a/libFLAC/fixed.c b/libFLAC/fixed.c
index 74b31b3..1e2d5b2 100644
--- a/libFLAC/fixed.c
+++ b/libFLAC/fixed.c
@@ -1,6 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
  * Copyright (C) 2000-2009  Josh Coalson
- * Copyright (C) 2011-2014  Xiph.Org Foundation
+ * Copyright (C) 2011-2016  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -214,7 +214,7 @@
 #endif
 
 #ifndef FLAC__INTEGER_ONLY_LIBRARY
-unsigned FLAC__fixed_compute_best_predictor(const FLAC__int32 data[], unsigned data_len, FLAC__float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1])
+unsigned FLAC__fixed_compute_best_predictor(const FLAC__int32 data[], unsigned data_len, float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1])
 #else
 unsigned FLAC__fixed_compute_best_predictor(const FLAC__int32 data[], unsigned data_len, FLAC__fixedpoint residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1])
 #endif
@@ -255,11 +255,11 @@
 	FLAC__ASSERT(data_len > 0 || total_error_3 == 0);
 	FLAC__ASSERT(data_len > 0 || total_error_4 == 0);
 #ifndef FLAC__INTEGER_ONLY_LIBRARY
-	residual_bits_per_sample[0] = (FLAC__float)((total_error_0 > 0) ? log(M_LN2 * (FLAC__double)total_error_0 / (FLAC__double)data_len) / M_LN2 : 0.0);
-	residual_bits_per_sample[1] = (FLAC__float)((total_error_1 > 0) ? log(M_LN2 * (FLAC__double)total_error_1 / (FLAC__double)data_len) / M_LN2 : 0.0);
-	residual_bits_per_sample[2] = (FLAC__float)((total_error_2 > 0) ? log(M_LN2 * (FLAC__double)total_error_2 / (FLAC__double)data_len) / M_LN2 : 0.0);
-	residual_bits_per_sample[3] = (FLAC__float)((total_error_3 > 0) ? log(M_LN2 * (FLAC__double)total_error_3 / (FLAC__double)data_len) / M_LN2 : 0.0);
-	residual_bits_per_sample[4] = (FLAC__float)((total_error_4 > 0) ? log(M_LN2 * (FLAC__double)total_error_4 / (FLAC__double)data_len) / M_LN2 : 0.0);
+	residual_bits_per_sample[0] = (float)((total_error_0 > 0) ? log(M_LN2 * (double)total_error_0 / (double)data_len) / M_LN2 : 0.0);
+	residual_bits_per_sample[1] = (float)((total_error_1 > 0) ? log(M_LN2 * (double)total_error_1 / (double)data_len) / M_LN2 : 0.0);
+	residual_bits_per_sample[2] = (float)((total_error_2 > 0) ? log(M_LN2 * (double)total_error_2 / (double)data_len) / M_LN2 : 0.0);
+	residual_bits_per_sample[3] = (float)((total_error_3 > 0) ? log(M_LN2 * (double)total_error_3 / (double)data_len) / M_LN2 : 0.0);
+	residual_bits_per_sample[4] = (float)((total_error_4 > 0) ? log(M_LN2 * (double)total_error_4 / (double)data_len) / M_LN2 : 0.0);
 #else
 	residual_bits_per_sample[0] = (total_error_0 > 0) ? local__compute_rbps_integerized(total_error_0, data_len) : 0;
 	residual_bits_per_sample[1] = (total_error_1 > 0) ? local__compute_rbps_integerized(total_error_1, data_len) : 0;
@@ -272,7 +272,7 @@
 }
 
 #ifndef FLAC__INTEGER_ONLY_LIBRARY
-unsigned FLAC__fixed_compute_best_predictor_wide(const FLAC__int32 data[], unsigned data_len, FLAC__float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1])
+unsigned FLAC__fixed_compute_best_predictor_wide(const FLAC__int32 data[], unsigned data_len, float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1])
 #else
 unsigned FLAC__fixed_compute_best_predictor_wide(const FLAC__int32 data[], unsigned data_len, FLAC__fixedpoint residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1])
 #endif
@@ -317,11 +317,11 @@
 	FLAC__ASSERT(data_len > 0 || total_error_3 == 0);
 	FLAC__ASSERT(data_len > 0 || total_error_4 == 0);
 #ifndef FLAC__INTEGER_ONLY_LIBRARY
-	residual_bits_per_sample[0] = (FLAC__float)((total_error_0 > 0) ? log(M_LN2 * (FLAC__double)total_error_0 / (FLAC__double)data_len) / M_LN2 : 0.0);
-	residual_bits_per_sample[1] = (FLAC__float)((total_error_1 > 0) ? log(M_LN2 * (FLAC__double)total_error_1 / (FLAC__double)data_len) / M_LN2 : 0.0);
-	residual_bits_per_sample[2] = (FLAC__float)((total_error_2 > 0) ? log(M_LN2 * (FLAC__double)total_error_2 / (FLAC__double)data_len) / M_LN2 : 0.0);
-	residual_bits_per_sample[3] = (FLAC__float)((total_error_3 > 0) ? log(M_LN2 * (FLAC__double)total_error_3 / (FLAC__double)data_len) / M_LN2 : 0.0);
-	residual_bits_per_sample[4] = (FLAC__float)((total_error_4 > 0) ? log(M_LN2 * (FLAC__double)total_error_4 / (FLAC__double)data_len) / M_LN2 : 0.0);
+	residual_bits_per_sample[0] = (float)((total_error_0 > 0) ? log(M_LN2 * (double)total_error_0 / (double)data_len) / M_LN2 : 0.0);
+	residual_bits_per_sample[1] = (float)((total_error_1 > 0) ? log(M_LN2 * (double)total_error_1 / (double)data_len) / M_LN2 : 0.0);
+	residual_bits_per_sample[2] = (float)((total_error_2 > 0) ? log(M_LN2 * (double)total_error_2 / (double)data_len) / M_LN2 : 0.0);
+	residual_bits_per_sample[3] = (float)((total_error_3 > 0) ? log(M_LN2 * (double)total_error_3 / (double)data_len) / M_LN2 : 0.0);
+	residual_bits_per_sample[4] = (float)((total_error_4 > 0) ? log(M_LN2 * (double)total_error_4 / (double)data_len) / M_LN2 : 0.0);
 #else
 	residual_bits_per_sample[0] = (total_error_0 > 0) ? local__compute_rbps_wide_integerized(total_error_0, data_len) : 0;
 	residual_bits_per_sample[1] = (total_error_1 > 0) ? local__compute_rbps_wide_integerized(total_error_1, data_len) : 0;
@@ -349,27 +349,15 @@
 			break;
 		case 2:
 			for(i = 0; i < idata_len; i++)
-#if 1 /* OPT: may be faster with some compilers on some systems */
-				residual[i] = data[i] - (data[i-1] << 1) + data[i-2];
-#else
 				residual[i] = data[i] - 2*data[i-1] + data[i-2];
-#endif
 			break;
 		case 3:
 			for(i = 0; i < idata_len; i++)
-#if 1 /* OPT: may be faster with some compilers on some systems */
-				residual[i] = data[i] - (((data[i-1]-data[i-2])<<1) + (data[i-1]-data[i-2])) - data[i-3];
-#else
 				residual[i] = data[i] - 3*data[i-1] + 3*data[i-2] - data[i-3];
-#endif
 			break;
 		case 4:
 			for(i = 0; i < idata_len; i++)
-#if 1 /* OPT: may be faster with some compilers on some systems */
-				residual[i] = data[i] - ((data[i-1]+data[i-3])<<2) + ((data[i-2]<<2) + (data[i-2]<<1)) + data[i-4];
-#else
 				residual[i] = data[i] - 4*data[i-1] + 6*data[i-2] - 4*data[i-3] + data[i-4];
-#endif
 			break;
 		default:
 			FLAC__ASSERT(0);
@@ -391,27 +379,15 @@
 			break;
 		case 2:
 			for(i = 0; i < idata_len; i++)
-#if 1 /* OPT: may be faster with some compilers on some systems */
-				data[i] = residual[i] + (data[i-1]<<1) - data[i-2];
-#else
 				data[i] = residual[i] + 2*data[i-1] - data[i-2];
-#endif
 			break;
 		case 3:
 			for(i = 0; i < idata_len; i++)
-#if 1 /* OPT: may be faster with some compilers on some systems */
-				data[i] = residual[i] + (((data[i-1]-data[i-2])<<1) + (data[i-1]-data[i-2])) + data[i-3];
-#else
 				data[i] = residual[i] + 3*data[i-1] - 3*data[i-2] + data[i-3];
-#endif
 			break;
 		case 4:
 			for(i = 0; i < idata_len; i++)
-#if 1 /* OPT: may be faster with some compilers on some systems */
-				data[i] = residual[i] + ((data[i-1]+data[i-3])<<2) - ((data[i-2]<<2) + (data[i-2]<<1)) - data[i-4];
-#else
 				data[i] = residual[i] + 4*data[i-1] - 6*data[i-2] + 4*data[i-3] - data[i-4];
-#endif
 			break;
 		default:
 			FLAC__ASSERT(0);
diff --git a/libFLAC/fixed_intrin_sse2.c b/libFLAC/fixed_intrin_sse2.c
new file mode 100644
index 0000000..6a9b4dd
--- /dev/null
+++ b/libFLAC/fixed_intrin_sse2.c
@@ -0,0 +1,255 @@
+/* libFLAC - Free Lossless Audio Codec library
+ * Copyright (C) 2000-2009  Josh Coalson
+ * Copyright (C) 2011-2016  Xiph.Org Foundation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * - Neither the name of the Xiph.org Foundation nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#  include <config.h>
+#endif
+
+#include "private/cpu.h"
+
+#ifndef FLAC__INTEGER_ONLY_LIBRARY
+#ifndef FLAC__NO_ASM
+#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
+#include "private/fixed.h"
+#ifdef FLAC__SSE2_SUPPORTED
+
+#include <emmintrin.h> /* SSE2 */
+#include <math.h>
+#include "private/macros.h"
+#include "share/compat.h"
+#include "FLAC/assert.h"
+
+#ifdef FLAC__CPU_IA32
+#define m128i_to_i64(dest, src) _mm_storel_epi64((__m128i*)&dest, src)
+#else
+#define m128i_to_i64(dest, src) dest = _mm_cvtsi128_si64(src)
+#endif
+
+FLAC__SSE_TARGET("sse2")
+unsigned FLAC__fixed_compute_best_predictor_intrin_sse2(const FLAC__int32 data[], unsigned data_len, float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER + 1])
+{
+	FLAC__uint32 total_error_0, total_error_1, total_error_2, total_error_3, total_error_4;
+	unsigned i, order;
+
+	__m128i total_err0, total_err1, total_err2;
+
+	{
+		FLAC__int32 itmp;
+		__m128i last_error;
+
+		last_error = _mm_cvtsi32_si128(data[-1]);							// 0   0   0   le0
+		itmp = data[-2];
+		last_error = _mm_shuffle_epi32(last_error, _MM_SHUFFLE(2,1,0,0));
+		last_error = _mm_sub_epi32(last_error, _mm_cvtsi32_si128(itmp));	// 0   0   le0 le1
+		itmp -= data[-3];
+		last_error = _mm_shuffle_epi32(last_error, _MM_SHUFFLE(2,1,0,0));
+		last_error = _mm_sub_epi32(last_error, _mm_cvtsi32_si128(itmp));	// 0   le0 le1 le2
+		itmp -= data[-3] - data[-4];
+		last_error = _mm_shuffle_epi32(last_error, _MM_SHUFFLE(2,1,0,0));
+		last_error = _mm_sub_epi32(last_error, _mm_cvtsi32_si128(itmp));	// le0 le1 le2 le3
+
+		total_err0 = total_err1 = _mm_setzero_si128();
+		for(i = 0; i < data_len; i++) {
+			__m128i err0, err1, tmp;
+			err0 = _mm_cvtsi32_si128(data[i]);								// 0   0   0   e0
+			err1 = _mm_shuffle_epi32(err0, _MM_SHUFFLE(0,0,0,0));			// e0  e0  e0  e0
+#if 1 /* OPT_SSE */
+			err1 = _mm_sub_epi32(err1, last_error);
+			last_error = _mm_srli_si128(last_error, 4);						// 0   le0 le1 le2
+			err1 = _mm_sub_epi32(err1, last_error);
+			last_error = _mm_srli_si128(last_error, 4);						// 0   0   le0 le1
+			err1 = _mm_sub_epi32(err1, last_error);
+			last_error = _mm_srli_si128(last_error, 4);						// 0   0   0   le0
+			err1 = _mm_sub_epi32(err1, last_error);							// e1  e2  e3  e4
+#else
+			last_error = _mm_add_epi32(last_error, _mm_srli_si128(last_error, 8));	// le0  le1  le2+le0  le3+le1
+			last_error = _mm_add_epi32(last_error, _mm_srli_si128(last_error, 4));	// le0  le1+le0  le2+le0+le1  le3+le1+le2+le0
+			err1 = _mm_sub_epi32(err1, last_error);							// e1  e2  e3  e4
+#endif
+			tmp = _mm_slli_si128(err0, 12);									// e0   0   0   0
+			last_error = _mm_srli_si128(err1, 4);							//  0  e1  e2  e3
+			last_error = _mm_or_si128(last_error, tmp);						// e0  e1  e2  e3
+
+			tmp = _mm_srai_epi32(err0, 31);
+			err0 = _mm_xor_si128(err0, tmp);
+			err0 = _mm_sub_epi32(err0, tmp);
+			tmp = _mm_srai_epi32(err1, 31);
+			err1 = _mm_xor_si128(err1, tmp);
+			err1 = _mm_sub_epi32(err1, tmp);
+
+			total_err0 = _mm_add_epi32(total_err0, err0);					// 0   0   0   te0
+			total_err1 = _mm_add_epi32(total_err1, err1);					// te1 te2 te3 te4
+		}
+	}
+
+	total_error_0 = _mm_cvtsi128_si32(total_err0);
+	total_err2 = total_err1;											// te1  te2  te3  te4
+	total_err1 = _mm_srli_si128(total_err1, 8);							//  0    0   te1  te2
+	total_error_4 = _mm_cvtsi128_si32(total_err2);
+	total_error_2 = _mm_cvtsi128_si32(total_err1);
+	total_err2 = _mm_srli_si128(total_err2,	4);							//  0   te1  te2  te3
+	total_err1 = _mm_srli_si128(total_err1, 4);							//  0    0    0   te1
+	total_error_3 = _mm_cvtsi128_si32(total_err2);
+	total_error_1 = _mm_cvtsi128_si32(total_err1);
+
+	/* prefer higher order */
+	if(total_error_0 < flac_min(flac_min(flac_min(total_error_1, total_error_2), total_error_3), total_error_4))
+		order = 0;
+	else if(total_error_1 < flac_min(flac_min(total_error_2, total_error_3), total_error_4))
+		order = 1;
+	else if(total_error_2 < flac_min(total_error_3, total_error_4))
+		order = 2;
+	else if(total_error_3 < total_error_4)
+		order = 3;
+	else
+		order = 4;
+
+	/* Estimate the expected number of bits per residual signal sample. */
+	/* 'total_error*' is linearly related to the variance of the residual */
+	/* signal, so we use it directly to compute E(|x|) */
+	FLAC__ASSERT(data_len > 0 || total_error_0 == 0);
+	FLAC__ASSERT(data_len > 0 || total_error_1 == 0);
+	FLAC__ASSERT(data_len > 0 || total_error_2 == 0);
+	FLAC__ASSERT(data_len > 0 || total_error_3 == 0);
+	FLAC__ASSERT(data_len > 0 || total_error_4 == 0);
+
+	residual_bits_per_sample[0] = (float)((total_error_0 > 0) ? log(M_LN2 * (double)total_error_0 / (double)data_len) / M_LN2 : 0.0);
+	residual_bits_per_sample[1] = (float)((total_error_1 > 0) ? log(M_LN2 * (double)total_error_1 / (double)data_len) / M_LN2 : 0.0);
+	residual_bits_per_sample[2] = (float)((total_error_2 > 0) ? log(M_LN2 * (double)total_error_2 / (double)data_len) / M_LN2 : 0.0);
+	residual_bits_per_sample[3] = (float)((total_error_3 > 0) ? log(M_LN2 * (double)total_error_3 / (double)data_len) / M_LN2 : 0.0);
+	residual_bits_per_sample[4] = (float)((total_error_4 > 0) ? log(M_LN2 * (double)total_error_4 / (double)data_len) / M_LN2 : 0.0);
+
+	return order;
+}
+
+FLAC__SSE_TARGET("sse2")
+unsigned FLAC__fixed_compute_best_predictor_wide_intrin_sse2(const FLAC__int32 data[], unsigned data_len, float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER + 1])
+{
+	FLAC__uint64 total_error_0, total_error_1, total_error_2, total_error_3, total_error_4;
+	unsigned i, order;
+
+	__m128i total_err0, total_err1, total_err3;
+
+	{
+		FLAC__int32 itmp;
+		__m128i last_error, zero = _mm_setzero_si128();
+
+		last_error = _mm_cvtsi32_si128(data[-1]);							// 0   0   0   le0
+		itmp = data[-2];
+		last_error = _mm_shuffle_epi32(last_error, _MM_SHUFFLE(2,1,0,0));
+		last_error = _mm_sub_epi32(last_error, _mm_cvtsi32_si128(itmp));	// 0   0   le0 le1
+		itmp -= data[-3];
+		last_error = _mm_shuffle_epi32(last_error, _MM_SHUFFLE(2,1,0,0));
+		last_error = _mm_sub_epi32(last_error, _mm_cvtsi32_si128(itmp));	// 0   le0 le1 le2
+		itmp -= data[-3] - data[-4];
+		last_error = _mm_shuffle_epi32(last_error, _MM_SHUFFLE(2,1,0,0));
+		last_error = _mm_sub_epi32(last_error, _mm_cvtsi32_si128(itmp));	// le0 le1 le2 le3
+
+		total_err0 = total_err1 = total_err3 = _mm_setzero_si128();
+		for(i = 0; i < data_len; i++) {
+			__m128i err0, err1, tmp;
+			err0 = _mm_cvtsi32_si128(data[i]);								// 0   0   0   e0
+			err1 = _mm_shuffle_epi32(err0, _MM_SHUFFLE(0,0,0,0));			// e0  e0  e0  e0
+#if 1 /* OPT_SSE */
+			err1 = _mm_sub_epi32(err1, last_error);
+			last_error = _mm_srli_si128(last_error, 4);						// 0   le0 le1 le2
+			err1 = _mm_sub_epi32(err1, last_error);
+			last_error = _mm_srli_si128(last_error, 4);						// 0   0   le0 le1
+			err1 = _mm_sub_epi32(err1, last_error);
+			last_error = _mm_srli_si128(last_error, 4);						// 0   0   0   le0
+			err1 = _mm_sub_epi32(err1, last_error);							// e1  e2  e3  e4
+#else
+			last_error = _mm_add_epi32(last_error, _mm_srli_si128(last_error, 8));	// le0  le1  le2+le0  le3+le1
+			last_error = _mm_add_epi32(last_error, _mm_srli_si128(last_error, 4));	// le0  le1+le0  le2+le0+le1  le3+le1+le2+le0
+			err1 = _mm_sub_epi32(err1, last_error);							// e1  e2  e3  e4
+#endif
+			tmp = _mm_slli_si128(err0, 12);									// e0   0   0   0
+			last_error = _mm_srli_si128(err1, 4);							//  0  e1  e2  e3
+			last_error = _mm_or_si128(last_error, tmp);						// e0  e1  e2  e3
+
+			tmp = _mm_srai_epi32(err0, 31);
+			err0 = _mm_xor_si128(err0, tmp);
+			err0 = _mm_sub_epi32(err0, tmp);
+			tmp = _mm_srai_epi32(err1, 31);
+			err1 = _mm_xor_si128(err1, tmp);
+			err1 = _mm_sub_epi32(err1, tmp);
+
+			total_err0 = _mm_add_epi64(total_err0, err0);					//        0       te0
+			err0 = _mm_unpacklo_epi32(err1, zero);							//   0  |e3|   0  |e4|
+			err1 = _mm_unpackhi_epi32(err1, zero);							//   0  |e1|   0  |e2|
+			total_err3 = _mm_add_epi64(total_err3, err0);					//       te3      te4
+			total_err1 = _mm_add_epi64(total_err1, err1);					//       te1      te2
+		}
+	}
+
+	m128i_to_i64(total_error_0, total_err0);
+	m128i_to_i64(total_error_4, total_err3);
+	m128i_to_i64(total_error_2, total_err1);
+	total_err3 = _mm_srli_si128(total_err3,	8);							//         0      te3
+	total_err1 = _mm_srli_si128(total_err1, 8);							//         0      te1
+	m128i_to_i64(total_error_3, total_err3);
+	m128i_to_i64(total_error_1, total_err1);
+
+	/* prefer higher order */
+	if(total_error_0 < flac_min(flac_min(flac_min(total_error_1, total_error_2), total_error_3), total_error_4))
+		order = 0;
+	else if(total_error_1 < flac_min(flac_min(total_error_2, total_error_3), total_error_4))
+		order = 1;
+	else if(total_error_2 < flac_min(total_error_3, total_error_4))
+		order = 2;
+	else if(total_error_3 < total_error_4)
+		order = 3;
+	else
+		order = 4;
+
+	/* Estimate the expected number of bits per residual signal sample. */
+	/* 'total_error*' is linearly related to the variance of the residual */
+	/* signal, so we use it directly to compute E(|x|) */
+	FLAC__ASSERT(data_len > 0 || total_error_0 == 0);
+	FLAC__ASSERT(data_len > 0 || total_error_1 == 0);
+	FLAC__ASSERT(data_len > 0 || total_error_2 == 0);
+	FLAC__ASSERT(data_len > 0 || total_error_3 == 0);
+	FLAC__ASSERT(data_len > 0 || total_error_4 == 0);
+
+	residual_bits_per_sample[0] = (float)((total_error_0 > 0) ? log(M_LN2 * (double)total_error_0 / (double)data_len) / M_LN2 : 0.0);
+	residual_bits_per_sample[1] = (float)((total_error_1 > 0) ? log(M_LN2 * (double)total_error_1 / (double)data_len) / M_LN2 : 0.0);
+	residual_bits_per_sample[2] = (float)((total_error_2 > 0) ? log(M_LN2 * (double)total_error_2 / (double)data_len) / M_LN2 : 0.0);
+	residual_bits_per_sample[3] = (float)((total_error_3 > 0) ? log(M_LN2 * (double)total_error_3 / (double)data_len) / M_LN2 : 0.0);
+	residual_bits_per_sample[4] = (float)((total_error_4 > 0) ? log(M_LN2 * (double)total_error_4 / (double)data_len) / M_LN2 : 0.0);
+
+	return order;
+}
+
+#endif /* FLAC__SSE2_SUPPORTED */
+#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
+#endif /* FLAC__NO_ASM */
+#endif /* FLAC__INTEGER_ONLY_LIBRARY */
diff --git a/libFLAC/fixed_intrin_ssse3.c b/libFLAC/fixed_intrin_ssse3.c
new file mode 100644
index 0000000..f4d93e8
--- /dev/null
+++ b/libFLAC/fixed_intrin_ssse3.c
@@ -0,0 +1,243 @@
+/* libFLAC - Free Lossless Audio Codec library
+ * Copyright (C) 2000-2009  Josh Coalson
+ * Copyright (C) 2011-2016  Xiph.Org Foundation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * - Neither the name of the Xiph.org Foundation nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#  include <config.h>
+#endif
+
+#include "private/cpu.h"
+
+#ifndef FLAC__INTEGER_ONLY_LIBRARY
+#ifndef FLAC__NO_ASM
+#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN
+#include "private/fixed.h"
+#ifdef FLAC__SSSE3_SUPPORTED
+
+#include <tmmintrin.h> /* SSSE3 */
+#include <math.h>
+#include "private/macros.h"
+#include "share/compat.h"
+#include "FLAC/assert.h"
+
+#ifdef FLAC__CPU_IA32
+#define m128i_to_i64(dest, src) _mm_storel_epi64((__m128i*)&dest, src)
+#else
+#define m128i_to_i64(dest, src) dest = _mm_cvtsi128_si64(src)
+#endif
+
+FLAC__SSE_TARGET("ssse3")
+unsigned FLAC__fixed_compute_best_predictor_intrin_ssse3(const FLAC__int32 data[], unsigned data_len, float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER + 1])
+{
+	FLAC__uint32 total_error_0, total_error_1, total_error_2, total_error_3, total_error_4;
+	unsigned i, order;
+
+	__m128i total_err0, total_err1, total_err2;
+
+	{
+		FLAC__int32 itmp;
+		__m128i last_error;
+
+		last_error = _mm_cvtsi32_si128(data[-1]);							// 0   0   0   le0
+		itmp = data[-2];
+		last_error = _mm_shuffle_epi32(last_error, _MM_SHUFFLE(2,1,0,0));
+		last_error = _mm_sub_epi32(last_error, _mm_cvtsi32_si128(itmp));	// 0   0   le0 le1
+		itmp -= data[-3];
+		last_error = _mm_shuffle_epi32(last_error, _MM_SHUFFLE(2,1,0,0));
+		last_error = _mm_sub_epi32(last_error, _mm_cvtsi32_si128(itmp));	// 0   le0 le1 le2
+		itmp -= data[-3] - data[-4];
+		last_error = _mm_shuffle_epi32(last_error, _MM_SHUFFLE(2,1,0,0));
+		last_error = _mm_sub_epi32(last_error, _mm_cvtsi32_si128(itmp));	// le0 le1 le2 le3
+
+		total_err0 = total_err1 = _mm_setzero_si128();
+		for(i = 0; i < data_len; i++) {
+			__m128i err0, err1;
+			err0 = _mm_cvtsi32_si128(data[i]);								// 0   0   0   e0
+			err1 = _mm_shuffle_epi32(err0, _MM_SHUFFLE(0,0,0,0));			// e0  e0  e0  e0
+#if 1 /* OPT_SSE */
+			err1 = _mm_sub_epi32(err1, last_error);
+			last_error = _mm_srli_si128(last_error, 4);						// 0   le0 le1 le2
+			err1 = _mm_sub_epi32(err1, last_error);
+			last_error = _mm_srli_si128(last_error, 4);						// 0   0   le0 le1
+			err1 = _mm_sub_epi32(err1, last_error);
+			last_error = _mm_srli_si128(last_error, 4);						// 0   0   0   le0
+			err1 = _mm_sub_epi32(err1, last_error);							// e1  e2  e3  e4
+#else
+			last_error = _mm_add_epi32(last_error, _mm_srli_si128(last_error, 8));	// le0  le1  le2+le0  le3+le1
+			last_error = _mm_add_epi32(last_error, _mm_srli_si128(last_error, 4));	// le0  le1+le0  le2+le0+le1  le3+le1+le2+le0
+			err1 = _mm_sub_epi32(err1, last_error);							// e1  e2  e3  e4
+#endif
+			last_error = _mm_alignr_epi8(err0, err1, 4);					// e0  e1  e2  e3
+
+			err0 = _mm_abs_epi32(err0);
+			err1 = _mm_abs_epi32(err1);
+
+			total_err0 = _mm_add_epi32(total_err0, err0);					// 0   0   0   te0
+			total_err1 = _mm_add_epi32(total_err1, err1);					// te1 te2 te3 te4
+		}
+	}
+
+	total_error_0 = _mm_cvtsi128_si32(total_err0);
+	total_err2 = total_err1;											// te1  te2  te3  te4
+	total_err1 = _mm_srli_si128(total_err1, 8);							//  0    0   te1  te2
+	total_error_4 = _mm_cvtsi128_si32(total_err2);
+	total_error_2 = _mm_cvtsi128_si32(total_err1);
+	total_err2 = _mm_srli_si128(total_err2,	4);							//  0   te1  te2  te3
+	total_err1 = _mm_srli_si128(total_err1, 4);							//  0    0    0   te1
+	total_error_3 = _mm_cvtsi128_si32(total_err2);
+	total_error_1 = _mm_cvtsi128_si32(total_err1);
+
+	/* prefer higher order */
+	if(total_error_0 < flac_min(flac_min(flac_min(total_error_1, total_error_2), total_error_3), total_error_4))
+		order = 0;
+	else if(total_error_1 < flac_min(flac_min(total_error_2, total_error_3), total_error_4))
+		order = 1;
+	else if(total_error_2 < flac_min(total_error_3, total_error_4))
+		order = 2;
+	else if(total_error_3 < total_error_4)
+		order = 3;
+	else
+		order = 4;
+
+	/* Estimate the expected number of bits per residual signal sample. */
+	/* 'total_error*' is linearly related to the variance of the residual */
+	/* signal, so we use it directly to compute E(|x|) */
+	FLAC__ASSERT(data_len > 0 || total_error_0 == 0);
+	FLAC__ASSERT(data_len > 0 || total_error_1 == 0);
+	FLAC__ASSERT(data_len > 0 || total_error_2 == 0);
+	FLAC__ASSERT(data_len > 0 || total_error_3 == 0);
+	FLAC__ASSERT(data_len > 0 || total_error_4 == 0);
+
+	residual_bits_per_sample[0] = (float)((total_error_0 > 0) ? log(M_LN2 * (double)total_error_0 / (double)data_len) / M_LN2 : 0.0);
+	residual_bits_per_sample[1] = (float)((total_error_1 > 0) ? log(M_LN2 * (double)total_error_1 / (double)data_len) / M_LN2 : 0.0);
+	residual_bits_per_sample[2] = (float)((total_error_2 > 0) ? log(M_LN2 * (double)total_error_2 / (double)data_len) / M_LN2 : 0.0);
+	residual_bits_per_sample[3] = (float)((total_error_3 > 0) ? log(M_LN2 * (double)total_error_3 / (double)data_len) / M_LN2 : 0.0);
+	residual_bits_per_sample[4] = (float)((total_error_4 > 0) ? log(M_LN2 * (double)total_error_4 / (double)data_len) / M_LN2 : 0.0);
+
+	return order;
+}
+
+FLAC__SSE_TARGET("ssse3")
+unsigned FLAC__fixed_compute_best_predictor_wide_intrin_ssse3(const FLAC__int32 data[], unsigned data_len, float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER + 1])
+{
+	FLAC__uint64 total_error_0, total_error_1, total_error_2, total_error_3, total_error_4;
+	unsigned i, order;
+
+	__m128i total_err0, total_err1, total_err3;
+
+	{
+		FLAC__int32 itmp;
+		__m128i last_error, zero = _mm_setzero_si128();
+
+		last_error = _mm_cvtsi32_si128(data[-1]);							// 0   0   0   le0
+		itmp = data[-2];
+		last_error = _mm_shuffle_epi32(last_error, _MM_SHUFFLE(2,1,0,0));
+		last_error = _mm_sub_epi32(last_error, _mm_cvtsi32_si128(itmp));	// 0   0   le0 le1
+		itmp -= data[-3];
+		last_error = _mm_shuffle_epi32(last_error, _MM_SHUFFLE(2,1,0,0));
+		last_error = _mm_sub_epi32(last_error, _mm_cvtsi32_si128(itmp));	// 0   le0 le1 le2
+		itmp -= data[-3] - data[-4];
+		last_error = _mm_shuffle_epi32(last_error, _MM_SHUFFLE(2,1,0,0));
+		last_error = _mm_sub_epi32(last_error, _mm_cvtsi32_si128(itmp));	// le0 le1 le2 le3
+
+		total_err0 = total_err1 = total_err3 = _mm_setzero_si128();
+		for(i = 0; i < data_len; i++) {
+			__m128i err0, err1;
+			err0 = _mm_cvtsi32_si128(data[i]);								// 0   0   0   e0
+			err1 = _mm_shuffle_epi32(err0, _MM_SHUFFLE(0,0,0,0));			// e0  e0  e0  e0
+#if 1 /* OPT_SSE */
+			err1 = _mm_sub_epi32(err1, last_error);
+			last_error = _mm_srli_si128(last_error, 4);						// 0   le0 le1 le2
+			err1 = _mm_sub_epi32(err1, last_error);
+			last_error = _mm_srli_si128(last_error, 4);						// 0   0   le0 le1
+			err1 = _mm_sub_epi32(err1, last_error);
+			last_error = _mm_srli_si128(last_error, 4);						// 0   0   0   le0
+			err1 = _mm_sub_epi32(err1, last_error);							// e1  e2  e3  e4
+#else
+			last_error = _mm_add_epi32(last_error, _mm_srli_si128(last_error, 8));	// le0  le1  le2+le0  le3+le1
+			last_error = _mm_add_epi32(last_error, _mm_srli_si128(last_error, 4));	// le0  le1+le0  le2+le0+le1  le3+le1+le2+le0
+			err1 = _mm_sub_epi32(err1, last_error);							// e1  e2  e3  e4
+#endif
+			last_error = _mm_alignr_epi8(err0, err1, 4);					// e0  e1  e2  e3
+
+			err0 = _mm_abs_epi32(err0);
+			err1 = _mm_abs_epi32(err1);										// |e1| |e2| |e3| |e4|
+
+			total_err0 = _mm_add_epi64(total_err0, err0);					//        0       te0
+			err0 = _mm_unpacklo_epi32(err1, zero);							//   0  |e3|   0  |e4|
+			err1 = _mm_unpackhi_epi32(err1, zero);							//   0  |e1|   0  |e2|
+			total_err3 = _mm_add_epi64(total_err3, err0);					//       te3      te4
+			total_err1 = _mm_add_epi64(total_err1, err1);					//       te1      te2
+		}
+	}
+
+	m128i_to_i64(total_error_0, total_err0);
+	m128i_to_i64(total_error_4, total_err3);
+	m128i_to_i64(total_error_2, total_err1);
+	total_err3 = _mm_srli_si128(total_err3,	8);							//         0      te3
+	total_err1 = _mm_srli_si128(total_err1, 8);							//         0      te1
+	m128i_to_i64(total_error_3, total_err3);
+	m128i_to_i64(total_error_1, total_err1);
+
+	/* prefer higher order */
+	if(total_error_0 < flac_min(flac_min(flac_min(total_error_1, total_error_2), total_error_3), total_error_4))
+		order = 0;
+	else if(total_error_1 < flac_min(flac_min(total_error_2, total_error_3), total_error_4))
+		order = 1;
+	else if(total_error_2 < flac_min(total_error_3, total_error_4))
+		order = 2;
+	else if(total_error_3 < total_error_4)
+		order = 3;
+	else
+		order = 4;
+
+	/* Estimate the expected number of bits per residual signal sample. */
+	/* 'total_error*' is linearly related to the variance of the residual */
+	/* signal, so we use it directly to compute E(|x|) */
+	FLAC__ASSERT(data_len > 0 || total_error_0 == 0);
+	FLAC__ASSERT(data_len > 0 || total_error_1 == 0);
+	FLAC__ASSERT(data_len > 0 || total_error_2 == 0);
+	FLAC__ASSERT(data_len > 0 || total_error_3 == 0);
+	FLAC__ASSERT(data_len > 0 || total_error_4 == 0);
+
+	residual_bits_per_sample[0] = (float)((total_error_0 > 0) ? log(M_LN2 * (double)total_error_0 / (double)data_len) / M_LN2 : 0.0);
+	residual_bits_per_sample[1] = (float)((total_error_1 > 0) ? log(M_LN2 * (double)total_error_1 / (double)data_len) / M_LN2 : 0.0);
+	residual_bits_per_sample[2] = (float)((total_error_2 > 0) ? log(M_LN2 * (double)total_error_2 / (double)data_len) / M_LN2 : 0.0);
+	residual_bits_per_sample[3] = (float)((total_error_3 > 0) ? log(M_LN2 * (double)total_error_3 / (double)data_len) / M_LN2 : 0.0);
+	residual_bits_per_sample[4] = (float)((total_error_4 > 0) ? log(M_LN2 * (double)total_error_4 / (double)data_len) / M_LN2 : 0.0);
+
+	return order;
+}
+
+#endif /* FLAC__SSSE3_SUPPORTED */
+#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
+#endif /* FLAC__NO_ASM */
+#endif /* FLAC__INTEGER_ONLY_LIBRARY */
diff --git a/libFLAC/float.c b/libFLAC/float.c
index 068069f..25d1a78 100644
--- a/libFLAC/float.c
+++ b/libFLAC/float.c
@@ -1,6 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
  * Copyright (C) 2004-2009  Josh Coalson
- * Copyright (C) 2011-2014  Xiph.Org Foundation
+ * Copyright (C) 2011-2016  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
diff --git a/libFLAC/format.c b/libFLAC/format.c
index 0b1d10b..214bd09 100644
--- a/libFLAC/format.c
+++ b/libFLAC/format.c
@@ -1,6 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
  * Copyright (C) 2000-2009  Josh Coalson
- * Copyright (C) 2011-2014  Xiph.Org Foundation
+ * Copyright (C) 2011-2016  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -44,10 +44,10 @@
 #include "private/format.h"
 #include "private/macros.h"
 
-/* VERSION should come from configure */
-FLAC_API const char *FLAC__VERSION_STRING = VERSION;
+/* PACKAGE_VERSION should come from configure */
+FLAC_API const char *FLAC__VERSION_STRING = PACKAGE_VERSION;
 
-FLAC_API const char *FLAC__VENDOR_STRING = "reference libFLAC " VERSION " 20141125";
+FLAC_API const char *FLAC__VENDOR_STRING = "reference libFLAC " PACKAGE_VERSION " 20170101";
 
 FLAC_API const FLAC__byte FLAC__STREAM_SYNC_STRING[4] = { 'f','L','a','C' };
 FLAC_API const unsigned FLAC__STREAM_SYNC = 0x664C6143;
diff --git a/libFLAC/include/private/all.h b/libFLAC/include/private/all.h
new file mode 100644
index 0000000..7715962
--- /dev/null
+++ b/libFLAC/include/private/all.h
@@ -0,0 +1,50 @@
+/* libFLAC - Free Lossless Audio Codec library
+ * Copyright (C) 2000-2009  Josh Coalson
+ * Copyright (C) 2011-2016  Xiph.Org Foundation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * - Neither the name of the Xiph.org Foundation nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef FLAC__PRIVATE__ALL_H
+#define FLAC__PRIVATE__ALL_H
+
+#include "bitmath.h"
+#include "bitreader.h"
+#include "bitwriter.h"
+#include "cpu.h"
+#include "crc.h"
+#include "fixed.h"
+#include "float.h"
+#include "format.h"
+#include "lpc.h"
+#include "md5.h"
+#include "memory.h"
+#include "metadata.h"
+#include "stream_encoder_framing.h"
+
+#endif
diff --git a/libFLAC/include/private/bitmath.h b/libFLAC/include/private/bitmath.h
index 22d894f..9c75f85 100644
--- a/libFLAC/include/private/bitmath.h
+++ b/libFLAC/include/private/bitmath.h
@@ -1,6 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
  * Copyright (C) 2001-2009  Josh Coalson
- * Copyright (C) 2011-2014  Xiph.Org Foundation
+ * Copyright (C) 2011-2016  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -36,69 +36,98 @@
 #include "FLAC/ordinals.h"
 #include "FLAC/assert.h"
 
-/* for CHAR_BIT */
-#include <limits.h>
 #include "share/compat.h"
 
-#if defined(_MSC_VER) && (_MSC_VER >= 1400)
+#if defined(_MSC_VER)
 #include <intrin.h> /* for _BitScanReverse* */
 #endif
 
 /* Will never be emitted for MSVC, GCC, Intel compilers */
-static inline unsigned int FLAC__clz_soft_uint32(unsigned int word)
+static inline unsigned int FLAC__clz_soft_uint32(FLAC__uint32 word)
 {
-    static const unsigned char byte_to_unary_table[] = {
-    8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
-    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
-    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    };
+	static const unsigned char byte_to_unary_table[] = {
+	8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
+	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	};
 
-    return (word) > 0xffffff ? byte_to_unary_table[(word) >> 24] :
-    (word) > 0xffff ? byte_to_unary_table[(word) >> 16] + 8 :
-    (word) > 0xff ? byte_to_unary_table[(word) >> 8] + 16 :
-    byte_to_unary_table[(word)] + 24;
+	return word > 0xffffff ? byte_to_unary_table[word >> 24] :
+		word > 0xffff ? byte_to_unary_table[word >> 16] + 8 :
+		word > 0xff ? byte_to_unary_table[word >> 8] + 16 :
+		byte_to_unary_table[word] + 24;
 }
 
 static inline unsigned int FLAC__clz_uint32(FLAC__uint32 v)
 {
 /* Never used with input 0 */
-    FLAC__ASSERT(v > 0);
+	FLAC__ASSERT(v > 0);
 #if defined(__INTEL_COMPILER)
-    return _bit_scan_reverse(v) ^ 31U;
+	return _bit_scan_reverse(v) ^ 31U;
 #elif defined(__GNUC__) && (__GNUC__ >= 4 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
 /* This will translate either to (bsr ^ 31U), clz , ctlz, cntlz, lzcnt depending on
  * -march= setting or to a software routine in exotic machines. */
-    return __builtin_clz(v);
-#elif defined(_MSC_VER) && (_MSC_VER >= 1400)
-    {
-        unsigned long idx;
-        _BitScanReverse(&idx, v);
-        return idx ^ 31U;
-    }
+	return __builtin_clz(v);
+#elif defined(_MSC_VER)
+	{
+		unsigned long idx;
+		_BitScanReverse(&idx, v);
+		return idx ^ 31U;
+	}
 #else
-    return FLAC__clz_soft_uint32(v);
+	return FLAC__clz_soft_uint32(v);
 #endif
 }
 
-/* This one works with input 0 */
+/* Used when 64-bit bsr/clz is unavailable; can use 32-bit bsr/clz when possible */
+static inline unsigned int FLAC__clz_soft_uint64(FLAC__uint64 word)
+{
+	return (FLAC__uint32)(word>>32) ? FLAC__clz_uint32((FLAC__uint32)(word>>32)) :
+		FLAC__clz_uint32((FLAC__uint32)word) + 32;
+}
+
+static inline unsigned int FLAC__clz_uint64(FLAC__uint64 v)
+{
+	/* Never used with input 0 */
+	FLAC__ASSERT(v > 0);
+#if defined(__GNUC__) && (__GNUC__ >= 4 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
+	return __builtin_clzll(v);
+#elif (defined(__INTEL_COMPILER) || defined(_MSC_VER)) && (defined(_M_IA64) || defined(_M_X64))
+	{
+		unsigned long idx;
+		_BitScanReverse64(&idx, v);
+		return idx ^ 63U;
+	}
+#else
+	return FLAC__clz_soft_uint64(v);
+#endif
+}
+
+/* These two functions work with input 0 */
 static inline unsigned int FLAC__clz2_uint32(FLAC__uint32 v)
 {
-    if (!v)
-        return 32;
-    return FLAC__clz_uint32(v);
+	if (!v)
+		return 32;
+	return FLAC__clz_uint32(v);
+}
+
+static inline unsigned int FLAC__clz2_uint64(FLAC__uint64 v)
+{
+	if (!v)
+		return 64;
+	return FLAC__clz_uint64(v);
 }
 
 /* An example of what FLAC__bitmath_ilog2() computes:
@@ -126,61 +155,56 @@
 
 static inline unsigned FLAC__bitmath_ilog2(FLAC__uint32 v)
 {
-    FLAC__ASSERT(v > 0);
+	FLAC__ASSERT(v > 0);
 #if defined(__INTEL_COMPILER)
-    return _bit_scan_reverse(v);
-#elif defined(_MSC_VER) && (_MSC_VER >= 1400)
-    {
-        unsigned long idx;
-        _BitScanReverse(&idx, v);
-        return idx;
-    }
+	return _bit_scan_reverse(v);
+#elif defined(_MSC_VER)
+	{
+		unsigned long idx;
+		_BitScanReverse(&idx, v);
+		return idx;
+	}
 #else
-    return sizeof(FLAC__uint32) * CHAR_BIT  - 1 - FLAC__clz_uint32(v);
+	return FLAC__clz_uint32(v) ^ 31U;
 #endif
 }
 
-
-#ifdef FLAC__INTEGER_ONLY_LIBRARY /* Unused otherwise */
-
 static inline unsigned FLAC__bitmath_ilog2_wide(FLAC__uint64 v)
 {
-    FLAC__ASSERT(v > 0);
+	FLAC__ASSERT(v > 0);
 #if defined(__GNUC__) && (__GNUC__ >= 4 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
-    return sizeof(FLAC__uint64) * CHAR_BIT - 1 - __builtin_clzll(v);
+	return __builtin_clzll(v) ^ 63U;
 /* Sorry, only supported in x64/Itanium.. and both have fast FPU which makes integer-only encoder pointless */
-#elif (defined(_MSC_VER) && (_MSC_VER >= 1400)) && (defined(_M_IA64) || defined(_M_X64))
-    {
-        unsigned long idx;
-        _BitScanReverse64(&idx, v);
-        return idx;
-    }
+#elif (defined(__INTEL_COMPILER) || defined(_MSC_VER)) && (defined(_M_IA64) || defined(_M_X64))
+	{
+		unsigned long idx;
+		_BitScanReverse64(&idx, v);
+		return idx;
+	}
 #else
 /*  Brain-damaged compilers will use the fastest possible way that is,
-    de Bruijn sequences (http://supertech.csail.mit.edu/papers/debruijn.pdf)
-    (C) Timothy B. Terriberry (tterribe@xiph.org) 2001-2009 CC0 (Public domain).
+	de Bruijn sequences (http://supertech.csail.mit.edu/papers/debruijn.pdf)
+	(C) Timothy B. Terriberry (tterribe@xiph.org) 2001-2009 CC0 (Public domain).
 */
-    {
-        static const unsigned char DEBRUIJN_IDX64[64]={
-            0, 1, 2, 7, 3,13, 8,19, 4,25,14,28, 9,34,20,40,
-            5,17,26,38,15,46,29,48,10,31,35,54,21,50,41,57,
-            63, 6,12,18,24,27,33,39,16,37,45,47,30,53,49,56,
-            62,11,23,32,36,44,52,55,61,22,43,51,60,42,59,58
-        };
-        v|= v>>1;
-        v|= v>>2;
-        v|= v>>4;
-        v|= v>>8;
-        v|= v>>16;
-        v|= v>>32;
-        v= (v>>1)+1;
-        return DEBRUIJN_IDX64[v*0x218A392CD3D5DBF>>58&0x3F];
-    }
+	{
+		static const unsigned char DEBRUIJN_IDX64[64]={
+			0, 1, 2, 7, 3,13, 8,19, 4,25,14,28, 9,34,20,40,
+			5,17,26,38,15,46,29,48,10,31,35,54,21,50,41,57,
+			63, 6,12,18,24,27,33,39,16,37,45,47,30,53,49,56,
+			62,11,23,32,36,44,52,55,61,22,43,51,60,42,59,58
+		};
+		v|= v>>1;
+		v|= v>>2;
+		v|= v>>4;
+		v|= v>>8;
+		v|= v>>16;
+		v|= v>>32;
+		v= (v>>1)+1;
+		return DEBRUIJN_IDX64[v*FLAC__U64L(0x218A392CD3D5DBF)>>58&0x3F];
+	}
 #endif
 }
-#endif
 
-unsigned FLAC__bitmath_silog2(int v);
-unsigned FLAC__bitmath_silog2_wide(FLAC__int64 v);
+unsigned FLAC__bitmath_silog2(FLAC__int64 v);
 
 #endif
diff --git a/libFLAC/include/private/bitreader.h b/libFLAC/include/private/bitreader.h
index 4dea8d0..7c73165 100644
--- a/libFLAC/include/private/bitreader.h
+++ b/libFLAC/include/private/bitreader.h
@@ -1,6 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
  * Copyright (C) 2000-2009  Josh Coalson
- * Copyright (C) 2011-2014  Xiph.Org Foundation
+ * Copyright (C) 2011-2016  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
diff --git a/libFLAC/include/private/bitwriter.h b/libFLAC/include/private/bitwriter.h
index 3b1362d..ef3ad1b 100644
--- a/libFLAC/include/private/bitwriter.h
+++ b/libFLAC/include/private/bitwriter.h
@@ -1,6 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
  * Copyright (C) 2000-2009  Josh Coalson
- * Copyright (C) 2011-2014  Xiph.Org Foundation
+ * Copyright (C) 2011-2016  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
diff --git a/libFLAC/include/private/cpu.h b/libFLAC/include/private/cpu.h
index 380f4f0..7c65180 100644
--- a/libFLAC/include/private/cpu.h
+++ b/libFLAC/include/private/cpu.h
@@ -1,6 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
  * Copyright (C) 2001-2009  Josh Coalson
- * Copyright (C) 2011-2014  Xiph.Org Foundation
+ * Copyright (C) 2011-2016  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -39,7 +39,24 @@
 #include <config.h>
 #endif
 
-#if defined FLAC__HAS_X86INTRIN
+#ifndef FLAC__CPU_X86_64
+
+#if defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) || defined(_M_AMD64)
+#define FLAC__CPU_X86_64
+#endif
+
+#endif
+
+#ifndef FLAC__CPU_IA32
+
+#if defined(__i386__) || defined(__i486__) || defined(__i586__) || defined(__i686__) ||defined( __i386) || defined(_M_IX86)
+#define FLAC__CPU_IA32
+#endif
+
+#endif
+
+
+#if FLAC__HAS_X86INTRIN
 /* SSE intrinsics support by ICC/MSVC/GCC */
 #if defined __INTEL_COMPILER
   #define FLAC__SSE_TARGET(x)
@@ -78,9 +95,11 @@
     #define FLAC__SSE2_SUPPORTED 1
     #define FLAC__SSSE3_SUPPORTED 1
     #define FLAC__SSE4_1_SUPPORTED 1
+#ifdef FLAC__USE_AVX
     #define FLAC__AVX_SUPPORTED 1
     #define FLAC__AVX2_SUPPORTED 1
     #define FLAC__FMA_SUPPORTED 1
+#endif
   #else /* for GCC older than 4.9 */
     #define FLAC__SSE_TARGET(x)
     #ifdef __SSE__
@@ -108,13 +127,17 @@
 #endif /* compiler version */
 #endif /* intrinsics support */
 
+
+#ifndef FLAC__AVX_SUPPORTED
+#define FLAC__AVX_SUPPORTED 0
+#endif
+
 typedef enum {
 	FLAC__CPUINFO_TYPE_IA32,
 	FLAC__CPUINFO_TYPE_X86_64,
 	FLAC__CPUINFO_TYPE_UNKNOWN
 } FLAC__CPUInfo_Type;
 
-#if defined FLAC__CPU_IA32
 typedef struct {
 	FLAC__bool intel;
 
@@ -131,7 +154,7 @@
 	FLAC__bool avx2;
 	FLAC__bool fma;
 } FLAC__CPUInfo_IA32;
-#elif defined FLAC__CPU_X86_64
+
 typedef struct {
 	FLAC__bool intel;
 
@@ -143,30 +166,21 @@
 	FLAC__bool avx2;
 	FLAC__bool fma;
 } FLAC__CPUInfo_x86;
-#endif
+
 
 typedef struct {
 	FLAC__bool use_asm;
 	FLAC__CPUInfo_Type type;
-#if defined FLAC__CPU_IA32
 	FLAC__CPUInfo_IA32 ia32;
-#elif defined FLAC__CPU_X86_64
 	FLAC__CPUInfo_x86 x86;
-#endif
 } FLAC__CPUInfo;
 
 void FLAC__cpu_info(FLAC__CPUInfo *info);
 
-#ifndef FLAC__NO_ASM
-# if defined FLAC__CPU_IA32 && defined FLAC__HAS_NASM
 FLAC__uint32 FLAC__cpu_have_cpuid_asm_ia32(void);
+
 void         FLAC__cpu_info_asm_ia32(FLAC__uint32 *flags_edx, FLAC__uint32 *flags_ecx);
-# endif
-# if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
-FLAC__uint32 FLAC__cpu_have_cpuid_x86(void);
+
 void         FLAC__cpu_info_x86(FLAC__uint32 level, FLAC__uint32 *eax, FLAC__uint32 *ebx, FLAC__uint32 *ecx, FLAC__uint32 *edx);
-FLAC__uint32 FLAC__cpu_xgetbv_x86(void);
-# endif
-#endif
 
 #endif
diff --git a/libFLAC/include/private/crc.h b/libFLAC/include/private/crc.h
index 29c512c..294f60e 100644
--- a/libFLAC/include/private/crc.h
+++ b/libFLAC/include/private/crc.h
@@ -1,6 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
  * Copyright (C) 2000-2009  Josh Coalson
- * Copyright (C) 2011-2014  Xiph.Org Foundation
+ * Copyright (C) 2011-2016  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
diff --git a/libFLAC/include/private/fixed.h b/libFLAC/include/private/fixed.h
index dcc4715..68cdfce 100644
--- a/libFLAC/include/private/fixed.h
+++ b/libFLAC/include/private/fixed.h
@@ -1,6 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
  * Copyright (C) 2000-2009  Josh Coalson
- * Copyright (C) 2011-2014  Xiph.Org Foundation
+ * Copyright (C) 2011-2016  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -54,21 +54,21 @@
  *	OUT residual_bits_per_sample[0,FLAC__MAX_FIXED_ORDER]
  */
 #ifndef FLAC__INTEGER_ONLY_LIBRARY
-unsigned FLAC__fixed_compute_best_predictor(const FLAC__int32 data[], unsigned data_len, FLAC__float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1]);
-unsigned FLAC__fixed_compute_best_predictor_wide(const FLAC__int32 data[], unsigned data_len, FLAC__float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1]);
+unsigned FLAC__fixed_compute_best_predictor(const FLAC__int32 data[], unsigned data_len, float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1]);
+unsigned FLAC__fixed_compute_best_predictor_wide(const FLAC__int32 data[], unsigned data_len, float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1]);
 # ifndef FLAC__NO_ASM
-#  if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
+#  if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN
 #   ifdef FLAC__SSE2_SUPPORTED
-unsigned FLAC__fixed_compute_best_predictor_intrin_sse2(const FLAC__int32 data[], unsigned data_len, FLAC__float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER + 1]);
-unsigned FLAC__fixed_compute_best_predictor_wide_intrin_sse2(const FLAC__int32 data[], unsigned data_len, FLAC__float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER + 1]);
+unsigned FLAC__fixed_compute_best_predictor_intrin_sse2(const FLAC__int32 data[], unsigned data_len, float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER + 1]);
+unsigned FLAC__fixed_compute_best_predictor_wide_intrin_sse2(const FLAC__int32 data[], unsigned data_len, float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER + 1]);
 #   endif
 #   ifdef FLAC__SSSE3_SUPPORTED
-unsigned FLAC__fixed_compute_best_predictor_intrin_ssse3(const FLAC__int32 data[], unsigned data_len, FLAC__float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1]);
-unsigned FLAC__fixed_compute_best_predictor_wide_intrin_ssse3(const FLAC__int32 data[], unsigned data_len, FLAC__float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER + 1]);
+unsigned FLAC__fixed_compute_best_predictor_intrin_ssse3(const FLAC__int32 data[], unsigned data_len, float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1]);
+unsigned FLAC__fixed_compute_best_predictor_wide_intrin_ssse3(const FLAC__int32 data[], unsigned data_len, float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER + 1]);
 #   endif
 #  endif
 #  if defined FLAC__CPU_IA32 && defined FLAC__HAS_NASM
-unsigned FLAC__fixed_compute_best_predictor_asm_ia32_mmx_cmov(const FLAC__int32 data[], unsigned data_len, FLAC__float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1]);
+unsigned FLAC__fixed_compute_best_predictor_asm_ia32_mmx_cmov(const FLAC__int32 data[], unsigned data_len, float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1]);
 #  endif
 # endif
 #else
diff --git a/libFLAC/include/private/float.h b/libFLAC/include/private/float.h
index ab26432..12ece60 100644
--- a/libFLAC/include/private/float.h
+++ b/libFLAC/include/private/float.h
@@ -1,6 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
  * Copyright (C) 2004-2009  Josh Coalson
- * Copyright (C) 2011-2014  Xiph.Org Foundation
+ * Copyright (C) 2011-2016  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -40,18 +40,15 @@
 #include "FLAC/ordinals.h"
 
 /*
- * These typedefs make it easier to ensure that integer versions of
- * the library really only contain integer operations.  All the code
- * in libFLAC should use FLAC__float and FLAC__double in place of
- * float and double, and be protected by checks of the macro
+ * All the code in libFLAC that uses float and double
+ * should be protected by checks of the macro
  * FLAC__INTEGER_ONLY_LIBRARY.
  *
- * FLAC__real is the basic floating point type used in LPC analysis.
  */
 #ifndef FLAC__INTEGER_ONLY_LIBRARY
-typedef double FLAC__double;
-typedef float FLAC__float;
 /*
+ * FLAC__real is the basic floating point type used in LPC analysis.
+ *
  * WATCHOUT: changing FLAC__real will change the signatures of many
  * functions that have assembly language equivalents and break them.
  */
diff --git a/libFLAC/include/private/format.h b/libFLAC/include/private/format.h
index 2fd3460..5b9cfbd 100644
--- a/libFLAC/include/private/format.h
+++ b/libFLAC/include/private/format.h
@@ -1,6 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
  * Copyright (C) 2000-2009  Josh Coalson
- * Copyright (C) 2011-2014  Xiph.Org Foundation
+ * Copyright (C) 2011-2016  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
diff --git a/libFLAC/include/private/lpc.h b/libFLAC/include/private/lpc.h
index c4ed085..6eb02be 100644
--- a/libFLAC/include/private/lpc.h
+++ b/libFLAC/include/private/lpc.h
@@ -1,6 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
  * Copyright (C) 2000-2009  Josh Coalson
- * Copyright (C) 2011-2014  Xiph.Org Foundation
+ * Copyright (C) 2011-2016  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -79,7 +79,7 @@
 void FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_16_old(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]);
 #    endif
 #  endif
-#  if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
+#  if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN
 #    ifdef FLAC__SSE_SUPPORTED
 void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4_old(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]);
 void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8_old(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]);
@@ -114,7 +114,7 @@
  *	         in lp_coeff[8][0,8], the LP coefficients for order 8 will be
  *			 in lp_coeff[7][0,7], etc.
  */
-void FLAC__lpc_compute_lp_coefficients(const FLAC__real autoc[], unsigned *max_order, FLAC__real lp_coeff[][FLAC__MAX_LPC_ORDER], FLAC__double error[]);
+void FLAC__lpc_compute_lp_coefficients(const FLAC__real autoc[], unsigned *max_order, FLAC__real lp_coeff[][FLAC__MAX_LPC_ORDER], double error[]);
 
 /*
  *	FLAC__lpc_quantize_coefficients()
@@ -161,7 +161,7 @@
 void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_asm_ia32(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]);
 #    endif
 #  endif
-#  if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
+#  if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN
 #    ifdef FLAC__SSE2_SUPPORTED
 void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]);
 void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]);
@@ -205,7 +205,7 @@
 void FLAC__lpc_restore_signal_wide_asm_ia32(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]);
 #    endif /* FLAC__HAS_NASM */
 #  endif /* FLAC__CPU_IA32 */
-#  if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
+#  if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN
 #    ifdef FLAC__SSE2_SUPPORTED
 void FLAC__lpc_restore_signal_16_intrin_sse2(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]);
 #    endif
@@ -227,8 +227,8 @@
  *	IN total_samples > 0  # of samples in residual signal
  *	RETURN                expected bits per sample
  */
-FLAC__double FLAC__lpc_compute_expected_bits_per_residual_sample(FLAC__double lpc_error, unsigned total_samples);
-FLAC__double FLAC__lpc_compute_expected_bits_per_residual_sample_with_error_scale(FLAC__double lpc_error, FLAC__double error_scale);
+double FLAC__lpc_compute_expected_bits_per_residual_sample(double lpc_error, unsigned total_samples);
+double FLAC__lpc_compute_expected_bits_per_residual_sample_with_error_scale(double lpc_error, double error_scale);
 
 /*
  *	FLAC__lpc_compute_best_order()
@@ -243,7 +243,7 @@
  *	                                    (includes warmup sample size and quantized LP coefficient)
  *	RETURN [1,max_order]                best order
  */
-unsigned FLAC__lpc_compute_best_order(const FLAC__double lpc_error[], unsigned max_order, unsigned total_samples, unsigned overhead_bits_per_order);
+unsigned FLAC__lpc_compute_best_order(const double lpc_error[], unsigned max_order, unsigned total_samples, unsigned overhead_bits_per_order);
 
 #endif /* !defined FLAC__INTEGER_ONLY_LIBRARY */
 
diff --git a/libFLAC/include/private/macros.h b/libFLAC/include/private/macros.h
index fd48a7f..becc59f 100644
--- a/libFLAC/include/private/macros.h
+++ b/libFLAC/include/private/macros.h
@@ -1,5 +1,5 @@
 /* libFLAC - Free Lossless Audio Codec library
- * Copyright (C) 2012-2014  Xiph.org Foundation
+ * Copyright (C) 2012-2016  Xiph.org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -32,19 +32,19 @@
 #ifndef FLAC__PRIVATE__MACROS_H
 #define FLAC__PRIVATE__MACROS_H
 
-#if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ > 4 || ( __GNUC__ == 4 && __GNUC_MINOR__ >= 3)))
+#if defined(__GNUC__) && (__GNUC__ > 4 || ( __GNUC__ == 4 && __GNUC_MINOR__ >= 3))
 
 #define flac_max(a,b) \
-    ({ __typeof__ (a) _a = (a); \
-    __typeof__ (b) _b = (b); \
-    _a > _b ? _a : _b; })
+	({ __typeof__ (a) _a = (a); \
+	__typeof__ (b) _b = (b); \
+	_a > _b ? _a : _b; })
 
 #define MIN_PASTE(A,B) A##B
 #define MIN_IMPL(A,B,L) ({ \
-    __typeof__(A) MIN_PASTE(__a,L) = (A); \
-    __typeof__(B) MIN_PASTE(__b,L) = (B); \
-    MIN_PASTE(__a,L) < MIN_PASTE(__b,L) ? MIN_PASTE(__a,L) : MIN_PASTE(__b,L); \
-    })
+	__typeof__(A) MIN_PASTE(__a,L) = (A); \
+	__typeof__(B) MIN_PASTE(__b,L) = (B); \
+	MIN_PASTE(__a,L) < MIN_PASTE(__b,L) ? MIN_PASTE(__a,L) : MIN_PASTE(__b,L); \
+	})
 
 #define flac_min(A,B) MIN_IMPL(A,B,__COUNTER__)
 
diff --git a/libFLAC/include/private/memory.h b/libFLAC/include/private/memory.h
index c9f2712..f103c53 100644
--- a/libFLAC/include/private/memory.h
+++ b/libFLAC/include/private/memory.h
@@ -1,6 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
  * Copyright (C) 2001-2009  Josh Coalson
- * Copyright (C) 2011-2014  Xiph.Org Foundation
+ * Copyright (C) 2011-2016  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
diff --git a/libFLAC/include/private/metadata.h b/libFLAC/include/private/metadata.h
index 092764c..161947f 100644
--- a/libFLAC/include/private/metadata.h
+++ b/libFLAC/include/private/metadata.h
@@ -1,6 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
  * Copyright (C) 2002-2009  Josh Coalson
- * Copyright (C) 2011-2014  Xiph.Org Foundation
+ * Copyright (C) 2011-2016  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
diff --git a/libFLAC/include/private/ogg_decoder_aspect.h b/libFLAC/include/private/ogg_decoder_aspect.h
index 439bf8e..218f44e 100644
--- a/libFLAC/include/private/ogg_decoder_aspect.h
+++ b/libFLAC/include/private/ogg_decoder_aspect.h
@@ -1,6 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec
  * Copyright (C) 2002-2009  Josh Coalson
- * Copyright (C) 2011-2014  Xiph.Org Foundation
+ * Copyright (C) 2011-2016  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
diff --git a/libFLAC/include/private/ogg_encoder_aspect.h b/libFLAC/include/private/ogg_encoder_aspect.h
index 709597d..f55ef32 100644
--- a/libFLAC/include/private/ogg_encoder_aspect.h
+++ b/libFLAC/include/private/ogg_encoder_aspect.h
@@ -1,6 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec
  * Copyright (C) 2002-2009  Josh Coalson
- * Copyright (C) 2011-2014  Xiph.Org Foundation
+ * Copyright (C) 2011-2016  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
diff --git a/libFLAC/include/private/ogg_helper.h b/libFLAC/include/private/ogg_helper.h
index 87b96ef..4c1000c 100644
--- a/libFLAC/include/private/ogg_helper.h
+++ b/libFLAC/include/private/ogg_helper.h
@@ -1,6 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec
  * Copyright (C) 2004-2009  Josh Coalson
- * Copyright (C) 2011-2014  Xiph.Org Foundation
+ * Copyright (C) 2011-2016  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
diff --git a/libFLAC/include/private/ogg_mapping.h b/libFLAC/include/private/ogg_mapping.h
index d35a46c..1fa022d 100644
--- a/libFLAC/include/private/ogg_mapping.h
+++ b/libFLAC/include/private/ogg_mapping.h
@@ -1,6 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec
  * Copyright (C) 2004-2009  Josh Coalson
- * Copyright (C) 2011-2014  Xiph.Org Foundation
+ * Copyright (C) 2011-2016  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
diff --git a/libFLAC/include/private/stream_encoder.h b/libFLAC/include/private/stream_encoder.h
index 96d3135..ab1721f 100644
--- a/libFLAC/include/private/stream_encoder.h
+++ b/libFLAC/include/private/stream_encoder.h
@@ -1,6 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
  * Copyright (C) 2000-2009  Josh Coalson
- * Copyright (C) 2011-2014  Xiph.Org Foundation
+ * Copyright (C) 2011-2016  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
diff --git a/libFLAC/include/private/stream_encoder_framing.h b/libFLAC/include/private/stream_encoder_framing.h
index 2b7387a..f633a9d 100644
--- a/libFLAC/include/private/stream_encoder_framing.h
+++ b/libFLAC/include/private/stream_encoder_framing.h
@@ -1,6 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
  * Copyright (C) 2000-2009  Josh Coalson
- * Copyright (C) 2011-2014  Xiph.Org Foundation
+ * Copyright (C) 2011-2016  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
diff --git a/libFLAC/include/private/window.h b/libFLAC/include/private/window.h
index 52c9262..bfed774 100644
--- a/libFLAC/include/private/window.h
+++ b/libFLAC/include/private/window.h
@@ -1,6 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
  * Copyright (C) 2006-2009  Josh Coalson
- * Copyright (C) 2011-2014  Xiph.Org Foundation
+ * Copyright (C) 2011-2016  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
diff --git a/libFLAC/include/protected/all.h b/libFLAC/include/protected/all.h
new file mode 100644
index 0000000..9468bd3
--- /dev/null
+++ b/libFLAC/include/protected/all.h
@@ -0,0 +1,39 @@
+/* libFLAC - Free Lossless Audio Codec library
+ * Copyright (C) 2001-2009  Josh Coalson
+ * Copyright (C) 2011-2016  Xiph.Org Foundation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * - Neither the name of the Xiph.org Foundation nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef FLAC__PROTECTED__ALL_H
+#define FLAC__PROTECTED__ALL_H
+
+#include "stream_decoder.h"
+#include "stream_encoder.h"
+
+#endif
diff --git a/libFLAC/include/protected/stream_decoder.h b/libFLAC/include/protected/stream_decoder.h
index 7be95af..5c31c16 100644
--- a/libFLAC/include/protected/stream_decoder.h
+++ b/libFLAC/include/protected/stream_decoder.h
@@ -1,6 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
  * Copyright (C) 2000-2009  Josh Coalson
- * Copyright (C) 2011-2014  Xiph.Org Foundation
+ * Copyright (C) 2011-2016  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
diff --git a/libFLAC/include/protected/stream_encoder.h b/libFLAC/include/protected/stream_encoder.h
index 6917f5d..8850c6b 100644
--- a/libFLAC/include/protected/stream_encoder.h
+++ b/libFLAC/include/protected/stream_encoder.h
@@ -1,6 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
  * Copyright (C) 2001-2009  Josh Coalson
- * Copyright (C) 2011-2014  Xiph.Org Foundation
+ * Copyright (C) 2011-2016  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
diff --git a/libFLAC/libFLAC.m4 b/libFLAC/libFLAC.m4
new file mode 100644
index 0000000..da7354e
--- /dev/null
+++ b/libFLAC/libFLAC.m4
@@ -0,0 +1,118 @@
+# Configure paths for libFLAC
+# "Inspired" by ogg.m4
+
+dnl AM_PATH_LIBFLAC([ACTION-IF-FOUND [, ACTION-IF-NOT-FOUND]])
+dnl Test for libFLAC, and define LIBFLAC_CFLAGS, LIBFLAC_LIBS, LIBFLAC_LIBDIR
+dnl
+AC_DEFUN([AM_PATH_LIBFLAC],
+[dnl 
+dnl Get the cflags and libraries
+dnl
+AC_ARG_WITH(libFLAC,[  --with-libFLAC=PFX   Prefix where libFLAC is installed (optional)], libFLAC_prefix="$withval", libFLAC_prefix="")
+AC_ARG_WITH(libFLAC-libraries,[  --with-libFLAC-libraries=DIR   Directory where libFLAC library is installed (optional)], libFLAC_libraries="$withval", libFLAC_libraries="")
+AC_ARG_WITH(libFLAC-includes,[  --with-libFLAC-includes=DIR   Directory where libFLAC header files are installed (optional)], libFLAC_includes="$withval", libFLAC_includes="")
+AC_ARG_ENABLE(libFLACtest, [  --disable-libFLACtest       Do not try to compile and run a test libFLAC program],, enable_libFLACtest=yes)
+
+  if test "x$libFLAC_libraries" != "x" ; then
+    LIBFLAC_LIBS="-L$libFLAC_libraries"
+  elif test "x$libFLAC_prefix" = "xno" || test "x$libFLAC_prefix" = "xyes" ; then
+    LIBFLAC_LIBS=""
+  elif test "x$libFLAC_prefix" != "x" ; then
+    LIBFLAC_LIBS="-L$libFLAC_prefix/lib"
+  elif test "x$prefix" != "xNONE"; then
+    LIBFLAC_LIBS="-L$prefix/lib"
+  fi
+
+  if test "x$libFLAC_prefix" != "xno" ; then
+    LIBFLAC_LIBS="$LIBFLAC_LIBS -lFLAC $OGG_LIBS -lm"
+  fi
+
+  if test "x$libFLAC_includes" != "x" ; then
+    LIBFLAC_CFLAGS="-I$libFLAC_includes"
+  elif test "x$libFLAC_prefix" != "x" ; then
+    LIBFLAC_CFLAGS="-I$libFLAC_prefix/include"
+  elif test "$prefix" != "xNONE"; then
+    LIBFLAC_CFLAGS=""
+  fi
+
+  AC_MSG_CHECKING(for libFLAC)
+  no_libFLAC=""
+
+
+  if test "x$enable_libFLACtest" = "xyes" ; then
+    ac_save_CFLAGS="$CFLAGS"
+    ac_save_CXXFLAGS="$CXXFLAGS"
+    ac_save_LIBS="$LIBS"
+    ac_save_LD_LIBRARY_PATH="$LD_LIBRARY_PATH"
+    CFLAGS="$CFLAGS $LIBFLAC_CFLAGS"
+    CXXFLAGS="$CXXFLAGS $LIBFLAC_CFLAGS"
+    LIBS="$LIBS $LIBFLAC_LIBS"
+    LD_LIBRARY_PATH="$LIBFLAC_LIBDIR:$LD_LIBRARY_PATH"
+dnl
+dnl Now check if the installed libFLAC is sufficiently new.
+dnl
+      rm -f conf.libFLACtest
+      AC_TRY_RUN([
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <FLAC/format.h>
+
+int main ()
+{
+  system("touch conf.libFLACtest");
+  return 0;
+}
+
+],, no_libFLAC=yes,[echo $ac_n "cross compiling; assumed OK... $ac_c"])
+       CFLAGS="$ac_save_CFLAGS"
+       CXXFLAGS="$ac_save_CXXFLAGS"
+       LIBS="$ac_save_LIBS"
+       LD_LIBRARY_PATH="$ac_save_LD_LIBRARY_PATH"
+  fi
+
+  if test "x$no_libFLAC" = "x" ; then
+     AC_MSG_RESULT(yes)
+     ifelse([$1], , :, [$1])     
+  else
+     AC_MSG_RESULT(no)
+     if test -f conf.libFLACtest ; then
+       :
+     else
+       echo "*** Could not run libFLAC test program, checking why..."
+       CFLAGS="$CFLAGS $LIBFLAC_CFLAGS"
+       CXXFLAGS="$CXXFLAGS $LIBFLAC_CFLAGS"
+       LIBS="$LIBS $LIBFLAC_LIBS"
+       LD_LIBRARY_PATH="$LIBFLAC_LIBDIR:$LD_LIBRARY_PATH"
+       AC_TRY_LINK([
+#include <stdio.h>
+#include <FLAC/format.h>
+],     [ return 0; ],
+       [ echo "*** The test program compiled, but did not run. This usually means"
+       echo "*** that the run-time linker is not finding libFLAC or finding the wrong"
+       echo "*** version of libFLAC. If it is not finding libFLAC, you'll need to set your"
+       echo "*** LD_LIBRARY_PATH environment variable, or edit /etc/ld.so.conf to point"
+       echo "*** to the installed location  Also, make sure you have run ldconfig if that"
+       echo "*** is required on your system"
+       echo "***"
+       echo "*** If you have an old version installed, it is best to remove it, although"
+       echo "*** you may also be able to get things to work by modifying LD_LIBRARY_PATH"],
+       [ echo "*** The test program failed to compile or link. See the file config.log for the"
+       echo "*** exact error that occured. This usually means libFLAC was incorrectly installed"
+       echo "*** or that you have moved libFLAC since it was installed. In the latter case, you"
+       echo "*** may want to edit the libFLAC-config script: $LIBFLAC_CONFIG" ])
+       CFLAGS="$ac_save_CFLAGS"
+       CXXFLAGS="$ac_save_CXXFLAGS"
+       LIBS="$ac_save_LIBS"
+       LD_LIBRARY_PATH="$ac_save_LD_LIBRARY_PATH"
+     fi
+     LIBFLAC_CFLAGS=""
+     LIBFLAC_LIBDIR=""
+     LIBFLAC_LIBS=""
+     ifelse([$2], , :, [$2])
+  fi
+  AC_SUBST(LIBFLAC_CFLAGS)
+  AC_SUBST(LIBFLAC_LIBDIR)
+  AC_SUBST(LIBFLAC_LIBS)
+  rm -f conf.libFLACtest
+])
diff --git a/libFLAC/lpc.c b/libFLAC/lpc.c
index 5c6e175..531247b 100644
--- a/libFLAC/lpc.c
+++ b/libFLAC/lpc.c
@@ -1,6 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
  * Copyright (C) 2000-2009  Josh Coalson
- * Copyright (C) 2011-2014  Xiph.Org Foundation
+ * Copyright (C) 2011-2016  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -51,15 +51,14 @@
 
 #ifndef FLAC__INTEGER_ONLY_LIBRARY
 
-#if !defined(HAVE_LROUND)
-#if defined(_MSC_VER)
+#if defined(_MSC_VER) && (_MSC_VER < 1800)
 #include <float.h>
-#define copysign _copysign
-#elif defined(__GNUC__)
-#define copysign __builtin_copysign
-#endif
 static inline long int lround(double x) {
-    return (long)(x + copysign (0.5, x));
+	return (long)(x + _copysign(0.5, x));
+}
+#elif !defined(HAVE_LROUND) && defined(__GNUC__)
+static inline long int lround(double x) {
+	return (long)(x + __builtin_copysign(0.5, x));
 }
 /* If this fails, we are in the presence of a mid 90's compiler, move along... */
 #endif
@@ -120,10 +119,10 @@
 	}
 }
 
-void FLAC__lpc_compute_lp_coefficients(const FLAC__real autoc[], unsigned *max_order, FLAC__real lp_coeff[][FLAC__MAX_LPC_ORDER], FLAC__double error[])
+void FLAC__lpc_compute_lp_coefficients(const FLAC__real autoc[], unsigned *max_order, FLAC__real lp_coeff[][FLAC__MAX_LPC_ORDER], double error[])
 {
 	unsigned i, j;
-	FLAC__double r, err, lpc[FLAC__MAX_LPC_ORDER];
+	double r, err, lpc[FLAC__MAX_LPC_ORDER];
 
 	FLAC__ASSERT(0 != max_order);
 	FLAC__ASSERT(0 < *max_order);
@@ -142,7 +141,7 @@
 		/* Update LPC coefficients and total error. */
 		lpc[i]=r;
 		for(j = 0; j < (i>>1); j++) {
-			FLAC__double tmp = lpc[j];
+			double tmp = lpc[j];
 			lpc[j] += r * lpc[i-1-j];
 			lpc[i-1-j] += r * tmp;
 		}
@@ -167,7 +166,7 @@
 int FLAC__lpc_quantize_coefficients(const FLAC__real lp_coeff[], unsigned order, unsigned precision, FLAC__int32 qlp_coeff[], int *shift)
 {
 	unsigned i;
-	FLAC__double cmax;
+	double cmax;
 	FLAC__int32 qmax, qmin;
 
 	FLAC__ASSERT(precision > 0);
@@ -182,7 +181,7 @@
 	/* calc cmax = max( |lp_coeff[i]| ) */
 	cmax = 0.0;
 	for(i = 0; i < order; i++) {
-		const FLAC__double d = fabs(lp_coeff[i]);
+		const double d = fabs(lp_coeff[i]);
 		if(d > cmax)
 			cmax = d;
 	}
@@ -207,7 +206,7 @@
 	}
 
 	if(*shift >= 0) {
-		FLAC__double error = 0.0;
+		double error = 0.0;
 		FLAC__int32 q;
 		for(i = 0; i < order; i++) {
 			error += lp_coeff[i] * (1 << *shift);
@@ -228,12 +227,12 @@
 		}
 	}
 	/* negative shift is very rare but due to design flaw, negative shift is
-	 * a NOP in the decoder, so it must be handled specially by scaling down
-	 * coeffs
+	 * not allowed in the decoder, so it must be handled specially by scaling
+	 * down coeffs
 	 */
 	else {
 		const int nshift = -(*shift);
-		FLAC__double error = 0.0;
+		double error = 0.0;
 		FLAC__int32 q;
 #ifdef DEBUG
 		fprintf(stderr,"FLAC__lpc_quantize_coefficients: negative shift=%d order=%u cmax=%f\n", *shift, order, cmax);
@@ -546,11 +545,11 @@
 		history = data;
 		for(j = 0; j < order; j++)
 			sum += (FLAC__int64)qlp_coeff[j] * (FLAC__int64)(*(--history));
-		if(FLAC__bitmath_silog2_wide(sum >> lp_quantization) > 32) {
+		if(FLAC__bitmath_silog2(sum >> lp_quantization) > 32) {
 			fprintf(stderr,"FLAC__lpc_compute_residual_from_qlp_coefficients_wide: OVERFLOW, i=%u, sum=%" PRId64 "\n", i, (sum >> lp_quantization));
 			break;
 		}
-		if(FLAC__bitmath_silog2_wide((FLAC__int64)(*data) - (sum >> lp_quantization)) > 32) {
+		if(FLAC__bitmath_silog2((FLAC__int64)(*data) - (sum >> lp_quantization)) > 32) {
 			fprintf(stderr,"FLAC__lpc_compute_residual_from_qlp_coefficients_wide: OVERFLOW, i=%u, data=%d, sum=%" PRId64 ", residual=%" PRId64 "\n", i, *data, (int64_t)(sum >> lp_quantization), ((FLAC__int64)(*data) - (sum >> lp_quantization)));
 			break;
 		}
@@ -1063,11 +1062,11 @@
 		history = data;
 		for(j = 0; j < order; j++)
 			sum += (FLAC__int64)qlp_coeff[j] * (FLAC__int64)(*(--history));
-		if(FLAC__bitmath_silog2_wide(sum >> lp_quantization) > 32) {
+		if(FLAC__bitmath_silog2(sum >> lp_quantization) > 32) {
 			fprintf(stderr,"FLAC__lpc_restore_signal_wide: OVERFLOW, i=%u, sum=%" PRId64 "\n", i, (sum >> lp_quantization));
 			break;
 		}
-		if(FLAC__bitmath_silog2_wide((FLAC__int64)(*r) + (sum >> lp_quantization)) > 32) {
+		if(FLAC__bitmath_silog2((FLAC__int64)(*r) + (sum >> lp_quantization)) > 32) {
 			fprintf(stderr,"FLAC__lpc_restore_signal_wide: OVERFLOW, i=%u, residual=%d, sum=%" PRId64 ", data=%" PRId64 "\n", i, *r, (sum >> lp_quantization), ((FLAC__int64)(*r) + (sum >> lp_quantization)));
 			break;
 		}
@@ -1303,21 +1302,21 @@
 
 #ifndef FLAC__INTEGER_ONLY_LIBRARY
 
-FLAC__double FLAC__lpc_compute_expected_bits_per_residual_sample(FLAC__double lpc_error, unsigned total_samples)
+double FLAC__lpc_compute_expected_bits_per_residual_sample(double lpc_error, unsigned total_samples)
 {
-	FLAC__double error_scale;
+	double error_scale;
 
 	FLAC__ASSERT(total_samples > 0);
 
-	error_scale = 0.5 / (FLAC__double)total_samples;
+	error_scale = 0.5 / (double)total_samples;
 
 	return FLAC__lpc_compute_expected_bits_per_residual_sample_with_error_scale(lpc_error, error_scale);
 }
 
-FLAC__double FLAC__lpc_compute_expected_bits_per_residual_sample_with_error_scale(FLAC__double lpc_error, FLAC__double error_scale)
+double FLAC__lpc_compute_expected_bits_per_residual_sample_with_error_scale(double lpc_error, double error_scale)
 {
 	if(lpc_error > 0.0) {
-		FLAC__double bps = (FLAC__double)0.5 * log(error_scale * lpc_error) / M_LN2;
+		double bps = (double)0.5 * log(error_scale * lpc_error) / M_LN2;
 		if(bps >= 0.0)
 			return bps;
 		else
@@ -1331,21 +1330,21 @@
 	}
 }
 
-unsigned FLAC__lpc_compute_best_order(const FLAC__double lpc_error[], unsigned max_order, unsigned total_samples, unsigned overhead_bits_per_order)
+unsigned FLAC__lpc_compute_best_order(const double lpc_error[], unsigned max_order, unsigned total_samples, unsigned overhead_bits_per_order)
 {
 	unsigned order, indx, best_index; /* 'index' the index into lpc_error; index==order-1 since lpc_error[0] is for order==1, lpc_error[1] is for order==2, etc */
-	FLAC__double bits, best_bits, error_scale;
+	double bits, best_bits, error_scale;
 
 	FLAC__ASSERT(max_order > 0);
 	FLAC__ASSERT(total_samples > 0);
 
-	error_scale = 0.5 / (FLAC__double)total_samples;
+	error_scale = 0.5 / (double)total_samples;
 
 	best_index = 0;
 	best_bits = (unsigned)(-1);
 
 	for(indx = 0, order = 1; indx < max_order; indx++, order++) {
-		bits = FLAC__lpc_compute_expected_bits_per_residual_sample_with_error_scale(lpc_error[indx], error_scale) * (FLAC__double)(total_samples - order) + (FLAC__double)(order * overhead_bits_per_order);
+		bits = FLAC__lpc_compute_expected_bits_per_residual_sample_with_error_scale(lpc_error[indx], error_scale) * (double)(total_samples - order) + (double)(order * overhead_bits_per_order);
 		if(bits < best_bits) {
 			best_index = indx;
 			best_bits = bits;
diff --git a/libFLAC/lpc_intrin_avx2.c b/libFLAC/lpc_intrin_avx2.c
new file mode 100644
index 0000000..f9f5ccd
--- /dev/null
+++ b/libFLAC/lpc_intrin_avx2.c
@@ -0,0 +1,1122 @@
+/* libFLAC - Free Lossless Audio Codec library
+ * Copyright (C) 2000-2009  Josh Coalson
+ * Copyright (C) 2011-2016  Xiph.Org Foundation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * - Neither the name of the Xiph.org Foundation nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#  include <config.h>
+#endif
+
+#include "private/cpu.h"
+
+#ifndef FLAC__INTEGER_ONLY_LIBRARY
+#ifndef FLAC__NO_ASM
+#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN
+#include "private/lpc.h"
+#ifdef FLAC__AVX2_SUPPORTED
+
+#include "FLAC/assert.h"
+#include "FLAC/format.h"
+
+#include <immintrin.h> /* AVX2 */
+
+FLAC__SSE_TARGET("avx2")
+void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_avx2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
+{
+	int i;
+	FLAC__int32 sum;
+	__m128i cnt = _mm_cvtsi32_si128(lp_quantization);
+
+	FLAC__ASSERT(order > 0);
+	FLAC__ASSERT(order <= 32);
+
+	if(order <= 12) {
+		if(order > 8) {
+			if(order > 10) {
+				if(order == 12) {
+					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
+					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
+					q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
+					q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
+					q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
+					q4  = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
+					q5  = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
+					q6  = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]);
+					q7  = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]);
+					q8  = _mm256_set1_epi32(0xffff & qlp_coeff[8 ]);
+					q9  = _mm256_set1_epi32(0xffff & qlp_coeff[9 ]);
+					q10 = _mm256_set1_epi32(0xffff & qlp_coeff[10]);
+					q11 = _mm256_set1_epi32(0xffff & qlp_coeff[11]);
+
+					for(i = 0; i < (int)data_len-7; i+=8) {
+						__m256i summ, mull;
+						summ = _mm256_madd_epi16(q11, _mm256_loadu_si256((const __m256i*)(data+i-12)));
+						mull = _mm256_madd_epi16(q10, _mm256_loadu_si256((const __m256i*)(data+i-11))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q9,  _mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q8,  _mm256_loadu_si256((const __m256i*)(data+i-9 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
+						summ = _mm256_sra_epi32(summ, cnt);
+						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
+					}
+				}
+				else { /* order == 11 */
+					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10;
+					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
+					q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
+					q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
+					q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
+					q4  = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
+					q5  = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
+					q6  = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]);
+					q7  = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]);
+					q8  = _mm256_set1_epi32(0xffff & qlp_coeff[8 ]);
+					q9  = _mm256_set1_epi32(0xffff & qlp_coeff[9 ]);
+					q10 = _mm256_set1_epi32(0xffff & qlp_coeff[10]);
+
+					for(i = 0; i < (int)data_len-7; i+=8) {
+						__m256i summ, mull;
+						summ = _mm256_madd_epi16(q10, _mm256_loadu_si256((const __m256i*)(data+i-11)));
+						mull = _mm256_madd_epi16(q9,  _mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q8,  _mm256_loadu_si256((const __m256i*)(data+i-9 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
+						summ = _mm256_sra_epi32(summ, cnt);
+						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
+					}
+				}
+			}
+			else {
+				if(order == 10) {
+					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9;
+					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
+					q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
+					q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
+					q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
+					q4  = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
+					q5  = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
+					q6  = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]);
+					q7  = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]);
+					q8  = _mm256_set1_epi32(0xffff & qlp_coeff[8 ]);
+					q9  = _mm256_set1_epi32(0xffff & qlp_coeff[9 ]);
+
+					for(i = 0; i < (int)data_len-7; i+=8) {
+						__m256i summ, mull;
+						summ = _mm256_madd_epi16(q9,  _mm256_loadu_si256((const __m256i*)(data+i-10)));
+						mull = _mm256_madd_epi16(q8,  _mm256_loadu_si256((const __m256i*)(data+i-9 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
+						summ = _mm256_sra_epi32(summ, cnt);
+						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
+					}
+				}
+				else { /* order == 9 */
+					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8;
+					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
+					q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
+					q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
+					q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
+					q4  = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
+					q5  = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
+					q6  = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]);
+					q7  = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]);
+					q8  = _mm256_set1_epi32(0xffff & qlp_coeff[8 ]);
+
+					for(i = 0; i < (int)data_len-7; i+=8) {
+						__m256i summ, mull;
+						summ = _mm256_madd_epi16(q8,  _mm256_loadu_si256((const __m256i*)(data+i-9 )));
+						mull = _mm256_madd_epi16(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
+						summ = _mm256_sra_epi32(summ, cnt);
+						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
+					}
+				}
+			}
+		}
+		else if(order > 4) {
+			if(order > 6) {
+				if(order == 8) {
+					__m256i q0, q1, q2, q3, q4, q5, q6, q7;
+					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
+					q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
+					q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
+					q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
+					q4  = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
+					q5  = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
+					q6  = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]);
+					q7  = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]);
+
+					for(i = 0; i < (int)data_len-7; i+=8) {
+						__m256i summ, mull;
+						summ = _mm256_madd_epi16(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8 )));
+						mull = _mm256_madd_epi16(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
+						summ = _mm256_sra_epi32(summ, cnt);
+						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
+					}
+				}
+				else { /* order == 7 */
+					__m256i q0, q1, q2, q3, q4, q5, q6;
+					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
+					q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
+					q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
+					q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
+					q4  = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
+					q5  = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
+					q6  = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]);
+
+					for(i = 0; i < (int)data_len-7; i+=8) {
+						__m256i summ, mull;
+						summ = _mm256_madd_epi16(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7 )));
+						mull = _mm256_madd_epi16(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
+						summ = _mm256_sra_epi32(summ, cnt);
+						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
+					}
+				}
+			}
+			else {
+				if(order == 6) {
+					__m256i q0, q1, q2, q3, q4, q5;
+					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
+					q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
+					q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
+					q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
+					q4  = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
+					q5  = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
+
+					for(i = 0; i < (int)data_len-7; i+=8) {
+						__m256i summ, mull;
+						summ = _mm256_madd_epi16(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6 )));
+						mull = _mm256_madd_epi16(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
+						summ = _mm256_sra_epi32(summ, cnt);
+						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
+					}
+				}
+				else { /* order == 5 */
+					__m256i q0, q1, q2, q3, q4;
+					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
+					q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
+					q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
+					q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
+					q4  = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
+
+					for(i = 0; i < (int)data_len-7; i+=8) {
+						__m256i summ, mull;
+						summ = _mm256_madd_epi16(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5 )));
+						mull = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
+						summ = _mm256_sra_epi32(summ, cnt);
+						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
+					}
+				}
+			}
+		}
+		else {
+			if(order > 2) {
+				if(order == 4) {
+					__m256i q0, q1, q2, q3;
+					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
+					q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
+					q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
+					q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
+
+					for(i = 0; i < (int)data_len-7; i+=8) {
+						__m256i summ, mull;
+						summ = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 )));
+						mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
+						summ = _mm256_sra_epi32(summ, cnt);
+						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
+					}
+				}
+				else { /* order == 3 */
+					__m256i q0, q1, q2;
+					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
+					q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
+					q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
+
+					for(i = 0; i < (int)data_len-7; i+=8) {
+						__m256i summ, mull;
+						summ = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 )));
+						mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
+						summ = _mm256_sra_epi32(summ, cnt);
+						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
+					}
+				}
+			}
+			else {
+				if(order == 2) {
+					__m256i q0, q1;
+					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
+					q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
+
+					for(i = 0; i < (int)data_len-7; i+=8) {
+						__m256i summ, mull;
+						summ = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 )));
+						mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
+						summ = _mm256_sra_epi32(summ, cnt);
+						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
+					}
+				}
+				else { /* order == 1 */
+					__m256i q0;
+					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
+
+					for(i = 0; i < (int)data_len-7; i+=8) {
+						__m256i summ;
+						summ = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 )));
+						summ = _mm256_sra_epi32(summ, cnt);
+						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
+					}
+				}
+			}
+		}
+		for(; i < (int)data_len; i++) {
+			sum = 0;
+			switch(order) {
+				case 12: sum += qlp_coeff[11] * data[i-12];
+				case 11: sum += qlp_coeff[10] * data[i-11];
+				case 10: sum += qlp_coeff[ 9] * data[i-10];
+				case 9:  sum += qlp_coeff[ 8] * data[i- 9];
+				case 8:  sum += qlp_coeff[ 7] * data[i- 8];
+				case 7:  sum += qlp_coeff[ 6] * data[i- 7];
+				case 6:  sum += qlp_coeff[ 5] * data[i- 6];
+				case 5:  sum += qlp_coeff[ 4] * data[i- 5];
+				case 4:  sum += qlp_coeff[ 3] * data[i- 4];
+				case 3:  sum += qlp_coeff[ 2] * data[i- 3];
+				case 2:  sum += qlp_coeff[ 1] * data[i- 2];
+				case 1:  sum += qlp_coeff[ 0] * data[i- 1];
+			}
+			residual[i] = data[i] - (sum >> lp_quantization);
+		}
+	}
+	else { /* order > 12 */
+		for(i = 0; i < (int)data_len; i++) {
+			sum = 0;
+			switch(order) {
+				case 32: sum += qlp_coeff[31] * data[i-32];
+				case 31: sum += qlp_coeff[30] * data[i-31];
+				case 30: sum += qlp_coeff[29] * data[i-30];
+				case 29: sum += qlp_coeff[28] * data[i-29];
+				case 28: sum += qlp_coeff[27] * data[i-28];
+				case 27: sum += qlp_coeff[26] * data[i-27];
+				case 26: sum += qlp_coeff[25] * data[i-26];
+				case 25: sum += qlp_coeff[24] * data[i-25];
+				case 24: sum += qlp_coeff[23] * data[i-24];
+				case 23: sum += qlp_coeff[22] * data[i-23];
+				case 22: sum += qlp_coeff[21] * data[i-22];
+				case 21: sum += qlp_coeff[20] * data[i-21];
+				case 20: sum += qlp_coeff[19] * data[i-20];
+				case 19: sum += qlp_coeff[18] * data[i-19];
+				case 18: sum += qlp_coeff[17] * data[i-18];
+				case 17: sum += qlp_coeff[16] * data[i-17];
+				case 16: sum += qlp_coeff[15] * data[i-16];
+				case 15: sum += qlp_coeff[14] * data[i-15];
+				case 14: sum += qlp_coeff[13] * data[i-14];
+				case 13: sum += qlp_coeff[12] * data[i-13];
+				         sum += qlp_coeff[11] * data[i-12];
+				         sum += qlp_coeff[10] * data[i-11];
+				         sum += qlp_coeff[ 9] * data[i-10];
+				         sum += qlp_coeff[ 8] * data[i- 9];
+				         sum += qlp_coeff[ 7] * data[i- 8];
+				         sum += qlp_coeff[ 6] * data[i- 7];
+				         sum += qlp_coeff[ 5] * data[i- 6];
+				         sum += qlp_coeff[ 4] * data[i- 5];
+				         sum += qlp_coeff[ 3] * data[i- 4];
+				         sum += qlp_coeff[ 2] * data[i- 3];
+				         sum += qlp_coeff[ 1] * data[i- 2];
+				         sum += qlp_coeff[ 0] * data[i- 1];
+			}
+			residual[i] = data[i] - (sum >> lp_quantization);
+		}
+	}
+	_mm256_zeroupper();
+}
+
+FLAC__SSE_TARGET("avx2")
+void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_avx2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
+{
+	int i;
+	FLAC__int32 sum;
+	__m128i cnt = _mm_cvtsi32_si128(lp_quantization);
+
+	FLAC__ASSERT(order > 0);
+	FLAC__ASSERT(order <= 32);
+
+	if(order <= 12) {
+		if(order > 8) {
+			if(order > 10) {
+				if(order == 12) {
+					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
+					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
+					q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
+					q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
+					q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
+					q4  = _mm256_set1_epi32(qlp_coeff[4 ]);
+					q5  = _mm256_set1_epi32(qlp_coeff[5 ]);
+					q6  = _mm256_set1_epi32(qlp_coeff[6 ]);
+					q7  = _mm256_set1_epi32(qlp_coeff[7 ]);
+					q8  = _mm256_set1_epi32(qlp_coeff[8 ]);
+					q9  = _mm256_set1_epi32(qlp_coeff[9 ]);
+					q10 = _mm256_set1_epi32(qlp_coeff[10]);
+					q11 = _mm256_set1_epi32(qlp_coeff[11]);
+
+					for(i = 0; i < (int)data_len-7; i+=8) {
+						__m256i summ, mull;
+						summ = _mm256_mullo_epi32(q11, _mm256_loadu_si256((const __m256i*)(data+i-12)));
+						mull = _mm256_mullo_epi32(q10, _mm256_loadu_si256((const __m256i*)(data+i-11))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q9,  _mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q8,  _mm256_loadu_si256((const __m256i*)(data+i-9)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
+						summ = _mm256_sra_epi32(summ, cnt);
+						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
+					}
+				}
+				else { /* order == 11 */
+					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10;
+					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
+					q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
+					q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
+					q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
+					q4  = _mm256_set1_epi32(qlp_coeff[4 ]);
+					q5  = _mm256_set1_epi32(qlp_coeff[5 ]);
+					q6  = _mm256_set1_epi32(qlp_coeff[6 ]);
+					q7  = _mm256_set1_epi32(qlp_coeff[7 ]);
+					q8  = _mm256_set1_epi32(qlp_coeff[8 ]);
+					q9  = _mm256_set1_epi32(qlp_coeff[9 ]);
+					q10 = _mm256_set1_epi32(qlp_coeff[10]);
+
+					for(i = 0; i < (int)data_len-7; i+=8) {
+						__m256i summ, mull;
+						summ = _mm256_mullo_epi32(q10, _mm256_loadu_si256((const __m256i*)(data+i-11)));
+						mull = _mm256_mullo_epi32(q9,  _mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q8,  _mm256_loadu_si256((const __m256i*)(data+i-9)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
+						summ = _mm256_sra_epi32(summ, cnt);
+						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
+					}
+				}
+			}
+			else {
+				if(order == 10) {
+					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9;
+					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
+					q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
+					q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
+					q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
+					q4  = _mm256_set1_epi32(qlp_coeff[4 ]);
+					q5  = _mm256_set1_epi32(qlp_coeff[5 ]);
+					q6  = _mm256_set1_epi32(qlp_coeff[6 ]);
+					q7  = _mm256_set1_epi32(qlp_coeff[7 ]);
+					q8  = _mm256_set1_epi32(qlp_coeff[8 ]);
+					q9  = _mm256_set1_epi32(qlp_coeff[9 ]);
+
+					for(i = 0; i < (int)data_len-7; i+=8) {
+						__m256i summ, mull;
+						summ = _mm256_mullo_epi32(q9,  _mm256_loadu_si256((const __m256i*)(data+i-10)));
+						mull = _mm256_mullo_epi32(q8,  _mm256_loadu_si256((const __m256i*)(data+i-9)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
+						summ = _mm256_sra_epi32(summ, cnt);
+						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
+					}
+				}
+				else { /* order == 9 */
+					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8;
+					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
+					q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
+					q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
+					q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
+					q4  = _mm256_set1_epi32(qlp_coeff[4 ]);
+					q5  = _mm256_set1_epi32(qlp_coeff[5 ]);
+					q6  = _mm256_set1_epi32(qlp_coeff[6 ]);
+					q7  = _mm256_set1_epi32(qlp_coeff[7 ]);
+					q8  = _mm256_set1_epi32(qlp_coeff[8 ]);
+
+					for(i = 0; i < (int)data_len-7; i+=8) {
+						__m256i summ, mull;
+						summ = _mm256_mullo_epi32(q8,  _mm256_loadu_si256((const __m256i*)(data+i-9)));
+						mull = _mm256_mullo_epi32(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
+						summ = _mm256_sra_epi32(summ, cnt);
+						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
+					}
+				}
+			}
+		}
+		else if(order > 4) {
+			if(order > 6) {
+				if(order == 8) {
+					__m256i q0, q1, q2, q3, q4, q5, q6, q7;
+					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
+					q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
+					q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
+					q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
+					q4  = _mm256_set1_epi32(qlp_coeff[4 ]);
+					q5  = _mm256_set1_epi32(qlp_coeff[5 ]);
+					q6  = _mm256_set1_epi32(qlp_coeff[6 ]);
+					q7  = _mm256_set1_epi32(qlp_coeff[7 ]);
+
+					for(i = 0; i < (int)data_len-7; i+=8) {
+						__m256i summ, mull;
+						summ = _mm256_mullo_epi32(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8)));
+						mull = _mm256_mullo_epi32(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
+						summ = _mm256_sra_epi32(summ, cnt);
+						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
+					}
+				}
+				else { /* order == 7 */
+					__m256i q0, q1, q2, q3, q4, q5, q6;
+					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
+					q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
+					q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
+					q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
+					q4  = _mm256_set1_epi32(qlp_coeff[4 ]);
+					q5  = _mm256_set1_epi32(qlp_coeff[5 ]);
+					q6  = _mm256_set1_epi32(qlp_coeff[6 ]);
+
+					for(i = 0; i < (int)data_len-7; i+=8) {
+						__m256i summ, mull;
+						summ = _mm256_mullo_epi32(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7)));
+						mull = _mm256_mullo_epi32(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
+						summ = _mm256_sra_epi32(summ, cnt);
+						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
+					}
+				}
+			}
+			else {
+				if(order == 6) {
+					__m256i q0, q1, q2, q3, q4, q5;
+					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
+					q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
+					q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
+					q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
+					q4  = _mm256_set1_epi32(qlp_coeff[4 ]);
+					q5  = _mm256_set1_epi32(qlp_coeff[5 ]);
+
+					for(i = 0; i < (int)data_len-7; i+=8) {
+						__m256i summ, mull;
+						summ = _mm256_mullo_epi32(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6)));
+						mull = _mm256_mullo_epi32(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
+						summ = _mm256_sra_epi32(summ, cnt);
+						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
+					}
+				}
+				else { /* order == 5 */
+					__m256i q0, q1, q2, q3, q4;
+					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
+					q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
+					q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
+					q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
+					q4  = _mm256_set1_epi32(qlp_coeff[4 ]);
+
+					for(i = 0; i < (int)data_len-7; i+=8) {
+						__m256i summ, mull;
+						summ = _mm256_mullo_epi32(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5)));
+						mull = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
+						summ = _mm256_sra_epi32(summ, cnt);
+						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
+					}
+				}
+			}
+		}
+		else {
+			if(order > 2) {
+				if(order == 4) {
+					__m256i q0, q1, q2, q3;
+					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
+					q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
+					q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
+					q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
+
+					for(i = 0; i < (int)data_len-7; i+=8) {
+						__m256i summ, mull;
+						summ = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));
+						mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
+						summ = _mm256_sra_epi32(summ, cnt);
+						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
+					}
+				}
+				else { /* order == 3 */
+					__m256i q0, q1, q2;
+					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
+					q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
+					q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
+
+					for(i = 0; i < (int)data_len-7; i+=8) {
+						__m256i summ, mull;
+						summ = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));
+						mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
+						summ = _mm256_sra_epi32(summ, cnt);
+						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
+					}
+				}
+			}
+			else {
+				if(order == 2) {
+					__m256i q0, q1;
+					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
+					q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
+
+					for(i = 0; i < (int)data_len-7; i+=8) {
+						__m256i summ, mull;
+						summ = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));
+						mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
+						summ = _mm256_sra_epi32(summ, cnt);
+						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
+					}
+				}
+				else { /* order == 1 */
+					__m256i q0;
+					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
+
+					for(i = 0; i < (int)data_len-7; i+=8) {
+						__m256i summ;
+						summ = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));
+						summ = _mm256_sra_epi32(summ, cnt);
+						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
+					}
+				}
+			}
+		}
+		for(; i < (int)data_len; i++) {
+			sum = 0;
+			switch(order) {
+				case 12: sum += qlp_coeff[11] * data[i-12];
+				case 11: sum += qlp_coeff[10] * data[i-11];
+				case 10: sum += qlp_coeff[ 9] * data[i-10];
+				case 9:  sum += qlp_coeff[ 8] * data[i- 9];
+				case 8:  sum += qlp_coeff[ 7] * data[i- 8];
+				case 7:  sum += qlp_coeff[ 6] * data[i- 7];
+				case 6:  sum += qlp_coeff[ 5] * data[i- 6];
+				case 5:  sum += qlp_coeff[ 4] * data[i- 5];
+				case 4:  sum += qlp_coeff[ 3] * data[i- 4];
+				case 3:  sum += qlp_coeff[ 2] * data[i- 3];
+				case 2:  sum += qlp_coeff[ 1] * data[i- 2];
+				case 1:  sum += qlp_coeff[ 0] * data[i- 1];
+			}
+			residual[i] = data[i] - (sum >> lp_quantization);
+		}
+	}
+	else { /* order > 12 */
+		for(i = 0; i < (int)data_len; i++) {
+			sum = 0;
+			switch(order) {
+				case 32: sum += qlp_coeff[31] * data[i-32];
+				case 31: sum += qlp_coeff[30] * data[i-31];
+				case 30: sum += qlp_coeff[29] * data[i-30];
+				case 29: sum += qlp_coeff[28] * data[i-29];
+				case 28: sum += qlp_coeff[27] * data[i-28];
+				case 27: sum += qlp_coeff[26] * data[i-27];
+				case 26: sum += qlp_coeff[25] * data[i-26];
+				case 25: sum += qlp_coeff[24] * data[i-25];
+				case 24: sum += qlp_coeff[23] * data[i-24];
+				case 23: sum += qlp_coeff[22] * data[i-23];
+				case 22: sum += qlp_coeff[21] * data[i-22];
+				case 21: sum += qlp_coeff[20] * data[i-21];
+				case 20: sum += qlp_coeff[19] * data[i-20];
+				case 19: sum += qlp_coeff[18] * data[i-19];
+				case 18: sum += qlp_coeff[17] * data[i-18];
+				case 17: sum += qlp_coeff[16] * data[i-17];
+				case 16: sum += qlp_coeff[15] * data[i-16];
+				case 15: sum += qlp_coeff[14] * data[i-15];
+				case 14: sum += qlp_coeff[13] * data[i-14];
+				case 13: sum += qlp_coeff[12] * data[i-13];
+				         sum += qlp_coeff[11] * data[i-12];
+				         sum += qlp_coeff[10] * data[i-11];
+				         sum += qlp_coeff[ 9] * data[i-10];
+				         sum += qlp_coeff[ 8] * data[i- 9];
+				         sum += qlp_coeff[ 7] * data[i- 8];
+				         sum += qlp_coeff[ 6] * data[i- 7];
+				         sum += qlp_coeff[ 5] * data[i- 6];
+				         sum += qlp_coeff[ 4] * data[i- 5];
+				         sum += qlp_coeff[ 3] * data[i- 4];
+				         sum += qlp_coeff[ 2] * data[i- 3];
+				         sum += qlp_coeff[ 1] * data[i- 2];
+				         sum += qlp_coeff[ 0] * data[i- 1];
+			}
+			residual[i] = data[i] - (sum >> lp_quantization);
+		}
+	}
+	_mm256_zeroupper();
+}
+
+static FLAC__int32 pack_arr[8] = { 0, 2, 4, 6, 1, 3, 5, 7 };
+
+FLAC__SSE_TARGET("avx2")
+void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_avx2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
+{
+	int i;
+	FLAC__int64 sum;
+	__m128i cnt = _mm_cvtsi32_si128(lp_quantization);
+	__m256i pack = _mm256_loadu_si256((const __m256i *)pack_arr);
+
+	FLAC__ASSERT(order > 0);
+	FLAC__ASSERT(order <= 32);
+	FLAC__ASSERT(lp_quantization <= 32); /* there's no _mm256_sra_epi64() so we have to use _mm256_srl_epi64() */
+
+	if(order <= 12) {
+		if(order > 8) {
+			if(order > 10) {
+				if(order == 12) {
+					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
+					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
+					q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
+					q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
+					q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
+					q4  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
+					q5  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
+					q6  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));
+					q7  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ]));
+					q8  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[8 ]));
+					q9  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[9 ]));
+					q10 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[10]));
+					q11 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[11]));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m256i summ, mull;
+						summ = _mm256_mul_epi32(q11, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-12))));
+						mull = _mm256_mul_epi32(q10, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-11)))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q9,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-10)))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q8,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q7,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q6,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q5,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q4,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
+						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
+					}
+				}
+				else { /* order == 11 */
+					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10;
+					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
+					q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
+					q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
+					q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
+					q4  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
+					q5  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
+					q6  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));
+					q7  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ]));
+					q8  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[8 ]));
+					q9  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[9 ]));
+					q10 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[10]));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m256i summ, mull;
+						summ = _mm256_mul_epi32(q10, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-11))));
+						mull = _mm256_mul_epi32(q9,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-10)))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q8,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q7,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q6,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q5,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q4,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
+						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
+					}
+				}
+			}
+			else {
+				if(order == 10) {
+					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9;
+					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
+					q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
+					q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
+					q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
+					q4  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
+					q5  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
+					q6  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));
+					q7  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ]));
+					q8  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[8 ]));
+					q9  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[9 ]));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m256i summ, mull;
+						summ = _mm256_mul_epi32(q9,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-10))));
+						mull = _mm256_mul_epi32(q8,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q7,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q6,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q5,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q4,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
+						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
+					}
+				}
+				else { /* order == 9 */
+					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8;
+					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
+					q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
+					q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
+					q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
+					q4  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
+					q5  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
+					q6  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));
+					q7  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ]));
+					q8  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[8 ]));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m256i summ, mull;
+						summ = _mm256_mul_epi32(q8,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 ))));
+						mull = _mm256_mul_epi32(q7,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q6,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q5,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q4,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
+						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
+					}
+				}
+			}
+		}
+		else if(order > 4) {
+			if(order > 6) {
+				if(order == 8) {
+					__m256i q0, q1, q2, q3, q4, q5, q6, q7;
+					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
+					q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
+					q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
+					q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
+					q4  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
+					q5  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
+					q6  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));
+					q7  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ]));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m256i summ, mull;
+						summ = _mm256_mul_epi32(q7,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 ))));
+						mull = _mm256_mul_epi32(q6,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q5,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q4,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
+						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
+					}
+				}
+				else { /* order == 7 */
+					__m256i q0, q1, q2, q3, q4, q5, q6;
+					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
+					q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
+					q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
+					q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
+					q4  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
+					q5  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
+					q6  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m256i summ, mull;
+						summ = _mm256_mul_epi32(q6,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 ))));
+						mull = _mm256_mul_epi32(q5,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q4,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
+						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
+					}
+				}
+			}
+			else {
+				if(order == 6) {
+					__m256i q0, q1, q2, q3, q4, q5;
+					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
+					q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
+					q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
+					q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
+					q4  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
+					q5  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m256i summ, mull;
+						summ = _mm256_mul_epi32(q5,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 ))));
+						mull = _mm256_mul_epi32(q4,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
+						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
+					}
+				}
+				else { /* order == 5 */
+					__m256i q0, q1, q2, q3, q4;
+					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
+					q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
+					q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
+					q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
+					q4  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m256i summ, mull;
+						summ = _mm256_mul_epi32(q4,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 ))));
+						mull = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
+						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
+					}
+				}
+			}
+		}
+		else {
+			if(order > 2) {
+				if(order == 4) {
+					__m256i q0, q1, q2, q3;
+					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
+					q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
+					q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
+					q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m256i summ, mull;
+						summ = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 ))));
+						mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
+						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
+					}
+				}
+				else { /* order == 3 */
+					__m256i q0, q1, q2;
+					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
+					q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
+					q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m256i summ, mull;
+						summ = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 ))));
+						mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
+						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
+					}
+				}
+			}
+			else {
+				if(order == 2) {
+					__m256i q0, q1;
+					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
+					q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m256i summ, mull;
+						summ = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 ))));
+						mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
+						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
+					}
+				}
+				else { /* order == 1 */
+					__m256i q0;
+					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m256i summ;
+						summ = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 ))));
+						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
+					}
+				}
+			}
+		}
+		for(; i < (int)data_len; i++) {
+			sum = 0;
+			switch(order) {
+				case 12: sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
+				case 11: sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
+				case 10: sum += qlp_coeff[ 9] * (FLAC__int64)data[i-10];
+				case 9:  sum += qlp_coeff[ 8] * (FLAC__int64)data[i- 9];
+				case 8:  sum += qlp_coeff[ 7] * (FLAC__int64)data[i- 8];
+				case 7:  sum += qlp_coeff[ 6] * (FLAC__int64)data[i- 7];
+				case 6:  sum += qlp_coeff[ 5] * (FLAC__int64)data[i- 6];
+				case 5:  sum += qlp_coeff[ 4] * (FLAC__int64)data[i- 5];
+				case 4:  sum += qlp_coeff[ 3] * (FLAC__int64)data[i- 4];
+				case 3:  sum += qlp_coeff[ 2] * (FLAC__int64)data[i- 3];
+				case 2:  sum += qlp_coeff[ 1] * (FLAC__int64)data[i- 2];
+				case 1:  sum += qlp_coeff[ 0] * (FLAC__int64)data[i- 1];
+			}
+			residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization);
+		}
+	}
+	else { /* order > 12 */
+		for(i = 0; i < (int)data_len; i++) {
+			sum = 0;
+			switch(order) {
+				case 32: sum += qlp_coeff[31] * (FLAC__int64)data[i-32];
+				case 31: sum += qlp_coeff[30] * (FLAC__int64)data[i-31];
+				case 30: sum += qlp_coeff[29] * (FLAC__int64)data[i-30];
+				case 29: sum += qlp_coeff[28] * (FLAC__int64)data[i-29];
+				case 28: sum += qlp_coeff[27] * (FLAC__int64)data[i-28];
+				case 27: sum += qlp_coeff[26] * (FLAC__int64)data[i-27];
+				case 26: sum += qlp_coeff[25] * (FLAC__int64)data[i-26];
+				case 25: sum += qlp_coeff[24] * (FLAC__int64)data[i-25];
+				case 24: sum += qlp_coeff[23] * (FLAC__int64)data[i-24];
+				case 23: sum += qlp_coeff[22] * (FLAC__int64)data[i-23];
+				case 22: sum += qlp_coeff[21] * (FLAC__int64)data[i-22];
+				case 21: sum += qlp_coeff[20] * (FLAC__int64)data[i-21];
+				case 20: sum += qlp_coeff[19] * (FLAC__int64)data[i-20];
+				case 19: sum += qlp_coeff[18] * (FLAC__int64)data[i-19];
+				case 18: sum += qlp_coeff[17] * (FLAC__int64)data[i-18];
+				case 17: sum += qlp_coeff[16] * (FLAC__int64)data[i-17];
+				case 16: sum += qlp_coeff[15] * (FLAC__int64)data[i-16];
+				case 15: sum += qlp_coeff[14] * (FLAC__int64)data[i-15];
+				case 14: sum += qlp_coeff[13] * (FLAC__int64)data[i-14];
+				case 13: sum += qlp_coeff[12] * (FLAC__int64)data[i-13];
+				         sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
+				         sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
+				         sum += qlp_coeff[ 9] * (FLAC__int64)data[i-10];
+				         sum += qlp_coeff[ 8] * (FLAC__int64)data[i- 9];
+				         sum += qlp_coeff[ 7] * (FLAC__int64)data[i- 8];
+				         sum += qlp_coeff[ 6] * (FLAC__int64)data[i- 7];
+				         sum += qlp_coeff[ 5] * (FLAC__int64)data[i- 6];
+				         sum += qlp_coeff[ 4] * (FLAC__int64)data[i- 5];
+				         sum += qlp_coeff[ 3] * (FLAC__int64)data[i- 4];
+				         sum += qlp_coeff[ 2] * (FLAC__int64)data[i- 3];
+				         sum += qlp_coeff[ 1] * (FLAC__int64)data[i- 2];
+				         sum += qlp_coeff[ 0] * (FLAC__int64)data[i- 1];
+			}
+			residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization);
+		}
+	}
+	_mm256_zeroupper();
+}
+
+#endif /* FLAC__AVX2_SUPPORTED */
+#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
+#endif /* FLAC__NO_ASM */
+#endif /* FLAC__INTEGER_ONLY_LIBRARY */
diff --git a/libFLAC/lpc_intrin_sse.c b/libFLAC/lpc_intrin_sse.c
new file mode 100644
index 0000000..430e73f
--- /dev/null
+++ b/libFLAC/lpc_intrin_sse.c
@@ -0,0 +1,454 @@
+/* libFLAC - Free Lossless Audio Codec library
+ * Copyright (C) 2000-2009  Josh Coalson
+ * Copyright (C) 2011-2016  Xiph.Org Foundation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * - Neither the name of the Xiph.org Foundation nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#  include <config.h>
+#endif
+
+#include "private/cpu.h"
+
+#ifndef FLAC__INTEGER_ONLY_LIBRARY
+#ifndef FLAC__NO_ASM
+#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN
+#include "private/lpc.h"
+#ifdef FLAC__SSE_SUPPORTED
+#include "FLAC/assert.h"
+#include "FLAC/format.h"
+
+#include <xmmintrin.h> /* SSE */
+
+/*   new routines: more unaligned loads, less shuffle
+ *   old routines: less unaligned loads, more shuffle
+ *   these *_old routines are equivalent to the ASM routines in ia32/lpc_asm.nasm
+ */
+
+/* new routines: faster on current Intel (starting from Core i aka Nehalem) and all AMD CPUs */
+
+FLAC__SSE_TARGET("sse")
+void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4_new(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
+{
+	int i;
+	int limit = data_len - 4;
+	__m128 sum0;
+
+	(void) lag;
+	FLAC__ASSERT(lag <= 4);
+	FLAC__ASSERT(lag <= data_len);
+
+	sum0 = _mm_setzero_ps();
+
+	for(i = 0; i <= limit; i++) {
+		__m128 d, d0;
+		d0 = _mm_loadu_ps(data+i);
+		d = d0; d = _mm_shuffle_ps(d, d, 0);
+		sum0 = _mm_add_ps(sum0, _mm_mul_ps(d0, d));
+	}
+
+	{
+		__m128 d0 = _mm_setzero_ps();
+		limit++; if(limit < 0) limit = 0;
+
+		for(i = data_len-1; i >= limit; i--) {
+			__m128 d;
+			d = _mm_load_ss(data+i); d = _mm_shuffle_ps(d, d, 0);
+			d0 = _mm_shuffle_ps(d0, d0, _MM_SHUFFLE(2,1,0,3));
+			d0 = _mm_move_ss(d0, d);
+			sum0 = _mm_add_ps(sum0, _mm_mul_ps(d, d0));
+		}
+	}
+
+	_mm_storeu_ps(autoc,   sum0);
+}
+
+FLAC__SSE_TARGET("sse")
+void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8_new(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
+{
+	int i;
+	int limit = data_len - 8;
+	__m128 sum0, sum1;
+
+	(void) lag;
+	FLAC__ASSERT(lag <= 8);
+	FLAC__ASSERT(lag <= data_len);
+
+	sum0 = _mm_setzero_ps();
+	sum1 = _mm_setzero_ps();
+
+	for(i = 0; i <= limit; i++) {
+		__m128 d, d0, d1;
+		d0 = _mm_loadu_ps(data+i);
+		d1 = _mm_loadu_ps(data+i+4);
+		d = d0; d = _mm_shuffle_ps(d, d, 0);
+		sum0 = _mm_add_ps(sum0, _mm_mul_ps(d0, d));
+		sum1 = _mm_add_ps(sum1, _mm_mul_ps(d1, d));
+	}
+
+	{
+		__m128 d0 = _mm_setzero_ps();
+		__m128 d1 = _mm_setzero_ps();
+		limit++; if(limit < 0) limit = 0;
+
+		for(i = data_len-1; i >= limit; i--) {
+			__m128 d;
+			d = _mm_load_ss(data+i); d = _mm_shuffle_ps(d, d, 0);
+			d1 = _mm_shuffle_ps(d1, d1, _MM_SHUFFLE(2,1,0,3));
+			d0 = _mm_shuffle_ps(d0, d0, _MM_SHUFFLE(2,1,0,3));
+			d1 = _mm_move_ss(d1, d0);
+			d0 = _mm_move_ss(d0, d);
+			sum1 = _mm_add_ps(sum1, _mm_mul_ps(d, d1));
+			sum0 = _mm_add_ps(sum0, _mm_mul_ps(d, d0));
+		}
+	}
+
+	_mm_storeu_ps(autoc,   sum0);
+	_mm_storeu_ps(autoc+4, sum1);
+}
+
+FLAC__SSE_TARGET("sse")
+void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12_new(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
+{
+	int i;
+	int limit = data_len - 12;
+	__m128 sum0, sum1, sum2;
+
+	(void) lag;
+	FLAC__ASSERT(lag <= 12);
+	FLAC__ASSERT(lag <= data_len);
+
+	sum0 = _mm_setzero_ps();
+	sum1 = _mm_setzero_ps();
+	sum2 = _mm_setzero_ps();
+
+	for(i = 0; i <= limit; i++) {
+		__m128 d, d0, d1, d2;
+		d0 = _mm_loadu_ps(data+i);
+		d1 = _mm_loadu_ps(data+i+4);
+		d2 = _mm_loadu_ps(data+i+8);
+		d = d0; d = _mm_shuffle_ps(d, d, 0);
+		sum0 = _mm_add_ps(sum0, _mm_mul_ps(d0, d));
+		sum1 = _mm_add_ps(sum1, _mm_mul_ps(d1, d));
+		sum2 = _mm_add_ps(sum2, _mm_mul_ps(d2, d));
+	}
+
+	{
+		__m128 d0 = _mm_setzero_ps();
+		__m128 d1 = _mm_setzero_ps();
+		__m128 d2 = _mm_setzero_ps();
+		limit++; if(limit < 0) limit = 0;
+
+		for(i = data_len-1; i >= limit; i--) {
+			__m128 d;
+			d = _mm_load_ss(data+i); d = _mm_shuffle_ps(d, d, 0);
+			d2 = _mm_shuffle_ps(d2, d2, _MM_SHUFFLE(2,1,0,3));
+			d1 = _mm_shuffle_ps(d1, d1, _MM_SHUFFLE(2,1,0,3));
+			d0 = _mm_shuffle_ps(d0, d0, _MM_SHUFFLE(2,1,0,3));
+			d2 = _mm_move_ss(d2, d1);
+			d1 = _mm_move_ss(d1, d0);
+			d0 = _mm_move_ss(d0, d);
+			sum2 = _mm_add_ps(sum2, _mm_mul_ps(d, d2));
+			sum1 = _mm_add_ps(sum1, _mm_mul_ps(d, d1));
+			sum0 = _mm_add_ps(sum0, _mm_mul_ps(d, d0));
+		}
+	}
+
+	_mm_storeu_ps(autoc,   sum0);
+	_mm_storeu_ps(autoc+4, sum1);
+	_mm_storeu_ps(autoc+8, sum2);
+}
+
+FLAC__SSE_TARGET("sse")
+void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16_new(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
+{
+	int i;
+	int limit = data_len - 16;
+	__m128 sum0, sum1, sum2, sum3;
+
+	(void) lag;
+	FLAC__ASSERT(lag <= 16);
+	FLAC__ASSERT(lag <= data_len);
+
+	sum0 = _mm_setzero_ps();
+	sum1 = _mm_setzero_ps();
+	sum2 = _mm_setzero_ps();
+	sum3 = _mm_setzero_ps();
+
+	for(i = 0; i <= limit; i++) {
+		__m128 d, d0, d1, d2, d3;
+		d0 = _mm_loadu_ps(data+i);
+		d1 = _mm_loadu_ps(data+i+4);
+		d2 = _mm_loadu_ps(data+i+8);
+		d3 = _mm_loadu_ps(data+i+12);
+		d = d0; d = _mm_shuffle_ps(d, d, 0);
+		sum0 = _mm_add_ps(sum0, _mm_mul_ps(d0, d));
+		sum1 = _mm_add_ps(sum1, _mm_mul_ps(d1, d));
+		sum2 = _mm_add_ps(sum2, _mm_mul_ps(d2, d));
+		sum3 = _mm_add_ps(sum3, _mm_mul_ps(d3, d));
+	}
+
+	{
+		__m128 d0 = _mm_setzero_ps();
+		__m128 d1 = _mm_setzero_ps();
+		__m128 d2 = _mm_setzero_ps();
+		__m128 d3 = _mm_setzero_ps();
+		limit++; if(limit < 0) limit = 0;
+
+		for(i = data_len-1; i >= limit; i--) {
+			__m128 d;
+			d = _mm_load_ss(data+i); d = _mm_shuffle_ps(d, d, 0);
+			d3 = _mm_shuffle_ps(d3, d3, _MM_SHUFFLE(2,1,0,3));
+			d2 = _mm_shuffle_ps(d2, d2, _MM_SHUFFLE(2,1,0,3));
+			d1 = _mm_shuffle_ps(d1, d1, _MM_SHUFFLE(2,1,0,3));
+			d0 = _mm_shuffle_ps(d0, d0, _MM_SHUFFLE(2,1,0,3));
+			d3 = _mm_move_ss(d3, d2);
+			d2 = _mm_move_ss(d2, d1);
+			d1 = _mm_move_ss(d1, d0);
+			d0 = _mm_move_ss(d0, d);
+			sum3 = _mm_add_ps(sum3, _mm_mul_ps(d, d3));
+			sum2 = _mm_add_ps(sum2, _mm_mul_ps(d, d2));
+			sum1 = _mm_add_ps(sum1, _mm_mul_ps(d, d1));
+			sum0 = _mm_add_ps(sum0, _mm_mul_ps(d, d0));
+		}
+	}
+
+	_mm_storeu_ps(autoc,   sum0);
+	_mm_storeu_ps(autoc+4, sum1);
+	_mm_storeu_ps(autoc+8, sum2);
+	_mm_storeu_ps(autoc+12,sum3);
+}
+
+/* old routines: faster on older Intel CPUs (up to Core 2) */
+
+FLAC__SSE_TARGET("sse")
+void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4_old(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
+{
+	__m128 xmm0, xmm2, xmm5;
+
+	(void) lag;
+	FLAC__ASSERT(lag > 0);
+	FLAC__ASSERT(lag <= 4);
+	FLAC__ASSERT(lag <= data_len);
+	FLAC__ASSERT(data_len > 0);
+
+	xmm5 = _mm_setzero_ps();
+
+	xmm0 = _mm_load_ss(data++);
+	xmm2 = xmm0;
+	xmm0 = _mm_shuffle_ps(xmm0, xmm0, 0);
+
+	xmm0 = _mm_mul_ps(xmm0, xmm2);
+	xmm5 = _mm_add_ps(xmm5, xmm0);
+
+	data_len--;
+
+	while(data_len)
+	{
+		xmm0 = _mm_load1_ps(data++);
+
+		xmm2 = _mm_shuffle_ps(xmm2, xmm2, _MM_SHUFFLE(2,1,0,3));
+		xmm2 = _mm_move_ss(xmm2, xmm0);
+		xmm0 = _mm_mul_ps(xmm0, xmm2);
+		xmm5 = _mm_add_ps(xmm5, xmm0);
+
+		data_len--;
+	}
+
+	_mm_storeu_ps(autoc, xmm5);
+}
+
+FLAC__SSE_TARGET("sse")
+void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8_old(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
+{
+	__m128 xmm0, xmm1, xmm2, xmm3, xmm5, xmm6;
+
+	(void) lag;
+	FLAC__ASSERT(lag > 0);
+	FLAC__ASSERT(lag <= 8);
+	FLAC__ASSERT(lag <= data_len);
+	FLAC__ASSERT(data_len > 0);
+
+	xmm5 = _mm_setzero_ps();
+	xmm6 = _mm_setzero_ps();
+
+	xmm0 = _mm_load_ss(data++);
+	xmm2 = xmm0;
+	xmm0 = _mm_shuffle_ps(xmm0, xmm0, 0);
+	xmm3 = _mm_setzero_ps();
+
+	xmm0 = _mm_mul_ps(xmm0, xmm2);
+	xmm5 = _mm_add_ps(xmm5, xmm0);
+
+	data_len--;
+
+	while(data_len)
+	{
+		xmm0 = _mm_load1_ps(data++);
+
+		xmm2 = _mm_shuffle_ps(xmm2, xmm2, _MM_SHUFFLE(2,1,0,3));
+		xmm3 = _mm_shuffle_ps(xmm3, xmm3, _MM_SHUFFLE(2,1,0,3));
+		xmm3 = _mm_move_ss(xmm3, xmm2);
+		xmm2 = _mm_move_ss(xmm2, xmm0);
+
+		xmm1 = xmm0;
+		xmm1 = _mm_mul_ps(xmm1, xmm3);
+		xmm0 = _mm_mul_ps(xmm0, xmm2);
+		xmm6 = _mm_add_ps(xmm6, xmm1);
+		xmm5 = _mm_add_ps(xmm5, xmm0);
+
+		data_len--;
+	}
+
+	_mm_storeu_ps(autoc,   xmm5);
+	_mm_storeu_ps(autoc+4, xmm6);
+}
+
+FLAC__SSE_TARGET("sse")
+void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12_old(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
+{
+	__m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+
+	(void) lag;
+	FLAC__ASSERT(lag > 0);
+	FLAC__ASSERT(lag <= 12);
+	FLAC__ASSERT(lag <= data_len);
+	FLAC__ASSERT(data_len > 0);
+
+	xmm5 = _mm_setzero_ps();
+	xmm6 = _mm_setzero_ps();
+	xmm7 = _mm_setzero_ps();
+
+	xmm0 = _mm_load_ss(data++);
+	xmm2 = xmm0;
+	xmm0 = _mm_shuffle_ps(xmm0, xmm0, 0);
+	xmm3 = _mm_setzero_ps();
+	xmm4 = _mm_setzero_ps();
+
+	xmm0 = _mm_mul_ps(xmm0, xmm2);
+	xmm5 = _mm_add_ps(xmm5, xmm0);
+
+	data_len--;
+
+	while(data_len)
+	{
+		xmm0 = _mm_load1_ps(data++);
+
+		xmm2 = _mm_shuffle_ps(xmm2, xmm2, _MM_SHUFFLE(2,1,0,3));
+		xmm3 = _mm_shuffle_ps(xmm3, xmm3, _MM_SHUFFLE(2,1,0,3));
+		xmm4 = _mm_shuffle_ps(xmm4, xmm4, _MM_SHUFFLE(2,1,0,3));
+		xmm4 = _mm_move_ss(xmm4, xmm3);
+		xmm3 = _mm_move_ss(xmm3, xmm2);
+		xmm2 = _mm_move_ss(xmm2, xmm0);
+
+		xmm1 = xmm0;
+		xmm1 = _mm_mul_ps(xmm1, xmm2);
+		xmm5 = _mm_add_ps(xmm5, xmm1);
+		xmm1 = xmm0;
+		xmm1 = _mm_mul_ps(xmm1, xmm3);
+		xmm6 = _mm_add_ps(xmm6, xmm1);
+		xmm0 = _mm_mul_ps(xmm0, xmm4);
+		xmm7 = _mm_add_ps(xmm7, xmm0);
+
+		data_len--;
+	}
+
+	_mm_storeu_ps(autoc,   xmm5);
+	_mm_storeu_ps(autoc+4, xmm6);
+	_mm_storeu_ps(autoc+8, xmm7);
+}
+
+FLAC__SSE_TARGET("sse")
+void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16_old(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
+{
+	__m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9;
+
+	(void) lag;
+	FLAC__ASSERT(lag > 0);
+	FLAC__ASSERT(lag <= 16);
+	FLAC__ASSERT(lag <= data_len);
+	FLAC__ASSERT(data_len > 0);
+
+	xmm6 = _mm_setzero_ps();
+	xmm7 = _mm_setzero_ps();
+	xmm8 = _mm_setzero_ps();
+	xmm9 = _mm_setzero_ps();
+
+	xmm0 = _mm_load_ss(data++);
+	xmm2 = xmm0;
+	xmm0 = _mm_shuffle_ps(xmm0, xmm0, 0);
+	xmm3 = _mm_setzero_ps();
+	xmm4 = _mm_setzero_ps();
+	xmm5 = _mm_setzero_ps();
+
+	xmm0 = _mm_mul_ps(xmm0, xmm2);
+	xmm6 = _mm_add_ps(xmm6, xmm0);
+
+	data_len--;
+
+	while(data_len)
+	{
+		xmm0 = _mm_load1_ps(data++);
+
+		/* shift xmm5:xmm4:xmm3:xmm2 left by one float */
+		xmm5 = _mm_shuffle_ps(xmm5, xmm5, _MM_SHUFFLE(2,1,0,3));
+		xmm4 = _mm_shuffle_ps(xmm4, xmm4, _MM_SHUFFLE(2,1,0,3));
+		xmm3 = _mm_shuffle_ps(xmm3, xmm3, _MM_SHUFFLE(2,1,0,3));
+		xmm2 = _mm_shuffle_ps(xmm2, xmm2, _MM_SHUFFLE(2,1,0,3));
+		xmm5 = _mm_move_ss(xmm5, xmm4);
+		xmm4 = _mm_move_ss(xmm4, xmm3);
+		xmm3 = _mm_move_ss(xmm3, xmm2);
+		xmm2 = _mm_move_ss(xmm2, xmm0);
+
+		/* xmm9|xmm8|xmm7|xmm6 += xmm0|xmm0|xmm0|xmm0 * xmm5|xmm4|xmm3|xmm2 */
+		xmm1 = xmm0;
+		xmm1 = _mm_mul_ps(xmm1, xmm5);
+		xmm9 = _mm_add_ps(xmm9, xmm1);
+		xmm1 = xmm0;
+		xmm1 = _mm_mul_ps(xmm1, xmm4);
+		xmm8 = _mm_add_ps(xmm8, xmm1);
+		xmm1 = xmm0;
+		xmm1 = _mm_mul_ps(xmm1, xmm3);
+		xmm7 = _mm_add_ps(xmm7, xmm1);
+		xmm0 = _mm_mul_ps(xmm0, xmm2);
+		xmm6 = _mm_add_ps(xmm6, xmm0);
+
+		data_len--;
+	}
+
+	_mm_storeu_ps(autoc,   xmm6);
+	_mm_storeu_ps(autoc+4, xmm7);
+	_mm_storeu_ps(autoc+8, xmm8);
+	_mm_storeu_ps(autoc+12,xmm9);
+}
+
+#endif /* FLAC__SSE_SUPPORTED */
+#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
+#endif /* FLAC__NO_ASM */
+#endif /* FLAC__INTEGER_ONLY_LIBRARY */
diff --git a/libFLAC/lpc_intrin_sse2.c b/libFLAC/lpc_intrin_sse2.c
new file mode 100644
index 0000000..1383394
--- /dev/null
+++ b/libFLAC/lpc_intrin_sse2.c
@@ -0,0 +1,1090 @@
+/* libFLAC - Free Lossless Audio Codec library
+ * Copyright (C) 2000-2009  Josh Coalson
+ * Copyright (C) 2011-2016  Xiph.Org Foundation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * - Neither the name of the Xiph.org Foundation nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#  include <config.h>
+#endif
+
+#include "private/cpu.h"
+
+#ifndef FLAC__INTEGER_ONLY_LIBRARY
+#ifndef FLAC__NO_ASM
+#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN
+#include "private/lpc.h"
+#ifdef FLAC__SSE2_SUPPORTED
+
+#include "FLAC/assert.h"
+#include "FLAC/format.h"
+
+#include <emmintrin.h> /* SSE2 */
+
+#define RESIDUAL16_RESULT(xmmN) curr = *data++; *residual++ = curr - (_mm_cvtsi128_si32(xmmN) >> lp_quantization);
+#define     DATA16_RESULT(xmmN) curr = *residual++ + (_mm_cvtsi128_si32(xmmN) >> lp_quantization); *data++ = curr;
+
+#define RESIDUAL32_RESULT(xmmN) residual[i] = data[i] - (_mm_cvtsi128_si32(xmmN) >> lp_quantization);
+#define     DATA32_RESULT(xmmN) data[i] = residual[i] + (_mm_cvtsi128_si32(xmmN) >> lp_quantization);
+
+FLAC__SSE_TARGET("sse2")
+void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
+{
+	int i;
+	FLAC__int32 sum;
+	__m128i cnt = _mm_cvtsi32_si128(lp_quantization);
+
+	FLAC__ASSERT(order > 0);
+	FLAC__ASSERT(order <= 32);
+
+	if(order <= 12) {
+		if(order > 8) {
+			if(order > 10) {
+				if(order == 12) {
+					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
+					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
+					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
+					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
+					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
+					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
+					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
+					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
+					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
+					q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
+					q9 = _mm_cvtsi32_si128(0xffff & qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));
+					q10 = _mm_cvtsi32_si128(0xffff & qlp_coeff[10]); q10 = _mm_shuffle_epi32(q10, _MM_SHUFFLE(0,0,0,0));
+					q11 = _mm_cvtsi32_si128(0xffff & qlp_coeff[11]); q11 = _mm_shuffle_epi32(q11, _MM_SHUFFLE(0,0,0,0));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m128i summ, mull;
+						summ = _mm_madd_epi16(q11, _mm_loadu_si128((const __m128i*)(data+i-12)));
+						mull = _mm_madd_epi16(q10, _mm_loadu_si128((const __m128i*)(data+i-11))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(data+i-10))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
+						summ = _mm_sra_epi32(summ, cnt);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
+					}
+				}
+				else { /* order == 11 */
+					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10;
+					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
+					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
+					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
+					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
+					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
+					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
+					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
+					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
+					q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
+					q9 = _mm_cvtsi32_si128(0xffff & qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));
+					q10 = _mm_cvtsi32_si128(0xffff & qlp_coeff[10]); q10 = _mm_shuffle_epi32(q10, _MM_SHUFFLE(0,0,0,0));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m128i summ, mull;
+						summ = _mm_madd_epi16(q10, _mm_loadu_si128((const __m128i*)(data+i-11)));
+						mull = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(data+i-10))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
+						summ = _mm_sra_epi32(summ, cnt);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
+					}
+				}
+			}
+			else {
+				if(order == 10) {
+					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9;
+					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
+					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
+					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
+					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
+					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
+					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
+					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
+					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
+					q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
+					q9 = _mm_cvtsi32_si128(0xffff & qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m128i summ, mull;
+						summ = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(data+i-10)));
+						mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
+						summ = _mm_sra_epi32(summ, cnt);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
+					}
+				}
+				else { /* order == 9 */
+					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8;
+					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
+					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
+					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
+					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
+					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
+					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
+					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
+					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
+					q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m128i summ, mull;
+						summ = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9)));
+						mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
+						summ = _mm_sra_epi32(summ, cnt);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
+					}
+				}
+			}
+		}
+		else if(order > 4) {
+			if(order > 6) {
+				if(order == 8) {
+					__m128i q0, q1, q2, q3, q4, q5, q6, q7;
+					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
+					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
+					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
+					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
+					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
+					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
+					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
+					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m128i summ, mull;
+						summ = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8)));
+						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
+						summ = _mm_sra_epi32(summ, cnt);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
+					}
+				}
+				else { /* order == 7 */
+					__m128i q0, q1, q2, q3, q4, q5, q6;
+					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
+					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
+					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
+					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
+					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
+					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
+					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m128i summ, mull;
+						summ = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7)));
+						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
+						summ = _mm_sra_epi32(summ, cnt);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
+					}
+				}
+			}
+			else {
+				if(order == 6) {
+					__m128i q0, q1, q2, q3, q4, q5;
+					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
+					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
+					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
+					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
+					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
+					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m128i summ, mull;
+						summ = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6)));
+						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
+						summ = _mm_sra_epi32(summ, cnt);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
+					}
+				}
+				else { /* order == 5 */
+					__m128i q0, q1, q2, q3, q4;
+					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
+					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
+					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
+					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
+					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m128i summ, mull;
+						summ = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5)));
+						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
+						summ = _mm_sra_epi32(summ, cnt);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
+					}
+				}
+			}
+		}
+		else {
+			if(order > 2) {
+				if(order == 4) {
+					__m128i q0, q1, q2, q3;
+					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
+					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
+					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
+					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m128i summ, mull;
+						summ = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4)));
+						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
+						summ = _mm_sra_epi32(summ, cnt);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
+					}
+				}
+				else { /* order == 3 */
+					__m128i q0, q1, q2;
+					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
+					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
+					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m128i summ, mull;
+						summ = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3)));
+						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
+						summ = _mm_sra_epi32(summ, cnt);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
+					}
+				}
+			}
+			else {
+				if(order == 2) {
+					__m128i q0, q1;
+					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
+					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m128i summ, mull;
+						summ = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2)));
+						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
+						summ = _mm_sra_epi32(summ, cnt);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
+					}
+				}
+				else { /* order == 1 */
+					__m128i q0;
+					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m128i summ;
+						summ = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1)));
+						summ = _mm_sra_epi32(summ, cnt);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
+					}
+				}
+			}
+		}
+		for(; i < (int)data_len; i++) {
+			sum = 0;
+			switch(order) {
+				case 12: sum += qlp_coeff[11] * data[i-12];
+				case 11: sum += qlp_coeff[10] * data[i-11];
+				case 10: sum += qlp_coeff[ 9] * data[i-10];
+				case 9:  sum += qlp_coeff[ 8] * data[i- 9];
+				case 8:  sum += qlp_coeff[ 7] * data[i- 8];
+				case 7:  sum += qlp_coeff[ 6] * data[i- 7];
+				case 6:  sum += qlp_coeff[ 5] * data[i- 6];
+				case 5:  sum += qlp_coeff[ 4] * data[i- 5];
+				case 4:  sum += qlp_coeff[ 3] * data[i- 4];
+				case 3:  sum += qlp_coeff[ 2] * data[i- 3];
+				case 2:  sum += qlp_coeff[ 1] * data[i- 2];
+				case 1:  sum += qlp_coeff[ 0] * data[i- 1];
+			}
+			residual[i] = data[i] - (sum >> lp_quantization);
+		}
+	}
+	else { /* order > 12 */
+		for(i = 0; i < (int)data_len; i++) {
+			sum = 0;
+			switch(order) {
+				case 32: sum += qlp_coeff[31] * data[i-32];
+				case 31: sum += qlp_coeff[30] * data[i-31];
+				case 30: sum += qlp_coeff[29] * data[i-30];
+				case 29: sum += qlp_coeff[28] * data[i-29];
+				case 28: sum += qlp_coeff[27] * data[i-28];
+				case 27: sum += qlp_coeff[26] * data[i-27];
+				case 26: sum += qlp_coeff[25] * data[i-26];
+				case 25: sum += qlp_coeff[24] * data[i-25];
+				case 24: sum += qlp_coeff[23] * data[i-24];
+				case 23: sum += qlp_coeff[22] * data[i-23];
+				case 22: sum += qlp_coeff[21] * data[i-22];
+				case 21: sum += qlp_coeff[20] * data[i-21];
+				case 20: sum += qlp_coeff[19] * data[i-20];
+				case 19: sum += qlp_coeff[18] * data[i-19];
+				case 18: sum += qlp_coeff[17] * data[i-18];
+				case 17: sum += qlp_coeff[16] * data[i-17];
+				case 16: sum += qlp_coeff[15] * data[i-16];
+				case 15: sum += qlp_coeff[14] * data[i-15];
+				case 14: sum += qlp_coeff[13] * data[i-14];
+				case 13: sum += qlp_coeff[12] * data[i-13];
+				         sum += qlp_coeff[11] * data[i-12];
+				         sum += qlp_coeff[10] * data[i-11];
+				         sum += qlp_coeff[ 9] * data[i-10];
+				         sum += qlp_coeff[ 8] * data[i- 9];
+				         sum += qlp_coeff[ 7] * data[i- 8];
+				         sum += qlp_coeff[ 6] * data[i- 7];
+				         sum += qlp_coeff[ 5] * data[i- 6];
+				         sum += qlp_coeff[ 4] * data[i- 5];
+				         sum += qlp_coeff[ 3] * data[i- 4];
+				         sum += qlp_coeff[ 2] * data[i- 3];
+				         sum += qlp_coeff[ 1] * data[i- 2];
+				         sum += qlp_coeff[ 0] * data[i- 1];
+			}
+			residual[i] = data[i] - (sum >> lp_quantization);
+		}
+	}
+}
+
+FLAC__SSE_TARGET("sse2")
+void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
+{
+	int i;
+
+	FLAC__ASSERT(order > 0);
+	FLAC__ASSERT(order <= 32);
+
+	if(order <= 12) {
+		if(order > 8) { /* order == 9, 10, 11, 12 */
+			if(order > 10) { /* order == 11, 12 */
+				if(order == 12) {
+					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));  // 0  0  q[1]  q[0]
+					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));  // 0  0  q[3]  q[2]
+					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));  // 0  0  q[5]  q[4]
+					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));  // 0  0  q[7]  q[6]
+					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));  // 0  0  q[9]  q[8]
+					xmm5 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+10)); // 0  0  q[11] q[10]
+
+					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); // 0  q[1]  0  q[0]
+					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); // 0  q[3]  0  q[2]
+					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0)); // 0  q[5]  0  q[4]
+					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0)); // 0  q[7]  0  q[6]
+					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0)); // 0  q[9]  0  q[8]
+					xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(3,1,2,0)); // 0  q[11] 0  q[10]
+
+					for(i = 0; i < (int)data_len; i++) {
+						//sum = 0;
+						//sum += qlp_coeff[11] * data[i-12];
+						//sum += qlp_coeff[10] * data[i-11];
+						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-12));  // 0   0        d[i-11]  d[i-12]
+						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1)); // 0  d[i-12]   0        d[i-11]
+						xmm7 = _mm_mul_epu32(xmm7, xmm5); /* we use _unsigned_ multiplication and discard high dword of the result values */
+
+						//sum += qlp_coeff[9] * data[i-10];
+						//sum += qlp_coeff[8] * data[i-9];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epu32(xmm6, xmm4);
+						xmm7 = _mm_add_epi32(xmm7, xmm6);
+
+						//sum += qlp_coeff[7] * data[i-8];
+						//sum += qlp_coeff[6] * data[i-7];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epu32(xmm6, xmm3);
+						xmm7 = _mm_add_epi32(xmm7, xmm6);
+
+						//sum += qlp_coeff[5] * data[i-6];
+						//sum += qlp_coeff[4] * data[i-5];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epu32(xmm6, xmm2);
+						xmm7 = _mm_add_epi32(xmm7, xmm6);
+
+						//sum += qlp_coeff[3] * data[i-4];
+						//sum += qlp_coeff[2] * data[i-3];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epu32(xmm6, xmm1);
+						xmm7 = _mm_add_epi32(xmm7, xmm6);
+
+						//sum += qlp_coeff[1] * data[i-2];
+						//sum += qlp_coeff[0] * data[i-1];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epu32(xmm6, xmm0);
+						xmm7 = _mm_add_epi32(xmm7, xmm6);
+
+						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
+						RESIDUAL32_RESULT(xmm7);
+					}
+				}
+				else { /* order == 11 */
+					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
+					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
+					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
+					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
+					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
+					xmm5 = _mm_cvtsi32_si128(qlp_coeff[10]);
+
+					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
+					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
+					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
+					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
+					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));
+
+					for(i = 0; i < (int)data_len; i++) {
+						//sum = 0;
+						//sum  = qlp_coeff[10] * data[i-11];
+						xmm7 = _mm_cvtsi32_si128(data[i-11]);
+						xmm7 = _mm_mul_epu32(xmm7, xmm5);
+
+						//sum += qlp_coeff[9] * data[i-10];
+						//sum += qlp_coeff[8] * data[i-9];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epu32(xmm6, xmm4);
+						xmm7 = _mm_add_epi32(xmm7, xmm6);
+
+						//sum += qlp_coeff[7] * data[i-8];
+						//sum += qlp_coeff[6] * data[i-7];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epu32(xmm6, xmm3);
+						xmm7 = _mm_add_epi32(xmm7, xmm6);
+
+						//sum += qlp_coeff[5] * data[i-6];
+						//sum += qlp_coeff[4] * data[i-5];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epu32(xmm6, xmm2);
+						xmm7 = _mm_add_epi32(xmm7, xmm6);
+
+						//sum += qlp_coeff[3] * data[i-4];
+						//sum += qlp_coeff[2] * data[i-3];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epu32(xmm6, xmm1);
+						xmm7 = _mm_add_epi32(xmm7, xmm6);
+
+						//sum += qlp_coeff[1] * data[i-2];
+						//sum += qlp_coeff[0] * data[i-1];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epu32(xmm6, xmm0);
+						xmm7 = _mm_add_epi32(xmm7, xmm6);
+
+						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
+						RESIDUAL32_RESULT(xmm7);
+					}
+				}
+			}
+			else { /* order == 9, 10 */
+				if(order == 10) {
+					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
+					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
+					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
+					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
+					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
+					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
+
+					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
+					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
+					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
+					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
+					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));
+
+					for(i = 0; i < (int)data_len; i++) {
+						//sum = 0;
+						//sum += qlp_coeff[9] * data[i-10];
+						//sum += qlp_coeff[8] * data[i-9];
+						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-10));
+						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
+						xmm7 = _mm_mul_epu32(xmm7, xmm4);
+
+						//sum += qlp_coeff[7] * data[i-8];
+						//sum += qlp_coeff[6] * data[i-7];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epu32(xmm6, xmm3);
+						xmm7 = _mm_add_epi32(xmm7, xmm6);
+
+						//sum += qlp_coeff[5] * data[i-6];
+						//sum += qlp_coeff[4] * data[i-5];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epu32(xmm6, xmm2);
+						xmm7 = _mm_add_epi32(xmm7, xmm6);
+
+						//sum += qlp_coeff[3] * data[i-4];
+						//sum += qlp_coeff[2] * data[i-3];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epu32(xmm6, xmm1);
+						xmm7 = _mm_add_epi32(xmm7, xmm6);
+
+						//sum += qlp_coeff[1] * data[i-2];
+						//sum += qlp_coeff[0] * data[i-1];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epu32(xmm6, xmm0);
+						xmm7 = _mm_add_epi32(xmm7, xmm6);
+
+						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
+						RESIDUAL32_RESULT(xmm7);
+					}
+				}
+				else { /* order == 9 */
+					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
+					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
+					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
+					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
+					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
+					xmm4 = _mm_cvtsi32_si128(qlp_coeff[8]);
+
+					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
+					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
+					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
+					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
+
+					for(i = 0; i < (int)data_len; i++) {
+						//sum = 0;
+						//sum  = qlp_coeff[8] * data[i-9];
+						xmm7 = _mm_cvtsi32_si128(data[i-9]);
+						xmm7 = _mm_mul_epu32(xmm7, xmm4);
+
+						//sum += qlp_coeff[7] * data[i-8];
+						//sum += qlp_coeff[6] * data[i-7];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epu32(xmm6, xmm3);
+						xmm7 = _mm_add_epi32(xmm7, xmm6);
+
+						//sum += qlp_coeff[5] * data[i-6];
+						//sum += qlp_coeff[4] * data[i-5];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epu32(xmm6, xmm2);
+						xmm7 = _mm_add_epi32(xmm7, xmm6);
+
+						//sum += qlp_coeff[3] * data[i-4];
+						//sum += qlp_coeff[2] * data[i-3];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epu32(xmm6, xmm1);
+						xmm7 = _mm_add_epi32(xmm7, xmm6);
+
+						//sum += qlp_coeff[1] * data[i-2];
+						//sum += qlp_coeff[0] * data[i-1];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epu32(xmm6, xmm0);
+						xmm7 = _mm_add_epi32(xmm7, xmm6);
+
+						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
+						RESIDUAL32_RESULT(xmm7);
+					}
+				}
+			}
+		}
+		else if(order > 4) { /* order == 5, 6, 7, 8 */
+			if(order > 6) { /* order == 7, 8 */
+				if(order == 8) {
+					__m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
+					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
+					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
+					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
+					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
+
+					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
+					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
+					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
+					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
+
+					for(i = 0; i < (int)data_len; i++) {
+						//sum = 0;
+						//sum += qlp_coeff[7] * data[i-8];
+						//sum += qlp_coeff[6] * data[i-7];
+						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-8));
+						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
+						xmm7 = _mm_mul_epu32(xmm7, xmm3);
+
+						//sum += qlp_coeff[5] * data[i-6];
+						//sum += qlp_coeff[4] * data[i-5];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epu32(xmm6, xmm2);
+						xmm7 = _mm_add_epi32(xmm7, xmm6);
+
+						//sum += qlp_coeff[3] * data[i-4];
+						//sum += qlp_coeff[2] * data[i-3];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epu32(xmm6, xmm1);
+						xmm7 = _mm_add_epi32(xmm7, xmm6);
+
+						//sum += qlp_coeff[1] * data[i-2];
+						//sum += qlp_coeff[0] * data[i-1];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epu32(xmm6, xmm0);
+						xmm7 = _mm_add_epi32(xmm7, xmm6);
+
+						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
+						RESIDUAL32_RESULT(xmm7);
+					}
+				}
+				else { /* order == 7 */
+					__m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
+					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
+					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
+					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
+					xmm3 = _mm_cvtsi32_si128(qlp_coeff[6]);
+
+					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
+					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
+					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
+
+					for(i = 0; i < (int)data_len; i++) {
+						//sum = 0;
+						//sum  = qlp_coeff[6] * data[i-7];
+						xmm7 = _mm_cvtsi32_si128(data[i-7]);
+						xmm7 = _mm_mul_epu32(xmm7, xmm3);
+
+						//sum += qlp_coeff[5] * data[i-6];
+						//sum += qlp_coeff[4] * data[i-5];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epu32(xmm6, xmm2);
+						xmm7 = _mm_add_epi32(xmm7, xmm6);
+
+						//sum += qlp_coeff[3] * data[i-4];
+						//sum += qlp_coeff[2] * data[i-3];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epu32(xmm6, xmm1);
+						xmm7 = _mm_add_epi32(xmm7, xmm6);
+
+						//sum += qlp_coeff[1] * data[i-2];
+						//sum += qlp_coeff[0] * data[i-1];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epu32(xmm6, xmm0);
+						xmm7 = _mm_add_epi32(xmm7, xmm6);
+
+						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
+						RESIDUAL32_RESULT(xmm7);
+					}
+				}
+			}
+			else { /* order == 5, 6 */
+				if(order == 6) {
+					__m128i xmm0, xmm1, xmm2, xmm6, xmm7;
+					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
+					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
+					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
+
+					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
+					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
+					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
+
+					for(i = 0; i < (int)data_len; i++) {
+						//sum = 0;
+						//sum += qlp_coeff[5] * data[i-6];
+						//sum += qlp_coeff[4] * data[i-5];
+						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-6));
+						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
+						xmm7 = _mm_mul_epu32(xmm7, xmm2);
+
+						//sum += qlp_coeff[3] * data[i-4];
+						//sum += qlp_coeff[2] * data[i-3];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epu32(xmm6, xmm1);
+						xmm7 = _mm_add_epi32(xmm7, xmm6);
+
+						//sum += qlp_coeff[1] * data[i-2];
+						//sum += qlp_coeff[0] * data[i-1];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epu32(xmm6, xmm0);
+						xmm7 = _mm_add_epi32(xmm7, xmm6);
+
+						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
+						RESIDUAL32_RESULT(xmm7);
+					}
+				}
+				else { /* order == 5 */
+					__m128i xmm0, xmm1, xmm2, xmm6, xmm7;
+					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
+					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
+					xmm2 = _mm_cvtsi32_si128(qlp_coeff[4]);
+
+					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
+					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
+
+					for(i = 0; i < (int)data_len; i++) {
+						//sum = 0;
+						//sum  = qlp_coeff[4] * data[i-5];
+						xmm7 = _mm_cvtsi32_si128(data[i-5]);
+						xmm7 = _mm_mul_epu32(xmm7, xmm2);
+
+						//sum += qlp_coeff[3] * data[i-4];
+						//sum += qlp_coeff[2] * data[i-3];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epu32(xmm6, xmm1);
+						xmm7 = _mm_add_epi32(xmm7, xmm6);
+
+						//sum += qlp_coeff[1] * data[i-2];
+						//sum += qlp_coeff[0] * data[i-1];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epu32(xmm6, xmm0);
+						xmm7 = _mm_add_epi32(xmm7, xmm6);
+
+						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
+						RESIDUAL32_RESULT(xmm7);
+					}
+				}
+			}
+		}
+		else { /* order == 1, 2, 3, 4 */
+			if(order > 2) { /* order == 3, 4 */
+				if(order == 4) {
+					__m128i xmm0, xmm1, xmm6, xmm7;
+					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
+					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
+
+					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
+					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
+
+					for(i = 0; i < (int)data_len; i++) {
+						//sum = 0;
+						//sum += qlp_coeff[3] * data[i-4];
+						//sum += qlp_coeff[2] * data[i-3];
+						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-4));
+						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
+						xmm7 = _mm_mul_epu32(xmm7, xmm1);
+
+						//sum += qlp_coeff[1] * data[i-2];
+						//sum += qlp_coeff[0] * data[i-1];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epu32(xmm6, xmm0);
+						xmm7 = _mm_add_epi32(xmm7, xmm6);
+
+						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
+						RESIDUAL32_RESULT(xmm7);
+					}
+				}
+				else { /* order == 3 */
+					__m128i xmm0, xmm1, xmm6, xmm7;
+					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
+					xmm1 = _mm_cvtsi32_si128(qlp_coeff[2]);
+
+					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
+
+					for(i = 0; i < (int)data_len; i++) {
+						//sum = 0;
+						//sum  = qlp_coeff[2] * data[i-3];
+						xmm7 = _mm_cvtsi32_si128(data[i-3]);
+						xmm7 = _mm_mul_epu32(xmm7, xmm1);
+
+						//sum += qlp_coeff[1] * data[i-2];
+						//sum += qlp_coeff[0] * data[i-1];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epu32(xmm6, xmm0);
+						xmm7 = _mm_add_epi32(xmm7, xmm6);
+
+						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
+						RESIDUAL32_RESULT(xmm7);
+					}
+				}
+			}
+			else { /* order == 1, 2 */
+				if(order == 2) {
+					__m128i xmm0, xmm7;
+					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
+					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
+
+					for(i = 0; i < (int)data_len; i++) {
+						//sum = 0;
+						//sum += qlp_coeff[1] * data[i-2];
+						//sum += qlp_coeff[0] * data[i-1];
+						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-2));
+						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
+						xmm7 = _mm_mul_epu32(xmm7, xmm0);
+
+						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
+						RESIDUAL32_RESULT(xmm7);
+					}
+				}
+				else { /* order == 1 */
+					for(i = 0; i < (int)data_len; i++)
+						residual[i] = data[i] - ((qlp_coeff[0] * data[i-1]) >> lp_quantization);
+				}
+			}
+		}
+	}
+	else { /* order > 12 */
+		FLAC__int32 sum;
+		for(i = 0; i < (int)data_len; i++) {
+			sum = 0;
+			switch(order) {
+				case 32: sum += qlp_coeff[31] * data[i-32];
+				case 31: sum += qlp_coeff[30] * data[i-31];
+				case 30: sum += qlp_coeff[29] * data[i-30];
+				case 29: sum += qlp_coeff[28] * data[i-29];
+				case 28: sum += qlp_coeff[27] * data[i-28];
+				case 27: sum += qlp_coeff[26] * data[i-27];
+				case 26: sum += qlp_coeff[25] * data[i-26];
+				case 25: sum += qlp_coeff[24] * data[i-25];
+				case 24: sum += qlp_coeff[23] * data[i-24];
+				case 23: sum += qlp_coeff[22] * data[i-23];
+				case 22: sum += qlp_coeff[21] * data[i-22];
+				case 21: sum += qlp_coeff[20] * data[i-21];
+				case 20: sum += qlp_coeff[19] * data[i-20];
+				case 19: sum += qlp_coeff[18] * data[i-19];
+				case 18: sum += qlp_coeff[17] * data[i-18];
+				case 17: sum += qlp_coeff[16] * data[i-17];
+				case 16: sum += qlp_coeff[15] * data[i-16];
+				case 15: sum += qlp_coeff[14] * data[i-15];
+				case 14: sum += qlp_coeff[13] * data[i-14];
+				case 13: sum += qlp_coeff[12] * data[i-13];
+				         sum += qlp_coeff[11] * data[i-12];
+				         sum += qlp_coeff[10] * data[i-11];
+				         sum += qlp_coeff[ 9] * data[i-10];
+				         sum += qlp_coeff[ 8] * data[i- 9];
+				         sum += qlp_coeff[ 7] * data[i- 8];
+				         sum += qlp_coeff[ 6] * data[i- 7];
+				         sum += qlp_coeff[ 5] * data[i- 6];
+				         sum += qlp_coeff[ 4] * data[i- 5];
+				         sum += qlp_coeff[ 3] * data[i- 4];
+				         sum += qlp_coeff[ 2] * data[i- 3];
+				         sum += qlp_coeff[ 1] * data[i- 2];
+				         sum += qlp_coeff[ 0] * data[i- 1];
+			}
+			residual[i] = data[i] - (sum >> lp_quantization);
+		}
+	}
+}
+
+#if defined FLAC__CPU_IA32 && !defined FLAC__HAS_NASM /* unused for x64; not better than MMX asm */
+
+FLAC__SSE_TARGET("sse2")
+void FLAC__lpc_restore_signal_16_intrin_sse2(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
+{
+	if (order < 8 || order > 12) {
+		FLAC__lpc_restore_signal(residual, data_len, qlp_coeff, order, lp_quantization, data);
+		return;
+	}
+	if (data_len == 0)
+		return;
+
+	FLAC__ASSERT(order >= 8);
+	FLAC__ASSERT(order <= 12);
+
+	if(order > 8) { /* order == 9, 10, 11, 12 */
+		FLAC__int32 curr;
+		__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+		xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
+		xmm6 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
+		xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+8)); /* read 0 to 3 uninitialized coeffs... */
+		switch(order)                                          /* ...and zero them out */
+		{
+		case 9:
+			xmm1 = _mm_slli_si128(xmm1, 12); xmm1 = _mm_srli_si128(xmm1, 12); break;
+		case 10:
+			xmm1 = _mm_slli_si128(xmm1, 8); xmm1 = _mm_srli_si128(xmm1, 8); break;
+		case 11:
+			xmm1 = _mm_slli_si128(xmm1, 4); xmm1 = _mm_srli_si128(xmm1, 4); break;
+		}
+		xmm2 = _mm_setzero_si128();
+		xmm0 = _mm_packs_epi32(xmm0, xmm6);
+		xmm1 = _mm_packs_epi32(xmm1, xmm2);
+
+		xmm4 = _mm_loadu_si128((const __m128i*)(data-12));
+		xmm5 = _mm_loadu_si128((const __m128i*)(data-8));
+		xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
+		xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(0,1,2,3));
+		xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(0,1,2,3));
+		xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
+		xmm4 = _mm_packs_epi32(xmm4, xmm2);
+		xmm3 = _mm_packs_epi32(xmm3, xmm5);
+
+		xmm7 = _mm_slli_si128(xmm1, 2);
+		xmm7 = _mm_or_si128(xmm7, _mm_srli_si128(xmm0, 14));
+		xmm2 = _mm_slli_si128(xmm0, 2);
+
+		/* xmm0, xmm1: qlp_coeff
+			xmm2, xmm7: qlp_coeff << 16 bit
+			xmm3, xmm4: data */
+
+		xmm5 = _mm_madd_epi16(xmm4, xmm1);
+		xmm6 = _mm_madd_epi16(xmm3, xmm0);
+		xmm6 = _mm_add_epi32(xmm6, xmm5);
+		xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
+		xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
+
+		DATA16_RESULT(xmm6);
+
+		data_len--;
+
+		if(data_len % 2) {
+			xmm6 = _mm_srli_si128(xmm3, 14);
+			xmm4 = _mm_slli_si128(xmm4, 2);
+			xmm3 = _mm_slli_si128(xmm3, 2);
+			xmm4 = _mm_or_si128(xmm4, xmm6);
+			xmm3 = _mm_insert_epi16(xmm3, curr, 0);
+
+			xmm5 = _mm_madd_epi16(xmm4, xmm1);
+			xmm6 = _mm_madd_epi16(xmm3, xmm0);
+			xmm6 = _mm_add_epi32(xmm6, xmm5);
+			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
+			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
+
+			DATA16_RESULT(xmm6);
+
+			data_len--;
+		}
+
+		while(data_len) { /* data_len is a multiple of 2 */
+			/* 1 _mm_slli_si128 per data element less but we need shifted qlp_coeff in xmm2:xmm7 */
+			xmm6 = _mm_srli_si128(xmm3, 12);
+			xmm4 = _mm_slli_si128(xmm4, 4);
+			xmm3 = _mm_slli_si128(xmm3, 4);
+			xmm4 = _mm_or_si128(xmm4, xmm6);
+			xmm3 = _mm_insert_epi16(xmm3, curr, 1);
+
+			xmm5 = _mm_madd_epi16(xmm4, xmm7);
+			xmm6 = _mm_madd_epi16(xmm3, xmm2);
+			xmm6 = _mm_add_epi32(xmm6, xmm5);
+			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
+			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
+
+			DATA16_RESULT(xmm6);
+
+			xmm3 = _mm_insert_epi16(xmm3, curr, 0);
+
+			xmm5 = _mm_madd_epi16(xmm4, xmm1);
+			xmm6 = _mm_madd_epi16(xmm3, xmm0);
+			xmm6 = _mm_add_epi32(xmm6, xmm5);
+			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
+			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
+
+			DATA16_RESULT(xmm6);
+
+			data_len-=2;
+		}
+	} /* endif(order > 8) */
+	else
+	{
+		FLAC__int32 curr;
+		__m128i xmm0, xmm1, xmm3, xmm6;
+		xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
+		xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
+		xmm0 = _mm_packs_epi32(xmm0, xmm1);
+
+		xmm1 = _mm_loadu_si128((const __m128i*)(data-8));
+		xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
+		xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(0,1,2,3));
+		xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
+		xmm3 = _mm_packs_epi32(xmm3, xmm1);
+
+		/* xmm0: qlp_coeff
+			xmm3: data */
+
+		xmm6 = _mm_madd_epi16(xmm3, xmm0);
+		xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
+		xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
+
+		DATA16_RESULT(xmm6);
+
+		data_len--;
+
+		while(data_len) {
+			xmm3 = _mm_slli_si128(xmm3, 2);
+			xmm3 = _mm_insert_epi16(xmm3, curr, 0);
+
+			xmm6 = _mm_madd_epi16(xmm3, xmm0);
+			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
+			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
+
+			DATA16_RESULT(xmm6);
+
+			data_len--;
+		}
+	}
+}
+
+#endif /* defined FLAC__CPU_IA32 && !defined FLAC__HAS_NASM */
+
+#endif /* FLAC__SSE2_SUPPORTED */
+#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
+#endif /* FLAC__NO_ASM */
+#endif /* FLAC__INTEGER_ONLY_LIBRARY */
diff --git a/libFLAC/lpc_intrin_sse41.c b/libFLAC/lpc_intrin_sse41.c
new file mode 100644
index 0000000..bef73f4
--- /dev/null
+++ b/libFLAC/lpc_intrin_sse41.c
@@ -0,0 +1,1314 @@
+/* libFLAC - Free Lossless Audio Codec library
+ * Copyright (C) 2000-2009  Josh Coalson
+ * Copyright (C) 2011-2016  Xiph.Org Foundation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * - Neither the name of the Xiph.org Foundation nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#  include <config.h>
+#endif
+
+#include "private/cpu.h"
+
+#ifndef FLAC__INTEGER_ONLY_LIBRARY
+#ifndef FLAC__NO_ASM
+#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN
+#include "private/lpc.h"
+#ifdef FLAC__SSE4_1_SUPPORTED
+
+#include "FLAC/assert.h"
+#include "FLAC/format.h"
+
+#include <smmintrin.h> /* SSE4.1 */
+
+#if defined FLAC__CPU_IA32 /* unused for x64 */
+
+#define RESIDUAL64_RESULT(xmmN)  residual[i] = data[i] - _mm_cvtsi128_si32(_mm_srl_epi64(xmmN, cnt))
+#define RESIDUAL64_RESULT1(xmmN) residual[i] = data[i] - _mm_cvtsi128_si32(_mm_srli_epi64(xmmN, lp_quantization))
+
+FLAC__SSE_TARGET("sse4.1")
+void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
+{
+	int i;
+	__m128i cnt = _mm_cvtsi32_si128(lp_quantization);
+
+	FLAC__ASSERT(order > 0);
+	FLAC__ASSERT(order <= 32);
+	FLAC__ASSERT(lp_quantization <= 32); /* there's no _mm_sra_epi64() so we have to use _mm_srl_epi64() */
+
+	if(order <= 12) {
+		if(order > 8) { /* order == 9, 10, 11, 12 */
+			if(order > 10) { /* order == 11, 12 */
+				if(order == 12) {
+					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));  // 0  0  q[1]  q[0]
+					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));  // 0  0  q[3]  q[2]
+					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));  // 0  0  q[5]  q[4]
+					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));  // 0  0  q[7]  q[6]
+					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));  // 0  0  q[9]  q[8]
+					xmm5 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+10)); // 0  0  q[11] q[10]
+
+					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); // 0  q[1]  0  q[0]
+					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); // 0  q[3]  0  q[2]
+					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0)); // 0  q[5]  0  q[4]
+					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0)); // 0  q[7]  0  q[6]
+					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0)); // 0  q[9]  0  q[8]
+					xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(3,1,2,0)); // 0  q[11] 0  q[10]
+
+					for(i = 0; i < (int)data_len; i++) {
+						//sum = 0;
+						//sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
+						//sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
+						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-12));  // 0   0        d[i-11]  d[i-12]
+						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1)); // 0  d[i-12]   0        d[i-11]
+						xmm7 = _mm_mul_epi32(xmm7, xmm5);
+
+						//sum += qlp_coeff[9] * (FLAC__int64)data[i-10];
+						//sum += qlp_coeff[8] * (FLAC__int64)data[i-9];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epi32(xmm6, xmm4);
+						xmm7 = _mm_add_epi64(xmm7, xmm6);
+
+						//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
+						//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epi32(xmm6, xmm3);
+						xmm7 = _mm_add_epi64(xmm7, xmm6);
+
+						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
+						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epi32(xmm6, xmm2);
+						xmm7 = _mm_add_epi64(xmm7, xmm6);
+
+						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
+						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epi32(xmm6, xmm1);
+						xmm7 = _mm_add_epi64(xmm7, xmm6);
+
+						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
+						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epi32(xmm6, xmm0);
+						xmm7 = _mm_add_epi64(xmm7, xmm6);
+
+						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
+						RESIDUAL64_RESULT1(xmm7);
+					}
+				}
+				else { /* order == 11 */
+					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
+					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
+					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
+					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
+					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
+					xmm5 = _mm_cvtsi32_si128(qlp_coeff[10]);
+
+					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
+					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
+					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
+					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
+					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));
+
+					for(i = 0; i < (int)data_len; i++) {
+						//sum = 0;
+						//sum  = qlp_coeff[10] * (FLAC__int64)data[i-11];
+						xmm7 = _mm_cvtsi32_si128(data[i-11]);
+						xmm7 = _mm_mul_epi32(xmm7, xmm5);
+
+						//sum += qlp_coeff[9] * (FLAC__int64)data[i-10];
+						//sum += qlp_coeff[8] * (FLAC__int64)data[i-9];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epi32(xmm6, xmm4);
+						xmm7 = _mm_add_epi64(xmm7, xmm6);
+
+						//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
+						//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epi32(xmm6, xmm3);
+						xmm7 = _mm_add_epi64(xmm7, xmm6);
+
+						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
+						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epi32(xmm6, xmm2);
+						xmm7 = _mm_add_epi64(xmm7, xmm6);
+
+						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
+						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epi32(xmm6, xmm1);
+						xmm7 = _mm_add_epi64(xmm7, xmm6);
+
+						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
+						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epi32(xmm6, xmm0);
+						xmm7 = _mm_add_epi64(xmm7, xmm6);
+
+						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
+						RESIDUAL64_RESULT1(xmm7);
+					}
+				}
+			}
+			else { /* order == 9, 10 */
+				if(order == 10) {
+					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
+					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
+					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
+					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
+					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
+					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
+
+					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
+					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
+					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
+					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
+					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));
+
+					for(i = 0; i < (int)data_len; i++) {
+						//sum = 0;
+						//sum += qlp_coeff[9] * (FLAC__int64)data[i-10];
+						//sum += qlp_coeff[8] * (FLAC__int64)data[i-9];
+						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-10));
+						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
+						xmm7 = _mm_mul_epi32(xmm7, xmm4);
+
+						//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
+						//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epi32(xmm6, xmm3);
+						xmm7 = _mm_add_epi64(xmm7, xmm6);
+
+						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
+						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epi32(xmm6, xmm2);
+						xmm7 = _mm_add_epi64(xmm7, xmm6);
+
+						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
+						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epi32(xmm6, xmm1);
+						xmm7 = _mm_add_epi64(xmm7, xmm6);
+
+						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
+						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epi32(xmm6, xmm0);
+						xmm7 = _mm_add_epi64(xmm7, xmm6);
+
+						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
+						RESIDUAL64_RESULT(xmm7);
+					}
+				}
+				else { /* order == 9 */
+					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
+					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
+					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
+					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
+					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
+					xmm4 = _mm_cvtsi32_si128(qlp_coeff[8]);
+
+					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
+					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
+					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
+					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
+
+					for(i = 0; i < (int)data_len; i++) {
+						//sum = 0;
+						//sum  = qlp_coeff[8] * (FLAC__int64)data[i-9];
+						xmm7 = _mm_cvtsi32_si128(data[i-9]);
+						xmm7 = _mm_mul_epi32(xmm7, xmm4);
+
+						//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
+						//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epi32(xmm6, xmm3);
+						xmm7 = _mm_add_epi64(xmm7, xmm6);
+
+						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
+						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epi32(xmm6, xmm2);
+						xmm7 = _mm_add_epi64(xmm7, xmm6);
+
+						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
+						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epi32(xmm6, xmm1);
+						xmm7 = _mm_add_epi64(xmm7, xmm6);
+
+						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
+						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epi32(xmm6, xmm0);
+						xmm7 = _mm_add_epi64(xmm7, xmm6);
+
+						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
+						RESIDUAL64_RESULT(xmm7);
+					}
+				}
+			}
+		}
+		else if(order > 4) { /* order == 5, 6, 7, 8 */
+			if(order > 6) { /* order == 7, 8 */
+				if(order == 8) {
+					__m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
+					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
+					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
+					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
+					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
+
+					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
+					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
+					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
+					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
+
+					for(i = 0; i < (int)data_len; i++) {
+						//sum = 0;
+						//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
+						//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
+						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-8));
+						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
+						xmm7 = _mm_mul_epi32(xmm7, xmm3);
+
+						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
+						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epi32(xmm6, xmm2);
+						xmm7 = _mm_add_epi64(xmm7, xmm6);
+
+						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
+						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epi32(xmm6, xmm1);
+						xmm7 = _mm_add_epi64(xmm7, xmm6);
+
+						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
+						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epi32(xmm6, xmm0);
+						xmm7 = _mm_add_epi64(xmm7, xmm6);
+
+						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
+						RESIDUAL64_RESULT(xmm7);
+					}
+				}
+				else { /* order == 7 */
+					__m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
+					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
+					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
+					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
+					xmm3 = _mm_cvtsi32_si128(qlp_coeff[6]);
+
+					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
+					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
+					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
+
+					for(i = 0; i < (int)data_len; i++) {
+						//sum = 0;
+						//sum  = qlp_coeff[6] * (FLAC__int64)data[i-7];
+						xmm7 = _mm_cvtsi32_si128(data[i-7]);
+						xmm7 = _mm_mul_epi32(xmm7, xmm3);
+
+						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
+						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epi32(xmm6, xmm2);
+						xmm7 = _mm_add_epi64(xmm7, xmm6);
+
+						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
+						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epi32(xmm6, xmm1);
+						xmm7 = _mm_add_epi64(xmm7, xmm6);
+
+						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
+						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epi32(xmm6, xmm0);
+						xmm7 = _mm_add_epi64(xmm7, xmm6);
+
+						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
+						RESIDUAL64_RESULT(xmm7);
+					}
+				}
+			}
+			else { /* order == 5, 6 */
+				if(order == 6) {
+					__m128i xmm0, xmm1, xmm2, xmm6, xmm7;
+					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
+					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
+					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
+
+					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
+					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
+					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
+
+					for(i = 0; i < (int)data_len; i++) {
+						//sum = 0;
+						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
+						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
+						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-6));
+						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
+						xmm7 = _mm_mul_epi32(xmm7, xmm2);
+
+						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
+						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epi32(xmm6, xmm1);
+						xmm7 = _mm_add_epi64(xmm7, xmm6);
+
+						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
+						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epi32(xmm6, xmm0);
+						xmm7 = _mm_add_epi64(xmm7, xmm6);
+
+						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
+						RESIDUAL64_RESULT(xmm7);
+					}
+				}
+				else { /* order == 5 */
+					__m128i xmm0, xmm1, xmm2, xmm6, xmm7;
+					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
+					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
+					xmm2 = _mm_cvtsi32_si128(qlp_coeff[4]);
+
+					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
+					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
+
+					for(i = 0; i < (int)data_len; i++) {
+						//sum = 0;
+						//sum  = qlp_coeff[4] * (FLAC__int64)data[i-5];
+						xmm7 = _mm_cvtsi32_si128(data[i-5]);
+						xmm7 = _mm_mul_epi32(xmm7, xmm2);
+
+						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
+						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epi32(xmm6, xmm1);
+						xmm7 = _mm_add_epi64(xmm7, xmm6);
+
+						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
+						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epi32(xmm6, xmm0);
+						xmm7 = _mm_add_epi64(xmm7, xmm6);
+
+						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
+						RESIDUAL64_RESULT(xmm7);
+					}
+				}
+			}
+		}
+		else { /* order == 1, 2, 3, 4 */
+			if(order > 2) { /* order == 3, 4 */
+				if(order == 4) {
+					__m128i xmm0, xmm1, xmm6, xmm7;
+					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
+					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
+
+					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
+					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
+
+					for(i = 0; i < (int)data_len; i++) {
+						//sum = 0;
+						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
+						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
+						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-4));
+						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
+						xmm7 = _mm_mul_epi32(xmm7, xmm1);
+
+						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
+						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epi32(xmm6, xmm0);
+						xmm7 = _mm_add_epi64(xmm7, xmm6);
+
+						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
+						RESIDUAL64_RESULT(xmm7);
+					}
+				}
+				else { /* order == 3 */
+					__m128i xmm0, xmm1, xmm6, xmm7;
+					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
+					xmm1 = _mm_cvtsi32_si128(qlp_coeff[2]);
+
+					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
+
+					for(i = 0; i < (int)data_len; i++) {
+						//sum = 0;
+						//sum  = qlp_coeff[2] * (FLAC__int64)data[i-3];
+						xmm7 = _mm_cvtsi32_si128(data[i-3]);
+						xmm7 = _mm_mul_epi32(xmm7, xmm1);
+
+						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
+						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epi32(xmm6, xmm0);
+						xmm7 = _mm_add_epi64(xmm7, xmm6);
+
+						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
+						RESIDUAL64_RESULT(xmm7);
+					}
+				}
+			}
+			else { /* order == 1, 2 */
+				if(order == 2) {
+					__m128i xmm0, xmm7;
+					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
+					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
+
+					for(i = 0; i < (int)data_len; i++) {
+						//sum = 0;
+						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
+						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
+						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-2));
+						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
+						xmm7 = _mm_mul_epi32(xmm7, xmm0);
+
+						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
+						RESIDUAL64_RESULT(xmm7);
+					}
+				}
+				else { /* order == 1 */
+					__m128i xmm0, xmm7;
+					xmm0 = _mm_cvtsi32_si128(qlp_coeff[0]);
+
+					for(i = 0; i < (int)data_len; i++) {
+						//sum = qlp_coeff[0] * (FLAC__int64)data[i-1];
+						xmm7 = _mm_cvtsi32_si128(data[i-1]);
+						xmm7 = _mm_mul_epi32(xmm7, xmm0);
+						RESIDUAL64_RESULT(xmm7);
+					}
+				}
+			}
+		}
+	}
+	else { /* order > 12 */
+		FLAC__int64 sum;
+		for(i = 0; i < (int)data_len; i++) {
+			sum = 0;
+			switch(order) {
+				case 32: sum += qlp_coeff[31] * (FLAC__int64)data[i-32];
+				case 31: sum += qlp_coeff[30] * (FLAC__int64)data[i-31];
+				case 30: sum += qlp_coeff[29] * (FLAC__int64)data[i-30];
+				case 29: sum += qlp_coeff[28] * (FLAC__int64)data[i-29];
+				case 28: sum += qlp_coeff[27] * (FLAC__int64)data[i-28];
+				case 27: sum += qlp_coeff[26] * (FLAC__int64)data[i-27];
+				case 26: sum += qlp_coeff[25] * (FLAC__int64)data[i-26];
+				case 25: sum += qlp_coeff[24] * (FLAC__int64)data[i-25];
+				case 24: sum += qlp_coeff[23] * (FLAC__int64)data[i-24];
+				case 23: sum += qlp_coeff[22] * (FLAC__int64)data[i-23];
+				case 22: sum += qlp_coeff[21] * (FLAC__int64)data[i-22];
+				case 21: sum += qlp_coeff[20] * (FLAC__int64)data[i-21];
+				case 20: sum += qlp_coeff[19] * (FLAC__int64)data[i-20];
+				case 19: sum += qlp_coeff[18] * (FLAC__int64)data[i-19];
+				case 18: sum += qlp_coeff[17] * (FLAC__int64)data[i-18];
+				case 17: sum += qlp_coeff[16] * (FLAC__int64)data[i-17];
+				case 16: sum += qlp_coeff[15] * (FLAC__int64)data[i-16];
+				case 15: sum += qlp_coeff[14] * (FLAC__int64)data[i-15];
+				case 14: sum += qlp_coeff[13] * (FLAC__int64)data[i-14];
+				case 13: sum += qlp_coeff[12] * (FLAC__int64)data[i-13];
+				         sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
+				         sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
+				         sum += qlp_coeff[ 9] * (FLAC__int64)data[i-10];
+				         sum += qlp_coeff[ 8] * (FLAC__int64)data[i- 9];
+				         sum += qlp_coeff[ 7] * (FLAC__int64)data[i- 8];
+				         sum += qlp_coeff[ 6] * (FLAC__int64)data[i- 7];
+				         sum += qlp_coeff[ 5] * (FLAC__int64)data[i- 6];
+				         sum += qlp_coeff[ 4] * (FLAC__int64)data[i- 5];
+				         sum += qlp_coeff[ 3] * (FLAC__int64)data[i- 4];
+				         sum += qlp_coeff[ 2] * (FLAC__int64)data[i- 3];
+				         sum += qlp_coeff[ 1] * (FLAC__int64)data[i- 2];
+				         sum += qlp_coeff[ 0] * (FLAC__int64)data[i- 1];
+			}
+			residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization);
+		}
+	}
+}
+
+FLAC__SSE_TARGET("sse4.1")
+void FLAC__lpc_restore_signal_wide_intrin_sse41(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
+{
+	int i;
+	__m128i cnt = _mm_cvtsi32_si128(lp_quantization);
+
+	if (!data_len)
+		return;
+
+	FLAC__ASSERT(order > 0);
+	FLAC__ASSERT(order <= 32);
+	FLAC__ASSERT(lp_quantization <= 32); /* there's no _mm_sra_epi64() so we have to use _mm_srl_epi64() */
+
+	if(order <= 12) {
+		if(order > 8) { /* order == 9, 10, 11, 12 */
+			if(order > 10) { /* order == 11, 12 */
+				__m128i qlp[6], dat[6];
+				__m128i summ, temp;
+				qlp[0] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));	// 0  0  q[1]  q[0]
+				qlp[1] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));	// 0  0  q[3]  q[2]
+				qlp[2] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));	// 0  0  q[5]  q[4]
+				qlp[3] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));	// 0  0  q[7]  q[6]
+				qlp[4] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));	// 0  0  q[9]  q[8]
+				if (order == 12)
+					qlp[5] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+10));	// 0  0  q[11] q[10]
+				else
+					qlp[5] = _mm_cvtsi32_si128(qlp_coeff[10]);					// 0  0  0     q[10]
+
+				qlp[0] = _mm_shuffle_epi32(qlp[0], _MM_SHUFFLE(2,0,3,1));	// 0  q[0]  0  q[1]
+				qlp[1] = _mm_shuffle_epi32(qlp[1], _MM_SHUFFLE(2,0,3,1));	// 0  q[2]  0  q[3]
+				qlp[2] = _mm_shuffle_epi32(qlp[2], _MM_SHUFFLE(2,0,3,1));	// 0  q[4]  0  q[5]
+				qlp[3] = _mm_shuffle_epi32(qlp[3], _MM_SHUFFLE(2,0,3,1));	// 0  q[5]  0  q[7]
+				qlp[4] = _mm_shuffle_epi32(qlp[4], _MM_SHUFFLE(2,0,3,1));	// 0  q[8]  0  q[9]
+				qlp[5] = _mm_shuffle_epi32(qlp[5], _MM_SHUFFLE(2,0,3,1));	// 0  q[10] 0  q[11]
+
+				dat[5] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-12)));	// ?  d[i-11]  ?  d[i-12]
+				dat[4] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-10)));	// ?  d[i-9]   ?  d[i-10]
+				dat[3] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-8 )));	// ?  d[i-7]   ?  d[i-8]
+				dat[2] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-6 )));	// ?  d[i-5]   ?  d[i-6]
+				dat[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-4 )));	// ?  d[i-3]   ?  d[i-4]
+				dat[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-2 )));	// ?  d[i-1]   ?  d[i-2]
+
+				summ =                     _mm_mul_epi32(dat[5], qlp[5]) ;
+				summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[4], qlp[4]));
+				summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[3], qlp[3]));
+				summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2], qlp[2]));
+				summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
+				summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));
+
+				summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));	// ?_64  sum_64
+				summ = _mm_srl_epi64(summ, cnt);						// ?_64  (sum >> lp_quantization)_64  ==  ?_32  ?_32  ?_32  (sum >> lp_quantization)_32
+				temp = _mm_cvtsi32_si128(residual[0]);					// 0  0  0  r[i]
+				temp = _mm_add_epi32(temp, summ);						// ?  ?  ?  d[i]
+				data[0] = _mm_cvtsi128_si32(temp);
+
+				for(i = 1; i < (int)data_len; i++) {
+					dat[5] = _mm_alignr_epi8(dat[4], dat[5], 8);	//  ?  d[i-10] ?  d[i-11]
+					dat[4] = _mm_alignr_epi8(dat[3], dat[4], 8);	//  ?  d[i-8]  ?  d[i-9]
+					dat[3] = _mm_alignr_epi8(dat[2], dat[3], 8);	//  ?  d[i-6]  ?  d[i-7]
+					dat[2] = _mm_alignr_epi8(dat[1], dat[2], 8);	//  ?  d[i-4]  ?  d[i-5]
+					dat[1] = _mm_alignr_epi8(dat[0], dat[1], 8);	//  ?  d[i-2]  ?  d[i-3]
+					dat[0] = _mm_alignr_epi8(temp,   dat[0], 8);	//  ?  d[i  ]  ?  d[i-1]
+
+					summ =                     _mm_mul_epi32(dat[5], qlp[5]) ;
+					summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[4], qlp[4]));
+					summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[3], qlp[3]));
+					summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2], qlp[2]));
+					summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
+					summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));
+
+					summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));	// ?_64  sum_64
+					summ = _mm_srl_epi64(summ, cnt);						// ?_64  (sum >> lp_quantization)_64  ==  ?_32  ?_32  ?_32  (sum >> lp_quantization)_32
+					temp = _mm_cvtsi32_si128(residual[i]);					// 0  0  0  r[i]
+					temp = _mm_add_epi32(temp, summ);						// ?  ?  ?  d[i]
+					data[i] = _mm_cvtsi128_si32(temp);
+				}
+			}
+			else { /* order == 9, 10 */
+				__m128i qlp[5], dat[5];
+				__m128i summ, temp;
+				qlp[0] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
+				qlp[1] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
+				qlp[2] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
+				qlp[3] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
+				if (order == 10)
+					qlp[4] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
+				else
+					qlp[4] = _mm_cvtsi32_si128(qlp_coeff[8]);
+
+				qlp[0] = _mm_shuffle_epi32(qlp[0], _MM_SHUFFLE(2,0,3,1));
+				qlp[1] = _mm_shuffle_epi32(qlp[1], _MM_SHUFFLE(2,0,3,1));
+				qlp[2] = _mm_shuffle_epi32(qlp[2], _MM_SHUFFLE(2,0,3,1));
+				qlp[3] = _mm_shuffle_epi32(qlp[3], _MM_SHUFFLE(2,0,3,1));
+				qlp[4] = _mm_shuffle_epi32(qlp[4], _MM_SHUFFLE(2,0,3,1));
+
+				dat[4] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-10)));
+				dat[3] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-8 )));
+				dat[2] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-6 )));
+				dat[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-4 )));
+				dat[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-2 )));
+
+				summ =                     _mm_mul_epi32(dat[4], qlp[4]) ;
+				summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[3], qlp[3]));
+				summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2], qlp[2]));
+				summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
+				summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));
+
+				summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
+				summ = _mm_srl_epi64(summ, cnt);
+				temp = _mm_cvtsi32_si128(residual[0]);
+				temp = _mm_add_epi32(temp, summ);
+				data[0] = _mm_cvtsi128_si32(temp);
+
+				for(i = 1; i < (int)data_len; i++) {
+					dat[4] = _mm_alignr_epi8(dat[3], dat[4], 8);
+					dat[3] = _mm_alignr_epi8(dat[2], dat[3], 8);
+					dat[2] = _mm_alignr_epi8(dat[1], dat[2], 8);
+					dat[1] = _mm_alignr_epi8(dat[0], dat[1], 8);
+					dat[0] = _mm_alignr_epi8(temp,   dat[0], 8);
+
+					summ =                     _mm_mul_epi32(dat[4], qlp[4]) ;
+					summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[3], qlp[3]));
+					summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2], qlp[2]));
+					summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
+					summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));
+
+					summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
+					summ = _mm_srl_epi64(summ, cnt);
+					temp = _mm_cvtsi32_si128(residual[i]);
+					temp = _mm_add_epi32(temp, summ);
+					data[i] = _mm_cvtsi128_si32(temp);
+				}
+			}
+		}
+		else if(order > 4) { /* order == 5, 6, 7, 8 */
+			if(order > 6) { /* order == 7, 8 */
+				__m128i qlp[4], dat[4];
+				__m128i summ, temp;
+				qlp[0] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
+				qlp[1] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
+				qlp[2] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
+				if (order == 8)
+					qlp[3] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
+				else
+					qlp[3] = _mm_cvtsi32_si128(qlp_coeff[6]);
+
+				qlp[0] = _mm_shuffle_epi32(qlp[0], _MM_SHUFFLE(2,0,3,1));
+				qlp[1] = _mm_shuffle_epi32(qlp[1], _MM_SHUFFLE(2,0,3,1));
+				qlp[2] = _mm_shuffle_epi32(qlp[2], _MM_SHUFFLE(2,0,3,1));
+				qlp[3] = _mm_shuffle_epi32(qlp[3], _MM_SHUFFLE(2,0,3,1));
+
+				dat[3] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-8 )));
+				dat[2] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-6 )));
+				dat[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-4 )));
+				dat[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-2 )));
+
+				summ =                     _mm_mul_epi32(dat[3], qlp[3]) ;
+				summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2], qlp[2]));
+				summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
+				summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));
+
+				summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
+				summ = _mm_srl_epi64(summ, cnt);
+				temp = _mm_cvtsi32_si128(residual[0]);
+				temp = _mm_add_epi32(temp, summ);
+				data[0] = _mm_cvtsi128_si32(temp);
+
+				for(i = 1; i < (int)data_len; i++) {
+					dat[3] = _mm_alignr_epi8(dat[2], dat[3], 8);
+					dat[2] = _mm_alignr_epi8(dat[1], dat[2], 8);
+					dat[1] = _mm_alignr_epi8(dat[0], dat[1], 8);
+					dat[0] = _mm_alignr_epi8(temp,   dat[0], 8);
+
+					summ =                     _mm_mul_epi32(dat[3], qlp[3]) ;
+					summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2], qlp[2]));
+					summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
+					summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));
+
+					summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
+					summ = _mm_srl_epi64(summ, cnt);
+					temp = _mm_cvtsi32_si128(residual[i]);
+					temp = _mm_add_epi32(temp, summ);
+					data[i] = _mm_cvtsi128_si32(temp);
+				}
+			}
+			else { /* order == 5, 6 */
+				__m128i qlp[3], dat[3];
+				__m128i summ, temp;
+				qlp[0] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
+				qlp[1] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
+				if (order == 6)
+					qlp[2] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
+				else
+					qlp[2] = _mm_cvtsi32_si128(qlp_coeff[4]);
+
+				qlp[0] = _mm_shuffle_epi32(qlp[0], _MM_SHUFFLE(2,0,3,1));
+				qlp[1] = _mm_shuffle_epi32(qlp[1], _MM_SHUFFLE(2,0,3,1));
+				qlp[2] = _mm_shuffle_epi32(qlp[2], _MM_SHUFFLE(2,0,3,1));
+
+				dat[2] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-6 )));
+				dat[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-4 )));
+				dat[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-2 )));
+
+				summ =                     _mm_mul_epi32(dat[2], qlp[2]) ;
+				summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
+				summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));
+
+				summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
+				summ = _mm_srl_epi64(summ, cnt);
+				temp = _mm_cvtsi32_si128(residual[0]);
+				temp = _mm_add_epi32(temp, summ);
+				data[0] = _mm_cvtsi128_si32(temp);
+
+				for(i = 1; i < (int)data_len; i++) {
+					dat[2] = _mm_alignr_epi8(dat[1], dat[2], 8);
+					dat[1] = _mm_alignr_epi8(dat[0], dat[1], 8);
+					dat[0] = _mm_alignr_epi8(temp,   dat[0], 8);
+
+					summ =                     _mm_mul_epi32(dat[2], qlp[2]) ;
+					summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
+					summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));
+
+					summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
+					summ = _mm_srl_epi64(summ, cnt);
+					temp = _mm_cvtsi32_si128(residual[i]);
+					temp = _mm_add_epi32(temp, summ);
+					data[i] = _mm_cvtsi128_si32(temp);
+				}
+			}
+		}
+		else { /* order == 1, 2, 3, 4 */
+			if(order > 2) { /* order == 3, 4 */
+				__m128i qlp[2], dat[2];
+				__m128i summ, temp;
+				qlp[0] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
+				if (order == 4)
+					qlp[1] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
+				else
+					qlp[1] = _mm_cvtsi32_si128(qlp_coeff[2]);
+
+				qlp[0] = _mm_shuffle_epi32(qlp[0], _MM_SHUFFLE(2,0,3,1));
+				qlp[1] = _mm_shuffle_epi32(qlp[1], _MM_SHUFFLE(2,0,3,1));
+
+				dat[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-4 )));
+				dat[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-2 )));
+
+				summ =                     _mm_mul_epi32(dat[1], qlp[1]) ;
+				summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));
+
+				summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
+				summ = _mm_srl_epi64(summ, cnt);
+				temp = _mm_cvtsi32_si128(residual[0]);
+				temp = _mm_add_epi32(temp, summ);
+				data[0] = _mm_cvtsi128_si32(temp);
+
+				for(i = 1; i < (int)data_len; i++) {
+					dat[1] = _mm_alignr_epi8(dat[0], dat[1], 8);
+					dat[0] = _mm_alignr_epi8(temp,   dat[0], 8);
+
+					summ =                     _mm_mul_epi32(dat[1], qlp[1]) ;
+					summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));
+
+					summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
+					summ = _mm_srl_epi64(summ, cnt);
+					temp = _mm_cvtsi32_si128(residual[i]);
+					temp = _mm_add_epi32(temp, summ);
+					data[i] = _mm_cvtsi128_si32(temp);
+				}
+			}
+			else { /* order == 1, 2 */
+				if(order == 2) {
+					__m128i qlp0, dat0;
+					__m128i summ, temp;
+					qlp0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff));
+					qlp0 = _mm_shuffle_epi32(qlp0, _MM_SHUFFLE(2,0,3,1));
+
+					dat0 = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-2 )));
+
+					summ = _mm_mul_epi32(dat0, qlp0) ;
+
+					summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
+					summ = _mm_srl_epi64(summ, cnt);
+					temp = _mm_cvtsi32_si128(residual[0]);
+					temp = _mm_add_epi32(temp, summ);
+					data[0] = _mm_cvtsi128_si32(temp);
+
+					for(i = 1; i < (int)data_len; i++) {
+						dat0 = _mm_alignr_epi8(temp, dat0, 8);
+
+						summ = _mm_mul_epi32(dat0, qlp0) ;
+
+						summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
+						summ = _mm_srl_epi64(summ, cnt);
+						temp = _mm_cvtsi32_si128(residual[i]);
+						temp = _mm_add_epi32(temp, summ);
+						data[i] = _mm_cvtsi128_si32(temp);
+					}
+				}
+				else { /* order == 1 */
+					__m128i qlp0;
+					__m128i summ, temp;
+					qlp0 = _mm_cvtsi32_si128(qlp_coeff[0]);
+					temp = _mm_cvtsi32_si128(data[-1]);
+
+					summ = _mm_mul_epi32(temp, qlp0);
+					summ = _mm_srl_epi64(summ, cnt);
+					temp = _mm_cvtsi32_si128(residual[0]);
+					temp = _mm_add_epi32(temp, summ);
+					data[0] = _mm_cvtsi128_si32(temp);
+
+					for(i = 1; i < (int)data_len; i++) {
+						summ = _mm_mul_epi32(temp, qlp0) ;
+						summ = _mm_srl_epi64(summ, cnt);
+						temp = _mm_cvtsi32_si128(residual[i]);
+						temp = _mm_add_epi32(temp, summ);
+						data[i] = _mm_cvtsi128_si32(temp);
+					}
+				}
+			}
+		}
+	}
+	else { /* order > 12 */
+		FLAC__int64 sum;
+		for(i = 0; i < (int)data_len; i++) {
+			sum = 0;
+			switch(order) {
+				case 32: sum += qlp_coeff[31] * (FLAC__int64)data[i-32];
+				case 31: sum += qlp_coeff[30] * (FLAC__int64)data[i-31];
+				case 30: sum += qlp_coeff[29] * (FLAC__int64)data[i-30];
+				case 29: sum += qlp_coeff[28] * (FLAC__int64)data[i-29];
+				case 28: sum += qlp_coeff[27] * (FLAC__int64)data[i-28];
+				case 27: sum += qlp_coeff[26] * (FLAC__int64)data[i-27];
+				case 26: sum += qlp_coeff[25] * (FLAC__int64)data[i-26];
+				case 25: sum += qlp_coeff[24] * (FLAC__int64)data[i-25];
+				case 24: sum += qlp_coeff[23] * (FLAC__int64)data[i-24];
+				case 23: sum += qlp_coeff[22] * (FLAC__int64)data[i-23];
+				case 22: sum += qlp_coeff[21] * (FLAC__int64)data[i-22];
+				case 21: sum += qlp_coeff[20] * (FLAC__int64)data[i-21];
+				case 20: sum += qlp_coeff[19] * (FLAC__int64)data[i-20];
+				case 19: sum += qlp_coeff[18] * (FLAC__int64)data[i-19];
+				case 18: sum += qlp_coeff[17] * (FLAC__int64)data[i-18];
+				case 17: sum += qlp_coeff[16] * (FLAC__int64)data[i-17];
+				case 16: sum += qlp_coeff[15] * (FLAC__int64)data[i-16];
+				case 15: sum += qlp_coeff[14] * (FLAC__int64)data[i-15];
+				case 14: sum += qlp_coeff[13] * (FLAC__int64)data[i-14];
+				case 13: sum += qlp_coeff[12] * (FLAC__int64)data[i-13];
+				         sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
+				         sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
+				         sum += qlp_coeff[ 9] * (FLAC__int64)data[i-10];
+				         sum += qlp_coeff[ 8] * (FLAC__int64)data[i- 9];
+				         sum += qlp_coeff[ 7] * (FLAC__int64)data[i- 8];
+				         sum += qlp_coeff[ 6] * (FLAC__int64)data[i- 7];
+				         sum += qlp_coeff[ 5] * (FLAC__int64)data[i- 6];
+				         sum += qlp_coeff[ 4] * (FLAC__int64)data[i- 5];
+				         sum += qlp_coeff[ 3] * (FLAC__int64)data[i- 4];
+				         sum += qlp_coeff[ 2] * (FLAC__int64)data[i- 3];
+				         sum += qlp_coeff[ 1] * (FLAC__int64)data[i- 2];
+				         sum += qlp_coeff[ 0] * (FLAC__int64)data[i- 1];
+			}
+			data[i] = residual[i] + (FLAC__int32)(sum >> lp_quantization);
+		}
+	}
+}
+
+#endif /* defined FLAC__CPU_IA32 */
+
+FLAC__SSE_TARGET("sse4.1")
+void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse41(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
+{
+	int i;
+	FLAC__int32 sum;
+	__m128i cnt = _mm_cvtsi32_si128(lp_quantization);
+
+	FLAC__ASSERT(order > 0);
+	FLAC__ASSERT(order <= 32);
+
+	if(order <= 12) {
+		if(order > 8) {
+			if(order > 10) {
+				if(order == 12) {
+					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
+					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
+					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
+					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
+					q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
+					q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
+					q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
+					q6 = _mm_cvtsi32_si128(qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
+					q7 = _mm_cvtsi32_si128(qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
+					q8 = _mm_cvtsi32_si128(qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
+					q9 = _mm_cvtsi32_si128(qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));
+					q10 = _mm_cvtsi32_si128(qlp_coeff[10]); q10 = _mm_shuffle_epi32(q10, _MM_SHUFFLE(0,0,0,0));
+					q11 = _mm_cvtsi32_si128(qlp_coeff[11]); q11 = _mm_shuffle_epi32(q11, _MM_SHUFFLE(0,0,0,0));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m128i summ, mull;
+						summ = _mm_mullo_epi32(q11, _mm_loadu_si128((const __m128i*)(data+i-12)));
+						mull = _mm_mullo_epi32(q10, _mm_loadu_si128((const __m128i*)(data+i-11))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q9, _mm_loadu_si128((const __m128i*)(data+i-10))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
+						summ = _mm_sra_epi32(summ, cnt);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
+					}
+				}
+				else { /* order == 11 */
+					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10;
+					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
+					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
+					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
+					q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
+					q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
+					q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
+					q6 = _mm_cvtsi32_si128(qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
+					q7 = _mm_cvtsi32_si128(qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
+					q8 = _mm_cvtsi32_si128(qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
+					q9 = _mm_cvtsi32_si128(qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));
+					q10 = _mm_cvtsi32_si128(qlp_coeff[10]); q10 = _mm_shuffle_epi32(q10, _MM_SHUFFLE(0,0,0,0));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m128i summ, mull;
+						summ = _mm_mullo_epi32(q10, _mm_loadu_si128((const __m128i*)(data+i-11)));
+						mull = _mm_mullo_epi32(q9, _mm_loadu_si128((const __m128i*)(data+i-10))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
+						summ = _mm_sra_epi32(summ, cnt);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
+					}
+				}
+			}
+			else {
+				if(order == 10) {
+					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9;
+					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
+					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
+					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
+					q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
+					q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
+					q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
+					q6 = _mm_cvtsi32_si128(qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
+					q7 = _mm_cvtsi32_si128(qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
+					q8 = _mm_cvtsi32_si128(qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
+					q9 = _mm_cvtsi32_si128(qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m128i summ, mull;
+						summ = _mm_mullo_epi32(q9, _mm_loadu_si128((const __m128i*)(data+i-10)));
+						mull = _mm_mullo_epi32(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
+						summ = _mm_sra_epi32(summ, cnt);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
+					}
+				}
+				else { /* order == 9 */
+					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8;
+					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
+					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
+					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
+					q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
+					q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
+					q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
+					q6 = _mm_cvtsi32_si128(qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
+					q7 = _mm_cvtsi32_si128(qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
+					q8 = _mm_cvtsi32_si128(qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m128i summ, mull;
+						summ = _mm_mullo_epi32(q8, _mm_loadu_si128((const __m128i*)(data+i-9)));
+						mull = _mm_mullo_epi32(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
+						summ = _mm_sra_epi32(summ, cnt);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
+					}
+				}
+			}
+		}
+		else if(order > 4) {
+			if(order > 6) {
+				if(order == 8) {
+					__m128i q0, q1, q2, q3, q4, q5, q6, q7;
+					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
+					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
+					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
+					q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
+					q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
+					q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
+					q6 = _mm_cvtsi32_si128(qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
+					q7 = _mm_cvtsi32_si128(qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m128i summ, mull;
+						summ = _mm_mullo_epi32(q7, _mm_loadu_si128((const __m128i*)(data+i-8)));
+						mull = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
+						summ = _mm_sra_epi32(summ, cnt);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
+					}
+				}
+				else { /* order == 7 */
+					__m128i q0, q1, q2, q3, q4, q5, q6;
+					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
+					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
+					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
+					q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
+					q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
+					q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
+					q6 = _mm_cvtsi32_si128(qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m128i summ, mull;
+						summ = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(data+i-7)));
+						mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
+						summ = _mm_sra_epi32(summ, cnt);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
+					}
+				}
+			}
+			else {
+				if(order == 6) {
+					__m128i q0, q1, q2, q3, q4, q5;
+					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
+					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
+					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
+					q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
+					q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
+					q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m128i summ, mull;
+						summ = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(data+i-6)));
+						mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
+						summ = _mm_sra_epi32(summ, cnt);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
+					}
+				}
+				else { /* order == 5 */
+					__m128i q0, q1, q2, q3, q4;
+					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
+					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
+					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
+					q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
+					q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m128i summ, mull;
+						summ = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5)));
+						mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
+						summ = _mm_sra_epi32(summ, cnt);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
+					}
+				}
+			}
+		}
+		else {
+			if(order > 2) {
+				if(order == 4) {
+					__m128i q0, q1, q2, q3;
+					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
+					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
+					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
+					q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m128i summ, mull;
+						summ = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4)));
+						mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
+						summ = _mm_sra_epi32(summ, cnt);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
+					}
+				}
+				else { /* order == 3 */
+					__m128i q0, q1, q2;
+					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
+					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
+					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m128i summ, mull;
+						summ = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3)));
+						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
+						summ = _mm_sra_epi32(summ, cnt);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
+					}
+				}
+			}
+			else {
+				if(order == 2) {
+					__m128i q0, q1;
+					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
+					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m128i summ, mull;
+						summ = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2)));
+						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
+						summ = _mm_sra_epi32(summ, cnt);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
+					}
+				}
+				else { /* order == 1 */
+					__m128i q0;
+					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m128i summ;
+						summ = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1)));
+						summ = _mm_sra_epi32(summ, cnt);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
+					}
+				}
+			}
+		}
+		for(; i < (int)data_len; i++) {
+			sum = 0;
+			switch(order) {
+				case 12: sum += qlp_coeff[11] * data[i-12];
+				case 11: sum += qlp_coeff[10] * data[i-11];
+				case 10: sum += qlp_coeff[ 9] * data[i-10];
+				case 9:  sum += qlp_coeff[ 8] * data[i- 9];
+				case 8:  sum += qlp_coeff[ 7] * data[i- 8];
+				case 7:  sum += qlp_coeff[ 6] * data[i- 7];
+				case 6:  sum += qlp_coeff[ 5] * data[i- 6];
+				case 5:  sum += qlp_coeff[ 4] * data[i- 5];
+				case 4:  sum += qlp_coeff[ 3] * data[i- 4];
+				case 3:  sum += qlp_coeff[ 2] * data[i- 3];
+				case 2:  sum += qlp_coeff[ 1] * data[i- 2];
+				case 1:  sum += qlp_coeff[ 0] * data[i- 1];
+			}
+			residual[i] = data[i] - (sum >> lp_quantization);
+		}
+	}
+	else { /* order > 12 */
+		for(i = 0; i < (int)data_len; i++) {
+			sum = 0;
+			switch(order) {
+				case 32: sum += qlp_coeff[31] * data[i-32];
+				case 31: sum += qlp_coeff[30] * data[i-31];
+				case 30: sum += qlp_coeff[29] * data[i-30];
+				case 29: sum += qlp_coeff[28] * data[i-29];
+				case 28: sum += qlp_coeff[27] * data[i-28];
+				case 27: sum += qlp_coeff[26] * data[i-27];
+				case 26: sum += qlp_coeff[25] * data[i-26];
+				case 25: sum += qlp_coeff[24] * data[i-25];
+				case 24: sum += qlp_coeff[23] * data[i-24];
+				case 23: sum += qlp_coeff[22] * data[i-23];
+				case 22: sum += qlp_coeff[21] * data[i-22];
+				case 21: sum += qlp_coeff[20] * data[i-21];
+				case 20: sum += qlp_coeff[19] * data[i-20];
+				case 19: sum += qlp_coeff[18] * data[i-19];
+				case 18: sum += qlp_coeff[17] * data[i-18];
+				case 17: sum += qlp_coeff[16] * data[i-17];
+				case 16: sum += qlp_coeff[15] * data[i-16];
+				case 15: sum += qlp_coeff[14] * data[i-15];
+				case 14: sum += qlp_coeff[13] * data[i-14];
+				case 13: sum += qlp_coeff[12] * data[i-13];
+				         sum += qlp_coeff[11] * data[i-12];
+				         sum += qlp_coeff[10] * data[i-11];
+				         sum += qlp_coeff[ 9] * data[i-10];
+				         sum += qlp_coeff[ 8] * data[i- 9];
+				         sum += qlp_coeff[ 7] * data[i- 8];
+				         sum += qlp_coeff[ 6] * data[i- 7];
+				         sum += qlp_coeff[ 5] * data[i- 6];
+				         sum += qlp_coeff[ 4] * data[i- 5];
+				         sum += qlp_coeff[ 3] * data[i- 4];
+				         sum += qlp_coeff[ 2] * data[i- 3];
+				         sum += qlp_coeff[ 1] * data[i- 2];
+				         sum += qlp_coeff[ 0] * data[i- 1];
+			}
+			residual[i] = data[i] - (sum >> lp_quantization);
+		}
+	}
+}
+
+#endif /* FLAC__SSE4_1_SUPPORTED */
+#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
+#endif /* FLAC__NO_ASM */
+#endif /* FLAC__INTEGER_ONLY_LIBRARY */
diff --git a/libFLAC/memory.c b/libFLAC/memory.c
index d22df70..a8ebd10 100644
--- a/libFLAC/memory.c
+++ b/libFLAC/memory.c
@@ -1,6 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
  * Copyright (C) 2001-2009  Josh Coalson
- * Copyright (C) 2011-2014  Xiph.Org Foundation
+ * Copyright (C) 2011-2016  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
diff --git a/libFLAC/metadata_iterators.c b/libFLAC/metadata_iterators.c
new file mode 100644
index 0000000..0a84d03
--- /dev/null
+++ b/libFLAC/metadata_iterators.c
@@ -0,0 +1,3481 @@
+/* libFLAC - Free Lossless Audio Codec library
+ * Copyright (C) 2001-2009  Josh Coalson
+ * Copyright (C) 2011-2016  Xiph.Org Foundation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * - Neither the name of the Xiph.org Foundation nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#  include <config.h>
+#endif
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdarg.h>
+
+#include <sys/stat.h> /* for stat(), maybe chmod() */
+
+#include "private/metadata.h"
+
+#include "FLAC/assert.h"
+#include "FLAC/stream_decoder.h"
+#include "share/alloc.h"
+#include "share/compat.h"
+#include "share/macros.h"
+#include "share/safe_str.h"
+#include "private/macros.h"
+#include "private/memory.h"
+
+/* Alias the first (in share/alloc.h) to the second (in src/libFLAC/memory.c). */
+#define safe_malloc_mul_2op_ safe_malloc_mul_2op_p
+
+/****************************************************************************
+ *
+ * Local function declarations
+ *
+ ***************************************************************************/
+
+static void pack_uint32_(FLAC__uint32 val, FLAC__byte *b, unsigned bytes);
+static void pack_uint32_little_endian_(FLAC__uint32 val, FLAC__byte *b, unsigned bytes);
+static void pack_uint64_(FLAC__uint64 val, FLAC__byte *b, unsigned bytes);
+static FLAC__uint32 unpack_uint32_(FLAC__byte *b, unsigned bytes);
+static FLAC__uint32 unpack_uint32_little_endian_(FLAC__byte *b, unsigned bytes);
+static FLAC__uint64 unpack_uint64_(FLAC__byte *b, unsigned bytes);
+
+static FLAC__bool read_metadata_block_header_(FLAC__Metadata_SimpleIterator *iterator);
+static FLAC__bool read_metadata_block_data_(FLAC__Metadata_SimpleIterator *iterator, FLAC__StreamMetadata *block);
+static FLAC__bool read_metadata_block_header_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Read read_cb, FLAC__bool *is_last, FLAC__MetadataType *type, unsigned *length);
+static FLAC__Metadata_SimpleIteratorStatus read_metadata_block_data_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Read read_cb, FLAC__IOCallback_Seek seek_cb, FLAC__StreamMetadata *block);
+static FLAC__Metadata_SimpleIteratorStatus read_metadata_block_data_streaminfo_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Read read_cb, FLAC__StreamMetadata_StreamInfo *block);
+static FLAC__Metadata_SimpleIteratorStatus read_metadata_block_data_padding_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Seek seek_cb, FLAC__StreamMetadata_Padding *block, unsigned block_length);
+static FLAC__Metadata_SimpleIteratorStatus read_metadata_block_data_application_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Read read_cb, FLAC__StreamMetadata_Application *block, unsigned block_length);
+static FLAC__Metadata_SimpleIteratorStatus read_metadata_block_data_seektable_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Read read_cb, FLAC__StreamMetadata_SeekTable *block, unsigned block_length);
+static FLAC__Metadata_SimpleIteratorStatus read_metadata_block_data_vorbis_comment_entry_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Read read_cb, FLAC__StreamMetadata_VorbisComment_Entry *entry, unsigned max_length);
+static FLAC__Metadata_SimpleIteratorStatus read_metadata_block_data_vorbis_comment_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Read read_cb, FLAC__IOCallback_Seek seek_cb, FLAC__StreamMetadata_VorbisComment *block, unsigned block_length);
+static FLAC__Metadata_SimpleIteratorStatus read_metadata_block_data_cuesheet_track_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Read read_cb, FLAC__StreamMetadata_CueSheet_Track *track);
+static FLAC__Metadata_SimpleIteratorStatus read_metadata_block_data_cuesheet_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Read read_cb, FLAC__StreamMetadata_CueSheet *block);
+static FLAC__Metadata_SimpleIteratorStatus read_metadata_block_data_picture_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Read read_cb, FLAC__StreamMetadata_Picture *block);
+static FLAC__Metadata_SimpleIteratorStatus read_metadata_block_data_unknown_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Read read_cb, FLAC__StreamMetadata_Unknown *block, unsigned block_length);
+
+static FLAC__bool write_metadata_block_header_(FILE *file, FLAC__Metadata_SimpleIteratorStatus *status, const FLAC__StreamMetadata *block);
+static FLAC__bool write_metadata_block_data_(FILE *file, FLAC__Metadata_SimpleIteratorStatus *status, const FLAC__StreamMetadata *block);
+static FLAC__bool write_metadata_block_header_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Write write_cb, const FLAC__StreamMetadata *block);
+static FLAC__bool write_metadata_block_data_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Write write_cb, const FLAC__StreamMetadata *block);
+static FLAC__bool write_metadata_block_data_streaminfo_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Write write_cb, const FLAC__StreamMetadata_StreamInfo *block);
+static FLAC__bool write_metadata_block_data_padding_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Write write_cb, const FLAC__StreamMetadata_Padding *block, unsigned block_length);
+static FLAC__bool write_metadata_block_data_application_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Write write_cb, const FLAC__StreamMetadata_Application *block, unsigned block_length);
+static FLAC__bool write_metadata_block_data_seektable_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Write write_cb, const FLAC__StreamMetadata_SeekTable *block);
+static FLAC__bool write_metadata_block_data_vorbis_comment_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Write write_cb, const FLAC__StreamMetadata_VorbisComment *block);
+static FLAC__bool write_metadata_block_data_cuesheet_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Write write_cb, const FLAC__StreamMetadata_CueSheet *block);
+static FLAC__bool write_metadata_block_data_picture_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Write write_cb, const FLAC__StreamMetadata_Picture *block);
+static FLAC__bool write_metadata_block_data_unknown_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Write write_cb, const FLAC__StreamMetadata_Unknown *block, unsigned block_length);
+
+static FLAC__bool write_metadata_block_stationary_(FLAC__Metadata_SimpleIterator *iterator, const FLAC__StreamMetadata *block);
+static FLAC__bool write_metadata_block_stationary_with_padding_(FLAC__Metadata_SimpleIterator *iterator, FLAC__StreamMetadata *block, unsigned padding_length, FLAC__bool padding_is_last);
+static FLAC__bool rewrite_whole_file_(FLAC__Metadata_SimpleIterator *iterator, FLAC__StreamMetadata *block, FLAC__bool append);
+
+static void simple_iterator_push_(FLAC__Metadata_SimpleIterator *iterator);
+static FLAC__bool simple_iterator_pop_(FLAC__Metadata_SimpleIterator *iterator);
+
+static unsigned seek_to_first_metadata_block_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Read read_cb, FLAC__IOCallback_Seek seek_cb);
+static unsigned seek_to_first_metadata_block_(FILE *f);
+
+static FLAC__bool simple_iterator_copy_file_prefix_(FLAC__Metadata_SimpleIterator *iterator, FILE **tempfile, char **tempfilename, FLAC__bool append);
+static FLAC__bool simple_iterator_copy_file_postfix_(FLAC__Metadata_SimpleIterator *iterator, FILE **tempfile, char **tempfilename, int fixup_is_last_code, FLAC__off_t fixup_is_last_flag_offset, FLAC__bool backup);
+
+static FLAC__bool copy_n_bytes_from_file_(FILE *file, FILE *tempfile, FLAC__off_t bytes, FLAC__Metadata_SimpleIteratorStatus *status);
+static FLAC__bool copy_n_bytes_from_file_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Read read_cb, FLAC__IOHandle temp_handle, FLAC__IOCallback_Write temp_write_cb, FLAC__off_t bytes, FLAC__Metadata_SimpleIteratorStatus *status);
+static FLAC__bool copy_remaining_bytes_from_file_(FILE *file, FILE *tempfile, FLAC__Metadata_SimpleIteratorStatus *status);
+static FLAC__bool copy_remaining_bytes_from_file_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Read read_cb, FLAC__IOCallback_Eof eof_cb, FLAC__IOHandle temp_handle, FLAC__IOCallback_Write temp_write_cb, FLAC__Metadata_SimpleIteratorStatus *status);
+
+static FLAC__bool open_tempfile_(const char *filename, const char *tempfile_path_prefix, FILE **tempfile, char **tempfilename, FLAC__Metadata_SimpleIteratorStatus *status);
+static FLAC__bool transport_tempfile_(const char *filename, FILE **tempfile, char **tempfilename, FLAC__Metadata_SimpleIteratorStatus *status);
+static void cleanup_tempfile_(FILE **tempfile, char **tempfilename);
+
+static FLAC__bool get_file_stats_(const char *filename, struct flac_stat_s *stats);
+static void set_file_stats_(const char *filename, struct flac_stat_s *stats);
+
+static int fseek_wrapper_(FLAC__IOHandle handle, FLAC__int64 offset, int whence);
+static FLAC__int64 ftell_wrapper_(FLAC__IOHandle handle);
+
+static FLAC__Metadata_ChainStatus get_equivalent_status_(FLAC__Metadata_SimpleIteratorStatus status);
+
+
+#ifdef FLAC__VALGRIND_TESTING
+static size_t local__fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream)
+{
+	size_t ret = fwrite(ptr, size, nmemb, stream);
+	if(!ferror(stream))
+		fflush(stream);
+	return ret;
+}
+#else
+#define local__fwrite fwrite
+#endif
+
+/****************************************************************************
+ *
+ * Level 0 implementation
+ *
+ ***************************************************************************/
+
+static FLAC__StreamDecoderWriteStatus write_callback_(const FLAC__StreamDecoder *decoder, const FLAC__Frame *frame, const FLAC__int32 * const buffer[], void *client_data);
+static void metadata_callback_(const FLAC__StreamDecoder *decoder, const FLAC__StreamMetadata *metadata, void *client_data);
+static void error_callback_(const FLAC__StreamDecoder *decoder, FLAC__StreamDecoderErrorStatus status, void *client_data);
+
+typedef struct {
+	FLAC__bool got_error;
+	FLAC__StreamMetadata *object;
+} level0_client_data;
+
+static FLAC__StreamMetadata *get_one_metadata_block_(const char *filename, FLAC__MetadataType type)
+{
+	level0_client_data cd;
+	FLAC__StreamDecoder *decoder;
+
+	FLAC__ASSERT(0 != filename);
+
+	cd.got_error = false;
+	cd.object = 0;
+
+	decoder = FLAC__stream_decoder_new();
+
+	if(0 == decoder)
+		return 0;
+
+	FLAC__stream_decoder_set_md5_checking(decoder, false);
+	FLAC__stream_decoder_set_metadata_ignore_all(decoder);
+	FLAC__stream_decoder_set_metadata_respond(decoder, type);
+
+	if(FLAC__stream_decoder_init_file(decoder, filename, write_callback_, metadata_callback_, error_callback_, &cd) != FLAC__STREAM_DECODER_INIT_STATUS_OK || cd.got_error) {
+		(void)FLAC__stream_decoder_finish(decoder);
+		FLAC__stream_decoder_delete(decoder);
+		return 0;
+	}
+
+	if(!FLAC__stream_decoder_process_until_end_of_metadata(decoder) || cd.got_error) {
+		(void)FLAC__stream_decoder_finish(decoder);
+		FLAC__stream_decoder_delete(decoder);
+		if(0 != cd.object)
+			FLAC__metadata_object_delete(cd.object);
+		return 0;
+	}
+
+	(void)FLAC__stream_decoder_finish(decoder);
+	FLAC__stream_decoder_delete(decoder);
+
+	return cd.object;
+}
+
+FLAC_API FLAC__bool FLAC__metadata_get_streaminfo(const char *filename, FLAC__StreamMetadata *streaminfo)
+{
+	FLAC__StreamMetadata *object;
+
+	FLAC__ASSERT(0 != filename);
+	FLAC__ASSERT(0 != streaminfo);
+
+	object = get_one_metadata_block_(filename, FLAC__METADATA_TYPE_STREAMINFO);
+
+	if (object) {
+		/* can just copy the contents since STREAMINFO has no internal structure */
+		*streaminfo = *object;
+		FLAC__metadata_object_delete(object);
+		return true;
+	}
+	else {
+		return false;
+	}
+}
+
+FLAC_API FLAC__bool FLAC__metadata_get_tags(const char *filename, FLAC__StreamMetadata **tags)
+{
+	FLAC__ASSERT(0 != filename);
+	FLAC__ASSERT(0 != tags);
+
+	*tags = get_one_metadata_block_(filename, FLAC__METADATA_TYPE_VORBIS_COMMENT);
+
+	return 0 != *tags;
+}
+
+FLAC_API FLAC__bool FLAC__metadata_get_cuesheet(const char *filename, FLAC__StreamMetadata **cuesheet)
+{
+	FLAC__ASSERT(0 != filename);
+	FLAC__ASSERT(0 != cuesheet);
+
+	*cuesheet = get_one_metadata_block_(filename, FLAC__METADATA_TYPE_CUESHEET);
+
+	return 0 != *cuesheet;
+}
+
+FLAC__StreamDecoderWriteStatus write_callback_(const FLAC__StreamDecoder *decoder, const FLAC__Frame *frame, const FLAC__int32 * const buffer[], void *client_data)
+{
+	(void)decoder, (void)frame, (void)buffer, (void)client_data;
+
+	return FLAC__STREAM_DECODER_WRITE_STATUS_CONTINUE;
+}
+
+void metadata_callback_(const FLAC__StreamDecoder *decoder, const FLAC__StreamMetadata *metadata, void *client_data)
+{
+	level0_client_data *cd = (level0_client_data *)client_data;
+	(void)decoder;
+
+	/*
+	 * we assume we only get here when the one metadata block we were
+	 * looking for was passed to us
+	 */
+	if(!cd->got_error && 0 == cd->object) {
+		if(0 == (cd->object = FLAC__metadata_object_clone(metadata)))
+			cd->got_error = true;
+	}
+}
+
+void error_callback_(const FLAC__StreamDecoder *decoder, FLAC__StreamDecoderErrorStatus status, void *client_data)
+{
+	level0_client_data *cd = (level0_client_data *)client_data;
+	(void)decoder;
+
+	if(status != FLAC__STREAM_DECODER_ERROR_STATUS_LOST_SYNC)
+		cd->got_error = true;
+}
+
+FLAC_API FLAC__bool FLAC__metadata_get_picture(const char *filename, FLAC__StreamMetadata **picture, FLAC__StreamMetadata_Picture_Type type, const char *mime_type, const FLAC__byte *description, unsigned max_width, unsigned max_height, unsigned max_depth, unsigned max_colors)
+{
+	FLAC__Metadata_SimpleIterator *it;
+	FLAC__uint64 max_area_seen = 0;
+	FLAC__uint64 max_depth_seen = 0;
+
+	FLAC__ASSERT(0 != filename);
+	FLAC__ASSERT(0 != picture);
+
+	*picture = 0;
+
+	it = FLAC__metadata_simple_iterator_new();
+	if(0 == it)
+		return false;
+	if(!FLAC__metadata_simple_iterator_init(it, filename, /*read_only=*/true, /*preserve_file_stats=*/true)) {
+		FLAC__metadata_simple_iterator_delete(it);
+		return false;
+	}
+	do {
+		if(FLAC__metadata_simple_iterator_get_block_type(it) == FLAC__METADATA_TYPE_PICTURE) {
+			FLAC__StreamMetadata *obj = FLAC__metadata_simple_iterator_get_block(it);
+			FLAC__uint64 area = (FLAC__uint64)obj->data.picture.width * (FLAC__uint64)obj->data.picture.height;
+			/* check constraints */
+			if(
+				(type == (FLAC__StreamMetadata_Picture_Type)(-1) || type == obj->data.picture.type) &&
+				(mime_type == 0 || !strcmp(mime_type, obj->data.picture.mime_type)) &&
+				(description == 0 || !strcmp((const char *)description, (const char *)obj->data.picture.description)) &&
+				obj->data.picture.width <= max_width &&
+				obj->data.picture.height <= max_height &&
+				obj->data.picture.depth <= max_depth &&
+				obj->data.picture.colors <= max_colors &&
+				(area > max_area_seen || (area == max_area_seen && obj->data.picture.depth > max_depth_seen))
+			) {
+				if(*picture)
+					FLAC__metadata_object_delete(*picture);
+				*picture = obj;
+				max_area_seen = area;
+				max_depth_seen = obj->data.picture.depth;
+			}
+			else {
+				FLAC__metadata_object_delete(obj);
+			}
+		}
+	} while(FLAC__metadata_simple_iterator_next(it));
+
+	FLAC__metadata_simple_iterator_delete(it);
+
+	return (0 != *picture);
+}
+
+
+/****************************************************************************
+ *
+ * Level 1 implementation
+ *
+ ***************************************************************************/
+
+#define SIMPLE_ITERATOR_MAX_PUSH_DEPTH (1+4)
+/* 1 for initial offset, +4 for our own personal use */
+
+struct FLAC__Metadata_SimpleIterator {
+	FILE *file;
+	char *filename, *tempfile_path_prefix;
+	struct flac_stat_s stats;
+	FLAC__bool has_stats;
+	FLAC__bool is_writable;
+	FLAC__Metadata_SimpleIteratorStatus status;
+	FLAC__off_t offset[SIMPLE_ITERATOR_MAX_PUSH_DEPTH];
+	FLAC__off_t first_offset; /* this is the offset to the STREAMINFO block */
+	unsigned depth;
+	/* this is the metadata block header of the current block we are pointing to: */
+	FLAC__bool is_last;
+	FLAC__MetadataType type;
+	unsigned length;
+};
+
+FLAC_API const char * const FLAC__Metadata_SimpleIteratorStatusString[] = {
+	"FLAC__METADATA_SIMPLE_ITERATOR_STATUS_OK",
+	"FLAC__METADATA_SIMPLE_ITERATOR_STATUS_ILLEGAL_INPUT",
+	"FLAC__METADATA_SIMPLE_ITERATOR_STATUS_ERROR_OPENING_FILE",
+	"FLAC__METADATA_SIMPLE_ITERATOR_STATUS_NOT_A_FLAC_FILE",
+	"FLAC__METADATA_SIMPLE_ITERATOR_STATUS_NOT_WRITABLE",
+	"FLAC__METADATA_SIMPLE_ITERATOR_STATUS_BAD_METADATA",
+	"FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR",
+	"FLAC__METADATA_SIMPLE_ITERATOR_STATUS_SEEK_ERROR",
+	"FLAC__METADATA_SIMPLE_ITERATOR_STATUS_WRITE_ERROR",
+	"FLAC__METADATA_SIMPLE_ITERATOR_STATUS_RENAME_ERROR",
+	"FLAC__METADATA_SIMPLE_ITERATOR_STATUS_UNLINK_ERROR",
+	"FLAC__METADATA_SIMPLE_ITERATOR_STATUS_MEMORY_ALLOCATION_ERROR",
+	"FLAC__METADATA_SIMPLE_ITERATOR_STATUS_INTERNAL_ERROR"
+};
+
+
+FLAC_API FLAC__Metadata_SimpleIterator *FLAC__metadata_simple_iterator_new(void)
+{
+	FLAC__Metadata_SimpleIterator *iterator = calloc(1, sizeof(FLAC__Metadata_SimpleIterator));
+
+	if(0 != iterator) {
+		iterator->file = 0;
+		iterator->filename = 0;
+		iterator->tempfile_path_prefix = 0;
+		iterator->has_stats = false;
+		iterator->is_writable = false;
+		iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_OK;
+		iterator->first_offset = iterator->offset[0] = -1;
+		iterator->depth = 0;
+	}
+
+	return iterator;
+}
+
+static void simple_iterator_free_guts_(FLAC__Metadata_SimpleIterator *iterator)
+{
+	FLAC__ASSERT(0 != iterator);
+
+	if(0 != iterator->file) {
+		fclose(iterator->file);
+		iterator->file = 0;
+		if(iterator->has_stats)
+			set_file_stats_(iterator->filename, &iterator->stats);
+	}
+	if(0 != iterator->filename) {
+		free(iterator->filename);
+		iterator->filename = 0;
+	}
+	if(0 != iterator->tempfile_path_prefix) {
+		free(iterator->tempfile_path_prefix);
+		iterator->tempfile_path_prefix = 0;
+	}
+}
+
+FLAC_API void FLAC__metadata_simple_iterator_delete(FLAC__Metadata_SimpleIterator *iterator)
+{
+	FLAC__ASSERT(0 != iterator);
+
+	simple_iterator_free_guts_(iterator);
+	free(iterator);
+}
+
+FLAC_API FLAC__Metadata_SimpleIteratorStatus FLAC__metadata_simple_iterator_status(FLAC__Metadata_SimpleIterator *iterator)
+{
+	FLAC__Metadata_SimpleIteratorStatus status;
+
+	FLAC__ASSERT(0 != iterator);
+
+	status = iterator->status;
+	iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_OK;
+	return status;
+}
+
+static FLAC__bool simple_iterator_prime_input_(FLAC__Metadata_SimpleIterator *iterator, FLAC__bool read_only)
+{
+	unsigned ret;
+
+	FLAC__ASSERT(0 != iterator);
+
+	if(read_only || 0 == (iterator->file = flac_fopen(iterator->filename, "r+b"))) {
+		iterator->is_writable = false;
+		if(read_only || errno == EACCES) {
+			if(0 == (iterator->file = flac_fopen(iterator->filename, "rb"))) {
+				iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_ERROR_OPENING_FILE;
+				return false;
+			}
+		}
+		else {
+			iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_ERROR_OPENING_FILE;
+			return false;
+		}
+	}
+	else {
+		iterator->is_writable = true;
+	}
+
+	ret = seek_to_first_metadata_block_(iterator->file);
+	switch(ret) {
+		case 0:
+			iterator->depth = 0;
+			iterator->first_offset = iterator->offset[iterator->depth] = ftello(iterator->file);
+			return read_metadata_block_header_(iterator);
+		case 1:
+			iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+			return false;
+		case 2:
+			iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_SEEK_ERROR;
+			return false;
+		case 3:
+			iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_NOT_A_FLAC_FILE;
+			return false;
+		default:
+			FLAC__ASSERT(0);
+			return false;
+	}
+}
+
+#if 0
+@@@ If we decide to finish implementing this, put this comment back in metadata.h
+/*
+ * The 'tempfile_path_prefix' allows you to specify a directory where
+ * tempfiles should go.  Remember that if your metadata edits cause the
+ * FLAC file to grow, the entire file will have to be rewritten.  If
+ * 'tempfile_path_prefix' is NULL, the temp file will be written in the
+ * same directory as the original FLAC file.  This makes replacing the
+ * original with the tempfile fast but requires extra space in the same
+ * partition for the tempfile.  If space is a problem, you can pass a
+ * directory name belonging to a different partition in
+ * 'tempfile_path_prefix'.  Note that you should use the forward slash
+ * '/' as the directory separator.  A trailing slash is not needed; it
+ * will be added automatically.
+ */
+FLAC__bool FLAC__metadata_simple_iterator_init(FLAC__Metadata_SimpleIterator *iterator, const char *filename, FLAC__bool preserve_file_stats, const char *tempfile_path_prefix);
+#endif
+
+FLAC_API FLAC__bool FLAC__metadata_simple_iterator_init(FLAC__Metadata_SimpleIterator *iterator, const char *filename, FLAC__bool read_only, FLAC__bool preserve_file_stats)
+{
+	const char *tempfile_path_prefix = 0; /*@@@ search for comments near 'flac_rename(...)' for what it will take to finish implementing this */
+
+	FLAC__ASSERT(0 != iterator);
+	FLAC__ASSERT(0 != filename);
+
+	simple_iterator_free_guts_(iterator);
+
+	if(!read_only && preserve_file_stats)
+		iterator->has_stats = get_file_stats_(filename, &iterator->stats);
+
+	if(0 == (iterator->filename = strdup(filename))) {
+		iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_MEMORY_ALLOCATION_ERROR;
+		return false;
+	}
+	if(0 != tempfile_path_prefix && 0 == (iterator->tempfile_path_prefix = strdup(tempfile_path_prefix))) {
+		iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_MEMORY_ALLOCATION_ERROR;
+		return false;
+	}
+
+	return simple_iterator_prime_input_(iterator, read_only);
+}
+
+FLAC_API FLAC__bool FLAC__metadata_simple_iterator_is_writable(const FLAC__Metadata_SimpleIterator *iterator)
+{
+	FLAC__ASSERT(0 != iterator);
+	FLAC__ASSERT(0 != iterator->file);
+
+	return iterator->is_writable;
+}
+
+FLAC_API FLAC__bool FLAC__metadata_simple_iterator_next(FLAC__Metadata_SimpleIterator *iterator)
+{
+	FLAC__ASSERT(0 != iterator);
+	FLAC__ASSERT(0 != iterator->file);
+
+	if(iterator->is_last)
+		return false;
+
+	if(0 != fseeko(iterator->file, iterator->length, SEEK_CUR)) {
+		iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_SEEK_ERROR;
+		return false;
+	}
+
+	iterator->offset[iterator->depth] = ftello(iterator->file);
+
+	return read_metadata_block_header_(iterator);
+}
+
+FLAC_API FLAC__bool FLAC__metadata_simple_iterator_prev(FLAC__Metadata_SimpleIterator *iterator)
+{
+	FLAC__off_t this_offset;
+
+	FLAC__ASSERT(0 != iterator);
+	FLAC__ASSERT(0 != iterator->file);
+
+	if(iterator->offset[iterator->depth] == iterator->first_offset)
+		return false;
+
+	if(0 != fseeko(iterator->file, iterator->first_offset, SEEK_SET)) {
+		iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_SEEK_ERROR;
+		return false;
+	}
+	this_offset = iterator->first_offset;
+	if(!read_metadata_block_header_(iterator))
+		return false;
+
+	/* we ignore any error from ftello() and catch it in fseeko() */
+	while(ftello(iterator->file) + (FLAC__off_t)iterator->length < iterator->offset[iterator->depth]) {
+		if(0 != fseeko(iterator->file, iterator->length, SEEK_CUR)) {
+			iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_SEEK_ERROR;
+			return false;
+		}
+		this_offset = ftello(iterator->file);
+		if(!read_metadata_block_header_(iterator))
+			return false;
+	}
+
+	iterator->offset[iterator->depth] = this_offset;
+
+	return true;
+}
+
+/*@@@@add to tests*/
+FLAC_API FLAC__bool FLAC__metadata_simple_iterator_is_last(const FLAC__Metadata_SimpleIterator *iterator)
+{
+	FLAC__ASSERT(0 != iterator);
+	FLAC__ASSERT(0 != iterator->file);
+
+	return iterator->is_last;
+}
+
+/*@@@@add to tests*/
+FLAC_API off_t FLAC__metadata_simple_iterator_get_block_offset(const FLAC__Metadata_SimpleIterator *iterator)
+{
+	FLAC__ASSERT(0 != iterator);
+	FLAC__ASSERT(0 != iterator->file);
+
+	return iterator->offset[iterator->depth];
+}
+
+FLAC_API FLAC__MetadataType FLAC__metadata_simple_iterator_get_block_type(const FLAC__Metadata_SimpleIterator *iterator)
+{
+	FLAC__ASSERT(0 != iterator);
+	FLAC__ASSERT(0 != iterator->file);
+
+	return iterator->type;
+}
+
+/*@@@@add to tests*/
+FLAC_API unsigned FLAC__metadata_simple_iterator_get_block_length(const FLAC__Metadata_SimpleIterator *iterator)
+{
+	FLAC__ASSERT(0 != iterator);
+	FLAC__ASSERT(0 != iterator->file);
+
+	return iterator->length;
+}
+
+/*@@@@add to tests*/
+FLAC_API FLAC__bool FLAC__metadata_simple_iterator_get_application_id(FLAC__Metadata_SimpleIterator *iterator, FLAC__byte *id)
+{
+	const unsigned id_bytes = FLAC__STREAM_METADATA_APPLICATION_ID_LEN / 8;
+
+	FLAC__ASSERT(0 != iterator);
+	FLAC__ASSERT(0 != iterator->file);
+	FLAC__ASSERT(0 != id);
+
+	if(iterator->type != FLAC__METADATA_TYPE_APPLICATION) {
+		iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_ILLEGAL_INPUT;
+		return false;
+	}
+
+	if(fread(id, 1, id_bytes, iterator->file) != id_bytes) {
+		iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+		return false;
+	}
+
+	/* back up */
+	if(0 != fseeko(iterator->file, -((int)id_bytes), SEEK_CUR)) {
+		iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_SEEK_ERROR;
+		return false;
+	}
+
+	return true;
+}
+
+FLAC_API FLAC__StreamMetadata *FLAC__metadata_simple_iterator_get_block(FLAC__Metadata_SimpleIterator *iterator)
+{
+	FLAC__StreamMetadata *block = FLAC__metadata_object_new(iterator->type);
+
+	FLAC__ASSERT(0 != iterator);
+	FLAC__ASSERT(0 != iterator->file);
+
+	if(0 != block) {
+		block->is_last = iterator->is_last;
+		block->length = iterator->length;
+
+		if(!read_metadata_block_data_(iterator, block)) {
+			FLAC__metadata_object_delete(block);
+			return 0;
+		}
+
+		/* back up to the beginning of the block data to stay consistent */
+		if(0 != fseeko(iterator->file, iterator->offset[iterator->depth] + FLAC__STREAM_METADATA_HEADER_LENGTH, SEEK_SET)) {
+			iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_SEEK_ERROR;
+			FLAC__metadata_object_delete(block);
+			return 0;
+		}
+	}
+	else
+		iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_MEMORY_ALLOCATION_ERROR;
+
+	return block;
+}
+
+FLAC_API FLAC__bool FLAC__metadata_simple_iterator_set_block(FLAC__Metadata_SimpleIterator *iterator, FLAC__StreamMetadata *block, FLAC__bool use_padding)
+{
+	FLAC__ASSERT_DECLARATION(FLAC__off_t debug_target_offset = iterator->offset[iterator->depth];)
+	FLAC__bool ret;
+
+	FLAC__ASSERT(0 != iterator);
+	FLAC__ASSERT(0 != iterator->file);
+	FLAC__ASSERT(0 != block);
+
+	if(!iterator->is_writable) {
+		iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_NOT_WRITABLE;
+		return false;
+	}
+
+	if(iterator->type == FLAC__METADATA_TYPE_STREAMINFO || block->type == FLAC__METADATA_TYPE_STREAMINFO) {
+		if(iterator->type != block->type) {
+			iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_ILLEGAL_INPUT;
+			return false;
+		}
+	}
+
+	block->is_last = iterator->is_last;
+
+	if(iterator->length == block->length)
+		return write_metadata_block_stationary_(iterator, block);
+	else if(iterator->length > block->length) {
+		if(use_padding && iterator->length >= FLAC__STREAM_METADATA_HEADER_LENGTH + block->length) {
+			ret = write_metadata_block_stationary_with_padding_(iterator, block, iterator->length - FLAC__STREAM_METADATA_HEADER_LENGTH - block->length, block->is_last);
+			FLAC__ASSERT(!ret || iterator->offset[iterator->depth] == debug_target_offset);
+			FLAC__ASSERT(!ret || ftello(iterator->file) == debug_target_offset + (FLAC__off_t)FLAC__STREAM_METADATA_HEADER_LENGTH);
+			return ret;
+		}
+		else {
+			ret = rewrite_whole_file_(iterator, block, /*append=*/false);
+			FLAC__ASSERT(!ret || iterator->offset[iterator->depth] == debug_target_offset);
+			FLAC__ASSERT(!ret || ftello(iterator->file) == debug_target_offset + (FLAC__off_t)FLAC__STREAM_METADATA_HEADER_LENGTH);
+			return ret;
+		}
+	}
+	else /* iterator->length < block->length */ {
+		unsigned padding_leftover = 0;
+		FLAC__bool padding_is_last = false;
+		if(use_padding) {
+			/* first see if we can even use padding */
+			if(iterator->is_last) {
+				use_padding = false;
+			}
+			else {
+				const unsigned extra_padding_bytes_required = block->length - iterator->length;
+				simple_iterator_push_(iterator);
+				if(!FLAC__metadata_simple_iterator_next(iterator)) {
+					(void)simple_iterator_pop_(iterator);
+					return false;
+				}
+				if(iterator->type != FLAC__METADATA_TYPE_PADDING) {
+					use_padding = false;
+				}
+				else {
+					if(FLAC__STREAM_METADATA_HEADER_LENGTH + iterator->length == extra_padding_bytes_required) {
+						padding_leftover = 0;
+						block->is_last = iterator->is_last;
+					}
+					else if(iterator->length < extra_padding_bytes_required)
+						use_padding = false;
+					else {
+						padding_leftover = FLAC__STREAM_METADATA_HEADER_LENGTH + iterator->length - extra_padding_bytes_required;
+						padding_is_last = iterator->is_last;
+						block->is_last = false;
+					}
+				}
+				if(!simple_iterator_pop_(iterator))
+					return false;
+			}
+		}
+		if(use_padding) {
+			if(padding_leftover == 0) {
+				ret = write_metadata_block_stationary_(iterator, block);
+				FLAC__ASSERT(!ret || iterator->offset[iterator->depth] == debug_target_offset);
+				FLAC__ASSERT(!ret || ftello(iterator->file) == debug_target_offset + (FLAC__off_t)FLAC__STREAM_METADATA_HEADER_LENGTH);
+				return ret;
+			}
+			else {
+				FLAC__ASSERT(padding_leftover >= FLAC__STREAM_METADATA_HEADER_LENGTH);
+				ret = write_metadata_block_stationary_with_padding_(iterator, block, padding_leftover - FLAC__STREAM_METADATA_HEADER_LENGTH, padding_is_last);
+				FLAC__ASSERT(!ret || iterator->offset[iterator->depth] == debug_target_offset);
+				FLAC__ASSERT(!ret || ftello(iterator->file) == debug_target_offset + (FLAC__off_t)FLAC__STREAM_METADATA_HEADER_LENGTH);
+				return ret;
+			}
+		}
+		else {
+			ret = rewrite_whole_file_(iterator, block, /*append=*/false);
+			FLAC__ASSERT(!ret || iterator->offset[iterator->depth] == debug_target_offset);
+			FLAC__ASSERT(!ret || ftello(iterator->file) == debug_target_offset + (FLAC__off_t)FLAC__STREAM_METADATA_HEADER_LENGTH);
+			return ret;
+		}
+	}
+}
+
+FLAC_API FLAC__bool FLAC__metadata_simple_iterator_insert_block_after(FLAC__Metadata_SimpleIterator *iterator, FLAC__StreamMetadata *block, FLAC__bool use_padding)
+{
+	unsigned padding_leftover = 0;
+	FLAC__bool padding_is_last = false;
+
+	FLAC__ASSERT_DECLARATION(FLAC__off_t debug_target_offset = iterator->offset[iterator->depth] + FLAC__STREAM_METADATA_HEADER_LENGTH + iterator->length;)
+	FLAC__bool ret;
+
+	FLAC__ASSERT(0 != iterator);
+	FLAC__ASSERT(0 != iterator->file);
+	FLAC__ASSERT(0 != block);
+
+	if(!iterator->is_writable) {
+		iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_NOT_WRITABLE;
+		return false;
+	}
+
+	if(block->type == FLAC__METADATA_TYPE_STREAMINFO) {
+		iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_ILLEGAL_INPUT;
+		return false;
+	}
+
+	block->is_last = iterator->is_last;
+
+	if(use_padding) {
+		/* first see if we can even use padding */
+		if(iterator->is_last) {
+			use_padding = false;
+		}
+		else {
+			simple_iterator_push_(iterator);
+			if(!FLAC__metadata_simple_iterator_next(iterator)) {
+				(void)simple_iterator_pop_(iterator);
+				return false;
+			}
+			if(iterator->type != FLAC__METADATA_TYPE_PADDING) {
+				use_padding = false;
+			}
+			else {
+				if(iterator->length == block->length) {
+					padding_leftover = 0;
+					block->is_last = iterator->is_last;
+				}
+				else if(iterator->length < FLAC__STREAM_METADATA_HEADER_LENGTH + block->length)
+					use_padding = false;
+				else {
+					padding_leftover = iterator->length - block->length;
+					padding_is_last = iterator->is_last;
+					block->is_last = false;
+				}
+			}
+			if(!simple_iterator_pop_(iterator))
+				return false;
+		}
+	}
+	if(use_padding) {
+		/* move to the next block, which is suitable padding */
+		if(!FLAC__metadata_simple_iterator_next(iterator))
+			return false;
+		if(padding_leftover == 0) {
+			ret = write_metadata_block_stationary_(iterator, block);
+			FLAC__ASSERT(iterator->offset[iterator->depth] == debug_target_offset);
+			FLAC__ASSERT(ftello(iterator->file) == debug_target_offset + (FLAC__off_t)FLAC__STREAM_METADATA_HEADER_LENGTH);
+			return ret;
+		}
+		else {
+			FLAC__ASSERT(padding_leftover >= FLAC__STREAM_METADATA_HEADER_LENGTH);
+			ret = write_metadata_block_stationary_with_padding_(iterator, block, padding_leftover - FLAC__STREAM_METADATA_HEADER_LENGTH, padding_is_last);
+			FLAC__ASSERT(iterator->offset[iterator->depth] == debug_target_offset);
+			FLAC__ASSERT(ftello(iterator->file) == debug_target_offset + (FLAC__off_t)FLAC__STREAM_METADATA_HEADER_LENGTH);
+			return ret;
+		}
+	}
+	else {
+		ret = rewrite_whole_file_(iterator, block, /*append=*/true);
+		FLAC__ASSERT(iterator->offset[iterator->depth] == debug_target_offset);
+		FLAC__ASSERT(ftello(iterator->file) == debug_target_offset + (FLAC__off_t)FLAC__STREAM_METADATA_HEADER_LENGTH);
+		return ret;
+	}
+}
+
+FLAC_API FLAC__bool FLAC__metadata_simple_iterator_delete_block(FLAC__Metadata_SimpleIterator *iterator, FLAC__bool use_padding)
+{
+	FLAC__ASSERT_DECLARATION(FLAC__off_t debug_target_offset = iterator->offset[iterator->depth];)
+	FLAC__bool ret;
+
+	if(!iterator->is_writable) {
+		iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_NOT_WRITABLE;
+		return false;
+	}
+
+	if(iterator->type == FLAC__METADATA_TYPE_STREAMINFO) {
+		iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_ILLEGAL_INPUT;
+		return false;
+	}
+
+	if(use_padding) {
+		FLAC__StreamMetadata *padding = FLAC__metadata_object_new(FLAC__METADATA_TYPE_PADDING);
+		if(0 == padding) {
+			iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_MEMORY_ALLOCATION_ERROR;
+			return false;
+		}
+		padding->length = iterator->length;
+		if(!FLAC__metadata_simple_iterator_set_block(iterator, padding, false)) {
+			FLAC__metadata_object_delete(padding);
+			return false;
+		}
+		FLAC__metadata_object_delete(padding);
+		if(!FLAC__metadata_simple_iterator_prev(iterator))
+			return false;
+		FLAC__ASSERT(iterator->offset[iterator->depth] + (FLAC__off_t)FLAC__STREAM_METADATA_HEADER_LENGTH + (FLAC__off_t)iterator->length == debug_target_offset);
+		FLAC__ASSERT(ftello(iterator->file) + (FLAC__off_t)iterator->length == debug_target_offset);
+		return true;
+	}
+	else {
+		ret = rewrite_whole_file_(iterator, 0, /*append=*/false);
+		FLAC__ASSERT(iterator->offset[iterator->depth] + (FLAC__off_t)FLAC__STREAM_METADATA_HEADER_LENGTH + (FLAC__off_t)iterator->length == debug_target_offset);
+		FLAC__ASSERT(ftello(iterator->file) + (FLAC__off_t)iterator->length == debug_target_offset);
+		return ret;
+	}
+}
+
+
+
+/****************************************************************************
+ *
+ * Level 2 implementation
+ *
+ ***************************************************************************/
+
+
+typedef struct FLAC__Metadata_Node {
+	FLAC__StreamMetadata *data;
+	struct FLAC__Metadata_Node *prev, *next;
+} FLAC__Metadata_Node;
+
+struct FLAC__Metadata_Chain {
+	char *filename; /* will be NULL if using callbacks */
+	FLAC__bool is_ogg;
+	FLAC__Metadata_Node *head;
+	FLAC__Metadata_Node *tail;
+	unsigned nodes;
+	FLAC__Metadata_ChainStatus status;
+	FLAC__off_t first_offset, last_offset;
+	/*
+	 * This is the length of the chain initially read from the FLAC file.
+	 * it is used to compare against the current length to decide whether
+	 * or not the whole file has to be rewritten.
+	 */
+	FLAC__off_t initial_length;
+	/* @@@ hacky, these are currently only needed by ogg reader */
+	FLAC__IOHandle handle;
+	FLAC__IOCallback_Read read_cb;
+};
+
+struct FLAC__Metadata_Iterator {
+	FLAC__Metadata_Chain *chain;
+	FLAC__Metadata_Node *current;
+};
+
+FLAC_API const char * const FLAC__Metadata_ChainStatusString[] = {
+	"FLAC__METADATA_CHAIN_STATUS_OK",
+	"FLAC__METADATA_CHAIN_STATUS_ILLEGAL_INPUT",
+	"FLAC__METADATA_CHAIN_STATUS_ERROR_OPENING_FILE",
+	"FLAC__METADATA_CHAIN_STATUS_NOT_A_FLAC_FILE",
+	"FLAC__METADATA_CHAIN_STATUS_NOT_WRITABLE",
+	"FLAC__METADATA_CHAIN_STATUS_BAD_METADATA",
+	"FLAC__METADATA_CHAIN_STATUS_READ_ERROR",
+	"FLAC__METADATA_CHAIN_STATUS_SEEK_ERROR",
+	"FLAC__METADATA_CHAIN_STATUS_WRITE_ERROR",
+	"FLAC__METADATA_CHAIN_STATUS_RENAME_ERROR",
+	"FLAC__METADATA_CHAIN_STATUS_UNLINK_ERROR",
+	"FLAC__METADATA_CHAIN_STATUS_MEMORY_ALLOCATION_ERROR",
+	"FLAC__METADATA_CHAIN_STATUS_INTERNAL_ERROR",
+	"FLAC__METADATA_CHAIN_STATUS_INVALID_CALLBACKS",
+	"FLAC__METADATA_CHAIN_STATUS_READ_WRITE_MISMATCH",
+	"FLAC__METADATA_CHAIN_STATUS_WRONG_WRITE_CALL"
+};
+
+
+static FLAC__Metadata_Node *node_new_(void)
+{
+	return calloc(1, sizeof(FLAC__Metadata_Node));
+}
+
+static void node_delete_(FLAC__Metadata_Node *node)
+{
+	FLAC__ASSERT(0 != node);
+	if(0 != node->data)
+		FLAC__metadata_object_delete(node->data);
+	free(node);
+}
+
+static void chain_init_(FLAC__Metadata_Chain *chain)
+{
+	FLAC__ASSERT(0 != chain);
+
+	chain->filename = 0;
+	chain->is_ogg = false;
+	chain->head = chain->tail = 0;
+	chain->nodes = 0;
+	chain->status = FLAC__METADATA_CHAIN_STATUS_OK;
+	chain->initial_length = 0;
+	chain->read_cb = 0;
+}
+
+static void chain_clear_(FLAC__Metadata_Chain *chain)
+{
+	FLAC__Metadata_Node *node, *next;
+
+	FLAC__ASSERT(0 != chain);
+
+	for(node = chain->head; node; ) {
+		next = node->next;
+		node_delete_(node);
+		node = next;
+	}
+
+	if(0 != chain->filename)
+		free(chain->filename);
+
+	chain_init_(chain);
+}
+
+static void chain_append_node_(FLAC__Metadata_Chain *chain, FLAC__Metadata_Node *node)
+{
+	FLAC__ASSERT(0 != chain);
+	FLAC__ASSERT(0 != node);
+	FLAC__ASSERT(0 != node->data);
+
+	node->next = node->prev = 0;
+	node->data->is_last = true;
+	if(0 != chain->tail)
+		chain->tail->data->is_last = false;
+
+	if(0 == chain->head)
+		chain->head = node;
+	else {
+		FLAC__ASSERT(0 != chain->tail);
+		chain->tail->next = node;
+		node->prev = chain->tail;
+	}
+	chain->tail = node;
+	chain->nodes++;
+}
+
+static void chain_remove_node_(FLAC__Metadata_Chain *chain, FLAC__Metadata_Node *node)
+{
+	FLAC__ASSERT(0 != chain);
+	FLAC__ASSERT(0 != node);
+
+	if(node == chain->head)
+		chain->head = node->next;
+	else
+		node->prev->next = node->next;
+
+	if(node == chain->tail)
+		chain->tail = node->prev;
+	else
+		node->next->prev = node->prev;
+
+	if(0 != chain->tail)
+		chain->tail->data->is_last = true;
+
+	chain->nodes--;
+}
+
+static void chain_delete_node_(FLAC__Metadata_Chain *chain, FLAC__Metadata_Node *node)
+{
+	chain_remove_node_(chain, node);
+	node_delete_(node);
+}
+
+static FLAC__off_t chain_calculate_length_(FLAC__Metadata_Chain *chain)
+{
+	const FLAC__Metadata_Node *node;
+	FLAC__off_t length = 0;
+	for(node = chain->head; node; node = node->next)
+		length += (FLAC__STREAM_METADATA_HEADER_LENGTH + node->data->length);
+	return length;
+}
+
+static void iterator_insert_node_(FLAC__Metadata_Iterator *iterator, FLAC__Metadata_Node *node)
+{
+	FLAC__ASSERT(0 != node);
+	FLAC__ASSERT(0 != node->data);
+	FLAC__ASSERT(0 != iterator);
+	FLAC__ASSERT(0 != iterator->current);
+	FLAC__ASSERT(0 != iterator->chain);
+	FLAC__ASSERT(0 != iterator->chain->head);
+	FLAC__ASSERT(0 != iterator->chain->tail);
+
+	node->data->is_last = false;
+
+	node->prev = iterator->current->prev;
+	node->next = iterator->current;
+
+	if(0 == node->prev)
+		iterator->chain->head = node;
+	else
+		node->prev->next = node;
+
+	iterator->current->prev = node;
+
+	iterator->chain->nodes++;
+}
+
+static void iterator_insert_node_after_(FLAC__Metadata_Iterator *iterator, FLAC__Metadata_Node *node)
+{
+	FLAC__ASSERT(0 != node);
+	FLAC__ASSERT(0 != node->data);
+	FLAC__ASSERT(0 != iterator);
+	FLAC__ASSERT(0 != iterator->current);
+	FLAC__ASSERT(0 != iterator->chain);
+	FLAC__ASSERT(0 != iterator->chain->head);
+	FLAC__ASSERT(0 != iterator->chain->tail);
+
+	iterator->current->data->is_last = false;
+
+	node->prev = iterator->current;
+	node->next = iterator->current->next;
+
+	if(0 == node->next)
+		iterator->chain->tail = node;
+	else
+		node->next->prev = node;
+
+	node->prev->next = node;
+
+	iterator->chain->tail->data->is_last = true;
+
+	iterator->chain->nodes++;
+}
+
+/* return true iff node and node->next are both padding */
+static FLAC__bool chain_merge_adjacent_padding_(FLAC__Metadata_Chain *chain, FLAC__Metadata_Node *node)
+{
+	if(node->data->type == FLAC__METADATA_TYPE_PADDING && 0 != node->next && node->next->data->type == FLAC__METADATA_TYPE_PADDING) {
+		const unsigned growth = FLAC__STREAM_METADATA_HEADER_LENGTH + node->next->data->length;
+		node->data->length += growth; /* new block size can be greater than max metadata block size, but it'll be fixed later in chain_prepare_for_write_() */
+
+		chain_delete_node_(chain, node->next);
+		return true;
+	}
+	else
+		return false;
+}
+
+/* Returns the new length of the chain, or 0 if there was an error. */
+/* WATCHOUT: This can get called multiple times before a write, so
+ * it should still work when this happens.
+ */
+/* WATCHOUT: Make sure to also update the logic in
+ * FLAC__metadata_chain_check_if_tempfile_needed() if the logic here changes.
+ */
+static FLAC__off_t chain_prepare_for_write_(FLAC__Metadata_Chain *chain, FLAC__bool use_padding)
+{
+	FLAC__off_t current_length = chain_calculate_length_(chain);
+
+	if(use_padding) {
+		/* if the metadata shrank and the last block is padding, we just extend the last padding block */
+		if(current_length < chain->initial_length && chain->tail->data->type == FLAC__METADATA_TYPE_PADDING) {
+			const FLAC__off_t delta = chain->initial_length - current_length;
+			chain->tail->data->length += delta;
+			current_length += delta;
+			FLAC__ASSERT(current_length == chain->initial_length);
+		}
+		/* if the metadata shrank more than 4 bytes then there's room to add another padding block */
+		else if(current_length + (FLAC__off_t)FLAC__STREAM_METADATA_HEADER_LENGTH <= chain->initial_length) {
+			FLAC__StreamMetadata *padding;
+			FLAC__Metadata_Node *node;
+			if(0 == (padding = FLAC__metadata_object_new(FLAC__METADATA_TYPE_PADDING))) {
+				chain->status = FLAC__METADATA_CHAIN_STATUS_MEMORY_ALLOCATION_ERROR;
+				return 0;
+			}
+			padding->length = chain->initial_length - (FLAC__STREAM_METADATA_HEADER_LENGTH + current_length);
+			if(0 == (node = node_new_())) {
+				FLAC__metadata_object_delete(padding);
+				chain->status = FLAC__METADATA_CHAIN_STATUS_MEMORY_ALLOCATION_ERROR;
+				return 0;
+			}
+			node->data = padding;
+			chain_append_node_(chain, node);
+			current_length = chain_calculate_length_(chain);
+			FLAC__ASSERT(current_length == chain->initial_length);
+		}
+		/* if the metadata grew but the last block is padding, try cutting the padding to restore the original length so we don't have to rewrite the whole file */
+		else if(current_length > chain->initial_length) {
+			const FLAC__off_t delta = current_length - chain->initial_length;
+			if(chain->tail->data->type == FLAC__METADATA_TYPE_PADDING) {
+				/* if the delta is exactly the size of the last padding block, remove the padding block */
+				if((FLAC__off_t)chain->tail->data->length + (FLAC__off_t)FLAC__STREAM_METADATA_HEADER_LENGTH == delta) {
+					chain_delete_node_(chain, chain->tail);
+					current_length = chain_calculate_length_(chain);
+					FLAC__ASSERT(current_length == chain->initial_length);
+				}
+				/* if there is at least 'delta' bytes of padding, trim the padding down */
+				else if((FLAC__off_t)chain->tail->data->length >= delta) {
+					chain->tail->data->length -= delta;
+					current_length -= delta;
+					FLAC__ASSERT(current_length == chain->initial_length);
+				}
+			}
+		}
+	}
+
+	/* check sizes of all metadata blocks; reduce padding size if necessary */
+	{
+		FLAC__Metadata_Node *node;
+		for (node = chain->head; node; node = node->next) {
+			if(node->data->length >= (1u << FLAC__STREAM_METADATA_LENGTH_LEN)) {
+				if(node->data->type == FLAC__METADATA_TYPE_PADDING) {
+					node->data->length = (1u << FLAC__STREAM_METADATA_LENGTH_LEN) - 1;
+					current_length = chain_calculate_length_(chain);
+				} else {
+					chain->status = FLAC__METADATA_CHAIN_STATUS_BAD_METADATA;
+					return 0;
+				}
+			}
+		}
+	}
+
+	return current_length;
+}
+
+static FLAC__bool chain_read_cb_(FLAC__Metadata_Chain *chain, FLAC__IOHandle handle, FLAC__IOCallback_Read read_cb, FLAC__IOCallback_Seek seek_cb, FLAC__IOCallback_Tell tell_cb)
+{
+	FLAC__Metadata_Node *node;
+
+	FLAC__ASSERT(0 != chain);
+
+	/* we assume we're already at the beginning of the file */
+
+	switch(seek_to_first_metadata_block_cb_(handle, read_cb, seek_cb)) {
+		case 0:
+			break;
+		case 1:
+			chain->status = FLAC__METADATA_CHAIN_STATUS_READ_ERROR;
+			return false;
+		case 2:
+			chain->status = FLAC__METADATA_CHAIN_STATUS_SEEK_ERROR;
+			return false;
+		case 3:
+			chain->status = FLAC__METADATA_CHAIN_STATUS_NOT_A_FLAC_FILE;
+			return false;
+		default:
+			FLAC__ASSERT(0);
+			return false;
+	}
+
+	{
+		FLAC__int64 pos = tell_cb(handle);
+		if(pos < 0) {
+			chain->status = FLAC__METADATA_CHAIN_STATUS_READ_ERROR;
+			return false;
+		}
+		chain->first_offset = (FLAC__off_t)pos;
+	}
+
+	{
+		FLAC__bool is_last;
+		FLAC__MetadataType type;
+		unsigned length;
+
+		do {
+			node = node_new_();
+			if(0 == node) {
+				chain->status = FLAC__METADATA_CHAIN_STATUS_MEMORY_ALLOCATION_ERROR;
+				return false;
+			}
+
+			if(!read_metadata_block_header_cb_(handle, read_cb, &is_last, &type, &length)) {
+				node_delete_(node);
+				chain->status = FLAC__METADATA_CHAIN_STATUS_READ_ERROR;
+				return false;
+			}
+
+			node->data = FLAC__metadata_object_new(type);
+			if(0 == node->data) {
+				node_delete_(node);
+				chain->status = FLAC__METADATA_CHAIN_STATUS_MEMORY_ALLOCATION_ERROR;
+				return false;
+			}
+
+			node->data->is_last = is_last;
+			node->data->length = length;
+
+			chain->status = get_equivalent_status_(read_metadata_block_data_cb_(handle, read_cb, seek_cb, node->data));
+			if(chain->status != FLAC__METADATA_CHAIN_STATUS_OK) {
+				node_delete_(node);
+				return false;
+			}
+			chain_append_node_(chain, node);
+		} while(!is_last);
+	}
+
+	{
+		FLAC__int64 pos = tell_cb(handle);
+		if(pos < 0) {
+			chain->status = FLAC__METADATA_CHAIN_STATUS_READ_ERROR;
+			return false;
+		}
+		chain->last_offset = (FLAC__off_t)pos;
+	}
+
+	chain->initial_length = chain_calculate_length_(chain);
+
+	return true;
+}
+
+static FLAC__StreamDecoderReadStatus chain_read_ogg_read_cb_(const FLAC__StreamDecoder *decoder, FLAC__byte buffer[], size_t *bytes, void *client_data)
+{
+	FLAC__Metadata_Chain *chain = (FLAC__Metadata_Chain*)client_data;
+	(void)decoder;
+	if(*bytes > 0 && chain->status == FLAC__METADATA_CHAIN_STATUS_OK) {
+		*bytes = chain->read_cb(buffer, sizeof(FLAC__byte), *bytes, chain->handle);
+		if(*bytes == 0)
+			return FLAC__STREAM_DECODER_READ_STATUS_END_OF_STREAM;
+		else
+			return FLAC__STREAM_DECODER_READ_STATUS_CONTINUE;
+	}
+	else
+		return FLAC__STREAM_DECODER_READ_STATUS_ABORT;
+}
+
+static FLAC__StreamDecoderWriteStatus chain_read_ogg_write_cb_(const FLAC__StreamDecoder *decoder, const FLAC__Frame *frame, const FLAC__int32 * const buffer[], void *client_data)
+{
+	(void)decoder, (void)frame, (void)buffer, (void)client_data;
+	return FLAC__STREAM_DECODER_WRITE_STATUS_ABORT;
+}
+
+static void chain_read_ogg_metadata_cb_(const FLAC__StreamDecoder *decoder, const FLAC__StreamMetadata *metadata, void *client_data)
+{
+	FLAC__Metadata_Chain *chain = (FLAC__Metadata_Chain*)client_data;
+	FLAC__Metadata_Node *node;
+
+	(void)decoder;
+
+	node = node_new_();
+	if(0 == node) {
+		chain->status = FLAC__METADATA_CHAIN_STATUS_MEMORY_ALLOCATION_ERROR;
+		return;
+	}
+
+	node->data = FLAC__metadata_object_clone(metadata);
+	if(0 == node->data) {
+		node_delete_(node);
+		chain->status = FLAC__METADATA_CHAIN_STATUS_MEMORY_ALLOCATION_ERROR;
+		return;
+	}
+
+	chain_append_node_(chain, node);
+}
+
+static void chain_read_ogg_error_cb_(const FLAC__StreamDecoder *decoder, FLAC__StreamDecoderErrorStatus status, void *client_data)
+{
+	FLAC__Metadata_Chain *chain = (FLAC__Metadata_Chain*)client_data;
+	(void)decoder, (void)status;
+	chain->status = FLAC__METADATA_CHAIN_STATUS_INTERNAL_ERROR; /*@@@ maybe needs better error code */
+}
+
+static FLAC__bool chain_read_ogg_cb_(FLAC__Metadata_Chain *chain, FLAC__IOHandle handle, FLAC__IOCallback_Read read_cb)
+{
+	FLAC__StreamDecoder *decoder;
+
+	FLAC__ASSERT(0 != chain);
+
+	/* we assume we're already at the beginning of the file */
+
+	chain->handle = handle;
+	chain->read_cb = read_cb;
+	if(0 == (decoder = FLAC__stream_decoder_new())) {
+		chain->status = FLAC__METADATA_CHAIN_STATUS_MEMORY_ALLOCATION_ERROR;
+		return false;
+	}
+	FLAC__stream_decoder_set_metadata_respond_all(decoder);
+	if(FLAC__stream_decoder_init_ogg_stream(decoder, chain_read_ogg_read_cb_, /*seek_callback=*/0, /*tell_callback=*/0, /*length_callback=*/0, /*eof_callback=*/0, chain_read_ogg_write_cb_, chain_read_ogg_metadata_cb_, chain_read_ogg_error_cb_, chain) != FLAC__STREAM_DECODER_INIT_STATUS_OK) {
+		FLAC__stream_decoder_delete(decoder);
+		chain->status = FLAC__METADATA_CHAIN_STATUS_INTERNAL_ERROR; /*@@@ maybe needs better error code */
+		return false;
+	}
+
+	chain->first_offset = 0; /*@@@ wrong; will need to be set correctly to implement metadata writing for Ogg FLAC */
+
+	if(!FLAC__stream_decoder_process_until_end_of_metadata(decoder))
+		chain->status = FLAC__METADATA_CHAIN_STATUS_INTERNAL_ERROR; /*@@@ maybe needs better error code */
+	if(chain->status != FLAC__METADATA_CHAIN_STATUS_OK) {
+		FLAC__stream_decoder_delete(decoder);
+		return false;
+	}
+
+	FLAC__stream_decoder_delete(decoder);
+
+	chain->last_offset = 0; /*@@@ wrong; will need to be set correctly to implement metadata writing for Ogg FLAC */
+
+	chain->initial_length = chain_calculate_length_(chain);
+
+	return true;
+}
+
+static FLAC__bool chain_rewrite_metadata_in_place_cb_(FLAC__Metadata_Chain *chain, FLAC__IOHandle handle, FLAC__IOCallback_Write write_cb, FLAC__IOCallback_Seek seek_cb)
+{
+	FLAC__Metadata_Node *node;
+
+	FLAC__ASSERT(0 != chain);
+	FLAC__ASSERT(0 != chain->head);
+
+	if(0 != seek_cb(handle, chain->first_offset, SEEK_SET)) {
+		chain->status = FLAC__METADATA_CHAIN_STATUS_SEEK_ERROR;
+		return false;
+	}
+
+	for(node = chain->head; node; node = node->next) {
+		if(!write_metadata_block_header_cb_(handle, write_cb, node->data)) {
+			chain->status = FLAC__METADATA_CHAIN_STATUS_WRITE_ERROR;
+			return false;
+		}
+		if(!write_metadata_block_data_cb_(handle, write_cb, node->data)) {
+			chain->status = FLAC__METADATA_CHAIN_STATUS_WRITE_ERROR;
+			return false;
+		}
+	}
+
+	/*FLAC__ASSERT(fflush(), ftello() == chain->last_offset);*/
+
+	chain->status = FLAC__METADATA_CHAIN_STATUS_OK;
+	return true;
+}
+
+static FLAC__bool chain_rewrite_metadata_in_place_(FLAC__Metadata_Chain *chain)
+{
+	FILE *file;
+	FLAC__bool ret;
+
+	FLAC__ASSERT(0 != chain->filename);
+
+	if(0 == (file = flac_fopen(chain->filename, "r+b"))) {
+		chain->status = FLAC__METADATA_CHAIN_STATUS_ERROR_OPENING_FILE;
+		return false;
+	}
+
+	/* chain_rewrite_metadata_in_place_cb_() sets chain->status for us */
+	ret = chain_rewrite_metadata_in_place_cb_(chain, (FLAC__IOHandle)file, (FLAC__IOCallback_Write)fwrite, fseek_wrapper_);
+
+	fclose(file);
+
+	return ret;
+}
+
+static FLAC__bool chain_rewrite_file_(FLAC__Metadata_Chain *chain, const char *tempfile_path_prefix)
+{
+	FILE *f, *tempfile = NULL;
+	char *tempfilename;
+	FLAC__Metadata_SimpleIteratorStatus status;
+	const FLAC__Metadata_Node *node;
+
+	FLAC__ASSERT(0 != chain);
+	FLAC__ASSERT(0 != chain->filename);
+	FLAC__ASSERT(0 != chain->head);
+
+	/* copy the file prefix (data up to first metadata block */
+	if(0 == (f = flac_fopen(chain->filename, "rb"))) {
+		chain->status = FLAC__METADATA_CHAIN_STATUS_ERROR_OPENING_FILE;
+		return false;
+	}
+	if(!open_tempfile_(chain->filename, tempfile_path_prefix, &tempfile, &tempfilename, &status)) {
+		chain->status = get_equivalent_status_(status);
+		goto err;
+	}
+	if(!copy_n_bytes_from_file_(f, tempfile, chain->first_offset, &status)) {
+		chain->status = get_equivalent_status_(status);
+		goto err;
+	}
+
+	/* write the metadata */
+	for(node = chain->head; node; node = node->next) {
+		if(!write_metadata_block_header_(tempfile, &status, node->data)) {
+			chain->status = get_equivalent_status_(status);
+			goto err;
+		}
+		if(!write_metadata_block_data_(tempfile, &status, node->data)) {
+			chain->status = get_equivalent_status_(status);
+			goto err;
+		}
+	}
+	/*FLAC__ASSERT(fflush(), ftello() == chain->last_offset);*/
+
+	/* copy the file postfix (everything after the metadata) */
+	if(0 != fseeko(f, chain->last_offset, SEEK_SET)) {
+		chain->status = FLAC__METADATA_CHAIN_STATUS_SEEK_ERROR;
+		goto err;
+	}
+	if(!copy_remaining_bytes_from_file_(f, tempfile, &status)) {
+		chain->status = get_equivalent_status_(status);
+		goto err;
+	}
+
+	/* move the tempfile on top of the original */
+	(void)fclose(f);
+	if(!transport_tempfile_(chain->filename, &tempfile, &tempfilename, &status))
+		return false;
+
+	return true;
+
+err:
+	(void)fclose(f);
+	cleanup_tempfile_(&tempfile, &tempfilename);
+	return false;
+}
+
+/* assumes 'handle' is already at beginning of file */
+static FLAC__bool chain_rewrite_file_cb_(FLAC__Metadata_Chain *chain, FLAC__IOHandle handle, FLAC__IOCallback_Read read_cb, FLAC__IOCallback_Seek seek_cb, FLAC__IOCallback_Eof eof_cb, FLAC__IOHandle temp_handle, FLAC__IOCallback_Write temp_write_cb)
+{
+	FLAC__Metadata_SimpleIteratorStatus status;
+	const FLAC__Metadata_Node *node;
+
+	FLAC__ASSERT(0 != chain);
+	FLAC__ASSERT(0 == chain->filename);
+	FLAC__ASSERT(0 != chain->head);
+
+	/* copy the file prefix (data up to first metadata block */
+	if(!copy_n_bytes_from_file_cb_(handle, read_cb, temp_handle, temp_write_cb, chain->first_offset, &status)) {
+		chain->status = get_equivalent_status_(status);
+		return false;
+	}
+
+	/* write the metadata */
+	for(node = chain->head; node; node = node->next) {
+		if(!write_metadata_block_header_cb_(temp_handle, temp_write_cb, node->data)) {
+			chain->status = FLAC__METADATA_CHAIN_STATUS_WRITE_ERROR;
+			return false;
+		}
+		if(!write_metadata_block_data_cb_(temp_handle, temp_write_cb, node->data)) {
+			chain->status = FLAC__METADATA_CHAIN_STATUS_WRITE_ERROR;
+			return false;
+		}
+	}
+	/*FLAC__ASSERT(fflush(), ftello() == chain->last_offset);*/
+
+	/* copy the file postfix (everything after the metadata) */
+	if(0 != seek_cb(handle, chain->last_offset, SEEK_SET)) {
+		chain->status = FLAC__METADATA_CHAIN_STATUS_SEEK_ERROR;
+		return false;
+	}
+	if(!copy_remaining_bytes_from_file_cb_(handle, read_cb, eof_cb, temp_handle, temp_write_cb, &status)) {
+		chain->status = get_equivalent_status_(status);
+		return false;
+	}
+
+	return true;
+}
+
+FLAC_API FLAC__Metadata_Chain *FLAC__metadata_chain_new(void)
+{
+	FLAC__Metadata_Chain *chain = calloc(1, sizeof(FLAC__Metadata_Chain));
+
+	if(0 != chain)
+		chain_init_(chain);
+
+	return chain;
+}
+
+FLAC_API void FLAC__metadata_chain_delete(FLAC__Metadata_Chain *chain)
+{
+	FLAC__ASSERT(0 != chain);
+
+	chain_clear_(chain);
+
+	free(chain);
+}
+
+FLAC_API FLAC__Metadata_ChainStatus FLAC__metadata_chain_status(FLAC__Metadata_Chain *chain)
+{
+	FLAC__Metadata_ChainStatus status;
+
+	FLAC__ASSERT(0 != chain);
+
+	status = chain->status;
+	chain->status = FLAC__METADATA_CHAIN_STATUS_OK;
+	return status;
+}
+
+static FLAC__bool chain_read_(FLAC__Metadata_Chain *chain, const char *filename, FLAC__bool is_ogg)
+{
+	FILE *file;
+	FLAC__bool ret;
+
+	FLAC__ASSERT(0 != chain);
+	FLAC__ASSERT(0 != filename);
+
+	chain_clear_(chain);
+
+	if(0 == (chain->filename = strdup(filename))) {
+		chain->status = FLAC__METADATA_CHAIN_STATUS_MEMORY_ALLOCATION_ERROR;
+		return false;
+	}
+
+	chain->is_ogg = is_ogg;
+
+	if(0 == (file = flac_fopen(filename, "rb"))) {
+		chain->status = FLAC__METADATA_CHAIN_STATUS_ERROR_OPENING_FILE;
+		return false;
+	}
+
+	/* the function also sets chain->status for us */
+	ret = is_ogg?
+		chain_read_ogg_cb_(chain, file, (FLAC__IOCallback_Read)fread) :
+		chain_read_cb_(chain, file, (FLAC__IOCallback_Read)fread, fseek_wrapper_, ftell_wrapper_)
+	;
+
+	fclose(file);
+
+	return ret;
+}
+
+FLAC_API FLAC__bool FLAC__metadata_chain_read(FLAC__Metadata_Chain *chain, const char *filename)
+{
+	return chain_read_(chain, filename, /*is_ogg=*/false);
+}
+
+/*@@@@add to tests*/
+FLAC_API FLAC__bool FLAC__metadata_chain_read_ogg(FLAC__Metadata_Chain *chain, const char *filename)
+{
+	return chain_read_(chain, filename, /*is_ogg=*/true);
+}
+
+static FLAC__bool chain_read_with_callbacks_(FLAC__Metadata_Chain *chain, FLAC__IOHandle handle, FLAC__IOCallbacks callbacks, FLAC__bool is_ogg)
+{
+	FLAC__bool ret;
+
+	FLAC__ASSERT(0 != chain);
+
+	chain_clear_(chain);
+
+	if (0 == callbacks.read || 0 == callbacks.seek || 0 == callbacks.tell) {
+		chain->status = FLAC__METADATA_CHAIN_STATUS_INVALID_CALLBACKS;
+		return false;
+	}
+
+	chain->is_ogg = is_ogg;
+
+	/* rewind */
+	if(0 != callbacks.seek(handle, 0, SEEK_SET)) {
+		chain->status = FLAC__METADATA_CHAIN_STATUS_SEEK_ERROR;
+		return false;
+	}
+
+	/* the function also sets chain->status for us */
+	ret = is_ogg?
+		chain_read_ogg_cb_(chain, handle, callbacks.read) :
+		chain_read_cb_(chain, handle, callbacks.read, callbacks.seek, callbacks.tell)
+	;
+
+	return ret;
+}
+
+FLAC_API FLAC__bool FLAC__metadata_chain_read_with_callbacks(FLAC__Metadata_Chain *chain, FLAC__IOHandle handle, FLAC__IOCallbacks callbacks)
+{
+	return chain_read_with_callbacks_(chain, handle, callbacks, /*is_ogg=*/false);
+}
+
+/*@@@@add to tests*/
+FLAC_API FLAC__bool FLAC__metadata_chain_read_ogg_with_callbacks(FLAC__Metadata_Chain *chain, FLAC__IOHandle handle, FLAC__IOCallbacks callbacks)
+{
+	return chain_read_with_callbacks_(chain, handle, callbacks, /*is_ogg=*/true);
+}
+
+typedef enum {
+	LBS_NONE = 0,
+	LBS_SIZE_CHANGED,
+	LBS_BLOCK_ADDED,
+	LBS_BLOCK_REMOVED
+} LastBlockState;
+
+FLAC_API FLAC__bool FLAC__metadata_chain_check_if_tempfile_needed(FLAC__Metadata_Chain *chain, FLAC__bool use_padding)
+{
+	/* This does all the same checks that are in chain_prepare_for_write_()
+	 * but doesn't actually alter the chain.  Make sure to update the logic
+	 * here if chain_prepare_for_write_() changes.
+	 */
+	FLAC__off_t current_length;
+	LastBlockState lbs_state = LBS_NONE;
+	unsigned lbs_size = 0;
+
+	FLAC__ASSERT(0 != chain);
+
+	current_length = chain_calculate_length_(chain);
+
+	if(use_padding) {
+		const FLAC__Metadata_Node * const node = chain->tail;
+		/* if the metadata shrank and the last block is padding, we just extend the last padding block */
+		if(current_length < chain->initial_length && node->data->type == FLAC__METADATA_TYPE_PADDING) {
+			lbs_state = LBS_SIZE_CHANGED;
+			lbs_size = node->data->length + (chain->initial_length - current_length);
+		}
+		/* if the metadata shrank more than 4 bytes then there's room to add another padding block */
+		else if(current_length + (FLAC__off_t)FLAC__STREAM_METADATA_HEADER_LENGTH <= chain->initial_length) {
+			lbs_state = LBS_BLOCK_ADDED;
+			lbs_size = chain->initial_length - (current_length + (FLAC__off_t)FLAC__STREAM_METADATA_HEADER_LENGTH);
+		}
+		/* if the metadata grew but the last block is padding, try cutting the padding to restore the original length so we don't have to rewrite the whole file */
+		else if(current_length > chain->initial_length) {
+			const FLAC__off_t delta = current_length - chain->initial_length;
+			if(node->data->type == FLAC__METADATA_TYPE_PADDING) {
+				/* if the delta is exactly the size of the last padding block, remove the padding block */
+				if((FLAC__off_t)node->data->length + (FLAC__off_t)FLAC__STREAM_METADATA_HEADER_LENGTH == delta) {
+					lbs_state = LBS_BLOCK_REMOVED;
+					lbs_size = 0;
+				}
+				/* if there is at least 'delta' bytes of padding, trim the padding down */
+				else if((FLAC__off_t)node->data->length >= delta) {
+					lbs_state = LBS_SIZE_CHANGED;
+					lbs_size = node->data->length - delta;
+				}
+			}
+		}
+	}
+
+	current_length = 0;
+	/* check sizes of all metadata blocks; reduce padding size if necessary */
+	{
+		const FLAC__Metadata_Node *node;
+		for(node = chain->head; node; node = node->next) {
+			unsigned block_len = node->data->length;
+			if(node == chain->tail) {
+				if(lbs_state == LBS_BLOCK_REMOVED)
+					continue;
+				else if(lbs_state == LBS_SIZE_CHANGED)
+					block_len = lbs_size;
+			}
+			if(block_len >= (1u << FLAC__STREAM_METADATA_LENGTH_LEN)) {
+				if(node->data->type == FLAC__METADATA_TYPE_PADDING)
+					block_len = (1u << FLAC__STREAM_METADATA_LENGTH_LEN) - 1;
+				else
+					return false /* the return value doesn't matter */;
+			}
+			current_length += (FLAC__STREAM_METADATA_HEADER_LENGTH + block_len);
+		}
+
+		if(lbs_state == LBS_BLOCK_ADDED) {
+			/* test added padding block */
+			unsigned block_len = lbs_size;
+			if(block_len >= (1u << FLAC__STREAM_METADATA_LENGTH_LEN))
+				block_len = (1u << FLAC__STREAM_METADATA_LENGTH_LEN) - 1;
+			current_length += (FLAC__STREAM_METADATA_HEADER_LENGTH + block_len);
+		}
+	}
+
+	return (current_length != chain->initial_length);
+}
+
+FLAC_API FLAC__bool FLAC__metadata_chain_write(FLAC__Metadata_Chain *chain, FLAC__bool use_padding, FLAC__bool preserve_file_stats)
+{
+	struct flac_stat_s stats;
+	const char *tempfile_path_prefix = 0;
+	FLAC__off_t current_length;
+
+	FLAC__ASSERT(0 != chain);
+
+	if (chain->is_ogg) { /* cannot write back to Ogg FLAC yet */
+		chain->status = FLAC__METADATA_CHAIN_STATUS_INTERNAL_ERROR;
+		return false;
+	}
+
+	if (0 == chain->filename) {
+		chain->status = FLAC__METADATA_CHAIN_STATUS_READ_WRITE_MISMATCH;
+		return false;
+	}
+
+	current_length = chain_prepare_for_write_(chain, use_padding);
+
+	/* a return value of 0 means there was an error; chain->status is already set */
+	if (0 == current_length)
+		return false;
+
+	if(preserve_file_stats)
+		get_file_stats_(chain->filename, &stats);
+
+	if(current_length == chain->initial_length) {
+		if(!chain_rewrite_metadata_in_place_(chain))
+			return false;
+	}
+	else {
+		if(!chain_rewrite_file_(chain, tempfile_path_prefix))
+			return false;
+
+		/* recompute lengths and offsets */
+		{
+			const FLAC__Metadata_Node *node;
+			chain->initial_length = current_length;
+			chain->last_offset = chain->first_offset;
+			for(node = chain->head; node; node = node->next)
+				chain->last_offset += (FLAC__STREAM_METADATA_HEADER_LENGTH + node->data->length);
+		}
+	}
+
+	if(preserve_file_stats)
+		set_file_stats_(chain->filename, &stats);
+
+	return true;
+}
+
+FLAC_API FLAC__bool FLAC__metadata_chain_write_with_callbacks(FLAC__Metadata_Chain *chain, FLAC__bool use_padding, FLAC__IOHandle handle, FLAC__IOCallbacks callbacks)
+{
+	FLAC__off_t current_length;
+
+	FLAC__ASSERT(0 != chain);
+
+	if (chain->is_ogg) { /* cannot write back to Ogg FLAC yet */
+		chain->status = FLAC__METADATA_CHAIN_STATUS_INTERNAL_ERROR;
+		return false;
+	}
+
+	if (0 != chain->filename) {
+		chain->status = FLAC__METADATA_CHAIN_STATUS_READ_WRITE_MISMATCH;
+		return false;
+	}
+
+	if (0 == callbacks.write || 0 == callbacks.seek) {
+		chain->status = FLAC__METADATA_CHAIN_STATUS_INVALID_CALLBACKS;
+		return false;
+	}
+
+	if (FLAC__metadata_chain_check_if_tempfile_needed(chain, use_padding)) {
+		chain->status = FLAC__METADATA_CHAIN_STATUS_WRONG_WRITE_CALL;
+		return false;
+	}
+
+	current_length = chain_prepare_for_write_(chain, use_padding);
+
+	/* a return value of 0 means there was an error; chain->status is already set */
+	if (0 == current_length)
+		return false;
+
+	FLAC__ASSERT(current_length == chain->initial_length);
+
+	return chain_rewrite_metadata_in_place_cb_(chain, handle, callbacks.write, callbacks.seek);
+}
+
+FLAC_API FLAC__bool FLAC__metadata_chain_write_with_callbacks_and_tempfile(FLAC__Metadata_Chain *chain, FLAC__bool use_padding, FLAC__IOHandle handle, FLAC__IOCallbacks callbacks, FLAC__IOHandle temp_handle, FLAC__IOCallbacks temp_callbacks)
+{
+	FLAC__off_t current_length;
+
+	FLAC__ASSERT(0 != chain);
+
+	if (chain->is_ogg) { /* cannot write back to Ogg FLAC yet */
+		chain->status = FLAC__METADATA_CHAIN_STATUS_INTERNAL_ERROR;
+		return false;
+	}
+
+	if (0 != chain->filename) {
+		chain->status = FLAC__METADATA_CHAIN_STATUS_READ_WRITE_MISMATCH;
+		return false;
+	}
+
+	if (0 == callbacks.read || 0 == callbacks.seek || 0 == callbacks.eof) {
+		chain->status = FLAC__METADATA_CHAIN_STATUS_INVALID_CALLBACKS;
+		return false;
+	}
+	if (0 == temp_callbacks.write) {
+		chain->status = FLAC__METADATA_CHAIN_STATUS_INVALID_CALLBACKS;
+		return false;
+	}
+
+	if (!FLAC__metadata_chain_check_if_tempfile_needed(chain, use_padding)) {
+		chain->status = FLAC__METADATA_CHAIN_STATUS_WRONG_WRITE_CALL;
+		return false;
+	}
+
+	current_length = chain_prepare_for_write_(chain, use_padding);
+
+	/* a return value of 0 means there was an error; chain->status is already set */
+	if (0 == current_length)
+		return false;
+
+	FLAC__ASSERT(current_length != chain->initial_length);
+
+	/* rewind */
+	if(0 != callbacks.seek(handle, 0, SEEK_SET)) {
+		chain->status = FLAC__METADATA_CHAIN_STATUS_SEEK_ERROR;
+		return false;
+	}
+
+	if(!chain_rewrite_file_cb_(chain, handle, callbacks.read, callbacks.seek, callbacks.eof, temp_handle, temp_callbacks.write))
+		return false;
+
+	/* recompute lengths and offsets */
+	{
+		const FLAC__Metadata_Node *node;
+		chain->initial_length = current_length;
+		chain->last_offset = chain->first_offset;
+		for(node = chain->head; node; node = node->next)
+			chain->last_offset += (FLAC__STREAM_METADATA_HEADER_LENGTH + node->data->length);
+	}
+
+	return true;
+}
+
+FLAC_API void FLAC__metadata_chain_merge_padding(FLAC__Metadata_Chain *chain)
+{
+	FLAC__Metadata_Node *node;
+
+	FLAC__ASSERT(0 != chain);
+
+	for(node = chain->head; node; ) {
+		if(!chain_merge_adjacent_padding_(chain, node))
+			node = node->next;
+	}
+}
+
+FLAC_API void FLAC__metadata_chain_sort_padding(FLAC__Metadata_Chain *chain)
+{
+	FLAC__Metadata_Node *node, *save;
+	unsigned i;
+
+	FLAC__ASSERT(0 != chain);
+
+	/*
+	 * Don't try and be too smart... this simple algo is good enough for
+	 * the small number of nodes that we deal with.
+	 */
+	for(i = 0, node = chain->head; i < chain->nodes; i++) {
+		if(node->data->type == FLAC__METADATA_TYPE_PADDING) {
+			save = node->next;
+			chain_remove_node_(chain, node);
+			chain_append_node_(chain, node);
+			node = save;
+		}
+		else {
+			node = node->next;
+		}
+	}
+
+	FLAC__metadata_chain_merge_padding(chain);
+}
+
+
+FLAC_API FLAC__Metadata_Iterator *FLAC__metadata_iterator_new(void)
+{
+	FLAC__Metadata_Iterator *iterator = calloc(1, sizeof(FLAC__Metadata_Iterator));
+
+	/* calloc() implies:
+		iterator->current = 0;
+		iterator->chain = 0;
+	*/
+
+	return iterator;
+}
+
+FLAC_API void FLAC__metadata_iterator_delete(FLAC__Metadata_Iterator *iterator)
+{
+	FLAC__ASSERT(0 != iterator);
+
+	free(iterator);
+}
+
+FLAC_API void FLAC__metadata_iterator_init(FLAC__Metadata_Iterator *iterator, FLAC__Metadata_Chain *chain)
+{
+	FLAC__ASSERT(0 != iterator);
+	FLAC__ASSERT(0 != chain);
+	FLAC__ASSERT(0 != chain->head);
+
+	iterator->chain = chain;
+	iterator->current = chain->head;
+}
+
+FLAC_API FLAC__bool FLAC__metadata_iterator_next(FLAC__Metadata_Iterator *iterator)
+{
+	FLAC__ASSERT(0 != iterator);
+
+	if(0 == iterator->current || 0 == iterator->current->next)
+		return false;
+
+	iterator->current = iterator->current->next;
+	return true;
+}
+
+FLAC_API FLAC__bool FLAC__metadata_iterator_prev(FLAC__Metadata_Iterator *iterator)
+{
+	FLAC__ASSERT(0 != iterator);
+
+	if(0 == iterator->current || 0 == iterator->current->prev)
+		return false;
+
+	iterator->current = iterator->current->prev;
+	return true;
+}
+
+FLAC_API FLAC__MetadataType FLAC__metadata_iterator_get_block_type(const FLAC__Metadata_Iterator *iterator)
+{
+	FLAC__ASSERT(0 != iterator);
+	FLAC__ASSERT(0 != iterator->current);
+	FLAC__ASSERT(0 != iterator->current->data);
+
+	return iterator->current->data->type;
+}
+
+FLAC_API FLAC__StreamMetadata *FLAC__metadata_iterator_get_block(FLAC__Metadata_Iterator *iterator)
+{
+	FLAC__ASSERT(0 != iterator);
+	FLAC__ASSERT(0 != iterator->current);
+
+	return iterator->current->data;
+}
+
+FLAC_API FLAC__bool FLAC__metadata_iterator_set_block(FLAC__Metadata_Iterator *iterator, FLAC__StreamMetadata *block)
+{
+	FLAC__ASSERT(0 != iterator);
+	FLAC__ASSERT(0 != block);
+	return FLAC__metadata_iterator_delete_block(iterator, false) && FLAC__metadata_iterator_insert_block_after(iterator, block);
+}
+
+FLAC_API FLAC__bool FLAC__metadata_iterator_delete_block(FLAC__Metadata_Iterator *iterator, FLAC__bool replace_with_padding)
+{
+	FLAC__Metadata_Node *save;
+
+	FLAC__ASSERT(0 != iterator);
+	FLAC__ASSERT(0 != iterator->current);
+
+	if(0 == iterator->current->prev) {
+		FLAC__ASSERT(iterator->current->data->type == FLAC__METADATA_TYPE_STREAMINFO);
+		return false;
+	}
+
+	save = iterator->current->prev;
+
+	if(replace_with_padding) {
+		FLAC__metadata_object_delete_data(iterator->current->data);
+		iterator->current->data->type = FLAC__METADATA_TYPE_PADDING;
+	}
+	else {
+		chain_delete_node_(iterator->chain, iterator->current);
+	}
+
+	iterator->current = save;
+	return true;
+}
+
+FLAC_API FLAC__bool FLAC__metadata_iterator_insert_block_before(FLAC__Metadata_Iterator *iterator, FLAC__StreamMetadata *block)
+{
+	FLAC__Metadata_Node *node;
+
+	FLAC__ASSERT(0 != iterator);
+	FLAC__ASSERT(0 != iterator->current);
+	FLAC__ASSERT(0 != block);
+
+	if(block->type == FLAC__METADATA_TYPE_STREAMINFO)
+		return false;
+
+	if(0 == iterator->current->prev) {
+		FLAC__ASSERT(iterator->current->data->type == FLAC__METADATA_TYPE_STREAMINFO);
+		return false;
+	}
+
+	if(0 == (node = node_new_()))
+		return false;
+
+	node->data = block;
+	iterator_insert_node_(iterator, node);
+	iterator->current = node;
+	return true;
+}
+
+FLAC_API FLAC__bool FLAC__metadata_iterator_insert_block_after(FLAC__Metadata_Iterator *iterator, FLAC__StreamMetadata *block)
+{
+	FLAC__Metadata_Node *node;
+
+	FLAC__ASSERT(0 != iterator);
+	FLAC__ASSERT(0 != iterator->current);
+	FLAC__ASSERT(0 != block);
+
+	if(block->type == FLAC__METADATA_TYPE_STREAMINFO)
+		return false;
+
+	if(0 == (node = node_new_()))
+		return false;
+
+	node->data = block;
+	iterator_insert_node_after_(iterator, node);
+	iterator->current = node;
+	return true;
+}
+
+
+/****************************************************************************
+ *
+ * Local function definitions
+ *
+ ***************************************************************************/
+
+void pack_uint32_(FLAC__uint32 val, FLAC__byte *b, unsigned bytes)
+{
+	unsigned i;
+
+	b += bytes;
+
+	for(i = 0; i < bytes; i++) {
+		*(--b) = (FLAC__byte)(val & 0xff);
+		val >>= 8;
+	}
+}
+
+void pack_uint32_little_endian_(FLAC__uint32 val, FLAC__byte *b, unsigned bytes)
+{
+	unsigned i;
+
+	for(i = 0; i < bytes; i++) {
+		*(b++) = (FLAC__byte)(val & 0xff);
+		val >>= 8;
+	}
+}
+
+void pack_uint64_(FLAC__uint64 val, FLAC__byte *b, unsigned bytes)
+{
+	unsigned i;
+
+	b += bytes;
+
+	for(i = 0; i < bytes; i++) {
+		*(--b) = (FLAC__byte)(val & 0xff);
+		val >>= 8;
+	}
+}
+
+FLAC__uint32 unpack_uint32_(FLAC__byte *b, unsigned bytes)
+{
+	FLAC__uint32 ret = 0;
+	unsigned i;
+
+	for(i = 0; i < bytes; i++)
+		ret = (ret << 8) | (FLAC__uint32)(*b++);
+
+	return ret;
+}
+
+FLAC__uint32 unpack_uint32_little_endian_(FLAC__byte *b, unsigned bytes)
+{
+	FLAC__uint32 ret = 0;
+	unsigned i;
+
+	b += bytes;
+
+	for(i = 0; i < bytes; i++)
+		ret = (ret << 8) | (FLAC__uint32)(*--b);
+
+	return ret;
+}
+
+FLAC__uint64 unpack_uint64_(FLAC__byte *b, unsigned bytes)
+{
+	FLAC__uint64 ret = 0;
+	unsigned i;
+
+	for(i = 0; i < bytes; i++)
+		ret = (ret << 8) | (FLAC__uint64)(*b++);
+
+	return ret;
+}
+
+FLAC__bool read_metadata_block_header_(FLAC__Metadata_SimpleIterator *iterator)
+{
+	FLAC__ASSERT(0 != iterator);
+	FLAC__ASSERT(0 != iterator->file);
+
+	if(!read_metadata_block_header_cb_((FLAC__IOHandle)iterator->file, (FLAC__IOCallback_Read)fread, &iterator->is_last, &iterator->type, &iterator->length)) {
+		iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+		return false;
+	}
+
+	return true;
+}
+
+FLAC__bool read_metadata_block_data_(FLAC__Metadata_SimpleIterator *iterator, FLAC__StreamMetadata *block)
+{
+	FLAC__ASSERT(0 != iterator);
+	FLAC__ASSERT(0 != iterator->file);
+
+	iterator->status = read_metadata_block_data_cb_((FLAC__IOHandle)iterator->file, (FLAC__IOCallback_Read)fread, fseek_wrapper_, block);
+
+	return (iterator->status == FLAC__METADATA_SIMPLE_ITERATOR_STATUS_OK);
+}
+
+FLAC__bool read_metadata_block_header_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Read read_cb, FLAC__bool *is_last, FLAC__MetadataType *type, unsigned *length)
+{
+	FLAC__byte raw_header[FLAC__STREAM_METADATA_HEADER_LENGTH];
+
+	if(read_cb(raw_header, 1, FLAC__STREAM_METADATA_HEADER_LENGTH, handle) != FLAC__STREAM_METADATA_HEADER_LENGTH)
+		return false;
+
+	*is_last = raw_header[0] & 0x80? true : false;
+	*type = (FLAC__MetadataType)(raw_header[0] & 0x7f);
+	*length = unpack_uint32_(raw_header + 1, 3);
+
+	/* Note that we don't check:
+	 *    if(iterator->type >= FLAC__METADATA_TYPE_UNDEFINED)
+	 * we just will read in an opaque block
+	 */
+
+	return true;
+}
+
+FLAC__Metadata_SimpleIteratorStatus read_metadata_block_data_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Read read_cb, FLAC__IOCallback_Seek seek_cb, FLAC__StreamMetadata *block)
+{
+	switch(block->type) {
+		case FLAC__METADATA_TYPE_STREAMINFO:
+			return read_metadata_block_data_streaminfo_cb_(handle, read_cb, &block->data.stream_info);
+		case FLAC__METADATA_TYPE_PADDING:
+			return read_metadata_block_data_padding_cb_(handle, seek_cb, &block->data.padding, block->length);
+		case FLAC__METADATA_TYPE_APPLICATION:
+			return read_metadata_block_data_application_cb_(handle, read_cb, &block->data.application, block->length);
+		case FLAC__METADATA_TYPE_SEEKTABLE:
+			return read_metadata_block_data_seektable_cb_(handle, read_cb, &block->data.seek_table, block->length);
+		case FLAC__METADATA_TYPE_VORBIS_COMMENT:
+			return read_metadata_block_data_vorbis_comment_cb_(handle, read_cb, seek_cb, &block->data.vorbis_comment, block->length);
+		case FLAC__METADATA_TYPE_CUESHEET:
+			return read_metadata_block_data_cuesheet_cb_(handle, read_cb, &block->data.cue_sheet);
+		case FLAC__METADATA_TYPE_PICTURE:
+			return read_metadata_block_data_picture_cb_(handle, read_cb, &block->data.picture);
+		default:
+			return read_metadata_block_data_unknown_cb_(handle, read_cb, &block->data.unknown, block->length);
+	}
+}
+
+FLAC__Metadata_SimpleIteratorStatus read_metadata_block_data_streaminfo_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Read read_cb, FLAC__StreamMetadata_StreamInfo *block)
+{
+	FLAC__byte buffer[FLAC__STREAM_METADATA_STREAMINFO_LENGTH], *b;
+
+	if(read_cb(buffer, 1, FLAC__STREAM_METADATA_STREAMINFO_LENGTH, handle) != FLAC__STREAM_METADATA_STREAMINFO_LENGTH)
+		return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+
+	b = buffer;
+
+	/* we are using hardcoded numbers for simplicity but we should
+	 * probably eventually write a bit-level unpacker and use the
+	 * _STREAMINFO_ constants.
+	 */
+	block->min_blocksize = unpack_uint32_(b, 2); b += 2;
+	block->max_blocksize = unpack_uint32_(b, 2); b += 2;
+	block->min_framesize = unpack_uint32_(b, 3); b += 3;
+	block->max_framesize = unpack_uint32_(b, 3); b += 3;
+	block->sample_rate = (unpack_uint32_(b, 2) << 4) | ((unsigned)(b[2] & 0xf0) >> 4);
+	block->channels = (unsigned)((b[2] & 0x0e) >> 1) + 1;
+	block->bits_per_sample = ((((unsigned)(b[2] & 0x01)) << 4) | (((unsigned)(b[3] & 0xf0)) >> 4)) + 1;
+	block->total_samples = (((FLAC__uint64)(b[3] & 0x0f)) << 32) | unpack_uint64_(b+4, 4);
+	memcpy(block->md5sum, b+8, 16);
+
+	return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_OK;
+}
+
+FLAC__Metadata_SimpleIteratorStatus read_metadata_block_data_padding_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Seek seek_cb, FLAC__StreamMetadata_Padding *block, unsigned block_length)
+{
+	(void)block; /* nothing to do; we don't care about reading the padding bytes */
+
+	if(0 != seek_cb(handle, block_length, SEEK_CUR))
+		return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_SEEK_ERROR;
+
+	return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_OK;
+}
+
+FLAC__Metadata_SimpleIteratorStatus read_metadata_block_data_application_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Read read_cb, FLAC__StreamMetadata_Application *block, unsigned block_length)
+{
+	const unsigned id_bytes = FLAC__STREAM_METADATA_APPLICATION_ID_LEN / 8;
+
+	if(read_cb(block->id, 1, id_bytes, handle) != id_bytes)
+		return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+
+	if(block_length < id_bytes)
+		return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+
+	block_length -= id_bytes;
+
+	if(block_length == 0) {
+		block->data = 0;
+	}
+	else {
+		if(0 == (block->data = malloc(block_length)))
+			return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_MEMORY_ALLOCATION_ERROR;
+
+		if(read_cb(block->data, 1, block_length, handle) != block_length)
+			return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+	}
+
+	return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_OK;
+}
+
+FLAC__Metadata_SimpleIteratorStatus read_metadata_block_data_seektable_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Read read_cb, FLAC__StreamMetadata_SeekTable *block, unsigned block_length)
+{
+	unsigned i;
+	FLAC__byte buffer[FLAC__STREAM_METADATA_SEEKPOINT_LENGTH];
+
+	FLAC__ASSERT(block_length % FLAC__STREAM_METADATA_SEEKPOINT_LENGTH == 0);
+
+	block->num_points = block_length / FLAC__STREAM_METADATA_SEEKPOINT_LENGTH;
+
+	if(block->num_points == 0)
+		block->points = 0;
+	else if(0 == (block->points = safe_malloc_mul_2op_p(block->num_points, /*times*/sizeof(FLAC__StreamMetadata_SeekPoint))))
+		return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_MEMORY_ALLOCATION_ERROR;
+
+	for(i = 0; i < block->num_points; i++) {
+		if(read_cb(buffer, 1, FLAC__STREAM_METADATA_SEEKPOINT_LENGTH, handle) != FLAC__STREAM_METADATA_SEEKPOINT_LENGTH)
+			return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+		/* some MAGIC NUMBERs here */
+		block->points[i].sample_number = unpack_uint64_(buffer, 8);
+		block->points[i].stream_offset = unpack_uint64_(buffer+8, 8);
+		block->points[i].frame_samples = unpack_uint32_(buffer+16, 2);
+	}
+
+	return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_OK;
+}
+
+FLAC__Metadata_SimpleIteratorStatus read_metadata_block_data_vorbis_comment_entry_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Read read_cb, FLAC__StreamMetadata_VorbisComment_Entry *entry, unsigned max_length)
+{
+	const unsigned entry_length_len = FLAC__STREAM_METADATA_VORBIS_COMMENT_ENTRY_LENGTH_LEN / 8;
+	FLAC__byte buffer[4]; /* magic number is asserted below */
+
+	FLAC__ASSERT(FLAC__STREAM_METADATA_VORBIS_COMMENT_ENTRY_LENGTH_LEN / 8 == sizeof(buffer));
+
+	if(max_length < entry_length_len)
+		return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_BAD_METADATA;
+
+	max_length -= entry_length_len;
+	if(read_cb(buffer, 1, entry_length_len, handle) != entry_length_len)
+		return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+	entry->length = unpack_uint32_little_endian_(buffer, entry_length_len);
+	if(max_length < entry->length) {
+		entry->length = 0;
+		return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_BAD_METADATA;
+	} else max_length -= entry->length;
+
+	if(0 != entry->entry)
+		free(entry->entry);
+
+	if(entry->length == 0) {
+		entry->entry = 0;
+	}
+	else {
+		if(0 == (entry->entry = safe_malloc_add_2op_(entry->length, /*+*/1)))
+			return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_MEMORY_ALLOCATION_ERROR;
+
+		if(read_cb(entry->entry, 1, entry->length, handle) != entry->length)
+			return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+
+		entry->entry[entry->length] = '\0';
+	}
+
+	return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_OK;
+}
+
+FLAC__Metadata_SimpleIteratorStatus read_metadata_block_data_vorbis_comment_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Read read_cb, FLAC__IOCallback_Seek seek_cb, FLAC__StreamMetadata_VorbisComment *block, unsigned block_length)
+{
+	unsigned i;
+	FLAC__Metadata_SimpleIteratorStatus status;
+	const unsigned num_comments_len = FLAC__STREAM_METADATA_VORBIS_COMMENT_NUM_COMMENTS_LEN / 8;
+	FLAC__byte buffer[4]; /* magic number is asserted below */
+
+	FLAC__ASSERT(FLAC__STREAM_METADATA_VORBIS_COMMENT_NUM_COMMENTS_LEN / 8 == sizeof(buffer));
+
+	status = read_metadata_block_data_vorbis_comment_entry_cb_(handle, read_cb, &(block->vendor_string), block_length);
+	if(block_length >= 4)
+		block_length -= 4;
+	if(status == FLAC__METADATA_SIMPLE_ITERATOR_STATUS_BAD_METADATA)
+		goto skip;
+	else if(status != FLAC__METADATA_SIMPLE_ITERATOR_STATUS_OK)
+		return status;
+	block_length -= block->vendor_string.length;
+
+	if(block_length < num_comments_len) goto skip; else block_length -= num_comments_len;
+	if(read_cb(buffer, 1, num_comments_len, handle) != num_comments_len)
+		return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+	block->num_comments = unpack_uint32_little_endian_(buffer, num_comments_len);
+
+	if(block->num_comments == 0) {
+		block->comments = 0;
+	}
+	else if(0 == (block->comments = calloc(block->num_comments, sizeof(FLAC__StreamMetadata_VorbisComment_Entry)))) {
+		block->num_comments = 0;
+		return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_MEMORY_ALLOCATION_ERROR;
+	}
+
+	for(i = 0; i < block->num_comments; i++) {
+		status = read_metadata_block_data_vorbis_comment_entry_cb_(handle, read_cb, block->comments + i, block_length);
+		if(block_length >= 4) block_length -= 4;
+		if(status == FLAC__METADATA_SIMPLE_ITERATOR_STATUS_BAD_METADATA) {
+			block->num_comments = i;
+			goto skip;
+		}
+		else if(status != FLAC__METADATA_SIMPLE_ITERATOR_STATUS_OK) return status;
+		block_length -= block->comments[i].length;
+	}
+
+  skip:
+	if(block_length > 0) {
+		/* bad metadata */
+		if(0 != seek_cb(handle, block_length, SEEK_CUR))
+			return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_SEEK_ERROR;
+	}
+
+	return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_OK;
+}
+
+FLAC__Metadata_SimpleIteratorStatus read_metadata_block_data_cuesheet_track_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Read read_cb, FLAC__StreamMetadata_CueSheet_Track *track)
+{
+	unsigned i, len;
+	FLAC__byte buffer[32]; /* asserted below that this is big enough */
+
+	FLAC__ASSERT(sizeof(buffer) >= sizeof(FLAC__uint64));
+	FLAC__ASSERT(sizeof(buffer) >= FLAC__STREAM_METADATA_CUESHEET_INDEX_RESERVED_LEN/8);
+	FLAC__ASSERT(sizeof(buffer) >= (FLAC__STREAM_METADATA_CUESHEET_TRACK_TYPE_LEN + FLAC__STREAM_METADATA_CUESHEET_TRACK_PRE_EMPHASIS_LEN + FLAC__STREAM_METADATA_CUESHEET_TRACK_RESERVED_LEN) / 8);
+
+	FLAC__ASSERT(FLAC__STREAM_METADATA_CUESHEET_TRACK_OFFSET_LEN % 8 == 0);
+	len = FLAC__STREAM_METADATA_CUESHEET_TRACK_OFFSET_LEN / 8;
+	if(read_cb(buffer, 1, len, handle) != len)
+		return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+	track->offset = unpack_uint64_(buffer, len);
+
+	FLAC__ASSERT(FLAC__STREAM_METADATA_CUESHEET_TRACK_NUMBER_LEN % 8 == 0);
+	len = FLAC__STREAM_METADATA_CUESHEET_TRACK_NUMBER_LEN / 8;
+	if(read_cb(buffer, 1, len, handle) != len)
+		return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+	track->number = (FLAC__byte)unpack_uint32_(buffer, len);
+
+	FLAC__ASSERT(FLAC__STREAM_METADATA_CUESHEET_TRACK_ISRC_LEN % 8 == 0);
+	len = FLAC__STREAM_METADATA_CUESHEET_TRACK_ISRC_LEN / 8;
+	if(read_cb(track->isrc, 1, len, handle) != len)
+		return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+
+	FLAC__ASSERT((FLAC__STREAM_METADATA_CUESHEET_TRACK_TYPE_LEN + FLAC__STREAM_METADATA_CUESHEET_TRACK_PRE_EMPHASIS_LEN + FLAC__STREAM_METADATA_CUESHEET_TRACK_RESERVED_LEN) % 8 == 0);
+	len = (FLAC__STREAM_METADATA_CUESHEET_TRACK_TYPE_LEN + FLAC__STREAM_METADATA_CUESHEET_TRACK_PRE_EMPHASIS_LEN + FLAC__STREAM_METADATA_CUESHEET_TRACK_RESERVED_LEN) / 8;
+	if(read_cb(buffer, 1, len, handle) != len)
+		return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+	FLAC__ASSERT(FLAC__STREAM_METADATA_CUESHEET_TRACK_TYPE_LEN == 1);
+	FLAC__ASSERT(FLAC__STREAM_METADATA_CUESHEET_TRACK_PRE_EMPHASIS_LEN == 1);
+	track->type = buffer[0] >> 7;
+	track->pre_emphasis = (buffer[0] >> 6) & 1;
+
+	FLAC__ASSERT(FLAC__STREAM_METADATA_CUESHEET_TRACK_NUM_INDICES_LEN % 8 == 0);
+	len = FLAC__STREAM_METADATA_CUESHEET_TRACK_NUM_INDICES_LEN / 8;
+	if(read_cb(buffer, 1, len, handle) != len)
+		return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+	track->num_indices = (FLAC__byte)unpack_uint32_(buffer, len);
+
+	if(track->num_indices == 0) {
+		track->indices = 0;
+	}
+	else if(0 == (track->indices = calloc(track->num_indices, sizeof(FLAC__StreamMetadata_CueSheet_Index))))
+		return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_MEMORY_ALLOCATION_ERROR;
+
+	for(i = 0; i < track->num_indices; i++) {
+		FLAC__ASSERT(FLAC__STREAM_METADATA_CUESHEET_INDEX_OFFSET_LEN % 8 == 0);
+		len = FLAC__STREAM_METADATA_CUESHEET_INDEX_OFFSET_LEN / 8;
+		if(read_cb(buffer, 1, len, handle) != len)
+			return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+		track->indices[i].offset = unpack_uint64_(buffer, len);
+
+		FLAC__ASSERT(FLAC__STREAM_METADATA_CUESHEET_INDEX_NUMBER_LEN % 8 == 0);
+		len = FLAC__STREAM_METADATA_CUESHEET_INDEX_NUMBER_LEN / 8;
+		if(read_cb(buffer, 1, len, handle) != len)
+			return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+		track->indices[i].number = (FLAC__byte)unpack_uint32_(buffer, len);
+
+		FLAC__ASSERT(FLAC__STREAM_METADATA_CUESHEET_INDEX_RESERVED_LEN % 8 == 0);
+		len = FLAC__STREAM_METADATA_CUESHEET_INDEX_RESERVED_LEN / 8;
+		if(read_cb(buffer, 1, len, handle) != len)
+			return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+	}
+
+	return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_OK;
+}
+
+FLAC__Metadata_SimpleIteratorStatus read_metadata_block_data_cuesheet_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Read read_cb, FLAC__StreamMetadata_CueSheet *block)
+{
+	unsigned i, len;
+	FLAC__Metadata_SimpleIteratorStatus status;
+	FLAC__byte buffer[1024]; /* MSVC needs a constant expression so we put a magic number and assert */
+
+	FLAC__ASSERT((FLAC__STREAM_METADATA_CUESHEET_IS_CD_LEN + FLAC__STREAM_METADATA_CUESHEET_RESERVED_LEN)/8 <= sizeof(buffer));
+	FLAC__ASSERT(sizeof(FLAC__uint64) <= sizeof(buffer));
+
+	FLAC__ASSERT(FLAC__STREAM_METADATA_CUESHEET_MEDIA_CATALOG_NUMBER_LEN % 8 == 0);
+	len = FLAC__STREAM_METADATA_CUESHEET_MEDIA_CATALOG_NUMBER_LEN / 8;
+	if(read_cb(block->media_catalog_number, 1, len, handle) != len)
+		return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+
+	FLAC__ASSERT(FLAC__STREAM_METADATA_CUESHEET_LEAD_IN_LEN % 8 == 0);
+	len = FLAC__STREAM_METADATA_CUESHEET_LEAD_IN_LEN / 8;
+	if(read_cb(buffer, 1, len, handle) != len)
+		return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+	block->lead_in = unpack_uint64_(buffer, len);
+
+	FLAC__ASSERT((FLAC__STREAM_METADATA_CUESHEET_IS_CD_LEN + FLAC__STREAM_METADATA_CUESHEET_RESERVED_LEN) % 8 == 0);
+	len = (FLAC__STREAM_METADATA_CUESHEET_IS_CD_LEN + FLAC__STREAM_METADATA_CUESHEET_RESERVED_LEN) / 8;
+	if(read_cb(buffer, 1, len, handle) != len)
+		return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+	block->is_cd = buffer[0]&0x80? true : false;
+
+	FLAC__ASSERT(FLAC__STREAM_METADATA_CUESHEET_NUM_TRACKS_LEN % 8 == 0);
+	len = FLAC__STREAM_METADATA_CUESHEET_NUM_TRACKS_LEN / 8;
+	if(read_cb(buffer, 1, len, handle) != len)
+		return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+	block->num_tracks = unpack_uint32_(buffer, len);
+
+	if(block->num_tracks == 0) {
+		block->tracks = 0;
+	}
+	else if(0 == (block->tracks = calloc(block->num_tracks, sizeof(FLAC__StreamMetadata_CueSheet_Track))))
+		return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_MEMORY_ALLOCATION_ERROR;
+
+	for(i = 0; i < block->num_tracks; i++) {
+		if(FLAC__METADATA_SIMPLE_ITERATOR_STATUS_OK != (status = read_metadata_block_data_cuesheet_track_cb_(handle, read_cb, block->tracks + i)))
+			return status;
+	}
+
+	return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_OK;
+}
+
+static FLAC__Metadata_SimpleIteratorStatus read_metadata_block_data_picture_cstring_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Read read_cb, FLAC__byte **data, FLAC__uint32 *length, FLAC__uint32 length_len)
+{
+	FLAC__byte buffer[sizeof(FLAC__uint32)];
+
+	FLAC__ASSERT(0 != data);
+	FLAC__ASSERT(length_len%8 == 0);
+
+	length_len /= 8; /* convert to bytes */
+
+	FLAC__ASSERT(sizeof(buffer) >= length_len);
+
+	if(read_cb(buffer, 1, length_len, handle) != length_len)
+		return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+	*length = unpack_uint32_(buffer, length_len);
+
+	if(0 != *data)
+		free(*data);
+
+	if(0 == (*data = safe_malloc_add_2op_(*length, /*+*/1)))
+		return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_MEMORY_ALLOCATION_ERROR;
+
+	if(*length > 0) {
+		if(read_cb(*data, 1, *length, handle) != *length)
+			return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+	}
+
+	(*data)[*length] = '\0';
+
+	return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_OK;
+}
+
+FLAC__Metadata_SimpleIteratorStatus read_metadata_block_data_picture_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Read read_cb, FLAC__StreamMetadata_Picture *block)
+{
+	FLAC__Metadata_SimpleIteratorStatus status;
+	FLAC__byte buffer[4]; /* asserted below that this is big enough */
+	FLAC__uint32 len;
+
+	FLAC__ASSERT(sizeof(buffer) >= FLAC__STREAM_METADATA_PICTURE_TYPE_LEN/8);
+	FLAC__ASSERT(sizeof(buffer) >= FLAC__STREAM_METADATA_PICTURE_WIDTH_LEN/8);
+	FLAC__ASSERT(sizeof(buffer) >= FLAC__STREAM_METADATA_PICTURE_HEIGHT_LEN/8);
+	FLAC__ASSERT(sizeof(buffer) >= FLAC__STREAM_METADATA_PICTURE_DEPTH_LEN/8);
+	FLAC__ASSERT(sizeof(buffer) >= FLAC__STREAM_METADATA_PICTURE_COLORS_LEN/8);
+
+	FLAC__ASSERT(FLAC__STREAM_METADATA_PICTURE_TYPE_LEN % 8 == 0);
+	len = FLAC__STREAM_METADATA_PICTURE_TYPE_LEN / 8;
+	if(read_cb(buffer, 1, len, handle) != len)
+		return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+	block->type = (FLAC__StreamMetadata_Picture_Type)unpack_uint32_(buffer, len);
+
+	if((status = read_metadata_block_data_picture_cstring_cb_(handle, read_cb, (FLAC__byte**)(&(block->mime_type)), &len, FLAC__STREAM_METADATA_PICTURE_MIME_TYPE_LENGTH_LEN)) != FLAC__METADATA_SIMPLE_ITERATOR_STATUS_OK)
+		return status;
+
+	if((status = read_metadata_block_data_picture_cstring_cb_(handle, read_cb, &(block->description), &len, FLAC__STREAM_METADATA_PICTURE_DESCRIPTION_LENGTH_LEN)) != FLAC__METADATA_SIMPLE_ITERATOR_STATUS_OK)
+		return status;
+
+	FLAC__ASSERT(FLAC__STREAM_METADATA_PICTURE_WIDTH_LEN % 8 == 0);
+	len = FLAC__STREAM_METADATA_PICTURE_WIDTH_LEN / 8;
+	if(read_cb(buffer, 1, len, handle) != len)
+		return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+	block->width = unpack_uint32_(buffer, len);
+
+	FLAC__ASSERT(FLAC__STREAM_METADATA_PICTURE_HEIGHT_LEN % 8 == 0);
+	len = FLAC__STREAM_METADATA_PICTURE_HEIGHT_LEN / 8;
+	if(read_cb(buffer, 1, len, handle) != len)
+		return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+	block->height = unpack_uint32_(buffer, len);
+
+	FLAC__ASSERT(FLAC__STREAM_METADATA_PICTURE_DEPTH_LEN % 8 == 0);
+	len = FLAC__STREAM_METADATA_PICTURE_DEPTH_LEN / 8;
+	if(read_cb(buffer, 1, len, handle) != len)
+		return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+	block->depth = unpack_uint32_(buffer, len);
+
+	FLAC__ASSERT(FLAC__STREAM_METADATA_PICTURE_COLORS_LEN % 8 == 0);
+	len = FLAC__STREAM_METADATA_PICTURE_COLORS_LEN / 8;
+	if(read_cb(buffer, 1, len, handle) != len)
+		return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+	block->colors = unpack_uint32_(buffer, len);
+
+	/* for convenience we use read_metadata_block_data_picture_cstring_cb_() even though it adds an extra terminating NUL we don't use */
+	if((status = read_metadata_block_data_picture_cstring_cb_(handle, read_cb, &(block->data), &(block->data_length), FLAC__STREAM_METADATA_PICTURE_DATA_LENGTH_LEN)) != FLAC__METADATA_SIMPLE_ITERATOR_STATUS_OK)
+		return status;
+
+	return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_OK;
+}
+
+FLAC__Metadata_SimpleIteratorStatus read_metadata_block_data_unknown_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Read read_cb, FLAC__StreamMetadata_Unknown *block, unsigned block_length)
+{
+	if(block_length == 0) {
+		block->data = 0;
+	}
+	else {
+		if(0 == (block->data = malloc(block_length)))
+			return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_MEMORY_ALLOCATION_ERROR;
+
+		if(read_cb(block->data, 1, block_length, handle) != block_length)
+			return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+	}
+
+	return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_OK;
+}
+
+FLAC__bool write_metadata_block_header_(FILE *file, FLAC__Metadata_SimpleIteratorStatus *status, const FLAC__StreamMetadata *block)
+{
+	FLAC__ASSERT(0 != file);
+	FLAC__ASSERT(0 != status);
+
+	if(!write_metadata_block_header_cb_((FLAC__IOHandle)file, (FLAC__IOCallback_Write)fwrite, block)) {
+		*status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_WRITE_ERROR;
+		return false;
+	}
+
+	return true;
+}
+
+FLAC__bool write_metadata_block_data_(FILE *file, FLAC__Metadata_SimpleIteratorStatus *status, const FLAC__StreamMetadata *block)
+{
+	FLAC__ASSERT(0 != file);
+	FLAC__ASSERT(0 != status);
+
+	if (write_metadata_block_data_cb_((FLAC__IOHandle)file, (FLAC__IOCallback_Write)fwrite, block)) {
+		*status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_OK;
+		return true;
+	}
+	else {
+		*status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_WRITE_ERROR;
+		return false;
+	}
+}
+
+FLAC__bool write_metadata_block_header_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Write write_cb, const FLAC__StreamMetadata *block)
+{
+	FLAC__byte buffer[FLAC__STREAM_METADATA_HEADER_LENGTH];
+
+	FLAC__ASSERT(block->length < (1u << FLAC__STREAM_METADATA_LENGTH_LEN));
+	/* double protection */
+	if(block->length >= (1u << FLAC__STREAM_METADATA_LENGTH_LEN))
+		return false;
+
+	buffer[0] = (block->is_last? 0x80 : 0) | (FLAC__byte)block->type;
+	pack_uint32_(block->length, buffer + 1, 3);
+
+	if(write_cb(buffer, 1, FLAC__STREAM_METADATA_HEADER_LENGTH, handle) != FLAC__STREAM_METADATA_HEADER_LENGTH)
+		return false;
+
+	return true;
+}
+
+FLAC__bool write_metadata_block_data_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Write write_cb, const FLAC__StreamMetadata *block)
+{
+	FLAC__ASSERT(0 != block);
+
+	switch(block->type) {
+		case FLAC__METADATA_TYPE_STREAMINFO:
+			return write_metadata_block_data_streaminfo_cb_(handle, write_cb, &block->data.stream_info);
+		case FLAC__METADATA_TYPE_PADDING:
+			return write_metadata_block_data_padding_cb_(handle, write_cb, &block->data.padding, block->length);
+		case FLAC__METADATA_TYPE_APPLICATION:
+			return write_metadata_block_data_application_cb_(handle, write_cb, &block->data.application, block->length);
+		case FLAC__METADATA_TYPE_SEEKTABLE:
+			return write_metadata_block_data_seektable_cb_(handle, write_cb, &block->data.seek_table);
+		case FLAC__METADATA_TYPE_VORBIS_COMMENT:
+			return write_metadata_block_data_vorbis_comment_cb_(handle, write_cb, &block->data.vorbis_comment);
+		case FLAC__METADATA_TYPE_CUESHEET:
+			return write_metadata_block_data_cuesheet_cb_(handle, write_cb, &block->data.cue_sheet);
+		case FLAC__METADATA_TYPE_PICTURE:
+			return write_metadata_block_data_picture_cb_(handle, write_cb, &block->data.picture);
+		default:
+			return write_metadata_block_data_unknown_cb_(handle, write_cb, &block->data.unknown, block->length);
+	}
+}
+
+FLAC__bool write_metadata_block_data_streaminfo_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Write write_cb, const FLAC__StreamMetadata_StreamInfo *block)
+{
+	FLAC__byte buffer[FLAC__STREAM_METADATA_STREAMINFO_LENGTH];
+	const unsigned channels1 = block->channels - 1;
+	const unsigned bps1 = block->bits_per_sample - 1;
+
+	/* we are using hardcoded numbers for simplicity but we should
+	 * probably eventually write a bit-level packer and use the
+	 * _STREAMINFO_ constants.
+	 */
+	pack_uint32_(block->min_blocksize, buffer, 2);
+	pack_uint32_(block->max_blocksize, buffer+2, 2);
+	pack_uint32_(block->min_framesize, buffer+4, 3);
+	pack_uint32_(block->max_framesize, buffer+7, 3);
+	buffer[10] = (block->sample_rate >> 12) & 0xff;
+	buffer[11] = (block->sample_rate >> 4) & 0xff;
+	buffer[12] = ((block->sample_rate & 0x0f) << 4) | (channels1 << 1) | (bps1 >> 4);
+	buffer[13] = (FLAC__byte)(((bps1 & 0x0f) << 4) | ((block->total_samples >> 32) & 0x0f));
+	pack_uint32_((FLAC__uint32)block->total_samples, buffer+14, 4);
+	memcpy(buffer+18, block->md5sum, 16);
+
+	if(write_cb(buffer, 1, FLAC__STREAM_METADATA_STREAMINFO_LENGTH, handle) != FLAC__STREAM_METADATA_STREAMINFO_LENGTH)
+		return false;
+
+	return true;
+}
+
+FLAC__bool write_metadata_block_data_padding_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Write write_cb, const FLAC__StreamMetadata_Padding *block, unsigned block_length)
+{
+	unsigned i, n = block_length;
+	FLAC__byte buffer[1024];
+
+	(void)block;
+
+	memset(buffer, 0, 1024);
+
+	for(i = 0; i < n/1024; i++)
+		if(write_cb(buffer, 1, 1024, handle) != 1024)
+			return false;
+
+	n %= 1024;
+
+	if(write_cb(buffer, 1, n, handle) != n)
+		return false;
+
+	return true;
+}
+
+FLAC__bool write_metadata_block_data_application_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Write write_cb, const FLAC__StreamMetadata_Application *block, unsigned block_length)
+{
+	const unsigned id_bytes = FLAC__STREAM_METADATA_APPLICATION_ID_LEN / 8;
+
+	if(write_cb(block->id, 1, id_bytes, handle) != id_bytes)
+		return false;
+
+	block_length -= id_bytes;
+
+	if(write_cb(block->data, 1, block_length, handle) != block_length)
+		return false;
+
+	return true;
+}
+
+FLAC__bool write_metadata_block_data_seektable_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Write write_cb, const FLAC__StreamMetadata_SeekTable *block)
+{
+	unsigned i;
+	FLAC__byte buffer[FLAC__STREAM_METADATA_SEEKPOINT_LENGTH];
+
+	for(i = 0; i < block->num_points; i++) {
+		/* some MAGIC NUMBERs here */
+		pack_uint64_(block->points[i].sample_number, buffer, 8);
+		pack_uint64_(block->points[i].stream_offset, buffer+8, 8);
+		pack_uint32_(block->points[i].frame_samples, buffer+16, 2);
+		if(write_cb(buffer, 1, FLAC__STREAM_METADATA_SEEKPOINT_LENGTH, handle) != FLAC__STREAM_METADATA_SEEKPOINT_LENGTH)
+			return false;
+	}
+
+	return true;
+}
+
+FLAC__bool write_metadata_block_data_vorbis_comment_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Write write_cb, const FLAC__StreamMetadata_VorbisComment *block)
+{
+	unsigned i;
+	const unsigned entry_length_len = FLAC__STREAM_METADATA_VORBIS_COMMENT_ENTRY_LENGTH_LEN / 8;
+	const unsigned num_comments_len = FLAC__STREAM_METADATA_VORBIS_COMMENT_NUM_COMMENTS_LEN / 8;
+	FLAC__byte buffer[4]; /* magic number is asserted below */
+
+	FLAC__ASSERT(flac_max(FLAC__STREAM_METADATA_VORBIS_COMMENT_ENTRY_LENGTH_LEN, FLAC__STREAM_METADATA_VORBIS_COMMENT_NUM_COMMENTS_LEN) / 8 == sizeof(buffer));
+
+	pack_uint32_little_endian_(block->vendor_string.length, buffer, entry_length_len);
+	if(write_cb(buffer, 1, entry_length_len, handle) != entry_length_len)
+		return false;
+	if(write_cb(block->vendor_string.entry, 1, block->vendor_string.length, handle) != block->vendor_string.length)
+		return false;
+
+	pack_uint32_little_endian_(block->num_comments, buffer, num_comments_len);
+	if(write_cb(buffer, 1, num_comments_len, handle) != num_comments_len)
+		return false;
+
+	for(i = 0; i < block->num_comments; i++) {
+		pack_uint32_little_endian_(block->comments[i].length, buffer, entry_length_len);
+		if(write_cb(buffer, 1, entry_length_len, handle) != entry_length_len)
+			return false;
+		if(write_cb(block->comments[i].entry, 1, block->comments[i].length, handle) != block->comments[i].length)
+			return false;
+	}
+
+	return true;
+}
+
+FLAC__bool write_metadata_block_data_cuesheet_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Write write_cb, const FLAC__StreamMetadata_CueSheet *block)
+{
+	unsigned i, j, len;
+	FLAC__byte buffer[1024]; /* asserted below that this is big enough */
+
+	FLAC__ASSERT(sizeof(buffer) >= sizeof(FLAC__uint64));
+	FLAC__ASSERT(sizeof(buffer) >= FLAC__STREAM_METADATA_CUESHEET_RESERVED_LEN/8);
+	FLAC__ASSERT(sizeof(buffer) >= (FLAC__STREAM_METADATA_CUESHEET_TRACK_TYPE_LEN + FLAC__STREAM_METADATA_CUESHEET_TRACK_PRE_EMPHASIS_LEN + FLAC__STREAM_METADATA_CUESHEET_IS_CD_LEN + FLAC__STREAM_METADATA_CUESHEET_TRACK_RESERVED_LEN)/8);
+	FLAC__ASSERT(sizeof(buffer) >= FLAC__STREAM_METADATA_CUESHEET_INDEX_RESERVED_LEN/8);
+
+	FLAC__ASSERT(FLAC__STREAM_METADATA_CUESHEET_MEDIA_CATALOG_NUMBER_LEN % 8 == 0);
+	len = FLAC__STREAM_METADATA_CUESHEET_MEDIA_CATALOG_NUMBER_LEN / 8;
+	if(write_cb(block->media_catalog_number, 1, len, handle) != len)
+		return false;
+
+	FLAC__ASSERT(FLAC__STREAM_METADATA_CUESHEET_LEAD_IN_LEN % 8 == 0);
+	len = FLAC__STREAM_METADATA_CUESHEET_LEAD_IN_LEN / 8;
+	pack_uint64_(block->lead_in, buffer, len);
+	if(write_cb(buffer, 1, len, handle) != len)
+		return false;
+
+	FLAC__ASSERT((FLAC__STREAM_METADATA_CUESHEET_IS_CD_LEN + FLAC__STREAM_METADATA_CUESHEET_RESERVED_LEN) % 8 == 0);
+	len = (FLAC__STREAM_METADATA_CUESHEET_IS_CD_LEN + FLAC__STREAM_METADATA_CUESHEET_RESERVED_LEN) / 8;
+	memset(buffer, 0, len);
+	if(block->is_cd)
+		buffer[0] |= 0x80;
+	if(write_cb(buffer, 1, len, handle) != len)
+		return false;
+
+	FLAC__ASSERT(FLAC__STREAM_METADATA_CUESHEET_NUM_TRACKS_LEN % 8 == 0);
+	len = FLAC__STREAM_METADATA_CUESHEET_NUM_TRACKS_LEN / 8;
+	pack_uint32_(block->num_tracks, buffer, len);
+	if(write_cb(buffer, 1, len, handle) != len)
+		return false;
+
+	for(i = 0; i < block->num_tracks; i++) {
+		FLAC__StreamMetadata_CueSheet_Track *track = block->tracks + i;
+
+		FLAC__ASSERT(FLAC__STREAM_METADATA_CUESHEET_TRACK_OFFSET_LEN % 8 == 0);
+		len = FLAC__STREAM_METADATA_CUESHEET_TRACK_OFFSET_LEN / 8;
+		pack_uint64_(track->offset, buffer, len);
+		if(write_cb(buffer, 1, len, handle) != len)
+			return false;
+
+		FLAC__ASSERT(FLAC__STREAM_METADATA_CUESHEET_TRACK_NUMBER_LEN % 8 == 0);
+		len = FLAC__STREAM_METADATA_CUESHEET_TRACK_NUMBER_LEN / 8;
+		pack_uint32_(track->number, buffer, len);
+		if(write_cb(buffer, 1, len, handle) != len)
+			return false;
+
+		FLAC__ASSERT(FLAC__STREAM_METADATA_CUESHEET_TRACK_ISRC_LEN % 8 == 0);
+		len = FLAC__STREAM_METADATA_CUESHEET_TRACK_ISRC_LEN / 8;
+		if(write_cb(track->isrc, 1, len, handle) != len)
+			return false;
+
+		FLAC__ASSERT((FLAC__STREAM_METADATA_CUESHEET_TRACK_TYPE_LEN + FLAC__STREAM_METADATA_CUESHEET_TRACK_PRE_EMPHASIS_LEN + FLAC__STREAM_METADATA_CUESHEET_TRACK_RESERVED_LEN) % 8 == 0);
+		len = (FLAC__STREAM_METADATA_CUESHEET_TRACK_TYPE_LEN + FLAC__STREAM_METADATA_CUESHEET_TRACK_PRE_EMPHASIS_LEN + FLAC__STREAM_METADATA_CUESHEET_TRACK_RESERVED_LEN) / 8;
+		memset(buffer, 0, len);
+		buffer[0] = (track->type << 7) | (track->pre_emphasis << 6);
+		if(write_cb(buffer, 1, len, handle) != len)
+			return false;
+
+		FLAC__ASSERT(FLAC__STREAM_METADATA_CUESHEET_TRACK_NUM_INDICES_LEN % 8 == 0);
+		len = FLAC__STREAM_METADATA_CUESHEET_TRACK_NUM_INDICES_LEN / 8;
+		pack_uint32_(track->num_indices, buffer, len);
+		if(write_cb(buffer, 1, len, handle) != len)
+			return false;
+
+		for(j = 0; j < track->num_indices; j++) {
+			FLAC__StreamMetadata_CueSheet_Index *indx = track->indices + j;
+
+			FLAC__ASSERT(FLAC__STREAM_METADATA_CUESHEET_INDEX_OFFSET_LEN % 8 == 0);
+			len = FLAC__STREAM_METADATA_CUESHEET_INDEX_OFFSET_LEN / 8;
+			pack_uint64_(indx->offset, buffer, len);
+			if(write_cb(buffer, 1, len, handle) != len)
+				return false;
+
+			FLAC__ASSERT(FLAC__STREAM_METADATA_CUESHEET_INDEX_NUMBER_LEN % 8 == 0);
+			len = FLAC__STREAM_METADATA_CUESHEET_INDEX_NUMBER_LEN / 8;
+			pack_uint32_(indx->number, buffer, len);
+			if(write_cb(buffer, 1, len, handle) != len)
+				return false;
+
+			FLAC__ASSERT(FLAC__STREAM_METADATA_CUESHEET_INDEX_RESERVED_LEN % 8 == 0);
+			len = FLAC__STREAM_METADATA_CUESHEET_INDEX_RESERVED_LEN / 8;
+			memset(buffer, 0, len);
+			if(write_cb(buffer, 1, len, handle) != len)
+				return false;
+		}
+	}
+
+	return true;
+}
+
+FLAC__bool write_metadata_block_data_picture_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Write write_cb, const FLAC__StreamMetadata_Picture *block)
+{
+	unsigned len;
+	size_t slen;
+	FLAC__byte buffer[4]; /* magic number is asserted below */
+
+	FLAC__ASSERT(0 == FLAC__STREAM_METADATA_PICTURE_TYPE_LEN%8);
+	FLAC__ASSERT(0 == FLAC__STREAM_METADATA_PICTURE_MIME_TYPE_LENGTH_LEN%8);
+	FLAC__ASSERT(0 == FLAC__STREAM_METADATA_PICTURE_DESCRIPTION_LENGTH_LEN%8);
+	FLAC__ASSERT(0 == FLAC__STREAM_METADATA_PICTURE_WIDTH_LEN%8);
+	FLAC__ASSERT(0 == FLAC__STREAM_METADATA_PICTURE_HEIGHT_LEN%8);
+	FLAC__ASSERT(0 == FLAC__STREAM_METADATA_PICTURE_DEPTH_LEN%8);
+	FLAC__ASSERT(0 == FLAC__STREAM_METADATA_PICTURE_COLORS_LEN%8);
+	FLAC__ASSERT(0 == FLAC__STREAM_METADATA_PICTURE_DATA_LENGTH_LEN%8);
+	FLAC__ASSERT(sizeof(buffer) >= FLAC__STREAM_METADATA_PICTURE_TYPE_LEN/8);
+	FLAC__ASSERT(sizeof(buffer) >= FLAC__STREAM_METADATA_PICTURE_MIME_TYPE_LENGTH_LEN/8);
+	FLAC__ASSERT(sizeof(buffer) >= FLAC__STREAM_METADATA_PICTURE_DESCRIPTION_LENGTH_LEN/8);
+	FLAC__ASSERT(sizeof(buffer) >= FLAC__STREAM_METADATA_PICTURE_WIDTH_LEN/8);
+	FLAC__ASSERT(sizeof(buffer) >= FLAC__STREAM_METADATA_PICTURE_HEIGHT_LEN/8);
+	FLAC__ASSERT(sizeof(buffer) >= FLAC__STREAM_METADATA_PICTURE_DEPTH_LEN/8);
+	FLAC__ASSERT(sizeof(buffer) >= FLAC__STREAM_METADATA_PICTURE_COLORS_LEN/8);
+	FLAC__ASSERT(sizeof(buffer) >= FLAC__STREAM_METADATA_PICTURE_DATA_LENGTH_LEN/8);
+
+	len = FLAC__STREAM_METADATA_PICTURE_TYPE_LEN/8;
+	pack_uint32_(block->type, buffer, len);
+	if(write_cb(buffer, 1, len, handle) != len)
+		return false;
+
+	len = FLAC__STREAM_METADATA_PICTURE_MIME_TYPE_LENGTH_LEN/8;
+	slen = strlen(block->mime_type);
+	pack_uint32_(slen, buffer, len);
+	if(write_cb(buffer, 1, len, handle) != len)
+		return false;
+	if(write_cb(block->mime_type, 1, slen, handle) != slen)
+		return false;
+
+	len = FLAC__STREAM_METADATA_PICTURE_DESCRIPTION_LENGTH_LEN/8;
+	slen = strlen((const char *)block->description);
+	pack_uint32_(slen, buffer, len);
+	if(write_cb(buffer, 1, len, handle) != len)
+		return false;
+	if(write_cb(block->description, 1, slen, handle) != slen)
+		return false;
+
+	len = FLAC__STREAM_METADATA_PICTURE_WIDTH_LEN/8;
+	pack_uint32_(block->width, buffer, len);
+	if(write_cb(buffer, 1, len, handle) != len)
+		return false;
+
+	len = FLAC__STREAM_METADATA_PICTURE_HEIGHT_LEN/8;
+	pack_uint32_(block->height, buffer, len);
+	if(write_cb(buffer, 1, len, handle) != len)
+		return false;
+
+	len = FLAC__STREAM_METADATA_PICTURE_DEPTH_LEN/8;
+	pack_uint32_(block->depth, buffer, len);
+	if(write_cb(buffer, 1, len, handle) != len)
+		return false;
+
+	len = FLAC__STREAM_METADATA_PICTURE_COLORS_LEN/8;
+	pack_uint32_(block->colors, buffer, len);
+	if(write_cb(buffer, 1, len, handle) != len)
+		return false;
+
+	len = FLAC__STREAM_METADATA_PICTURE_DATA_LENGTH_LEN/8;
+	pack_uint32_(block->data_length, buffer, len);
+	if(write_cb(buffer, 1, len, handle) != len)
+		return false;
+	if(write_cb(block->data, 1, block->data_length, handle) != block->data_length)
+		return false;
+
+	return true;
+}
+
+FLAC__bool write_metadata_block_data_unknown_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Write write_cb, const FLAC__StreamMetadata_Unknown *block, unsigned block_length)
+{
+	if(write_cb(block->data, 1, block_length, handle) != block_length)
+		return false;
+
+	return true;
+}
+
+FLAC__bool write_metadata_block_stationary_(FLAC__Metadata_SimpleIterator *iterator, const FLAC__StreamMetadata *block)
+{
+	if(0 != fseeko(iterator->file, iterator->offset[iterator->depth], SEEK_SET)) {
+		iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_SEEK_ERROR;
+		return false;
+	}
+
+	if(!write_metadata_block_header_(iterator->file, &iterator->status, block))
+		return false;
+
+	if(!write_metadata_block_data_(iterator->file, &iterator->status, block))
+		return false;
+
+	if(0 != fseeko(iterator->file, iterator->offset[iterator->depth], SEEK_SET)) {
+		iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_SEEK_ERROR;
+		return false;
+	}
+
+	return read_metadata_block_header_(iterator);
+}
+
+FLAC__bool write_metadata_block_stationary_with_padding_(FLAC__Metadata_SimpleIterator *iterator, FLAC__StreamMetadata *block, unsigned padding_length, FLAC__bool padding_is_last)
+{
+	FLAC__StreamMetadata *padding;
+
+	if(0 != fseeko(iterator->file, iterator->offset[iterator->depth], SEEK_SET)) {
+		iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_SEEK_ERROR;
+		return false;
+	}
+
+	block->is_last = false;
+
+	if(!write_metadata_block_header_(iterator->file, &iterator->status, block))
+		return false;
+
+	if(!write_metadata_block_data_(iterator->file, &iterator->status, block))
+		return false;
+
+	if(0 == (padding = FLAC__metadata_object_new(FLAC__METADATA_TYPE_PADDING)))
+		return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_MEMORY_ALLOCATION_ERROR;
+
+	padding->is_last = padding_is_last;
+	padding->length = padding_length;
+
+	if(!write_metadata_block_header_(iterator->file, &iterator->status, padding)) {
+		FLAC__metadata_object_delete(padding);
+		return false;
+	}
+
+	if(!write_metadata_block_data_(iterator->file, &iterator->status, padding)) {
+		FLAC__metadata_object_delete(padding);
+		return false;
+	}
+
+	FLAC__metadata_object_delete(padding);
+
+	if(0 != fseeko(iterator->file, iterator->offset[iterator->depth], SEEK_SET)) {
+		iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_SEEK_ERROR;
+		return false;
+	}
+
+	return read_metadata_block_header_(iterator);
+}
+
+FLAC__bool rewrite_whole_file_(FLAC__Metadata_SimpleIterator *iterator, FLAC__StreamMetadata *block, FLAC__bool append)
+{
+	FILE *tempfile = NULL;
+	char *tempfilename = NULL;
+	int fixup_is_last_code = 0; /* 0 => no need to change any is_last flags */
+	FLAC__off_t fixup_is_last_flag_offset = -1;
+
+	FLAC__ASSERT(0 != block || append == false);
+
+	if(iterator->is_last) {
+		if(append) {
+			fixup_is_last_code = 1; /* 1 => clear the is_last flag at the following offset */
+			fixup_is_last_flag_offset = iterator->offset[iterator->depth];
+		}
+		else if(0 == block) {
+			simple_iterator_push_(iterator);
+			if(!FLAC__metadata_simple_iterator_prev(iterator)) {
+				(void)simple_iterator_pop_(iterator);
+				return false;
+			}
+			fixup_is_last_code = -1; /* -1 => set the is_last the flag at the following offset */
+			fixup_is_last_flag_offset = iterator->offset[iterator->depth];
+			if(!simple_iterator_pop_(iterator))
+				return false;
+		}
+	}
+
+	if(!simple_iterator_copy_file_prefix_(iterator, &tempfile, &tempfilename, append))
+		return false;
+
+	if(0 != block) {
+		if(!write_metadata_block_header_(tempfile, &iterator->status, block)) {
+			cleanup_tempfile_(&tempfile, &tempfilename);
+			return false;
+		}
+
+		if(!write_metadata_block_data_(tempfile, &iterator->status, block)) {
+			cleanup_tempfile_(&tempfile, &tempfilename);
+			return false;
+		}
+	}
+
+	if(!simple_iterator_copy_file_postfix_(iterator, &tempfile, &tempfilename, fixup_is_last_code, fixup_is_last_flag_offset, block==0))
+		return false;
+
+	if(append)
+		return FLAC__metadata_simple_iterator_next(iterator);
+
+	return true;
+}
+
+void simple_iterator_push_(FLAC__Metadata_SimpleIterator *iterator)
+{
+	FLAC__ASSERT(iterator->depth+1 < SIMPLE_ITERATOR_MAX_PUSH_DEPTH);
+	iterator->offset[iterator->depth+1] = iterator->offset[iterator->depth];
+	iterator->depth++;
+}
+
+FLAC__bool simple_iterator_pop_(FLAC__Metadata_SimpleIterator *iterator)
+{
+	FLAC__ASSERT(iterator->depth > 0);
+	iterator->depth--;
+	if(0 != fseeko(iterator->file, iterator->offset[iterator->depth], SEEK_SET)) {
+		iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_SEEK_ERROR;
+		return false;
+	}
+
+	return read_metadata_block_header_(iterator);
+}
+
+/* return meanings:
+ * 0: ok
+ * 1: read error
+ * 2: seek error
+ * 3: not a FLAC file
+ */
+unsigned seek_to_first_metadata_block_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Read read_cb, FLAC__IOCallback_Seek seek_cb)
+{
+	FLAC__byte buffer[4];
+	size_t n;
+	unsigned i;
+
+	FLAC__ASSERT(FLAC__STREAM_SYNC_LENGTH == sizeof(buffer));
+
+	/* skip any id3v2 tag */
+	errno = 0;
+	n = read_cb(buffer, 1, 4, handle);
+	if(errno)
+		return 1;
+	else if(n != 4)
+		return 3;
+	else if(0 == memcmp(buffer, "ID3", 3)) {
+		unsigned tag_length = 0;
+
+		/* skip to the tag length */
+		if(seek_cb(handle, 2, SEEK_CUR) < 0)
+			return 2;
+
+		/* read the length */
+		for(i = 0; i < 4; i++) {
+			if(read_cb(buffer, 1, 1, handle) < 1 || buffer[0] & 0x80)
+				return 1;
+			tag_length <<= 7;
+			tag_length |= (buffer[0] & 0x7f);
+		}
+
+		/* skip the rest of the tag */
+		if(seek_cb(handle, tag_length, SEEK_CUR) < 0)
+			return 2;
+
+		/* read the stream sync code */
+		errno = 0;
+		n = read_cb(buffer, 1, 4, handle);
+		if(errno)
+			return 1;
+		else if(n != 4)
+			return 3;
+	}
+
+	/* check for the fLaC signature */
+	if(0 == memcmp(FLAC__STREAM_SYNC_STRING, buffer, FLAC__STREAM_SYNC_LENGTH))
+		return 0;
+	else
+		return 3;
+}
+
+unsigned seek_to_first_metadata_block_(FILE *f)
+{
+	return seek_to_first_metadata_block_cb_((FLAC__IOHandle)f, (FLAC__IOCallback_Read)fread, fseek_wrapper_);
+}
+
+FLAC__bool simple_iterator_copy_file_prefix_(FLAC__Metadata_SimpleIterator *iterator, FILE **tempfile, char **tempfilename, FLAC__bool append)
+{
+	const FLAC__off_t offset_end = append? iterator->offset[iterator->depth] + (FLAC__off_t)FLAC__STREAM_METADATA_HEADER_LENGTH + (FLAC__off_t)iterator->length : iterator->offset[iterator->depth];
+
+	if(0 != fseeko(iterator->file, 0, SEEK_SET)) {
+		iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_SEEK_ERROR;
+		return false;
+	}
+	if(!open_tempfile_(iterator->filename, iterator->tempfile_path_prefix, tempfile, tempfilename, &iterator->status)) {
+		cleanup_tempfile_(tempfile, tempfilename);
+		return false;
+	}
+	if(!copy_n_bytes_from_file_(iterator->file, *tempfile, offset_end, &iterator->status)) {
+		cleanup_tempfile_(tempfile, tempfilename);
+		return false;
+	}
+
+	return true;
+}
+
+FLAC__bool simple_iterator_copy_file_postfix_(FLAC__Metadata_SimpleIterator *iterator, FILE **tempfile, char **tempfilename, int fixup_is_last_code, FLAC__off_t fixup_is_last_flag_offset, FLAC__bool backup)
+{
+	FLAC__off_t save_offset = iterator->offset[iterator->depth];
+	FLAC__ASSERT(0 != *tempfile);
+
+	if(0 != fseeko(iterator->file, save_offset + (FLAC__off_t)FLAC__STREAM_METADATA_HEADER_LENGTH + (FLAC__off_t)iterator->length, SEEK_SET)) {
+		cleanup_tempfile_(tempfile, tempfilename);
+		iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_SEEK_ERROR;
+		return false;
+	}
+	if(!copy_remaining_bytes_from_file_(iterator->file, *tempfile, &iterator->status)) {
+		cleanup_tempfile_(tempfile, tempfilename);
+		return false;
+	}
+
+	if(fixup_is_last_code != 0) {
+		/*
+		 * if code == 1, it means a block was appended to the end so
+		 *   we have to clear the is_last flag of the previous block
+		 * if code == -1, it means the last block was deleted so
+		 *   we have to set the is_last flag of the previous block
+		 */
+		/* MAGIC NUMBERs here; we know the is_last flag is the high bit of the byte at this location */
+		FLAC__byte x;
+		if(0 != fseeko(*tempfile, fixup_is_last_flag_offset, SEEK_SET)) {
+			cleanup_tempfile_(tempfile, tempfilename);
+			iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_SEEK_ERROR;
+			return false;
+		}
+		if(fread(&x, 1, 1, *tempfile) != 1) {
+			cleanup_tempfile_(tempfile, tempfilename);
+			iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+			return false;
+		}
+		if(fixup_is_last_code > 0) {
+			FLAC__ASSERT(x & 0x80);
+			x &= 0x7f;
+		}
+		else {
+			FLAC__ASSERT(!(x & 0x80));
+			x |= 0x80;
+		}
+		if(0 != fseeko(*tempfile, fixup_is_last_flag_offset, SEEK_SET)) {
+			cleanup_tempfile_(tempfile, tempfilename);
+			iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_SEEK_ERROR;
+			return false;
+		}
+		if(local__fwrite(&x, 1, 1, *tempfile) != 1) {
+			cleanup_tempfile_(tempfile, tempfilename);
+			iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_WRITE_ERROR;
+			return false;
+		}
+	}
+
+	(void)fclose(iterator->file);
+
+	if(!transport_tempfile_(iterator->filename, tempfile, tempfilename, &iterator->status))
+		return false;
+
+	if(iterator->has_stats)
+		set_file_stats_(iterator->filename, &iterator->stats);
+
+	if(!simple_iterator_prime_input_(iterator, !iterator->is_writable))
+		return false;
+	if(backup) {
+		while(iterator->offset[iterator->depth] + (FLAC__off_t)FLAC__STREAM_METADATA_HEADER_LENGTH + (FLAC__off_t)iterator->length < save_offset)
+			if(!FLAC__metadata_simple_iterator_next(iterator))
+				return false;
+		return true;
+	}
+	else {
+		/* move the iterator to it's original block faster by faking a push, then doing a pop_ */
+		FLAC__ASSERT(iterator->depth == 0);
+		iterator->offset[0] = save_offset;
+		iterator->depth++;
+		return simple_iterator_pop_(iterator);
+	}
+}
+
+FLAC__bool copy_n_bytes_from_file_(FILE *file, FILE *tempfile, FLAC__off_t bytes, FLAC__Metadata_SimpleIteratorStatus *status)
+{
+	FLAC__byte buffer[8192];
+	size_t n;
+
+	FLAC__ASSERT(bytes >= 0);
+	while(bytes > 0) {
+		n = flac_min(sizeof(buffer), (size_t)bytes);
+		if(fread(buffer, 1, n, file) != n) {
+			*status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+			return false;
+		}
+		if(local__fwrite(buffer, 1, n, tempfile) != n) {
+			*status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_WRITE_ERROR;
+			return false;
+		}
+		bytes -= n;
+	}
+
+	return true;
+}
+
+FLAC__bool copy_n_bytes_from_file_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Read read_cb, FLAC__IOHandle temp_handle, FLAC__IOCallback_Write temp_write_cb, FLAC__off_t bytes, FLAC__Metadata_SimpleIteratorStatus *status)
+{
+	FLAC__byte buffer[8192];
+	size_t n;
+
+	FLAC__ASSERT(bytes >= 0);
+	while(bytes > 0) {
+		n = flac_min(sizeof(buffer), (size_t)bytes);
+		if(read_cb(buffer, 1, n, handle) != n) {
+			*status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+			return false;
+		}
+		if(temp_write_cb(buffer, 1, n, temp_handle) != n) {
+			*status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_WRITE_ERROR;
+			return false;
+		}
+		bytes -= n;
+	}
+
+	return true;
+}
+
+FLAC__bool copy_remaining_bytes_from_file_(FILE *file, FILE *tempfile, FLAC__Metadata_SimpleIteratorStatus *status)
+{
+	FLAC__byte buffer[8192];
+	size_t n;
+
+	while(!feof(file)) {
+		n = fread(buffer, 1, sizeof(buffer), file);
+		if(n == 0 && !feof(file)) {
+			*status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+			return false;
+		}
+		if(n > 0 && local__fwrite(buffer, 1, n, tempfile) != n) {
+			*status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_WRITE_ERROR;
+			return false;
+		}
+	}
+
+	return true;
+}
+
+FLAC__bool copy_remaining_bytes_from_file_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Read read_cb, FLAC__IOCallback_Eof eof_cb, FLAC__IOHandle temp_handle, FLAC__IOCallback_Write temp_write_cb, FLAC__Metadata_SimpleIteratorStatus *status)
+{
+	FLAC__byte buffer[8192];
+	size_t n;
+
+	while(!eof_cb(handle)) {
+		n = read_cb(buffer, 1, sizeof(buffer), handle);
+		if(n == 0 && !eof_cb(handle)) {
+			*status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+			return false;
+		}
+		if(n > 0 && temp_write_cb(buffer, 1, n, temp_handle) != n) {
+			*status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_WRITE_ERROR;
+			return false;
+		}
+	}
+
+	return true;
+}
+
+static int
+local_snprintf(char *str, size_t size, const char *fmt, ...)
+{
+	va_list va;
+	int rc;
+
+#if defined _MSC_VER
+	if (size == 0)
+		return 1024;
+#endif
+
+	va_start (va, fmt);
+
+#if defined _MSC_VER
+	rc = vsnprintf_s (str, size, _TRUNCATE, fmt, va);
+	if (rc < 0)
+		rc = size - 1;
+#elif defined __MINGW32__
+	rc = __mingw_vsnprintf (str, size, fmt, va);
+#else
+	rc = vsnprintf (str, size, fmt, va);
+#endif
+	va_end (va);
+
+	return rc;
+}
+
+FLAC__bool open_tempfile_(const char *filename, const char *tempfile_path_prefix, FILE **tempfile, char **tempfilename, FLAC__Metadata_SimpleIteratorStatus *status)
+{
+	static const char *tempfile_suffix = ".metadata_edit";
+	if(0 == tempfile_path_prefix) {
+		size_t dest_len = strlen(filename) + strlen(tempfile_suffix) + 1;
+		if(0 == (*tempfilename = safe_malloc_(dest_len))) {
+			*status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_MEMORY_ALLOCATION_ERROR;
+			return false;
+		}
+		local_snprintf(*tempfilename, dest_len, "%s%s", filename, tempfile_suffix);
+	}
+	else {
+		const char *p = strrchr(filename, '/');
+		size_t dest_len;
+		if(0 == p)
+			p = filename;
+		else
+			p++;
+
+		dest_len = strlen(tempfile_path_prefix) + strlen(p) + strlen(tempfile_suffix) + 2;
+
+		if(0 == (*tempfilename = safe_malloc_(dest_len))) {
+			*status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_MEMORY_ALLOCATION_ERROR;
+			return false;
+		}
+		local_snprintf(*tempfilename, dest_len, "%s/%s%s", tempfile_path_prefix, p, tempfile_suffix);
+	}
+
+	if(0 == (*tempfile = flac_fopen(*tempfilename, "w+b"))) {
+		*status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_ERROR_OPENING_FILE;
+		return false;
+	}
+
+	return true;
+}
+
+FLAC__bool transport_tempfile_(const char *filename, FILE **tempfile, char **tempfilename, FLAC__Metadata_SimpleIteratorStatus *status)
+{
+	FLAC__ASSERT(0 != filename);
+	FLAC__ASSERT(0 != tempfile);
+	FLAC__ASSERT(0 != *tempfile);
+	FLAC__ASSERT(0 != tempfilename);
+	FLAC__ASSERT(0 != *tempfilename);
+	FLAC__ASSERT(0 != status);
+
+	(void)fclose(*tempfile);
+	*tempfile = 0;
+
+#if defined _MSC_VER || defined __BORLANDC__ || defined __MINGW32__ || defined __EMX__
+	/* on some flavors of windows, flac_rename() will fail if the destination already exists */
+	if(flac_unlink(filename) < 0) {
+		cleanup_tempfile_(tempfile, tempfilename);
+		*status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_UNLINK_ERROR;
+		return false;
+	}
+#endif
+
+	/*@@@ to fully support the tempfile_path_prefix we need to update this piece to actually copy across filesystems instead of just flac_rename(): */
+	if(0 != flac_rename(*tempfilename, filename)) {
+		cleanup_tempfile_(tempfile, tempfilename);
+		*status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_RENAME_ERROR;
+		return false;
+	}
+
+	cleanup_tempfile_(tempfile, tempfilename);
+
+	return true;
+}
+
+void cleanup_tempfile_(FILE **tempfile, char **tempfilename)
+{
+	if(0 != *tempfile) {
+		(void)fclose(*tempfile);
+		*tempfile = 0;
+	}
+
+	if(0 != *tempfilename) {
+		(void)flac_unlink(*tempfilename);
+		free(*tempfilename);
+		*tempfilename = 0;
+	}
+}
+
+FLAC__bool get_file_stats_(const char *filename, struct flac_stat_s *stats)
+{
+	FLAC__ASSERT(0 != filename);
+	FLAC__ASSERT(0 != stats);
+	return (0 == flac_stat(filename, stats));
+}
+
+void set_file_stats_(const char *filename, struct flac_stat_s *stats)
+{
+	struct utimbuf srctime;
+
+	FLAC__ASSERT(0 != filename);
+	FLAC__ASSERT(0 != stats);
+
+	srctime.actime = stats->st_atime;
+	srctime.modtime = stats->st_mtime;
+	(void)flac_chmod(filename, stats->st_mode);
+	(void)flac_utime(filename, &srctime);
+#if !defined _MSC_VER && !defined __BORLANDC__ && !defined __MINGW32__
+	FLAC_CHECK_RETURN(chown(filename, stats->st_uid, -1));
+	FLAC_CHECK_RETURN(chown(filename, -1, stats->st_gid));
+#endif
+}
+
+int fseek_wrapper_(FLAC__IOHandle handle, FLAC__int64 offset, int whence)
+{
+	return fseeko((FILE*)handle, (FLAC__off_t)offset, whence);
+}
+
+FLAC__int64 ftell_wrapper_(FLAC__IOHandle handle)
+{
+	return ftello((FILE*)handle);
+}
+
+FLAC__Metadata_ChainStatus get_equivalent_status_(FLAC__Metadata_SimpleIteratorStatus status)
+{
+	switch(status) {
+		case FLAC__METADATA_SIMPLE_ITERATOR_STATUS_OK:
+			return FLAC__METADATA_CHAIN_STATUS_OK;
+		case FLAC__METADATA_SIMPLE_ITERATOR_STATUS_ILLEGAL_INPUT:
+			return FLAC__METADATA_CHAIN_STATUS_ILLEGAL_INPUT;
+		case FLAC__METADATA_SIMPLE_ITERATOR_STATUS_ERROR_OPENING_FILE:
+			return FLAC__METADATA_CHAIN_STATUS_ERROR_OPENING_FILE;
+		case FLAC__METADATA_SIMPLE_ITERATOR_STATUS_NOT_A_FLAC_FILE:
+			return FLAC__METADATA_CHAIN_STATUS_NOT_A_FLAC_FILE;
+		case FLAC__METADATA_SIMPLE_ITERATOR_STATUS_NOT_WRITABLE:
+			return FLAC__METADATA_CHAIN_STATUS_NOT_WRITABLE;
+		case FLAC__METADATA_SIMPLE_ITERATOR_STATUS_BAD_METADATA:
+			return FLAC__METADATA_CHAIN_STATUS_BAD_METADATA;
+		case FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR:
+			return FLAC__METADATA_CHAIN_STATUS_READ_ERROR;
+		case FLAC__METADATA_SIMPLE_ITERATOR_STATUS_SEEK_ERROR:
+			return FLAC__METADATA_CHAIN_STATUS_SEEK_ERROR;
+		case FLAC__METADATA_SIMPLE_ITERATOR_STATUS_WRITE_ERROR:
+			return FLAC__METADATA_CHAIN_STATUS_WRITE_ERROR;
+		case FLAC__METADATA_SIMPLE_ITERATOR_STATUS_RENAME_ERROR:
+			return FLAC__METADATA_CHAIN_STATUS_RENAME_ERROR;
+		case FLAC__METADATA_SIMPLE_ITERATOR_STATUS_UNLINK_ERROR:
+			return FLAC__METADATA_CHAIN_STATUS_UNLINK_ERROR;
+		case FLAC__METADATA_SIMPLE_ITERATOR_STATUS_MEMORY_ALLOCATION_ERROR:
+			return FLAC__METADATA_CHAIN_STATUS_MEMORY_ALLOCATION_ERROR;
+		case FLAC__METADATA_SIMPLE_ITERATOR_STATUS_INTERNAL_ERROR:
+		default:
+			return FLAC__METADATA_CHAIN_STATUS_INTERNAL_ERROR;
+	}
+}
diff --git a/libFLAC/metadata_object.c b/libFLAC/metadata_object.c
new file mode 100644
index 0000000..9cb9501
--- /dev/null
+++ b/libFLAC/metadata_object.c
@@ -0,0 +1,1821 @@
+/* libFLAC - Free Lossless Audio Codec library
+ * Copyright (C) 2001-2009  Josh Coalson
+ * Copyright (C) 2011-2016  Xiph.Org Foundation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * - Neither the name of the Xiph.org Foundation nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#  include <config.h>
+#endif
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "private/metadata.h"
+#include "private/memory.h"
+
+#include "FLAC/assert.h"
+#include "share/alloc.h"
+#include "share/compat.h"
+
+/* Alias the first (in share/alloc.h) to the second (in src/libFLAC/memory.c). */
+#define safe_malloc_mul_2op_ safe_malloc_mul_2op_p
+
+
+/****************************************************************************
+ *
+ * Local routines
+ *
+ ***************************************************************************/
+
+/* copy bytes:
+ *  from = NULL  && bytes = 0
+ *       to <- NULL
+ *  from != NULL && bytes > 0
+ *       to <- copy of from
+ *  else ASSERT
+ * malloc error leaves 'to' unchanged
+ */
+static FLAC__bool copy_bytes_(FLAC__byte **to, const FLAC__byte *from, unsigned bytes)
+{
+	FLAC__ASSERT(to != NULL);
+	if (bytes > 0 && from != NULL) {
+		FLAC__byte *x;
+		if ((x = safe_malloc_(bytes)) == NULL)
+			return false;
+		memcpy(x, from, bytes);
+		*to = x;
+	}
+	else {
+		*to = 0;
+	}
+	return true;
+}
+
+#if 0 /* UNUSED */
+/* like copy_bytes_(), but free()s the original '*to' if the copy succeeds and the original '*to' is non-NULL */
+static FLAC__bool free_copy_bytes_(FLAC__byte **to, const FLAC__byte *from, unsigned bytes)
+{
+	FLAC__byte *copy;
+	FLAC__ASSERT(to != NULL);
+	if (copy_bytes_(&copy, from, bytes)) {
+		free(*to);
+		*to = copy;
+		return true;
+	}
+	else
+		return false;
+}
+#endif
+
+/* reallocate entry to 1 byte larger and add a terminating NUL */
+/* realloc() failure leaves entry unchanged */
+static FLAC__bool ensure_null_terminated_(FLAC__byte **entry, unsigned length)
+{
+	FLAC__byte *x = safe_realloc_add_2op_(*entry, length, /*+*/1);
+	if (x != NULL) {
+		x[length] = '\0';
+		*entry = x;
+		return true;
+	}
+	else
+		return false;
+}
+
+/* copies the NUL-terminated C-string 'from' to '*to', leaving '*to'
+ * unchanged if malloc fails, free()ing the original '*to' if it
+ * succeeds and the original '*to' was not NULL
+ */
+static FLAC__bool copy_cstring_(char **to, const char *from)
+{
+	char *copy = strdup(from);
+	FLAC__ASSERT(to != NULL);
+	if (copy) {
+		free(*to);
+		*to = copy;
+		return true;
+	}
+	else
+		return false;
+}
+
+static FLAC__bool copy_vcentry_(FLAC__StreamMetadata_VorbisComment_Entry *to, const FLAC__StreamMetadata_VorbisComment_Entry *from)
+{
+	to->length = from->length;
+	if (from->entry == 0) {
+		FLAC__ASSERT(from->length == 0);
+		to->entry = 0;
+	}
+	else {
+		FLAC__byte *x;
+		FLAC__ASSERT(from->length > 0);
+		if ((x = safe_malloc_add_2op_(from->length, /*+*/1)) == NULL)
+			return false;
+		memcpy(x, from->entry, from->length);
+		x[from->length] = '\0';
+		to->entry = x;
+	}
+	return true;
+}
+
+static FLAC__bool copy_track_(FLAC__StreamMetadata_CueSheet_Track *to, const FLAC__StreamMetadata_CueSheet_Track *from)
+{
+	memcpy(to, from, sizeof(FLAC__StreamMetadata_CueSheet_Track));
+	if (from->indices == 0) {
+		FLAC__ASSERT(from->num_indices == 0);
+	}
+	else {
+		FLAC__StreamMetadata_CueSheet_Index *x;
+		FLAC__ASSERT(from->num_indices > 0);
+		if ((x = safe_malloc_mul_2op_p(from->num_indices, /*times*/sizeof(FLAC__StreamMetadata_CueSheet_Index))) == NULL)
+			return false;
+		memcpy(x, from->indices, from->num_indices * sizeof(FLAC__StreamMetadata_CueSheet_Index));
+		to->indices = x;
+	}
+	return true;
+}
+
+static void seektable_calculate_length_(FLAC__StreamMetadata *object)
+{
+	FLAC__ASSERT(object != NULL);
+	FLAC__ASSERT(object->type == FLAC__METADATA_TYPE_SEEKTABLE);
+
+	object->length = object->data.seek_table.num_points * FLAC__STREAM_METADATA_SEEKPOINT_LENGTH;
+}
+
+static FLAC__StreamMetadata_SeekPoint *seekpoint_array_new_(unsigned num_points)
+{
+	FLAC__StreamMetadata_SeekPoint *object_array;
+
+	FLAC__ASSERT(num_points > 0);
+
+	object_array = safe_malloc_mul_2op_p(num_points, /*times*/sizeof(FLAC__StreamMetadata_SeekPoint));
+
+	if (object_array != NULL) {
+		unsigned i;
+		for (i = 0; i < num_points; i++) {
+			object_array[i].sample_number = FLAC__STREAM_METADATA_SEEKPOINT_PLACEHOLDER;
+			object_array[i].stream_offset = 0;
+			object_array[i].frame_samples = 0;
+		}
+	}
+
+	return object_array;
+}
+
+static void vorbiscomment_calculate_length_(FLAC__StreamMetadata *object)
+{
+	unsigned i;
+
+	FLAC__ASSERT(object->type == FLAC__METADATA_TYPE_VORBIS_COMMENT);
+
+	object->length = (FLAC__STREAM_METADATA_VORBIS_COMMENT_ENTRY_LENGTH_LEN) / 8;
+	object->length += object->data.vorbis_comment.vendor_string.length;
+	object->length += (FLAC__STREAM_METADATA_VORBIS_COMMENT_NUM_COMMENTS_LEN) / 8;
+	for (i = 0; i < object->data.vorbis_comment.num_comments; i++) {
+		object->length += (FLAC__STREAM_METADATA_VORBIS_COMMENT_ENTRY_LENGTH_LEN / 8);
+		object->length += object->data.vorbis_comment.comments[i].length;
+	}
+}
+
+static FLAC__StreamMetadata_VorbisComment_Entry *vorbiscomment_entry_array_new_(unsigned num_comments)
+{
+	FLAC__ASSERT(num_comments > 0);
+
+	return safe_calloc_(num_comments, sizeof(FLAC__StreamMetadata_VorbisComment_Entry));
+}
+
+static void vorbiscomment_entry_array_delete_(FLAC__StreamMetadata_VorbisComment_Entry *object_array, unsigned num_comments)
+{
+	unsigned i;
+
+	FLAC__ASSERT(object_array != NULL && num_comments > 0);
+
+	for (i = 0; i < num_comments; i++)
+		free(object_array[i].entry);
+
+	free(object_array);
+}
+
+static FLAC__StreamMetadata_VorbisComment_Entry *vorbiscomment_entry_array_copy_(const FLAC__StreamMetadata_VorbisComment_Entry *object_array, unsigned num_comments)
+{
+	FLAC__StreamMetadata_VorbisComment_Entry *return_array;
+
+	FLAC__ASSERT(object_array != NULL);
+	FLAC__ASSERT(num_comments > 0);
+
+	return_array = vorbiscomment_entry_array_new_(num_comments);
+
+	if (return_array != NULL) {
+		unsigned i;
+
+		for (i = 0; i < num_comments; i++) {
+			if (!copy_vcentry_(return_array+i, object_array+i)) {
+				vorbiscomment_entry_array_delete_(return_array, num_comments);
+				return 0;
+			}
+		}
+	}
+
+	return return_array;
+}
+
+static FLAC__bool vorbiscomment_set_entry_(FLAC__StreamMetadata *object, FLAC__StreamMetadata_VorbisComment_Entry *dest, const FLAC__StreamMetadata_VorbisComment_Entry *src, FLAC__bool copy)
+{
+	FLAC__byte *save;
+
+	FLAC__ASSERT(object != NULL);
+	FLAC__ASSERT(dest != NULL);
+	FLAC__ASSERT(src != NULL);
+	FLAC__ASSERT(object->type == FLAC__METADATA_TYPE_VORBIS_COMMENT);
+	FLAC__ASSERT((src->entry != NULL && src->length > 0) || (src->entry == NULL && src->length == 0));
+
+	save = dest->entry;
+
+	if (src->entry != NULL) {
+		if (copy) {
+			/* do the copy first so that if we fail we leave the dest object untouched */
+			if (!copy_vcentry_(dest, src))
+				return false;
+		}
+		else {
+			/* we have to make sure that the string we're taking over is null-terminated */
+
+			/*
+			 * Stripping the const from src->entry is OK since we're taking
+			 * ownership of the pointer.  This is a hack around a deficiency
+			 * in the API where the same function is used for 'copy' and
+			 * 'own', but the source entry is a const pointer.  If we were
+			 * precise, the 'own' flavor would be a separate function with a
+			 * non-const source pointer.  But it's not, so we hack away.
+			 */
+			if (!ensure_null_terminated_((FLAC__byte**)(&src->entry), src->length))
+				return false;
+			*dest = *src;
+		}
+	}
+	else {
+		/* the src is null */
+		*dest = *src;
+	}
+
+	free(save);
+
+	vorbiscomment_calculate_length_(object);
+	return true;
+}
+
+static int vorbiscomment_find_entry_from_(const FLAC__StreamMetadata *object, unsigned offset, const char *field_name, unsigned field_name_length)
+{
+	unsigned i;
+
+	FLAC__ASSERT(object != NULL);
+	FLAC__ASSERT(object->type == FLAC__METADATA_TYPE_VORBIS_COMMENT);
+	FLAC__ASSERT(field_name != NULL);
+
+	for (i = offset; i < object->data.vorbis_comment.num_comments; i++) {
+		if (FLAC__metadata_object_vorbiscomment_entry_matches(object->data.vorbis_comment.comments[i], field_name, field_name_length))
+			return (int)i;
+	}
+
+	return -1;
+}
+
+static void cuesheet_calculate_length_(FLAC__StreamMetadata *object)
+{
+	unsigned i;
+
+	FLAC__ASSERT(object->type == FLAC__METADATA_TYPE_CUESHEET);
+
+	object->length = (
+		FLAC__STREAM_METADATA_CUESHEET_MEDIA_CATALOG_NUMBER_LEN +
+		FLAC__STREAM_METADATA_CUESHEET_LEAD_IN_LEN +
+		FLAC__STREAM_METADATA_CUESHEET_IS_CD_LEN +
+		FLAC__STREAM_METADATA_CUESHEET_RESERVED_LEN +
+		FLAC__STREAM_METADATA_CUESHEET_NUM_TRACKS_LEN
+	) / 8;
+
+	object->length += object->data.cue_sheet.num_tracks * (
+		FLAC__STREAM_METADATA_CUESHEET_TRACK_OFFSET_LEN +
+		FLAC__STREAM_METADATA_CUESHEET_TRACK_NUMBER_LEN +
+		FLAC__STREAM_METADATA_CUESHEET_TRACK_ISRC_LEN +
+		FLAC__STREAM_METADATA_CUESHEET_TRACK_TYPE_LEN +
+		FLAC__STREAM_METADATA_CUESHEET_TRACK_PRE_EMPHASIS_LEN +
+		FLAC__STREAM_METADATA_CUESHEET_TRACK_RESERVED_LEN +
+		FLAC__STREAM_METADATA_CUESHEET_TRACK_NUM_INDICES_LEN
+	) / 8;
+
+	for (i = 0; i < object->data.cue_sheet.num_tracks; i++) {
+		object->length += object->data.cue_sheet.tracks[i].num_indices * (
+			FLAC__STREAM_METADATA_CUESHEET_INDEX_OFFSET_LEN +
+			FLAC__STREAM_METADATA_CUESHEET_INDEX_NUMBER_LEN +
+			FLAC__STREAM_METADATA_CUESHEET_INDEX_RESERVED_LEN
+		) / 8;
+	}
+}
+
+static FLAC__StreamMetadata_CueSheet_Index *cuesheet_track_index_array_new_(unsigned num_indices)
+{
+	FLAC__ASSERT(num_indices > 0);
+
+	return safe_calloc_(num_indices, sizeof(FLAC__StreamMetadata_CueSheet_Index));
+}
+
+static FLAC__StreamMetadata_CueSheet_Track *cuesheet_track_array_new_(unsigned num_tracks)
+{
+	FLAC__ASSERT(num_tracks > 0);
+
+	return safe_calloc_(num_tracks, sizeof(FLAC__StreamMetadata_CueSheet_Track));
+}
+
+static void cuesheet_track_array_delete_(FLAC__StreamMetadata_CueSheet_Track *object_array, unsigned num_tracks)
+{
+	unsigned i;
+
+	FLAC__ASSERT(object_array != NULL && num_tracks > 0);
+
+	for (i = 0; i < num_tracks; i++) {
+		if (object_array[i].indices != 0) {
+			FLAC__ASSERT(object_array[i].num_indices > 0);
+			free(object_array[i].indices);
+		}
+	}
+
+	free(object_array);
+}
+
+static FLAC__StreamMetadata_CueSheet_Track *cuesheet_track_array_copy_(const FLAC__StreamMetadata_CueSheet_Track *object_array, unsigned num_tracks)
+{
+	FLAC__StreamMetadata_CueSheet_Track *return_array;
+
+	FLAC__ASSERT(object_array != NULL);
+	FLAC__ASSERT(num_tracks > 0);
+
+	return_array = cuesheet_track_array_new_(num_tracks);
+
+	if (return_array != NULL) {
+		unsigned i;
+
+		for (i = 0; i < num_tracks; i++) {
+			if (!copy_track_(return_array+i, object_array+i)) {
+				cuesheet_track_array_delete_(return_array, num_tracks);
+				return 0;
+			}
+		}
+	}
+
+	return return_array;
+}
+
+static FLAC__bool cuesheet_set_track_(FLAC__StreamMetadata *object, FLAC__StreamMetadata_CueSheet_Track *dest, const FLAC__StreamMetadata_CueSheet_Track *src, FLAC__bool copy)
+{
+	FLAC__StreamMetadata_CueSheet_Index *save;
+
+	FLAC__ASSERT(object != NULL);
+	FLAC__ASSERT(dest != NULL);
+	FLAC__ASSERT(src != NULL);
+	FLAC__ASSERT(object->type == FLAC__METADATA_TYPE_CUESHEET);
+	FLAC__ASSERT((src->indices != NULL && src->num_indices > 0) || (src->indices == NULL && src->num_indices == 0));
+
+	save = dest->indices;
+
+	/* do the copy first so that if we fail we leave the object untouched */
+	if (copy) {
+		if (!copy_track_(dest, src))
+			return false;
+	}
+	else {
+		*dest = *src;
+	}
+
+	free(save);
+
+	cuesheet_calculate_length_(object);
+	return true;
+}
+
+
+/****************************************************************************
+ *
+ * Metadata object routines
+ *
+ ***************************************************************************/
+
+FLAC_API FLAC__StreamMetadata *FLAC__metadata_object_new(FLAC__MetadataType type)
+{
+	FLAC__StreamMetadata *object;
+
+	if (type > FLAC__MAX_METADATA_TYPE)
+		return 0;
+
+	object = calloc(1, sizeof(FLAC__StreamMetadata));
+	if (object != NULL) {
+		object->is_last = false;
+		object->type = type;
+		switch(type) {
+			case FLAC__METADATA_TYPE_STREAMINFO:
+				object->length = FLAC__STREAM_METADATA_STREAMINFO_LENGTH;
+				break;
+			case FLAC__METADATA_TYPE_PADDING:
+				/* calloc() took care of this for us:
+				object->length = 0;
+				*/
+				break;
+			case FLAC__METADATA_TYPE_APPLICATION:
+				object->length = FLAC__STREAM_METADATA_APPLICATION_ID_LEN / 8;
+				/* calloc() took care of this for us:
+				object->data.application.data = 0;
+				*/
+				break;
+			case FLAC__METADATA_TYPE_SEEKTABLE:
+				/* calloc() took care of this for us:
+				object->length = 0;
+				object->data.seek_table.num_points = 0;
+				object->data.seek_table.points = 0;
+				*/
+				break;
+			case FLAC__METADATA_TYPE_VORBIS_COMMENT:
+				object->data.vorbis_comment.vendor_string.length = (unsigned)strlen(FLAC__VENDOR_STRING);
+				if (!copy_bytes_(&object->data.vorbis_comment.vendor_string.entry, (const FLAC__byte*)FLAC__VENDOR_STRING, object->data.vorbis_comment.vendor_string.length+1)) {
+					free(object);
+					return 0;
+				}
+				vorbiscomment_calculate_length_(object);
+				break;
+			case FLAC__METADATA_TYPE_CUESHEET:
+				cuesheet_calculate_length_(object);
+				break;
+			case FLAC__METADATA_TYPE_PICTURE:
+				object->length = (
+					FLAC__STREAM_METADATA_PICTURE_TYPE_LEN +
+					FLAC__STREAM_METADATA_PICTURE_MIME_TYPE_LENGTH_LEN + /* empty mime_type string */
+					FLAC__STREAM_METADATA_PICTURE_DESCRIPTION_LENGTH_LEN + /* empty description string */
+					FLAC__STREAM_METADATA_PICTURE_WIDTH_LEN +
+					FLAC__STREAM_METADATA_PICTURE_HEIGHT_LEN +
+					FLAC__STREAM_METADATA_PICTURE_DEPTH_LEN +
+					FLAC__STREAM_METADATA_PICTURE_COLORS_LEN +
+					FLAC__STREAM_METADATA_PICTURE_DATA_LENGTH_LEN +
+					0 /* no data */
+				) / 8;
+				object->data.picture.type = FLAC__STREAM_METADATA_PICTURE_TYPE_OTHER;
+				object->data.picture.mime_type = 0;
+				object->data.picture.description = 0;
+				/* calloc() took care of this for us:
+				object->data.picture.width = 0;
+				object->data.picture.height = 0;
+				object->data.picture.depth = 0;
+				object->data.picture.colors = 0;
+				object->data.picture.data_length = 0;
+				object->data.picture.data = 0;
+				*/
+				/* now initialize mime_type and description with empty strings to make things easier on the client */
+				if (!copy_cstring_(&object->data.picture.mime_type, "")) {
+					free(object);
+					return 0;
+				}
+				if (!copy_cstring_((char**)(&object->data.picture.description), "")) {
+					free(object->data.picture.mime_type);
+					free(object);
+					return 0;
+				}
+				break;
+			default:
+				/* calloc() took care of this for us:
+				object->length = 0;
+				object->data.unknown.data = 0;
+				*/
+				break;
+		}
+	}
+
+	return object;
+}
+
+FLAC_API FLAC__StreamMetadata *FLAC__metadata_object_clone(const FLAC__StreamMetadata *object)
+{
+	FLAC__StreamMetadata *to;
+
+	FLAC__ASSERT(object != NULL);
+
+	if ((to = FLAC__metadata_object_new(object->type)) != NULL) {
+		to->is_last = object->is_last;
+		to->type = object->type;
+		to->length = object->length;
+		switch(to->type) {
+			case FLAC__METADATA_TYPE_STREAMINFO:
+				memcpy(&to->data.stream_info, &object->data.stream_info, sizeof(FLAC__StreamMetadata_StreamInfo));
+				break;
+			case FLAC__METADATA_TYPE_PADDING:
+				break;
+			case FLAC__METADATA_TYPE_APPLICATION:
+				if (to->length < FLAC__STREAM_METADATA_APPLICATION_ID_LEN / 8) { /* underflow check */
+					FLAC__metadata_object_delete(to);
+					return 0;
+				}
+				memcpy(&to->data.application.id, &object->data.application.id, FLAC__STREAM_METADATA_APPLICATION_ID_LEN / 8);
+				if (!copy_bytes_(&to->data.application.data, object->data.application.data, object->length - FLAC__STREAM_METADATA_APPLICATION_ID_LEN / 8)) {
+					FLAC__metadata_object_delete(to);
+					return 0;
+				}
+				break;
+			case FLAC__METADATA_TYPE_SEEKTABLE:
+				to->data.seek_table.num_points = object->data.seek_table.num_points;
+				if (to->data.seek_table.num_points > UINT32_MAX / sizeof(FLAC__StreamMetadata_SeekPoint)) { /* overflow check */
+					FLAC__metadata_object_delete(to);
+					return 0;
+				}
+				if (!copy_bytes_((FLAC__byte**)&to->data.seek_table.points, (FLAC__byte*)object->data.seek_table.points, object->data.seek_table.num_points * sizeof(FLAC__StreamMetadata_SeekPoint))) {
+					FLAC__metadata_object_delete(to);
+					return 0;
+				}
+				break;
+			case FLAC__METADATA_TYPE_VORBIS_COMMENT:
+				if (to->data.vorbis_comment.vendor_string.entry != NULL) {
+					free(to->data.vorbis_comment.vendor_string.entry);
+					to->data.vorbis_comment.vendor_string.entry = 0;
+				}
+				if (!copy_vcentry_(&to->data.vorbis_comment.vendor_string, &object->data.vorbis_comment.vendor_string)) {
+					FLAC__metadata_object_delete(to);
+					return 0;
+				}
+				if (object->data.vorbis_comment.num_comments == 0) {
+					to->data.vorbis_comment.comments = 0;
+				}
+				else {
+					to->data.vorbis_comment.comments = vorbiscomment_entry_array_copy_(object->data.vorbis_comment.comments, object->data.vorbis_comment.num_comments);
+					if (to->data.vorbis_comment.comments == NULL) {
+						to->data.vorbis_comment.num_comments = 0;
+						FLAC__metadata_object_delete(to);
+						return 0;
+					}
+				}
+				to->data.vorbis_comment.num_comments = object->data.vorbis_comment.num_comments;
+				break;
+			case FLAC__METADATA_TYPE_CUESHEET:
+				memcpy(&to->data.cue_sheet, &object->data.cue_sheet, sizeof(FLAC__StreamMetadata_CueSheet));
+				if (object->data.cue_sheet.num_tracks == 0) {
+					FLAC__ASSERT(object->data.cue_sheet.tracks == NULL);
+				}
+				else {
+					FLAC__ASSERT(object->data.cue_sheet.tracks != 0);
+					to->data.cue_sheet.tracks = cuesheet_track_array_copy_(object->data.cue_sheet.tracks, object->data.cue_sheet.num_tracks);
+					if (to->data.cue_sheet.tracks == NULL) {
+						FLAC__metadata_object_delete(to);
+						return 0;
+					}
+				}
+				break;
+			case FLAC__METADATA_TYPE_PICTURE:
+				to->data.picture.type = object->data.picture.type;
+				if (!copy_cstring_(&to->data.picture.mime_type, object->data.picture.mime_type)) {
+					FLAC__metadata_object_delete(to);
+					return 0;
+				}
+				if (!copy_cstring_((char**)(&to->data.picture.description), (const char*)object->data.picture.description)) {
+					FLAC__metadata_object_delete(to);
+					return 0;
+				}
+				to->data.picture.width = object->data.picture.width;
+				to->data.picture.height = object->data.picture.height;
+				to->data.picture.depth = object->data.picture.depth;
+				to->data.picture.colors = object->data.picture.colors;
+				to->data.picture.data_length = object->data.picture.data_length;
+				if (!copy_bytes_((&to->data.picture.data), object->data.picture.data, object->data.picture.data_length)) {
+					FLAC__metadata_object_delete(to);
+					return 0;
+				}
+				break;
+			default:
+				if (!copy_bytes_(&to->data.unknown.data, object->data.unknown.data, object->length)) {
+					FLAC__metadata_object_delete(to);
+					return 0;
+				}
+				break;
+		}
+	}
+
+	return to;
+}
+
+void FLAC__metadata_object_delete_data(FLAC__StreamMetadata *object)
+{
+	FLAC__ASSERT(object != NULL);
+
+	switch(object->type) {
+		case FLAC__METADATA_TYPE_STREAMINFO:
+		case FLAC__METADATA_TYPE_PADDING:
+			break;
+		case FLAC__METADATA_TYPE_APPLICATION:
+			if (object->data.application.data != NULL) {
+				free(object->data.application.data);
+				object->data.application.data = NULL;
+			}
+			break;
+		case FLAC__METADATA_TYPE_SEEKTABLE:
+			if (object->data.seek_table.points != NULL) {
+				free(object->data.seek_table.points);
+				object->data.seek_table.points = NULL;
+			}
+			break;
+		case FLAC__METADATA_TYPE_VORBIS_COMMENT:
+			if (object->data.vorbis_comment.vendor_string.entry != NULL) {
+				free(object->data.vorbis_comment.vendor_string.entry);
+				object->data.vorbis_comment.vendor_string.entry = 0;
+			}
+			if (object->data.vorbis_comment.comments != NULL) {
+				FLAC__ASSERT(object->data.vorbis_comment.num_comments > 0);
+				vorbiscomment_entry_array_delete_(object->data.vorbis_comment.comments, object->data.vorbis_comment.num_comments);
+				object->data.vorbis_comment.comments = NULL;
+				object->data.vorbis_comment.num_comments = 0;
+			}
+			break;
+		case FLAC__METADATA_TYPE_CUESHEET:
+			if (object->data.cue_sheet.tracks != NULL) {
+				FLAC__ASSERT(object->data.cue_sheet.num_tracks > 0);
+				cuesheet_track_array_delete_(object->data.cue_sheet.tracks, object->data.cue_sheet.num_tracks);
+				object->data.cue_sheet.tracks = NULL;
+				object->data.cue_sheet.num_tracks = 0;
+			}
+			break;
+		case FLAC__METADATA_TYPE_PICTURE:
+			if (object->data.picture.mime_type != NULL) {
+				free(object->data.picture.mime_type);
+				object->data.picture.mime_type = NULL;
+			}
+			if (object->data.picture.description != NULL) {
+				free(object->data.picture.description);
+				object->data.picture.description = NULL;
+			}
+			if (object->data.picture.data != NULL) {
+				free(object->data.picture.data);
+				object->data.picture.data = NULL;
+			}
+			break;
+		default:
+			if (object->data.unknown.data != NULL) {
+				free(object->data.unknown.data);
+				object->data.unknown.data = NULL;
+			}
+			break;
+	}
+}
+
+FLAC_API void FLAC__metadata_object_delete(FLAC__StreamMetadata *object)
+{
+	FLAC__metadata_object_delete_data(object);
+	free(object);
+}
+
+static FLAC__bool compare_block_data_streaminfo_(const FLAC__StreamMetadata_StreamInfo *block1, const FLAC__StreamMetadata_StreamInfo *block2)
+{
+	if (block1->min_blocksize != block2->min_blocksize)
+		return false;
+	if (block1->max_blocksize != block2->max_blocksize)
+		return false;
+	if (block1->min_framesize != block2->min_framesize)
+		return false;
+	if (block1->max_framesize != block2->max_framesize)
+		return false;
+	if (block1->sample_rate != block2->sample_rate)
+		return false;
+	if (block1->channels != block2->channels)
+		return false;
+	if (block1->bits_per_sample != block2->bits_per_sample)
+		return false;
+	if (block1->total_samples != block2->total_samples)
+		return false;
+	if (memcmp(block1->md5sum, block2->md5sum, 16) != 0)
+		return false;
+	return true;
+}
+
+static FLAC__bool compare_block_data_application_(const FLAC__StreamMetadata_Application *block1, const FLAC__StreamMetadata_Application *block2, unsigned block_length)
+{
+	FLAC__ASSERT(block1 != NULL);
+	FLAC__ASSERT(block2 != NULL);
+	FLAC__ASSERT(block_length >= sizeof(block1->id));
+
+	if (memcmp(block1->id, block2->id, sizeof(block1->id)) != 0)
+		return false;
+	if (block1->data != NULL && block2->data != NULL)
+		return memcmp(block1->data, block2->data, block_length - sizeof(block1->id)) == 0;
+	else
+		return block1->data == block2->data;
+}
+
+static FLAC__bool compare_block_data_seektable_(const FLAC__StreamMetadata_SeekTable *block1, const FLAC__StreamMetadata_SeekTable *block2)
+{
+	unsigned i;
+
+	FLAC__ASSERT(block1 != NULL);
+	FLAC__ASSERT(block2 != NULL);
+
+	if (block1->num_points != block2->num_points)
+		return false;
+
+	if (block1->points != NULL && block2->points != NULL) {
+		for (i = 0; i < block1->num_points; i++) {
+			if (block1->points[i].sample_number != block2->points[i].sample_number)
+				return false;
+			if (block1->points[i].stream_offset != block2->points[i].stream_offset)
+				return false;
+			if (block1->points[i].frame_samples != block2->points[i].frame_samples)
+				return false;
+		}
+		return true;
+	}
+	else
+		return block1->points == block2->points;
+}
+
+static FLAC__bool compare_block_data_vorbiscomment_(const FLAC__StreamMetadata_VorbisComment *block1, const FLAC__StreamMetadata_VorbisComment *block2)
+{
+	unsigned i;
+
+	if (block1->vendor_string.length != block2->vendor_string.length)
+		return false;
+
+	if (block1->vendor_string.entry != NULL && block2->vendor_string.entry != NULL) {
+		if (memcmp(block1->vendor_string.entry, block2->vendor_string.entry, block1->vendor_string.length) != 0)
+			return false;
+	}
+	else if (block1->vendor_string.entry != block2->vendor_string.entry)
+		return false;
+
+	if (block1->num_comments != block2->num_comments)
+		return false;
+
+	for (i = 0; i < block1->num_comments; i++) {
+		if (block1->comments[i].entry != NULL && block2->comments[i].entry != NULL) {
+			if (memcmp(block1->comments[i].entry, block2->comments[i].entry, block1->comments[i].length) != 0)
+				return false;
+		}
+		else if (block1->comments[i].entry != block2->comments[i].entry)
+			return false;
+	}
+	return true;
+}
+
+static FLAC__bool compare_block_data_cuesheet_(const FLAC__StreamMetadata_CueSheet *block1, const FLAC__StreamMetadata_CueSheet *block2)
+{
+	unsigned i, j;
+
+	if (strcmp(block1->media_catalog_number, block2->media_catalog_number) != 0)
+		return false;
+
+	if (block1->lead_in != block2->lead_in)
+		return false;
+
+	if (block1->is_cd != block2->is_cd)
+		return false;
+
+	if (block1->num_tracks != block2->num_tracks)
+		return false;
+
+	if (block1->tracks != NULL && block2->tracks != NULL) {
+		FLAC__ASSERT(block1->num_tracks > 0);
+		for (i = 0; i < block1->num_tracks; i++) {
+			if (block1->tracks[i].offset != block2->tracks[i].offset)
+				return false;
+			if (block1->tracks[i].number != block2->tracks[i].number)
+				return false;
+			if (memcmp(block1->tracks[i].isrc, block2->tracks[i].isrc, sizeof(block1->tracks[i].isrc)) != 0)
+				return false;
+			if (block1->tracks[i].type != block2->tracks[i].type)
+				return false;
+			if (block1->tracks[i].pre_emphasis != block2->tracks[i].pre_emphasis)
+				return false;
+			if (block1->tracks[i].num_indices != block2->tracks[i].num_indices)
+				return false;
+			if (block1->tracks[i].indices != NULL && block2->tracks[i].indices != NULL) {
+				FLAC__ASSERT(block1->tracks[i].num_indices > 0);
+				for (j = 0; j < block1->tracks[i].num_indices; j++) {
+					if (block1->tracks[i].indices[j].offset != block2->tracks[i].indices[j].offset)
+						return false;
+					if (block1->tracks[i].indices[j].number != block2->tracks[i].indices[j].number)
+						return false;
+				}
+			}
+			else if (block1->tracks[i].indices != block2->tracks[i].indices)
+				return false;
+		}
+	}
+	else if (block1->tracks != block2->tracks)
+		return false;
+	return true;
+}
+
+static FLAC__bool compare_block_data_picture_(const FLAC__StreamMetadata_Picture *block1, const FLAC__StreamMetadata_Picture *block2)
+{
+	if (block1->type != block2->type)
+		return false;
+	if (block1->mime_type != block2->mime_type && (block1->mime_type == 0 || block2->mime_type == 0 || strcmp(block1->mime_type, block2->mime_type)))
+		return false;
+	if (block1->description != block2->description && (block1->description == 0 || block2->description == 0 || strcmp((const char *)block1->description, (const char *)block2->description)))
+		return false;
+	if (block1->width != block2->width)
+		return false;
+	if (block1->height != block2->height)
+		return false;
+	if (block1->depth != block2->depth)
+		return false;
+	if (block1->colors != block2->colors)
+		return false;
+	if (block1->data_length != block2->data_length)
+		return false;
+	if (block1->data != block2->data && (block1->data == NULL || block2->data == NULL || memcmp(block1->data, block2->data, block1->data_length)))
+		return false;
+	return true;
+}
+
+static FLAC__bool compare_block_data_unknown_(const FLAC__StreamMetadata_Unknown *block1, const FLAC__StreamMetadata_Unknown *block2, unsigned block_length)
+{
+	FLAC__ASSERT(block1 != NULL);
+	FLAC__ASSERT(block2 != NULL);
+
+	if (block1->data != NULL && block2->data != NULL)
+		return memcmp(block1->data, block2->data, block_length) == 0;
+	else
+		return block1->data == block2->data;
+}
+
+FLAC_API FLAC__bool FLAC__metadata_object_is_equal(const FLAC__StreamMetadata *block1, const FLAC__StreamMetadata *block2)
+{
+	FLAC__ASSERT(block1 != NULL);
+	FLAC__ASSERT(block2 != NULL);
+
+	if (block1->type != block2->type) {
+		return false;
+	}
+	if (block1->is_last != block2->is_last) {
+		return false;
+	}
+	if (block1->length != block2->length) {
+		return false;
+	}
+	switch(block1->type) {
+		case FLAC__METADATA_TYPE_STREAMINFO:
+			return compare_block_data_streaminfo_(&block1->data.stream_info, &block2->data.stream_info);
+		case FLAC__METADATA_TYPE_PADDING:
+			return true; /* we don't compare the padding guts */
+		case FLAC__METADATA_TYPE_APPLICATION:
+			return compare_block_data_application_(&block1->data.application, &block2->data.application, block1->length);
+		case FLAC__METADATA_TYPE_SEEKTABLE:
+			return compare_block_data_seektable_(&block1->data.seek_table, &block2->data.seek_table);
+		case FLAC__METADATA_TYPE_VORBIS_COMMENT:
+			return compare_block_data_vorbiscomment_(&block1->data.vorbis_comment, &block2->data.vorbis_comment);
+		case FLAC__METADATA_TYPE_CUESHEET:
+			return compare_block_data_cuesheet_(&block1->data.cue_sheet, &block2->data.cue_sheet);
+		case FLAC__METADATA_TYPE_PICTURE:
+			return compare_block_data_picture_(&block1->data.picture, &block2->data.picture);
+		default:
+			return compare_block_data_unknown_(&block1->data.unknown, &block2->data.unknown, block1->length);
+	}
+}
+
+FLAC_API FLAC__bool FLAC__metadata_object_application_set_data(FLAC__StreamMetadata *object, FLAC__byte *data, unsigned length, FLAC__bool copy)
+{
+	FLAC__byte *save;
+
+	FLAC__ASSERT(object != NULL);
+	FLAC__ASSERT(object->type == FLAC__METADATA_TYPE_APPLICATION);
+	FLAC__ASSERT((data != NULL && length > 0) || (data == NULL && length == 0 && copy == false));
+
+	save = object->data.application.data;
+
+	/* do the copy first so that if we fail we leave the object untouched */
+	if (copy) {
+		if (!copy_bytes_(&object->data.application.data, data, length))
+			return false;
+	}
+	else {
+		object->data.application.data = data;
+	}
+
+	free(save);
+
+	object->length = FLAC__STREAM_METADATA_APPLICATION_ID_LEN / 8 + length;
+	return true;
+}
+
+FLAC_API FLAC__bool FLAC__metadata_object_seektable_resize_points(FLAC__StreamMetadata *object, unsigned new_num_points)
+{
+	FLAC__ASSERT(object != NULL);
+	FLAC__ASSERT(object->type == FLAC__METADATA_TYPE_SEEKTABLE);
+
+	if (object->data.seek_table.points == 0) {
+		FLAC__ASSERT(object->data.seek_table.num_points == 0);
+		if (new_num_points == 0)
+			return true;
+		else if ((object->data.seek_table.points = seekpoint_array_new_(new_num_points)) == 0)
+			return false;
+	}
+	else {
+		const size_t old_size = object->data.seek_table.num_points * sizeof(FLAC__StreamMetadata_SeekPoint);
+		const size_t new_size = new_num_points * sizeof(FLAC__StreamMetadata_SeekPoint);
+
+		/* overflow check */
+		if (new_num_points > UINT32_MAX / sizeof(FLAC__StreamMetadata_SeekPoint))
+			return false;
+
+		FLAC__ASSERT(object->data.seek_table.num_points > 0);
+
+		if (new_size == 0) {
+			free(object->data.seek_table.points);
+			object->data.seek_table.points = 0;
+		}
+		else if ((object->data.seek_table.points = safe_realloc_(object->data.seek_table.points, new_size)) == NULL)
+			return false;
+
+		/* if growing, set new elements to placeholders */
+		if (new_size > old_size) {
+			unsigned i;
+			for (i = object->data.seek_table.num_points; i < new_num_points; i++) {
+				object->data.seek_table.points[i].sample_number = FLAC__STREAM_METADATA_SEEKPOINT_PLACEHOLDER;
+				object->data.seek_table.points[i].stream_offset = 0;
+				object->data.seek_table.points[i].frame_samples = 0;
+			}
+		}
+	}
+
+	object->data.seek_table.num_points = new_num_points;
+
+	seektable_calculate_length_(object);
+	return true;
+}
+
+FLAC_API void FLAC__metadata_object_seektable_set_point(FLAC__StreamMetadata *object, unsigned point_num, FLAC__StreamMetadata_SeekPoint point)
+{
+	FLAC__ASSERT(object != NULL);
+	FLAC__ASSERT(object->type == FLAC__METADATA_TYPE_SEEKTABLE);
+	FLAC__ASSERT(point_num < object->data.seek_table.num_points);
+
+	object->data.seek_table.points[point_num] = point;
+}
+
+FLAC_API FLAC__bool FLAC__metadata_object_seektable_insert_point(FLAC__StreamMetadata *object, unsigned point_num, FLAC__StreamMetadata_SeekPoint point)
+{
+	int i;
+
+	FLAC__ASSERT(object != NULL);
+	FLAC__ASSERT(object->type == FLAC__METADATA_TYPE_SEEKTABLE);
+	FLAC__ASSERT(point_num <= object->data.seek_table.num_points);
+
+	if (!FLAC__metadata_object_seektable_resize_points(object, object->data.seek_table.num_points+1))
+		return false;
+
+	/* move all points >= point_num forward one space */
+	for (i = (int)object->data.seek_table.num_points-1; i > (int)point_num; i--)
+		object->data.seek_table.points[i] = object->data.seek_table.points[i-1];
+
+	FLAC__metadata_object_seektable_set_point(object, point_num, point);
+	seektable_calculate_length_(object);
+	return true;
+}
+
+FLAC_API FLAC__bool FLAC__metadata_object_seektable_delete_point(FLAC__StreamMetadata *object, unsigned poi