Upgrade Opus to v1.1.2

Change-Id: I8211751bab026ab236a612c6e0873f8bdbcd6c98
diff --git a/Android.mk b/Android.mk
index 4134604..3a57f20 100644
--- a/Android.mk
+++ b/Android.mk
@@ -1,146 +1,48 @@
 LOCAL_PATH := $(call my-dir)
 
 include $(CLEAR_VARS)
+
+include $(LOCAL_PATH)/celt_sources.mk
+include $(LOCAL_PATH)/opus_sources.mk
+include $(LOCAL_PATH)/silk_sources.mk
+
 LOCAL_MODULE    := libopus
 OGG_DIR         := external/libogg
 LOCAL_C_INCLUDES += $(LOCAL_PATH)/include $(LOCAL_PATH)/src $(LOCAL_PATH)/silk \
                     $(LOCAL_PATH)/celt $(LOCAL_PATH)/silk/fixed $(OGG_DIR)/include
-LOCAL_SRC_FILES := celt/bands.c \
-                   celt/celt.c \
-                   celt/celt_decoder.c \
-                   celt/celt_encoder.c \
-                   celt/celt_lpc.c \
-                   celt/cwrs.c \
-                   celt/entcode.c \
-                   celt/entdec.c \
-                   celt/entenc.c \
-                   celt/kiss_fft.c \
-                   celt/laplace.c \
-                   celt/mathops.c \
-                   celt/mdct.c \
-                   celt/modes.c \
-                   celt/pitch.c \
-                   celt/quant_bands.c \
-                   celt/rate.c \
-                   celt/vq.c \
-                   silk/A2NLSF.c \
-                   silk/ana_filt_bank_1.c \
-                   silk/biquad_alt.c \
-                   silk/bwexpander_32.c \
-                   silk/bwexpander.c \
-                   silk/check_control_input.c \
-                   silk/CNG.c \
-                   silk/code_signs.c \
-                   silk/control_audio_bandwidth.c \
-                   silk/control_codec.c \
-                   silk/control_SNR.c \
-                   silk/debug.c \
-                   silk/dec_API.c \
-                   silk/decode_core.c \
-                   silk/decode_frame.c \
-                   silk/decode_indices.c \
-                   silk/decode_parameters.c \
-                   silk/decode_pitch.c \
-                   silk/decode_pulses.c \
-                   silk/decoder_set_fs.c \
-                   silk/enc_API.c \
-                   silk/encode_indices.c \
-                   silk/encode_pulses.c \
-                   silk/gain_quant.c \
-                   silk/HP_variable_cutoff.c \
-                   silk/init_decoder.c \
-                   silk/init_encoder.c \
-                   silk/inner_prod_aligned.c \
-                   silk/interpolate.c \
-                   silk/lin2log.c \
-                   silk/log2lin.c \
-                   silk/LPC_analysis_filter.c \
-                   silk/LPC_inv_pred_gain.c \
-                   silk/LP_variable_cutoff.c \
-                   silk/NLSF2A.c \
-                   silk/NLSF_decode.c \
-                   silk/NLSF_del_dec_quant.c \
-                   silk/NLSF_encode.c \
-                   silk/NLSF_stabilize.c \
-                   silk/NLSF_unpack.c \
-                   silk/NLSF_VQ.c \
-                   silk/NLSF_VQ_weights_laroia.c \
-                   silk/NSQ.c \
-                   silk/NSQ_del_dec.c \
-                   silk/pitch_est_tables.c \
-                   silk/PLC.c \
-                   silk/process_NLSFs.c \
-                   silk/quant_LTP_gains.c \
-                   silk/resampler.c \
-                   silk/resampler_down2_3.c \
-                   silk/resampler_down2.c \
-                   silk/resampler_private_AR2.c \
-                   silk/resampler_private_down_FIR.c \
-                   silk/resampler_private_IIR_FIR.c \
-                   silk/resampler_private_up2_HQ.c \
-                   silk/resampler_rom.c \
-                   silk/shell_coder.c \
-                   silk/sigm_Q15.c \
-                   silk/sort.c \
-                   silk/stereo_decode_pred.c \
-                   silk/stereo_encode_pred.c \
-                   silk/stereo_find_predictor.c \
-                   silk/stereo_LR_to_MS.c \
-                   silk/stereo_MS_to_LR.c \
-                   silk/stereo_quant_pred.c \
-                   silk/sum_sqr_shift.c \
-                   silk/table_LSF_cos.c \
-                   silk/tables_gain.c \
-                   silk/tables_LTP.c \
-                   silk/tables_NLSF_CB_NB_MB.c \
-                   silk/tables_NLSF_CB_WB.c \
-                   silk/tables_other.c \
-                   silk/tables_pitch_lag.c \
-                   silk/tables_pulses_per_block.c \
-                   silk/VAD.c \
-                   silk/VQ_WMat_EC.c \
-                   silk/fixed/apply_sine_window_FIX.c \
-                   silk/fixed/autocorr_FIX.c \
-                   silk/fixed/burg_modified_FIX.c \
-                   silk/fixed/corrMatrix_FIX.c \
-                   silk/fixed/encode_frame_FIX.c \
-                   silk/fixed/find_LPC_FIX.c \
-                   silk/fixed/find_LTP_FIX.c \
-                   silk/fixed/find_pitch_lags_FIX.c \
-                   silk/fixed/find_pred_coefs_FIX.c \
-                   silk/fixed/k2a_FIX.c \
-                   silk/fixed/k2a_Q16_FIX.c \
-                   silk/fixed/LTP_analysis_filter_FIX.c \
-                   silk/fixed/LTP_scale_ctrl_FIX.c \
-                   silk/fixed/noise_shape_analysis_FIX.c \
-                   silk/fixed/pitch_analysis_core_FIX.c \
-                   silk/fixed/prefilter_FIX.c \
-                   silk/fixed/process_gains_FIX.c \
-                   silk/fixed/regularize_correlations_FIX.c \
-                   silk/fixed/residual_energy16_FIX.c \
-                   silk/fixed/residual_energy_FIX.c \
-                   silk/fixed/schur64_FIX.c \
-                   silk/fixed/schur_FIX.c \
-                   silk/fixed/solve_LS_FIX.c \
-                   silk/fixed/vector_ops_FIX.c \
-                   silk/fixed/warped_autocorrelation_FIX.c \
-                   src/analysis.c \
-                   src/mlp.c \
-                   src/mlp_data.c \
-                   src/opus.c \
-                   src/opus_decoder.c \
-                   src/opus_encoder.c \
-                   src/opus_multistream.c \
-                   src/opus_multistream_decoder.c \
-                   src/opus_multistream_encoder.c \
-                   src/repacketizer.c \
-                   src/repacketizer_demo.c
+LOCAL_SRC_FILES := $(CELT_SOURCES) $(SILK_SOURCES) $(SILK_SOURCES_FIXED) \
+                   $(OPUS_SOURCES) $(OPUS_SOURCES_FLOAT) src/repacketizer_demo.c
 
 LOCAL_CFLAGS        := -DNULL=0 -DSOCKLEN_T=socklen_t -DLOCALE_NOT_USED \
                        -D_LARGEFILE_SOURCE=1 -D_FILE_OFFSET_BITS=64 \
                        -Drestrict='' -D__EMX__ -DOPUS_BUILD -DFIXED_POINT \
-                       -DUSE_ALLOCA -DHAVE_LRINT -DHAVE_LRINTF -O1 -fno-math-errno
-LOCAL_CPPFLAGS      := -DBSD=1 -ffast-math -O1 -funroll-loops
+                       -DUSE_ALLOCA -DHAVE_LRINT -DHAVE_LRINTF -O2 -fno-math-errno
+LOCAL_CPPFLAGS      := -DBSD=1 -ffast-math -O2 -funroll-loops
+
+# Whole-word ABI match: findstring also accepted plain "armeabi" (a substring of armeabi-v7a).
+# NOTE(review): the 32-bit ARM asm/inline-asm flags may not build for arm64-v8a - confirm.
+ifneq ($(filter $(TARGET_ARCH_ABI),armeabi-v7a arm64-v8a),)
+LOCAL_SRC_FILES += $(CELT_SOURCES_ARM) $(CELT_SOURCES_ARM_NEON_INTR) \
+                   celt/arm/armopts_gnu.s.neon $(patsubst %.s,%_gnu.s.neon,$(CELT_SOURCES_ARM_ASM))
+LOCAL_ARM_NEON := true
+LOCAL_CFLAGS += -DOPUS_ARM_ASM -DOPUS_ARM_INLINE_ASM -DOPUS_ARM_INLINE_EDSP \
+                -DOPUS_ARM_INLINE_MEDIA -DOPUS_ARM_INLINE_NEON \
+                -DOPUS_ARM_MAY_HAVE_NEON -DOPUS_ARM_MAY_HAVE_MEDIA \
+                -DOPUS_ARM_MAY_HAVE_EDSP -DOPUS_ARM_MAY_HAVE_NEON_INTR -DOPUS_HAVE_RTCD \
+                -DOPUS_ARM_PRESUME_EDSP -DOPUS_ARM_PRESUME_MEDIA -DOPUS_ARM_PRESUME_NEON
+endif
+
+# When the target reports SSSE3: build the SSE/SSE2 CELT sources and define their macros.
+ifeq (true,$(ARCH_X86_HAVE_SSSE3))
+LOCAL_SRC_FILES += $(CELT_SOURCES_SSE) $(CELT_SOURCES_SSE2)
+LOCAL_CFLAGS += -DOPUS_X86_MAY_HAVE_SSE -DOPUS_X86_PRESUME_SSE -DOPUS_X86_MAY_HAVE_SSE2 -DOPUS_X86_PRESUME_SSE2
+endif
+
+# When the target reports SSE4.1: add the CELT, SILK and fixed-point SILK SSE4.1 sources.
+ifeq (true,$(ARCH_X86_HAVE_SSE4_1))
+LOCAL_SRC_FILES += $(CELT_SOURCES_SSE4_1) $(SILK_SOURCES_SSE4_1) $(SILK_SOURCES_FIXED_SSE4_1)
+LOCAL_CFLAGS += -DOPUS_X86_MAY_HAVE_SSE4_1 -DOPUS_X86_PRESUME_SSE4_1
+endif
 
 LOCAL_STATIC_LIBRARIES := libogg
 
diff --git a/INSTALL b/INSTALL
index a1e89e1..2099840 100644
--- a/INSTALL
+++ b/INSTALL
@@ -1,7 +1,7 @@
 Installation Instructions
 *************************
 
-Copyright (C) 1994-1996, 1999-2002, 2004-2011 Free Software Foundation,
+Copyright (C) 1994-1996, 1999-2002, 2004-2013 Free Software Foundation,
 Inc.
 
    Copying and distribution of this file, with or without modification,
@@ -12,8 +12,8 @@
 Basic Installation
 ==================
 
-   Briefly, the shell commands `./configure; make; make install' should
-configure, build, and install this package.  The following
+   Briefly, the shell command `./configure && make && make install'
+should configure, build, and install this package.  The following
 more-detailed instructions are generic; see the `README' file for
 instructions specific to this package.  Some packages provide this
 `INSTALL' file but do not implement all of the features documented
@@ -309,9 +309,10 @@
 overridden in the site shell script).
 
 Unfortunately, this technique does not work for `CONFIG_SHELL' due to
-an Autoconf bug.  Until the bug is fixed you can use this workaround:
+an Autoconf limitation.  Until the limitation is lifted, you can use
+this workaround:
 
-     CONFIG_SHELL=/bin/bash /bin/bash ./configure CONFIG_SHELL=/bin/bash
+     CONFIG_SHELL=/bin/bash ./configure CONFIG_SHELL=/bin/bash
 
 `configure' Invocation
 ======================
@@ -367,4 +368,3 @@
 
 `configure' also accepts some other, not widely useful, options.  Run
 `configure --help' for more details.
-
diff --git a/Makefile.am b/Makefile.am
index c39d803..4d3a888 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -10,7 +10,7 @@
 DIST_SUBDIRS = doc
 
 AM_CPPFLAGS = -I$(top_srcdir)/include -I$(top_srcdir)/celt -I$(top_srcdir)/silk \
-              -I$(top_srcdir)/silk/float -I$(top_srcdir)/silk/fixed
+              -I$(top_srcdir)/silk/float -I$(top_srcdir)/silk/fixed $(NE10_CFLAGS)
 
 include celt_sources.mk
 include silk_sources.mk
@@ -18,8 +18,14 @@
 
 if FIXED_POINT
 SILK_SOURCES += $(SILK_SOURCES_FIXED)
+if HAVE_SSE4_1
+SILK_SOURCES += $(SILK_SOURCES_SSE4_1) $(SILK_SOURCES_FIXED_SSE4_1)
+endif
 else
 SILK_SOURCES += $(SILK_SOURCES_FLOAT)
+if HAVE_SSE4_1
+SILK_SOURCES += $(SILK_SOURCES_SSE4_1)
+endif
 endif
 
 if DISABLE_FLOAT_API
@@ -27,11 +33,31 @@
 OPUS_SOURCES += $(OPUS_SOURCES_FLOAT)
 endif
 
+if HAVE_SSE
+CELT_SOURCES += $(CELT_SOURCES_SSE)
+endif
+if HAVE_SSE2
+CELT_SOURCES += $(CELT_SOURCES_SSE2)
+endif
+if HAVE_SSE4_1
+CELT_SOURCES += $(CELT_SOURCES_SSE4_1)
+endif
+
 if CPU_ARM
 CELT_SOURCES += $(CELT_SOURCES_ARM)
 SILK_SOURCES += $(SILK_SOURCES_ARM)
+
+if OPUS_ARM_NEON_INTR
+CELT_SOURCES += $(CELT_SOURCES_ARM_NEON_INTR)
+endif
+
+if HAVE_ARM_NE10
+CELT_SOURCES += $(CELT_SOURCES_ARM_NE10)
+endif
+
 if OPUS_ARM_EXTERNAL_ASM
-nodist_libopus_la_SOURCES = $(CELT_SOURCES_ARM_ASM:.s=-gnu.S)
+noinst_LTLIBRARIES = libarmasm.la
+libarmasm_la_SOURCES = $(CELT_SOURCES_ARM_ASM:.s=-gnu.S)
 BUILT_SOURCES = $(CELT_SOURCES_ARM_ASM:.s=-gnu.S) \
  $(CELT_AM_SOURCES_ARM_ASM:.s.in=.s) \
  $(CELT_AM_SOURCES_ARM_ASM:.s.in=-gnu.S)
@@ -47,7 +73,10 @@
 
 libopus_la_SOURCES = $(CELT_SOURCES) $(SILK_SOURCES) $(OPUS_SOURCES)
 libopus_la_LDFLAGS = -no-undefined -version-info @OPUS_LT_CURRENT@:@OPUS_LT_REVISION@:@OPUS_LT_AGE@
-libopus_la_LIBADD = $(LIBM)
+libopus_la_LIBADD = $(NE10_LIBS) $(LIBM)
+if OPUS_ARM_EXTERNAL_ASM
+libopus_la_LIBADD += libarmasm.la
+endif
 
 pkginclude_HEADERS = include/opus.h include/opus_multistream.h include/opus_types.h include/opus_defines.h
 
@@ -60,32 +89,35 @@
 
 opus_demo_SOURCES = src/opus_demo.c
 
-opus_demo_LDADD = libopus.la $(LIBM)
+opus_demo_LDADD = libopus.la $(NE10_LIBS) $(LIBM)
 
 repacketizer_demo_SOURCES = src/repacketizer_demo.c
 
-repacketizer_demo_LDADD = libopus.la $(LIBM)
+repacketizer_demo_LDADD = libopus.la $(NE10_LIBS) $(LIBM)
 
 opus_compare_SOURCES = src/opus_compare.c
 opus_compare_LDADD = $(LIBM)
 
 tests_test_opus_api_SOURCES = tests/test_opus_api.c tests/test_opus_common.h
-tests_test_opus_api_LDADD = libopus.la $(LIBM)
+tests_test_opus_api_LDADD = libopus.la $(NE10_LIBS) $(LIBM)
 
 tests_test_opus_encode_SOURCES = tests/test_opus_encode.c tests/test_opus_common.h
-tests_test_opus_encode_LDADD = libopus.la $(LIBM)
+tests_test_opus_encode_LDADD = libopus.la $(NE10_LIBS) $(LIBM)
 
 tests_test_opus_decode_SOURCES = tests/test_opus_decode.c tests/test_opus_common.h
-tests_test_opus_decode_LDADD = libopus.la $(LIBM)
+tests_test_opus_decode_LDADD = libopus.la $(NE10_LIBS) $(LIBM)
 
 tests_test_opus_padding_SOURCES = tests/test_opus_padding.c tests/test_opus_common.h
-tests_test_opus_padding_LDADD = libopus.la $(LIBM)
+tests_test_opus_padding_LDADD = libopus.la $(NE10_LIBS) $(LIBM)
 
 celt_tests_test_unit_cwrs32_SOURCES = celt/tests/test_unit_cwrs32.c
 celt_tests_test_unit_cwrs32_LDADD = $(LIBM)
 
 celt_tests_test_unit_dft_SOURCES = celt/tests/test_unit_dft.c
-celt_tests_test_unit_dft_LDADD = $(LIBM)
+celt_tests_test_unit_dft_LDADD = $(NE10_LIBS) $(LIBM)
+if OPUS_ARM_EXTERNAL_ASM
+celt_tests_test_unit_dft_LDADD += libarmasm.la
+endif
 
 celt_tests_test_unit_entropy_SOURCES = celt/tests/test_unit_entropy.c
 celt_tests_test_unit_entropy_LDADD = $(LIBM)
@@ -94,13 +126,22 @@
 celt_tests_test_unit_laplace_LDADD = $(LIBM)
 
 celt_tests_test_unit_mathops_SOURCES = celt/tests/test_unit_mathops.c
-celt_tests_test_unit_mathops_LDADD = $(LIBM)
+celt_tests_test_unit_mathops_LDADD = $(NE10_LIBS) $(LIBM)
+if OPUS_ARM_EXTERNAL_ASM
+celt_tests_test_unit_mathops_LDADD += libarmasm.la
+endif
 
 celt_tests_test_unit_mdct_SOURCES = celt/tests/test_unit_mdct.c
-celt_tests_test_unit_mdct_LDADD = $(LIBM)
+celt_tests_test_unit_mdct_LDADD = $(NE10_LIBS) $(LIBM)
+if OPUS_ARM_EXTERNAL_ASM
+celt_tests_test_unit_mdct_LDADD += libarmasm.la
+endif
 
 celt_tests_test_unit_rotation_SOURCES = celt/tests/test_unit_rotation.c
-celt_tests_test_unit_rotation_LDADD = $(LIBM)
+celt_tests_test_unit_rotation_LDADD = $(NE10_LIBS) $(LIBM)
+if OPUS_ARM_EXTERNAL_ASM
+celt_tests_test_unit_rotation_LDADD += libarmasm.la
+endif
 
 celt_tests_test_unit_types_SOURCES = celt/tests/test_unit_types.c
 celt_tests_test_unit_types_LDADD = $(LIBM)
@@ -119,6 +160,7 @@
              opus.pc.in \
              opus-uninstalled.pc.in \
              opus.m4 \
+             Makefile.mips \
              Makefile.unix \
              tests/run_vectors.sh \
              celt/arm/arm2gnu.pl \
@@ -225,7 +267,35 @@
 
 # convert ARM asm to GNU as format
 %-gnu.S: $(top_srcdir)/%.s
-	$(top_srcdir)/celt/arm/arm2gnu.pl < $< > $@
+	$(top_srcdir)/celt/arm/arm2gnu.pl @ARM2GNU_PARAMS@ < $< > $@
 # For autoconf-modified sources (e.g., armopts.s)
 %-gnu.S: %.s
-	$(top_srcdir)/celt/arm/arm2gnu.pl < $< > $@
+	$(top_srcdir)/celt/arm/arm2gnu.pl @ARM2GNU_PARAMS@ < $< > $@
+
+OPT_UNIT_TEST_OBJ = $(celt_tests_test_unit_mathops_SOURCES:.c=.o) \
+                    $(celt_tests_test_unit_rotation_SOURCES:.c=.o) \
+                    $(celt_tests_test_unit_mdct_SOURCES:.c=.o) \
+                    $(celt_tests_test_unit_dft_SOURCES:.c=.o)
+
+if HAVE_SSE
+SSE_OBJ = $(CELT_SOURCES_SSE:.c=.lo)
+$(SSE_OBJ) $(OPT_UNIT_TEST_OBJ): CFLAGS += $(OPUS_X86_SSE_CFLAGS)
+endif
+
+if HAVE_SSE2
+SSE2_OBJ = $(CELT_SOURCES_SSE2:.c=.lo)
+$(SSE2_OBJ) $(OPT_UNIT_TEST_OBJ): CFLAGS += $(OPUS_X86_SSE2_CFLAGS)
+endif
+
+if HAVE_SSE4_1
+SSE4_1_OBJ = $(CELT_SOURCES_SSE4_1:.c=.lo) \
+             $(SILK_SOURCES_SSE4_1:.c=.lo) \
+             $(SILK_SOURCES_FIXED_SSE4_1:.c=.lo)
+$(SSE4_1_OBJ) $(OPT_UNIT_TEST_OBJ): CFLAGS += $(OPUS_X86_SSE4_1_CFLAGS)
+endif
+
+if OPUS_ARM_NEON_INTR
+CELT_ARM_NEON_INTR_OBJ = $(CELT_SOURCES_ARM_NEON_INTR:.c=.lo)
+$(CELT_ARM_NEON_INTR_OBJ) $(OPT_UNIT_TEST_OBJ): CFLAGS += \
+ $(OPUS_ARM_NEON_INTR_CFLAGS)  $(NE10_CFLAGS)
+endif
diff --git a/Makefile.in b/Makefile.in
index cc240ce..843ccc5 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -1,9 +1,8 @@
-# Makefile.in generated by automake 1.11.6 from Makefile.am.
+# Makefile.in generated by automake 1.15 from Makefile.am.
 # @configure_input@
 
-# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
-# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
-# Foundation, Inc.
+# Copyright (C) 1994-2014 Free Software Foundation, Inc.
+
 # This Makefile.in is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
 # with or without modifications, as long as this notice is preserved.
@@ -19,23 +18,61 @@
 
 
 VPATH = @srcdir@
-am__make_dryrun = \
-  { \
-    am__dry=no; \
+am__is_gnu_make = { \
+  if test -z '$(MAKELEVEL)'; then \
+    false; \
+  elif test -n '$(MAKE_HOST)'; then \
+    true; \
+  elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \
+    true; \
+  else \
+    false; \
+  fi; \
+}
+am__make_running_with_option = \
+  case $${target_option-} in \
+      ?) ;; \
+      *) echo "am__make_running_with_option: internal error: invalid" \
+              "target option '$${target_option-}' specified" >&2; \
+         exit 1;; \
+  esac; \
+  has_opt=no; \
+  sane_makeflags=$$MAKEFLAGS; \
+  if $(am__is_gnu_make); then \
+    sane_makeflags=$$MFLAGS; \
+  else \
     case $$MAKEFLAGS in \
       *\\[\ \	]*) \
-        echo 'am--echo: ; @echo "AM"  OK' | $(MAKE) -f - 2>/dev/null \
-          | grep '^AM OK$$' >/dev/null || am__dry=yes;; \
-      *) \
-        for am__flg in $$MAKEFLAGS; do \
-          case $$am__flg in \
-            *=*|--*) ;; \
-            *n*) am__dry=yes; break;; \
-          esac; \
-        done;; \
+        bs=\\; \
+        sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \
+          | sed "s/$$bs$$bs[$$bs $$bs	]*//g"`;; \
     esac; \
-    test $$am__dry = yes; \
-  }
+  fi; \
+  skip_next=no; \
+  strip_trailopt () \
+  { \
+    flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \
+  }; \
+  for flg in $$sane_makeflags; do \
+    test $$skip_next = yes && { skip_next=no; continue; }; \
+    case $$flg in \
+      *=*|--*) continue;; \
+        -*I) strip_trailopt 'I'; skip_next=yes;; \
+      -*I?*) strip_trailopt 'I';; \
+        -*O) strip_trailopt 'O'; skip_next=yes;; \
+      -*O?*) strip_trailopt 'O';; \
+        -*l) strip_trailopt 'l'; skip_next=yes;; \
+      -*l?*) strip_trailopt 'l';; \
+      -[dEDm]) skip_next=yes;; \
+      -[JT]) skip_next=yes;; \
+    esac; \
+    case $$flg in \
+      *$$target_option*) has_opt=yes; break;; \
+    esac; \
+  done; \
+  test $$has_opt = yes
+am__make_dryrun = (target_option=n; $(am__make_running_with_option))
+am__make_keepgoing = (target_option=k; $(am__make_running_with_option))
 pkgdatadir = $(datadir)/@PACKAGE@
 pkgincludedir = $(includedir)/@PACKAGE@
 pkglibdir = $(libdir)/@PACKAGE@
@@ -54,22 +91,19 @@
 POST_UNINSTALL = :
 build_triplet = @build@
 host_triplet = @host@
-DIST_COMMON = README $(am__configure_deps) \
-	$(am__pkginclude_HEADERS_DIST) $(noinst_HEADERS) \
-	$(srcdir)/Makefile.am $(srcdir)/Makefile.in \
-	$(srcdir)/celt_headers.mk $(srcdir)/celt_sources.mk \
-	$(srcdir)/config.h.in $(srcdir)/opus-uninstalled.pc.in \
-	$(srcdir)/opus.pc.in $(srcdir)/opus_headers.mk \
-	$(srcdir)/opus_sources.mk $(srcdir)/silk_headers.mk \
-	$(srcdir)/silk_sources.mk $(top_srcdir)/celt/arm/armopts.s.in \
-	$(top_srcdir)/configure AUTHORS COPYING ChangeLog INSTALL NEWS \
-	compile config.guess config.sub depcomp install-sh ltmain.sh \
-	missing
 @FIXED_POINT_TRUE@am__append_1 = $(SILK_SOURCES_FIXED)
-@FIXED_POINT_FALSE@am__append_2 = $(SILK_SOURCES_FLOAT)
-@DISABLE_FLOAT_API_FALSE@am__append_3 = $(OPUS_SOURCES_FLOAT)
-@CPU_ARM_TRUE@am__append_4 = $(CELT_SOURCES_ARM)
-@CPU_ARM_TRUE@am__append_5 = $(SILK_SOURCES_ARM)
+@FIXED_POINT_TRUE@@HAVE_SSE4_1_TRUE@am__append_2 = $(SILK_SOURCES_SSE4_1) $(SILK_SOURCES_FIXED_SSE4_1)
+@FIXED_POINT_FALSE@am__append_3 = $(SILK_SOURCES_FLOAT)
+@FIXED_POINT_FALSE@@HAVE_SSE4_1_TRUE@am__append_4 = $(SILK_SOURCES_SSE4_1)
+@DISABLE_FLOAT_API_FALSE@am__append_5 = $(OPUS_SOURCES_FLOAT)
+@HAVE_SSE_TRUE@am__append_6 = $(CELT_SOURCES_SSE)
+@HAVE_SSE2_TRUE@am__append_7 = $(CELT_SOURCES_SSE2)
+@HAVE_SSE4_1_TRUE@am__append_8 = $(CELT_SOURCES_SSE4_1)
+@CPU_ARM_TRUE@am__append_9 = $(CELT_SOURCES_ARM)
+@CPU_ARM_TRUE@am__append_10 = $(SILK_SOURCES_ARM)
+@CPU_ARM_TRUE@@OPUS_ARM_NEON_INTR_TRUE@am__append_11 = $(CELT_SOURCES_ARM_NEON_INTR)
+@CPU_ARM_TRUE@@HAVE_ARM_NE10_TRUE@am__append_12 = $(CELT_SOURCES_ARM_NE10)
+@OPUS_ARM_EXTERNAL_ASM_TRUE@am__append_13 = libarmasm.la
 @EXTRA_PROGRAMS_TRUE@noinst_PROGRAMS = opus_demo$(EXEEXT) \
 @EXTRA_PROGRAMS_TRUE@	repacketizer_demo$(EXEEXT) \
 @EXTRA_PROGRAMS_TRUE@	opus_compare$(EXEEXT) \
@@ -98,17 +132,25 @@
 @EXTRA_PROGRAMS_TRUE@	tests/test_opus_decode$(EXEEXT) \
 @EXTRA_PROGRAMS_TRUE@	tests/test_opus_encode$(EXEEXT) \
 @EXTRA_PROGRAMS_TRUE@	tests/test_opus_padding$(EXEEXT)
-@CUSTOM_MODES_TRUE@am__append_6 = include/opus_custom.h
-@CUSTOM_MODES_TRUE@@EXTRA_PROGRAMS_TRUE@am__append_7 = opus_custom_demo
+@EXTRA_PROGRAMS_TRUE@@OPUS_ARM_EXTERNAL_ASM_TRUE@am__append_14 = libarmasm.la
+@EXTRA_PROGRAMS_TRUE@@OPUS_ARM_EXTERNAL_ASM_TRUE@am__append_15 = libarmasm.la
+@EXTRA_PROGRAMS_TRUE@@OPUS_ARM_EXTERNAL_ASM_TRUE@am__append_16 = libarmasm.la
+@EXTRA_PROGRAMS_TRUE@@OPUS_ARM_EXTERNAL_ASM_TRUE@am__append_17 = libarmasm.la
+@CUSTOM_MODES_TRUE@am__append_18 = include/opus_custom.h
+@CUSTOM_MODES_TRUE@@EXTRA_PROGRAMS_TRUE@am__append_19 = opus_custom_demo
 subdir = .
 SUBDIRS =
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
 am__aclocal_m4_deps = $(top_srcdir)/m4/as-gcc-inline-assembly.m4 \
 	$(top_srcdir)/m4/libtool.m4 $(top_srcdir)/m4/ltoptions.m4 \
 	$(top_srcdir)/m4/ltsugar.m4 $(top_srcdir)/m4/ltversion.m4 \
-	$(top_srcdir)/m4/lt~obsolete.m4 $(top_srcdir)/configure.ac
+	$(top_srcdir)/m4/lt~obsolete.m4 \
+	$(top_srcdir)/m4/opus-intrinsics.m4 $(top_srcdir)/configure.ac
 am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
 	$(ACLOCAL_M4)
+DIST_COMMON = $(srcdir)/Makefile.am $(top_srcdir)/configure \
+	$(am__configure_deps) $(noinst_HEADERS) \
+	$(am__pkginclude_HEADERS_DIST) $(am__DIST_COMMON)
 am__CONFIG_DISTCLEAN_FILES = config.status config.cache config.log \
  configure.lineno config.status.lineno
 mkinstalldirs = $(install_sh) -d
@@ -144,21 +186,38 @@
   }
 am__installdirs = "$(DESTDIR)$(libdir)" "$(DESTDIR)$(m4datadir)" \
 	"$(DESTDIR)$(pkgconfigdir)" "$(DESTDIR)$(pkgincludedir)"
-LTLIBRARIES = $(lib_LTLIBRARIES)
+LTLIBRARIES = $(lib_LTLIBRARIES) $(noinst_LTLIBRARIES)
+libarmasm_la_LIBADD =
+am__libarmasm_la_SOURCES_DIST = celt/arm/celt_pitch_xcorr_arm-gnu.S
+am__dirstamp = $(am__leading_dot)dirstamp
+am__objects_1 = celt/arm/celt_pitch_xcorr_arm-gnu.lo
+@CPU_ARM_TRUE@@OPUS_ARM_EXTERNAL_ASM_TRUE@am_libarmasm_la_OBJECTS =  \
+@CPU_ARM_TRUE@@OPUS_ARM_EXTERNAL_ASM_TRUE@	$(am__objects_1)
+libarmasm_la_OBJECTS = $(am_libarmasm_la_OBJECTS)
+AM_V_lt = $(am__v_lt_@AM_V@)
+am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@)
+am__v_lt_0 = --silent
+am__v_lt_1 = 
+@CPU_ARM_TRUE@@OPUS_ARM_EXTERNAL_ASM_TRUE@am_libarmasm_la_rpath =
 am__DEPENDENCIES_1 =
-libopus_la_DEPENDENCIES = $(am__DEPENDENCIES_1)
+libopus_la_DEPENDENCIES = $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) \
+	$(am__append_13)
 am__libopus_la_SOURCES_DIST = celt/bands.c celt/celt.c \
 	celt/celt_encoder.c celt/celt_decoder.c celt/cwrs.c \
 	celt/entcode.c celt/entdec.c celt/entenc.c celt/kiss_fft.c \
 	celt/laplace.c celt/mathops.c celt/mdct.c celt/modes.c \
 	celt/pitch.c celt/celt_lpc.c celt/quant_bands.c celt/rate.c \
-	celt/vq.c celt/arm/armcpu.c celt/arm/arm_celt_map.c silk/CNG.c \
-	silk/code_signs.c silk/init_decoder.c silk/decode_core.c \
-	silk/decode_frame.c silk/decode_parameters.c \
-	silk/decode_indices.c silk/decode_pulses.c \
-	silk/decoder_set_fs.c silk/dec_API.c silk/enc_API.c \
-	silk/encode_indices.c silk/encode_pulses.c silk/gain_quant.c \
-	silk/interpolate.c silk/LP_variable_cutoff.c \
+	celt/vq.c celt/x86/x86cpu.c celt/x86/x86_celt_map.c \
+	celt/x86/pitch_sse.c celt/x86/pitch_sse2.c \
+	celt/x86/celt_lpc_sse.c celt/x86/pitch_sse4_1.c \
+	celt/arm/armcpu.c celt/arm/arm_celt_map.c \
+	celt/arm/celt_neon_intr.c celt/arm/celt_ne10_fft.c \
+	celt/arm/celt_ne10_mdct.c silk/CNG.c silk/code_signs.c \
+	silk/init_decoder.c silk/decode_core.c silk/decode_frame.c \
+	silk/decode_parameters.c silk/decode_indices.c \
+	silk/decode_pulses.c silk/decoder_set_fs.c silk/dec_API.c \
+	silk/enc_API.c silk/encode_indices.c silk/encode_pulses.c \
+	silk/gain_quant.c silk/interpolate.c silk/LP_variable_cutoff.c \
 	silk/NLSF_decode.c silk/NSQ.c silk/NSQ_del_dec.c silk/PLC.c \
 	silk/shell_coder.c silk/tables_gain.c silk/tables_LTP.c \
 	silk/tables_NLSF_CB_NB_MB.c silk/tables_NLSF_CB_WB.c \
@@ -199,10 +258,15 @@
 	silk/fixed/burg_modified_FIX.c silk/fixed/k2a_FIX.c \
 	silk/fixed/k2a_Q16_FIX.c silk/fixed/pitch_analysis_core_FIX.c \
 	silk/fixed/vector_ops_FIX.c silk/fixed/schur64_FIX.c \
-	silk/fixed/schur_FIX.c silk/float/apply_sine_window_FLP.c \
-	silk/float/corrMatrix_FLP.c silk/float/encode_frame_FLP.c \
-	silk/float/find_LPC_FLP.c silk/float/find_LTP_FLP.c \
-	silk/float/find_pitch_lags_FLP.c \
+	silk/fixed/schur_FIX.c silk/x86/NSQ_sse.c \
+	silk/x86/NSQ_del_dec_sse.c silk/x86/x86_silk_map.c \
+	silk/x86/VAD_sse.c silk/x86/VQ_WMat_EC_sse.c \
+	silk/fixed/x86/vector_ops_FIX_sse.c \
+	silk/fixed/x86/burg_modified_FIX_sse.c \
+	silk/fixed/x86/prefilter_FIX_sse.c \
+	silk/float/apply_sine_window_FLP.c silk/float/corrMatrix_FLP.c \
+	silk/float/encode_frame_FLP.c silk/float/find_LPC_FLP.c \
+	silk/float/find_LTP_FLP.c silk/float/find_pitch_lags_FLP.c \
 	silk/float/find_pred_coefs_FLP.c \
 	silk/float/LPC_analysis_filter_FLP.c \
 	silk/float/LTP_analysis_filter_FLP.c \
@@ -224,16 +288,28 @@
 	src/opus_encoder.c src/opus_multistream.c \
 	src/opus_multistream_encoder.c src/opus_multistream_decoder.c \
 	src/repacketizer.c src/analysis.c src/mlp.c src/mlp_data.c
-am__dirstamp = $(am__leading_dot)dirstamp
-am__objects_1 = celt/arm/armcpu.lo celt/arm/arm_celt_map.lo
-@CPU_ARM_TRUE@am__objects_2 = $(am__objects_1)
-am__objects_3 = celt/bands.lo celt/celt.lo celt/celt_encoder.lo \
+am__objects_2 = celt/x86/x86cpu.lo celt/x86/x86_celt_map.lo \
+	celt/x86/pitch_sse.lo
+@HAVE_SSE_TRUE@am__objects_3 = $(am__objects_2)
+am__objects_4 = celt/x86/pitch_sse2.lo
+@HAVE_SSE2_TRUE@am__objects_5 = $(am__objects_4)
+am__objects_6 = celt/x86/celt_lpc_sse.lo celt/x86/pitch_sse4_1.lo
+@HAVE_SSE4_1_TRUE@am__objects_7 = $(am__objects_6)
+am__objects_8 = celt/arm/armcpu.lo celt/arm/arm_celt_map.lo
+@CPU_ARM_TRUE@am__objects_9 = $(am__objects_8)
+am__objects_10 = celt/arm/celt_neon_intr.lo
+@CPU_ARM_TRUE@@OPUS_ARM_NEON_INTR_TRUE@am__objects_11 =  \
+@CPU_ARM_TRUE@@OPUS_ARM_NEON_INTR_TRUE@	$(am__objects_10)
+am__objects_12 = celt/arm/celt_ne10_fft.lo celt/arm/celt_ne10_mdct.lo
+@CPU_ARM_TRUE@@HAVE_ARM_NE10_TRUE@am__objects_13 = $(am__objects_12)
+am__objects_14 = celt/bands.lo celt/celt.lo celt/celt_encoder.lo \
 	celt/celt_decoder.lo celt/cwrs.lo celt/entcode.lo \
 	celt/entdec.lo celt/entenc.lo celt/kiss_fft.lo celt/laplace.lo \
 	celt/mathops.lo celt/mdct.lo celt/modes.lo celt/pitch.lo \
 	celt/celt_lpc.lo celt/quant_bands.lo celt/rate.lo celt/vq.lo \
-	$(am__objects_2)
-am__objects_4 = silk/fixed/LTP_analysis_filter_FIX.lo \
+	$(am__objects_3) $(am__objects_5) $(am__objects_7) \
+	$(am__objects_9) $(am__objects_11) $(am__objects_13)
+am__objects_15 = silk/fixed/LTP_analysis_filter_FIX.lo \
 	silk/fixed/LTP_scale_ctrl_FIX.lo silk/fixed/corrMatrix_FIX.lo \
 	silk/fixed/encode_frame_FIX.lo silk/fixed/find_LPC_FIX.lo \
 	silk/fixed/find_LTP_FIX.lo silk/fixed/find_pitch_lags_FIX.lo \
@@ -250,8 +326,17 @@
 	silk/fixed/pitch_analysis_core_FIX.lo \
 	silk/fixed/vector_ops_FIX.lo silk/fixed/schur64_FIX.lo \
 	silk/fixed/schur_FIX.lo
-@FIXED_POINT_TRUE@am__objects_5 = $(am__objects_4)
-am__objects_6 = silk/float/apply_sine_window_FLP.lo \
+@FIXED_POINT_TRUE@am__objects_16 = $(am__objects_15)
+am__objects_17 = silk/x86/NSQ_sse.lo silk/x86/NSQ_del_dec_sse.lo \
+	silk/x86/x86_silk_map.lo silk/x86/VAD_sse.lo \
+	silk/x86/VQ_WMat_EC_sse.lo
+am__objects_18 = silk/fixed/x86/vector_ops_FIX_sse.lo \
+	silk/fixed/x86/burg_modified_FIX_sse.lo \
+	silk/fixed/x86/prefilter_FIX_sse.lo
+@FIXED_POINT_TRUE@@HAVE_SSE4_1_TRUE@am__objects_19 =  \
+@FIXED_POINT_TRUE@@HAVE_SSE4_1_TRUE@	$(am__objects_17) \
+@FIXED_POINT_TRUE@@HAVE_SSE4_1_TRUE@	$(am__objects_18)
+am__objects_20 = silk/float/apply_sine_window_FLP.lo \
 	silk/float/corrMatrix_FLP.lo silk/float/encode_frame_FLP.lo \
 	silk/float/find_LPC_FLP.lo silk/float/find_LTP_FLP.lo \
 	silk/float/find_pitch_lags_FLP.lo \
@@ -273,9 +358,11 @@
 	silk/float/scale_copy_vector_FLP.lo \
 	silk/float/scale_vector_FLP.lo silk/float/schur_FLP.lo \
 	silk/float/sort_FLP.lo
-@FIXED_POINT_FALSE@am__objects_7 = $(am__objects_6)
-am__objects_8 =
-am__objects_9 = silk/CNG.lo silk/code_signs.lo silk/init_decoder.lo \
+@FIXED_POINT_FALSE@am__objects_21 = $(am__objects_20)
+@FIXED_POINT_FALSE@@HAVE_SSE4_1_TRUE@am__objects_22 =  \
+@FIXED_POINT_FALSE@@HAVE_SSE4_1_TRUE@	$(am__objects_17)
+am__objects_23 =
+am__objects_24 = silk/CNG.lo silk/code_signs.lo silk/init_decoder.lo \
 	silk/decode_core.lo silk/decode_frame.lo \
 	silk/decode_parameters.lo silk/decode_indices.lo \
 	silk/decode_pulses.lo silk/decoder_set_fs.lo silk/dec_API.lo \
@@ -309,23 +396,17 @@
 	silk/sigm_Q15.lo silk/sort.lo silk/sum_sqr_shift.lo \
 	silk/stereo_decode_pred.lo silk/stereo_encode_pred.lo \
 	silk/stereo_find_predictor.lo silk/stereo_quant_pred.lo \
-	$(am__objects_5) $(am__objects_7) $(am__objects_8)
-am__objects_10 = src/analysis.lo src/mlp.lo src/mlp_data.lo
-@DISABLE_FLOAT_API_FALSE@am__objects_11 = $(am__objects_10)
-am__objects_12 = src/opus.lo src/opus_decoder.lo src/opus_encoder.lo \
+	$(am__objects_16) $(am__objects_19) $(am__objects_21) \
+	$(am__objects_22) $(am__objects_23)
+am__objects_25 = src/analysis.lo src/mlp.lo src/mlp_data.lo
+@DISABLE_FLOAT_API_FALSE@am__objects_26 = $(am__objects_25)
+am__objects_27 = src/opus.lo src/opus_decoder.lo src/opus_encoder.lo \
 	src/opus_multistream.lo src/opus_multistream_encoder.lo \
 	src/opus_multistream_decoder.lo src/repacketizer.lo \
-	$(am__objects_11)
-am_libopus_la_OBJECTS = $(am__objects_3) $(am__objects_9) \
-	$(am__objects_12)
-am__objects_13 = celt/arm/celt_pitch_xcorr_arm-gnu.lo
-@CPU_ARM_TRUE@@OPUS_ARM_EXTERNAL_ASM_TRUE@nodist_libopus_la_OBJECTS =  \
-@CPU_ARM_TRUE@@OPUS_ARM_EXTERNAL_ASM_TRUE@	$(am__objects_13)
-libopus_la_OBJECTS = $(am_libopus_la_OBJECTS) \
-	$(nodist_libopus_la_OBJECTS)
-AM_V_lt = $(am__v_lt_@AM_V@)
-am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@)
-am__v_lt_0 = --silent
+	$(am__objects_26)
+am_libopus_la_OBJECTS = $(am__objects_14) $(am__objects_24) \
+	$(am__objects_27)
+libopus_la_OBJECTS = $(am_libopus_la_OBJECTS)
 libopus_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
 	$(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
 	$(libopus_la_LDFLAGS) $(LDFLAGS) -o $@
@@ -346,7 +427,8 @@
 celt_tests_test_unit_dft_OBJECTS =  \
 	$(am_celt_tests_test_unit_dft_OBJECTS)
 @EXTRA_PROGRAMS_TRUE@celt_tests_test_unit_dft_DEPENDENCIES =  \
-@EXTRA_PROGRAMS_TRUE@	$(am__DEPENDENCIES_1)
+@EXTRA_PROGRAMS_TRUE@	$(am__DEPENDENCIES_1) \
+@EXTRA_PROGRAMS_TRUE@	$(am__DEPENDENCIES_1) $(am__append_14)
 am__celt_tests_test_unit_entropy_SOURCES_DIST =  \
 	celt/tests/test_unit_entropy.c
 @EXTRA_PROGRAMS_TRUE@am_celt_tests_test_unit_entropy_OBJECTS =  \
@@ -370,7 +452,8 @@
 celt_tests_test_unit_mathops_OBJECTS =  \
 	$(am_celt_tests_test_unit_mathops_OBJECTS)
 @EXTRA_PROGRAMS_TRUE@celt_tests_test_unit_mathops_DEPENDENCIES =  \
-@EXTRA_PROGRAMS_TRUE@	$(am__DEPENDENCIES_1)
+@EXTRA_PROGRAMS_TRUE@	$(am__DEPENDENCIES_1) \
+@EXTRA_PROGRAMS_TRUE@	$(am__DEPENDENCIES_1) $(am__append_15)
 am__celt_tests_test_unit_mdct_SOURCES_DIST =  \
 	celt/tests/test_unit_mdct.c
 @EXTRA_PROGRAMS_TRUE@am_celt_tests_test_unit_mdct_OBJECTS =  \
@@ -378,7 +461,8 @@
 celt_tests_test_unit_mdct_OBJECTS =  \
 	$(am_celt_tests_test_unit_mdct_OBJECTS)
 @EXTRA_PROGRAMS_TRUE@celt_tests_test_unit_mdct_DEPENDENCIES =  \
-@EXTRA_PROGRAMS_TRUE@	$(am__DEPENDENCIES_1)
+@EXTRA_PROGRAMS_TRUE@	$(am__DEPENDENCIES_1) \
+@EXTRA_PROGRAMS_TRUE@	$(am__DEPENDENCIES_1) $(am__append_16)
 am__celt_tests_test_unit_rotation_SOURCES_DIST =  \
 	celt/tests/test_unit_rotation.c
 @EXTRA_PROGRAMS_TRUE@am_celt_tests_test_unit_rotation_OBJECTS =  \
@@ -386,7 +470,8 @@
 celt_tests_test_unit_rotation_OBJECTS =  \
 	$(am_celt_tests_test_unit_rotation_OBJECTS)
 @EXTRA_PROGRAMS_TRUE@celt_tests_test_unit_rotation_DEPENDENCIES =  \
-@EXTRA_PROGRAMS_TRUE@	$(am__DEPENDENCIES_1)
+@EXTRA_PROGRAMS_TRUE@	$(am__DEPENDENCIES_1) \
+@EXTRA_PROGRAMS_TRUE@	$(am__DEPENDENCIES_1) $(am__append_17)
 am__celt_tests_test_unit_types_SOURCES_DIST =  \
 	celt/tests/test_unit_types.c
 @EXTRA_PROGRAMS_TRUE@am_celt_tests_test_unit_types_OBJECTS =  \
@@ -411,12 +496,14 @@
 @EXTRA_PROGRAMS_TRUE@am_opus_demo_OBJECTS = src/opus_demo.$(OBJEXT)
 opus_demo_OBJECTS = $(am_opus_demo_OBJECTS)
 @EXTRA_PROGRAMS_TRUE@opus_demo_DEPENDENCIES = libopus.la \
+@EXTRA_PROGRAMS_TRUE@	$(am__DEPENDENCIES_1) \
 @EXTRA_PROGRAMS_TRUE@	$(am__DEPENDENCIES_1)
 am__repacketizer_demo_SOURCES_DIST = src/repacketizer_demo.c
 @EXTRA_PROGRAMS_TRUE@am_repacketizer_demo_OBJECTS =  \
 @EXTRA_PROGRAMS_TRUE@	src/repacketizer_demo.$(OBJEXT)
 repacketizer_demo_OBJECTS = $(am_repacketizer_demo_OBJECTS)
 @EXTRA_PROGRAMS_TRUE@repacketizer_demo_DEPENDENCIES = libopus.la \
+@EXTRA_PROGRAMS_TRUE@	$(am__DEPENDENCIES_1) \
 @EXTRA_PROGRAMS_TRUE@	$(am__DEPENDENCIES_1)
 am__tests_test_opus_api_SOURCES_DIST = tests/test_opus_api.c \
 	tests/test_opus_common.h
@@ -424,6 +511,7 @@
 @EXTRA_PROGRAMS_TRUE@	tests/test_opus_api.$(OBJEXT)
 tests_test_opus_api_OBJECTS = $(am_tests_test_opus_api_OBJECTS)
 @EXTRA_PROGRAMS_TRUE@tests_test_opus_api_DEPENDENCIES = libopus.la \
+@EXTRA_PROGRAMS_TRUE@	$(am__DEPENDENCIES_1) \
 @EXTRA_PROGRAMS_TRUE@	$(am__DEPENDENCIES_1)
 am__tests_test_opus_decode_SOURCES_DIST = tests/test_opus_decode.c \
 	tests/test_opus_common.h
@@ -431,6 +519,7 @@
 @EXTRA_PROGRAMS_TRUE@	tests/test_opus_decode.$(OBJEXT)
 tests_test_opus_decode_OBJECTS = $(am_tests_test_opus_decode_OBJECTS)
 @EXTRA_PROGRAMS_TRUE@tests_test_opus_decode_DEPENDENCIES = libopus.la \
+@EXTRA_PROGRAMS_TRUE@	$(am__DEPENDENCIES_1) \
 @EXTRA_PROGRAMS_TRUE@	$(am__DEPENDENCIES_1)
 am__tests_test_opus_encode_SOURCES_DIST = tests/test_opus_encode.c \
 	tests/test_opus_common.h
@@ -438,6 +527,7 @@
 @EXTRA_PROGRAMS_TRUE@	tests/test_opus_encode.$(OBJEXT)
 tests_test_opus_encode_OBJECTS = $(am_tests_test_opus_encode_OBJECTS)
 @EXTRA_PROGRAMS_TRUE@tests_test_opus_encode_DEPENDENCIES = libopus.la \
+@EXTRA_PROGRAMS_TRUE@	$(am__DEPENDENCIES_1) \
 @EXTRA_PROGRAMS_TRUE@	$(am__DEPENDENCIES_1)
 am__tests_test_opus_padding_SOURCES_DIST = tests/test_opus_padding.c \
 	tests/test_opus_common.h
@@ -446,7 +536,20 @@
 tests_test_opus_padding_OBJECTS =  \
 	$(am_tests_test_opus_padding_OBJECTS)
 @EXTRA_PROGRAMS_TRUE@tests_test_opus_padding_DEPENDENCIES =  \
-@EXTRA_PROGRAMS_TRUE@	libopus.la $(am__DEPENDENCIES_1)
+@EXTRA_PROGRAMS_TRUE@	libopus.la $(am__DEPENDENCIES_1) \
+@EXTRA_PROGRAMS_TRUE@	$(am__DEPENDENCIES_1)
+AM_V_P = $(am__v_P_@AM_V@)
+am__v_P_ = $(am__v_P_@AM_DEFAULT_V@)
+am__v_P_0 = false
+am__v_P_1 = :
+AM_V_GEN = $(am__v_GEN_@AM_V@)
+am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@)
+am__v_GEN_0 = @echo "  GEN     " $@;
+am__v_GEN_1 = 
+AM_V_at = $(am__v_at_@AM_V@)
+am__v_at_ = $(am__v_at_@AM_DEFAULT_V@)
+am__v_at_0 = @
+am__v_at_1 = 
 DEFAULT_INCLUDES = -I.@am__isrc@
 depcomp = $(SHELL) $(top_srcdir)/depcomp
 am__depfiles_maybe = depfiles
@@ -459,10 +562,8 @@
 	$(AM_CCASFLAGS) $(CCASFLAGS)
 AM_V_CPPAS = $(am__v_CPPAS_@AM_V@)
 am__v_CPPAS_ = $(am__v_CPPAS_@AM_DEFAULT_V@)
-am__v_CPPAS_0 = @echo "  CPPAS " $@;
-AM_V_at = $(am__v_at_@AM_V@)
-am__v_at_ = $(am__v_at_@AM_DEFAULT_V@)
-am__v_at_0 = @
+am__v_CPPAS_0 = @echo "  CPPAS   " $@;
+am__v_CPPAS_1 = 
 COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
 	$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
 LTCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
@@ -471,18 +572,17 @@
 	$(AM_CFLAGS) $(CFLAGS)
 AM_V_CC = $(am__v_CC_@AM_V@)
 am__v_CC_ = $(am__v_CC_@AM_DEFAULT_V@)
-am__v_CC_0 = @echo "  CC    " $@;
+am__v_CC_0 = @echo "  CC      " $@;
+am__v_CC_1 = 
 CCLD = $(CC)
 LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
 	$(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
 	$(AM_LDFLAGS) $(LDFLAGS) -o $@
 AM_V_CCLD = $(am__v_CCLD_@AM_V@)
 am__v_CCLD_ = $(am__v_CCLD_@AM_DEFAULT_V@)
-am__v_CCLD_0 = @echo "  CCLD  " $@;
-AM_V_GEN = $(am__v_GEN_@AM_V@)
-am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@)
-am__v_GEN_0 = @echo "  GEN   " $@;
-SOURCES = $(libopus_la_SOURCES) $(nodist_libopus_la_SOURCES) \
+am__v_CCLD_0 = @echo "  CCLD    " $@;
+am__v_CCLD_1 = 
+SOURCES = $(libarmasm_la_SOURCES) $(libopus_la_SOURCES) \
 	$(celt_tests_test_unit_cwrs32_SOURCES) \
 	$(celt_tests_test_unit_dft_SOURCES) \
 	$(celt_tests_test_unit_entropy_SOURCES) \
@@ -496,7 +596,8 @@
 	$(tests_test_opus_decode_SOURCES) \
 	$(tests_test_opus_encode_SOURCES) \
 	$(tests_test_opus_padding_SOURCES)
-DIST_SOURCES = $(am__libopus_la_SOURCES_DIST) \
+DIST_SOURCES = $(am__libarmasm_la_SOURCES_DIST) \
+	$(am__libopus_la_SOURCES_DIST) \
 	$(am__celt_tests_test_unit_cwrs32_SOURCES_DIST) \
 	$(am__celt_tests_test_unit_dft_SOURCES_DIST) \
 	$(am__celt_tests_test_unit_entropy_SOURCES_DIST) \
@@ -513,13 +614,14 @@
 	$(am__tests_test_opus_decode_SOURCES_DIST) \
 	$(am__tests_test_opus_encode_SOURCES_DIST) \
 	$(am__tests_test_opus_padding_SOURCES_DIST)
-RECURSIVE_TARGETS = all-recursive check-recursive dvi-recursive \
-	html-recursive info-recursive install-data-recursive \
-	install-dvi-recursive install-exec-recursive \
-	install-html-recursive install-info-recursive \
-	install-pdf-recursive install-ps-recursive install-recursive \
-	installcheck-recursive installdirs-recursive pdf-recursive \
-	ps-recursive uninstall-recursive
+RECURSIVE_TARGETS = all-recursive check-recursive cscopelist-recursive \
+	ctags-recursive dvi-recursive html-recursive info-recursive \
+	install-data-recursive install-dvi-recursive \
+	install-exec-recursive install-html-recursive \
+	install-info-recursive install-pdf-recursive \
+	install-ps-recursive install-recursive installcheck-recursive \
+	installdirs-recursive pdf-recursive ps-recursive \
+	tags-recursive uninstall-recursive
 am__can_run_installinfo = \
   case $$AM_UPDATE_INFO_DIR in \
     n|no|NO) false;; \
@@ -532,13 +634,217 @@
 HEADERS = $(noinst_HEADERS) $(pkginclude_HEADERS)
 RECURSIVE_CLEAN_TARGETS = mostlyclean-recursive clean-recursive	\
   distclean-recursive maintainer-clean-recursive
-AM_RECURSIVE_TARGETS = $(RECURSIVE_TARGETS:-recursive=) \
-	$(RECURSIVE_CLEAN_TARGETS:-recursive=) tags TAGS ctags CTAGS \
-	distdir dist dist-all distcheck
+am__recursive_targets = \
+  $(RECURSIVE_TARGETS) \
+  $(RECURSIVE_CLEAN_TARGETS) \
+  $(am__extra_recursive_targets)
+AM_RECURSIVE_TARGETS = $(am__recursive_targets:-recursive=) TAGS CTAGS \
+	cscope check recheck distdir dist dist-all distcheck
+am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) \
+	$(LISP)config.h.in
+# Read a list of newline-separated strings from the standard input,
+# and print each of them once, without duplicates.  Input order is
+# *not* preserved.
+am__uniquify_input = $(AWK) '\
+  BEGIN { nonempty = 0; } \
+  { items[$$0] = 1; nonempty = 1; } \
+  END { if (nonempty) { for (i in items) print i; }; } \
+'
+# Make sure the list of sources is unique.  This is necessary because,
+# e.g., the same source file might be shared among _SOURCES variables
+# for different programs/libraries.
+am__define_uniq_tagged_files = \
+  list='$(am__tagged_files)'; \
+  unique=`for i in $$list; do \
+    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+  done | $(am__uniquify_input)`
 ETAGS = etags
 CTAGS = ctags
-am__tty_colors = \
-red=; grn=; lgn=; blu=; std=
+CSCOPE = cscope
+am__tty_colors_dummy = \
+  mgn= red= grn= lgn= blu= brg= std=; \
+  am__color_tests=no
+am__tty_colors = { \
+  $(am__tty_colors_dummy); \
+  if test "X$(AM_COLOR_TESTS)" = Xno; then \
+    am__color_tests=no; \
+  elif test "X$(AM_COLOR_TESTS)" = Xalways; then \
+    am__color_tests=yes; \
+  elif test "X$$TERM" != Xdumb && { test -t 1; } 2>/dev/null; then \
+    am__color_tests=yes; \
+  fi; \
+  if test $$am__color_tests = yes; then \
+    red=''; \
+    grn=''; \
+    lgn=''; \
+    blu=''; \
+    mgn=''; \
+    brg=''; \
+    std=''; \
+  fi; \
+}
+am__recheck_rx = ^[ 	]*:recheck:[ 	]*
+am__global_test_result_rx = ^[ 	]*:global-test-result:[ 	]*
+am__copy_in_global_log_rx = ^[ 	]*:copy-in-global-log:[ 	]*
+# A command that, given a newline-separated list of test names on the
+# standard input, prints the names of the tests that are to be re-run
+# upon "make recheck".
+am__list_recheck_tests = $(AWK) '{ \
+  recheck = 1; \
+  while ((rc = (getline line < ($$0 ".trs"))) != 0) \
+    { \
+      if (rc < 0) \
+        { \
+          if ((getline line2 < ($$0 ".log")) < 0) \
+	    recheck = 0; \
+          break; \
+        } \
+      else if (line ~ /$(am__recheck_rx)[nN][Oo]/) \
+        { \
+          recheck = 0; \
+          break; \
+        } \
+      else if (line ~ /$(am__recheck_rx)[yY][eE][sS]/) \
+        { \
+          break; \
+        } \
+    }; \
+  if (recheck) \
+    print $$0; \
+  close ($$0 ".trs"); \
+  close ($$0 ".log"); \
+}'
+# A command that, given a newline-separated list of test names on the
+# standard input, creates the global log from their .trs and .log files.
+am__create_global_log = $(AWK) ' \
+function fatal(msg) \
+{ \
+  print "fatal: making $@: " msg | "cat >&2"; \
+  exit 1; \
+} \
+function rst_section(header) \
+{ \
+  print header; \
+  len = length(header); \
+  for (i = 1; i <= len; i = i + 1) \
+    printf "="; \
+  printf "\n\n"; \
+} \
+{ \
+  copy_in_global_log = 1; \
+  global_test_result = "RUN"; \
+  while ((rc = (getline line < ($$0 ".trs"))) != 0) \
+    { \
+      if (rc < 0) \
+         fatal("failed to read from " $$0 ".trs"); \
+      if (line ~ /$(am__global_test_result_rx)/) \
+        { \
+          sub("$(am__global_test_result_rx)", "", line); \
+          sub("[ 	]*$$", "", line); \
+          global_test_result = line; \
+        } \
+      else if (line ~ /$(am__copy_in_global_log_rx)[nN][oO]/) \
+        copy_in_global_log = 0; \
+    }; \
+  if (copy_in_global_log) \
+    { \
+      rst_section(global_test_result ": " $$0); \
+      while ((rc = (getline line < ($$0 ".log"))) != 0) \
+      { \
+        if (rc < 0) \
+          fatal("failed to read from " $$0 ".log"); \
+        print line; \
+      }; \
+      printf "\n"; \
+    }; \
+  close ($$0 ".trs"); \
+  close ($$0 ".log"); \
+}'
+# Restructured Text title.
+am__rst_title = { sed 's/.*/   &   /;h;s/./=/g;p;x;s/ *$$//;p;g' && echo; }
+# Solaris 10 'make', and several other traditional 'make' implementations,
+# pass "-e" to $(SHELL), and POSIX 2008 even requires this.  Work around it
+# by disabling -e (using the XSI extension "set +e") if it's set.
+am__sh_e_setup = case $$- in *e*) set +e;; esac
+# Default flags passed to test drivers.
+am__common_driver_flags = \
+  --color-tests "$$am__color_tests" \
+  --enable-hard-errors "$$am__enable_hard_errors" \
+  --expect-failure "$$am__expect_failure"
+# To be inserted before the command running the test.  Creates the
+# directory for the log if needed.  Stores in $dir the directory
+# containing $f, in $tst the test, in $log the log.  Executes the
+# developer-defined test setup AM_TESTS_ENVIRONMENT (if any), and
+# passes TESTS_ENVIRONMENT.  Sets up options for the wrapper that
+# will run the test scripts (or their associated LOG_COMPILER, if
+# they have one).
+am__check_pre = \
+$(am__sh_e_setup);					\
+$(am__vpath_adj_setup) $(am__vpath_adj)			\
+$(am__tty_colors);					\
+srcdir=$(srcdir); export srcdir;			\
+case "$@" in						\
+  */*) am__odir=`echo "./$@" | sed 's|/[^/]*$$||'`;;	\
+    *) am__odir=.;; 					\
+esac;							\
+test "x$$am__odir" = x"." || test -d "$$am__odir" 	\
+  || $(MKDIR_P) "$$am__odir" || exit $$?;		\
+if test -f "./$$f"; then dir=./;			\
+elif test -f "$$f"; then dir=;				\
+else dir="$(srcdir)/"; fi;				\
+tst=$$dir$$f; log='$@'; 				\
+if test -n '$(DISABLE_HARD_ERRORS)'; then		\
+  am__enable_hard_errors=no; 				\
+else							\
+  am__enable_hard_errors=yes; 				\
+fi; 							\
+case " $(XFAIL_TESTS) " in				\
+  *[\ \	]$$f[\ \	]* | *[\ \	]$$dir$$f[\ \	]*) \
+    am__expect_failure=yes;;				\
+  *)							\
+    am__expect_failure=no;;				\
+esac; 							\
+$(AM_TESTS_ENVIRONMENT) $(TESTS_ENVIRONMENT)
+# A shell command to get the names of the test scripts with any registered
+# extension removed (i.e., equivalently, the names of the test logs, with
+# the '.log' extension removed).  The result is saved in the shell variable
+# '$bases'.  This honors runtime overriding of TESTS and TEST_LOGS.  Sadly,
+# we cannot use something simpler, involving e.g., "$(TEST_LOGS:.log=)",
+# since that might cause problems with VPATH rewrites for suffix-less tests.
+# See also 'test-harness-vpath-rewrite.sh' and 'test-trs-basic.sh'.
+am__set_TESTS_bases = \
+  bases='$(TEST_LOGS)'; \
+  bases=`for i in $$bases; do echo $$i; done | sed 's/\.log$$//'`; \
+  bases=`echo $$bases`
+RECHECK_LOGS = $(TEST_LOGS)
+TEST_SUITE_LOG = test-suite.log
+TEST_EXTENSIONS = @EXEEXT@ .test
+LOG_DRIVER = $(SHELL) $(top_srcdir)/test-driver
+LOG_COMPILE = $(LOG_COMPILER) $(AM_LOG_FLAGS) $(LOG_FLAGS)
+am__set_b = \
+  case '$@' in \
+    */*) \
+      case '$*' in \
+        */*) b='$*';; \
+          *) b=`echo '$@' | sed 's/\.log$$//'`; \
+       esac;; \
+    *) \
+      b='$*';; \
+  esac
+am__test_logs1 = $(TESTS:=.log)
+am__test_logs2 = $(am__test_logs1:@EXEEXT@.log=.log)
+TEST_LOGS = $(am__test_logs2:.test.log=.log)
+TEST_LOG_DRIVER = $(SHELL) $(top_srcdir)/test-driver
+TEST_LOG_COMPILE = $(TEST_LOG_COMPILER) $(AM_TEST_LOG_FLAGS) \
+	$(TEST_LOG_FLAGS)
+am__DIST_COMMON = $(srcdir)/Makefile.in $(srcdir)/celt_headers.mk \
+	$(srcdir)/celt_sources.mk $(srcdir)/config.h.in \
+	$(srcdir)/opus-uninstalled.pc.in $(srcdir)/opus.pc.in \
+	$(srcdir)/opus_headers.mk $(srcdir)/opus_sources.mk \
+	$(srcdir)/silk_headers.mk $(srcdir)/silk_sources.mk \
+	$(top_srcdir)/celt/arm/armopts.s.in AUTHORS COPYING ChangeLog \
+	INSTALL NEWS README compile config.guess config.sub depcomp \
+	install-sh ltmain.sh missing test-driver
 DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
 distdir = $(PACKAGE)-$(VERSION)
 top_distdir = $(distdir)
@@ -548,6 +854,7 @@
       && rm -rf "$(distdir)" \
       || { sleep 5 && rm -rf "$(distdir)"; }; \
   else :; fi
+am__post_remove_distdir = $(am__remove_distdir)
 am__relativize = \
   dir0=`pwd`; \
   sed_first='s,^\([^/]*\)/.*$$,\1,'; \
@@ -575,6 +882,7 @@
   reldir="$$dir2"
 DIST_ARCHIVES = $(distdir).tar.gz
 GZIP_ENV = --best
+DIST_TARGETS = dist-gzip
 distuninstallcheck_listfiles = find . -type f -print
 am__distuninstallcheck_listfiles = $(distuninstallcheck_listfiles) \
   | sed 's|^\./|$(prefix)/|' | grep -v '$(infodir)/dir$$'
@@ -583,6 +891,8 @@
 AMTAR = @AMTAR@
 AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
 AR = @AR@
+ARM2GNU_PARAMS = @ARM2GNU_PARAMS@
+ARM_NEON_INTR_CFLAGS = @ARM_NEON_INTR_CFLAGS@
 AUTOCONF = @AUTOCONF@
 AUTOHEADER = @AUTOHEADER@
 AUTOMAKE = @AUTOMAKE@
@@ -608,6 +918,7 @@
 EXEEXT = @EXEEXT@
 FGREP = @FGREP@
 GREP = @GREP@
+HAVE_ARM_NE10 = @HAVE_ARM_NE10@
 HAVE_DOXYGEN = @HAVE_DOXYGEN@
 HAVE_PERL = @HAVE_PERL@
 INSTALL = @INSTALL@
@@ -628,6 +939,8 @@
 MAKEINFO = @MAKEINFO@
 MANIFEST_TOOL = @MANIFEST_TOOL@
 MKDIR_P = @MKDIR_P@
+NE10_CFLAGS = @NE10_CFLAGS@
+NE10_LIBS = @NE10_LIBS@
 NM = @NM@
 NMEDIT = @NMEDIT@
 OBJDUMP = @OBJDUMP@
@@ -635,10 +948,15 @@
 OPUS_ARM_MAY_HAVE_EDSP = @OPUS_ARM_MAY_HAVE_EDSP@
 OPUS_ARM_MAY_HAVE_MEDIA = @OPUS_ARM_MAY_HAVE_MEDIA@
 OPUS_ARM_MAY_HAVE_NEON = @OPUS_ARM_MAY_HAVE_NEON@
+OPUS_ARM_NEON_INTR_CFLAGS = @OPUS_ARM_NEON_INTR_CFLAGS@
 OPUS_HAVE_RTCD = @OPUS_HAVE_RTCD@
 OPUS_LT_AGE = @OPUS_LT_AGE@
 OPUS_LT_CURRENT = @OPUS_LT_CURRENT@
 OPUS_LT_REVISION = @OPUS_LT_REVISION@
+OPUS_X86_AVX_CFLAGS = @OPUS_X86_AVX_CFLAGS@
+OPUS_X86_SSE2_CFLAGS = @OPUS_X86_SSE2_CFLAGS@
+OPUS_X86_SSE4_1_CFLAGS = @OPUS_X86_SSE4_1_CFLAGS@
+OPUS_X86_SSE_CFLAGS = @OPUS_X86_SSE_CFLAGS@
 OTOOL = @OTOOL@
 OTOOL64 = @OTOOL64@
 PACKAGE = @PACKAGE@
@@ -656,6 +974,10 @@
 SHELL = @SHELL@
 STRIP = @STRIP@
 VERSION = @VERSION@
+X86_AVX_CFLAGS = @X86_AVX_CFLAGS@
+X86_SSE2_CFLAGS = @X86_SSE2_CFLAGS@
+X86_SSE4_1_CFLAGS = @X86_SSE4_1_CFLAGS@
+X86_SSE_CFLAGS = @X86_SSE_CFLAGS@
 abs_builddir = @abs_builddir@
 abs_srcdir = @abs_srcdir@
 abs_top_builddir = @abs_top_builddir@
@@ -713,13 +1035,23 @@
 lib_LTLIBRARIES = libopus.la
 DIST_SUBDIRS = doc
 AM_CPPFLAGS = -I$(top_srcdir)/include -I$(top_srcdir)/celt -I$(top_srcdir)/silk \
-              -I$(top_srcdir)/silk/float -I$(top_srcdir)/silk/fixed
+              -I$(top_srcdir)/silk/float -I$(top_srcdir)/silk/fixed $(NE10_CFLAGS)
 
 CELT_SOURCES = celt/bands.c celt/celt.c celt/celt_encoder.c \
 	celt/celt_decoder.c celt/cwrs.c celt/entcode.c celt/entdec.c \
 	celt/entenc.c celt/kiss_fft.c celt/laplace.c celt/mathops.c \
 	celt/mdct.c celt/modes.c celt/pitch.c celt/celt_lpc.c \
-	celt/quant_bands.c celt/rate.c celt/vq.c $(am__append_4)
+	celt/quant_bands.c celt/rate.c celt/vq.c $(am__append_6) \
+	$(am__append_7) $(am__append_8) $(am__append_9) \
+	$(am__append_11) $(am__append_12)
+CELT_SOURCES_SSE = celt/x86/x86cpu.c \
+celt/x86/x86_celt_map.c \
+celt/x86/pitch_sse.c
+
+CELT_SOURCES_SSE2 = celt/x86/pitch_sse2.c
+CELT_SOURCES_SSE4_1 = celt/x86/celt_lpc_sse.c \
+celt/x86/pitch_sse4_1.c
+
 CELT_SOURCES_ARM = \
 celt/arm/armcpu.c \
 celt/arm/arm_celt_map.c
@@ -730,6 +1062,13 @@
 CELT_AM_SOURCES_ARM_ASM = \
 celt/arm/armopts.s.in
 
+CELT_SOURCES_ARM_NEON_INTR = \
+celt/arm/celt_neon_intr.c
+
+CELT_SOURCES_ARM_NE10 = \
+celt/arm/celt_ne10_fft.c \
+celt/arm/celt_ne10_mdct.c
+
 SILK_SOURCES = silk/CNG.c silk/code_signs.c silk/init_decoder.c \
 	silk/decode_core.c silk/decode_frame.c \
 	silk/decode_parameters.c silk/decode_indices.c \
@@ -761,7 +1100,14 @@
 	silk/sigm_Q15.c silk/sort.c silk/sum_sqr_shift.c \
 	silk/stereo_decode_pred.c silk/stereo_encode_pred.c \
 	silk/stereo_find_predictor.c silk/stereo_quant_pred.c \
-	$(am__append_1) $(am__append_2) $(am__append_5)
+	$(am__append_1) $(am__append_2) $(am__append_3) \
+	$(am__append_4) $(am__append_10)
+SILK_SOURCES_SSE4_1 = silk/x86/NSQ_sse.c \
+silk/x86/NSQ_del_dec_sse.c \
+silk/x86/x86_silk_map.c \
+silk/x86/VAD_sse.c \
+silk/x86/VQ_WMat_EC_sse.c
+
 SILK_SOURCES_FIXED = \
 silk/fixed/LTP_analysis_filter_FIX.c \
 silk/fixed/LTP_scale_ctrl_FIX.c \
@@ -789,6 +1135,10 @@
 silk/fixed/schur64_FIX.c \
 silk/fixed/schur_FIX.c
 
+SILK_SOURCES_FIXED_SSE4_1 = silk/fixed/x86/vector_ops_FIX_sse.c \
+silk/fixed/x86/burg_modified_FIX_sse.c \
+silk/fixed/x86/prefilter_FIX_sse.c
+
 SILK_SOURCES_FLOAT = \
 silk/float/apply_sine_window_FLP.c \
 silk/float/corrMatrix_FLP.c \
@@ -825,13 +1175,14 @@
 OPUS_SOURCES = src/opus.c src/opus_decoder.c src/opus_encoder.c \
 	src/opus_multistream.c src/opus_multistream_encoder.c \
 	src/opus_multistream_decoder.c src/repacketizer.c \
-	$(am__append_3)
+	$(am__append_5)
 OPUS_SOURCES_FLOAT = \
 src/analysis.c \
 src/mlp.c \
 src/mlp_data.c
 
-@CPU_ARM_TRUE@@OPUS_ARM_EXTERNAL_ASM_TRUE@nodist_libopus_la_SOURCES = $(CELT_SOURCES_ARM_ASM:.s=-gnu.S)
+@CPU_ARM_TRUE@@OPUS_ARM_EXTERNAL_ASM_TRUE@noinst_LTLIBRARIES = libarmasm.la
+@CPU_ARM_TRUE@@OPUS_ARM_EXTERNAL_ASM_TRUE@libarmasm_la_SOURCES = $(CELT_SOURCES_ARM_ASM:.s=-gnu.S)
 @CPU_ARM_TRUE@@OPUS_ARM_EXTERNAL_ASM_TRUE@BUILT_SOURCES = $(CELT_SOURCES_ARM_ASM:.s=-gnu.S) \
 @CPU_ARM_TRUE@@OPUS_ARM_EXTERNAL_ASM_TRUE@ $(CELT_AM_SOURCES_ARM_ASM:.s.in=.s) \
 @CPU_ARM_TRUE@@OPUS_ARM_EXTERNAL_ASM_TRUE@ $(CELT_AM_SOURCES_ARM_ASM:.s.in=-gnu.S)
@@ -865,19 +1216,31 @@
 celt/os_support.h \
 celt/pitch.h \
 celt/celt_lpc.h \
+celt/x86/celt_lpc_sse.h \
 celt/quant_bands.h \
 celt/rate.h \
 celt/stack_alloc.h \
 celt/vq.h \
 celt/static_modes_float.h \
 celt/static_modes_fixed.h \
+celt/static_modes_float_arm_ne10.h \
+celt/static_modes_fixed_arm_ne10.h \
 celt/arm/armcpu.h \
 celt/arm/fixed_armv4.h \
 celt/arm/fixed_armv5e.h \
 celt/arm/kiss_fft_armv4.h \
 celt/arm/kiss_fft_armv5e.h \
 celt/arm/pitch_arm.h \
-celt/x86/pitch_sse.h
+celt/arm/fft_arm.h \
+celt/arm/mdct_arm.h \
+celt/mips/celt_mipsr1.h \
+celt/mips/fixed_generic_mipsr1.h \
+celt/mips/kiss_fft_mipsr1.h \
+celt/mips/mdct_mipsr1.h \
+celt/mips/pitch_mipsr1.h \
+celt/mips/vq_mipsr1.h \
+celt/x86/pitch_sse.h \
+celt/x86/x86cpu.h
 
 SILK_HEAD = \
 silk/debug.h \
@@ -887,6 +1250,7 @@
 silk/typedef.h \
 silk/define.h \
 silk/main.h \
+silk/x86/main_sse.h \
 silk/PLC.h \
 silk/structs.h \
 silk/tables.h \
@@ -900,15 +1264,22 @@
 silk/resampler_rom.h \
 silk/resampler_structs.h \
 silk/SigProc_FIX.h \
+silk/x86/SigProc_FIX_sse.h \
 silk/arm/macros_armv4.h \
 silk/arm/macros_armv5e.h \
 silk/arm/SigProc_FIX_armv4.h \
 silk/arm/SigProc_FIX_armv5e.h \
 silk/fixed/main_FIX.h \
 silk/fixed/structs_FIX.h \
+silk/fixed/mips/noise_shape_analysis_FIX_mipsr1.h \
+silk/fixed/mips/prefilter_FIX_mipsr1.h \
+silk/fixed/mips/warped_autocorrelation_FIX_mipsr1.h \
 silk/float/main_FLP.h \
 silk/float/structs_FLP.h \
-silk/float/SigProc_FLP.h
+silk/float/SigProc_FLP.h \
+silk/mips/macros_mipsr1.h \
+silk/mips/NSQ_del_dec_mipsr1.h \
+silk/mips/sigproc_fix_mipsr1.h
 
 OPUS_HEAD = \
 include/opus.h \
@@ -920,38 +1291,42 @@
 
 libopus_la_SOURCES = $(CELT_SOURCES) $(SILK_SOURCES) $(OPUS_SOURCES)
 libopus_la_LDFLAGS = -no-undefined -version-info @OPUS_LT_CURRENT@:@OPUS_LT_REVISION@:@OPUS_LT_AGE@
-libopus_la_LIBADD = $(LIBM)
+libopus_la_LIBADD = $(NE10_LIBS) $(LIBM) $(am__append_13)
 pkginclude_HEADERS = include/opus.h include/opus_multistream.h \
-	include/opus_types.h include/opus_defines.h $(am__append_6)
+	include/opus_types.h include/opus_defines.h $(am__append_18)
 noinst_HEADERS = $(OPUS_HEAD) $(SILK_HEAD) $(CELT_HEAD)
 @EXTRA_PROGRAMS_TRUE@opus_demo_SOURCES = src/opus_demo.c
-@EXTRA_PROGRAMS_TRUE@opus_demo_LDADD = libopus.la $(LIBM)
+@EXTRA_PROGRAMS_TRUE@opus_demo_LDADD = libopus.la $(NE10_LIBS) $(LIBM)
 @EXTRA_PROGRAMS_TRUE@repacketizer_demo_SOURCES = src/repacketizer_demo.c
-@EXTRA_PROGRAMS_TRUE@repacketizer_demo_LDADD = libopus.la $(LIBM)
+@EXTRA_PROGRAMS_TRUE@repacketizer_demo_LDADD = libopus.la $(NE10_LIBS) $(LIBM)
 @EXTRA_PROGRAMS_TRUE@opus_compare_SOURCES = src/opus_compare.c
 @EXTRA_PROGRAMS_TRUE@opus_compare_LDADD = $(LIBM)
 @EXTRA_PROGRAMS_TRUE@tests_test_opus_api_SOURCES = tests/test_opus_api.c tests/test_opus_common.h
-@EXTRA_PROGRAMS_TRUE@tests_test_opus_api_LDADD = libopus.la $(LIBM)
+@EXTRA_PROGRAMS_TRUE@tests_test_opus_api_LDADD = libopus.la $(NE10_LIBS) $(LIBM)
 @EXTRA_PROGRAMS_TRUE@tests_test_opus_encode_SOURCES = tests/test_opus_encode.c tests/test_opus_common.h
-@EXTRA_PROGRAMS_TRUE@tests_test_opus_encode_LDADD = libopus.la $(LIBM)
+@EXTRA_PROGRAMS_TRUE@tests_test_opus_encode_LDADD = libopus.la $(NE10_LIBS) $(LIBM)
 @EXTRA_PROGRAMS_TRUE@tests_test_opus_decode_SOURCES = tests/test_opus_decode.c tests/test_opus_common.h
-@EXTRA_PROGRAMS_TRUE@tests_test_opus_decode_LDADD = libopus.la $(LIBM)
+@EXTRA_PROGRAMS_TRUE@tests_test_opus_decode_LDADD = libopus.la $(NE10_LIBS) $(LIBM)
 @EXTRA_PROGRAMS_TRUE@tests_test_opus_padding_SOURCES = tests/test_opus_padding.c tests/test_opus_common.h
-@EXTRA_PROGRAMS_TRUE@tests_test_opus_padding_LDADD = libopus.la $(LIBM)
+@EXTRA_PROGRAMS_TRUE@tests_test_opus_padding_LDADD = libopus.la $(NE10_LIBS) $(LIBM)
 @EXTRA_PROGRAMS_TRUE@celt_tests_test_unit_cwrs32_SOURCES = celt/tests/test_unit_cwrs32.c
 @EXTRA_PROGRAMS_TRUE@celt_tests_test_unit_cwrs32_LDADD = $(LIBM)
 @EXTRA_PROGRAMS_TRUE@celt_tests_test_unit_dft_SOURCES = celt/tests/test_unit_dft.c
-@EXTRA_PROGRAMS_TRUE@celt_tests_test_unit_dft_LDADD = $(LIBM)
+@EXTRA_PROGRAMS_TRUE@celt_tests_test_unit_dft_LDADD = $(NE10_LIBS) \
+@EXTRA_PROGRAMS_TRUE@	$(LIBM) $(am__append_14)
 @EXTRA_PROGRAMS_TRUE@celt_tests_test_unit_entropy_SOURCES = celt/tests/test_unit_entropy.c
 @EXTRA_PROGRAMS_TRUE@celt_tests_test_unit_entropy_LDADD = $(LIBM)
 @EXTRA_PROGRAMS_TRUE@celt_tests_test_unit_laplace_SOURCES = celt/tests/test_unit_laplace.c
 @EXTRA_PROGRAMS_TRUE@celt_tests_test_unit_laplace_LDADD = $(LIBM)
 @EXTRA_PROGRAMS_TRUE@celt_tests_test_unit_mathops_SOURCES = celt/tests/test_unit_mathops.c
-@EXTRA_PROGRAMS_TRUE@celt_tests_test_unit_mathops_LDADD = $(LIBM)
+@EXTRA_PROGRAMS_TRUE@celt_tests_test_unit_mathops_LDADD =  \
+@EXTRA_PROGRAMS_TRUE@	$(NE10_LIBS) $(LIBM) $(am__append_15)
 @EXTRA_PROGRAMS_TRUE@celt_tests_test_unit_mdct_SOURCES = celt/tests/test_unit_mdct.c
-@EXTRA_PROGRAMS_TRUE@celt_tests_test_unit_mdct_LDADD = $(LIBM)
+@EXTRA_PROGRAMS_TRUE@celt_tests_test_unit_mdct_LDADD = $(NE10_LIBS) \
+@EXTRA_PROGRAMS_TRUE@	$(LIBM) $(am__append_16)
 @EXTRA_PROGRAMS_TRUE@celt_tests_test_unit_rotation_SOURCES = celt/tests/test_unit_rotation.c
-@EXTRA_PROGRAMS_TRUE@celt_tests_test_unit_rotation_LDADD = $(LIBM)
+@EXTRA_PROGRAMS_TRUE@celt_tests_test_unit_rotation_LDADD =  \
+@EXTRA_PROGRAMS_TRUE@	$(NE10_LIBS) $(LIBM) $(am__append_17)
 @EXTRA_PROGRAMS_TRUE@celt_tests_test_unit_types_SOURCES = celt/tests/test_unit_types.c
 @EXTRA_PROGRAMS_TRUE@celt_tests_test_unit_types_LDADD = $(LIBM)
 @CUSTOM_MODES_TRUE@@EXTRA_PROGRAMS_TRUE@opus_custom_demo_SOURCES = celt/opus_custom_demo.c
@@ -960,6 +1335,7 @@
              opus.pc.in \
              opus-uninstalled.pc.in \
              opus.m4 \
+             Makefile.mips \
              Makefile.unix \
              tests/run_vectors.sh \
              celt/arm/arm2gnu.pl \
@@ -990,11 +1366,23 @@
 pkgconfig_DATA = opus.pc
 m4datadir = $(datadir)/aclocal
 m4data_DATA = opus.m4
+OPT_UNIT_TEST_OBJ = $(celt_tests_test_unit_mathops_SOURCES:.c=.o) \
+                    $(celt_tests_test_unit_rotation_SOURCES:.c=.o) \
+                    $(celt_tests_test_unit_mdct_SOURCES:.c=.o) \
+                    $(celt_tests_test_unit_dft_SOURCES:.c=.o)
+
+@HAVE_SSE_TRUE@SSE_OBJ = $(CELT_SOURCES_SSE:.c=.lo)
+@HAVE_SSE2_TRUE@SSE2_OBJ = $(CELT_SOURCES_SSE2:.c=.lo)
+@HAVE_SSE4_1_TRUE@SSE4_1_OBJ = $(CELT_SOURCES_SSE4_1:.c=.lo) \
+@HAVE_SSE4_1_TRUE@             $(SILK_SOURCES_SSE4_1:.c=.lo) \
+@HAVE_SSE4_1_TRUE@             $(SILK_SOURCES_FIXED_SSE4_1:.c=.lo)
+
+@OPUS_ARM_NEON_INTR_TRUE@CELT_ARM_NEON_INTR_OBJ = $(CELT_SOURCES_ARM_NEON_INTR:.c=.lo)
 all: $(BUILT_SOURCES) config.h
 	$(MAKE) $(AM_MAKEFLAGS) all-recursive
 
 .SUFFIXES:
-.SUFFIXES: .S .c .lo .o .obj
+.SUFFIXES: .S .c .lo .log .o .obj .test .test$(EXEEXT) .trs
 am--refresh: Makefile
 	@:
 $(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(srcdir)/celt_sources.mk $(srcdir)/silk_sources.mk $(srcdir)/opus_sources.mk $(srcdir)/celt_headers.mk $(srcdir)/silk_headers.mk $(srcdir)/opus_headers.mk $(am__configure_deps)
@@ -1010,7 +1398,6 @@
 	echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu Makefile'; \
 	$(am__cd) $(top_srcdir) && \
 	  $(AUTOMAKE) --gnu Makefile
-.PRECIOUS: Makefile
 Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
 	@case '$?' in \
 	  *config.status*) \
@@ -1020,7 +1407,7 @@
 	    echo ' cd $(top_builddir) && $(SHELL) ./config.status $@ $(am__depfiles_maybe)'; \
 	    cd $(top_builddir) && $(SHELL) ./config.status $@ $(am__depfiles_maybe);; \
 	esac;
-$(srcdir)/celt_sources.mk $(srcdir)/silk_sources.mk $(srcdir)/opus_sources.mk $(srcdir)/celt_headers.mk $(srcdir)/silk_headers.mk $(srcdir)/opus_headers.mk:
+$(srcdir)/celt_sources.mk $(srcdir)/silk_sources.mk $(srcdir)/opus_sources.mk $(srcdir)/celt_headers.mk $(srcdir)/silk_headers.mk $(srcdir)/opus_headers.mk $(am__empty):
 
 $(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
 	$(SHELL) ./config.status --recheck
@@ -1032,8 +1419,8 @@
 $(am__aclocal_m4_deps):
 
 config.h: stamp-h1
-	@if test ! -f $@; then rm -f stamp-h1; else :; fi
-	@if test ! -f $@; then $(MAKE) $(AM_MAKEFLAGS) stamp-h1; else :; fi
+	@test -f $@ || rm -f stamp-h1
+	@test -f $@ || $(MAKE) $(AM_MAKEFLAGS) stamp-h1
 
 stamp-h1: $(srcdir)/config.h.in $(top_builddir)/config.status
 	@rm -f stamp-h1
@@ -1051,6 +1438,7 @@
 	cd $(top_builddir) && $(SHELL) ./config.status $@
 celt/arm/armopts.s: $(top_builddir)/config.status $(top_srcdir)/celt/arm/armopts.s.in
 	cd $(top_builddir) && $(SHELL) ./config.status $@
+
 install-libLTLIBRARIES: $(lib_LTLIBRARIES)
 	@$(NORMAL_INSTALL)
 	@list='$(lib_LTLIBRARIES)'; test -n "$(libdir)" || list=; \
@@ -1077,12 +1465,36 @@
 
 clean-libLTLIBRARIES:
 	-test -z "$(lib_LTLIBRARIES)" || rm -f $(lib_LTLIBRARIES)
-	@list='$(lib_LTLIBRARIES)'; for p in $$list; do \
-	  dir="`echo $$p | sed -e 's|/[^/]*$$||'`"; \
-	  test "$$dir" != "$$p" || dir=.; \
-	  echo "rm -f \"$${dir}/so_locations\""; \
-	  rm -f "$${dir}/so_locations"; \
-	done
+	@list='$(lib_LTLIBRARIES)'; \
+	locs=`for p in $$list; do echo $$p; done | \
+	      sed 's|^[^/]*$$|.|; s|/[^/]*$$||; s|$$|/so_locations|' | \
+	      sort -u`; \
+	test -z "$$locs" || { \
+	  echo rm -f $${locs}; \
+	  rm -f $${locs}; \
+	}
+
+clean-noinstLTLIBRARIES:
+	-test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES)
+	@list='$(noinst_LTLIBRARIES)'; \
+	locs=`for p in $$list; do echo $$p; done | \
+	      sed 's|^[^/]*$$|.|; s|/[^/]*$$||; s|$$|/so_locations|' | \
+	      sort -u`; \
+	test -z "$$locs" || { \
+	  echo rm -f $${locs}; \
+	  rm -f $${locs}; \
+	}
+celt/arm/$(am__dirstamp):
+	@$(MKDIR_P) celt/arm
+	@: > celt/arm/$(am__dirstamp)
+celt/arm/$(DEPDIR)/$(am__dirstamp):
+	@$(MKDIR_P) celt/arm/$(DEPDIR)
+	@: > celt/arm/$(DEPDIR)/$(am__dirstamp)
+celt/arm/celt_pitch_xcorr_arm-gnu.lo: celt/arm/$(am__dirstamp) \
+	celt/arm/$(DEPDIR)/$(am__dirstamp)
+
+libarmasm.la: $(libarmasm_la_OBJECTS) $(libarmasm_la_DEPENDENCIES) $(EXTRA_libarmasm_la_DEPENDENCIES) 
+	$(AM_V_CCLD)$(LINK) $(am_libarmasm_la_rpath) $(libarmasm_la_OBJECTS) $(libarmasm_la_LIBADD) $(LIBS)
 celt/$(am__dirstamp):
 	@$(MKDIR_P) celt
 	@: > celt/$(am__dirstamp)
@@ -1110,16 +1522,34 @@
 	celt/$(DEPDIR)/$(am__dirstamp)
 celt/rate.lo: celt/$(am__dirstamp) celt/$(DEPDIR)/$(am__dirstamp)
 celt/vq.lo: celt/$(am__dirstamp) celt/$(DEPDIR)/$(am__dirstamp)
-celt/arm/$(am__dirstamp):
-	@$(MKDIR_P) celt/arm
-	@: > celt/arm/$(am__dirstamp)
-celt/arm/$(DEPDIR)/$(am__dirstamp):
-	@$(MKDIR_P) celt/arm/$(DEPDIR)
-	@: > celt/arm/$(DEPDIR)/$(am__dirstamp)
+celt/x86/$(am__dirstamp):
+	@$(MKDIR_P) celt/x86
+	@: > celt/x86/$(am__dirstamp)
+celt/x86/$(DEPDIR)/$(am__dirstamp):
+	@$(MKDIR_P) celt/x86/$(DEPDIR)
+	@: > celt/x86/$(DEPDIR)/$(am__dirstamp)
+celt/x86/x86cpu.lo: celt/x86/$(am__dirstamp) \
+	celt/x86/$(DEPDIR)/$(am__dirstamp)
+celt/x86/x86_celt_map.lo: celt/x86/$(am__dirstamp) \
+	celt/x86/$(DEPDIR)/$(am__dirstamp)
+celt/x86/pitch_sse.lo: celt/x86/$(am__dirstamp) \
+	celt/x86/$(DEPDIR)/$(am__dirstamp)
+celt/x86/pitch_sse2.lo: celt/x86/$(am__dirstamp) \
+	celt/x86/$(DEPDIR)/$(am__dirstamp)
+celt/x86/celt_lpc_sse.lo: celt/x86/$(am__dirstamp) \
+	celt/x86/$(DEPDIR)/$(am__dirstamp)
+celt/x86/pitch_sse4_1.lo: celt/x86/$(am__dirstamp) \
+	celt/x86/$(DEPDIR)/$(am__dirstamp)
 celt/arm/armcpu.lo: celt/arm/$(am__dirstamp) \
 	celt/arm/$(DEPDIR)/$(am__dirstamp)
 celt/arm/arm_celt_map.lo: celt/arm/$(am__dirstamp) \
 	celt/arm/$(DEPDIR)/$(am__dirstamp)
+celt/arm/celt_neon_intr.lo: celt/arm/$(am__dirstamp) \
+	celt/arm/$(DEPDIR)/$(am__dirstamp)
+celt/arm/celt_ne10_fft.lo: celt/arm/$(am__dirstamp) \
+	celt/arm/$(DEPDIR)/$(am__dirstamp)
+celt/arm/celt_ne10_mdct.lo: celt/arm/$(am__dirstamp) \
+	celt/arm/$(DEPDIR)/$(am__dirstamp)
 silk/$(am__dirstamp):
 	@$(MKDIR_P) silk
 	@: > silk/$(am__dirstamp)
@@ -1319,6 +1749,35 @@
 	silk/fixed/$(DEPDIR)/$(am__dirstamp)
 silk/fixed/schur_FIX.lo: silk/fixed/$(am__dirstamp) \
 	silk/fixed/$(DEPDIR)/$(am__dirstamp)
+silk/x86/$(am__dirstamp):
+	@$(MKDIR_P) silk/x86
+	@: > silk/x86/$(am__dirstamp)
+silk/x86/$(DEPDIR)/$(am__dirstamp):
+	@$(MKDIR_P) silk/x86/$(DEPDIR)
+	@: > silk/x86/$(DEPDIR)/$(am__dirstamp)
+silk/x86/NSQ_sse.lo: silk/x86/$(am__dirstamp) \
+	silk/x86/$(DEPDIR)/$(am__dirstamp)
+silk/x86/NSQ_del_dec_sse.lo: silk/x86/$(am__dirstamp) \
+	silk/x86/$(DEPDIR)/$(am__dirstamp)
+silk/x86/x86_silk_map.lo: silk/x86/$(am__dirstamp) \
+	silk/x86/$(DEPDIR)/$(am__dirstamp)
+silk/x86/VAD_sse.lo: silk/x86/$(am__dirstamp) \
+	silk/x86/$(DEPDIR)/$(am__dirstamp)
+silk/x86/VQ_WMat_EC_sse.lo: silk/x86/$(am__dirstamp) \
+	silk/x86/$(DEPDIR)/$(am__dirstamp)
+silk/fixed/x86/$(am__dirstamp):
+	@$(MKDIR_P) silk/fixed/x86
+	@: > silk/fixed/x86/$(am__dirstamp)
+silk/fixed/x86/$(DEPDIR)/$(am__dirstamp):
+	@$(MKDIR_P) silk/fixed/x86/$(DEPDIR)
+	@: > silk/fixed/x86/$(DEPDIR)/$(am__dirstamp)
+silk/fixed/x86/vector_ops_FIX_sse.lo: silk/fixed/x86/$(am__dirstamp) \
+	silk/fixed/x86/$(DEPDIR)/$(am__dirstamp)
+silk/fixed/x86/burg_modified_FIX_sse.lo:  \
+	silk/fixed/x86/$(am__dirstamp) \
+	silk/fixed/x86/$(DEPDIR)/$(am__dirstamp)
+silk/fixed/x86/prefilter_FIX_sse.lo: silk/fixed/x86/$(am__dirstamp) \
+	silk/fixed/x86/$(DEPDIR)/$(am__dirstamp)
 silk/float/$(am__dirstamp):
 	@$(MKDIR_P) silk/float
 	@: > silk/float/$(am__dirstamp)
@@ -1406,8 +1865,7 @@
 src/analysis.lo: src/$(am__dirstamp) src/$(DEPDIR)/$(am__dirstamp)
 src/mlp.lo: src/$(am__dirstamp) src/$(DEPDIR)/$(am__dirstamp)
 src/mlp_data.lo: src/$(am__dirstamp) src/$(DEPDIR)/$(am__dirstamp)
-celt/arm/celt_pitch_xcorr_arm-gnu.lo: celt/arm/$(am__dirstamp) \
-	celt/arm/$(DEPDIR)/$(am__dirstamp)
+
 libopus.la: $(libopus_la_OBJECTS) $(libopus_la_DEPENDENCIES) $(EXTRA_libopus_la_DEPENDENCIES) 
 	$(AM_V_CCLD)$(libopus_la_LINK) -rpath $(libdir) $(libopus_la_OBJECTS) $(libopus_la_LIBADD) $(LIBS)
 
@@ -1427,61 +1885,73 @@
 	@: > celt/tests/$(DEPDIR)/$(am__dirstamp)
 celt/tests/test_unit_cwrs32.$(OBJEXT): celt/tests/$(am__dirstamp) \
 	celt/tests/$(DEPDIR)/$(am__dirstamp)
+
 celt/tests/test_unit_cwrs32$(EXEEXT): $(celt_tests_test_unit_cwrs32_OBJECTS) $(celt_tests_test_unit_cwrs32_DEPENDENCIES) $(EXTRA_celt_tests_test_unit_cwrs32_DEPENDENCIES) celt/tests/$(am__dirstamp)
 	@rm -f celt/tests/test_unit_cwrs32$(EXEEXT)
 	$(AM_V_CCLD)$(LINK) $(celt_tests_test_unit_cwrs32_OBJECTS) $(celt_tests_test_unit_cwrs32_LDADD) $(LIBS)
 celt/tests/test_unit_dft.$(OBJEXT): celt/tests/$(am__dirstamp) \
 	celt/tests/$(DEPDIR)/$(am__dirstamp)
+
 celt/tests/test_unit_dft$(EXEEXT): $(celt_tests_test_unit_dft_OBJECTS) $(celt_tests_test_unit_dft_DEPENDENCIES) $(EXTRA_celt_tests_test_unit_dft_DEPENDENCIES) celt/tests/$(am__dirstamp)
 	@rm -f celt/tests/test_unit_dft$(EXEEXT)
 	$(AM_V_CCLD)$(LINK) $(celt_tests_test_unit_dft_OBJECTS) $(celt_tests_test_unit_dft_LDADD) $(LIBS)
 celt/tests/test_unit_entropy.$(OBJEXT): celt/tests/$(am__dirstamp) \
 	celt/tests/$(DEPDIR)/$(am__dirstamp)
+
 celt/tests/test_unit_entropy$(EXEEXT): $(celt_tests_test_unit_entropy_OBJECTS) $(celt_tests_test_unit_entropy_DEPENDENCIES) $(EXTRA_celt_tests_test_unit_entropy_DEPENDENCIES) celt/tests/$(am__dirstamp)
 	@rm -f celt/tests/test_unit_entropy$(EXEEXT)
 	$(AM_V_CCLD)$(LINK) $(celt_tests_test_unit_entropy_OBJECTS) $(celt_tests_test_unit_entropy_LDADD) $(LIBS)
 celt/tests/test_unit_laplace.$(OBJEXT): celt/tests/$(am__dirstamp) \
 	celt/tests/$(DEPDIR)/$(am__dirstamp)
+
 celt/tests/test_unit_laplace$(EXEEXT): $(celt_tests_test_unit_laplace_OBJECTS) $(celt_tests_test_unit_laplace_DEPENDENCIES) $(EXTRA_celt_tests_test_unit_laplace_DEPENDENCIES) celt/tests/$(am__dirstamp)
 	@rm -f celt/tests/test_unit_laplace$(EXEEXT)
 	$(AM_V_CCLD)$(LINK) $(celt_tests_test_unit_laplace_OBJECTS) $(celt_tests_test_unit_laplace_LDADD) $(LIBS)
 celt/tests/test_unit_mathops.$(OBJEXT): celt/tests/$(am__dirstamp) \
 	celt/tests/$(DEPDIR)/$(am__dirstamp)
+
 celt/tests/test_unit_mathops$(EXEEXT): $(celt_tests_test_unit_mathops_OBJECTS) $(celt_tests_test_unit_mathops_DEPENDENCIES) $(EXTRA_celt_tests_test_unit_mathops_DEPENDENCIES) celt/tests/$(am__dirstamp)
 	@rm -f celt/tests/test_unit_mathops$(EXEEXT)
 	$(AM_V_CCLD)$(LINK) $(celt_tests_test_unit_mathops_OBJECTS) $(celt_tests_test_unit_mathops_LDADD) $(LIBS)
 celt/tests/test_unit_mdct.$(OBJEXT): celt/tests/$(am__dirstamp) \
 	celt/tests/$(DEPDIR)/$(am__dirstamp)
+
 celt/tests/test_unit_mdct$(EXEEXT): $(celt_tests_test_unit_mdct_OBJECTS) $(celt_tests_test_unit_mdct_DEPENDENCIES) $(EXTRA_celt_tests_test_unit_mdct_DEPENDENCIES) celt/tests/$(am__dirstamp)
 	@rm -f celt/tests/test_unit_mdct$(EXEEXT)
 	$(AM_V_CCLD)$(LINK) $(celt_tests_test_unit_mdct_OBJECTS) $(celt_tests_test_unit_mdct_LDADD) $(LIBS)
 celt/tests/test_unit_rotation.$(OBJEXT): celt/tests/$(am__dirstamp) \
 	celt/tests/$(DEPDIR)/$(am__dirstamp)
+
 celt/tests/test_unit_rotation$(EXEEXT): $(celt_tests_test_unit_rotation_OBJECTS) $(celt_tests_test_unit_rotation_DEPENDENCIES) $(EXTRA_celt_tests_test_unit_rotation_DEPENDENCIES) celt/tests/$(am__dirstamp)
 	@rm -f celt/tests/test_unit_rotation$(EXEEXT)
 	$(AM_V_CCLD)$(LINK) $(celt_tests_test_unit_rotation_OBJECTS) $(celt_tests_test_unit_rotation_LDADD) $(LIBS)
 celt/tests/test_unit_types.$(OBJEXT): celt/tests/$(am__dirstamp) \
 	celt/tests/$(DEPDIR)/$(am__dirstamp)
+
 celt/tests/test_unit_types$(EXEEXT): $(celt_tests_test_unit_types_OBJECTS) $(celt_tests_test_unit_types_DEPENDENCIES) $(EXTRA_celt_tests_test_unit_types_DEPENDENCIES) celt/tests/$(am__dirstamp)
 	@rm -f celt/tests/test_unit_types$(EXEEXT)
 	$(AM_V_CCLD)$(LINK) $(celt_tests_test_unit_types_OBJECTS) $(celt_tests_test_unit_types_LDADD) $(LIBS)
 src/opus_compare.$(OBJEXT): src/$(am__dirstamp) \
 	src/$(DEPDIR)/$(am__dirstamp)
+
 opus_compare$(EXEEXT): $(opus_compare_OBJECTS) $(opus_compare_DEPENDENCIES) $(EXTRA_opus_compare_DEPENDENCIES) 
 	@rm -f opus_compare$(EXEEXT)
 	$(AM_V_CCLD)$(LINK) $(opus_compare_OBJECTS) $(opus_compare_LDADD) $(LIBS)
 celt/opus_custom_demo.$(OBJEXT): celt/$(am__dirstamp) \
 	celt/$(DEPDIR)/$(am__dirstamp)
+
 opus_custom_demo$(EXEEXT): $(opus_custom_demo_OBJECTS) $(opus_custom_demo_DEPENDENCIES) $(EXTRA_opus_custom_demo_DEPENDENCIES) 
 	@rm -f opus_custom_demo$(EXEEXT)
 	$(AM_V_CCLD)$(LINK) $(opus_custom_demo_OBJECTS) $(opus_custom_demo_LDADD) $(LIBS)
 src/opus_demo.$(OBJEXT): src/$(am__dirstamp) \
 	src/$(DEPDIR)/$(am__dirstamp)
+
 opus_demo$(EXEEXT): $(opus_demo_OBJECTS) $(opus_demo_DEPENDENCIES) $(EXTRA_opus_demo_DEPENDENCIES) 
 	@rm -f opus_demo$(EXEEXT)
 	$(AM_V_CCLD)$(LINK) $(opus_demo_OBJECTS) $(opus_demo_LDADD) $(LIBS)
 src/repacketizer_demo.$(OBJEXT): src/$(am__dirstamp) \
 	src/$(DEPDIR)/$(am__dirstamp)
+
 repacketizer_demo$(EXEEXT): $(repacketizer_demo_OBJECTS) $(repacketizer_demo_DEPENDENCIES) $(EXTRA_repacketizer_demo_DEPENDENCIES) 
 	@rm -f repacketizer_demo$(EXEEXT)
 	$(AM_V_CCLD)$(LINK) $(repacketizer_demo_OBJECTS) $(repacketizer_demo_LDADD) $(LIBS)
@@ -1493,369 +1963,51 @@
 	@: > tests/$(DEPDIR)/$(am__dirstamp)
 tests/test_opus_api.$(OBJEXT): tests/$(am__dirstamp) \
 	tests/$(DEPDIR)/$(am__dirstamp)
+
 tests/test_opus_api$(EXEEXT): $(tests_test_opus_api_OBJECTS) $(tests_test_opus_api_DEPENDENCIES) $(EXTRA_tests_test_opus_api_DEPENDENCIES) tests/$(am__dirstamp)
 	@rm -f tests/test_opus_api$(EXEEXT)
 	$(AM_V_CCLD)$(LINK) $(tests_test_opus_api_OBJECTS) $(tests_test_opus_api_LDADD) $(LIBS)
 tests/test_opus_decode.$(OBJEXT): tests/$(am__dirstamp) \
 	tests/$(DEPDIR)/$(am__dirstamp)
+
 tests/test_opus_decode$(EXEEXT): $(tests_test_opus_decode_OBJECTS) $(tests_test_opus_decode_DEPENDENCIES) $(EXTRA_tests_test_opus_decode_DEPENDENCIES) tests/$(am__dirstamp)
 	@rm -f tests/test_opus_decode$(EXEEXT)
 	$(AM_V_CCLD)$(LINK) $(tests_test_opus_decode_OBJECTS) $(tests_test_opus_decode_LDADD) $(LIBS)
 tests/test_opus_encode.$(OBJEXT): tests/$(am__dirstamp) \
 	tests/$(DEPDIR)/$(am__dirstamp)
+
 tests/test_opus_encode$(EXEEXT): $(tests_test_opus_encode_OBJECTS) $(tests_test_opus_encode_DEPENDENCIES) $(EXTRA_tests_test_opus_encode_DEPENDENCIES) tests/$(am__dirstamp)
 	@rm -f tests/test_opus_encode$(EXEEXT)
 	$(AM_V_CCLD)$(LINK) $(tests_test_opus_encode_OBJECTS) $(tests_test_opus_encode_LDADD) $(LIBS)
 tests/test_opus_padding.$(OBJEXT): tests/$(am__dirstamp) \
 	tests/$(DEPDIR)/$(am__dirstamp)
+
 tests/test_opus_padding$(EXEEXT): $(tests_test_opus_padding_OBJECTS) $(tests_test_opus_padding_DEPENDENCIES) $(EXTRA_tests_test_opus_padding_DEPENDENCIES) tests/$(am__dirstamp)
 	@rm -f tests/test_opus_padding$(EXEEXT)
 	$(AM_V_CCLD)$(LINK) $(tests_test_opus_padding_OBJECTS) $(tests_test_opus_padding_LDADD) $(LIBS)
 
 mostlyclean-compile:
 	-rm -f *.$(OBJEXT)
-	-rm -f celt/arm/arm_celt_map.$(OBJEXT)
-	-rm -f celt/arm/arm_celt_map.lo
-	-rm -f celt/arm/armcpu.$(OBJEXT)
-	-rm -f celt/arm/armcpu.lo
-	-rm -f celt/arm/celt_pitch_xcorr_arm-gnu.$(OBJEXT)
-	-rm -f celt/arm/celt_pitch_xcorr_arm-gnu.lo
-	-rm -f celt/bands.$(OBJEXT)
-	-rm -f celt/bands.lo
-	-rm -f celt/celt.$(OBJEXT)
-	-rm -f celt/celt.lo
-	-rm -f celt/celt_decoder.$(OBJEXT)
-	-rm -f celt/celt_decoder.lo
-	-rm -f celt/celt_encoder.$(OBJEXT)
-	-rm -f celt/celt_encoder.lo
-	-rm -f celt/celt_lpc.$(OBJEXT)
-	-rm -f celt/celt_lpc.lo
-	-rm -f celt/cwrs.$(OBJEXT)
-	-rm -f celt/cwrs.lo
-	-rm -f celt/entcode.$(OBJEXT)
-	-rm -f celt/entcode.lo
-	-rm -f celt/entdec.$(OBJEXT)
-	-rm -f celt/entdec.lo
-	-rm -f celt/entenc.$(OBJEXT)
-	-rm -f celt/entenc.lo
-	-rm -f celt/kiss_fft.$(OBJEXT)
-	-rm -f celt/kiss_fft.lo
-	-rm -f celt/laplace.$(OBJEXT)
-	-rm -f celt/laplace.lo
-	-rm -f celt/mathops.$(OBJEXT)
-	-rm -f celt/mathops.lo
-	-rm -f celt/mdct.$(OBJEXT)
-	-rm -f celt/mdct.lo
-	-rm -f celt/modes.$(OBJEXT)
-	-rm -f celt/modes.lo
-	-rm -f celt/opus_custom_demo.$(OBJEXT)
-	-rm -f celt/pitch.$(OBJEXT)
-	-rm -f celt/pitch.lo
-	-rm -f celt/quant_bands.$(OBJEXT)
-	-rm -f celt/quant_bands.lo
-	-rm -f celt/rate.$(OBJEXT)
-	-rm -f celt/rate.lo
-	-rm -f celt/tests/test_unit_cwrs32.$(OBJEXT)
-	-rm -f celt/tests/test_unit_dft.$(OBJEXT)
-	-rm -f celt/tests/test_unit_entropy.$(OBJEXT)
-	-rm -f celt/tests/test_unit_laplace.$(OBJEXT)
-	-rm -f celt/tests/test_unit_mathops.$(OBJEXT)
-	-rm -f celt/tests/test_unit_mdct.$(OBJEXT)
-	-rm -f celt/tests/test_unit_rotation.$(OBJEXT)
-	-rm -f celt/tests/test_unit_types.$(OBJEXT)
-	-rm -f celt/vq.$(OBJEXT)
-	-rm -f celt/vq.lo
-	-rm -f silk/A2NLSF.$(OBJEXT)
-	-rm -f silk/A2NLSF.lo
-	-rm -f silk/CNG.$(OBJEXT)
-	-rm -f silk/CNG.lo
-	-rm -f silk/HP_variable_cutoff.$(OBJEXT)
-	-rm -f silk/HP_variable_cutoff.lo
-	-rm -f silk/LPC_analysis_filter.$(OBJEXT)
-	-rm -f silk/LPC_analysis_filter.lo
-	-rm -f silk/LPC_inv_pred_gain.$(OBJEXT)
-	-rm -f silk/LPC_inv_pred_gain.lo
-	-rm -f silk/LP_variable_cutoff.$(OBJEXT)
-	-rm -f silk/LP_variable_cutoff.lo
-	-rm -f silk/NLSF2A.$(OBJEXT)
-	-rm -f silk/NLSF2A.lo
-	-rm -f silk/NLSF_VQ.$(OBJEXT)
-	-rm -f silk/NLSF_VQ.lo
-	-rm -f silk/NLSF_VQ_weights_laroia.$(OBJEXT)
-	-rm -f silk/NLSF_VQ_weights_laroia.lo
-	-rm -f silk/NLSF_decode.$(OBJEXT)
-	-rm -f silk/NLSF_decode.lo
-	-rm -f silk/NLSF_del_dec_quant.$(OBJEXT)
-	-rm -f silk/NLSF_del_dec_quant.lo
-	-rm -f silk/NLSF_encode.$(OBJEXT)
-	-rm -f silk/NLSF_encode.lo
-	-rm -f silk/NLSF_stabilize.$(OBJEXT)
-	-rm -f silk/NLSF_stabilize.lo
-	-rm -f silk/NLSF_unpack.$(OBJEXT)
-	-rm -f silk/NLSF_unpack.lo
-	-rm -f silk/NSQ.$(OBJEXT)
-	-rm -f silk/NSQ.lo
-	-rm -f silk/NSQ_del_dec.$(OBJEXT)
-	-rm -f silk/NSQ_del_dec.lo
-	-rm -f silk/PLC.$(OBJEXT)
-	-rm -f silk/PLC.lo
-	-rm -f silk/VAD.$(OBJEXT)
-	-rm -f silk/VAD.lo
-	-rm -f silk/VQ_WMat_EC.$(OBJEXT)
-	-rm -f silk/VQ_WMat_EC.lo
-	-rm -f silk/ana_filt_bank_1.$(OBJEXT)
-	-rm -f silk/ana_filt_bank_1.lo
-	-rm -f silk/biquad_alt.$(OBJEXT)
-	-rm -f silk/biquad_alt.lo
-	-rm -f silk/bwexpander.$(OBJEXT)
-	-rm -f silk/bwexpander.lo
-	-rm -f silk/bwexpander_32.$(OBJEXT)
-	-rm -f silk/bwexpander_32.lo
-	-rm -f silk/check_control_input.$(OBJEXT)
-	-rm -f silk/check_control_input.lo
-	-rm -f silk/code_signs.$(OBJEXT)
-	-rm -f silk/code_signs.lo
-	-rm -f silk/control_SNR.$(OBJEXT)
-	-rm -f silk/control_SNR.lo
-	-rm -f silk/control_audio_bandwidth.$(OBJEXT)
-	-rm -f silk/control_audio_bandwidth.lo
-	-rm -f silk/control_codec.$(OBJEXT)
-	-rm -f silk/control_codec.lo
-	-rm -f silk/debug.$(OBJEXT)
-	-rm -f silk/debug.lo
-	-rm -f silk/dec_API.$(OBJEXT)
-	-rm -f silk/dec_API.lo
-	-rm -f silk/decode_core.$(OBJEXT)
-	-rm -f silk/decode_core.lo
-	-rm -f silk/decode_frame.$(OBJEXT)
-	-rm -f silk/decode_frame.lo
-	-rm -f silk/decode_indices.$(OBJEXT)
-	-rm -f silk/decode_indices.lo
-	-rm -f silk/decode_parameters.$(OBJEXT)
-	-rm -f silk/decode_parameters.lo
-	-rm -f silk/decode_pitch.$(OBJEXT)
-	-rm -f silk/decode_pitch.lo
-	-rm -f silk/decode_pulses.$(OBJEXT)
-	-rm -f silk/decode_pulses.lo
-	-rm -f silk/decoder_set_fs.$(OBJEXT)
-	-rm -f silk/decoder_set_fs.lo
-	-rm -f silk/enc_API.$(OBJEXT)
-	-rm -f silk/enc_API.lo
-	-rm -f silk/encode_indices.$(OBJEXT)
-	-rm -f silk/encode_indices.lo
-	-rm -f silk/encode_pulses.$(OBJEXT)
-	-rm -f silk/encode_pulses.lo
-	-rm -f silk/fixed/LTP_analysis_filter_FIX.$(OBJEXT)
-	-rm -f silk/fixed/LTP_analysis_filter_FIX.lo
-	-rm -f silk/fixed/LTP_scale_ctrl_FIX.$(OBJEXT)
-	-rm -f silk/fixed/LTP_scale_ctrl_FIX.lo
-	-rm -f silk/fixed/apply_sine_window_FIX.$(OBJEXT)
-	-rm -f silk/fixed/apply_sine_window_FIX.lo
-	-rm -f silk/fixed/autocorr_FIX.$(OBJEXT)
-	-rm -f silk/fixed/autocorr_FIX.lo
-	-rm -f silk/fixed/burg_modified_FIX.$(OBJEXT)
-	-rm -f silk/fixed/burg_modified_FIX.lo
-	-rm -f silk/fixed/corrMatrix_FIX.$(OBJEXT)
-	-rm -f silk/fixed/corrMatrix_FIX.lo
-	-rm -f silk/fixed/encode_frame_FIX.$(OBJEXT)
-	-rm -f silk/fixed/encode_frame_FIX.lo
-	-rm -f silk/fixed/find_LPC_FIX.$(OBJEXT)
-	-rm -f silk/fixed/find_LPC_FIX.lo
-	-rm -f silk/fixed/find_LTP_FIX.$(OBJEXT)
-	-rm -f silk/fixed/find_LTP_FIX.lo
-	-rm -f silk/fixed/find_pitch_lags_FIX.$(OBJEXT)
-	-rm -f silk/fixed/find_pitch_lags_FIX.lo
-	-rm -f silk/fixed/find_pred_coefs_FIX.$(OBJEXT)
-	-rm -f silk/fixed/find_pred_coefs_FIX.lo
-	-rm -f silk/fixed/k2a_FIX.$(OBJEXT)
-	-rm -f silk/fixed/k2a_FIX.lo
-	-rm -f silk/fixed/k2a_Q16_FIX.$(OBJEXT)
-	-rm -f silk/fixed/k2a_Q16_FIX.lo
-	-rm -f silk/fixed/noise_shape_analysis_FIX.$(OBJEXT)
-	-rm -f silk/fixed/noise_shape_analysis_FIX.lo
-	-rm -f silk/fixed/pitch_analysis_core_FIX.$(OBJEXT)
-	-rm -f silk/fixed/pitch_analysis_core_FIX.lo
-	-rm -f silk/fixed/prefilter_FIX.$(OBJEXT)
-	-rm -f silk/fixed/prefilter_FIX.lo
-	-rm -f silk/fixed/process_gains_FIX.$(OBJEXT)
-	-rm -f silk/fixed/process_gains_FIX.lo
-	-rm -f silk/fixed/regularize_correlations_FIX.$(OBJEXT)
-	-rm -f silk/fixed/regularize_correlations_FIX.lo
-	-rm -f silk/fixed/residual_energy16_FIX.$(OBJEXT)
-	-rm -f silk/fixed/residual_energy16_FIX.lo
-	-rm -f silk/fixed/residual_energy_FIX.$(OBJEXT)
-	-rm -f silk/fixed/residual_energy_FIX.lo
-	-rm -f silk/fixed/schur64_FIX.$(OBJEXT)
-	-rm -f silk/fixed/schur64_FIX.lo
-	-rm -f silk/fixed/schur_FIX.$(OBJEXT)
-	-rm -f silk/fixed/schur_FIX.lo
-	-rm -f silk/fixed/solve_LS_FIX.$(OBJEXT)
-	-rm -f silk/fixed/solve_LS_FIX.lo
-	-rm -f silk/fixed/vector_ops_FIX.$(OBJEXT)
-	-rm -f silk/fixed/vector_ops_FIX.lo
-	-rm -f silk/fixed/warped_autocorrelation_FIX.$(OBJEXT)
-	-rm -f silk/fixed/warped_autocorrelation_FIX.lo
-	-rm -f silk/float/LPC_analysis_filter_FLP.$(OBJEXT)
-	-rm -f silk/float/LPC_analysis_filter_FLP.lo
-	-rm -f silk/float/LPC_inv_pred_gain_FLP.$(OBJEXT)
-	-rm -f silk/float/LPC_inv_pred_gain_FLP.lo
-	-rm -f silk/float/LTP_analysis_filter_FLP.$(OBJEXT)
-	-rm -f silk/float/LTP_analysis_filter_FLP.lo
-	-rm -f silk/float/LTP_scale_ctrl_FLP.$(OBJEXT)
-	-rm -f silk/float/LTP_scale_ctrl_FLP.lo
-	-rm -f silk/float/apply_sine_window_FLP.$(OBJEXT)
-	-rm -f silk/float/apply_sine_window_FLP.lo
-	-rm -f silk/float/autocorrelation_FLP.$(OBJEXT)
-	-rm -f silk/float/autocorrelation_FLP.lo
-	-rm -f silk/float/burg_modified_FLP.$(OBJEXT)
-	-rm -f silk/float/burg_modified_FLP.lo
-	-rm -f silk/float/bwexpander_FLP.$(OBJEXT)
-	-rm -f silk/float/bwexpander_FLP.lo
-	-rm -f silk/float/corrMatrix_FLP.$(OBJEXT)
-	-rm -f silk/float/corrMatrix_FLP.lo
-	-rm -f silk/float/encode_frame_FLP.$(OBJEXT)
-	-rm -f silk/float/encode_frame_FLP.lo
-	-rm -f silk/float/energy_FLP.$(OBJEXT)
-	-rm -f silk/float/energy_FLP.lo
-	-rm -f silk/float/find_LPC_FLP.$(OBJEXT)
-	-rm -f silk/float/find_LPC_FLP.lo
-	-rm -f silk/float/find_LTP_FLP.$(OBJEXT)
-	-rm -f silk/float/find_LTP_FLP.lo
-	-rm -f silk/float/find_pitch_lags_FLP.$(OBJEXT)
-	-rm -f silk/float/find_pitch_lags_FLP.lo
-	-rm -f silk/float/find_pred_coefs_FLP.$(OBJEXT)
-	-rm -f silk/float/find_pred_coefs_FLP.lo
-	-rm -f silk/float/inner_product_FLP.$(OBJEXT)
-	-rm -f silk/float/inner_product_FLP.lo
-	-rm -f silk/float/k2a_FLP.$(OBJEXT)
-	-rm -f silk/float/k2a_FLP.lo
-	-rm -f silk/float/levinsondurbin_FLP.$(OBJEXT)
-	-rm -f silk/float/levinsondurbin_FLP.lo
-	-rm -f silk/float/noise_shape_analysis_FLP.$(OBJEXT)
-	-rm -f silk/float/noise_shape_analysis_FLP.lo
-	-rm -f silk/float/pitch_analysis_core_FLP.$(OBJEXT)
-	-rm -f silk/float/pitch_analysis_core_FLP.lo
-	-rm -f silk/float/prefilter_FLP.$(OBJEXT)
-	-rm -f silk/float/prefilter_FLP.lo
-	-rm -f silk/float/process_gains_FLP.$(OBJEXT)
-	-rm -f silk/float/process_gains_FLP.lo
-	-rm -f silk/float/regularize_correlations_FLP.$(OBJEXT)
-	-rm -f silk/float/regularize_correlations_FLP.lo
-	-rm -f silk/float/residual_energy_FLP.$(OBJEXT)
-	-rm -f silk/float/residual_energy_FLP.lo
-	-rm -f silk/float/scale_copy_vector_FLP.$(OBJEXT)
-	-rm -f silk/float/scale_copy_vector_FLP.lo
-	-rm -f silk/float/scale_vector_FLP.$(OBJEXT)
-	-rm -f silk/float/scale_vector_FLP.lo
-	-rm -f silk/float/schur_FLP.$(OBJEXT)
-	-rm -f silk/float/schur_FLP.lo
-	-rm -f silk/float/solve_LS_FLP.$(OBJEXT)
-	-rm -f silk/float/solve_LS_FLP.lo
-	-rm -f silk/float/sort_FLP.$(OBJEXT)
-	-rm -f silk/float/sort_FLP.lo
-	-rm -f silk/float/warped_autocorrelation_FLP.$(OBJEXT)
-	-rm -f silk/float/warped_autocorrelation_FLP.lo
-	-rm -f silk/float/wrappers_FLP.$(OBJEXT)
-	-rm -f silk/float/wrappers_FLP.lo
-	-rm -f silk/gain_quant.$(OBJEXT)
-	-rm -f silk/gain_quant.lo
-	-rm -f silk/init_decoder.$(OBJEXT)
-	-rm -f silk/init_decoder.lo
-	-rm -f silk/init_encoder.$(OBJEXT)
-	-rm -f silk/init_encoder.lo
-	-rm -f silk/inner_prod_aligned.$(OBJEXT)
-	-rm -f silk/inner_prod_aligned.lo
-	-rm -f silk/interpolate.$(OBJEXT)
-	-rm -f silk/interpolate.lo
-	-rm -f silk/lin2log.$(OBJEXT)
-	-rm -f silk/lin2log.lo
-	-rm -f silk/log2lin.$(OBJEXT)
-	-rm -f silk/log2lin.lo
-	-rm -f silk/pitch_est_tables.$(OBJEXT)
-	-rm -f silk/pitch_est_tables.lo
-	-rm -f silk/process_NLSFs.$(OBJEXT)
-	-rm -f silk/process_NLSFs.lo
-	-rm -f silk/quant_LTP_gains.$(OBJEXT)
-	-rm -f silk/quant_LTP_gains.lo
-	-rm -f silk/resampler.$(OBJEXT)
-	-rm -f silk/resampler.lo
-	-rm -f silk/resampler_down2.$(OBJEXT)
-	-rm -f silk/resampler_down2.lo
-	-rm -f silk/resampler_down2_3.$(OBJEXT)
-	-rm -f silk/resampler_down2_3.lo
-	-rm -f silk/resampler_private_AR2.$(OBJEXT)
-	-rm -f silk/resampler_private_AR2.lo
-	-rm -f silk/resampler_private_IIR_FIR.$(OBJEXT)
-	-rm -f silk/resampler_private_IIR_FIR.lo
-	-rm -f silk/resampler_private_down_FIR.$(OBJEXT)
-	-rm -f silk/resampler_private_down_FIR.lo
-	-rm -f silk/resampler_private_up2_HQ.$(OBJEXT)
-	-rm -f silk/resampler_private_up2_HQ.lo
-	-rm -f silk/resampler_rom.$(OBJEXT)
-	-rm -f silk/resampler_rom.lo
-	-rm -f silk/shell_coder.$(OBJEXT)
-	-rm -f silk/shell_coder.lo
-	-rm -f silk/sigm_Q15.$(OBJEXT)
-	-rm -f silk/sigm_Q15.lo
-	-rm -f silk/sort.$(OBJEXT)
-	-rm -f silk/sort.lo
-	-rm -f silk/stereo_LR_to_MS.$(OBJEXT)
-	-rm -f silk/stereo_LR_to_MS.lo
-	-rm -f silk/stereo_MS_to_LR.$(OBJEXT)
-	-rm -f silk/stereo_MS_to_LR.lo
-	-rm -f silk/stereo_decode_pred.$(OBJEXT)
-	-rm -f silk/stereo_decode_pred.lo
-	-rm -f silk/stereo_encode_pred.$(OBJEXT)
-	-rm -f silk/stereo_encode_pred.lo
-	-rm -f silk/stereo_find_predictor.$(OBJEXT)
-	-rm -f silk/stereo_find_predictor.lo
-	-rm -f silk/stereo_quant_pred.$(OBJEXT)
-	-rm -f silk/stereo_quant_pred.lo
-	-rm -f silk/sum_sqr_shift.$(OBJEXT)
-	-rm -f silk/sum_sqr_shift.lo
-	-rm -f silk/table_LSF_cos.$(OBJEXT)
-	-rm -f silk/table_LSF_cos.lo
-	-rm -f silk/tables_LTP.$(OBJEXT)
-	-rm -f silk/tables_LTP.lo
-	-rm -f silk/tables_NLSF_CB_NB_MB.$(OBJEXT)
-	-rm -f silk/tables_NLSF_CB_NB_MB.lo
-	-rm -f silk/tables_NLSF_CB_WB.$(OBJEXT)
-	-rm -f silk/tables_NLSF_CB_WB.lo
-	-rm -f silk/tables_gain.$(OBJEXT)
-	-rm -f silk/tables_gain.lo
-	-rm -f silk/tables_other.$(OBJEXT)
-	-rm -f silk/tables_other.lo
-	-rm -f silk/tables_pitch_lag.$(OBJEXT)
-	-rm -f silk/tables_pitch_lag.lo
-	-rm -f silk/tables_pulses_per_block.$(OBJEXT)
-	-rm -f silk/tables_pulses_per_block.lo
-	-rm -f src/analysis.$(OBJEXT)
-	-rm -f src/analysis.lo
-	-rm -f src/mlp.$(OBJEXT)
-	-rm -f src/mlp.lo
-	-rm -f src/mlp_data.$(OBJEXT)
-	-rm -f src/mlp_data.lo
-	-rm -f src/opus.$(OBJEXT)
-	-rm -f src/opus.lo
-	-rm -f src/opus_compare.$(OBJEXT)
-	-rm -f src/opus_decoder.$(OBJEXT)
-	-rm -f src/opus_decoder.lo
-	-rm -f src/opus_demo.$(OBJEXT)
-	-rm -f src/opus_encoder.$(OBJEXT)
-	-rm -f src/opus_encoder.lo
-	-rm -f src/opus_multistream.$(OBJEXT)
-	-rm -f src/opus_multistream.lo
-	-rm -f src/opus_multistream_decoder.$(OBJEXT)
-	-rm -f src/opus_multistream_decoder.lo
-	-rm -f src/opus_multistream_encoder.$(OBJEXT)
-	-rm -f src/opus_multistream_encoder.lo
-	-rm -f src/repacketizer.$(OBJEXT)
-	-rm -f src/repacketizer.lo
-	-rm -f src/repacketizer_demo.$(OBJEXT)
-	-rm -f tests/test_opus_api.$(OBJEXT)
-	-rm -f tests/test_opus_decode.$(OBJEXT)
-	-rm -f tests/test_opus_encode.$(OBJEXT)
-	-rm -f tests/test_opus_padding.$(OBJEXT)
+	-rm -f celt/*.$(OBJEXT)
+	-rm -f celt/*.lo
+	-rm -f celt/arm/*.$(OBJEXT)
+	-rm -f celt/arm/*.lo
+	-rm -f celt/tests/*.$(OBJEXT)
+	-rm -f celt/x86/*.$(OBJEXT)
+	-rm -f celt/x86/*.lo
+	-rm -f silk/*.$(OBJEXT)
+	-rm -f silk/*.lo
+	-rm -f silk/fixed/*.$(OBJEXT)
+	-rm -f silk/fixed/*.lo
+	-rm -f silk/fixed/x86/*.$(OBJEXT)
+	-rm -f silk/fixed/x86/*.lo
+	-rm -f silk/float/*.$(OBJEXT)
+	-rm -f silk/float/*.lo
+	-rm -f silk/x86/*.$(OBJEXT)
+	-rm -f silk/x86/*.lo
+	-rm -f src/*.$(OBJEXT)
+	-rm -f src/*.lo
+	-rm -f tests/*.$(OBJEXT)
 
 distclean-compile:
 	-rm -f *.tab.c
@@ -1881,6 +2033,9 @@
 @AMDEP_TRUE@@am__include@ @am__quote@celt/$(DEPDIR)/vq.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@celt/arm/$(DEPDIR)/arm_celt_map.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@celt/arm/$(DEPDIR)/armcpu.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@celt/arm/$(DEPDIR)/celt_ne10_fft.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@celt/arm/$(DEPDIR)/celt_ne10_mdct.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@celt/arm/$(DEPDIR)/celt_neon_intr.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@celt/arm/$(DEPDIR)/celt_pitch_xcorr_arm-gnu.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@celt/tests/$(DEPDIR)/test_unit_cwrs32.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@celt/tests/$(DEPDIR)/test_unit_dft.Po@am__quote@
@@ -1890,6 +2045,12 @@
 @AMDEP_TRUE@@am__include@ @am__quote@celt/tests/$(DEPDIR)/test_unit_mdct.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@celt/tests/$(DEPDIR)/test_unit_rotation.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@celt/tests/$(DEPDIR)/test_unit_types.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@celt/x86/$(DEPDIR)/celt_lpc_sse.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@celt/x86/$(DEPDIR)/pitch_sse.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@celt/x86/$(DEPDIR)/pitch_sse2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@celt/x86/$(DEPDIR)/pitch_sse4_1.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@celt/x86/$(DEPDIR)/x86_celt_map.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@celt/x86/$(DEPDIR)/x86cpu.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@silk/$(DEPDIR)/A2NLSF.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@silk/$(DEPDIR)/CNG.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@silk/$(DEPDIR)/HP_variable_cutoff.Plo@am__quote@
@@ -1991,6 +2152,9 @@
 @AMDEP_TRUE@@am__include@ @am__quote@silk/fixed/$(DEPDIR)/solve_LS_FIX.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@silk/fixed/$(DEPDIR)/vector_ops_FIX.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@silk/fixed/$(DEPDIR)/warped_autocorrelation_FIX.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@silk/fixed/x86/$(DEPDIR)/burg_modified_FIX_sse.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@silk/fixed/x86/$(DEPDIR)/prefilter_FIX_sse.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@silk/fixed/x86/$(DEPDIR)/vector_ops_FIX_sse.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@silk/float/$(DEPDIR)/LPC_analysis_filter_FLP.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@silk/float/$(DEPDIR)/LPC_inv_pred_gain_FLP.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@silk/float/$(DEPDIR)/LTP_analysis_filter_FLP.Plo@am__quote@
@@ -2022,6 +2186,11 @@
 @AMDEP_TRUE@@am__include@ @am__quote@silk/float/$(DEPDIR)/sort_FLP.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@silk/float/$(DEPDIR)/warped_autocorrelation_FLP.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@silk/float/$(DEPDIR)/wrappers_FLP.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@silk/x86/$(DEPDIR)/NSQ_del_dec_sse.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@silk/x86/$(DEPDIR)/NSQ_sse.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@silk/x86/$(DEPDIR)/VAD_sse.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@silk/x86/$(DEPDIR)/VQ_WMat_EC_sse.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@silk/x86/$(DEPDIR)/x86_silk_map.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/analysis.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/mlp.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/mlp_data.Plo@am__quote@
@@ -2096,9 +2265,12 @@
 	-rm -rf celt/.libs celt/_libs
 	-rm -rf celt/arm/.libs celt/arm/_libs
 	-rm -rf celt/tests/.libs celt/tests/_libs
+	-rm -rf celt/x86/.libs celt/x86/_libs
 	-rm -rf silk/.libs silk/_libs
 	-rm -rf silk/fixed/.libs silk/fixed/_libs
+	-rm -rf silk/fixed/x86/.libs silk/fixed/x86/_libs
 	-rm -rf silk/float/.libs silk/float/_libs
+	-rm -rf silk/x86/.libs silk/x86/_libs
 	-rm -rf src/.libs src/_libs
 	-rm -rf tests/.libs tests/_libs
 
@@ -2169,22 +2341,25 @@
 	dir='$(DESTDIR)$(pkgincludedir)'; $(am__uninstall_files_from_dir)
 
 # This directory's subdirectories are mostly independent; you can cd
-# into them and run `make' without going through this Makefile.
-# To change the values of `make' variables: instead of editing Makefiles,
-# (1) if the variable is set in `config.status', edit `config.status'
-#     (which will cause the Makefiles to be regenerated when you run `make');
-# (2) otherwise, pass the desired values on the `make' command line.
-$(RECURSIVE_TARGETS):
-	@fail= failcom='exit 1'; \
-	for f in x $$MAKEFLAGS; do \
-	  case $$f in \
-	    *=* | --[!k]*);; \
-	    *k*) failcom='fail=yes';; \
-	  esac; \
-	done; \
+# into them and run 'make' without going through this Makefile.
+# To change the values of 'make' variables: instead of editing Makefiles,
+# (1) if the variable is set in 'config.status', edit 'config.status'
+#     (which will cause the Makefiles to be regenerated when you run 'make');
+# (2) otherwise, pass the desired values on the 'make' command line.
+$(am__recursive_targets):
+	@fail=; \
+	if $(am__make_keepgoing); then \
+	  failcom='fail=yes'; \
+	else \
+	  failcom='exit 1'; \
+	fi; \
 	dot_seen=no; \
 	target=`echo $@ | sed s/-recursive//`; \
-	list='$(SUBDIRS)'; for subdir in $$list; do \
+	case "$@" in \
+	  distclean-* | maintainer-clean-*) list='$(DIST_SUBDIRS)' ;; \
+	  *) list='$(SUBDIRS)' ;; \
+	esac; \
+	for subdir in $$list; do \
 	  echo "Making $$target in $$subdir"; \
 	  if test "$$subdir" = "."; then \
 	    dot_seen=yes; \
@@ -2199,57 +2374,12 @@
 	  $(MAKE) $(AM_MAKEFLAGS) "$$target-am" || exit 1; \
 	fi; test -z "$$fail"
 
-$(RECURSIVE_CLEAN_TARGETS):
-	@fail= failcom='exit 1'; \
-	for f in x $$MAKEFLAGS; do \
-	  case $$f in \
-	    *=* | --[!k]*);; \
-	    *k*) failcom='fail=yes';; \
-	  esac; \
-	done; \
-	dot_seen=no; \
-	case "$@" in \
-	  distclean-* | maintainer-clean-*) list='$(DIST_SUBDIRS)' ;; \
-	  *) list='$(SUBDIRS)' ;; \
-	esac; \
-	rev=''; for subdir in $$list; do \
-	  if test "$$subdir" = "."; then :; else \
-	    rev="$$subdir $$rev"; \
-	  fi; \
-	done; \
-	rev="$$rev ."; \
-	target=`echo $@ | sed s/-recursive//`; \
-	for subdir in $$rev; do \
-	  echo "Making $$target in $$subdir"; \
-	  if test "$$subdir" = "."; then \
-	    local_target="$$target-am"; \
-	  else \
-	    local_target="$$target"; \
-	  fi; \
-	  ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \
-	  || eval $$failcom; \
-	done && test -z "$$fail"
-tags-recursive:
-	list='$(SUBDIRS)'; for subdir in $$list; do \
-	  test "$$subdir" = . || ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) tags); \
-	done
-ctags-recursive:
-	list='$(SUBDIRS)'; for subdir in $$list; do \
-	  test "$$subdir" = . || ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) ctags); \
-	done
+ID: $(am__tagged_files)
+	$(am__define_uniq_tagged_files); mkid -fID $$unique
+tags: tags-recursive
+TAGS: tags
 
-ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES)
-	list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \
-	unique=`for i in $$list; do \
-	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
-	  done | \
-	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
-	      END { if (nonempty) { for (i in files) print i; }; }'`; \
-	mkid -fID $$unique
-tags: TAGS
-
-TAGS: tags-recursive $(HEADERS) $(SOURCES) config.h.in $(TAGS_DEPENDENCIES) \
-		$(TAGS_FILES) $(LISP)
+tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
 	set x; \
 	here=`pwd`; \
 	if ($(ETAGS) --etags-include --version) >/dev/null 2>&1; then \
@@ -2265,12 +2395,7 @@
 	      set "$$@" "$$include_option=$$here/$$subdir/TAGS"; \
 	  fi; \
 	done; \
-	list='$(SOURCES) $(HEADERS) config.h.in $(LISP) $(TAGS_FILES)'; \
-	unique=`for i in $$list; do \
-	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
-	  done | \
-	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
-	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	$(am__define_uniq_tagged_files); \
 	shift; \
 	if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
 	  test -n "$$unique" || unique=$$empty_fix; \
@@ -2282,15 +2407,11 @@
 	      $$unique; \
 	  fi; \
 	fi
-ctags: CTAGS
-CTAGS: ctags-recursive $(HEADERS) $(SOURCES) config.h.in $(TAGS_DEPENDENCIES) \
-		$(TAGS_FILES) $(LISP)
-	list='$(SOURCES) $(HEADERS) config.h.in $(LISP) $(TAGS_FILES)'; \
-	unique=`for i in $$list; do \
-	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
-	  done | \
-	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
-	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+ctags: ctags-recursive
+
+CTAGS: ctags
+ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
+	$(am__define_uniq_tagged_files); \
 	test -z "$(CTAGS_ARGS)$$unique" \
 	  || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
 	     $$unique
@@ -2299,102 +2420,271 @@
 	here=`$(am__cd) $(top_builddir) && pwd` \
 	  && $(am__cd) $(top_srcdir) \
 	  && gtags -i $(GTAGS_ARGS) "$$here"
+cscope: cscope.files
+	test ! -s cscope.files \
+	  || $(CSCOPE) -b -q $(AM_CSCOPEFLAGS) $(CSCOPEFLAGS) -i cscope.files $(CSCOPE_ARGS)
+clean-cscope:
+	-rm -f cscope.files
+cscope.files: clean-cscope cscopelist
+cscopelist: cscopelist-recursive
+
+cscopelist-am: $(am__tagged_files)
+	list='$(am__tagged_files)'; \
+	case "$(srcdir)" in \
+	  [\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \
+	  *) sdir=$(subdir)/$(srcdir) ;; \
+	esac; \
+	for i in $$list; do \
+	  if test -f "$$i"; then \
+	    echo "$(subdir)/$$i"; \
+	  else \
+	    echo "$$sdir/$$i"; \
+	  fi; \
+	done >> $(top_builddir)/cscope.files
 
 distclean-tags:
 	-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
+	-rm -f cscope.out cscope.in.out cscope.po.out cscope.files
 
-check-TESTS: $(TESTS)
-	@failed=0; all=0; xfail=0; xpass=0; skip=0; \
-	srcdir=$(srcdir); export srcdir; \
-	list=' $(TESTS) '; \
-	$(am__tty_colors); \
-	if test -n "$$list"; then \
-	  for tst in $$list; do \
-	    if test -f ./$$tst; then dir=./; \
-	    elif test -f $$tst; then dir=; \
-	    else dir="$(srcdir)/"; fi; \
-	    if $(TESTS_ENVIRONMENT) $${dir}$$tst; then \
-	      all=`expr $$all + 1`; \
-	      case " $(XFAIL_TESTS) " in \
-	      *[\ \	]$$tst[\ \	]*) \
-		xpass=`expr $$xpass + 1`; \
-		failed=`expr $$failed + 1`; \
-		col=$$red; res=XPASS; \
-	      ;; \
-	      *) \
-		col=$$grn; res=PASS; \
-	      ;; \
-	      esac; \
-	    elif test $$? -ne 77; then \
-	      all=`expr $$all + 1`; \
-	      case " $(XFAIL_TESTS) " in \
-	      *[\ \	]$$tst[\ \	]*) \
-		xfail=`expr $$xfail + 1`; \
-		col=$$lgn; res=XFAIL; \
-	      ;; \
-	      *) \
-		failed=`expr $$failed + 1`; \
-		col=$$red; res=FAIL; \
-	      ;; \
-	      esac; \
-	    else \
-	      skip=`expr $$skip + 1`; \
-	      col=$$blu; res=SKIP; \
-	    fi; \
-	    echo "$${col}$$res$${std}: $$tst"; \
+# Recover from deleted '.trs' file; this should ensure that
+# "rm -f foo.log; make foo.trs" re-run 'foo.test', and re-create
+# both 'foo.log' and 'foo.trs'.  Break the recipe in two subshells
+# to avoid problems with "make -n".
+.log.trs:
+	rm -f $< $@
+	$(MAKE) $(AM_MAKEFLAGS) $<
+
+# Leading 'am--fnord' is there to ensure the list of targets does not
+# expand to empty, as could happen e.g. with make check TESTS=''.
+am--fnord $(TEST_LOGS) $(TEST_LOGS:.log=.trs): $(am__force_recheck)
+am--force-recheck:
+	@:
+
+$(TEST_SUITE_LOG): $(TEST_LOGS)
+	@$(am__set_TESTS_bases); \
+	am__f_ok () { test -f "$$1" && test -r "$$1"; }; \
+	redo_bases=`for i in $$bases; do \
+	              am__f_ok $$i.trs && am__f_ok $$i.log || echo $$i; \
+	            done`; \
+	if test -n "$$redo_bases"; then \
+	  redo_logs=`for i in $$redo_bases; do echo $$i.log; done`; \
+	  redo_results=`for i in $$redo_bases; do echo $$i.trs; done`; \
+	  if $(am__make_dryrun); then :; else \
+	    rm -f $$redo_logs && rm -f $$redo_results || exit 1; \
+	  fi; \
+	fi; \
+	if test -n "$$am__remaking_logs"; then \
+	  echo "fatal: making $(TEST_SUITE_LOG): possible infinite" \
+	       "recursion detected" >&2; \
+	elif test -n "$$redo_logs"; then \
+	  am__remaking_logs=yes $(MAKE) $(AM_MAKEFLAGS) $$redo_logs; \
+	fi; \
+	if $(am__make_dryrun); then :; else \
+	  st=0;  \
+	  errmsg="fatal: making $(TEST_SUITE_LOG): failed to create"; \
+	  for i in $$redo_bases; do \
+	    test -f $$i.trs && test -r $$i.trs \
+	      || { echo "$$errmsg $$i.trs" >&2; st=1; }; \
+	    test -f $$i.log && test -r $$i.log \
+	      || { echo "$$errmsg $$i.log" >&2; st=1; }; \
 	  done; \
-	  if test "$$all" -eq 1; then \
-	    tests="test"; \
-	    All=""; \
-	  else \
-	    tests="tests"; \
-	    All="All "; \
-	  fi; \
-	  if test "$$failed" -eq 0; then \
-	    if test "$$xfail" -eq 0; then \
-	      banner="$$All$$all $$tests passed"; \
+	  test $$st -eq 0 || exit 1; \
+	fi
+	@$(am__sh_e_setup); $(am__tty_colors); $(am__set_TESTS_bases); \
+	ws='[ 	]'; \
+	results=`for b in $$bases; do echo $$b.trs; done`; \
+	test -n "$$results" || results=/dev/null; \
+	all=`  grep "^$$ws*:test-result:"           $$results | wc -l`; \
+	pass=` grep "^$$ws*:test-result:$$ws*PASS"  $$results | wc -l`; \
+	fail=` grep "^$$ws*:test-result:$$ws*FAIL"  $$results | wc -l`; \
+	skip=` grep "^$$ws*:test-result:$$ws*SKIP"  $$results | wc -l`; \
+	xfail=`grep "^$$ws*:test-result:$$ws*XFAIL" $$results | wc -l`; \
+	xpass=`grep "^$$ws*:test-result:$$ws*XPASS" $$results | wc -l`; \
+	error=`grep "^$$ws*:test-result:$$ws*ERROR" $$results | wc -l`; \
+	if test `expr $$fail + $$xpass + $$error` -eq 0; then \
+	  success=true; \
+	else \
+	  success=false; \
+	fi; \
+	br='==================='; br=$$br$$br$$br$$br; \
+	result_count () \
+	{ \
+	    if test x"$$1" = x"--maybe-color"; then \
+	      maybe_colorize=yes; \
+	    elif test x"$$1" = x"--no-color"; then \
+	      maybe_colorize=no; \
 	    else \
-	      if test "$$xfail" -eq 1; then failures=failure; else failures=failures; fi; \
-	      banner="$$All$$all $$tests behaved as expected ($$xfail expected $$failures)"; \
+	      echo "$@: invalid 'result_count' usage" >&2; exit 4; \
 	    fi; \
-	  else \
-	    if test "$$xpass" -eq 0; then \
-	      banner="$$failed of $$all $$tests failed"; \
+	    shift; \
+	    desc=$$1 count=$$2; \
+	    if test $$maybe_colorize = yes && test $$count -gt 0; then \
+	      color_start=$$3 color_end=$$std; \
 	    else \
-	      if test "$$xpass" -eq 1; then passes=pass; else passes=passes; fi; \
-	      banner="$$failed of $$all $$tests did not behave as expected ($$xpass unexpected $$passes)"; \
+	      color_start= color_end=; \
 	    fi; \
-	  fi; \
-	  dashes="$$banner"; \
-	  skipped=""; \
-	  if test "$$skip" -ne 0; then \
-	    if test "$$skip" -eq 1; then \
-	      skipped="($$skip test was not run)"; \
-	    else \
-	      skipped="($$skip tests were not run)"; \
-	    fi; \
-	    test `echo "$$skipped" | wc -c` -le `echo "$$banner" | wc -c` || \
-	      dashes="$$skipped"; \
-	  fi; \
-	  report=""; \
-	  if test "$$failed" -ne 0 && test -n "$(PACKAGE_BUGREPORT)"; then \
-	    report="Please report to $(PACKAGE_BUGREPORT)"; \
-	    test `echo "$$report" | wc -c` -le `echo "$$banner" | wc -c` || \
-	      dashes="$$report"; \
-	  fi; \
-	  dashes=`echo "$$dashes" | sed s/./=/g`; \
-	  if test "$$failed" -eq 0; then \
-	    col="$$grn"; \
-	  else \
-	    col="$$red"; \
-	  fi; \
-	  echo "$${col}$$dashes$${std}"; \
-	  echo "$${col}$$banner$${std}"; \
-	  test -z "$$skipped" || echo "$${col}$$skipped$${std}"; \
-	  test -z "$$report" || echo "$${col}$$report$${std}"; \
-	  echo "$${col}$$dashes$${std}"; \
-	  test "$$failed" -eq 0; \
-	else :; fi
+	    echo "$${color_start}# $$desc $$count$${color_end}"; \
+	}; \
+	create_testsuite_report () \
+	{ \
+	  result_count $$1 "TOTAL:" $$all   "$$brg"; \
+	  result_count $$1 "PASS: " $$pass  "$$grn"; \
+	  result_count $$1 "SKIP: " $$skip  "$$blu"; \
+	  result_count $$1 "XFAIL:" $$xfail "$$lgn"; \
+	  result_count $$1 "FAIL: " $$fail  "$$red"; \
+	  result_count $$1 "XPASS:" $$xpass "$$red"; \
+	  result_count $$1 "ERROR:" $$error "$$mgn"; \
+	}; \
+	{								\
+	  echo "$(PACKAGE_STRING): $(subdir)/$(TEST_SUITE_LOG)" |	\
+	    $(am__rst_title);						\
+	  create_testsuite_report --no-color;				\
+	  echo;								\
+	  echo ".. contents:: :depth: 2";				\
+	  echo;								\
+	  for b in $$bases; do echo $$b; done				\
+	    | $(am__create_global_log);					\
+	} >$(TEST_SUITE_LOG).tmp || exit 1;				\
+	mv $(TEST_SUITE_LOG).tmp $(TEST_SUITE_LOG);			\
+	if $$success; then						\
+	  col="$$grn";							\
+	 else								\
+	  col="$$red";							\
+	  test x"$$VERBOSE" = x || cat $(TEST_SUITE_LOG);		\
+	fi;								\
+	echo "$${col}$$br$${std}"; 					\
+	echo "$${col}Testsuite summary for $(PACKAGE_STRING)$${std}";	\
+	echo "$${col}$$br$${std}"; 					\
+	create_testsuite_report --maybe-color;				\
+	echo "$$col$$br$$std";						\
+	if $$success; then :; else					\
+	  echo "$${col}See $(subdir)/$(TEST_SUITE_LOG)$${std}";		\
+	  if test -n "$(PACKAGE_BUGREPORT)"; then			\
+	    echo "$${col}Please report to $(PACKAGE_BUGREPORT)$${std}";	\
+	  fi;								\
+	  echo "$$col$$br$$std";					\
+	fi;								\
+	$$success || exit 1
+
+check-TESTS:
+	@list='$(RECHECK_LOGS)';           test -z "$$list" || rm -f $$list
+	@list='$(RECHECK_LOGS:.log=.trs)'; test -z "$$list" || rm -f $$list
+	@test -z "$(TEST_SUITE_LOG)" || rm -f $(TEST_SUITE_LOG)
+	@set +e; $(am__set_TESTS_bases); \
+	log_list=`for i in $$bases; do echo $$i.log; done`; \
+	trs_list=`for i in $$bases; do echo $$i.trs; done`; \
+	log_list=`echo $$log_list`; trs_list=`echo $$trs_list`; \
+	$(MAKE) $(AM_MAKEFLAGS) $(TEST_SUITE_LOG) TEST_LOGS="$$log_list"; \
+	exit $$?;
+recheck: all 
+	@test -z "$(TEST_SUITE_LOG)" || rm -f $(TEST_SUITE_LOG)
+	@set +e; $(am__set_TESTS_bases); \
+	bases=`for i in $$bases; do echo $$i; done \
+	         | $(am__list_recheck_tests)` || exit 1; \
+	log_list=`for i in $$bases; do echo $$i.log; done`; \
+	log_list=`echo $$log_list`; \
+	$(MAKE) $(AM_MAKEFLAGS) $(TEST_SUITE_LOG) \
+	        am__force_recheck=am--force-recheck \
+	        TEST_LOGS="$$log_list"; \
+	exit $$?
+celt/tests/test_unit_types.log: celt/tests/test_unit_types$(EXEEXT)
+	@p='celt/tests/test_unit_types$(EXEEXT)'; \
+	b='celt/tests/test_unit_types'; \
+	$(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \
+	--log-file $$b.log --trs-file $$b.trs \
+	$(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \
+	"$$tst" $(AM_TESTS_FD_REDIRECT)
+celt/tests/test_unit_mathops.log: celt/tests/test_unit_mathops$(EXEEXT)
+	@p='celt/tests/test_unit_mathops$(EXEEXT)'; \
+	b='celt/tests/test_unit_mathops'; \
+	$(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \
+	--log-file $$b.log --trs-file $$b.trs \
+	$(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \
+	"$$tst" $(AM_TESTS_FD_REDIRECT)
+celt/tests/test_unit_entropy.log: celt/tests/test_unit_entropy$(EXEEXT)
+	@p='celt/tests/test_unit_entropy$(EXEEXT)'; \
+	b='celt/tests/test_unit_entropy'; \
+	$(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \
+	--log-file $$b.log --trs-file $$b.trs \
+	$(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \
+	"$$tst" $(AM_TESTS_FD_REDIRECT)
+celt/tests/test_unit_laplace.log: celt/tests/test_unit_laplace$(EXEEXT)
+	@p='celt/tests/test_unit_laplace$(EXEEXT)'; \
+	b='celt/tests/test_unit_laplace'; \
+	$(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \
+	--log-file $$b.log --trs-file $$b.trs \
+	$(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \
+	"$$tst" $(AM_TESTS_FD_REDIRECT)
+celt/tests/test_unit_dft.log: celt/tests/test_unit_dft$(EXEEXT)
+	@p='celt/tests/test_unit_dft$(EXEEXT)'; \
+	b='celt/tests/test_unit_dft'; \
+	$(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \
+	--log-file $$b.log --trs-file $$b.trs \
+	$(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \
+	"$$tst" $(AM_TESTS_FD_REDIRECT)
+celt/tests/test_unit_mdct.log: celt/tests/test_unit_mdct$(EXEEXT)
+	@p='celt/tests/test_unit_mdct$(EXEEXT)'; \
+	b='celt/tests/test_unit_mdct'; \
+	$(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \
+	--log-file $$b.log --trs-file $$b.trs \
+	$(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \
+	"$$tst" $(AM_TESTS_FD_REDIRECT)
+celt/tests/test_unit_rotation.log: celt/tests/test_unit_rotation$(EXEEXT)
+	@p='celt/tests/test_unit_rotation$(EXEEXT)'; \
+	b='celt/tests/test_unit_rotation'; \
+	$(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \
+	--log-file $$b.log --trs-file $$b.trs \
+	$(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \
+	"$$tst" $(AM_TESTS_FD_REDIRECT)
+celt/tests/test_unit_cwrs32.log: celt/tests/test_unit_cwrs32$(EXEEXT)
+	@p='celt/tests/test_unit_cwrs32$(EXEEXT)'; \
+	b='celt/tests/test_unit_cwrs32'; \
+	$(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \
+	--log-file $$b.log --trs-file $$b.trs \
+	$(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \
+	"$$tst" $(AM_TESTS_FD_REDIRECT)
+tests/test_opus_api.log: tests/test_opus_api$(EXEEXT)
+	@p='tests/test_opus_api$(EXEEXT)'; \
+	b='tests/test_opus_api'; \
+	$(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \
+	--log-file $$b.log --trs-file $$b.trs \
+	$(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \
+	"$$tst" $(AM_TESTS_FD_REDIRECT)
+tests/test_opus_decode.log: tests/test_opus_decode$(EXEEXT)
+	@p='tests/test_opus_decode$(EXEEXT)'; \
+	b='tests/test_opus_decode'; \
+	$(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \
+	--log-file $$b.log --trs-file $$b.trs \
+	$(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \
+	"$$tst" $(AM_TESTS_FD_REDIRECT)
+tests/test_opus_encode.log: tests/test_opus_encode$(EXEEXT)
+	@p='tests/test_opus_encode$(EXEEXT)'; \
+	b='tests/test_opus_encode'; \
+	$(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \
+	--log-file $$b.log --trs-file $$b.trs \
+	$(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \
+	"$$tst" $(AM_TESTS_FD_REDIRECT)
+tests/test_opus_padding.log: tests/test_opus_padding$(EXEEXT)
+	@p='tests/test_opus_padding$(EXEEXT)'; \
+	b='tests/test_opus_padding'; \
+	$(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \
+	--log-file $$b.log --trs-file $$b.trs \
+	$(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \
+	"$$tst" $(AM_TESTS_FD_REDIRECT)
+.test.log:
+	@p='$<'; \
+	$(am__set_b); \
+	$(am__check_pre) $(TEST_LOG_DRIVER) --test-name "$$f" \
+	--log-file $$b.log --trs-file $$b.trs \
+	$(am__common_driver_flags) $(AM_TEST_LOG_DRIVER_FLAGS) $(TEST_LOG_DRIVER_FLAGS) -- $(TEST_LOG_COMPILE) \
+	"$$tst" $(AM_TESTS_FD_REDIRECT)
+@am__EXEEXT_TRUE@.test$(EXEEXT).log:
+@am__EXEEXT_TRUE@	@p='$<'; \
+@am__EXEEXT_TRUE@	$(am__set_b); \
+@am__EXEEXT_TRUE@	$(am__check_pre) $(TEST_LOG_DRIVER) --test-name "$$f" \
+@am__EXEEXT_TRUE@	--log-file $$b.log --trs-file $$b.trs \
+@am__EXEEXT_TRUE@	$(am__common_driver_flags) $(AM_TEST_LOG_DRIVER_FLAGS) $(TEST_LOG_DRIVER_FLAGS) -- $(TEST_LOG_COMPILE) \
+@am__EXEEXT_TRUE@	"$$tst" $(AM_TESTS_FD_REDIRECT)
 
 distdir: $(DISTFILES)
 	$(am__remove_distdir)
@@ -2465,40 +2755,42 @@
 	|| chmod -R a+r "$(distdir)"
 dist-gzip: distdir
 	tardir=$(distdir) && $(am__tar) | GZIP=$(GZIP_ENV) gzip -c >$(distdir).tar.gz
-	$(am__remove_distdir)
+	$(am__post_remove_distdir)
 
 dist-bzip2: distdir
 	tardir=$(distdir) && $(am__tar) | BZIP2=$${BZIP2--9} bzip2 -c >$(distdir).tar.bz2
-	$(am__remove_distdir)
+	$(am__post_remove_distdir)
 
 dist-lzip: distdir
 	tardir=$(distdir) && $(am__tar) | lzip -c $${LZIP_OPT--9} >$(distdir).tar.lz
-	$(am__remove_distdir)
-
-dist-lzma: distdir
-	tardir=$(distdir) && $(am__tar) | lzma -9 -c >$(distdir).tar.lzma
-	$(am__remove_distdir)
+	$(am__post_remove_distdir)
 
 dist-xz: distdir
 	tardir=$(distdir) && $(am__tar) | XZ_OPT=$${XZ_OPT--e} xz -c >$(distdir).tar.xz
-	$(am__remove_distdir)
+	$(am__post_remove_distdir)
 
 dist-tarZ: distdir
+	@echo WARNING: "Support for distribution archives compressed with" \
+		       "legacy program 'compress' is deprecated." >&2
+	@echo WARNING: "It will be removed altogether in Automake 2.0" >&2
 	tardir=$(distdir) && $(am__tar) | compress -c >$(distdir).tar.Z
-	$(am__remove_distdir)
+	$(am__post_remove_distdir)
 
 dist-shar: distdir
+	@echo WARNING: "Support for shar distribution archives is" \
+	               "deprecated." >&2
+	@echo WARNING: "It will be removed altogether in Automake 2.0" >&2
 	shar $(distdir) | GZIP=$(GZIP_ENV) gzip -c >$(distdir).shar.gz
-	$(am__remove_distdir)
+	$(am__post_remove_distdir)
 
 dist-zip: distdir
 	-rm -f $(distdir).zip
 	zip -rq $(distdir).zip $(distdir)
-	$(am__remove_distdir)
+	$(am__post_remove_distdir)
 
-dist dist-all: distdir
-	tardir=$(distdir) && $(am__tar) | GZIP=$(GZIP_ENV) gzip -c >$(distdir).tar.gz
-	$(am__remove_distdir)
+dist dist-all:
+	$(MAKE) $(AM_MAKEFLAGS) $(DIST_TARGETS) am__post_remove_distdir='@:'
+	$(am__post_remove_distdir)
 
 # This target untars the dist file and tries a VPATH configuration.  Then
 # it guarantees that the distribution is self-contained by making another
@@ -2509,8 +2801,6 @@
 	  GZIP=$(GZIP_ENV) gzip -dc $(distdir).tar.gz | $(am__untar) ;;\
 	*.tar.bz2*) \
 	  bzip2 -dc $(distdir).tar.bz2 | $(am__untar) ;;\
-	*.tar.lzma*) \
-	  lzma -dc $(distdir).tar.lzma | $(am__untar) ;;\
 	*.tar.lz*) \
 	  lzip -dc $(distdir).tar.lz | $(am__untar) ;;\
 	*.tar.xz*) \
@@ -2522,18 +2812,19 @@
 	*.zip*) \
 	  unzip $(distdir).zip ;;\
 	esac
-	chmod -R a-w $(distdir); chmod u+w $(distdir)
-	mkdir $(distdir)/_build
-	mkdir $(distdir)/_inst
+	chmod -R a-w $(distdir)
+	chmod u+w $(distdir)
+	mkdir $(distdir)/_build $(distdir)/_build/sub $(distdir)/_inst
 	chmod a-w $(distdir)
 	test -d $(distdir)/_build || exit 0; \
 	dc_install_base=`$(am__cd) $(distdir)/_inst && pwd | sed -e 's,^[^:\\/]:[\\/],/,'` \
 	  && dc_destdir="$${TMPDIR-/tmp}/am-dc-$$$$/" \
 	  && am__cwd=`pwd` \
-	  && $(am__cd) $(distdir)/_build \
-	  && ../configure --srcdir=.. --prefix="$$dc_install_base" \
+	  && $(am__cd) $(distdir)/_build/sub \
+	  && ../../configure \
 	    $(AM_DISTCHECK_CONFIGURE_FLAGS) \
 	    $(DISTCHECK_CONFIGURE_FLAGS) \
+	    --srcdir=../.. --prefix="$$dc_install_base" \
 	  && $(MAKE) $(AM_MAKEFLAGS) \
 	  && $(MAKE) $(AM_MAKEFLAGS) dvi \
 	  && $(MAKE) $(AM_MAKEFLAGS) check \
@@ -2556,7 +2847,7 @@
 	  && $(MAKE) $(AM_MAKEFLAGS) distcleancheck \
 	  && cd "$$am__cwd" \
 	  || exit 1
-	$(am__remove_distdir)
+	$(am__post_remove_distdir)
 	@(echo "$(distdir) archives ready for distribution: "; \
 	  list='$(DIST_ARCHIVES)'; for i in $$list; do echo $$i; done) | \
 	  sed -e 1h -e 1s/./=/g -e 1p -e 1x -e '$$p' -e '$$x'
@@ -2618,6 +2909,9 @@
 	    "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
 	fi
 mostlyclean-generic:
+	-test -z "$(TEST_LOGS)" || rm -f $(TEST_LOGS)
+	-test -z "$(TEST_LOGS:.log=.trs)" || rm -f $(TEST_LOGS:.log=.trs)
+	-test -z "$(TEST_SUITE_LOG)" || rm -f $(TEST_SUITE_LOG)
 
 clean-generic:
 	-test -z "$(CLEANFILES)" || rm -f $(CLEANFILES)
@@ -2631,12 +2925,18 @@
 	-rm -f celt/arm/$(am__dirstamp)
 	-rm -f celt/tests/$(DEPDIR)/$(am__dirstamp)
 	-rm -f celt/tests/$(am__dirstamp)
+	-rm -f celt/x86/$(DEPDIR)/$(am__dirstamp)
+	-rm -f celt/x86/$(am__dirstamp)
 	-rm -f silk/$(DEPDIR)/$(am__dirstamp)
 	-rm -f silk/$(am__dirstamp)
 	-rm -f silk/fixed/$(DEPDIR)/$(am__dirstamp)
 	-rm -f silk/fixed/$(am__dirstamp)
+	-rm -f silk/fixed/x86/$(DEPDIR)/$(am__dirstamp)
+	-rm -f silk/fixed/x86/$(am__dirstamp)
 	-rm -f silk/float/$(DEPDIR)/$(am__dirstamp)
 	-rm -f silk/float/$(am__dirstamp)
+	-rm -f silk/x86/$(DEPDIR)/$(am__dirstamp)
+	-rm -f silk/x86/$(am__dirstamp)
 	-rm -f src/$(DEPDIR)/$(am__dirstamp)
 	-rm -f src/$(am__dirstamp)
 	-rm -f tests/$(DEPDIR)/$(am__dirstamp)
@@ -2649,11 +2949,11 @@
 clean: clean-recursive
 
 clean-am: clean-generic clean-libLTLIBRARIES clean-libtool clean-local \
-	clean-noinstPROGRAMS mostlyclean-am
+	clean-noinstLTLIBRARIES clean-noinstPROGRAMS mostlyclean-am
 
 distclean: distclean-recursive
 	-rm -f $(am__CONFIG_DISTCLEAN_FILES)
-	-rm -rf celt/$(DEPDIR) celt/arm/$(DEPDIR) celt/tests/$(DEPDIR) silk/$(DEPDIR) silk/fixed/$(DEPDIR) silk/float/$(DEPDIR) src/$(DEPDIR) tests/$(DEPDIR)
+	-rm -rf celt/$(DEPDIR) celt/arm/$(DEPDIR) celt/tests/$(DEPDIR) celt/x86/$(DEPDIR) silk/$(DEPDIR) silk/fixed/$(DEPDIR) silk/fixed/x86/$(DEPDIR) silk/float/$(DEPDIR) silk/x86/$(DEPDIR) src/$(DEPDIR) tests/$(DEPDIR)
 	-rm -f Makefile
 distclean-am: clean-am distclean-compile distclean-generic \
 	distclean-hdr distclean-libtool distclean-tags
@@ -2702,7 +3002,7 @@
 maintainer-clean: maintainer-clean-recursive
 	-rm -f $(am__CONFIG_DISTCLEAN_FILES)
 	-rm -rf $(top_srcdir)/autom4te.cache
-	-rm -rf celt/$(DEPDIR) celt/arm/$(DEPDIR) celt/tests/$(DEPDIR) silk/$(DEPDIR) silk/fixed/$(DEPDIR) silk/float/$(DEPDIR) src/$(DEPDIR) tests/$(DEPDIR)
+	-rm -rf celt/$(DEPDIR) celt/arm/$(DEPDIR) celt/tests/$(DEPDIR) celt/x86/$(DEPDIR) silk/$(DEPDIR) silk/fixed/$(DEPDIR) silk/fixed/x86/$(DEPDIR) silk/float/$(DEPDIR) silk/x86/$(DEPDIR) src/$(DEPDIR) tests/$(DEPDIR)
 	-rm -f Makefile
 maintainer-clean-am: distclean-am maintainer-clean-generic
 
@@ -2723,33 +3023,35 @@
 	uninstall-m4dataDATA uninstall-pkgconfigDATA \
 	uninstall-pkgincludeHEADERS
 
-.MAKE: $(RECURSIVE_CLEAN_TARGETS) $(RECURSIVE_TARGETS) all check \
-	check-am ctags-recursive install install-am install-strip \
-	tags-recursive
+.MAKE: $(am__recursive_targets) all check check-am install install-am \
+	install-strip
 
-.PHONY: $(RECURSIVE_CLEAN_TARGETS) $(RECURSIVE_TARGETS) CTAGS GTAGS \
-	all all-am all-local am--refresh check check-TESTS check-am \
-	clean clean-generic clean-libLTLIBRARIES clean-libtool \
-	clean-local clean-noinstPROGRAMS ctags ctags-recursive dist \
-	dist-all dist-bzip2 dist-gzip dist-hook dist-lzip dist-lzma \
-	dist-shar dist-tarZ dist-xz dist-zip distcheck distclean \
-	distclean-compile distclean-generic distclean-hdr \
-	distclean-libtool distclean-tags distcleancheck distdir \
-	distuninstallcheck dvi dvi-am html html-am info info-am \
-	install install-am install-data install-data-am \
-	install-data-local install-dvi install-dvi-am install-exec \
-	install-exec-am install-html install-html-am install-info \
-	install-info-am install-libLTLIBRARIES install-m4dataDATA \
-	install-man install-pdf install-pdf-am install-pkgconfigDATA \
+.PHONY: $(am__recursive_targets) CTAGS GTAGS TAGS all all-am all-local \
+	am--refresh check check-TESTS check-am clean clean-cscope \
+	clean-generic clean-libLTLIBRARIES clean-libtool clean-local \
+	clean-noinstLTLIBRARIES clean-noinstPROGRAMS cscope \
+	cscopelist-am ctags ctags-am dist dist-all dist-bzip2 \
+	dist-gzip dist-hook dist-lzip dist-shar dist-tarZ dist-xz \
+	dist-zip distcheck distclean distclean-compile \
+	distclean-generic distclean-hdr distclean-libtool \
+	distclean-tags distcleancheck distdir distuninstallcheck dvi \
+	dvi-am html html-am info info-am install install-am \
+	install-data install-data-am install-data-local install-dvi \
+	install-dvi-am install-exec install-exec-am install-html \
+	install-html-am install-info install-info-am \
+	install-libLTLIBRARIES install-m4dataDATA install-man \
+	install-pdf install-pdf-am install-pkgconfigDATA \
 	install-pkgincludeHEADERS install-ps install-ps-am \
 	install-strip installcheck installcheck-am installdirs \
 	installdirs-am maintainer-clean maintainer-clean-generic \
 	mostlyclean mostlyclean-compile mostlyclean-generic \
-	mostlyclean-libtool pdf pdf-am ps ps-am tags tags-recursive \
+	mostlyclean-libtool pdf pdf-am ps ps-am recheck tags tags-am \
 	uninstall uninstall-am uninstall-libLTLIBRARIES \
 	uninstall-local uninstall-m4dataDATA uninstall-pkgconfigDATA \
 	uninstall-pkgincludeHEADERS
 
+.PRECIOUS: Makefile
+
 
 # Provide the full test output for failed tests when using the parallel
 # test suite (which is enabled by default with automake 1.13+).
@@ -2825,10 +3127,15 @@
 
 # convert ARM asm to GNU as format
 %-gnu.S: $(top_srcdir)/%.s
-	$(top_srcdir)/celt/arm/arm2gnu.pl < $< > $@
+	$(top_srcdir)/celt/arm/arm2gnu.pl @ARM2GNU_PARAMS@ < $< > $@
 # For autoconf-modified sources (e.g., armopts.s)
 %-gnu.S: %.s
-	$(top_srcdir)/celt/arm/arm2gnu.pl < $< > $@
+	$(top_srcdir)/celt/arm/arm2gnu.pl @ARM2GNU_PARAMS@ < $< > $@
+@HAVE_SSE_TRUE@$(SSE_OBJ) $(OPT_UNIT_TEST_OBJ): CFLAGS += $(OPUS_X86_SSE_CFLAGS)
+@HAVE_SSE2_TRUE@$(SSE2_OBJ) $(OPT_UNIT_TEST_OBJ): CFLAGS += $(OPUS_X86_SSE2_CFLAGS)
+@HAVE_SSE4_1_TRUE@$(SSE4_1_OBJ) $(OPT_UNIT_TEST_OBJ): CFLAGS += $(OPUS_X86_SSE4_1_CFLAGS)
+@OPUS_ARM_NEON_INTR_TRUE@$(CELT_ARM_NEON_INTR_OBJ) $(OPT_UNIT_TEST_OBJ): CFLAGS += \
+@OPUS_ARM_NEON_INTR_TRUE@ $(OPUS_ARM_NEON_INTR_CFLAGS)  $(NE10_CFLAGS)
 
 # Tell versions [3.59,3.63) of GNU make to not export all variables.
 # Otherwise a system limit (for SysV at least) may be exceeded.
diff --git a/Makefile.mips b/Makefile.mips
new file mode 100644
index 0000000..56a5062
--- /dev/null
+++ b/Makefile.mips
@@ -0,0 +1,161 @@
+#################### COMPILE OPTIONS #######################
+
+# Uncomment this for fixed-point build
+FIXED_POINT=1
+
+# It is strongly recommended to uncomment one of these
+# VAR_ARRAYS: Use C99 variable-length arrays for stack allocation
+# USE_ALLOCA: Use alloca() for stack allocation
+# If none is defined, then the fallback is a non-threadsafe global array
+CFLAGS := -DUSE_ALLOCA $(CFLAGS)
+#CFLAGS := -DVAR_ARRAYS $(CFLAGS)
+
+# These options affect performance
+# HAVE_LRINTF: Use C99 intrinsics to speed up float-to-int conversion
+#CFLAGS := -DHAVE_LRINTF $(CFLAGS)
+
+###################### END OF OPTIONS ######################
+
+-include package_version
+
+include silk_sources.mk
+include celt_sources.mk
+include opus_sources.mk
+
+ifdef FIXED_POINT
+SILK_SOURCES += $(SILK_SOURCES_FIXED)
+else
+SILK_SOURCES += $(SILK_SOURCES_FLOAT)
+OPUS_SOURCES += $(OPUS_SOURCES_FLOAT)
+endif
+
+EXESUFFIX =
+LIBPREFIX = lib
+LIBSUFFIX = .a
+OBJSUFFIX = .o
+
+CC     = $(TOOLCHAIN_PREFIX)cc$(TOOLCHAIN_SUFFIX)
+AR     = $(TOOLCHAIN_PREFIX)ar
+RANLIB = $(TOOLCHAIN_PREFIX)ranlib
+CP     = $(TOOLCHAIN_PREFIX)cp
+
+cppflags-from-defines   = $(addprefix -D,$(1))
+cppflags-from-includes  = $(addprefix -I,$(1))
+ldflags-from-ldlibdirs  = $(addprefix -L,$(1))
+ldlibs-from-libs        = $(addprefix -l,$(1))
+
+WARNINGS = -Wall -W -Wstrict-prototypes -Wextra -Wcast-align -Wnested-externs -Wshadow
+
+CFLAGS  += -mips32r2 -mno-mips16 -std=gnu99 -O2 -g $(WARNINGS) -DENABLE_ASSERTIONS -DMIPSr1_ASM -DOPUS_BUILD -mdspr2 -march=74kc -mtune=74kc -mmt -mgp32
+
+CINCLUDES = include silk celt
+
+ifdef FIXED_POINT
+CFLAGS += -DFIXED_POINT=1 -DDISABLE_FLOAT_API
+CINCLUDES += silk/fixed
+else
+CINCLUDES += silk/float
+endif
+
+
+LIBS = m
+
+LDLIBDIRS = ./
+
+CFLAGS  += $(call cppflags-from-defines,$(CDEFINES))
+CFLAGS  += $(call cppflags-from-includes,$(CINCLUDES))
+LDFLAGS += $(call ldflags-from-ldlibdirs,$(LDLIBDIRS))
+LDLIBS  += $(call ldlibs-from-libs,$(LIBS))
+
+COMPILE.c.cmdline   = $(CC) -c $(CFLAGS) -o $@ $<
+LINK.o              = $(CC) $(LDPREFLAGS) $(LDFLAGS)
+LINK.o.cmdline      = $(LINK.o) $^ $(LDLIBS) -o $@$(EXESUFFIX)
+
+ARCHIVE.cmdline     = $(AR) $(ARFLAGS) $@ $^ && $(RANLIB) $@
+
+%$(OBJSUFFIX):%.c
+	$(COMPILE.c.cmdline)
+
+%$(OBJSUFFIX):%.cpp  # NOTE(review): COMPILE.cpp.cmdline is not defined in this makefile, so this recipe expands to nothing unless an included *_sources.mk defines it - confirm
+	$(COMPILE.cpp.cmdline)
+
+# Directives
+
+
+# Variable definitions
+LIB_NAME = opus
+TARGET = $(LIBPREFIX)$(LIB_NAME)$(LIBSUFFIX)
+
+SRCS_C = $(SILK_SOURCES) $(CELT_SOURCES) $(OPUS_SOURCES)
+
+OBJS := $(patsubst %.c,%$(OBJSUFFIX),$(SRCS_C))
+
+OPUSDEMO_SRCS_C = src/opus_demo.c
+OPUSDEMO_OBJS := $(patsubst %.c,%$(OBJSUFFIX),$(OPUSDEMO_SRCS_C))
+
+TESTOPUSAPI_SRCS_C = tests/test_opus_api.c
+TESTOPUSAPI_OBJS := $(patsubst %.c,%$(OBJSUFFIX),$(TESTOPUSAPI_SRCS_C))
+
+TESTOPUSDECODE_SRCS_C = tests/test_opus_decode.c
+TESTOPUSDECODE_OBJS := $(patsubst %.c,%$(OBJSUFFIX),$(TESTOPUSDECODE_SRCS_C))
+
+TESTOPUSENCODE_SRCS_C = tests/test_opus_encode.c
+TESTOPUSENCODE_OBJS := $(patsubst %.c,%$(OBJSUFFIX),$(TESTOPUSENCODE_SRCS_C))
+
+TESTOPUSPADDING_SRCS_C = tests/test_opus_padding.c
+TESTOPUSPADDING_OBJS := $(patsubst %.c,%$(OBJSUFFIX),$(TESTOPUSPADDING_SRCS_C))
+
+OPUSCOMPARE_SRCS_C = src/opus_compare.c
+OPUSCOMPARE_OBJS := $(patsubst %.c,%$(OBJSUFFIX),$(OPUSCOMPARE_SRCS_C))
+
+TESTS := test_opus_api test_opus_decode test_opus_encode test_opus_padding
+
+# Rules
+all: lib opus_demo opus_compare $(TESTS)
+
+lib: $(TARGET)
+
+check: all  # run every test program, then fail if any one of them failed (a bare for-loop only reports the last status)
+	status=0; for test in $(TESTS); do ./$$test || status=1; done; exit $$status
+
+$(TARGET): $(OBJS)
+	$(ARCHIVE.cmdline)
+
+opus_demo$(EXESUFFIX): $(OPUSDEMO_OBJS) $(TARGET)
+	$(LINK.o.cmdline)
+
+test_opus_api$(EXESUFFIX): $(TESTOPUSAPI_OBJS) $(TARGET)
+	$(LINK.o.cmdline)
+
+test_opus_decode$(EXESUFFIX): $(TESTOPUSDECODE_OBJS) $(TARGET)
+	$(LINK.o.cmdline)
+
+test_opus_encode$(EXESUFFIX): $(TESTOPUSENCODE_OBJS) $(TARGET)
+	$(LINK.o.cmdline)
+
+test_opus_padding$(EXESUFFIX): $(TESTOPUSPADDING_OBJS) $(TARGET)
+	$(LINK.o.cmdline)
+
+opus_compare$(EXESUFFIX): $(OPUSCOMPARE_OBJS)
+	$(LINK.o.cmdline)
+
+celt/celt.o: CFLAGS += -DPACKAGE_VERSION='$(PACKAGE_VERSION)'
+celt/celt.o: package_version
+
+package_version: force  # always re-run (depends on phony 'force'): refresh via ./update_version when executable, else write an "unknown" placeholder if the file is missing
+	@if [ -x ./update_version ]; then \
+		./update_version || true; \
+	elif [ ! -e ./package_version ]; then \
+		echo 'PACKAGE_VERSION="unknown"' > ./package_version; \
+	fi
+
+force:
+
+clean:
+	rm -f opus_demo$(EXESUFFIX) opus_compare$(EXESUFFIX) $(TARGET) \
+                test_opus_api$(EXESUFFIX) test_opus_decode$(EXESUFFIX) \
+                test_opus_encode$(EXESUFFIX) test_opus_padding$(EXESUFFIX) \
+		$(OBJS) $(OPUSDEMO_OBJS) $(OPUSCOMPARE_OBJS) $(TESTOPUSAPI_OBJS) \
+                $(TESTOPUSDECODE_OBJS) $(TESTOPUSENCODE_OBJS) $(TESTOPUSPADDING_OBJS)
+
+.PHONY: all lib clean force check
diff --git a/Makefile.unix b/Makefile.unix
index 36fd337..b13230e 100644
--- a/Makefile.unix
+++ b/Makefile.unix
@@ -91,20 +91,49 @@
 OPUSDEMO_SRCS_C = src/opus_demo.c
 OPUSDEMO_OBJS := $(patsubst %.c,%$(OBJSUFFIX),$(OPUSDEMO_SRCS_C))
 
+TESTOPUSAPI_SRCS_C = tests/test_opus_api.c
+TESTOPUSAPI_OBJS := $(patsubst %.c,%$(OBJSUFFIX),$(TESTOPUSAPI_SRCS_C))
+
+TESTOPUSDECODE_SRCS_C = tests/test_opus_decode.c
+TESTOPUSDECODE_OBJS := $(patsubst %.c,%$(OBJSUFFIX),$(TESTOPUSDECODE_SRCS_C))
+
+TESTOPUSENCODE_SRCS_C = tests/test_opus_encode.c
+TESTOPUSENCODE_OBJS := $(patsubst %.c,%$(OBJSUFFIX),$(TESTOPUSENCODE_SRCS_C))
+
+TESTOPUSPADDING_SRCS_C = tests/test_opus_padding.c
+TESTOPUSPADDING_OBJS := $(patsubst %.c,%$(OBJSUFFIX),$(TESTOPUSPADDING_SRCS_C))
+
 OPUSCOMPARE_SRCS_C = src/opus_compare.c
 OPUSCOMPARE_OBJS := $(patsubst %.c,%$(OBJSUFFIX),$(OPUSCOMPARE_SRCS_C))
 
+TESTS := test_opus_api test_opus_decode test_opus_encode test_opus_padding
+
 # Rules
-all: lib opus_demo opus_compare
+all: lib opus_demo opus_compare $(TESTS)
 
 lib: $(TARGET)
 
+check: all  # run every test program, then fail if any one of them failed (a bare for-loop only reports the last status)
+	status=0; for test in $(TESTS); do ./$$test || status=1; done; exit $$status
+
 $(TARGET): $(OBJS)
 	$(ARCHIVE.cmdline)
 
 opus_demo$(EXESUFFIX): $(OPUSDEMO_OBJS) $(TARGET)
 	$(LINK.o.cmdline)
 
+test_opus_api$(EXESUFFIX): $(TESTOPUSAPI_OBJS) $(TARGET)
+	$(LINK.o.cmdline)
+
+test_opus_decode$(EXESUFFIX): $(TESTOPUSDECODE_OBJS) $(TARGET)
+	$(LINK.o.cmdline)
+
+test_opus_encode$(EXESUFFIX): $(TESTOPUSENCODE_OBJS) $(TARGET)
+	$(LINK.o.cmdline)
+
+test_opus_padding$(EXESUFFIX): $(TESTOPUSPADDING_OBJS) $(TARGET)
+	$(LINK.o.cmdline)
+
 opus_compare$(EXESUFFIX): $(OPUSCOMPARE_OBJS)
 	$(LINK.o.cmdline)
 
@@ -122,6 +151,9 @@
 
 clean:
 	rm -f opus_demo$(EXESUFFIX) opus_compare$(EXESUFFIX) $(TARGET) \
-		$(OBJS) $(OPUSDEMO_OBJS) $(OPUSCOMPARE_OBJS)
+                test_opus_api$(EXESUFFIX) test_opus_decode$(EXESUFFIX) \
+                test_opus_encode$(EXESUFFIX) test_opus_padding$(EXESUFFIX) \
+		$(OBJS) $(OPUSDEMO_OBJS) $(OPUSCOMPARE_OBJS) $(TESTOPUSAPI_OBJS) \
+                $(TESTOPUSDECODE_OBJS) $(TESTOPUSENCODE_OBJS) $(TESTOPUSPADDING_OBJS)
 
-.PHONY: all lib clean
+.PHONY: all lib clean force check
diff --git a/README b/README
index 655c6b4..ac6264e 100644
--- a/README
+++ b/README
@@ -13,14 +13,14 @@
 have historically used high latency formats such as MP3, AAC, or Vorbis.
 
                     Opus is specified by IETF RFC 6716:
-                    http://tools.ietf.org/html/rfc6716
+                    https://tools.ietf.org/html/rfc6716
 
   The Opus format and this implementation of it are subject to the royalty-
 free patent and copyright licenses specified in the file COPYING.
 
 This package implements a shared library for encoding and decoding raw Opus
 bitstreams. Raw Opus bitstreams should be used over RTP according to
- http://tools.ietf.org/html/draft-spittka-payload-rtp-opus
+ https://tools.ietf.org/html/rfc7587
 
 The package also includes a number of test  tools used for testing the
 correct operation of the library. The bitstreams read/written by these
@@ -29,7 +29,7 @@
 
 Opus stored in files should use the Ogg encapsulation for Opus which is
 described at:
- http://wiki.xiph.org/OggOpus
+ https://wiki.xiph.org/OggOpus
 
 An opus-tools package is available which provides encoding and decoding of
 Ogg encapsulated Opus files and includes a number of useful features.
@@ -37,7 +37,7 @@
 Opus-tools can be found at:
  https://git.xiph.org/?p=opus-tools.git
 or on the main Opus website:
- http://opus-codec.org/
+ https://opus-codec.org/
 
 == Compiling libopus ==
 
@@ -50,7 +50,7 @@
 
 1) Clone the repository:
 
-% git clone git://git.opus-codec.org/opus.git
+% git clone https://git.xiph.org/opus.git
 % cd opus
 
 2) Compiling the source
@@ -106,11 +106,11 @@
 
 There is also collection of standard test vectors which are not
 included in this package for size reasons but can be obtained from:
-http://opus-codec.org/testvectors/opus_testvectors.tar.gz
+https://opus-codec.org/testvectors/opus_testvectors.tar.gz
 
 To run compare the code to these test vectors:
 
-% curl -O http://opus-codec.org/testvectors/opus_testvectors.tar.gz
+% curl -O https://opus-codec.org/testvectors/opus_testvectors.tar.gz
 % tar -zxf opus_testvectors.tar.gz
 % ./tests/run_vectors.sh ./ opus_testvectors 48000
 
diff --git a/README.android b/README.android
index e09b6ce..b34d4b2 100644
--- a/README.android
+++ b/README.android
@@ -1,2 +1,7 @@
-* current source is based on libopus 1.1 stable (http://downloads.xiph.org/releases/opus/opus-1.1.tar.gz)
+* current source is based on libopus 1.1.2 stable (http://downloads.xiph.org/releases/opus/opus-1.1.2.tar.gz)
 * libopus is BSD-licensed - http://www.opus-codec.org/license/
+
+Updating:
+* Run "convert_android_asm.sh" from the root of the library (external/libopus).
+  This uses 'arm2gnu.pl' included in libopus to convert ARM ASM files to GNU ASM
+  files for building under the Android NDK.
diff --git a/aclocal.m4 b/aclocal.m4
index 5a41d55..ca08e91 100644
--- a/aclocal.m4
+++ b/aclocal.m4
@@ -1,8 +1,7 @@
-# generated automatically by aclocal 1.11.6 -*- Autoconf -*-
+# generated automatically by aclocal 1.15 -*- Autoconf -*-
 
-# Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
-# 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software Foundation,
-# Inc.
+# Copyright (C) 1996-2014 Free Software Foundation, Inc.
+
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
 # with or without modifications, as long as this notice is preserved.
@@ -12,33 +11,31 @@
 # even the implied warranty of MERCHANTABILITY or FITNESS FOR A
 # PARTICULAR PURPOSE.
 
+m4_ifndef([AC_CONFIG_MACRO_DIRS], [m4_defun([_AM_CONFIG_MACRO_DIRS], [])m4_defun([AC_CONFIG_MACRO_DIRS], [_AM_CONFIG_MACRO_DIRS($@)])])
 m4_ifndef([AC_AUTOCONF_VERSION],
   [m4_copy([m4_PACKAGE_VERSION], [AC_AUTOCONF_VERSION])])dnl
-m4_if(m4_defn([AC_AUTOCONF_VERSION]), [2.68],,
-[m4_warning([this file was generated for autoconf 2.68.
+m4_if(m4_defn([AC_AUTOCONF_VERSION]), [2.69],,
+[m4_warning([this file was generated for autoconf 2.69.
 You have another version of autoconf.  It may work, but is not guaranteed to.
 If you have problems, you may need to regenerate the build system entirely.
-To do so, use the procedure documented by the package, typically `autoreconf'.])])
+To do so, use the procedure documented by the package, typically 'autoreconf'.])])
 
-# Copyright (C) 2002, 2003, 2005, 2006, 2007, 2008, 2011 Free Software
-# Foundation, Inc.
+# Copyright (C) 2002-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
 # with or without modifications, as long as this notice is preserved.
 
-# serial 1
-
 # AM_AUTOMAKE_VERSION(VERSION)
 # ----------------------------
 # Automake X.Y traces this macro to ensure aclocal.m4 has been
 # generated from the m4 files accompanying Automake X.Y.
 # (This private macro should not be called outside this file.)
 AC_DEFUN([AM_AUTOMAKE_VERSION],
-[am__api_version='1.11'
+[am__api_version='1.15'
 dnl Some users find AM_AUTOMAKE_VERSION and mistake it for a way to
 dnl require some minimum version.  Point them to the right macro.
-m4_if([$1], [1.11.6], [],
+m4_if([$1], [1.15], [],
       [AC_FATAL([Do not call $0, use AM_INIT_AUTOMAKE([$1]).])])dnl
 ])
 
@@ -54,21 +51,19 @@
 # Call AM_AUTOMAKE_VERSION and AM_AUTOMAKE_VERSION so they can be traced.
 # This function is AC_REQUIREd by AM_INIT_AUTOMAKE.
 AC_DEFUN([AM_SET_CURRENT_AUTOMAKE_VERSION],
-[AM_AUTOMAKE_VERSION([1.11.6])dnl
+[AM_AUTOMAKE_VERSION([1.15])dnl
 m4_ifndef([AC_AUTOCONF_VERSION],
   [m4_copy([m4_PACKAGE_VERSION], [AC_AUTOCONF_VERSION])])dnl
 _AM_AUTOCONF_VERSION(m4_defn([AC_AUTOCONF_VERSION]))])
 
 # Figure out how to run the assembler.                      -*- Autoconf -*-
 
-# Copyright (C) 2001, 2003, 2004, 2005, 2006  Free Software Foundation, Inc.
+# Copyright (C) 2001-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
 # with or without modifications, as long as this notice is preserved.
 
-# serial 5
-
 # AM_PROG_AS
 # ----------
 AC_DEFUN([AM_PROG_AS],
@@ -83,17 +78,15 @@
 
 # AM_AUX_DIR_EXPAND                                         -*- Autoconf -*-
 
-# Copyright (C) 2001, 2003, 2005, 2011 Free Software Foundation, Inc.
+# Copyright (C) 2001-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
 # with or without modifications, as long as this notice is preserved.
 
-# serial 1
-
 # For projects using AC_CONFIG_AUX_DIR([foo]), Autoconf sets
-# $ac_aux_dir to `$srcdir/foo'.  In other projects, it is set to
-# `$srcdir', `$srcdir/..', or `$srcdir/../..'.
+# $ac_aux_dir to '$srcdir/foo'.  In other projects, it is set to
+# '$srcdir', '$srcdir/..', or '$srcdir/../..'.
 #
 # Of course, Automake must honor this variable whenever it calls a
 # tool from the auxiliary directory.  The problem is that $srcdir (and
@@ -112,7 +105,7 @@
 #
 # The reason of the latter failure is that $top_srcdir and $ac_aux_dir
 # are both prefixed by $srcdir.  In an in-source build this is usually
-# harmless because $srcdir is `.', but things will broke when you
+# harmless because $srcdir is '.', but things will break when you
 # start a VPATH build or use an absolute $srcdir.
 #
 # So we could use something similar to $top_srcdir/$ac_aux_dir/missing,
@@ -130,30 +123,26 @@
 # configured tree to be moved without reconfiguration.
 
 AC_DEFUN([AM_AUX_DIR_EXPAND],
-[dnl Rely on autoconf to set up CDPATH properly.
-AC_PREREQ([2.50])dnl
-# expand $ac_aux_dir to an absolute path
-am_aux_dir=`cd $ac_aux_dir && pwd`
+[AC_REQUIRE([AC_CONFIG_AUX_DIR_DEFAULT])dnl
+# Expand $ac_aux_dir to an absolute path.
+am_aux_dir=`cd "$ac_aux_dir" && pwd`
 ])
 
 # AM_CONDITIONAL                                            -*- Autoconf -*-
 
-# Copyright (C) 1997, 2000, 2001, 2003, 2004, 2005, 2006, 2008
-# Free Software Foundation, Inc.
+# Copyright (C) 1997-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
 # with or without modifications, as long as this notice is preserved.
 
-# serial 9
-
 # AM_CONDITIONAL(NAME, SHELL-CONDITION)
 # -------------------------------------
 # Define a conditional.
 AC_DEFUN([AM_CONDITIONAL],
-[AC_PREREQ(2.52)dnl
- ifelse([$1], [TRUE],  [AC_FATAL([$0: invalid condition: $1])],
-	[$1], [FALSE], [AC_FATAL([$0: invalid condition: $1])])dnl
+[AC_PREREQ([2.52])dnl
+ m4_if([$1], [TRUE],  [AC_FATAL([$0: invalid condition: $1])],
+       [$1], [FALSE], [AC_FATAL([$0: invalid condition: $1])])dnl
 AC_SUBST([$1_TRUE])dnl
 AC_SUBST([$1_FALSE])dnl
 _AM_SUBST_NOTMAKE([$1_TRUE])dnl
@@ -172,16 +161,14 @@
 Usually this means the macro was only invoked conditionally.]])
 fi])])
 
-# Copyright (C) 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2009,
-# 2010, 2011 Free Software Foundation, Inc.
+# Copyright (C) 1999-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
 # with or without modifications, as long as this notice is preserved.
 
-# serial 12
 
-# There are a few dirty hacks below to avoid letting `AC_PROG_CC' be
+# There are a few dirty hacks below to avoid letting 'AC_PROG_CC' be
 # written in clear, in which case automake, when reading aclocal.m4,
 # will think it sees a *use*, and therefore will trigger all it's
 # C support machinery.  Also note that it means that autoscan, seeing
@@ -191,7 +178,7 @@
 # _AM_DEPENDENCIES(NAME)
 # ----------------------
 # See how the compiler implements dependency checking.
-# NAME is "CC", "CXX", "GCJ", or "OBJC".
+# NAME is "CC", "CXX", "OBJC", "OBJCXX", "UPC", or "GCJ".
 # We try a few techniques and use that to set a single cache variable.
 #
 # We don't AC_REQUIRE the corresponding AC_PROG_CC since the latter was
@@ -204,12 +191,13 @@
 AC_REQUIRE([AM_MAKE_INCLUDE])dnl
 AC_REQUIRE([AM_DEP_TRACK])dnl
 
-ifelse([$1], CC,   [depcc="$CC"   am_compiler_list=],
-       [$1], CXX,  [depcc="$CXX"  am_compiler_list=],
-       [$1], OBJC, [depcc="$OBJC" am_compiler_list='gcc3 gcc'],
-       [$1], UPC,  [depcc="$UPC"  am_compiler_list=],
-       [$1], GCJ,  [depcc="$GCJ"  am_compiler_list='gcc3 gcc'],
-                   [depcc="$$1"   am_compiler_list=])
+m4_if([$1], [CC],   [depcc="$CC"   am_compiler_list=],
+      [$1], [CXX],  [depcc="$CXX"  am_compiler_list=],
+      [$1], [OBJC], [depcc="$OBJC" am_compiler_list='gcc3 gcc'],
+      [$1], [OBJCXX], [depcc="$OBJCXX" am_compiler_list='gcc3 gcc'],
+      [$1], [UPC],  [depcc="$UPC"  am_compiler_list=],
+      [$1], [GCJ],  [depcc="$GCJ"  am_compiler_list='gcc3 gcc'],
+                    [depcc="$$1"   am_compiler_list=])
 
 AC_CACHE_CHECK([dependency style of $depcc],
                [am_cv_$1_dependencies_compiler_type],
@@ -217,8 +205,8 @@
   # We make a subdir and do the tests there.  Otherwise we can end up
   # making bogus files that we don't know about and never remove.  For
   # instance it was reported that on HP-UX the gcc test will end up
-  # making a dummy file named `D' -- because `-MD' means `put the output
-  # in D'.
+  # making a dummy file named 'D' -- because '-MD' means "put the output
+  # in D".
   rm -rf conftest.dir
   mkdir conftest.dir
   # Copy depcomp to subdir because otherwise we won't find it if we're
@@ -258,16 +246,16 @@
     : > sub/conftest.c
     for i in 1 2 3 4 5 6; do
       echo '#include "conftst'$i'.h"' >> sub/conftest.c
-      # Using `: > sub/conftst$i.h' creates only sub/conftst1.h with
-      # Solaris 8's {/usr,}/bin/sh.
-      touch sub/conftst$i.h
+      # Using ": > sub/conftst$i.h" creates only sub/conftst1.h with
+      # Solaris 10 /bin/sh.
+      echo '/* dummy */' > sub/conftst$i.h
     done
     echo "${am__include} ${am__quote}sub/conftest.Po${am__quote}" > confmf
 
-    # We check with `-c' and `-o' for the sake of the "dashmstdout"
+    # We check with '-c' and '-o' for the sake of the "dashmstdout"
     # mode.  It turns out that the SunPro C++ compiler does not properly
-    # handle `-M -o', and we need to detect this.  Also, some Intel
-    # versions had trouble with output in subdirs
+    # handle '-M -o', and we need to detect this.  Also, some Intel
+    # versions had trouble with output in subdirs.
     am__obj=sub/conftest.${OBJEXT-o}
     am__minus_obj="-o $am__obj"
     case $depmode in
@@ -276,8 +264,8 @@
       test "$am__universal" = false || continue
       ;;
     nosideeffect)
-      # after this tag, mechanisms are not by side-effect, so they'll
-      # only be used when explicitly requested
+      # After this tag, mechanisms are not by side-effect, so they'll
+      # only be used when explicitly requested.
       if test "x$enable_dependency_tracking" = xyes; then
 	continue
       else
@@ -285,7 +273,7 @@
       fi
       ;;
     msvc7 | msvc7msys | msvisualcpp | msvcmsys)
-      # This compiler won't grok `-c -o', but also, the minuso test has
+      # This compiler won't grok '-c -o', but also, the minuso test has
       # not run yet.  These depmodes are late enough in the game, and
       # so weak that their functioning should not be impacted.
       am__obj=conftest.${OBJEXT-o}
@@ -333,7 +321,7 @@
 # AM_SET_DEPDIR
 # -------------
 # Choose a directory name for dependency files.
-# This macro is AC_REQUIREd in _AM_DEPENDENCIES
+# This macro is AC_REQUIREd in _AM_DEPENDENCIES.
 AC_DEFUN([AM_SET_DEPDIR],
 [AC_REQUIRE([AM_SET_LEADING_DOT])dnl
 AC_SUBST([DEPDIR], ["${am__leading_dot}deps"])dnl
@@ -343,9 +331,13 @@
 # AM_DEP_TRACK
 # ------------
 AC_DEFUN([AM_DEP_TRACK],
-[AC_ARG_ENABLE(dependency-tracking,
-[  --disable-dependency-tracking  speeds up one-time build
-  --enable-dependency-tracking   do not reject slow dependency extractors])
+[AC_ARG_ENABLE([dependency-tracking], [dnl
+AS_HELP_STRING(
+  [--enable-dependency-tracking],
+  [do not reject slow dependency extractors])
+AS_HELP_STRING(
+  [--disable-dependency-tracking],
+  [speeds up one-time build])])
 if test "x$enable_dependency_tracking" != xno; then
   am_depcomp="$ac_aux_dir/depcomp"
   AMDEPBACKSLASH='\'
@@ -360,20 +352,18 @@
 
 # Generate code to set up dependency tracking.              -*- Autoconf -*-
 
-# Copyright (C) 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2008
-# Free Software Foundation, Inc.
+# Copyright (C) 1999-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
 # with or without modifications, as long as this notice is preserved.
 
-#serial 5
 
 # _AM_OUTPUT_DEPENDENCY_COMMANDS
 # ------------------------------
 AC_DEFUN([_AM_OUTPUT_DEPENDENCY_COMMANDS],
 [{
-  # Autoconf 2.62 quotes --file arguments for eval, but not when files
+  # Older Autoconf quotes --file arguments for eval, but not when files
   # are listed without --file.  Let's play safe and only enable the eval
   # if we detect the quoting.
   case $CONFIG_FILES in
@@ -386,7 +376,7 @@
     # Strip MF so we end up with the name of the file.
     mf=`echo "$mf" | sed -e 's/:.*$//'`
     # Check whether this is an Automake generated Makefile or not.
-    # We used to match only the files named `Makefile.in', but
+    # We used to match only the files named 'Makefile.in', but
     # some people rename them; so instead we look at the file content.
     # Grep'ing the first line is not enough: some people post-process
     # each Makefile.in and add a new line on top of each file to say so.
@@ -398,21 +388,19 @@
       continue
     fi
     # Extract the definition of DEPDIR, am__include, and am__quote
-    # from the Makefile without running `make'.
+    # from the Makefile without running 'make'.
     DEPDIR=`sed -n 's/^DEPDIR = //p' < "$mf"`
     test -z "$DEPDIR" && continue
     am__include=`sed -n 's/^am__include = //p' < "$mf"`
-    test -z "am__include" && continue
+    test -z "$am__include" && continue
     am__quote=`sed -n 's/^am__quote = //p' < "$mf"`
-    # When using ansi2knr, U may be empty or an underscore; expand it
-    U=`sed -n 's/^U = //p' < "$mf"`
     # Find all dependency output files, they are included files with
     # $(DEPDIR) in their names.  We invoke sed twice because it is the
     # simplest approach to changing $(DEPDIR) to its actual value in the
     # expansion.
     for file in `sed -n "
       s/^$am__include $am__quote\(.*(DEPDIR).*\)$am__quote"'$/\1/p' <"$mf" | \
-	 sed -e 's/\$(DEPDIR)/'"$DEPDIR"'/g' -e 's/\$U/'"$U"'/g'`; do
+	 sed -e 's/\$(DEPDIR)/'"$DEPDIR"'/g'`; do
       # Make sure the directory exists.
       test -f "$dirpart/$file" && continue
       fdir=`AS_DIRNAME(["$file"])`
@@ -430,7 +418,7 @@
 # This macro should only be invoked once -- use via AC_REQUIRE.
 #
 # This code is only required when automatic dependency tracking
-# is enabled.  FIXME.  This creates each `.P' file that we will
+# is enabled.  FIXME.  This creates each '.P' file that we will
 # need in order to bootstrap the dependency handling code.
 AC_DEFUN([AM_OUTPUT_DEPENDENCY_COMMANDS],
 [AC_CONFIG_COMMANDS([depfiles],
@@ -440,18 +428,21 @@
 
 # Do all the work for Automake.                             -*- Autoconf -*-
 
-# Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
-# 2005, 2006, 2008, 2009 Free Software Foundation, Inc.
+# Copyright (C) 1996-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
 # with or without modifications, as long as this notice is preserved.
 
-# serial 16
-
 # This macro actually does too much.  Some checks are only needed if
 # your package does certain things.  But this isn't really a big deal.
 
+dnl Redefine AC_PROG_CC to automatically invoke _AM_PROG_CC_C_O.
+m4_define([AC_PROG_CC],
+m4_defn([AC_PROG_CC])
+[_AM_PROG_CC_C_O
+])
+
 # AM_INIT_AUTOMAKE(PACKAGE, VERSION, [NO-DEFINE])
 # AM_INIT_AUTOMAKE([OPTIONS])
 # -----------------------------------------------
@@ -464,7 +455,7 @@
 # arguments mandatory, and then we can depend on a new Autoconf
 # release and drop the old call support.
 AC_DEFUN([AM_INIT_AUTOMAKE],
-[AC_PREREQ([2.62])dnl
+[AC_PREREQ([2.65])dnl
 dnl Autoconf wants to disallow AM_ names.  We explicitly allow
 dnl the ones we care about.
 m4_pattern_allow([^AM_[A-Z]+FLAGS$])dnl
@@ -493,33 +484,42 @@
 # Define the identity of the package.
 dnl Distinguish between old-style and new-style calls.
 m4_ifval([$2],
-[m4_ifval([$3], [_AM_SET_OPTION([no-define])])dnl
+[AC_DIAGNOSE([obsolete],
+             [$0: two- and three-arguments forms are deprecated.])
+m4_ifval([$3], [_AM_SET_OPTION([no-define])])dnl
  AC_SUBST([PACKAGE], [$1])dnl
  AC_SUBST([VERSION], [$2])],
 [_AM_SET_OPTIONS([$1])dnl
 dnl Diagnose old-style AC_INIT with new-style AM_AUTOMAKE_INIT.
-m4_if(m4_ifdef([AC_PACKAGE_NAME], 1)m4_ifdef([AC_PACKAGE_VERSION], 1), 11,,
+m4_if(
+  m4_ifdef([AC_PACKAGE_NAME], [ok]):m4_ifdef([AC_PACKAGE_VERSION], [ok]),
+  [ok:ok],,
   [m4_fatal([AC_INIT should be called with package and version arguments])])dnl
  AC_SUBST([PACKAGE], ['AC_PACKAGE_TARNAME'])dnl
  AC_SUBST([VERSION], ['AC_PACKAGE_VERSION'])])dnl
 
 _AM_IF_OPTION([no-define],,
-[AC_DEFINE_UNQUOTED(PACKAGE, "$PACKAGE", [Name of package])
- AC_DEFINE_UNQUOTED(VERSION, "$VERSION", [Version number of package])])dnl
+[AC_DEFINE_UNQUOTED([PACKAGE], ["$PACKAGE"], [Name of package])
+ AC_DEFINE_UNQUOTED([VERSION], ["$VERSION"], [Version number of package])])dnl
 
 # Some tools Automake needs.
 AC_REQUIRE([AM_SANITY_CHECK])dnl
 AC_REQUIRE([AC_ARG_PROGRAM])dnl
-AM_MISSING_PROG(ACLOCAL, aclocal-${am__api_version})
-AM_MISSING_PROG(AUTOCONF, autoconf)
-AM_MISSING_PROG(AUTOMAKE, automake-${am__api_version})
-AM_MISSING_PROG(AUTOHEADER, autoheader)
-AM_MISSING_PROG(MAKEINFO, makeinfo)
+AM_MISSING_PROG([ACLOCAL], [aclocal-${am__api_version}])
+AM_MISSING_PROG([AUTOCONF], [autoconf])
+AM_MISSING_PROG([AUTOMAKE], [automake-${am__api_version}])
+AM_MISSING_PROG([AUTOHEADER], [autoheader])
+AM_MISSING_PROG([MAKEINFO], [makeinfo])
 AC_REQUIRE([AM_PROG_INSTALL_SH])dnl
 AC_REQUIRE([AM_PROG_INSTALL_STRIP])dnl
-AC_REQUIRE([AM_PROG_MKDIR_P])dnl
-# We need awk for the "check" target.  The system "awk" is bad on
-# some platforms.
+AC_REQUIRE([AC_PROG_MKDIR_P])dnl
+# For better backward compatibility.  To be removed once Automake 1.9.x
+# dies out for good.  For more background, see:
+# <http://lists.gnu.org/archive/html/automake/2012-07/msg00001.html>
+# <http://lists.gnu.org/archive/html/automake/2012-07/msg00014.html>
+AC_SUBST([mkdir_p], ['$(MKDIR_P)'])
+# We need awk for the "check" target (and possibly the TAP driver).  The
+# system "awk" is bad on some platforms.
 AC_REQUIRE([AC_PROG_AWK])dnl
 AC_REQUIRE([AC_PROG_MAKE_SET])dnl
 AC_REQUIRE([AM_SET_LEADING_DOT])dnl
@@ -528,34 +528,82 @@
 			     [_AM_PROG_TAR([v7])])])
 _AM_IF_OPTION([no-dependencies],,
 [AC_PROVIDE_IFELSE([AC_PROG_CC],
-		  [_AM_DEPENDENCIES(CC)],
-		  [define([AC_PROG_CC],
-			  defn([AC_PROG_CC])[_AM_DEPENDENCIES(CC)])])dnl
+		  [_AM_DEPENDENCIES([CC])],
+		  [m4_define([AC_PROG_CC],
+			     m4_defn([AC_PROG_CC])[_AM_DEPENDENCIES([CC])])])dnl
 AC_PROVIDE_IFELSE([AC_PROG_CXX],
-		  [_AM_DEPENDENCIES(CXX)],
-		  [define([AC_PROG_CXX],
-			  defn([AC_PROG_CXX])[_AM_DEPENDENCIES(CXX)])])dnl
+		  [_AM_DEPENDENCIES([CXX])],
+		  [m4_define([AC_PROG_CXX],
+			     m4_defn([AC_PROG_CXX])[_AM_DEPENDENCIES([CXX])])])dnl
 AC_PROVIDE_IFELSE([AC_PROG_OBJC],
-		  [_AM_DEPENDENCIES(OBJC)],
-		  [define([AC_PROG_OBJC],
-			  defn([AC_PROG_OBJC])[_AM_DEPENDENCIES(OBJC)])])dnl
+		  [_AM_DEPENDENCIES([OBJC])],
+		  [m4_define([AC_PROG_OBJC],
+			     m4_defn([AC_PROG_OBJC])[_AM_DEPENDENCIES([OBJC])])])dnl
+AC_PROVIDE_IFELSE([AC_PROG_OBJCXX],
+		  [_AM_DEPENDENCIES([OBJCXX])],
+		  [m4_define([AC_PROG_OBJCXX],
+			     m4_defn([AC_PROG_OBJCXX])[_AM_DEPENDENCIES([OBJCXX])])])dnl
 ])
-_AM_IF_OPTION([silent-rules], [AC_REQUIRE([AM_SILENT_RULES])])dnl
-dnl The `parallel-tests' driver may need to know about EXEEXT, so add the
-dnl `am__EXEEXT' conditional if _AM_COMPILER_EXEEXT was seen.  This macro
-dnl is hooked onto _AC_COMPILER_EXEEXT early, see below.
+AC_REQUIRE([AM_SILENT_RULES])dnl
+dnl The testsuite driver may need to know about EXEEXT, so add the
+dnl 'am__EXEEXT' conditional if _AM_COMPILER_EXEEXT was seen.  This
+dnl macro is hooked onto _AC_COMPILER_EXEEXT early, see below.
 AC_CONFIG_COMMANDS_PRE(dnl
 [m4_provide_if([_AM_COMPILER_EXEEXT],
   [AM_CONDITIONAL([am__EXEEXT], [test -n "$EXEEXT"])])])dnl
+
+# POSIX will say in a future version that running "rm -f" with no argument
+# is OK; and we want to be able to make that assumption in our Makefile
+# recipes.  So use an aggressive probe to check that the usage we want is
+# actually supported "in the wild" to an acceptable degree.
+# See automake bug#10828.
+# To make any issue more visible, cause the running configure to be aborted
+# by default if the 'rm' program in use doesn't match our expectations; the
+# user can still override this though.
+if rm -f && rm -fr && rm -rf; then : OK; else
+  cat >&2 <<'END'
+Oops!
+
+Your 'rm' program seems unable to run without file operands specified
+on the command line, even when the '-f' option is present.  This is contrary
+to the behaviour of most rm programs out there, and not conforming with
+the upcoming POSIX standard: <http://austingroupbugs.net/view.php?id=542>
+
+Please tell bug-automake@gnu.org about your system, including the value
+of your $PATH and any error possibly output before this message.  This
+can help us improve future automake versions.
+
+END
+  if test x"$ACCEPT_INFERIOR_RM_PROGRAM" = x"yes"; then
+    echo 'Configuration will proceed anyway, since you have set the' >&2
+    echo 'ACCEPT_INFERIOR_RM_PROGRAM variable to "yes"' >&2
+    echo >&2
+  else
+    cat >&2 <<'END'
+Aborting the configuration process, to ensure you take notice of the issue.
+
+You can download and install GNU coreutils to get an 'rm' implementation
+that behaves properly: <http://www.gnu.org/software/coreutils/>.
+
+If you want to complete the configuration process using your problematic
+'rm' anyway, export the environment variable ACCEPT_INFERIOR_RM_PROGRAM
+to "yes", and re-run configure.
+
+END
+    AC_MSG_ERROR([Your 'rm' program is bad, sorry.])
+  fi
+fi
+dnl The trailing newline in this macro's definition is deliberate, for
+dnl backward compatibility and to allow trailing 'dnl'-style comments
+dnl after the AM_INIT_AUTOMAKE invocation. See automake bug#16841.
 ])
 
-dnl Hook into `_AC_COMPILER_EXEEXT' early to learn its expansion.  Do not
+dnl Hook into '_AC_COMPILER_EXEEXT' early to learn its expansion.  Do not
 dnl add the conditional right here, as _AC_COMPILER_EXEEXT may be further
 dnl mangled by Autoconf and run in a shell conditional statement.
 m4_define([_AC_COMPILER_EXEEXT],
 m4_defn([_AC_COMPILER_EXEEXT])[m4_provide([_AM_COMPILER_EXEEXT])])
 
-
 # When config.status generates a header, we must update the stamp-h file.
 # This file resides in the same directory as the config header
 # that is generated.  The stamp files are numbered to have different names.
@@ -577,21 +625,18 @@
 done
 echo "timestamp for $_am_arg" >`AS_DIRNAME(["$_am_arg"])`/stamp-h[]$_am_stamp_count])
 
-# Copyright (C) 2001, 2003, 2005, 2008, 2011 Free Software Foundation,
-# Inc.
+# Copyright (C) 2001-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
 # with or without modifications, as long as this notice is preserved.
 
-# serial 1
-
 # AM_PROG_INSTALL_SH
 # ------------------
 # Define $install_sh.
 AC_DEFUN([AM_PROG_INSTALL_SH],
 [AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl
-if test x"${install_sh}" != xset; then
+if test x"${install_sh+set}" != xset; then
   case $am_aux_dir in
   *\ * | *\	*)
     install_sh="\${SHELL} '$am_aux_dir/install-sh'" ;;
@@ -599,16 +644,14 @@
     install_sh="\${SHELL} $am_aux_dir/install-sh"
   esac
 fi
-AC_SUBST(install_sh)])
+AC_SUBST([install_sh])])
 
-# Copyright (C) 2003, 2005  Free Software Foundation, Inc.
+# Copyright (C) 2003-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
 # with or without modifications, as long as this notice is preserved.
 
-# serial 2
-
 # Check whether the underlying file-system supports filenames
 # with a leading dot.  For instance MS-DOS doesn't.
 AC_DEFUN([AM_SET_LEADING_DOT],
@@ -625,20 +668,17 @@
 # Add --enable-maintainer-mode option to configure.         -*- Autoconf -*-
 # From Jim Meyering
 
-# Copyright (C) 1996, 1998, 2000, 2001, 2002, 2003, 2004, 2005, 2008,
-# 2011 Free Software Foundation, Inc.
+# Copyright (C) 1996-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
 # with or without modifications, as long as this notice is preserved.
 
-# serial 5
-
 # AM_MAINTAINER_MODE([DEFAULT-MODE])
 # ----------------------------------
 # Control maintainer-specific portions of Makefiles.
-# Default is to disable them, unless `enable' is passed literally.
-# For symmetry, `disable' may be passed as well.  Anyway, the user
+# Default is to disable them, unless 'enable' is passed literally.
+# For symmetry, 'disable' may be passed as well.  Anyway, the user
 # can override the default with the --enable/--disable switch.
 AC_DEFUN([AM_MAINTAINER_MODE],
 [m4_case(m4_default([$1], [disable]),
@@ -649,10 +689,11 @@
 AC_MSG_CHECKING([whether to enable maintainer-specific portions of Makefiles])
   dnl maintainer-mode's default is 'disable' unless 'enable' is passed
   AC_ARG_ENABLE([maintainer-mode],
-[  --][am_maintainer_other][-maintainer-mode  am_maintainer_other make rules and dependencies not useful
-			  (and sometimes confusing) to the casual installer],
-      [USE_MAINTAINER_MODE=$enableval],
-      [USE_MAINTAINER_MODE=]m4_if(am_maintainer_other, [enable], [no], [yes]))
+    [AS_HELP_STRING([--]am_maintainer_other[-maintainer-mode],
+      am_maintainer_other[ make rules and dependencies not useful
+      (and sometimes confusing) to the casual installer])],
+    [USE_MAINTAINER_MODE=$enableval],
+    [USE_MAINTAINER_MODE=]m4_if(am_maintainer_other, [enable], [no], [yes]))
   AC_MSG_RESULT([$USE_MAINTAINER_MODE])
   AM_CONDITIONAL([MAINTAINER_MODE], [test $USE_MAINTAINER_MODE = yes])
   MAINT=$MAINTAINER_MODE_TRUE
@@ -660,18 +701,14 @@
 ]
 )
 
-AU_DEFUN([jm_MAINTAINER_MODE], [AM_MAINTAINER_MODE])
-
 # Check to see how 'make' treats includes.	            -*- Autoconf -*-
 
-# Copyright (C) 2001, 2002, 2003, 2005, 2009  Free Software Foundation, Inc.
+# Copyright (C) 2001-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
 # with or without modifications, as long as this notice is preserved.
 
-# serial 4
-
 # AM_MAKE_INCLUDE()
 # -----------------
 # Check to see how make treats includes.
@@ -689,7 +726,7 @@
 _am_result=none
 # First try GNU make style include.
 echo "include confinc" > confmf
-# Ignore all kinds of additional output from `make'.
+# Ignore all kinds of additional output from 'make'.
 case `$am_make -s -f confmf 2> /dev/null` in #(
 *the\ am__doit\ target*)
   am__include=include
@@ -714,52 +751,14 @@
 rm -f confinc confmf
 ])
 
-# Copyright (C) 1999, 2000, 2001, 2003, 2004, 2005, 2008
-# Free Software Foundation, Inc.
-#
-# This file is free software; the Free Software Foundation
-# gives unlimited permission to copy and/or distribute it,
-# with or without modifications, as long as this notice is preserved.
-
-# serial 6
-
-# AM_PROG_CC_C_O
-# --------------
-# Like AC_PROG_CC_C_O, but changed for automake.
-AC_DEFUN([AM_PROG_CC_C_O],
-[AC_REQUIRE([AC_PROG_CC_C_O])dnl
-AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl
-AC_REQUIRE_AUX_FILE([compile])dnl
-# FIXME: we rely on the cache variable name because
-# there is no other way.
-set dummy $CC
-am_cc=`echo $[2] | sed ['s/[^a-zA-Z0-9_]/_/g;s/^[0-9]/_/']`
-eval am_t=\$ac_cv_prog_cc_${am_cc}_c_o
-if test "$am_t" != yes; then
-   # Losing compiler, so override with the script.
-   # FIXME: It is wrong to rewrite CC.
-   # But if we don't then we get into trouble of one sort or another.
-   # A longer-term fix would be to have automake use am__CC in this case,
-   # and then we could set am__CC="\$(top_srcdir)/compile \$(CC)"
-   CC="$am_aux_dir/compile $CC"
-fi
-dnl Make sure AC_PROG_CC is never called again, or it will override our
-dnl setting of CC.
-m4_define([AC_PROG_CC],
-          [m4_fatal([AC_PROG_CC cannot be called after AM_PROG_CC_C_O])])
-])
-
 # Fake the existence of programs that GNU maintainers use.  -*- Autoconf -*-
 
-# Copyright (C) 1997, 1999, 2000, 2001, 2003, 2004, 2005, 2008
-# Free Software Foundation, Inc.
+# Copyright (C) 1997-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
 # with or without modifications, as long as this notice is preserved.
 
-# serial 6
-
 # AM_MISSING_PROG(NAME, PROGRAM)
 # ------------------------------
 AC_DEFUN([AM_MISSING_PROG],
@@ -767,11 +766,10 @@
 $1=${$1-"${am_missing_run}$2"}
 AC_SUBST($1)])
 
-
 # AM_MISSING_HAS_RUN
 # ------------------
-# Define MISSING if not defined so far and test if it supports --run.
-# If it does, set am_missing_run to use it, otherwise, to nothing.
+# Define MISSING if not defined so far and test if it is modern enough.
+# If it is, set am_missing_run to use it, otherwise, to nothing.
 AC_DEFUN([AM_MISSING_HAS_RUN],
 [AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl
 AC_REQUIRE_AUX_FILE([missing])dnl
@@ -784,54 +782,22 @@
   esac
 fi
 # Use eval to expand $SHELL
-if eval "$MISSING --run true"; then
-  am_missing_run="$MISSING --run "
+if eval "$MISSING --is-lightweight"; then
+  am_missing_run="$MISSING "
 else
   am_missing_run=
-  AC_MSG_WARN([`missing' script is too old or missing])
+  AC_MSG_WARN(['missing' script is too old or missing])
 fi
 ])
 
-# Copyright (C) 2003, 2004, 2005, 2006, 2011 Free Software Foundation,
-# Inc.
-#
-# This file is free software; the Free Software Foundation
-# gives unlimited permission to copy and/or distribute it,
-# with or without modifications, as long as this notice is preserved.
-
-# serial 1
-
-# AM_PROG_MKDIR_P
-# ---------------
-# Check for `mkdir -p'.
-AC_DEFUN([AM_PROG_MKDIR_P],
-[AC_PREREQ([2.60])dnl
-AC_REQUIRE([AC_PROG_MKDIR_P])dnl
-dnl Automake 1.8 to 1.9.6 used to define mkdir_p.  We now use MKDIR_P,
-dnl while keeping a definition of mkdir_p for backward compatibility.
-dnl @MKDIR_P@ is magic: AC_OUTPUT adjusts its value for each Makefile.
-dnl However we cannot define mkdir_p as $(MKDIR_P) for the sake of
-dnl Makefile.ins that do not define MKDIR_P, so we do our own
-dnl adjustment using top_builddir (which is defined more often than
-dnl MKDIR_P).
-AC_SUBST([mkdir_p], ["$MKDIR_P"])dnl
-case $mkdir_p in
-  [[\\/$]]* | ?:[[\\/]]*) ;;
-  */*) mkdir_p="\$(top_builddir)/$mkdir_p" ;;
-esac
-])
-
 # Helper functions for option handling.                     -*- Autoconf -*-
 
-# Copyright (C) 2001, 2002, 2003, 2005, 2008, 2010 Free Software
-# Foundation, Inc.
+# Copyright (C) 2001-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
 # with or without modifications, as long as this notice is preserved.
 
-# serial 5
-
 # _AM_MANGLE_OPTION(NAME)
 # -----------------------
 AC_DEFUN([_AM_MANGLE_OPTION],
@@ -841,7 +807,7 @@
 # --------------------
 # Set option NAME.  Presently that only means defining a flag for this option.
 AC_DEFUN([_AM_SET_OPTION],
-[m4_define(_AM_MANGLE_OPTION([$1]), 1)])
+[m4_define(_AM_MANGLE_OPTION([$1]), [1])])
 
 # _AM_SET_OPTIONS(OPTIONS)
 # ------------------------
@@ -855,24 +821,82 @@
 AC_DEFUN([_AM_IF_OPTION],
 [m4_ifset(_AM_MANGLE_OPTION([$1]), [$2], [$3])])
 
-# Check to make sure that the build environment is sane.    -*- Autoconf -*-
-
-# Copyright (C) 1996, 1997, 2000, 2001, 2003, 2005, 2008
-# Free Software Foundation, Inc.
+# Copyright (C) 1999-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
 # with or without modifications, as long as this notice is preserved.
 
-# serial 5
+# _AM_PROG_CC_C_O
+# ---------------
+# Like AC_PROG_CC_C_O, but changed for automake.  We rewrite AC_PROG_CC
+# to automatically call this.
+AC_DEFUN([_AM_PROG_CC_C_O],
+[AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl
+AC_REQUIRE_AUX_FILE([compile])dnl
+AC_LANG_PUSH([C])dnl
+AC_CACHE_CHECK(
+  [whether $CC understands -c and -o together],
+  [am_cv_prog_cc_c_o],
+  [AC_LANG_CONFTEST([AC_LANG_PROGRAM([])])
+  # Make sure it works both with $CC and with simple cc.
+  # Following AC_PROG_CC_C_O, we do the test twice because some
+  # compilers refuse to overwrite an existing .o file with -o,
+  # though they will create one.
+  am_cv_prog_cc_c_o=yes
+  for am_i in 1 2; do
+    if AM_RUN_LOG([$CC -c conftest.$ac_ext -o conftest2.$ac_objext]) \
+         && test -f conftest2.$ac_objext; then
+      : OK
+    else
+      am_cv_prog_cc_c_o=no
+      break
+    fi
+  done
+  rm -f core conftest*
+  unset am_i])
+if test "$am_cv_prog_cc_c_o" != yes; then
+   # Losing compiler, so override with the script.
+   # FIXME: It is wrong to rewrite CC.
+   # But if we don't then we get into trouble of one sort or another.
+   # A longer-term fix would be to have automake use am__CC in this case,
+   # and then we could set am__CC="\$(top_srcdir)/compile \$(CC)"
+   CC="$am_aux_dir/compile $CC"
+fi
+AC_LANG_POP([C])])
+
+# For backward compatibility.
+AC_DEFUN_ONCE([AM_PROG_CC_C_O], [AC_REQUIRE([AC_PROG_CC])])
+
+# Copyright (C) 2001-2014 Free Software Foundation, Inc.
+#
+# This file is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# AM_RUN_LOG(COMMAND)
+# -------------------
+# Run COMMAND, save the exit status in ac_status, and log it.
+# (This has been adapted from Autoconf's _AC_RUN_LOG macro.)
+AC_DEFUN([AM_RUN_LOG],
+[{ echo "$as_me:$LINENO: $1" >&AS_MESSAGE_LOG_FD
+   ($1) >&AS_MESSAGE_LOG_FD 2>&AS_MESSAGE_LOG_FD
+   ac_status=$?
+   echo "$as_me:$LINENO: \$? = $ac_status" >&AS_MESSAGE_LOG_FD
+   (exit $ac_status); }])
+
+# Check to make sure that the build environment is sane.    -*- Autoconf -*-
+
+# Copyright (C) 1996-2014 Free Software Foundation, Inc.
+#
+# This file is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
 
 # AM_SANITY_CHECK
 # ---------------
 AC_DEFUN([AM_SANITY_CHECK],
 [AC_MSG_CHECKING([whether build environment is sane])
-# Just in case
-sleep 1
-echo timestamp > conftest.file
 # Reject unsafe characters in $srcdir or the absolute working directory
 # name.  Accept space and tab only in the latter.
 am_lf='
@@ -883,32 +907,40 @@
 esac
 case $srcdir in
   *[[\\\"\#\$\&\'\`$am_lf\ \	]]*)
-    AC_MSG_ERROR([unsafe srcdir value: `$srcdir']);;
+    AC_MSG_ERROR([unsafe srcdir value: '$srcdir']);;
 esac
 
-# Do `set' in a subshell so we don't clobber the current shell's
+# Do 'set' in a subshell so we don't clobber the current shell's
 # arguments.  Must try -L first in case configure is actually a
 # symlink; some systems play weird games with the mod time of symlinks
 # (eg FreeBSD returns the mod time of the symlink's containing
 # directory).
 if (
-   set X `ls -Lt "$srcdir/configure" conftest.file 2> /dev/null`
-   if test "$[*]" = "X"; then
-      # -L didn't work.
-      set X `ls -t "$srcdir/configure" conftest.file`
-   fi
-   rm -f conftest.file
-   if test "$[*]" != "X $srcdir/configure conftest.file" \
-      && test "$[*]" != "X conftest.file $srcdir/configure"; then
+   am_has_slept=no
+   for am_try in 1 2; do
+     echo "timestamp, slept: $am_has_slept" > conftest.file
+     set X `ls -Lt "$srcdir/configure" conftest.file 2> /dev/null`
+     if test "$[*]" = "X"; then
+	# -L didn't work.
+	set X `ls -t "$srcdir/configure" conftest.file`
+     fi
+     if test "$[*]" != "X $srcdir/configure conftest.file" \
+	&& test "$[*]" != "X conftest.file $srcdir/configure"; then
 
-      # If neither matched, then we have a broken ls.  This can happen
-      # if, for instance, CONFIG_SHELL is bash and it inherits a
-      # broken ls alias from the environment.  This has actually
-      # happened.  Such a system could not be considered "sane".
-      AC_MSG_ERROR([ls -t appears to fail.  Make sure there is not a broken
-alias in your environment])
-   fi
-
+	# If neither matched, then we have a broken ls.  This can happen
+	# if, for instance, CONFIG_SHELL is bash and it inherits a
+	# broken ls alias from the environment.  This has actually
+	# happened.  Such a system could not be considered "sane".
+	AC_MSG_ERROR([ls -t appears to fail.  Make sure there is not a broken
+  alias in your environment])
+     fi
+     if test "$[2]" = conftest.file || test $am_try -eq 2; then
+       break
+     fi
+     # Just in case.
+     sleep 1
+     am_has_slept=yes
+   done
    test "$[2]" = conftest.file
    )
 then
@@ -918,31 +950,50 @@
    AC_MSG_ERROR([newly created file is older than distributed files!
 Check your system clock])
 fi
-AC_MSG_RESULT(yes)])
+AC_MSG_RESULT([yes])
+# If we didn't sleep, we still need to ensure time stamps of config.status and
+# generated files are strictly newer.
+am_sleep_pid=
+if grep 'slept: no' conftest.file >/dev/null 2>&1; then
+  ( sleep 1 ) &
+  am_sleep_pid=$!
+fi
+AC_CONFIG_COMMANDS_PRE(
+  [AC_MSG_CHECKING([that generated files are newer than configure])
+   if test -n "$am_sleep_pid"; then
+     # Hide warnings about reused PIDs.
+     wait $am_sleep_pid 2>/dev/null
+   fi
+   AC_MSG_RESULT([done])])
+rm -f conftest.file
+])
 
-# Copyright (C) 2009, 2011  Free Software Foundation, Inc.
+# Copyright (C) 2009-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
 # with or without modifications, as long as this notice is preserved.
 
-# serial 2
-
 # AM_SILENT_RULES([DEFAULT])
 # --------------------------
 # Enable less verbose build rules; with the default set to DEFAULT
-# (`yes' being less verbose, `no' or empty being verbose).
+# ("yes" being less verbose, "no" or empty being verbose).
 AC_DEFUN([AM_SILENT_RULES],
-[AC_ARG_ENABLE([silent-rules],
-[  --enable-silent-rules          less verbose build output (undo: `make V=1')
-  --disable-silent-rules         verbose build output (undo: `make V=0')])
-case $enable_silent_rules in
-yes) AM_DEFAULT_VERBOSITY=0;;
-no)  AM_DEFAULT_VERBOSITY=1;;
-*)   AM_DEFAULT_VERBOSITY=m4_if([$1], [yes], [0], [1]);;
+[AC_ARG_ENABLE([silent-rules], [dnl
+AS_HELP_STRING(
+  [--enable-silent-rules],
+  [less verbose build output (undo: "make V=1")])
+AS_HELP_STRING(
+  [--disable-silent-rules],
+  [verbose build output (undo: "make V=0")])dnl
+])
+case $enable_silent_rules in @%:@ (((
+  yes) AM_DEFAULT_VERBOSITY=0;;
+   no) AM_DEFAULT_VERBOSITY=1;;
+    *) AM_DEFAULT_VERBOSITY=m4_if([$1], [yes], [0], [1]);;
 esac
 dnl
-dnl A few `make' implementations (e.g., NonStop OS and NextStep)
+dnl A few 'make' implementations (e.g., NonStop OS and NextStep)
 dnl do not support nested variable expansions.
 dnl See automake bug#9928 and bug#10237.
 am_make=${MAKE-make}
@@ -960,7 +1011,7 @@
   am_cv_make_support_nested_variables=no
 fi])
 if test $am_cv_make_support_nested_variables = yes; then
-  dnl Using `$V' instead of `$(V)' breaks IRIX make.
+  dnl Using '$V' instead of '$(V)' breaks IRIX make.
   AM_V='$(V)'
   AM_DEFAULT_V='$(AM_DEFAULT_VERBOSITY)'
 else
@@ -977,44 +1028,40 @@
 _AM_SUBST_NOTMAKE([AM_BACKSLASH])dnl
 ])
 
-# Copyright (C) 2001, 2003, 2005, 2011 Free Software Foundation, Inc.
+# Copyright (C) 2001-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
 # with or without modifications, as long as this notice is preserved.
 
-# serial 1
-
 # AM_PROG_INSTALL_STRIP
 # ---------------------
-# One issue with vendor `install' (even GNU) is that you can't
+# One issue with vendor 'install' (even GNU) is that you can't
 # specify the program used to strip binaries.  This is especially
 # annoying in cross-compiling environments, where the build's strip
 # is unlikely to handle the host's binaries.
 # Fortunately install-sh will honor a STRIPPROG variable, so we
-# always use install-sh in `make install-strip', and initialize
+# always use install-sh in "make install-strip", and initialize
 # STRIPPROG with the value of the STRIP variable (set by the user).
 AC_DEFUN([AM_PROG_INSTALL_STRIP],
 [AC_REQUIRE([AM_PROG_INSTALL_SH])dnl
-# Installed binaries are usually stripped using `strip' when the user
-# run `make install-strip'.  However `strip' might not be the right
+# Installed binaries are usually stripped using 'strip' when the user
+# run "make install-strip".  However 'strip' might not be the right
 # tool to use in cross-compilation environments, therefore Automake
-# will honor the `STRIP' environment variable to overrule this program.
-dnl Don't test for $cross_compiling = yes, because it might be `maybe'.
+# will honor the 'STRIP' environment variable to overrule this program.
+dnl Don't test for $cross_compiling = yes, because it might be 'maybe'.
 if test "$cross_compiling" != no; then
   AC_CHECK_TOOL([STRIP], [strip], :)
 fi
 INSTALL_STRIP_PROGRAM="\$(install_sh) -c -s"
 AC_SUBST([INSTALL_STRIP_PROGRAM])])
 
-# Copyright (C) 2006, 2008, 2010 Free Software Foundation, Inc.
+# Copyright (C) 2006-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
 # with or without modifications, as long as this notice is preserved.
 
-# serial 3
-
 # _AM_SUBST_NOTMAKE(VARIABLE)
 # ---------------------------
 # Prevent Automake from outputting VARIABLE = @VARIABLE@ in Makefile.in.
@@ -1028,18 +1075,16 @@
 
 # Check how to create a tarball.                            -*- Autoconf -*-
 
-# Copyright (C) 2004, 2005, 2012 Free Software Foundation, Inc.
+# Copyright (C) 2004-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
 # with or without modifications, as long as this notice is preserved.
 
-# serial 2
-
 # _AM_PROG_TAR(FORMAT)
 # --------------------
 # Check how to create a tarball in format FORMAT.
-# FORMAT should be one of `v7', `ustar', or `pax'.
+# FORMAT should be one of 'v7', 'ustar', or 'pax'.
 #
 # Substitute a variable $(am__tar) that is a command
 # writing to stdout a FORMAT-tarball containing the directory
@@ -1049,76 +1094,114 @@
 # Substitute a variable $(am__untar) that extract such
 # a tarball read from stdin.
 #     $(am__untar) < result.tar
+#
 AC_DEFUN([_AM_PROG_TAR],
 [# Always define AMTAR for backward compatibility.  Yes, it's still used
 # in the wild :-(  We should find a proper way to deprecate it ...
 AC_SUBST([AMTAR], ['$${TAR-tar}'])
-m4_if([$1], [v7],
-     [am__tar='$${TAR-tar} chof - "$$tardir"' am__untar='$${TAR-tar} xf -'],
-     [m4_case([$1], [ustar],, [pax],,
-              [m4_fatal([Unknown tar format])])
-AC_MSG_CHECKING([how to create a $1 tar archive])
-# Loop over all known methods to create a tar archive until one works.
+
+# We'll loop over all known methods to create a tar archive until one works.
 _am_tools='gnutar m4_if([$1], [ustar], [plaintar]) pax cpio none'
-_am_tools=${am_cv_prog_tar_$1-$_am_tools}
-# Do not fold the above two line into one, because Tru64 sh and
-# Solaris sh will not grok spaces in the rhs of `-'.
-for _am_tool in $_am_tools
-do
-  case $_am_tool in
-  gnutar)
-    for _am_tar in tar gnutar gtar;
-    do
-      AM_RUN_LOG([$_am_tar --version]) && break
-    done
-    am__tar="$_am_tar --format=m4_if([$1], [pax], [posix], [$1]) -chf - "'"$$tardir"'
-    am__tar_="$_am_tar --format=m4_if([$1], [pax], [posix], [$1]) -chf - "'"$tardir"'
-    am__untar="$_am_tar -xf -"
-    ;;
-  plaintar)
-    # Must skip GNU tar: if it does not support --format= it doesn't create
-    # ustar tarball either.
-    (tar --version) >/dev/null 2>&1 && continue
-    am__tar='tar chf - "$$tardir"'
-    am__tar_='tar chf - "$tardir"'
-    am__untar='tar xf -'
-    ;;
-  pax)
-    am__tar='pax -L -x $1 -w "$$tardir"'
-    am__tar_='pax -L -x $1 -w "$tardir"'
-    am__untar='pax -r'
-    ;;
-  cpio)
-    am__tar='find "$$tardir" -print | cpio -o -H $1 -L'
-    am__tar_='find "$tardir" -print | cpio -o -H $1 -L'
-    am__untar='cpio -i -H $1 -d'
-    ;;
-  none)
-    am__tar=false
-    am__tar_=false
-    am__untar=false
-    ;;
-  esac
 
-  # If the value was cached, stop now.  We just wanted to have am__tar
-  # and am__untar set.
-  test -n "${am_cv_prog_tar_$1}" && break
+m4_if([$1], [v7],
+  [am__tar='$${TAR-tar} chof - "$$tardir"' am__untar='$${TAR-tar} xf -'],
 
-  # tar/untar a dummy directory, and stop if the command works
+  [m4_case([$1],
+    [ustar],
+     [# The POSIX 1988 'ustar' format is defined with fixed-size fields.
+      # There is notably a 21 bits limit for the UID and the GID.  In fact,
+      # the 'pax' utility can hang on bigger UID/GID (see automake bug#8343
+      # and bug#13588).
+      am_max_uid=2097151 # 2^21 - 1
+      am_max_gid=$am_max_uid
+      # The $UID and $GID variables are not portable, so we need to resort
+      # to the POSIX-mandated id(1) utility.  Errors in the 'id' calls
+      # below are definitely unexpected, so allow the users to see them
+      # (that is, avoid stderr redirection).
+      am_uid=`id -u || echo unknown`
+      am_gid=`id -g || echo unknown`
+      AC_MSG_CHECKING([whether UID '$am_uid' is supported by ustar format])
+      if test $am_uid -le $am_max_uid; then
+         AC_MSG_RESULT([yes])
+      else
+         AC_MSG_RESULT([no])
+         _am_tools=none
+      fi
+      AC_MSG_CHECKING([whether GID '$am_gid' is supported by ustar format])
+      if test $am_gid -le $am_max_gid; then
+         AC_MSG_RESULT([yes])
+      else
+        AC_MSG_RESULT([no])
+        _am_tools=none
+      fi],
+
+  [pax],
+    [],
+
+  [m4_fatal([Unknown tar format])])
+
+  AC_MSG_CHECKING([how to create a $1 tar archive])
+
+  # Go ahead even if we have the value already cached.  We do so because we
+  # need to set the values for the 'am__tar' and 'am__untar' variables.
+  _am_tools=${am_cv_prog_tar_$1-$_am_tools}
+
+  for _am_tool in $_am_tools; do
+    case $_am_tool in
+    gnutar)
+      for _am_tar in tar gnutar gtar; do
+        AM_RUN_LOG([$_am_tar --version]) && break
+      done
+      am__tar="$_am_tar --format=m4_if([$1], [pax], [posix], [$1]) -chf - "'"$$tardir"'
+      am__tar_="$_am_tar --format=m4_if([$1], [pax], [posix], [$1]) -chf - "'"$tardir"'
+      am__untar="$_am_tar -xf -"
+      ;;
+    plaintar)
+      # Must skip GNU tar: if it does not support --format= it doesn't create
+      # ustar tarball either.
+      (tar --version) >/dev/null 2>&1 && continue
+      am__tar='tar chf - "$$tardir"'
+      am__tar_='tar chf - "$tardir"'
+      am__untar='tar xf -'
+      ;;
+    pax)
+      am__tar='pax -L -x $1 -w "$$tardir"'
+      am__tar_='pax -L -x $1 -w "$tardir"'
+      am__untar='pax -r'
+      ;;
+    cpio)
+      am__tar='find "$$tardir" -print | cpio -o -H $1 -L'
+      am__tar_='find "$tardir" -print | cpio -o -H $1 -L'
+      am__untar='cpio -i -H $1 -d'
+      ;;
+    none)
+      am__tar=false
+      am__tar_=false
+      am__untar=false
+      ;;
+    esac
+
+    # If the value was cached, stop now.  We just wanted to have am__tar
+    # and am__untar set.
+    test -n "${am_cv_prog_tar_$1}" && break
+
+    # tar/untar a dummy directory, and stop if the command works.
+    rm -rf conftest.dir
+    mkdir conftest.dir
+    echo GrepMe > conftest.dir/file
+    AM_RUN_LOG([tardir=conftest.dir && eval $am__tar_ >conftest.tar])
+    rm -rf conftest.dir
+    if test -s conftest.tar; then
+      AM_RUN_LOG([$am__untar <conftest.tar])
+      AM_RUN_LOG([cat conftest.dir/file])
+      grep GrepMe conftest.dir/file >/dev/null 2>&1 && break
+    fi
+  done
   rm -rf conftest.dir
-  mkdir conftest.dir
-  echo GrepMe > conftest.dir/file
-  AM_RUN_LOG([tardir=conftest.dir && eval $am__tar_ >conftest.tar])
-  rm -rf conftest.dir
-  if test -s conftest.tar; then
-    AM_RUN_LOG([$am__untar <conftest.tar])
-    grep GrepMe conftest.dir/file >/dev/null 2>&1 && break
-  fi
-done
-rm -rf conftest.dir
 
-AC_CACHE_VAL([am_cv_prog_tar_$1], [am_cv_prog_tar_$1=$_am_tool])
-AC_MSG_RESULT([$am_cv_prog_tar_$1])])
+  AC_CACHE_VAL([am_cv_prog_tar_$1], [am_cv_prog_tar_$1=$_am_tool])
+  AC_MSG_RESULT([$am_cv_prog_tar_$1])])
+
 AC_SUBST([am__tar])
 AC_SUBST([am__untar])
 ]) # _AM_PROG_TAR
@@ -1129,3 +1212,4 @@
 m4_include([m4/ltsugar.m4])
 m4_include([m4/ltversion.m4])
 m4_include([m4/lt~obsolete.m4])
+m4_include([m4/opus-intrinsics.m4])
diff --git a/celt/_kiss_fft_guts.h b/celt/_kiss_fft_guts.h
index aefe490..5e3d58f 100644
--- a/celt/_kiss_fft_guts.h
+++ b/celt/_kiss_fft_guts.h
@@ -65,10 +65,6 @@
       do{ (m).r = ADD32(S_MUL((a).r,(b).r) , S_MUL((a).i,(b).i)); \
           (m).i = SUB32(S_MUL((a).i,(b).r) , S_MUL((a).r,(b).i)); }while(0)
 
-#   define C_MUL4(m,a,b) \
-      do{ (m).r = SHR32(SUB32(S_MUL((a).r,(b).r) , S_MUL((a).i,(b).i)),2); \
-          (m).i = SHR32(ADD32(S_MUL((a).r,(b).i) , S_MUL((a).i,(b).r)),2); }while(0)
-
 #   define C_MULBYSCALAR( c, s ) \
       do{ (c).r =  S_MUL( (c).r , s ) ;\
           (c).i =  S_MUL( (c).i , s ) ; }while(0)
@@ -101,6 +97,9 @@
 #if defined(OPUS_ARM_INLINE_EDSP)
 #include "arm/kiss_fft_armv5e.h"
 #endif
+#if defined(MIPSr1_ASM)
+#include "mips/kiss_fft_mipsr1.h"
+#endif
 
 #else  /* not FIXED_POINT*/
 
diff --git a/celt/arch.h b/celt/arch.h
index 3bbcd36..9f74ddd 100644
--- a/celt/arch.h
+++ b/celt/arch.h
@@ -69,11 +69,8 @@
 
 #define IMUL32(a,b) ((a)*(b))
 
-#define ABS(x) ((x) < 0 ? (-(x)) : (x))      /**< Absolute integer value. */
-#define ABS16(x) ((x) < 0 ? (-(x)) : (x))    /**< Absolute 16-bit value.  */
 #define MIN16(a,b) ((a) < (b) ? (a) : (b))   /**< Minimum 16-bit value.   */
 #define MAX16(a,b) ((a) > (b) ? (a) : (b))   /**< Maximum 16-bit value.   */
-#define ABS32(x) ((x) < 0 ? (-(x)) : (x))    /**< Absolute 32-bit value.  */
 #define MIN32(a,b) ((a) < (b) ? (a) : (b))   /**< Minimum 32-bit value.   */
 #define MAX32(a,b) ((a) > (b) ? (a) : (b))   /**< Maximum 32-bit value.   */
 #define IMIN(a,b) ((a) < (b) ? (a) : (b))   /**< Minimum int value.   */
@@ -108,6 +105,13 @@
 #define SCALEIN(a)      (a)
 #define SCALEOUT(a)     (a)
 
+#define ABS16(x) ((x) < 0 ? (-(x)) : (x))
+#define ABS32(x) ((x) < 0 ? (-(x)) : (x))
+
+static OPUS_INLINE opus_int16 SAT16(opus_int32 x) {
+   return x > 32767 ? 32767 : x < -32768 ? -32768 : (opus_int16)x;
+}
+
 #ifdef FIXED_DEBUG
 #include "fixed_debug.h"
 #else
@@ -137,6 +141,22 @@
 typedef float celt_norm;
 typedef float celt_ener;
 
+#ifdef FLOAT_APPROX
+/* This code should reliably detect NaN/inf even when -ffast-math is used.
+   Assumes IEEE 754 format. */
+static OPUS_INLINE int celt_isnan(float x)
+{
+   union {float f; opus_uint32 i;} in;
+   in.f = x;
+   return ((in.i>>23)&0xFF)==0xFF && (in.i&0x007FFFFF)!=0;
+}
+#else
+#ifdef __FAST_MATH__
+#error Cannot build libopus with -ffast-math unless FLOAT_APPROX is defined. This could result in crashes on extreme (e.g. NaN) input
+#endif
+#define celt_isnan(x) ((x)!=(x))
+#endif
+
 #define Q15ONE 1.0f
 
 #define NORM_SCALING 1.f
@@ -146,6 +166,10 @@
 #define VERY_LARGE16 1e15f
 #define Q15_ONE ((opus_val16)1.f)
 
+/* This appears to be the same speed as C99's fabsf() but it's more portable. */
+#define ABS16(x) ((float)fabs(x))
+#define ABS32(x) ((float)fabs(x))
+
 #define QCONST16(x,bits) (x)
 #define QCONST32(x,bits) (x)
 
@@ -184,6 +208,7 @@
 #define MULT32_32_Q31(a,b)     ((a)*(b))
 
 #define MAC16_32_Q15(c,a,b)     ((c)+(a)*(b))
+#define MAC16_32_Q16(c,a,b)     ((c)+(a)*(b))
 
 #define MULT16_16_Q11_32(a,b)     ((a)*(b))
 #define MULT16_16_Q11(a,b)     ((a)*(b))
@@ -201,6 +226,8 @@
 #define SCALEIN(a)      ((a)*CELT_SIG_SCALE)
 #define SCALEOUT(a)     ((a)*(1/CELT_SIG_SCALE))
 
+#define SIG2WORD16(x) (x)
+
 #endif /* !FIXED_POINT */
 
 #ifndef GLOBAL_STACK_SIZE
diff --git a/celt/arm/arm2gnu.pl b/celt/arm/arm2gnu.pl
index eab42ef..6c922ac 100755
--- a/celt/arm/arm2gnu.pl
+++ b/celt/arm/arm2gnu.pl
@@ -1,7 +1,33 @@
 #!/usr/bin/perl
+# Copyright (C) 2002-2013 Xiph.org Foundation
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# - Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# - Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+# OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 my $bigend;  # little/big endian
 my $nxstack;
+my $apple = 0;
+my $symprefix = "";
 
 $nxstack = 0;
 
@@ -10,11 +36,16 @@
 
 while ($ARGV[0] =~ /^-/) {
     $_ = shift;
-  last if /^--/;
-    if (/^-n/) {
+  last if /^--$/;
+    if (/^-n$/) {
     $nflag++;
     next;
     }
+    if (/^--apple$/) {
+        $apple = 1;
+        $symprefix = "_";
+        next;
+    }
     die "I don't recognize this switch: $_\\n";
 }
 $printit++ unless $nflag;
@@ -25,6 +56,8 @@
 $thumb = 0;     # ARM mode by default, not Thumb.
 @proc_stack = ();
 
+printf ("    .syntax unified\n");
+
 LINE:
 while (<>) {
 
@@ -53,7 +86,7 @@
     s/\bINCLUDE[ \t]*([^ \t\n]+)/.include \"$1\"/;
     s/\bGET[ \t]*([^ \t\n]+)/.include \"${ my $x=$1; $x =~ s|\.s|-gnu.S|; \$x }\"/;
     s/\bIMPORT\b/.extern/;
-    s/\bEXPORT\b/.global/;
+    s/\bEXPORT\b\s*/.global $symprefix/;
     s/^(\s+)\[/$1IF/;
     s/^(\s+)\|/$1ELSE/;
     s/^(\s+)\]/$1ENDIF/;
@@ -109,7 +142,7 @@
             # won't match the original source file (we could use the .line
             # directive, which is documented to be obsolete, but then gdb will
             # show the wrong line in the translated source file).
-            s/$/;   .arch armv7-a\n   .fpu neon\n   .object_arch armv4t/;
+            s/$/;   .arch armv7-a\n   .fpu neon\n   .object_arch armv4t/ unless ($apple);
         }
     }
 
@@ -131,9 +164,13 @@
         $prefix = "";
         if ($proc)
         {
-            $prefix = $prefix.sprintf("\t.type\t%s, %%function; ",$proc);
+            $prefix = $prefix.sprintf("\t.type\t%s, %%function", $proc) unless ($apple);
+            # Always append "; " so $prefix isn't empty here (for the $apple case,
+            # where the .type directive above is skipped).  We handle mangling the
+            # label here, make sure it doesn't match the label handling below.
+            $prefix = $prefix."; ";
             push(@proc_stack, $proc);
-            s/^[A-Za-z_\.]\w+/$&:/;
+            s/^[A-Za-z_\.]\w+/$symprefix$&:/;
         }
         $prefix = $prefix."\t.thumb_func; " if ($thumb);
         s/\bPROC\b/@ $&/;
@@ -146,7 +183,7 @@
         my $proc;
         s/\bENDP\b/@ $&/;
         $proc = pop(@proc_stack);
-        $_ = "\t.size $proc, .-$proc".$_ if ($proc);
+        $_ = "\t.size $proc, .-$proc".$_ if ($proc && !$apple);
     }
     s/\bSUBT\b/@ $&/;
     s/\bDATA\b/@ $&/;   # DATA directive is deprecated -- Asm guide, p.7-25
@@ -311,6 +348,6 @@
 }
 #If we had a code section, mark that this object doesn't need an executable
 # stack.
-if ($nxstack) {
+if ($nxstack && !$apple) {
     printf ("    .section\t.note.GNU-stack,\"\",\%\%progbits\n");
 }
diff --git a/celt/arm/arm_celt_map.c b/celt/arm/arm_celt_map.c
index 547a84d..ee6c244 100644
--- a/celt/arm/arm_celt_map.c
+++ b/celt/arm/arm_celt_map.c
@@ -30,6 +30,8 @@
 #endif
 
 #include "pitch.h"
+#include "kiss_fft.h"
+#include "mdct.h"
 
 #if defined(OPUS_HAVE_RTCD)
 
@@ -41,9 +43,79 @@
   MAY_HAVE_MEDIA(celt_pitch_xcorr), /* Media */
   MAY_HAVE_NEON(celt_pitch_xcorr)   /* NEON */
 };
-# else
-#  error "Floating-point implementation is not supported by ARM asm yet." \
- "Reconfigure with --disable-rtcd or send patches."
-# endif
+# else /* !FIXED_POINT */
+#  if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
+void (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
+    const opus_val16 *, opus_val32 *, int, int) = {
+  celt_pitch_xcorr_c,              /* ARMv4 */
+  celt_pitch_xcorr_c,              /* EDSP */
+  celt_pitch_xcorr_c,              /* Media */
+  celt_pitch_xcorr_float_neon      /* Neon */
+};
+#  endif
+# endif /* FIXED_POINT */
 
-#endif
+# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
+#  if defined(HAVE_ARM_NE10)
+#   if defined(CUSTOM_MODES)
+int (*const OPUS_FFT_ALLOC_ARCH_IMPL[OPUS_ARCHMASK+1])(kiss_fft_state *st) = {
+   opus_fft_alloc_arch_c,        /* ARMv4 */
+   opus_fft_alloc_arch_c,        /* EDSP */
+   opus_fft_alloc_arch_c,        /* Media */
+   opus_fft_alloc_arm_neon       /* Neon with NE10 library support */
+};
+
+void (*const OPUS_FFT_FREE_ARCH_IMPL[OPUS_ARCHMASK+1])(kiss_fft_state *st) = {
+   opus_fft_free_arch_c,         /* ARMv4 */
+   opus_fft_free_arch_c,         /* EDSP */
+   opus_fft_free_arch_c,         /* Media */
+   opus_fft_free_arm_neon        /* Neon with NE10 */
+};
+#   endif /* CUSTOM_MODES */
+
+void (*const OPUS_FFT[OPUS_ARCHMASK+1])(const kiss_fft_state *cfg,
+                                        const kiss_fft_cpx *fin,
+                                        kiss_fft_cpx *fout) = {
+   opus_fft_c,                   /* ARMv4 */
+   opus_fft_c,                   /* EDSP */
+   opus_fft_c,                   /* Media */
+   opus_fft_neon                 /* Neon with NE10 */
+};
+
+void (*const OPUS_IFFT[OPUS_ARCHMASK+1])(const kiss_fft_state *cfg,
+                                         const kiss_fft_cpx *fin,
+                                         kiss_fft_cpx *fout) = {
+   opus_ifft_c,                   /* ARMv4 */
+   opus_ifft_c,                   /* EDSP */
+   opus_ifft_c,                   /* Media */
+   opus_ifft_neon                 /* Neon with NE10 */
+};
+
+void (*const CLT_MDCT_FORWARD_IMPL[OPUS_ARCHMASK+1])(const mdct_lookup *l,
+                                                     kiss_fft_scalar *in,
+                                                     kiss_fft_scalar * OPUS_RESTRICT out,
+                                                     const opus_val16 *window,
+                                                     int overlap, int shift,
+                                                     int stride, int arch) = {
+   clt_mdct_forward_c,           /* ARMv4 */
+   clt_mdct_forward_c,           /* EDSP */
+   clt_mdct_forward_c,           /* Media */
+   clt_mdct_forward_neon         /* Neon with NE10 */
+};
+
+void (*const CLT_MDCT_BACKWARD_IMPL[OPUS_ARCHMASK+1])(const mdct_lookup *l,
+                                                      kiss_fft_scalar *in,
+                                                      kiss_fft_scalar * OPUS_RESTRICT out,
+                                                      const opus_val16 *window,
+                                                      int overlap, int shift,
+                                                      int stride, int arch) = {
+   clt_mdct_backward_c,           /* ARMv4 */
+   clt_mdct_backward_c,           /* EDSP */
+   clt_mdct_backward_c,           /* Media */
+   clt_mdct_backward_neon         /* Neon with NE10 */
+};
+
+#  endif /* HAVE_ARM_NE10 */
+# endif /* OPUS_ARM_MAY_HAVE_NEON_INTR */
+
+#endif /* OPUS_HAVE_RTCD */
diff --git a/celt/arm/armcpu.c b/celt/arm/armcpu.c
index 1768525..5e5d10c 100644
--- a/celt/arm/armcpu.c
+++ b/celt/arm/armcpu.c
@@ -73,7 +73,7 @@
   __except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){
     /*Ignore exception.*/
   }
-#   if defined(OPUS_ARM_MAY_HAVE_NEON)
+#   if defined(OPUS_ARM_MAY_HAVE_NEON) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
   __try{
     /*VORR q0,q0,q0*/
     __emit(0xF2200150);
@@ -107,7 +107,7 @@
 
     while(fgets(buf, 512, cpuinfo) != NULL)
     {
-# if defined(OPUS_ARM_MAY_HAVE_EDSP) || defined(OPUS_ARM_MAY_HAVE_NEON)
+# if defined(OPUS_ARM_MAY_HAVE_EDSP) || defined(OPUS_ARM_MAY_HAVE_NEON) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
       /* Search for edsp and neon flag */
       if(memcmp(buf, "Features", 8) == 0)
       {
@@ -118,7 +118,7 @@
           flags |= OPUS_CPU_ARM_EDSP;
 #  endif
 
-#  if defined(OPUS_ARM_MAY_HAVE_NEON)
+#  if defined(OPUS_ARM_MAY_HAVE_NEON) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
         p = strstr(buf, " neon");
         if(p != NULL && (p[5] == ' ' || p[5] == '\n'))
           flags |= OPUS_CPU_ARM_NEON;
diff --git a/celt/arm/armopts_gnu.s b/celt/arm/armopts_gnu.s
new file mode 100644
index 0000000..c7082fc
--- /dev/null
+++ b/celt/arm/armopts_gnu.s
@@ -0,0 +1,38 @@
+    .syntax unified
+/* Copyright (C) 2013 Mozilla Corporation */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+@ Set the following to 1 if we have EDSP instructions
+@  (LDRD/STRD, etc., ARMv5E and later).
+ .set OPUS_ARM_MAY_HAVE_EDSP, 1
+
+@ Set the following to 1 if we have ARMv6 media instructions.
+ .set OPUS_ARM_MAY_HAVE_MEDIA, 1
+
+@ Set the following to 1 if we have NEON (some ARMv7)
+ .set OPUS_ARM_MAY_HAVE_NEON, 1
+
+@ END:
diff --git a/celt/arm/celt_ne10_fft.c b/celt/arm/celt_ne10_fft.c
new file mode 100644
index 0000000..42d96a7
--- /dev/null
+++ b/celt/arm/celt_ne10_fft.c
@@ -0,0 +1,174 @@
+/* Copyright (c) 2015 Xiph.Org Foundation
+   Written by Viswanath Puttagunta */
+/**
+   @file celt_ne10_fft.c
+   @brief ARM Neon optimizations for fft using NE10 library
+ */
+
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef SKIP_CONFIG_H
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+#endif
+
+#include <NE10_init.h>
+#include <NE10_dsp.h>
+#include "os_support.h"
+#include "kiss_fft.h"
+#include "stack_alloc.h"
+
+/* Map generic NE10_FFT_* names onto NE10's float32 or int32 complex FFT API. */
+#if !defined(FIXED_POINT)
+# define NE10_FFT_ALLOC_C2C_TYPE_NEON ne10_fft_alloc_c2c_float32_neon
+# define NE10_FFT_CFG_TYPE_T ne10_fft_cfg_float32_t
+# define NE10_FFT_STATE_TYPE_T ne10_fft_state_float32_t
+# define NE10_FFT_DESTROY_C2C_TYPE ne10_fft_destroy_c2c_float32
+# define NE10_FFT_CPX_TYPE_T ne10_fft_cpx_float32_t
+# define NE10_FFT_C2C_1D_TYPE_NEON ne10_fft_c2c_1d_float32_neon
+#else
+# define NE10_FFT_ALLOC_C2C_TYPE_NEON(nfft) ne10_fft_alloc_c2c_int32_neon(nfft)
+# define NE10_FFT_CFG_TYPE_T ne10_fft_cfg_int32_t
+# define NE10_FFT_STATE_TYPE_T ne10_fft_state_int32_t
+# define NE10_FFT_DESTROY_C2C_TYPE ne10_fft_destroy_c2c_int32
+# define NE10_FFT_CPX_TYPE_T ne10_fft_cpx_int32_t
+# define NE10_FFT_C2C_1D_TYPE_NEON ne10_fft_c2c_1d_int32_neon
+#endif
+
+#if defined(CUSTOM_MODES)
+
+/* nfft lengths for which this file uses NE10's scaled fft; others use the C fft */
+# define NE10_FFTSCALED_SUPPORT_MAX 4
+static const int ne10_fft_scaled_support[NE10_FFTSCALED_SUPPORT_MAX] = {
+   480, 240, 120, 60
+};
+
+int opus_fft_alloc_arm_neon(kiss_fft_state *st)
+{
+   int idx;
+   int found = 0;
+
+   /* Arch-specific state hung off the generic kiss_fft state. */
+   st->arch_fft = (arch_fft_state *)opus_alloc(sizeof(struct arch_fft_state));
+   if (st->arch_fft == NULL)
+      return -1;
+
+   /* Is this transform length one NE10 can do as a scaled fft? */
+   for (idx = 0; idx < NE10_FFTSCALED_SUPPORT_MAX && !found; idx++)
+      found = (st->nfft == ne10_fft_scaled_support[idx]);
+
+   if (!found) {
+      /* This nfft length (scaled fft) is not supported in NE10 */
+      st->arch_fft->is_supported = 0;
+      st->arch_fft->priv = NULL;
+      return 0;
+   }
+
+   /* Supported length: build the NE10 configuration and keep it in priv.
+      Returns 0 on success, -1 if the NE10 allocation failed. */
+   st->arch_fft->is_supported = 1;
+   st->arch_fft->priv = (void *)NE10_FFT_ALLOC_C2C_TYPE_NEON(st->nfft);
+   return (st->arch_fft->priv != NULL) ? 0 : -1;
+}
+
+void opus_fft_free_arm_neon(kiss_fft_state *st)
+{
+   /* Nothing to release if the arch state was never allocated. */
+   if (st->arch_fft != NULL) {
+      NE10_FFT_CFG_TYPE_T ne10_cfg = (NE10_FFT_CFG_TYPE_T)st->arch_fft->priv;
+
+      /* priv is NULL for lengths NE10 does not support. */
+      if (ne10_cfg != NULL)
+         NE10_FFT_DESTROY_C2C_TYPE(ne10_cfg);
+      opus_free(st->arch_fft);
+   }
+}
+#endif
+
+void opus_fft_neon(const kiss_fft_state *st,
+                   const kiss_fft_cpx *fin,
+                   kiss_fft_cpx *fout)
+{
+   NE10_FFT_STATE_TYPE_T ne10_state;
+   NE10_FFT_CFG_TYPE_T ne10_cfg = &ne10_state;
+   VARDECL(NE10_FFT_CPX_TYPE_T, scratch);
+   SAVE_STACK;
+   ALLOC(scratch, st->nfft, NE10_FFT_CPX_TYPE_T);
+
+   if (st->arch_fft->is_supported) {
+      /* Copy the stored NE10 state so a per-call scratch buffer can be attached. */
+      memcpy((void *)ne10_cfg, st->arch_fft->priv, sizeof(NE10_FFT_STATE_TYPE_T));
+      ne10_state.buffer = (NE10_FFT_CPX_TYPE_T *)&scratch[0];
+#if !defined(FIXED_POINT)
+      ne10_state.is_forward_scaled = 1;
+      NE10_FFT_C2C_1D_TYPE_NEON((NE10_FFT_CPX_TYPE_T *)fout,
+                                (NE10_FFT_CPX_TYPE_T *)fin,
+                                ne10_cfg, 0);
+#else
+      NE10_FFT_C2C_1D_TYPE_NEON((NE10_FFT_CPX_TYPE_T *)fout,
+                                (NE10_FFT_CPX_TYPE_T *)fin,
+                                ne10_cfg, 0, 1);
+#endif
+   }
+   else {
+      /* This nfft length (scaled fft) not supported in NE10 */
+      opus_fft_c(st, fin, fout);
+   }
+   RESTORE_STACK;
+}
+
+void opus_ifft_neon(const kiss_fft_state *st,
+                    const kiss_fft_cpx *fin,
+                    kiss_fft_cpx *fout)
+{
+   NE10_FFT_STATE_TYPE_T ne10_state;
+   NE10_FFT_CFG_TYPE_T ne10_cfg = &ne10_state;
+   VARDECL(NE10_FFT_CPX_TYPE_T, scratch);
+   SAVE_STACK;
+   ALLOC(scratch, st->nfft, NE10_FFT_CPX_TYPE_T);
+
+   if (st->arch_fft->is_supported) {
+      /* Copy the stored NE10 state so a per-call scratch buffer can be attached. */
+      memcpy((void *)ne10_cfg, st->arch_fft->priv, sizeof(NE10_FFT_STATE_TYPE_T));
+      ne10_state.buffer = (NE10_FFT_CPX_TYPE_T *)&scratch[0];
+#if !defined(FIXED_POINT)
+      ne10_state.is_backward_scaled = 0;
+      NE10_FFT_C2C_1D_TYPE_NEON((NE10_FFT_CPX_TYPE_T *)fout,
+                                (NE10_FFT_CPX_TYPE_T *)fin,
+                                ne10_cfg, 1);
+#else
+      NE10_FFT_C2C_1D_TYPE_NEON((NE10_FFT_CPX_TYPE_T *)fout,
+                                (NE10_FFT_CPX_TYPE_T *)fin,
+                                ne10_cfg, 1, 0);
+#endif
+   }
+   else {
+      /* This nfft length (scaled fft) not supported in NE10 */
+      opus_ifft_c(st, fin, fout);
+   }
+   RESTORE_STACK;
+}
diff --git a/celt/arm/celt_ne10_mdct.c b/celt/arm/celt_ne10_mdct.c
new file mode 100644
index 0000000..293c3ef
--- /dev/null
+++ b/celt/arm/celt_ne10_mdct.c
@@ -0,0 +1,258 @@
+/* Copyright (c) 2015 Xiph.Org Foundation
+   Written by Viswanath Puttagunta */
+/**
+   @file celt_ne10_mdct.c
+   @brief ARM Neon optimizations for mdct using NE10 library
+ */
+
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef SKIP_CONFIG_H
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+#endif
+
+#include "kiss_fft.h"
+#include "_kiss_fft_guts.h"
+#include "mdct.h"
+#include "stack_alloc.h"
+
+void clt_mdct_forward_neon(const mdct_lookup *l,
+                           kiss_fft_scalar *in,
+                           kiss_fft_scalar * OPUS_RESTRICT out,
+                           const opus_val16 *window,
+                           int overlap, int shift, int stride, int arch)
+{
+   int i;
+   int N, N2, N4;
+   VARDECL(kiss_fft_scalar, f);
+   VARDECL(kiss_fft_cpx, f2);
+   const kiss_fft_state *st = l->kfft[shift];
+   const kiss_twiddle_scalar *trig;
+   /* Forward MDCT in four stages: window/fold, pre-rotate, opus_fft(), post-rotate. */
+   SAVE_STACK;
+   /* Each unit of shift halves the size N and steps trig past N more entries. */
+   N = l->n;
+   trig = l->trig;
+   for (i=0;i<shift;i++)
+   {
+      N >>= 1;
+      trig += N;
+   }
+   N2 = N>>1;
+   N4 = N>>2;
+   /* f: N2 folded scalars, reused below as the complex fft output; f2: fft input. */
+   ALLOC(f, N2, kiss_fft_scalar);
+   ALLOC(f2, N4, kiss_fft_cpx);
+
+   /* Consider the input to be composed of four blocks: [a, b, c, d] */
+   /* Window, shuffle, fold */
+   {
+      /* Temp pointers to make it really clear to the compiler what we're doing */
+      const kiss_fft_scalar * OPUS_RESTRICT xp1 = in+(overlap>>1);
+      const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+N2-1+(overlap>>1);
+      kiss_fft_scalar * OPUS_RESTRICT yp = f;
+      const opus_val16 * OPUS_RESTRICT wp1 = window+(overlap>>1);
+      const opus_val16 * OPUS_RESTRICT wp2 = window+(overlap>>1)-1;
+      for(i=0;i<((overlap+3)>>2);i++)
+      {
+         /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/
+         *yp++ = MULT16_32_Q15(*wp2, xp1[N2]) + MULT16_32_Q15(*wp1,*xp2);
+         *yp++ = MULT16_32_Q15(*wp1, *xp1)    - MULT16_32_Q15(*wp2, xp2[-N2]);
+         xp1+=2;
+         xp2-=2;
+         wp1+=2;
+         wp2-=2;
+      }
+      wp1 = window;
+      wp2 = window+overlap-1;
+      for(;i<N4-((overlap+3)>>2);i++)
+      {
+         /* Real part arranged as a-bR, Imag part arranged as -c-dR */
+         *yp++ = *xp2;
+         *yp++ = *xp1;
+         xp1+=2;
+         xp2-=2;
+      }
+      for(;i<N4;i++)
+      {
+         /* Real part arranged as a-bR, Imag part arranged as -c-dR */
+         *yp++ =  -MULT16_32_Q15(*wp1, xp1[-N2]) + MULT16_32_Q15(*wp2, *xp2);
+         *yp++ = MULT16_32_Q15(*wp2, *xp1)     + MULT16_32_Q15(*wp1, xp2[N2]);
+         xp1+=2;
+         xp2-=2;
+         wp1+=2;
+         wp2-=2;
+      }
+   }
+   /* Pre-rotation */
+   {
+      kiss_fft_scalar * OPUS_RESTRICT yp = f;
+      const kiss_twiddle_scalar *t = &trig[0];
+      for(i=0;i<N4;i++)
+      {
+         kiss_fft_cpx yc;
+         kiss_twiddle_scalar t0, t1;
+         kiss_fft_scalar re, im, yr, yi;
+         t0 = t[i];
+         t1 = t[N4+i];
+         re = *yp++;
+         im = *yp++;
+         yr = S_MUL(re,t0)  -  S_MUL(im,t1);
+         yi = S_MUL(im,t0)  +  S_MUL(re,t1);
+         yc.r = yr;
+         yc.i = yi;
+         f2[i] = yc;
+      }
+   }
+   /* Complex fft of the pre-rotated data in f2, written back over f */
+   opus_fft(st, f2, (kiss_fft_cpx *)f, arch);
+
+   /* Post-rotate */
+   {
+      /* Temp pointers to make it really clear to the compiler what we're doing */
+      const kiss_fft_cpx * OPUS_RESTRICT fp = (kiss_fft_cpx *)f;
+      kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
+      kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1);
+      const kiss_twiddle_scalar *t = &trig[0];
+      /* out is filled from both ends: yp1 steps forward, yp2 backward, by 2*stride */
+      for(i=0;i<N4;i++)
+      {
+         kiss_fft_scalar yr, yi;
+         yr = S_MUL(fp->i,t[N4+i]) - S_MUL(fp->r,t[i]);
+         yi = S_MUL(fp->r,t[N4+i]) + S_MUL(fp->i,t[i]);
+         *yp1 = yr;
+         *yp2 = yi;
+         fp++;
+         yp1 += 2*stride;
+         yp2 -= 2*stride;
+      }
+   }
+   RESTORE_STACK;
+}
+
+void clt_mdct_backward_neon(const mdct_lookup *l,
+                            kiss_fft_scalar *in,
+                            kiss_fft_scalar * OPUS_RESTRICT out,
+                            const opus_val16 * OPUS_RESTRICT window,
+                            int overlap, int shift, int stride, int arch)
+{
+   int i, N, N2, N4;
+   VARDECL(kiss_fft_scalar, f);
+   const kiss_twiddle_scalar *trig;
+   const kiss_fft_state *st = l->kfft[shift];
+   SAVE_STACK; /* pairs with the RESTORE_STACK at the end of this function */
+
+   N = l->n;
+   trig = l->trig;
+   for (i=0;i<shift;i++)
+   {
+      N >>= 1;
+      trig += N;
+   }
+   N2 = N>>1;
+   N4 = N>>2;
+
+   ALLOC(f, N2, kiss_fft_scalar);
+
+   /* Inverse MDCT, step 1: pre-rotate the strided input from both ends into f */
+   {
+      /* Temp pointers to make it really clear to the compiler what we're doing */
+      const kiss_fft_scalar * OPUS_RESTRICT xp1 = in;
+      const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+stride*(N2-1);
+      kiss_fft_scalar * OPUS_RESTRICT yp = f;
+      const kiss_twiddle_scalar * OPUS_RESTRICT t = &trig[0];
+      for(i=0;i<N4;i++)
+      {
+         kiss_fft_scalar yr, yi;
+         yr = S_MUL(*xp2, t[i]) + S_MUL(*xp1, t[N4+i]);
+         yi = S_MUL(*xp1, t[i]) - S_MUL(*xp2, t[N4+i]);
+         yp[2*i] = yr;
+         yp[2*i+1] = yi;
+         xp1+=2*stride;
+         xp2-=2*stride;
+      }
+   }
+   /* Step 2: inverse fft of f, written directly into out starting at overlap/2 */
+   opus_ifft(st, (kiss_fft_cpx *)f, (kiss_fft_cpx*)(out+(overlap>>1)), arch);
+
+   /* Post-rotate and de-shuffle from both ends of the buffer at once to make
+      it in-place. */
+   {
+      kiss_fft_scalar * yp0 = out+(overlap>>1);
+      kiss_fft_scalar * yp1 = out+(overlap>>1)+N2-2;
+      const kiss_twiddle_scalar *t = &trig[0];
+      /* Loop to (N4+1)>>1 to handle odd N4. When N4 is odd, the
+         middle pair will be computed twice. */
+      for(i=0;i<(N4+1)>>1;i++)
+      {
+         kiss_fft_scalar re, im, yr, yi;
+         kiss_twiddle_scalar t0, t1;
+         re = yp0[0];
+         im = yp0[1];
+         t0 = t[i];
+         t1 = t[N4+i];
+         /* We'd scale up by 2 here, but instead it's done when mixing the windows */
+         yr = S_MUL(re,t0) + S_MUL(im,t1);
+         yi = S_MUL(re,t1) - S_MUL(im,t0);
+         re = yp1[0];
+         im = yp1[1];
+         yp0[0] = yr;
+         yp1[1] = yi;
+
+         t0 = t[(N4-i-1)];
+         t1 = t[(N2-i-1)];
+         /* We'd scale up by 2 here, but instead it's done when mixing the windows */
+         yr = S_MUL(re,t0) + S_MUL(im,t1);
+         yi = S_MUL(re,t1) - S_MUL(im,t0);
+         yp1[0] = yr;
+         yp0[1] = yi;
+         yp0 += 2;
+         yp1 -= 2;
+      }
+   }
+
+   /* Mirror on both sides for TDAC */
+   {
+      kiss_fft_scalar * OPUS_RESTRICT xp1 = out+overlap-1;
+      kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
+      const opus_val16 * OPUS_RESTRICT wp1 = window;
+      const opus_val16 * OPUS_RESTRICT wp2 = window+overlap-1;
+
+      for(i = 0; i < overlap/2; i++)
+      {
+         kiss_fft_scalar x1, x2;
+         x1 = *xp1;
+         x2 = *yp1;
+         *yp1++ = MULT16_32_Q15(*wp2, x2) - MULT16_32_Q15(*wp1, x1);
+         *xp1-- = MULT16_32_Q15(*wp1, x2) + MULT16_32_Q15(*wp2, x1);
+         wp1++;
+         wp2--;
+      }
+   }
+   RESTORE_STACK;
+}
diff --git a/celt/arm/celt_neon_intr.c b/celt/arm/celt_neon_intr.c
new file mode 100644
index 0000000..47dce15
--- /dev/null
+++ b/celt/arm/celt_neon_intr.c
@@ -0,0 +1,252 @@
+/* Copyright (c) 2014-2015 Xiph.Org Foundation
+   Written by Viswanath Puttagunta */
+/**
+   @file celt_neon_intr.c
+   @brief ARM Neon Intrinsic optimizations for celt
+ */
+
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <arm_neon.h>
+#include "../pitch.h"
+
+#if !defined(FIXED_POINT)
+/*
+ * Function: xcorr_kernel_neon_float
+ * ---------------------------------
+ * Stores sum[k] = x[0]*y[k] + ... + x[len-1]*y[len-1+k] for k = 0..3 (len > 0).
+ */
+static void xcorr_kernel_neon_float(const float32_t *x, const float32_t *y,
+      float32_t sum[4], int len) {
+   float32x4_t YY[3];
+   float32x4_t YEXT[3];
+   float32x4_t XX[2];
+   float32x2_t XX_2;
+   float32x4_t SUMM;
+   const float32_t *xi = x;
+   const float32_t *yi = y;
+
+   celt_assert(len>0);
+
+   YY[0] = vld1q_f32(yi);
+   SUMM = vdupq_n_f32(0);
+
+   /* Consume 8 elements in x vector and 12 elements in y
+    * vector. However, the 12'th element never really gets
+    * touched in this loop. So, if len == 8, then we only
+    * must access y[0] to y[10]. y[11] must not be accessed
+    * hence make sure len > 8 and not len >= 8
+    */
+   while (len > 8) {
+      yi += 4;
+      YY[1] = vld1q_f32(yi);
+      yi += 4;
+      YY[2] = vld1q_f32(yi);
+
+      XX[0] = vld1q_f32(xi);
+      xi += 4;
+      XX[1] = vld1q_f32(xi);
+      xi += 4;
+
+      SUMM = vmlaq_lane_f32(SUMM, YY[0], vget_low_f32(XX[0]), 0);
+      YEXT[0] = vextq_f32(YY[0], YY[1], 1);
+      SUMM = vmlaq_lane_f32(SUMM, YEXT[0], vget_low_f32(XX[0]), 1);
+      YEXT[1] = vextq_f32(YY[0], YY[1], 2);
+      SUMM = vmlaq_lane_f32(SUMM, YEXT[1], vget_high_f32(XX[0]), 0);
+      YEXT[2] = vextq_f32(YY[0], YY[1], 3);
+      SUMM = vmlaq_lane_f32(SUMM, YEXT[2], vget_high_f32(XX[0]), 1);
+
+      SUMM = vmlaq_lane_f32(SUMM, YY[1], vget_low_f32(XX[1]), 0);
+      YEXT[0] = vextq_f32(YY[1], YY[2], 1);
+      SUMM = vmlaq_lane_f32(SUMM, YEXT[0], vget_low_f32(XX[1]), 1);
+      YEXT[1] = vextq_f32(YY[1], YY[2], 2);
+      SUMM = vmlaq_lane_f32(SUMM, YEXT[1], vget_high_f32(XX[1]), 0);
+      YEXT[2] = vextq_f32(YY[1], YY[2], 3);
+      SUMM = vmlaq_lane_f32(SUMM, YEXT[2], vget_high_f32(XX[1]), 1);
+
+      YY[0] = YY[2];
+      len -= 8;
+   }
+
+   /* Consume 4 elements in x vector and 8 elements in y
+    * vector. However, the 8'th element in y never really gets
+    * touched in this loop. So, if len == 4, then we only
+    * must access y[0] to y[6]. y[7] must not be accessed
+    * hence make sure len>4 and not len>=4
+    */
+   if (len > 4) {
+      yi += 4;
+      YY[1] = vld1q_f32(yi);
+
+      XX[0] = vld1q_f32(xi);
+      xi += 4;
+
+      SUMM = vmlaq_lane_f32(SUMM, YY[0], vget_low_f32(XX[0]), 0);
+      YEXT[0] = vextq_f32(YY[0], YY[1], 1);
+      SUMM = vmlaq_lane_f32(SUMM, YEXT[0], vget_low_f32(XX[0]), 1);
+      YEXT[1] = vextq_f32(YY[0], YY[1], 2);
+      SUMM = vmlaq_lane_f32(SUMM, YEXT[1], vget_high_f32(XX[0]), 0);
+      YEXT[2] = vextq_f32(YY[0], YY[1], 3);
+      SUMM = vmlaq_lane_f32(SUMM, YEXT[2], vget_high_f32(XX[0]), 1);
+
+      YY[0] = YY[1];
+      len -= 4;
+   }
+   /* 1 to 4 samples remain: one x value at a time against a sliding y window */
+   while (--len > 0) {
+      XX_2 = vld1_dup_f32(xi++);
+      SUMM = vmlaq_lane_f32(SUMM, YY[0], XX_2, 0);
+      YY[0]= vld1q_f32(++yi);
+   }
+   /* Last sample: multiply-accumulate only, without loading any further y */
+   XX_2 = vld1_dup_f32(xi);
+   SUMM = vmlaq_lane_f32(SUMM, YY[0], XX_2, 0);
+
+   vst1q_f32(sum, SUMM);
+}
+
+/*
+ * Function: xcorr_kernel_neon_float_process1
+ * ---------------------------------
+ * Stores *sum = x[0]*y[0] + ... + x[len-1]*y[len-1] (one correlation value)
+ */
+static void xcorr_kernel_neon_float_process1(const float32_t *x,
+      const float32_t *y, float32_t *sum, int len) {
+   float32x4_t XX[4];
+   float32x4_t YY[4];
+   float32x2_t XX_2;
+   float32x2_t YY_2;
+   float32x4_t SUMM;
+   float32x2_t SUMM_2[2];
+   const float32_t *xi = x;
+   const float32_t *yi = y;
+
+   SUMM = vdupq_n_f32(0);
+
+   /* Work on 16 values per iteration */
+   while (len >= 16) {
+      XX[0] = vld1q_f32(xi);
+      xi += 4;
+      XX[1] = vld1q_f32(xi);
+      xi += 4;
+      XX[2] = vld1q_f32(xi);
+      xi += 4;
+      XX[3] = vld1q_f32(xi);
+      xi += 4;
+
+      YY[0] = vld1q_f32(yi);
+      yi += 4;
+      YY[1] = vld1q_f32(yi);
+      yi += 4;
+      YY[2] = vld1q_f32(yi);
+      yi += 4;
+      YY[3] = vld1q_f32(yi);
+      yi += 4;
+
+      SUMM = vmlaq_f32(SUMM, YY[0], XX[0]);
+      SUMM = vmlaq_f32(SUMM, YY[1], XX[1]);
+      SUMM = vmlaq_f32(SUMM, YY[2], XX[2]);
+      SUMM = vmlaq_f32(SUMM, YY[3], XX[3]);
+      len -= 16;
+   }
+
+   /* Work on 8 values */
+   if (len >= 8) {
+      XX[0] = vld1q_f32(xi);
+      xi += 4;
+      XX[1] = vld1q_f32(xi);
+      xi += 4;
+
+      YY[0] = vld1q_f32(yi);
+      yi += 4;
+      YY[1] = vld1q_f32(yi);
+      yi += 4;
+
+      SUMM = vmlaq_f32(SUMM, YY[0], XX[0]);
+      SUMM = vmlaq_f32(SUMM, YY[1], XX[1]);
+      len -= 8;
+   }
+
+   /* Work on 4 values */
+   if (len >= 4) {
+      XX[0] = vld1q_f32(xi);
+      xi += 4;
+      YY[0] = vld1q_f32(yi);
+      yi += 4;
+      SUMM = vmlaq_f32(SUMM, YY[0], XX[0]);
+      len -= 4;
+   }
+
+   /* Start accumulating results: fold the four partial sums down to two lanes */
+   SUMM_2[0] = vget_low_f32(SUMM);
+   if (len >= 2) {
+      /* While at it, consume 2 more values if available */
+      XX_2 = vld1_f32(xi);
+      xi += 2;
+      YY_2 = vld1_f32(yi);
+      yi += 2;
+      SUMM_2[0] = vmla_f32(SUMM_2[0], YY_2, XX_2);
+      len -= 2;
+   }
+   SUMM_2[1] = vget_high_f32(SUMM);
+   SUMM_2[0] = vadd_f32(SUMM_2[0], SUMM_2[1]);
+   SUMM_2[0] = vpadd_f32(SUMM_2[0], SUMM_2[0]);
+   /* Ok, now we have result accumulated in SUMM_2[0].0 */
+
+   if (len > 0) {
+      /* Case when you have one value left */
+      XX_2 = vld1_dup_f32(xi);
+      YY_2 = vld1_dup_f32(yi);
+      SUMM_2[0] = vmla_f32(SUMM_2[0], XX_2, YY_2);
+   }
+
+   vst1_lane_f32(sum, SUMM_2[0], 0);
+}
+
+void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y,
+                        opus_val32 *xcorr, int len, int max_pitch) {
+   const float32_t *xf = (const float32_t *)_x;
+   const float32_t *yf = (const float32_t *)_y;
+   float32_t *out = (float32_t *)xcorr;
+   int lag = 0;
+   celt_assert(max_pitch > 0);
+   celt_assert((((unsigned char *)_x-(unsigned char *)NULL)&3)==0);
+
+   /* Produce four lags per kernel call while at least four remain. */
+   for (; lag < (max_pitch-3); lag += 4)
+      xcorr_kernel_neon_float(xf, yf+lag, out+lag, len);
+
+   /* In case max_pitch isn't multiple of 4
+    * compute single correlation value per iteration
+    */
+   for (; lag < max_pitch; lag++)
+      xcorr_kernel_neon_float_process1(xf, yf+lag, out+lag, len);
+}
+#endif
diff --git a/celt/arm/celt_pitch_xcorr_arm-gnu.S_gnu.s b/celt/arm/celt_pitch_xcorr_arm-gnu.S_gnu.s
new file mode 100644
index 0000000..b62c520
--- /dev/null
+++ b/celt/arm/celt_pitch_xcorr_arm-gnu.S_gnu.s
@@ -0,0 +1,552 @@
+    .syntax unified
+    .syntax unified
+@ Copyright (c) 2007-2008 CSIRO
+@ Copyright (c) 2007-2009 Xiph.Org Foundation
+@ Copyright (c) 2013      Parrot
+@ Written by Aurélien Zanelli
+@
+@ Redistribution and use in source and binary forms, with or without
+@ modification, are permitted provided that the following conditions
+@ are met:
+@
+@ - Redistributions of source code must retain the above copyright
+@ notice, this list of conditions and the following disclaimer.
+@
+@ - Redistributions in binary form must reproduce the above copyright
+@ notice, this list of conditions and the following disclaimer in the
+@ documentation and/or other materials provided with the distribution.
+@
+@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+@ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+@ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+@ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+@ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+@ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+@ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+@ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+@ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+@ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    .text;   .p2align 2;   .arch armv7-a
+   .fpu neon
+   .object_arch armv4t
+@ Pull in the OPUS_ARM_MAY_HAVE_* switches tested by the .if blocks below.
+  .include "celt/arm/armopts_gnu.s"
+@ Export each entry point only when its switch is enabled.
+ .if OPUS_ARM_MAY_HAVE_EDSP
+  .global celt_pitch_xcorr_edsp
+ .endif
+
+ .if OPUS_ARM_MAY_HAVE_NEON
+  .global celt_pitch_xcorr_neon
+ .endif
+
+ .if OPUS_ARM_MAY_HAVE_NEON
+
+@ Compute sum[k]=sum(x[j]*y[j+k],j=0...len-1), k=0...3
+	.type xcorr_kernel_neon, %function; xcorr_kernel_neon: @ PROC
+xcorr_kernel_neon_start:
+  @ input:
+  @   r3     = int         len
+  @   r4     = opus_val16 *x
+  @   r5     = opus_val16 *y
+  @   q0     = opus_val32  sum[4]
+  @ output:
+  @   q0     = opus_val32  sum[4]
+  @ preserved: r0-r3, r6-r11, d2, q4-q7, q9-q15
+  @ internal usage:
+  @   r12 = int j
+  @   d3  = y_3|y_2|y_1|y_0
+  @   q2  = y_B|y_A|y_9|y_8|y_7|y_6|y_5|y_4
+  @   q3  = x_7|x_6|x_5|x_4|x_3|x_2|x_1|x_0
+  @   q8  = scratch
+  @
+  @ Load y[0...3]
+  @ This requires len>0 to always be valid (which we assert in the C code).
+  VLD1.16      {d5}, [r5]!
+  SUBS         r12, r3, #8
+  BLE xcorr_kernel_neon_process4
+@ Process 8 samples at a time.
+@ This loop loads one y value more than we actually need. Therefore we have to
+@ stop as soon as there are 8 or fewer samples left (instead of 7), to avoid
+@ reading past the end of the array.
+xcorr_kernel_neon_process8:
+  @ This loop has 19 total instructions (10 cycles to issue, minimum), with
+  @ - 2 cycles of ARM instructions,
+  @ - 10 cycles of load/store/byte permute instructions, and
+  @ - 9 cycles of data processing instructions.
+  @ On a Cortex A8, we dual-issue the maximum amount (9 cycles) between the
+  @ latter two categories, meaning the whole loop should run in 10 cycles per
+  @ iteration, barring cache misses.
+  @
+  @ Load x[0...7]
+  VLD1.16      {d6, d7}, [r4]!
+  @ Unlike VMOV, VAND is a data processing instruction (and doesn't get
+  @ assembled to VMOV, like VORR would), so it dual-issues with the prior VLD1.
+  VAND         d3, d5, d5
+  SUBS         r12, r12, #8
+  @ Load y[4...11]
+  VLD1.16      {d4, d5}, [r5]!
+  VMLAL.S16    q0, d3, d6[0]
+  VEXT.16      d16, d3, d4, #1
+  VMLAL.S16    q0, d4, d7[0]
+  VEXT.16      d17, d4, d5, #1
+  VMLAL.S16    q0, d16, d6[1]
+  VEXT.16      d16, d3, d4, #2
+  VMLAL.S16    q0, d17, d7[1]
+  VEXT.16      d17, d4, d5, #2
+  VMLAL.S16    q0, d16, d6[2]
+  VEXT.16      d16, d3, d4, #3
+  VMLAL.S16    q0, d17, d7[2]
+  VEXT.16      d17, d4, d5, #3
+  VMLAL.S16    q0, d16, d6[3]
+  VMLAL.S16    q0, d17, d7[3]
+  BGT xcorr_kernel_neon_process8
+@ Process 4 samples here if we have > 4 left (still reading one extra y value).
+xcorr_kernel_neon_process4:
+  ADDS         r12, r12, #4
+  BLE xcorr_kernel_neon_process2
+  @ Load x[0...3]
+  VLD1.16      d6, [r4]!
+  @ Use VAND since it's a data processing instruction again.
+  VAND         d4, d5, d5
+  SUB          r12, r12, #4
+  @ Load y[4...7]
+  VLD1.16      d5, [r5]!
+  VMLAL.S16    q0, d4, d6[0]
+  VEXT.16      d16, d4, d5, #1
+  VMLAL.S16    q0, d16, d6[1]
+  VEXT.16      d16, d4, d5, #2
+  VMLAL.S16    q0, d16, d6[2]
+  VEXT.16      d16, d4, d5, #3
+  VMLAL.S16    q0, d16, d6[3]
+@ Process 2 samples here if we have > 2 left (still reading one extra y value).
+xcorr_kernel_neon_process2:
+  ADDS         r12, r12, #2
+  BLE xcorr_kernel_neon_process1
+  @ Load x[0...1]
+  VLD2.16      {d6[],d7[]}, [r4]!
+  @ Use VAND since it's a data processing instruction again.
+  VAND         d4, d5, d5
+  SUB          r12, r12, #2
+  @ Load y[4...5]
+  VLD1.32      {d5[]}, [r5]!
+  VMLAL.S16    q0, d4, d6
+  VEXT.16      d16, d4, d5, #1
+  @ Replace bottom copy of {y5,y4} in d5 with {y3,y2} from d4, using VSRI
+  @ instead of VEXT, since it's a data-processing instruction.
+  VSRI.64      d5, d4, #32
+  VMLAL.S16    q0, d16, d7
+@ Process 1 sample using the extra y value we loaded above.
+xcorr_kernel_neon_process1:
+  @ Load next *x
+  VLD1.16      {d6[]}, [r4]!
+  ADDS         r12, r12, #1
+  @ y[0...3] are left in d5 from prior iteration(s) (if any)
+  VMLAL.S16    q0, d5, d6
+  MOVLE        pc, lr
+@ Now process 1 last sample, not reading ahead.
+  @ Load last *y
+  VLD1.16      {d4[]}, [r5]!
+  VSRI.64      d4, d5, #16
+  @ Load last *x
+  VLD1.16      {d6[]}, [r4]!
+  VMLAL.S16    q0, d4, d6
+  MOV          pc, lr
+	.size xcorr_kernel_neon, .-xcorr_kernel_neon  @ ENDP
+
+@ opus_val32 celt_pitch_xcorr_neon(opus_val16 *_x, opus_val16 *_y,
+@  opus_val32 *xcorr, int len, int max_pitch)
+	.type celt_pitch_xcorr_neon, %function; celt_pitch_xcorr_neon: @ PROC
+  @ input:
+  @   r0  = opus_val16 *_x
+  @   r1  = opus_val16 *_y
+  @   r2  = opus_val32 *xcorr
+  @   r3  = int         len
+  @ output:
+  @   r0  = int         maxcorr
+  @ internal usage:
+  @   r4  = opus_val16 *x (for xcorr_kernel_neon())
+  @   r5  = opus_val16 *y (for xcorr_kernel_neon())
+  @   r6  = int         max_pitch
+  @   r12 = int         j
+  @   q15 = int         maxcorr[4] (q15 is not used by xcorr_kernel_neon())
+  STMFD        sp!, {r4-r6, lr}
+  LDR          r6, [sp, #16]
+  VMOV.S32     q15, #1
+  @ if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done
+  SUBS         r6, r6, #4
+  BLT celt_pitch_xcorr_neon_process4_done
+celt_pitch_xcorr_neon_process4:
+  @ xcorr_kernel_neon parameters:
+  @ r3 = len, r4 = _x, r5 = _y, q0 = {0, 0, 0, 0}
+  MOV          r4, r0
+  MOV          r5, r1
+  VEOR         q0, q0, q0
+  @ xcorr_kernel_neon only modifies r4, r5, r12, and q0...q3.
+  @ So we don't save/restore any other registers.
+  BL xcorr_kernel_neon_start
+  SUBS         r6, r6, #4
+  VST1.32      {q0}, [r2]!
+  @ _y += 4
+  ADD          r1, r1, #8
+  VMAX.S32     q15, q15, q0
+  @ if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done
+  BGE celt_pitch_xcorr_neon_process4
+@ We have less than 4 sums left to compute.
+celt_pitch_xcorr_neon_process4_done:
+  ADDS         r6, r6, #4
+  @ Reduce maxcorr to a single value
+  VMAX.S32     d30, d30, d31
+  VPMAX.S32    d30, d30, d30
+  @ if (max_pitch <= 0) goto celt_pitch_xcorr_neon_done
+  BLE celt_pitch_xcorr_neon_done
+@ Now compute each remaining sum one at a time.
+celt_pitch_xcorr_neon_process_remaining:
+  MOV          r4, r0
+  MOV          r5, r1
+  VMOV.I32     q0, #0
+  SUBS         r12, r3, #8
+  BLT celt_pitch_xcorr_neon_process_remaining4
+@ Sum terms 8 at a time.
+celt_pitch_xcorr_neon_process_remaining_loop8:
+  @ Load x[0...7]
+  VLD1.16      {q1}, [r4]!
+  @ Load y[0...7]
+  VLD1.16      {q2}, [r5]!
+  SUBS         r12, r12, #8
+  VMLAL.S16    q0, d4, d2
+  VMLAL.S16    q0, d5, d3
+  BGE celt_pitch_xcorr_neon_process_remaining_loop8
+@ Sum terms 4 at a time.
+celt_pitch_xcorr_neon_process_remaining4:
+  ADDS         r12, r12, #4
+  BLT celt_pitch_xcorr_neon_process_remaining4_done
+  @ Load x[0...3]
+  VLD1.16      {d2}, [r4]!
+  @ Load y[0...3]
+  VLD1.16      {d3}, [r5]!
+  SUB          r12, r12, #4
+  VMLAL.S16    q0, d3, d2
+celt_pitch_xcorr_neon_process_remaining4_done:
+  @ Reduce the sum to a single value.
+  VADD.S32     d0, d0, d1
+  VPADDL.S32   d0, d0
+  ADDS         r12, r12, #4
+  BLE celt_pitch_xcorr_neon_process_remaining_loop_done
+@ Sum terms 1 at a time.
+celt_pitch_xcorr_neon_process_remaining_loop1:
+  VLD1.16      {d2[]}, [r4]!
+  VLD1.16      {d3[]}, [r5]!
+  SUBS         r12, r12, #1
+  VMLAL.S16    q0, d2, d3
+  BGT celt_pitch_xcorr_neon_process_remaining_loop1
+celt_pitch_xcorr_neon_process_remaining_loop_done:
+  VST1.32      {d0[0]}, [r2]!
+  VMAX.S32     d30, d30, d0
+  SUBS         r6, r6, #1
+  @ _y++
+  ADD          r1, r1, #2
+  @ if (--max_pitch > 0) goto celt_pitch_xcorr_neon_process_remaining
+  BGT celt_pitch_xcorr_neon_process_remaining
+celt_pitch_xcorr_neon_done:
+  VMOV.32      r0, d30[0]
+  LDMFD        sp!, {r4-r6, pc}
+	.size celt_pitch_xcorr_neon, .-celt_pitch_xcorr_neon  @ ENDP
+
+ .endif
+
+ .if OPUS_ARM_MAY_HAVE_EDSP
+
+,: Accumulates sum[k] += x[j]*y[j+k] for j=0...len-1, k=0...3. This will get
+,: used on ARMv7 devices without NEON, so it is scheduled for dual-issuing.
+@ xcorr_kernel_edsp: @ PROC
+xcorr_kernel_edsp_start::
+  ,: input:
+  ,:   r3      = int         len
+  ,:   r4      = opus_val16 *_x (must be 32-bit aligned)
+  ,:   r5      = opus_val16 *_y (must be 32-bit aligned)
+  ,:   r6...r9 = opus_val32  sum[4]
+  ,: output:
+  ,:   r6...r9 = opus_val32  sum[4] (input sums plus the new products)
+  ,: preserved: r0-r5 (r2, r4 and r5 are saved and restored on the stack)
+  ,: internal usage (r10-r12 and r14 are clobbered)
+  ,:   r2      = int         j
+  ,:   r12,r14 = opus_val16  x[4]
+  ,:   r10,r11 = opus_val16  y[4]
+  STMFD        sp!, {r2,r4,r5,lr}
+  LDR          r10, [r5], #4      ,: Load y[0...1]
+  SUBS         r2, r3, #4         ,: j = len-4
+  LDR          r11, [r5], #4      ,: Load y[2...3]
+  BLE xcorr_kernel_edsp_process4_done
+  LDR          r12, [r4], #4      ,: Load x[0...1]
+  ,: Stall
+xcorr_kernel_edsp_process4::
+  ,: The multiplies must issue from pipeline 0, and can't dual-issue with each
+  ,: other. Every other instruction here dual-issues with a multiply, and is
+  ,: thus "free". There should be no stalls in the body of the loop.
+  SMLABB       r6, r12, r10, r6   ,: sum[0] = MAC16_16(sum[0],x_0,y_0)
+  LDR          r14, [r4], #4      ,: Load x[2...3]
+  SMLABT       r7, r12, r10, r7   ,: sum[1] = MAC16_16(sum[1],x_0,y_1)
+  SUBS         r2, r2, #4         ,: j-=4
+  SMLABB       r8, r12, r11, r8   ,: sum[2] = MAC16_16(sum[2],x_0,y_2)
+  SMLABT       r9, r12, r11, r9   ,: sum[3] = MAC16_16(sum[3],x_0,y_3)
+  SMLATT       r6, r12, r10, r6   ,: sum[0] = MAC16_16(sum[0],x_1,y_1)
+  LDR          r10, [r5], #4      ,: Load y[4...5]
+  SMLATB       r7, r12, r11, r7   ,: sum[1] = MAC16_16(sum[1],x_1,y_2)
+  SMLATT       r8, r12, r11, r8   ,: sum[2] = MAC16_16(sum[2],x_1,y_3)
+  SMLATB       r9, r12, r10, r9   ,: sum[3] = MAC16_16(sum[3],x_1,y_4)
+  LDRGT        r12, [r4], #4      ,: Load x[4...5] (x[0...1] of the next pass)
+  SMLABB       r6, r14, r11, r6   ,: sum[0] = MAC16_16(sum[0],x_2,y_2)
+  SMLABT       r7, r14, r11, r7   ,: sum[1] = MAC16_16(sum[1],x_2,y_3)
+  SMLABB       r8, r14, r10, r8   ,: sum[2] = MAC16_16(sum[2],x_2,y_4)
+  SMLABT       r9, r14, r10, r9   ,: sum[3] = MAC16_16(sum[3],x_2,y_5)
+  SMLATT       r6, r14, r11, r6   ,: sum[0] = MAC16_16(sum[0],x_3,y_3)
+  LDR          r11, [r5], #4      ,: Load y[6...7]
+  SMLATB       r7, r14, r10, r7   ,: sum[1] = MAC16_16(sum[1],x_3,y_4)
+  SMLATT       r8, r14, r10, r8   ,: sum[2] = MAC16_16(sum[2],x_3,y_5)
+  SMLATB       r9, r14, r11, r9   ,: sum[3] = MAC16_16(sum[3],x_3,y_6)
+  BGT xcorr_kernel_edsp_process4
+xcorr_kernel_edsp_process4_done::
+  ADDS         r2, r2, #4         ,: j = samples left (<= 4)
+  BLE xcorr_kernel_edsp_done
+  LDRH         r12, [r4], #2      ,: r12 = *x++
+  SUBS         r2, r2, #1         ,: j--
+  ,: Stall
+  SMLABB       r6, r12, r10, r6   ,: sum[0] = MAC16_16(sum[0],x,y_0)
+  LDRHGT       r14, [r4], #2      ,: r14 = *x++
+  SMLABT       r7, r12, r10, r7   ,: sum[1] = MAC16_16(sum[1],x,y_1)
+  SMLABB       r8, r12, r11, r8   ,: sum[2] = MAC16_16(sum[2],x,y_2)
+  SMLABT       r9, r12, r11, r9   ,: sum[3] = MAC16_16(sum[3],x,y_3)
+  BLE xcorr_kernel_edsp_done
+  SMLABT       r6, r14, r10, r6   ,: sum[0] = MAC16_16(sum[0],x,y_1)
+  SUBS         r2, r2, #1         ,: j--
+  SMLABB       r7, r14, r11, r7   ,: sum[1] = MAC16_16(sum[1],x,y_2)
+  LDRH         r10, [r5], #2      ,: r10 = y_4 = *y++
+  SMLABT       r8, r14, r11, r8   ,: sum[2] = MAC16_16(sum[2],x,y_3)
+  LDRHGT       r12, [r4], #2      ,: r12 = *x++
+  SMLABB       r9, r14, r10, r9   ,: sum[3] = MAC16_16(sum[3],x,y_4)
+  BLE xcorr_kernel_edsp_done
+  SMLABB       r6, r12, r11, r6   ,: sum[0] = MAC16_16(sum[0],tmp,y_2)
+  CMP          r2, #1             ,: j > 1? (no j--: r2 is overwritten below)
+  SMLABT       r7, r12, r11, r7   ,: sum[1] = MAC16_16(sum[1],tmp,y_3)
+  LDRH         r2, [r5], #2       ,: r2 = y_5 = *y++
+  SMLABB       r8, r12, r10, r8   ,: sum[2] = MAC16_16(sum[2],tmp,y_4)
+  LDRHGT       r14, [r4]          ,: r14 = *x
+  SMLABB       r9, r12, r2, r9    ,: sum[3] = MAC16_16(sum[3],tmp,y_5)
+  BLE xcorr_kernel_edsp_done
+  SMLABT       r6, r14, r11, r6   ,: sum[0] = MAC16_16(sum[0],tmp,y_3)
+  LDRH         r11, [r5]          ,: r11 = y_6 = *y
+  SMLABB       r7, r14, r10, r7   ,: sum[1] = MAC16_16(sum[1],tmp,y_4)
+  SMLABB       r8, r14, r2, r8    ,: sum[2] = MAC16_16(sum[2],tmp,y_5)
+  SMLABB       r9, r14, r11, r9   ,: sum[3] = MAC16_16(sum[3],tmp,y_6)
+xcorr_kernel_edsp_done::
+  LDMFD        sp!, {r2,r4,r5,pc} ,: Restore r2, r4, r5 and return
+	.size xcorr_kernel_edsp, .-xcorr_kernel_edsp  ,: @ ENDP
+
+@ celt_pitch_xcorr_edsp: @ PROC
+  ,: input (a fifth argument, int max_pitch, is read from the stack):
+  ,:   r0  = opus_val16 *_x (must be 32-bit aligned)
+  ,:   r1  = opus_val16 *_y (only needs to be 16-bit aligned)
+  ,:   r2  = opus_val32 *xcorr
+  ,:   r3  = int         len
+  ,: output (one opus_val32 sum per lag is also stored through xcorr):
+  ,:   r0  = maxcorr (initialized to 1, then raised to the largest sum)
+  ,: internal usage
+  ,:   r4  = opus_val16 *x
+  ,:   r5  = opus_val16 *y
+  ,:   r6  = opus_val32  sum0
+  ,:   r7  = opus_val32  sum1
+  ,:   r8  = opus_val32  sum2
+  ,:   r9  = opus_val32  sum3
+  ,:   r1  = int         max_pitch
+  ,:   r12 = int         j
+  STMFD        sp!, {r4-r11, lr}    ,: Pushes 9 registers (36 bytes)
+  MOV          r5, r1
+  LDR          r1, [sp, #36]        ,: r1 = max_pitch (just above saved regs)
+  MOV          r4, r0
+  TST          r5, #3               ,: Is _y 32-bit aligned?
+  ,: maxcorr = 1
+  MOV          r0, #1
+  BEQ          celt_pitch_xcorr_edsp_process1u_done
+,: Compute one sum at the start to make y 32-bit aligned.
+  SUBS         r12, r3, #4
+  ,: r14 = sum = 0
+  MOV          r14, #0
+  LDRH         r8, [r5], #2         ,: r8 = y_0; y is 32-bit aligned from here
+  BLE celt_pitch_xcorr_edsp_process1u_loop4_done
+  LDR          r6, [r4], #4
+  MOV          r8, r8, LSL #16      ,: Put y_0 in the top half for SMLABT
+celt_pitch_xcorr_edsp_process1u_loop4::
+  LDR          r9, [r5], #4
+  SMLABT       r14, r6, r8, r14     ,: sum = MAC16_16(sum, x_0, y_0)
+  LDR          r7, [r4], #4
+  SMLATB       r14, r6, r9, r14     ,: sum = MAC16_16(sum, x_1, y_1)
+  LDR          r8, [r5], #4
+  SMLABT       r14, r7, r9, r14     ,: sum = MAC16_16(sum, x_2, y_2)
+  SUBS         r12, r12, #4         ,: j-=4
+  SMLATB       r14, r7, r8, r14     ,: sum = MAC16_16(sum, x_3, y_3)
+  LDRGT        r6, [r4], #4
+  BGT celt_pitch_xcorr_edsp_process1u_loop4
+  MOV          r8, r8, LSR #16      ,: Put the next y in the bottom half
+celt_pitch_xcorr_edsp_process1u_loop4_done::
+  ADDS         r12, r12, #4         ,: j = samples left
+celt_pitch_xcorr_edsp_process1u_loop1::
+  LDRHGE       r6, [r4], #2
+  ,: Stall
+  SMLABBGE     r14, r6, r8, r14    ,: sum = MAC16_16(sum, *x, *y)
+  SUBSGE       r12, r12, #1
+  LDRHGT       r8, [r5], #2
+  BGT celt_pitch_xcorr_edsp_process1u_loop1
+  ,: Restore _x
+  SUB          r4, r4, r3, LSL #1
+  ,: Restore and advance _y
+  SUB          r5, r5, r3, LSL #1
+  ,: maxcorr = max(maxcorr, sum)
+  CMP          r0, r14
+  ADD          r5, r5, #2
+  MOVLT        r0, r14
+  SUBS         r1, r1, #1           ,: max_pitch--
+  ,: xcorr[i] = sum
+  STR          r14, [r2], #4
+  BLE celt_pitch_xcorr_edsp_done
+celt_pitch_xcorr_edsp_process1u_done::
+  ,: if (max_pitch < 4) goto celt_pitch_xcorr_edsp_process2
+  SUBS         r1, r1, #4
+  BLT celt_pitch_xcorr_edsp_process2
+celt_pitch_xcorr_edsp_process4::
+  ,: xcorr_kernel_edsp parameters:
+  ,: r3 = len, r4 = _x, r5 = _y, r6...r9 = sum[4] = {0, 0, 0, 0}
+  MOV          r6, #0
+  MOV          r7, #0
+  MOV          r8, #0
+  MOV          r9, #0
+  BL xcorr_kernel_edsp_start  ,: xcorr_kernel_edsp(_x, _y+i, xcorr+i, len)
+  ,: maxcorr = max(maxcorr, sum0, sum1, sum2, sum3)
+  CMP          r0, r6
+  ,: _y+=4
+  ADD          r5, r5, #8
+  MOVLT        r0, r6
+  CMP          r0, r7
+  MOVLT        r0, r7
+  CMP          r0, r8
+  MOVLT        r0, r8
+  CMP          r0, r9
+  MOVLT        r0, r9
+  STMIA        r2!, {r6-r9}         ,: xcorr[i...i+3] = sum0...sum3
+  SUBS         r1, r1, #4           ,: max_pitch -= 4
+  BGE celt_pitch_xcorr_edsp_process4
+celt_pitch_xcorr_edsp_process2::
+  ADDS         r1, r1, #2           ,: Fewer than 2 sums left?
+  BLT celt_pitch_xcorr_edsp_process1a
+  SUBS         r12, r3, #4
+  ,: {r10, r11} = {sum0, sum1} = {0, 0}
+  MOV          r10, #0
+  MOV          r11, #0
+  LDR          r8, [r5], #4         ,: Load y[0...1]
+  BLE celt_pitch_xcorr_edsp_process2_loop_done
+  LDR          r6, [r4], #4         ,: Load x[0...1]
+  LDR          r9, [r5], #4         ,: Load y[2...3]
+celt_pitch_xcorr_edsp_process2_loop4::
+  SMLABB       r10, r6, r8, r10     ,: sum0 = MAC16_16(sum0, x_0, y_0)
+  LDR          r7, [r4], #4
+  SMLABT       r11, r6, r8, r11     ,: sum1 = MAC16_16(sum1, x_0, y_1)
+  SUBS         r12, r12, #4         ,: j-=4
+  SMLATT       r10, r6, r8, r10     ,: sum0 = MAC16_16(sum0, x_1, y_1)
+  LDR          r8, [r5], #4
+  SMLATB       r11, r6, r9, r11     ,: sum1 = MAC16_16(sum1, x_1, y_2)
+  LDRGT        r6, [r4], #4
+  SMLABB       r10, r7, r9, r10     ,: sum0 = MAC16_16(sum0, x_2, y_2)
+  SMLABT       r11, r7, r9, r11     ,: sum1 = MAC16_16(sum1, x_2, y_3)
+  SMLATT       r10, r7, r9, r10     ,: sum0 = MAC16_16(sum0, x_3, y_3)
+  LDRGT        r9, [r5], #4
+  SMLATB       r11, r7, r8, r11     ,: sum1 = MAC16_16(sum1, x_3, y_4)
+  BGT celt_pitch_xcorr_edsp_process2_loop4
+celt_pitch_xcorr_edsp_process2_loop_done::
+  ADDS         r12, r12, #2
+  BLE  celt_pitch_xcorr_edsp_process2_1
+  LDR          r6, [r4], #4
+  ,: Stall
+  SMLABB       r10, r6, r8, r10     ,: sum0 = MAC16_16(sum0, x_0, y_0)
+  LDR          r9, [r5], #4
+  SMLABT       r11, r6, r8, r11     ,: sum1 = MAC16_16(sum1, x_0, y_1)
+  SUB          r12, r12, #2
+  SMLATT       r10, r6, r8, r10     ,: sum0 = MAC16_16(sum0, x_1, y_1)
+  MOV          r8, r9
+  SMLATB       r11, r6, r9, r11     ,: sum1 = MAC16_16(sum1, x_1, y_2)
+celt_pitch_xcorr_edsp_process2_1::
+  LDRH         r6, [r4], #2
+  ADDS         r12, r12, #1
+  ,: Stall
+  SMLABB       r10, r6, r8, r10     ,: sum0 = MAC16_16(sum0, x_0, y_0)
+  LDRHGT       r7, [r4], #2
+  SMLABT       r11, r6, r8, r11     ,: sum1 = MAC16_16(sum1, x_0, y_1)
+  BLE celt_pitch_xcorr_edsp_process2_done
+  LDRH         r9, [r5], #2
+  SMLABT       r10, r7, r8, r10     ,: sum0 = MAC16_16(sum0, x_0, y_1)
+  SMLABB       r11, r7, r9, r11     ,: sum1 = MAC16_16(sum1, x_0, y_2)
+celt_pitch_xcorr_edsp_process2_done::
+  ,: Restore _x
+  SUB          r4, r4, r3, LSL #1
+  ,: Restore and advance _y
+  SUB          r5, r5, r3, LSL #1
+  ,: maxcorr = max(maxcorr, sum0)
+  CMP          r0, r10
+  ADD          r5, r5, #2
+  MOVLT        r0, r10
+  SUB          r1, r1, #2           ,: max_pitch -= 2
+  ,: maxcorr = max(maxcorr, sum1)
+  CMP          r0, r11
+  ,: xcorr[i] = sum
+  STR          r10, [r2], #4
+  MOVLT        r0, r11
+  STR          r11, [r2], #4
+celt_pitch_xcorr_edsp_process1a::
+  ADDS         r1, r1, #1           ,: No sums left?
+  BLT celt_pitch_xcorr_edsp_done
+  SUBS         r12, r3, #4
+  ,: r14 = sum = 0
+  MOV          r14, #0
+  BLT celt_pitch_xcorr_edsp_process1a_loop_done
+  LDR          r6, [r4], #4         ,: Load x[0...1]
+  LDR          r8, [r5], #4         ,: Load y[0...1]
+  LDR          r7, [r4], #4         ,: Load x[2...3]
+  LDR          r9, [r5], #4         ,: Load y[2...3]
+celt_pitch_xcorr_edsp_process1a_loop4::
+  SMLABB       r14, r6, r8, r14     ,: sum = MAC16_16(sum, x_0, y_0)
+  SUBS         r12, r12, #4         ,: j-=4
+  SMLATT       r14, r6, r8, r14     ,: sum = MAC16_16(sum, x_1, y_1)
+  LDRGE        r6, [r4], #4
+  SMLABB       r14, r7, r9, r14     ,: sum = MAC16_16(sum, x_2, y_2)
+  LDRGE        r8, [r5], #4
+  SMLATT       r14, r7, r9, r14     ,: sum = MAC16_16(sum, x_3, y_3)
+  LDRGE        r7, [r4], #4
+  LDRGE        r9, [r5], #4
+  BGE celt_pitch_xcorr_edsp_process1a_loop4
+celt_pitch_xcorr_edsp_process1a_loop_done::
+  ADDS         r12, r12, #2         ,: At least 2 samples left?
+  LDRGE        r6, [r4], #4
+  LDRGE        r8, [r5], #4
+  ,: Stall
+  SMLABBGE     r14, r6, r8, r14     ,: sum = MAC16_16(sum, x_0, y_0)
+  SUBGE        r12, r12, #2
+  SMLATTGE     r14, r6, r8, r14     ,: sum = MAC16_16(sum, x_1, y_1)
+  ADDS         r12, r12, #1         ,: One last sample left?
+  LDRHGE       r6, [r4], #2
+  LDRHGE       r8, [r5], #2
+  ,: Stall
+  SMLABBGE     r14, r6, r8, r14     ,: sum = MAC16_16(sum, *x, *y)
+  ,: maxcorr = max(maxcorr, sum)
+  CMP          r0, r14
+  ,: xcorr[i] = sum
+  STR          r14, [r2], #4
+  MOVLT        r0, r14
+celt_pitch_xcorr_edsp_done::
+  LDMFD        sp!, {r4-r11, pc}    ,: Restore registers; maxcorr is in r0
+	.size celt_pitch_xcorr_edsp, .-celt_pitch_xcorr_edsp  ,: @ ENDP
+
+ .endif
+
+,: @ END:
+    .section	.note.GNU-stack,"",%progbits
diff --git a/celt/arm/celt_pitch_xcorr_arm.s b/celt/arm/celt_pitch_xcorr_arm.s
deleted file mode 100644
index 09917b1..0000000
--- a/celt/arm/celt_pitch_xcorr_arm.s
+++ /dev/null
@@ -1,545 +0,0 @@
-; Copyright (c) 2007-2008 CSIRO
-; Copyright (c) 2007-2009 Xiph.Org Foundation
-; Copyright (c) 2013      Parrot
-; Written by Aurélien Zanelli
-;
-; Redistribution and use in source and binary forms, with or without
-; modification, are permitted provided that the following conditions
-; are met:
-;
-; - Redistributions of source code must retain the above copyright
-; notice, this list of conditions and the following disclaimer.
-;
-; - Redistributions in binary form must reproduce the above copyright
-; notice, this list of conditions and the following disclaimer in the
-; documentation and/or other materials provided with the distribution.
-;
-; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
-; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-  AREA  |.text|, CODE, READONLY
-
-  GET    celt/arm/armopts.s
-
-IF OPUS_ARM_MAY_HAVE_EDSP
-  EXPORT celt_pitch_xcorr_edsp
-ENDIF
-
-IF OPUS_ARM_MAY_HAVE_NEON
-  EXPORT celt_pitch_xcorr_neon
-ENDIF
-
-IF OPUS_ARM_MAY_HAVE_NEON
-
-; Compute sum[k]=sum(x[j]*y[j+k],j=0...len-1), k=0...3
-xcorr_kernel_neon PROC
-  ; input:
-  ;   r3     = int         len
-  ;   r4     = opus_val16 *x
-  ;   r5     = opus_val16 *y
-  ;   q0     = opus_val32  sum[4]
-  ; output:
-  ;   q0     = opus_val32  sum[4]
-  ; preserved: r0-r3, r6-r11, d2, q4-q7, q9-q15
-  ; internal usage:
-  ;   r12 = int j
-  ;   d3  = y_3|y_2|y_1|y_0
-  ;   q2  = y_B|y_A|y_9|y_8|y_7|y_6|y_5|y_4
-  ;   q3  = x_7|x_6|x_5|x_4|x_3|x_2|x_1|x_0
-  ;   q8  = scratch
-  ;
-  ; Load y[0...3]
-  ; This requires len>0 to always be valid (which we assert in the C code).
-  VLD1.16      {d5}, [r5]!
-  SUBS         r12, r3, #8
-  BLE xcorr_kernel_neon_process4
-; Process 8 samples at a time.
-; This loop loads one y value more than we actually need. Therefore we have to
-; stop as soon as there are 8 or fewer samples left (instead of 7), to avoid
-; reading past the end of the array.
-xcorr_kernel_neon_process8
-  ; This loop has 19 total instructions (10 cycles to issue, minimum), with
-  ; - 2 cycles of ARM insrtuctions,
-  ; - 10 cycles of load/store/byte permute instructions, and
-  ; - 9 cycles of data processing instructions.
-  ; On a Cortex A8, we dual-issue the maximum amount (9 cycles) between the
-  ; latter two categories, meaning the whole loop should run in 10 cycles per
-  ; iteration, barring cache misses.
-  ;
-  ; Load x[0...7]
-  VLD1.16      {d6, d7}, [r4]!
-  ; Unlike VMOV, VAND is a data processsing instruction (and doesn't get
-  ; assembled to VMOV, like VORR would), so it dual-issues with the prior VLD1.
-  VAND         d3, d5, d5
-  SUBS         r12, r12, #8
-  ; Load y[4...11]
-  VLD1.16      {d4, d5}, [r5]!
-  VMLAL.S16    q0, d3, d6[0]
-  VEXT.16      d16, d3, d4, #1
-  VMLAL.S16    q0, d4, d7[0]
-  VEXT.16      d17, d4, d5, #1
-  VMLAL.S16    q0, d16, d6[1]
-  VEXT.16      d16, d3, d4, #2
-  VMLAL.S16    q0, d17, d7[1]
-  VEXT.16      d17, d4, d5, #2
-  VMLAL.S16    q0, d16, d6[2]
-  VEXT.16      d16, d3, d4, #3
-  VMLAL.S16    q0, d17, d7[2]
-  VEXT.16      d17, d4, d5, #3
-  VMLAL.S16    q0, d16, d6[3]
-  VMLAL.S16    q0, d17, d7[3]
-  BGT xcorr_kernel_neon_process8
-; Process 4 samples here if we have > 4 left (still reading one extra y value).
-xcorr_kernel_neon_process4
-  ADDS         r12, r12, #4
-  BLE xcorr_kernel_neon_process2
-  ; Load x[0...3]
-  VLD1.16      d6, [r4]!
-  ; Use VAND since it's a data processing instruction again.
-  VAND         d4, d5, d5
-  SUB          r12, r12, #4
-  ; Load y[4...7]
-  VLD1.16      d5, [r5]!
-  VMLAL.S16    q0, d4, d6[0]
-  VEXT.16      d16, d4, d5, #1
-  VMLAL.S16    q0, d16, d6[1]
-  VEXT.16      d16, d4, d5, #2
-  VMLAL.S16    q0, d16, d6[2]
-  VEXT.16      d16, d4, d5, #3
-  VMLAL.S16    q0, d16, d6[3]
-; Process 2 samples here if we have > 2 left (still reading one extra y value).
-xcorr_kernel_neon_process2
-  ADDS         r12, r12, #2
-  BLE xcorr_kernel_neon_process1
-  ; Load x[0...1]
-  VLD2.16      {d6[],d7[]}, [r4]!
-  ; Use VAND since it's a data processing instruction again.
-  VAND         d4, d5, d5
-  SUB          r12, r12, #2
-  ; Load y[4...5]
-  VLD1.32      {d5[]}, [r5]!
-  VMLAL.S16    q0, d4, d6
-  VEXT.16      d16, d4, d5, #1
-  ; Replace bottom copy of {y5,y4} in d5 with {y3,y2} from d4, using VSRI
-  ; instead of VEXT, since it's a data-processing instruction.
-  VSRI.64      d5, d4, #32
-  VMLAL.S16    q0, d16, d7
-; Process 1 sample using the extra y value we loaded above.
-xcorr_kernel_neon_process1
-  ; Load next *x
-  VLD1.16      {d6[]}, [r4]!
-  ADDS         r12, r12, #1
-  ; y[0...3] are left in d5 from prior iteration(s) (if any)
-  VMLAL.S16    q0, d5, d6
-  MOVLE        pc, lr
-; Now process 1 last sample, not reading ahead.
-  ; Load last *y
-  VLD1.16      {d4[]}, [r5]!
-  VSRI.64      d4, d5, #16
-  ; Load last *x
-  VLD1.16      {d6[]}, [r4]!
-  VMLAL.S16    q0, d4, d6
-  MOV          pc, lr
-  ENDP
-
-; opus_val32 celt_pitch_xcorr_neon(opus_val16 *_x, opus_val16 *_y,
-;  opus_val32 *xcorr, int len, int max_pitch)
-celt_pitch_xcorr_neon PROC
-  ; input:
-  ;   r0  = opus_val16 *_x
-  ;   r1  = opus_val16 *_y
-  ;   r2  = opus_val32 *xcorr
-  ;   r3  = int         len
-  ; output:
-  ;   r0  = int         maxcorr
-  ; internal usage:
-  ;   r4  = opus_val16 *x (for xcorr_kernel_neon())
-  ;   r5  = opus_val16 *y (for xcorr_kernel_neon())
-  ;   r6  = int         max_pitch
-  ;   r12 = int         j
-  ;   q15 = int         maxcorr[4] (q15 is not used by xcorr_kernel_neon())
-  STMFD        sp!, {r4-r6, lr}
-  LDR          r6, [sp, #16]
-  VMOV.S32     q15, #1
-  ; if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done
-  SUBS         r6, r6, #4
-  BLT celt_pitch_xcorr_neon_process4_done
-celt_pitch_xcorr_neon_process4
-  ; xcorr_kernel_neon parameters:
-  ; r3 = len, r4 = _x, r5 = _y, q0 = {0, 0, 0, 0}
-  MOV          r4, r0
-  MOV          r5, r1
-  VEOR         q0, q0, q0
-  ; xcorr_kernel_neon only modifies r4, r5, r12, and q0...q3.
-  ; So we don't save/restore any other registers.
-  BL xcorr_kernel_neon
-  SUBS         r6, r6, #4
-  VST1.32      {q0}, [r2]!
-  ; _y += 4
-  ADD          r1, r1, #8
-  VMAX.S32     q15, q15, q0
-  ; if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done
-  BGE celt_pitch_xcorr_neon_process4
-; We have less than 4 sums left to compute.
-celt_pitch_xcorr_neon_process4_done
-  ADDS         r6, r6, #4
-  ; Reduce maxcorr to a single value
-  VMAX.S32     d30, d30, d31
-  VPMAX.S32    d30, d30, d30
-  ; if (max_pitch <= 0) goto celt_pitch_xcorr_neon_done
-  BLE celt_pitch_xcorr_neon_done
-; Now compute each remaining sum one at a time.
-celt_pitch_xcorr_neon_process_remaining
-  MOV          r4, r0
-  MOV          r5, r1
-  VMOV.I32     q0, #0
-  SUBS         r12, r3, #8
-  BLT celt_pitch_xcorr_neon_process_remaining4
-; Sum terms 8 at a time.
-celt_pitch_xcorr_neon_process_remaining_loop8
-  ; Load x[0...7]
-  VLD1.16      {q1}, [r4]!
-  ; Load y[0...7]
-  VLD1.16      {q2}, [r5]!
-  SUBS         r12, r12, #8
-  VMLAL.S16    q0, d4, d2
-  VMLAL.S16    q0, d5, d3
-  BGE celt_pitch_xcorr_neon_process_remaining_loop8
-; Sum terms 4 at a time.
-celt_pitch_xcorr_neon_process_remaining4
-  ADDS         r12, r12, #4
-  BLT celt_pitch_xcorr_neon_process_remaining4_done
-  ; Load x[0...3]
-  VLD1.16      {d2}, [r4]!
-  ; Load y[0...3]
-  VLD1.16      {d3}, [r5]!
-  SUB          r12, r12, #4
-  VMLAL.S16    q0, d3, d2
-celt_pitch_xcorr_neon_process_remaining4_done
-  ; Reduce the sum to a single value.
-  VADD.S32     d0, d0, d1
-  VPADDL.S32   d0, d0
-  ADDS         r12, r12, #4
-  BLE celt_pitch_xcorr_neon_process_remaining_loop_done
-; Sum terms 1 at a time.
-celt_pitch_xcorr_neon_process_remaining_loop1
-  VLD1.16      {d2[]}, [r4]!
-  VLD1.16      {d3[]}, [r5]!
-  SUBS         r12, r12, #1
-  VMLAL.S16    q0, d2, d3
-  BGT celt_pitch_xcorr_neon_process_remaining_loop1
-celt_pitch_xcorr_neon_process_remaining_loop_done
-  VST1.32      {d0[0]}, [r2]!
-  VMAX.S32     d30, d30, d0
-  SUBS         r6, r6, #1
-  ; _y++
-  ADD          r1, r1, #2
-  ; if (--max_pitch > 0) goto celt_pitch_xcorr_neon_process_remaining
-  BGT celt_pitch_xcorr_neon_process_remaining
-celt_pitch_xcorr_neon_done
-  VMOV.32      r0, d30[0]
-  LDMFD        sp!, {r4-r6, pc}
-  ENDP
-
-ENDIF
-
-IF OPUS_ARM_MAY_HAVE_EDSP
-
-; This will get used on ARMv7 devices without NEON, so it has been optimized
-; to take advantage of dual-issuing where possible.
-xcorr_kernel_edsp PROC
-  ; input:
-  ;   r3      = int         len
-  ;   r4      = opus_val16 *_x (must be 32-bit aligned)
-  ;   r5      = opus_val16 *_y (must be 32-bit aligned)
-  ;   r6...r9 = opus_val32  sum[4]
-  ; output:
-  ;   r6...r9 = opus_val32  sum[4]
-  ; preserved: r0-r5
-  ; internal usage
-  ;   r2      = int         j
-  ;   r12,r14 = opus_val16  x[4]
-  ;   r10,r11 = opus_val16  y[4]
-  STMFD        sp!, {r2,r4,r5,lr}
-  LDR          r10, [r5], #4      ; Load y[0...1]
-  SUBS         r2, r3, #4         ; j = len-4
-  LDR          r11, [r5], #4      ; Load y[2...3]
-  BLE xcorr_kernel_edsp_process4_done
-  LDR          r12, [r4], #4      ; Load x[0...1]
-  ; Stall
-xcorr_kernel_edsp_process4
-  ; The multiplies must issue from pipeline 0, and can't dual-issue with each
-  ; other. Every other instruction here dual-issues with a multiply, and is
-  ; thus "free". There should be no stalls in the body of the loop.
-  SMLABB       r6, r12, r10, r6   ; sum[0] = MAC16_16(sum[0],x_0,y_0)
-  LDR          r14, [r4], #4      ; Load x[2...3]
-  SMLABT       r7, r12, r10, r7   ; sum[1] = MAC16_16(sum[1],x_0,y_1)
-  SUBS         r2, r2, #4         ; j-=4
-  SMLABB       r8, r12, r11, r8   ; sum[2] = MAC16_16(sum[2],x_0,y_2)
-  SMLABT       r9, r12, r11, r9   ; sum[3] = MAC16_16(sum[3],x_0,y_3)
-  SMLATT       r6, r12, r10, r6   ; sum[0] = MAC16_16(sum[0],x_1,y_1)
-  LDR          r10, [r5], #4      ; Load y[4...5]
-  SMLATB       r7, r12, r11, r7   ; sum[1] = MAC16_16(sum[1],x_1,y_2)
-  SMLATT       r8, r12, r11, r8   ; sum[2] = MAC16_16(sum[2],x_1,y_3)
-  SMLATB       r9, r12, r10, r9   ; sum[3] = MAC16_16(sum[3],x_1,y_4)
-  LDRGT        r12, [r4], #4      ; Load x[0...1]
-  SMLABB       r6, r14, r11, r6   ; sum[0] = MAC16_16(sum[0],x_2,y_2)
-  SMLABT       r7, r14, r11, r7   ; sum[1] = MAC16_16(sum[1],x_2,y_3)
-  SMLABB       r8, r14, r10, r8   ; sum[2] = MAC16_16(sum[2],x_2,y_4)
-  SMLABT       r9, r14, r10, r9   ; sum[3] = MAC16_16(sum[3],x_2,y_5)
-  SMLATT       r6, r14, r11, r6   ; sum[0] = MAC16_16(sum[0],x_3,y_3)
-  LDR          r11, [r5], #4      ; Load y[6...7]
-  SMLATB       r7, r14, r10, r7   ; sum[1] = MAC16_16(sum[1],x_3,y_4)
-  SMLATT       r8, r14, r10, r8   ; sum[2] = MAC16_16(sum[2],x_3,y_5)
-  SMLATB       r9, r14, r11, r9   ; sum[3] = MAC16_16(sum[3],x_3,y_6)
-  BGT xcorr_kernel_edsp_process4
-xcorr_kernel_edsp_process4_done
-  ADDS         r2, r2, #4
-  BLE xcorr_kernel_edsp_done
-  LDRH         r12, [r4], #2      ; r12 = *x++
-  SUBS         r2, r2, #1         ; j--
-  ; Stall
-  SMLABB       r6, r12, r10, r6   ; sum[0] = MAC16_16(sum[0],x,y_0)
-  LDRGTH       r14, [r4], #2      ; r14 = *x++
-  SMLABT       r7, r12, r10, r7   ; sum[1] = MAC16_16(sum[1],x,y_1)
-  SMLABB       r8, r12, r11, r8   ; sum[2] = MAC16_16(sum[2],x,y_2)
-  SMLABT       r9, r12, r11, r9   ; sum[3] = MAC16_16(sum[3],x,y_3)
-  BLE xcorr_kernel_edsp_done
-  SMLABT       r6, r14, r10, r6   ; sum[0] = MAC16_16(sum[0],x,y_1)
-  SUBS         r2, r2, #1         ; j--
-  SMLABB       r7, r14, r11, r7   ; sum[1] = MAC16_16(sum[1],x,y_2)
-  LDRH         r10, [r5], #2      ; r10 = y_4 = *y++
-  SMLABT       r8, r14, r11, r8   ; sum[2] = MAC16_16(sum[2],x,y_3)
-  LDRGTH       r12, [r4], #2      ; r12 = *x++
-  SMLABB       r9, r14, r10, r9   ; sum[3] = MAC16_16(sum[3],x,y_4)
-  BLE xcorr_kernel_edsp_done
-  SMLABB       r6, r12, r11, r6   ; sum[0] = MAC16_16(sum[0],tmp,y_2)
-  CMP          r2, #1             ; j--
-  SMLABT       r7, r12, r11, r7   ; sum[1] = MAC16_16(sum[1],tmp,y_3)
-  LDRH         r2, [r5], #2       ; r2 = y_5 = *y++
-  SMLABB       r8, r12, r10, r8   ; sum[2] = MAC16_16(sum[2],tmp,y_4)
-  LDRGTH       r14, [r4]          ; r14 = *x
-  SMLABB       r9, r12, r2, r9    ; sum[3] = MAC16_16(sum[3],tmp,y_5)
-  BLE xcorr_kernel_edsp_done
-  SMLABT       r6, r14, r11, r6   ; sum[0] = MAC16_16(sum[0],tmp,y_3)
-  LDRH         r11, [r5]          ; r11 = y_6 = *y
-  SMLABB       r7, r14, r10, r7   ; sum[1] = MAC16_16(sum[1],tmp,y_4)
-  SMLABB       r8, r14, r2, r8    ; sum[2] = MAC16_16(sum[2],tmp,y_5)
-  SMLABB       r9, r14, r11, r9   ; sum[3] = MAC16_16(sum[3],tmp,y_6)
-xcorr_kernel_edsp_done
-  LDMFD        sp!, {r2,r4,r5,pc}
-  ENDP
-
-celt_pitch_xcorr_edsp PROC
-  ; input:
-  ;   r0  = opus_val16 *_x (must be 32-bit aligned)
-  ;   r1  = opus_val16 *_y (only needs to be 16-bit aligned)
-  ;   r2  = opus_val32 *xcorr
-  ;   r3  = int         len
-  ; output:
-  ;   r0  = maxcorr
-  ; internal usage
-  ;   r4  = opus_val16 *x
-  ;   r5  = opus_val16 *y
-  ;   r6  = opus_val32  sum0
-  ;   r7  = opus_val32  sum1
-  ;   r8  = opus_val32  sum2
-  ;   r9  = opus_val32  sum3
-  ;   r1  = int         max_pitch
-  ;   r12 = int         j
-  STMFD        sp!, {r4-r11, lr}
-  MOV          r5, r1
-  LDR          r1, [sp, #36]
-  MOV          r4, r0
-  TST          r5, #3
-  ; maxcorr = 1
-  MOV          r0, #1
-  BEQ          celt_pitch_xcorr_edsp_process1u_done
-; Compute one sum at the start to make y 32-bit aligned.
-  SUBS         r12, r3, #4
-  ; r14 = sum = 0
-  MOV          r14, #0
-  LDRH         r8, [r5], #2
-  BLE celt_pitch_xcorr_edsp_process1u_loop4_done
-  LDR          r6, [r4], #4
-  MOV          r8, r8, LSL #16
-celt_pitch_xcorr_edsp_process1u_loop4
-  LDR          r9, [r5], #4
-  SMLABT       r14, r6, r8, r14     ; sum = MAC16_16(sum, x_0, y_0)
-  LDR          r7, [r4], #4
-  SMLATB       r14, r6, r9, r14     ; sum = MAC16_16(sum, x_1, y_1)
-  LDR          r8, [r5], #4
-  SMLABT       r14, r7, r9, r14     ; sum = MAC16_16(sum, x_2, y_2)
-  SUBS         r12, r12, #4         ; j-=4
-  SMLATB       r14, r7, r8, r14     ; sum = MAC16_16(sum, x_3, y_3)
-  LDRGT        r6, [r4], #4
-  BGT celt_pitch_xcorr_edsp_process1u_loop4
-  MOV          r8, r8, LSR #16
-celt_pitch_xcorr_edsp_process1u_loop4_done
-  ADDS         r12, r12, #4
-celt_pitch_xcorr_edsp_process1u_loop1
-  LDRGEH       r6, [r4], #2
-  ; Stall
-  SMLABBGE     r14, r6, r8, r14    ; sum = MAC16_16(sum, *x, *y)
-  SUBGES       r12, r12, #1
-  LDRGTH       r8, [r5], #2
-  BGT celt_pitch_xcorr_edsp_process1u_loop1
-  ; Restore _x
-  SUB          r4, r4, r3, LSL #1
-  ; Restore and advance _y
-  SUB          r5, r5, r3, LSL #1
-  ; maxcorr = max(maxcorr, sum)
-  CMP          r0, r14
-  ADD          r5, r5, #2
-  MOVLT        r0, r14
-  SUBS         r1, r1, #1
-  ; xcorr[i] = sum
-  STR          r14, [r2], #4
-  BLE celt_pitch_xcorr_edsp_done
-celt_pitch_xcorr_edsp_process1u_done
-  ; if (max_pitch < 4) goto celt_pitch_xcorr_edsp_process2
-  SUBS         r1, r1, #4
-  BLT celt_pitch_xcorr_edsp_process2
-celt_pitch_xcorr_edsp_process4
-  ; xcorr_kernel_edsp parameters:
-  ; r3 = len, r4 = _x, r5 = _y, r6...r9 = sum[4] = {0, 0, 0, 0}
-  MOV          r6, #0
-  MOV          r7, #0
-  MOV          r8, #0
-  MOV          r9, #0
-  BL xcorr_kernel_edsp  ; xcorr_kernel_edsp(_x, _y+i, xcorr+i, len)
-  ; maxcorr = max(maxcorr, sum0, sum1, sum2, sum3)
-  CMP          r0, r6
-  ; _y+=4
-  ADD          r5, r5, #8
-  MOVLT        r0, r6
-  CMP          r0, r7
-  MOVLT        r0, r7
-  CMP          r0, r8
-  MOVLT        r0, r8
-  CMP          r0, r9
-  MOVLT        r0, r9
-  STMIA        r2!, {r6-r9}
-  SUBS         r1, r1, #4
-  BGE celt_pitch_xcorr_edsp_process4
-celt_pitch_xcorr_edsp_process2
-  ADDS         r1, r1, #2
-  BLT celt_pitch_xcorr_edsp_process1a
-  SUBS         r12, r3, #4
-  ; {r10, r11} = {sum0, sum1} = {0, 0}
-  MOV          r10, #0
-  MOV          r11, #0
-  LDR          r8, [r5], #4
-  BLE celt_pitch_xcorr_edsp_process2_loop_done
-  LDR          r6, [r4], #4
-  LDR          r9, [r5], #4
-celt_pitch_xcorr_edsp_process2_loop4
-  SMLABB       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_0, y_0)
-  LDR          r7, [r4], #4
-  SMLABT       r11, r6, r8, r11     ; sum1 = MAC16_16(sum1, x_0, y_1)
-  SUBS         r12, r12, #4         ; j-=4
-  SMLATT       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_1, y_1)
-  LDR          r8, [r5], #4
-  SMLATB       r11, r6, r9, r11     ; sum1 = MAC16_16(sum1, x_1, y_2)
-  LDRGT        r6, [r4], #4
-  SMLABB       r10, r7, r9, r10     ; sum0 = MAC16_16(sum0, x_2, y_2)
-  SMLABT       r11, r7, r9, r11     ; sum1 = MAC16_16(sum1, x_2, y_3)
-  SMLATT       r10, r7, r9, r10     ; sum0 = MAC16_16(sum0, x_3, y_3)
-  LDRGT        r9, [r5], #4
-  SMLATB       r11, r7, r8, r11     ; sum1 = MAC16_16(sum1, x_3, y_4)
-  BGT celt_pitch_xcorr_edsp_process2_loop4
-celt_pitch_xcorr_edsp_process2_loop_done
-  ADDS         r12, r12, #2
-  BLE  celt_pitch_xcorr_edsp_process2_1
-  LDR          r6, [r4], #4
-  ; Stall
-  SMLABB       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_0, y_0)
-  LDR          r9, [r5], #4
-  SMLABT       r11, r6, r8, r11     ; sum1 = MAC16_16(sum1, x_0, y_1)
-  SUB          r12, r12, #2
-  SMLATT       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_1, y_1)
-  MOV          r8, r9
-  SMLATB       r11, r6, r9, r11     ; sum1 = MAC16_16(sum1, x_1, y_2)
-celt_pitch_xcorr_edsp_process2_1
-  LDRH         r6, [r4], #2
-  ADDS         r12, r12, #1
-  ; Stall
-  SMLABB       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_0, y_0)
-  LDRGTH       r7, [r4], #2
-  SMLABT       r11, r6, r8, r11     ; sum1 = MAC16_16(sum1, x_0, y_1)
-  BLE celt_pitch_xcorr_edsp_process2_done
-  LDRH         r9, [r5], #2
-  SMLABT       r10, r7, r8, r10     ; sum0 = MAC16_16(sum0, x_0, y_1)
-  SMLABB       r11, r7, r9, r11     ; sum1 = MAC16_16(sum1, x_0, y_2)
-celt_pitch_xcorr_edsp_process2_done
-  ; Restore _x
-  SUB          r4, r4, r3, LSL #1
-  ; Restore and advance _y
-  SUB          r5, r5, r3, LSL #1
-  ; maxcorr = max(maxcorr, sum0)
-  CMP          r0, r10
-  ADD          r5, r5, #2
-  MOVLT        r0, r10
-  SUB          r1, r1, #2
-  ; maxcorr = max(maxcorr, sum1)
-  CMP          r0, r11
-  ; xcorr[i] = sum
-  STR          r10, [r2], #4
-  MOVLT        r0, r11
-  STR          r11, [r2], #4
-celt_pitch_xcorr_edsp_process1a
-  ADDS         r1, r1, #1
-  BLT celt_pitch_xcorr_edsp_done
-  SUBS         r12, r3, #4
-  ; r14 = sum = 0
-  MOV          r14, #0
-  BLT celt_pitch_xcorr_edsp_process1a_loop_done
-  LDR          r6, [r4], #4
-  LDR          r8, [r5], #4
-  LDR          r7, [r4], #4
-  LDR          r9, [r5], #4
-celt_pitch_xcorr_edsp_process1a_loop4
-  SMLABB       r14, r6, r8, r14     ; sum = MAC16_16(sum, x_0, y_0)
-  SUBS         r12, r12, #4         ; j-=4
-  SMLATT       r14, r6, r8, r14     ; sum = MAC16_16(sum, x_1, y_1)
-  LDRGE        r6, [r4], #4
-  SMLABB       r14, r7, r9, r14     ; sum = MAC16_16(sum, x_2, y_2)
-  LDRGE        r8, [r5], #4
-  SMLATT       r14, r7, r9, r14     ; sum = MAC16_16(sum, x_3, y_3)
-  LDRGE        r7, [r4], #4
-  LDRGE        r9, [r5], #4
-  BGE celt_pitch_xcorr_edsp_process1a_loop4
-celt_pitch_xcorr_edsp_process1a_loop_done
-  ADDS         r12, r12, #2
-  LDRGE        r6, [r4], #4
-  LDRGE        r8, [r5], #4
-  ; Stall
-  SMLABBGE     r14, r6, r8, r14     ; sum = MAC16_16(sum, x_0, y_0)
-  SUBGE        r12, r12, #2
-  SMLATTGE     r14, r6, r8, r14     ; sum = MAC16_16(sum, x_1, y_1)
-  ADDS         r12, r12, #1
-  LDRGEH       r6, [r4], #2
-  LDRGEH       r8, [r5], #2
-  ; Stall
-  SMLABBGE     r14, r6, r8, r14     ; sum = MAC16_16(sum, *x, *y)
-  ; maxcorr = max(maxcorr, sum)
-  CMP          r0, r14
-  ; xcorr[i] = sum
-  STR          r14, [r2], #4
-  MOVLT        r0, r14
-celt_pitch_xcorr_edsp_done
-  LDMFD        sp!, {r4-r11, pc}
-  ENDP
-
-ENDIF
-
-END
diff --git a/celt/arm/celt_pitch_xcorr_arm_gnu.s b/celt/arm/celt_pitch_xcorr_arm_gnu.s
new file mode 100644
index 0000000..4be1705
--- /dev/null
+++ b/celt/arm/celt_pitch_xcorr_arm_gnu.s
@@ -0,0 +1,551 @@
+    .syntax unified
+@ Copyright (c) 2007-2008 CSIRO
+@ Copyright (c) 2007-2009 Xiph.Org Foundation
+@ Copyright (c) 2013      Parrot
+@ Written by Aurélien Zanelli
+@
+@ Redistribution and use in source and binary forms, with or without
+@ modification, are permitted provided that the following conditions
+@ are met:
+@
+@ - Redistributions of source code must retain the above copyright
+@ notice, this list of conditions and the following disclaimer.
+@
+@ - Redistributions in binary form must reproduce the above copyright
+@ notice, this list of conditions and the following disclaimer in the
+@ documentation and/or other materials provided with the distribution.
+@
+@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+@ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+@ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+@ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+@ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+@ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+@ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+@ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+@ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+@ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    .text;   .p2align 2;   .arch armv7-a
+   .fpu neon
+   .object_arch armv4t
+
+  .include "celt/arm/armopts_gnu.s"
+
+ .if OPUS_ARM_MAY_HAVE_EDSP
+  .global celt_pitch_xcorr_edsp
+ .endif
+
+ .if OPUS_ARM_MAY_HAVE_NEON
+  .global celt_pitch_xcorr_neon
+ .endif
+
+ .if OPUS_ARM_MAY_HAVE_NEON
+
+@ Compute sum[k]=sum(x[j]*y[j+k],j=0...len-1), k=0...3
+; xcorr_kernel_neon: @ PROC (the leading ';' is only a statement separator in GNU as, so this label is defined)
+xcorr_kernel_neon_start:
+  @ input:
+  @   r3     = int         len
+  @   r4     = opus_val16 *x
+  @   r5     = opus_val16 *y
+  @   q0     = opus_val32  sum[4]
+  @ output:
+  @   q0     = opus_val32  sum[4]
+  @ preserved: r0-r3, r6-r11, d2, q4-q7, q9-q15
+  @ internal usage:
+  @   r12 = int j
+  @   d3  = y_3|y_2|y_1|y_0
+  @   q2  = y_B|y_A|y_9|y_8|y_7|y_6|y_5|y_4
+  @   q3  = x_7|x_6|x_5|x_4|x_3|x_2|x_1|x_0
+  @   q8  = scratch
+  @
+  @ Load y[0...3]
+  @ This requires len>0 to always be valid (which we assert in the C code).
+  VLD1.16      {d5}, [r5]!
+  SUBS         r12, r3, #8          @ j = len-8
+  BLE xcorr_kernel_neon_process4    @ 8 or fewer samples: skip the 8-wide loop
+@ Process 8 samples at a time.
+@ This loop loads one y value more than we actually need. Therefore we have to
+@ stop as soon as there are 8 or fewer samples left (instead of 7), to avoid
+@ reading past the end of the array.
+xcorr_kernel_neon_process8:
+  @ This loop has 19 total instructions (10 cycles to issue, minimum), with
+  @ - 2 cycles of ARM instructions,
+  @ - 10 cycles of load/store/byte permute instructions, and
+  @ - 9 cycles of data processing instructions.
+  @ On a Cortex A8, we dual-issue the maximum amount (9 cycles) between the
+  @ latter two categories, meaning the whole loop should run in 10 cycles per
+  @ iteration, barring cache misses.
+  @
+  @ Load x[0...7]
+  VLD1.16      {d6, d7}, [r4]!
+  @ Unlike VMOV, VAND is a data processing instruction (and doesn't get
+  @ assembled to VMOV, like VORR would), so it dual-issues with the prior VLD1.
+  VAND         d3, d5, d5
+  SUBS         r12, r12, #8         @ j -= 8
+  @ Load y[4...11]
+  VLD1.16      {d4, d5}, [r5]!
+  VMLAL.S16    q0, d3, d6[0]
+  VEXT.16      d16, d3, d4, #1
+  VMLAL.S16    q0, d4, d7[0]
+  VEXT.16      d17, d4, d5, #1
+  VMLAL.S16    q0, d16, d6[1]
+  VEXT.16      d16, d3, d4, #2
+  VMLAL.S16    q0, d17, d7[1]
+  VEXT.16      d17, d4, d5, #2
+  VMLAL.S16    q0, d16, d6[2]
+  VEXT.16      d16, d3, d4, #3
+  VMLAL.S16    q0, d17, d7[2]
+  VEXT.16      d17, d4, d5, #3
+  VMLAL.S16    q0, d16, d6[3]
+  VMLAL.S16    q0, d17, d7[3]
+  BGT xcorr_kernel_neon_process8    @ loop while j > 0 (flags from the SUBS above)
+@ Process 4 samples here if we have > 4 left (still reading one extra y value).
+xcorr_kernel_neon_process4:
+  ADDS         r12, r12, #4         @ j += 4
+  BLE xcorr_kernel_neon_process2    @ 4 or fewer samples left
+  @ Load x[0...3]
+  VLD1.16      d6, [r4]!
+  @ Use VAND since it's a data processing instruction again.
+  VAND         d4, d5, d5
+  SUB          r12, r12, #4         @ j -= 4
+  @ Load y[4...7]
+  VLD1.16      d5, [r5]!
+  VMLAL.S16    q0, d4, d6[0]
+  VEXT.16      d16, d4, d5, #1
+  VMLAL.S16    q0, d16, d6[1]
+  VEXT.16      d16, d4, d5, #2
+  VMLAL.S16    q0, d16, d6[2]
+  VEXT.16      d16, d4, d5, #3
+  VMLAL.S16    q0, d16, d6[3]
+@ Process 2 samples here if we have > 2 left (still reading one extra y value).
+xcorr_kernel_neon_process2:
+  ADDS         r12, r12, #2         @ j += 2
+  BLE xcorr_kernel_neon_process1    @ 2 or fewer samples left
+  @ Load x[0...1]
+  VLD2.16      {d6[],d7[]}, [r4]!
+  @ Use VAND since it's a data processing instruction again.
+  VAND         d4, d5, d5
+  SUB          r12, r12, #2         @ j -= 2
+  @ Load y[4...5]
+  VLD1.32      {d5[]}, [r5]!
+  VMLAL.S16    q0, d4, d6
+  VEXT.16      d16, d4, d5, #1
+  @ Replace bottom copy of {y5,y4} in d5 with {y3,y2} from d4, using VSRI
+  @ instead of VEXT, since it's a data-processing instruction.
+  VSRI.64      d5, d4, #32
+  VMLAL.S16    q0, d16, d7
+@ Process 1 sample using the extra y value we loaded above.
+xcorr_kernel_neon_process1:
+  @ Load next *x
+  VLD1.16      {d6[]}, [r4]!
+  ADDS         r12, r12, #1         @ j += 1; sets the flags for the MOVLE below
+  @ y[0...3] are left in d5 from prior iteration(s) (if any)
+  VMLAL.S16    q0, d5, d6
+  MOVLE        pc, lr               @ return if that was the last sample
+@ Now process 1 last sample, not reading ahead.
+  @ Load last *y
+  VLD1.16      {d4[]}, [r5]!
+  VSRI.64      d4, d5, #16
+  @ Load last *x
+  VLD1.16      {d6[]}, [r4]!
+  VMLAL.S16    q0, d4, d6
+  MOV          pc, lr               @ return
+	.size xcorr_kernel_neon, .-xcorr_kernel_neon  @ ENDP
+
+@ opus_val32 celt_pitch_xcorr_neon(opus_val16 *_x, opus_val16 *_y,
+@  opus_val32 *xcorr, int len, int max_pitch)
+; celt_pitch_xcorr_neon: @ PROC (the leading ';' is only a statement separator in GNU as, so this label is defined)
+  @ input:
+  @   r0  = opus_val16 *_x
+  @   r1  = opus_val16 *_y
+  @   r2  = opus_val32 *xcorr
+  @   r3  = int         len
+  @ output:
+  @   r0  = int         maxcorr
+  @ internal usage:
+  @   r4  = opus_val16 *x (for xcorr_kernel_neon())
+  @   r5  = opus_val16 *y (for xcorr_kernel_neon())
+  @   r6  = int         max_pitch
+  @   r12 = int         j
+  @   q15 = int         maxcorr[4] (q15 is not used by xcorr_kernel_neon())
+  STMFD        sp!, {r4-r6, lr}     @ 4 registers = 16 bytes
+  LDR          r6, [sp, #16]        @ r6 = max_pitch (stack argument, just above the saved registers)
+  VMOV.S32     q15, #1              @ maxcorr[4] = {1, 1, 1, 1}
+  @ if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done
+  SUBS         r6, r6, #4
+  BLT celt_pitch_xcorr_neon_process4_done
+celt_pitch_xcorr_neon_process4:
+  @ xcorr_kernel_neon parameters:
+  @ r3 = len, r4 = _x, r5 = _y, q0 = {0, 0, 0, 0}
+  MOV          r4, r0
+  MOV          r5, r1
+  VEOR         q0, q0, q0
+  @ xcorr_kernel_neon only modifies r4, r5, r12, and q0...q3.
+  @ So we don't save/restore any other registers.
+  BL xcorr_kernel_neon_start
+  SUBS         r6, r6, #4           @ max_pitch -= 4
+  VST1.32      {q0}, [r2]!          @ store the 4 sums to xcorr, advancing the pointer
+  @ _y += 4
+  ADD          r1, r1, #8
+  VMAX.S32     q15, q15, q0         @ per-lane running maximum
+  @ r6 is kept biased by -4, so GE here means at least 4 more sums remain
+  BGE celt_pitch_xcorr_neon_process4
+@ We have less than 4 sums left to compute.
+celt_pitch_xcorr_neon_process4_done:
+  ADDS         r6, r6, #4           @ undo the -4 bias: r6 = number of sums left
+  @ Reduce maxcorr to a single value
+  VMAX.S32     d30, d30, d31
+  VPMAX.S32    d30, d30, d30
+  @ if (max_pitch <= 0) goto celt_pitch_xcorr_neon_done
+  BLE celt_pitch_xcorr_neon_done
+@ Now compute each remaining sum one at a time.
+celt_pitch_xcorr_neon_process_remaining:
+  MOV          r4, r0
+  MOV          r5, r1
+  VMOV.I32     q0, #0
+  SUBS         r12, r3, #8          @ j = len-8
+  BLT celt_pitch_xcorr_neon_process_remaining4
+@ Sum terms 8 at a time.
+celt_pitch_xcorr_neon_process_remaining_loop8:
+  @ Load x[0...7]
+  VLD1.16      {q1}, [r4]!
+  @ Load y[0...7]
+  VLD1.16      {q2}, [r5]!
+  SUBS         r12, r12, #8         @ j -= 8
+  VMLAL.S16    q0, d4, d2
+  VMLAL.S16    q0, d5, d3
+  BGE celt_pitch_xcorr_neon_process_remaining_loop8
+@ Sum terms 4 at a time.
+celt_pitch_xcorr_neon_process_remaining4:
+  ADDS         r12, r12, #4         @ j += 4
+  BLT celt_pitch_xcorr_neon_process_remaining4_done
+  @ Load x[0...3]
+  VLD1.16      {d2}, [r4]!
+  @ Load y[0...3]
+  VLD1.16      {d3}, [r5]!
+  SUB          r12, r12, #4         @ j -= 4
+  VMLAL.S16    q0, d3, d2
+celt_pitch_xcorr_neon_process_remaining4_done:
+  @ Reduce the sum to a single value.
+  VADD.S32     d0, d0, d1
+  VPADDL.S32   d0, d0
+  ADDS         r12, r12, #4         @ j += 4: number of terms still to add one by one
+  BLE celt_pitch_xcorr_neon_process_remaining_loop_done
+@ Sum terms 1 at a time.
+celt_pitch_xcorr_neon_process_remaining_loop1:
+  VLD1.16      {d2[]}, [r4]!
+  VLD1.16      {d3[]}, [r5]!
+  SUBS         r12, r12, #1         @ j--
+  VMLAL.S16    q0, d2, d3
+  BGT celt_pitch_xcorr_neon_process_remaining_loop1
+celt_pitch_xcorr_neon_process_remaining_loop_done:
+  VST1.32      {d0[0]}, [r2]!       @ xcorr[i] = sum (lane 0 only)
+  VMAX.S32     d30, d30, d0         @ maxcorr = max(maxcorr, sum); only lane 0 is returned
+  SUBS         r6, r6, #1
+  @ _y++
+  ADD          r1, r1, #2
+  @ if (--max_pitch > 0) goto celt_pitch_xcorr_neon_process_remaining
+  BGT celt_pitch_xcorr_neon_process_remaining
+celt_pitch_xcorr_neon_done:
+  VMOV.32      r0, d30[0]           @ return value = maxcorr
+  LDMFD        sp!, {r4-r6, pc}     @ restore registers and return
+	.size celt_pitch_xcorr_neon, .-celt_pitch_xcorr_neon  @ ENDP
+
+ .endif
+
+ .if OPUS_ARM_MAY_HAVE_EDSP
+
+@ This will get used on ARMv7 devices without NEON, so it has been optimized
+@ to take advantage of dual-issuing where possible.
+; xcorr_kernel_edsp: @ PROC (the leading ';' is only a statement separator in GNU as, so this label is defined)
+xcorr_kernel_edsp_start:
+  @ input:
+  @   r3      = int         len
+  @   r4      = opus_val16 *_x (must be 32-bit aligned)
+  @   r5      = opus_val16 *_y (must be 32-bit aligned)
+  @   r6...r9 = opus_val32  sum[4]
+  @ output:
+  @   r6...r9 = opus_val32  sum[4]
+  @ preserved: r0-r5
+  @ internal usage
+  @   r2      = int         j
+  @   r12,r14 = opus_val16  x[4]
+  @   r10,r11 = opus_val16  y[4]
+  STMFD        sp!, {r2,r4,r5,lr} @ r2, r4, r5 are modified below and restored on exit
+  LDR          r10, [r5], #4      @ Load y[0...1]
+  SUBS         r2, r3, #4         @ j = len-4
+  LDR          r11, [r5], #4      @ Load y[2...3]
+  BLE xcorr_kernel_edsp_process4_done
+  LDR          r12, [r4], #4      @ Load x[0...1]
+  @ Stall
+xcorr_kernel_edsp_process4:
+  @ The multiplies must issue from pipeline 0, and can't dual-issue with each
+  @ other. Every other instruction here dual-issues with a multiply, and is
+  @ thus "free". There should be no stalls in the body of the loop.
+  SMLABB       r6, r12, r10, r6   @ sum[0] = MAC16_16(sum[0],x_0,y_0)
+  LDR          r14, [r4], #4      @ Load x[2...3]
+  SMLABT       r7, r12, r10, r7   @ sum[1] = MAC16_16(sum[1],x_0,y_1)
+  SUBS         r2, r2, #4         @ j-=4
+  SMLABB       r8, r12, r11, r8   @ sum[2] = MAC16_16(sum[2],x_0,y_2)
+  SMLABT       r9, r12, r11, r9   @ sum[3] = MAC16_16(sum[3],x_0,y_3)
+  SMLATT       r6, r12, r10, r6   @ sum[0] = MAC16_16(sum[0],x_1,y_1)
+  LDR          r10, [r5], #4      @ Load y[4...5]
+  SMLATB       r7, r12, r11, r7   @ sum[1] = MAC16_16(sum[1],x_1,y_2)
+  SMLATT       r8, r12, r11, r8   @ sum[2] = MAC16_16(sum[2],x_1,y_3)
+  SMLATB       r9, r12, r10, r9   @ sum[3] = MAC16_16(sum[3],x_1,y_4)
+  LDRGT        r12, [r4], #4      @ Load x[0...1]
+  SMLABB       r6, r14, r11, r6   @ sum[0] = MAC16_16(sum[0],x_2,y_2)
+  SMLABT       r7, r14, r11, r7   @ sum[1] = MAC16_16(sum[1],x_2,y_3)
+  SMLABB       r8, r14, r10, r8   @ sum[2] = MAC16_16(sum[2],x_2,y_4)
+  SMLABT       r9, r14, r10, r9   @ sum[3] = MAC16_16(sum[3],x_2,y_5)
+  SMLATT       r6, r14, r11, r6   @ sum[0] = MAC16_16(sum[0],x_3,y_3)
+  LDR          r11, [r5], #4      @ Load y[6...7]
+  SMLATB       r7, r14, r10, r7   @ sum[1] = MAC16_16(sum[1],x_3,y_4)
+  SMLATT       r8, r14, r10, r8   @ sum[2] = MAC16_16(sum[2],x_3,y_5)
+  SMLATB       r9, r14, r11, r9   @ sum[3] = MAC16_16(sum[3],x_3,y_6)
+  BGT xcorr_kernel_edsp_process4
+xcorr_kernel_edsp_process4_done:
+  ADDS         r2, r2, #4         @ j += 4: samples still to process
+  BLE xcorr_kernel_edsp_done
+  LDRH         r12, [r4], #2      @ r12 = *x++
+  SUBS         r2, r2, #1         @ j--
+  @ Stall
+  SMLABB       r6, r12, r10, r6   @ sum[0] = MAC16_16(sum[0],x,y_0)
+  LDRHGT       r14, [r4], #2      @ r14 = *x++
+  SMLABT       r7, r12, r10, r7   @ sum[1] = MAC16_16(sum[1],x,y_1)
+  SMLABB       r8, r12, r11, r8   @ sum[2] = MAC16_16(sum[2],x,y_2)
+  SMLABT       r9, r12, r11, r9   @ sum[3] = MAC16_16(sum[3],x,y_3)
+  BLE xcorr_kernel_edsp_done
+  SMLABT       r6, r14, r10, r6   @ sum[0] = MAC16_16(sum[0],x,y_1)
+  SUBS         r2, r2, #1         @ j--
+  SMLABB       r7, r14, r11, r7   @ sum[1] = MAC16_16(sum[1],x,y_2)
+  LDRH         r10, [r5], #2      @ r10 = y_4 = *y++
+  SMLABT       r8, r14, r11, r8   @ sum[2] = MAC16_16(sum[2],x,y_3)
+  LDRHGT       r12, [r4], #2      @ r12 = *x++
+  SMLABB       r9, r14, r10, r9   @ sum[3] = MAC16_16(sum[3],x,y_4)
+  BLE xcorr_kernel_edsp_done
+  SMLABB       r6, r12, r11, r6   @ sum[0] = MAC16_16(sum[0],tmp,y_2)
+  CMP          r2, #1             @ j-- (flags only: r2 is reused for y_5 below)
+  SMLABT       r7, r12, r11, r7   @ sum[1] = MAC16_16(sum[1],tmp,y_3)
+  LDRH         r2, [r5], #2       @ r2 = y_5 = *y++
+  SMLABB       r8, r12, r10, r8   @ sum[2] = MAC16_16(sum[2],tmp,y_4)
+  LDRHGT       r14, [r4]          @ r14 = *x
+  SMLABB       r9, r12, r2, r9    @ sum[3] = MAC16_16(sum[3],tmp,y_5)
+  BLE xcorr_kernel_edsp_done
+  SMLABT       r6, r14, r11, r6   @ sum[0] = MAC16_16(sum[0],tmp,y_3)
+  LDRH         r11, [r5]          @ r11 = y_6 = *y
+  SMLABB       r7, r14, r10, r7   @ sum[1] = MAC16_16(sum[1],tmp,y_4)
+  SMLABB       r8, r14, r2, r8    @ sum[2] = MAC16_16(sum[2],tmp,y_5)
+  SMLABB       r9, r14, r11, r9   @ sum[3] = MAC16_16(sum[3],tmp,y_6)
+xcorr_kernel_edsp_done:
+  LDMFD        sp!, {r2,r4,r5,pc} @ restore r2, r4, r5 and return
+	.size xcorr_kernel_edsp, .-xcorr_kernel_edsp  @ ENDP
+
+; celt_pitch_xcorr_edsp: @ PROC (the leading ';' is only a statement separator in GNU as, so this label is defined)
+  @ input:
+  @   r0  = opus_val16 *_x (must be 32-bit aligned)
+  @   r1  = opus_val16 *_y (only needs to be 16-bit aligned)
+  @   r2  = opus_val32 *xcorr
+  @   r3  = int         len
+  @ output:
+  @   r0  = maxcorr
+  @ internal usage
+  @   r4  = opus_val16 *x
+  @   r5  = opus_val16 *y
+  @   r6  = opus_val32  sum0
+  @   r7  = opus_val32  sum1
+  @   r8  = opus_val32  sum2
+  @   r9  = opus_val32  sum3
+  @   r1  = int         max_pitch
+  @   r12 = int         j
+  STMFD        sp!, {r4-r11, lr}    @ 9 registers = 36 bytes
+  MOV          r5, r1
+  LDR          r1, [sp, #36]        @ r1 = max_pitch (stack argument, just above the saved registers)
+  MOV          r4, r0
+  TST          r5, #3               @ is y 32-bit aligned?
+  @ maxcorr = 1
+  MOV          r0, #1
+  BEQ          celt_pitch_xcorr_edsp_process1u_done
+@ Compute one sum at the start to make y 32-bit aligned.
+  SUBS         r12, r3, #4
+  @ r14 = sum = 0
+  MOV          r14, #0
+  LDRH         r8, [r5], #2
+  BLE celt_pitch_xcorr_edsp_process1u_loop4_done
+  LDR          r6, [r4], #4
+  MOV          r8, r8, LSL #16      @ move y_0 into the top half, where SMLABT reads it
+celt_pitch_xcorr_edsp_process1u_loop4:
+  LDR          r9, [r5], #4
+  SMLABT       r14, r6, r8, r14     @ sum = MAC16_16(sum, x_0, y_0)
+  LDR          r7, [r4], #4
+  SMLATB       r14, r6, r9, r14     @ sum = MAC16_16(sum, x_1, y_1)
+  LDR          r8, [r5], #4
+  SMLABT       r14, r7, r9, r14     @ sum = MAC16_16(sum, x_2, y_2)
+  SUBS         r12, r12, #4         @ j-=4
+  SMLATB       r14, r7, r8, r14     @ sum = MAC16_16(sum, x_3, y_3)
+  LDRGT        r6, [r4], #4
+  BGT celt_pitch_xcorr_edsp_process1u_loop4
+  MOV          r8, r8, LSR #16      @ the next unused y is in the top half; move it down for SMLABB
+celt_pitch_xcorr_edsp_process1u_loop4_done:
+  ADDS         r12, r12, #4         @ j += 4
+celt_pitch_xcorr_edsp_process1u_loop1:
+  LDRHGE       r6, [r4], #2
+  @ Stall
+  SMLABBGE     r14, r6, r8, r14    @ sum = MAC16_16(sum, *x, *y)
+  SUBSGE       r12, r12, #1
+  LDRHGT       r8, [r5], #2
+  BGT celt_pitch_xcorr_edsp_process1u_loop1
+  @ Restore _x
+  SUB          r4, r4, r3, LSL #1
+  @ Restore and advance _y
+  SUB          r5, r5, r3, LSL #1
+  @ maxcorr = max(maxcorr, sum)
+  CMP          r0, r14
+  ADD          r5, r5, #2
+  MOVLT        r0, r14
+  SUBS         r1, r1, #1           @ max_pitch--
+  @ xcorr[i] = sum
+  STR          r14, [r2], #4
+  BLE celt_pitch_xcorr_edsp_done
+celt_pitch_xcorr_edsp_process1u_done:
+  @ if (max_pitch < 4) goto celt_pitch_xcorr_edsp_process2
+  SUBS         r1, r1, #4
+  BLT celt_pitch_xcorr_edsp_process2
+celt_pitch_xcorr_edsp_process4:
+  @ xcorr_kernel_edsp parameters:
+  @ r3 = len, r4 = _x, r5 = _y, r6...r9 = sum[4] = {0, 0, 0, 0}
+  MOV          r6, #0
+  MOV          r7, #0
+  MOV          r8, #0
+  MOV          r9, #0
+  BL xcorr_kernel_edsp_start  @ xcorr_kernel_edsp(_x, _y+i, xcorr+i, len)
+  @ maxcorr = max(maxcorr, sum0, sum1, sum2, sum3)
+  CMP          r0, r6
+  @ _y+=4
+  ADD          r5, r5, #8
+  MOVLT        r0, r6
+  CMP          r0, r7
+  MOVLT        r0, r7
+  CMP          r0, r8
+  MOVLT        r0, r8
+  CMP          r0, r9
+  MOVLT        r0, r9
+  STMIA        r2!, {r6-r9}         @ store the 4 sums to xcorr, advancing the pointer
+  SUBS         r1, r1, #4           @ max_pitch -= 4
+  BGE celt_pitch_xcorr_edsp_process4
+celt_pitch_xcorr_edsp_process2:
+  ADDS         r1, r1, #2
+  BLT celt_pitch_xcorr_edsp_process1a
+  SUBS         r12, r3, #4
+  @ {r10, r11} = {sum0, sum1} = {0, 0}
+  MOV          r10, #0
+  MOV          r11, #0
+  LDR          r8, [r5], #4
+  BLE celt_pitch_xcorr_edsp_process2_loop_done
+  LDR          r6, [r4], #4
+  LDR          r9, [r5], #4
+celt_pitch_xcorr_edsp_process2_loop4:
+  SMLABB       r10, r6, r8, r10     @ sum0 = MAC16_16(sum0, x_0, y_0)
+  LDR          r7, [r4], #4
+  SMLABT       r11, r6, r8, r11     @ sum1 = MAC16_16(sum1, x_0, y_1)
+  SUBS         r12, r12, #4         @ j-=4
+  SMLATT       r10, r6, r8, r10     @ sum0 = MAC16_16(sum0, x_1, y_1)
+  LDR          r8, [r5], #4
+  SMLATB       r11, r6, r9, r11     @ sum1 = MAC16_16(sum1, x_1, y_2)
+  LDRGT        r6, [r4], #4
+  SMLABB       r10, r7, r9, r10     @ sum0 = MAC16_16(sum0, x_2, y_2)
+  SMLABT       r11, r7, r9, r11     @ sum1 = MAC16_16(sum1, x_2, y_3)
+  SMLATT       r10, r7, r9, r10     @ sum0 = MAC16_16(sum0, x_3, y_3)
+  LDRGT        r9, [r5], #4
+  SMLATB       r11, r7, r8, r11     @ sum1 = MAC16_16(sum1, x_3, y_4)
+  BGT celt_pitch_xcorr_edsp_process2_loop4
+celt_pitch_xcorr_edsp_process2_loop_done:
+  ADDS         r12, r12, #2
+  BLE  celt_pitch_xcorr_edsp_process2_1
+  LDR          r6, [r4], #4
+  @ Stall
+  SMLABB       r10, r6, r8, r10     @ sum0 = MAC16_16(sum0, x_0, y_0)
+  LDR          r9, [r5], #4
+  SMLABT       r11, r6, r8, r11     @ sum1 = MAC16_16(sum1, x_0, y_1)
+  SUB          r12, r12, #2
+  SMLATT       r10, r6, r8, r10     @ sum0 = MAC16_16(sum0, x_1, y_1)
+  MOV          r8, r9
+  SMLATB       r11, r6, r9, r11     @ sum1 = MAC16_16(sum1, x_1, y_2)
+celt_pitch_xcorr_edsp_process2_1:
+  LDRH         r6, [r4], #2
+  ADDS         r12, r12, #1
+  @ Stall
+  SMLABB       r10, r6, r8, r10     @ sum0 = MAC16_16(sum0, x_0, y_0)
+  LDRHGT       r7, [r4], #2
+  SMLABT       r11, r6, r8, r11     @ sum1 = MAC16_16(sum1, x_0, y_1)
+  BLE celt_pitch_xcorr_edsp_process2_done
+  LDRH         r9, [r5], #2
+  SMLABT       r10, r7, r8, r10     @ sum0 = MAC16_16(sum0, x_0, y_1)
+  SMLABB       r11, r7, r9, r11     @ sum1 = MAC16_16(sum1, x_0, y_2)
+celt_pitch_xcorr_edsp_process2_done:
+  @ Restore _x
+  SUB          r4, r4, r3, LSL #1
+  @ Restore and advance _y
+  SUB          r5, r5, r3, LSL #1
+  @ maxcorr = max(maxcorr, sum0)
+  CMP          r0, r10
+  ADD          r5, r5, #2
+  MOVLT        r0, r10
+  SUB          r1, r1, #2
+  @ maxcorr = max(maxcorr, sum1)
+  CMP          r0, r11
+  @ xcorr[i] = sum
+  STR          r10, [r2], #4
+  MOVLT        r0, r11
+  STR          r11, [r2], #4
+celt_pitch_xcorr_edsp_process1a:
+  ADDS         r1, r1, #1
+  BLT celt_pitch_xcorr_edsp_done
+  SUBS         r12, r3, #4
+  @ r14 = sum = 0
+  MOV          r14, #0
+  BLT celt_pitch_xcorr_edsp_process1a_loop_done
+  LDR          r6, [r4], #4
+  LDR          r8, [r5], #4
+  LDR          r7, [r4], #4
+  LDR          r9, [r5], #4
+celt_pitch_xcorr_edsp_process1a_loop4:
+  SMLABB       r14, r6, r8, r14     @ sum = MAC16_16(sum, x_0, y_0)
+  SUBS         r12, r12, #4         @ j-=4
+  SMLATT       r14, r6, r8, r14     @ sum = MAC16_16(sum, x_1, y_1)
+  LDRGE        r6, [r4], #4
+  SMLABB       r14, r7, r9, r14     @ sum = MAC16_16(sum, x_2, y_2)
+  LDRGE        r8, [r5], #4
+  SMLATT       r14, r7, r9, r14     @ sum = MAC16_16(sum, x_3, y_3)
+  LDRGE        r7, [r4], #4
+  LDRGE        r9, [r5], #4
+  BGE celt_pitch_xcorr_edsp_process1a_loop4
+celt_pitch_xcorr_edsp_process1a_loop_done:
+  ADDS         r12, r12, #2
+  LDRGE        r6, [r4], #4
+  LDRGE        r8, [r5], #4
+  @ Stall
+  SMLABBGE     r14, r6, r8, r14     @ sum = MAC16_16(sum, x_0, y_0)
+  SUBGE        r12, r12, #2
+  SMLATTGE     r14, r6, r8, r14     @ sum = MAC16_16(sum, x_1, y_1)
+  ADDS         r12, r12, #1
+  LDRHGE       r6, [r4], #2
+  LDRHGE       r8, [r5], #2
+  @ Stall
+  SMLABBGE     r14, r6, r8, r14     @ sum = MAC16_16(sum, *x, *y)
+  @ maxcorr = max(maxcorr, sum)
+  CMP          r0, r14
+  @ xcorr[i] = sum
+  STR          r14, [r2], #4
+  MOVLT        r0, r14
+celt_pitch_xcorr_edsp_done:
+  LDMFD        sp!, {r4-r11, pc}    @ restore registers and return
+	.size celt_pitch_xcorr_edsp, .-celt_pitch_xcorr_edsp  @ ENDP
+
+ .endif
+
+@ END:
+    .section	.note.GNU-stack,"",%progbits
diff --git a/celt/arm/fft_arm.h b/celt/arm/fft_arm.h
new file mode 100644
index 0000000..0cb55d8
--- /dev/null
+++ b/celt/arm/fft_arm.h
@@ -0,0 +1,72 @@
+/* Copyright (c) 2015 Xiph.Org Foundation
+   Written by Viswanath Puttagunta */
+/**
+   @file fft_arm.h
+   @brief ARM Neon Intrinsic optimizations for fft using NE10 library
+ */
+
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+
+#if !defined(FFT_ARM_H)
+#define FFT_ARM_H
+
+#include "config.h"
+#include "kiss_fft.h"
+
+#if defined(HAVE_ARM_NE10) /* NEON FFT entry points (NE10-based, per the file header) */
+
+int opus_fft_alloc_arm_neon(kiss_fft_state *st);
+void opus_fft_free_arm_neon(kiss_fft_state *st);
+
+void opus_fft_neon(const kiss_fft_state *st,
+                   const kiss_fft_cpx *fin,
+                   kiss_fft_cpx *fout);
+
+void opus_ifft_neon(const kiss_fft_state *st,
+                    const kiss_fft_cpx *fin,
+                    kiss_fft_cpx *fout);
+
+#if !defined(OPUS_HAVE_RTCD) /* without RTCD, map the generic names directly onto the NEON functions */
+#define OVERRIDE_OPUS_FFT (1)
+
+#define opus_fft_alloc_arch(_st, arch) \
+   ((void)(arch), opus_fft_alloc_arm_neon(_st)) /* arch is evaluated and discarded */
+
+#define opus_fft_free_arch(_st, arch) \
+   ((void)(arch), opus_fft_free_arm_neon(_st))
+
+#define opus_fft(_st, _fin, _fout, arch) \
+   ((void)(arch), opus_fft_neon(_st, _fin, _fout))
+
+#define opus_ifft(_st, _fin, _fout, arch) \
+   ((void)(arch), opus_ifft_neon(_st, _fin, _fout))
+
+#endif /* OPUS_HAVE_RTCD */
+
+#endif /* HAVE_ARM_NE10 */
+
+#endif
diff --git a/celt/arm/fixed_armv4.h b/celt/arm/fixed_armv4.h
index b690bc8..efb3b18 100644
--- a/celt/arm/fixed_armv4.h
+++ b/celt/arm/fixed_armv4.h
@@ -68,6 +68,10 @@
 #undef MAC16_32_Q15
 #define MAC16_32_Q15(c, a, b) ADD32(c, MULT16_32_Q15(a, b))
 
+/** 16x32 multiply, followed by a 16-bit shift right and 32-bit add,
+    i.e. c + MULT16_32_Q16(a, b). Result fits in 32 bits. */
+#undef MAC16_32_Q16
+#define MAC16_32_Q16(c, a, b) ADD32(c, MULT16_32_Q16(a, b))
 
 /** 32x32 multiplication, followed by a 31-bit shift right. Results fits in 32 bits */
 #undef MULT32_32_Q31
diff --git a/celt/arm/fixed_armv5e.h b/celt/arm/fixed_armv5e.h
index 1194a7d..36a6321 100644
--- a/celt/arm/fixed_armv5e.h
+++ b/celt/arm/fixed_armv5e.h
@@ -82,6 +82,23 @@
 }
 #define MAC16_32_Q15(c, a, b) (MAC16_32_Q15_armv5e(c, a, b))
 
+/** 16x32 multiply, followed by a 16-bit shift right and 32-bit add.
+    Result fits in 32 bits. Implemented with a single SMLAWB instruction. */
+#undef MAC16_32_Q16
+static OPUS_INLINE opus_val32 MAC16_32_Q16_armv5e(opus_val32 c, opus_val16 a,
+ opus_val32 b)
+{
+  int res;
+  __asm__(
+      "#MAC16_32_Q16\n\t"
+      "smlawb %0, %1, %2, %3;\n" /* %0 = %3 + ((%1 * bottom 16 bits of %2) >> 16) */
+      : "=r"(res)
+      : "r"(b), "r"(a), "r"(c) /* %1 = b (32-bit), %2 = a (16-bit), %3 = c (accumulator) */
+  );
+  return res;
+}
+#define MAC16_32_Q16(c, a, b) (MAC16_32_Q16_armv5e(c, a, b))
+
 /** 16x16 multiply-add where the result fits in 32 bits */
 #undef MAC16_16
 static OPUS_INLINE opus_val32 MAC16_16_armv5e(opus_val32 c, opus_val16 a,
@@ -113,4 +130,22 @@
 }
 #define MULT16_16(a, b) (MULT16_16_armv5e(a, b))
 
+#ifdef OPUS_ARM_INLINE_MEDIA
+
+#undef SIG2WORD16 /* replace the generic definition with the SSAT-based one below */
+static OPUS_INLINE opus_val16 SIG2WORD16_armv6(opus_val32 x)
+{
+   celt_sig res;
+   __asm__(
+       "#SIG2WORD16\n\t"
+       "ssat %0, #16, %1, ASR #12\n\t" /* shift right by 12, then saturate to signed 16 bits */
+       : "=r"(res)
+       : "r"(x+2048) /* 2048 = half of 1<<12, so the shift rounds to nearest */
+   );
+   return EXTRACT16(res);
+}
+#define SIG2WORD16(x) (SIG2WORD16_armv6(x))
+
+#endif /* OPUS_ARM_INLINE_MEDIA */
+
 #endif
diff --git a/celt/arm/mdct_arm.h b/celt/arm/mdct_arm.h
new file mode 100644
index 0000000..49cbb44
--- /dev/null
+++ b/celt/arm/mdct_arm.h
@@ -0,0 +1,60 @@
+/* Copyright (c) 2015 Xiph.Org Foundation
+   Written by Viswanath Puttagunta */
+/**
+   @file mdct_arm.h
+   @brief ARM Neon Intrinsic optimizations for mdct using NE10 library
+ */
+
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#if !defined(MDCT_ARM_H)
+#define MDCT_ARM_H
+
+#include "config.h"
+#include "mdct.h"
+
+#if defined(HAVE_ARM_NE10)
+/** Compute a forward MDCT and scale by 4/N, trashes the input array */
+void clt_mdct_forward_neon(const mdct_lookup *l, kiss_fft_scalar *in,
+                           kiss_fft_scalar * OPUS_RESTRICT out,
+                           const opus_val16 *window, int overlap,
+                           int shift, int stride, int arch);
+
+void clt_mdct_backward_neon(const mdct_lookup *l, kiss_fft_scalar *in,
+                            kiss_fft_scalar * OPUS_RESTRICT out,
+                            const opus_val16 *window, int overlap,
+                            int shift, int stride, int arch); /* presumably the inverse MDCT; same argument list as the forward version */
+
+#if !defined(OPUS_HAVE_RTCD) /* without RTCD, map the generic names directly onto the NEON functions */
+#define OVERRIDE_OPUS_MDCT (1)
+#define clt_mdct_forward(_l, _in, _out, _window, _int, _shift, _stride, _arch) \
+      clt_mdct_forward_neon(_l, _in, _out, _window, _int, _shift, _stride, _arch) /* _int is passed in the overlap position */
+#define clt_mdct_backward(_l, _in, _out, _window, _int, _shift, _stride, _arch) \
+      clt_mdct_backward_neon(_l, _in, _out, _window, _int, _shift, _stride, _arch)
+#endif /* OPUS_HAVE_RTCD */
+#endif /* HAVE_ARM_NE10 */
+
+#endif
diff --git a/celt/arm/pitch_arm.h b/celt/arm/pitch_arm.h
index a07f8ac..8626ed7 100644
--- a/celt/arm/pitch_arm.h
+++ b/celt/arm/pitch_arm.h
@@ -52,6 +52,17 @@
   ((void)(arch),PRESUME_NEON(celt_pitch_xcorr)(_x, _y, xcorr, len, max_pitch))
 #  endif
 
-# endif
+#else /* Start !FIXED_POINT */
+/* Float case: NEON-intrinsics pitch cross-correlation */
+#if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
+void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y,
+                                 opus_val32 *xcorr, int len, int max_pitch);
+#if !defined(OPUS_HAVE_RTCD) || defined(OPUS_ARM_PRESUME_NEON_INTR)
+#define OVERRIDE_PITCH_XCORR (1)
+#   define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
+   ((void)(arch),celt_pitch_xcorr_float_neon(_x, _y, xcorr, len, max_pitch))
+#endif /* !OPUS_HAVE_RTCD || OPUS_ARM_PRESUME_NEON_INTR */
+#endif /* OPUS_ARM_MAY_HAVE_NEON_INTR */
 
+#endif /* end !FIXED_POINT */
 #endif
diff --git a/celt/bands.c b/celt/bands.c
index cce56e2..25f229e 100644
--- a/celt/bands.c
+++ b/celt/bands.c
@@ -92,11 +92,11 @@
 
 #ifdef FIXED_POINT
 /* Compute the amplitude (sqrt energy) in each of the bands */
-void compute_band_energies(const CELTMode *m, const celt_sig *X, celt_ener *bandE, int end, int C, int M)
+void compute_band_energies(const CELTMode *m, const celt_sig *X, celt_ener *bandE, int end, int C, int LM)
 {
    int i, c, N;
    const opus_int16 *eBands = m->eBands;
-   N = M*m->shortMdctSize;
+   N = m->shortMdctSize<<LM;
    c=0; do {
       for (i=0;i<end;i++)
       {
@@ -104,18 +104,23 @@
          opus_val32 maxval=0;
          opus_val32 sum = 0;
 
-         j=M*eBands[i]; do {
-            maxval = MAX32(maxval, X[j+c*N]);
-            maxval = MAX32(maxval, -X[j+c*N]);
-         } while (++j<M*eBands[i+1]);
-
+         maxval = celt_maxabs32(&X[c*N+(eBands[i]<<LM)], (eBands[i+1]-eBands[i])<<LM);
          if (maxval > 0)
          {
-            int shift = celt_ilog2(maxval)-10;
-            j=M*eBands[i]; do {
-               sum = MAC16_16(sum, EXTRACT16(VSHR32(X[j+c*N],shift)),
-                                   EXTRACT16(VSHR32(X[j+c*N],shift)));
-            } while (++j<M*eBands[i+1]);
+            int shift = celt_ilog2(maxval) - 14 + (((m->logN[i]>>BITRES)+LM+1)>>1);
+            j=eBands[i]<<LM;
+            if (shift>0)
+            {
+               do {
+                  sum = MAC16_16(sum, EXTRACT16(SHR32(X[j+c*N],shift)),
+                        EXTRACT16(SHR32(X[j+c*N],shift)));
+               } while (++j<eBands[i+1]<<LM);
+            } else {
+               do {
+                  sum = MAC16_16(sum, EXTRACT16(SHL32(X[j+c*N],-shift)),
+                        EXTRACT16(SHL32(X[j+c*N],-shift)));
+               } while (++j<eBands[i+1]<<LM);
+            }
             /* We're adding one here to ensure the normalized band isn't larger than unity norm */
             bandE[i+c*m->nbEBands] = EPSILON+VSHR32(EXTEND32(celt_sqrt(sum)),-shift);
          } else {
@@ -150,18 +155,16 @@
 
 #else /* FIXED_POINT */
 /* Compute the amplitude (sqrt energy) in each of the bands */
-void compute_band_energies(const CELTMode *m, const celt_sig *X, celt_ener *bandE, int end, int C, int M)
+void compute_band_energies(const CELTMode *m, const celt_sig *X, celt_ener *bandE, int end, int C, int LM)
 {
    int i, c, N;
    const opus_int16 *eBands = m->eBands;
-   N = M*m->shortMdctSize;
+   N = m->shortMdctSize<<LM;
    c=0; do {
       for (i=0;i<end;i++)
       {
-         int j;
-         opus_val32 sum = 1e-27f;
-         for (j=M*eBands[i];j<M*eBands[i+1];j++)
-            sum += X[j+c*N]*X[j+c*N];
+         opus_val32 sum;
+         sum = 1e-27f + celt_inner_prod_c(&X[c*N+(eBands[i]<<LM)], &X[c*N+(eBands[i]<<LM)], (eBands[i+1]-eBands[i])<<LM);
          bandE[i+c*m->nbEBands] = celt_sqrt(sum);
          /*printf ("%f ", bandE[i+c*m->nbEBands]);*/
       }
@@ -190,74 +193,80 @@
 
 /* De-normalise the energy to produce the synthesis from the unit-energy bands */
 void denormalise_bands(const CELTMode *m, const celt_norm * OPUS_RESTRICT X,
-      celt_sig * OPUS_RESTRICT freq, const opus_val16 *bandLogE, int start, int end, int C, int M)
+      celt_sig * OPUS_RESTRICT freq, const opus_val16 *bandLogE, int start,
+      int end, int M, int downsample, int silence)
 {
-   int i, c, N;
+   int i, N;
+   int bound;
+   celt_sig * OPUS_RESTRICT f;
+   const celt_norm * OPUS_RESTRICT x;
    const opus_int16 *eBands = m->eBands;
    N = M*m->shortMdctSize;
-   celt_assert2(C<=2, "denormalise_bands() not implemented for >2 channels");
-   c=0; do {
-      celt_sig * OPUS_RESTRICT f;
-      const celt_norm * OPUS_RESTRICT x;
-      f = freq+c*N;
-      x = X+c*N+M*eBands[start];
-      for (i=0;i<M*eBands[start];i++)
-         *f++ = 0;
-      for (i=start;i<end;i++)
-      {
-         int j, band_end;
-         opus_val16 g;
-         opus_val16 lg;
+   bound = M*eBands[end];
+   if (downsample!=1)
+      bound = IMIN(bound, N/downsample);
+   if (silence)
+   {
+      bound = 0;
+      start = end = 0;
+   }
+   f = freq;
+   x = X+M*eBands[start];
+   for (i=0;i<M*eBands[start];i++)
+      *f++ = 0;
+   for (i=start;i<end;i++)
+   {
+      int j, band_end;
+      opus_val16 g;
+      opus_val16 lg;
 #ifdef FIXED_POINT
-         int shift;
+      int shift;
 #endif
-         j=M*eBands[i];
-         band_end = M*eBands[i+1];
-         lg = ADD16(bandLogE[i+c*m->nbEBands], SHL16((opus_val16)eMeans[i],6));
+      j=M*eBands[i];
+      band_end = M*eBands[i+1];
+      lg = ADD16(bandLogE[i], SHL16((opus_val16)eMeans[i],6));
 #ifndef FIXED_POINT
-         g = celt_exp2(lg);
+      g = celt_exp2(lg);
 #else
-         /* Handle the integer part of the log energy */
-         shift = 16-(lg>>DB_SHIFT);
-         if (shift>31)
-         {
-            shift=0;
-            g=0;
-         } else {
-            /* Handle the fractional part. */
-            g = celt_exp2_frac(lg&((1<<DB_SHIFT)-1));
-         }
-         /* Handle extreme gains with negative shift. */
-         if (shift<0)
-         {
-            /* For shift < -2 we'd be likely to overflow, so we're capping
+      /* Handle the integer part of the log energy */
+      shift = 16-(lg>>DB_SHIFT);
+      if (shift>31)
+      {
+         shift=0;
+         g=0;
+      } else {
+         /* Handle the fractional part. */
+         g = celt_exp2_frac(lg&((1<<DB_SHIFT)-1));
+      }
+      /* Handle extreme gains with negative shift. */
+      if (shift<0)
+      {
+         /* For shift < -2 we'd be likely to overflow, so we're capping
                the gain here. This shouldn't happen unless the bitstream is
                already corrupted. */
-            if (shift < -2)
-            {
-               g = 32767;
-               shift = -2;
-            }
-            do {
-               *f++ = SHL32(MULT16_16(*x++, g), -shift);
-            } while (++j<band_end);
-         } else
+         if (shift < -2)
+         {
+            g = 32767;
+            shift = -2;
+         }
+         do {
+            *f++ = SHL32(MULT16_16(*x++, g), -shift);
+         } while (++j<band_end);
+      } else
 #endif
          /* Be careful of the fixed-point "else" just above when changing this code */
          do {
             *f++ = SHR32(MULT16_16(*x++, g), shift);
          } while (++j<band_end);
-      }
-      celt_assert(start <= end);
-      for (i=M*eBands[end];i<N;i++)
-         *f++ = 0;
-   } while (++c<C);
+   }
+   celt_assert(start <= end);
+   OPUS_CLEAR(&freq[bound], N-bound);
 }
 
 /* This prevents energy collapse for transients with multiple short MDCTs */
 void anti_collapse(const CELTMode *m, celt_norm *X_, unsigned char *collapse_masks, int LM, int C, int size,
-      int start, int end, opus_val16 *logE, opus_val16 *prev1logE,
-      opus_val16 *prev2logE, int *pulses, opus_uint32 seed)
+      int start, int end, const opus_val16 *logE, const opus_val16 *prev1logE,
+      const opus_val16 *prev2logE, const int *pulses, opus_uint32 seed, int arch)
 {
    int c, i, j, k;
    for (i=start;i<end;i++)
@@ -272,7 +281,8 @@
 
       N0 = m->eBands[i+1]-m->eBands[i];
       /* depth in 1/8 bits */
-      depth = (1+pulses[i])/((m->eBands[i+1]-m->eBands[i])<<LM);
+      celt_assert(pulses[i]>=0);
+      depth = celt_udiv(1+pulses[i], (m->eBands[i+1]-m->eBands[i]))>>LM;
 
 #ifdef FIXED_POINT
       thresh32 = SHR32(celt_exp2(-SHL16(depth, 10-BITRES)),1);
@@ -345,12 +355,12 @@
          }
          /* We just added some energy, so we need to renormalise */
          if (renormalize)
-            renormalise_vector(X, N0<<LM, Q15ONE);
+            renormalise_vector(X, N0<<LM, Q15ONE, arch);
       } while (++c<C);
    }
 }
 
-static void intensity_stereo(const CELTMode *m, celt_norm *X, celt_norm *Y, const celt_ener *bandE, int bandID, int N)
+static void intensity_stereo(const CELTMode *m, celt_norm * OPUS_RESTRICT X, const celt_norm * OPUS_RESTRICT Y, const celt_ener *bandE, int bandID, int N)
 {
    int i = bandID;
    int j;
@@ -370,25 +380,25 @@
       celt_norm r, l;
       l = X[j];
       r = Y[j];
-      X[j] = MULT16_16_Q14(a1,l) + MULT16_16_Q14(a2,r);
+      X[j] = EXTRACT16(SHR32(MAC16_16(MULT16_16(a1, l), a2, r), 14));
       /* Side is not encoded, no need to calculate */
    }
 }
 
-static void stereo_split(celt_norm *X, celt_norm *Y, int N)
+static void stereo_split(celt_norm * OPUS_RESTRICT X, celt_norm * OPUS_RESTRICT Y, int N)
 {
    int j;
    for (j=0;j<N;j++)
    {
-      celt_norm r, l;
-      l = MULT16_16_Q15(QCONST16(.70710678f,15), X[j]);
-      r = MULT16_16_Q15(QCONST16(.70710678f,15), Y[j]);
-      X[j] = l+r;
-      Y[j] = r-l;
+      opus_val32 r, l;
+      l = MULT16_16(QCONST16(.70710678f, 15), X[j]);
+      r = MULT16_16(QCONST16(.70710678f, 15), Y[j]);
+      X[j] = EXTRACT16(SHR32(ADD32(l, r), 15));
+      Y[j] = EXTRACT16(SHR32(SUB32(r, l), 15));
    }
 }
 
-static void stereo_merge(celt_norm *X, celt_norm *Y, opus_val16 mid, int N)
+static void stereo_merge(celt_norm * OPUS_RESTRICT X, celt_norm * OPUS_RESTRICT Y, opus_val16 mid, int N, int arch)
 {
    int j;
    opus_val32 xp=0, side=0;
@@ -400,7 +410,7 @@
    opus_val32 t, lgain, rgain;
 
    /* Compute the norm of X+Y and X-Y as |X|^2 + |Y|^2 +/- sum(xy) */
-   dual_inner_prod(Y, X, Y, N, &xp, &side);
+   dual_inner_prod(Y, X, Y, N, &xp, &side, arch);
    /* Compensating for the mid normalization */
    xp = MULT16_32_Q15(mid, xp);
    /* mid and side are in Q15, not Q14 like X and Y */
@@ -409,8 +419,7 @@
    Er = MULT16_16(mid2, mid2) + side + 2*xp;
    if (Er < QCONST32(6e-4f, 28) || El < QCONST32(6e-4f, 28))
    {
-      for (j=0;j<N;j++)
-         Y[j] = X[j];
+      OPUS_COPY(Y, X, N);
       return;
    }
 
@@ -434,7 +443,7 @@
    {
       celt_norm r, l;
       /* Apply mid scaling (side is already scaled) */
-      l = MULT16_16_Q15(mid, X[j]);
+      l = MULT16_16_P15(mid, X[j]);
       r = Y[j];
       X[j] = EXTRACT16(PSHR32(MULT16_16(lgain, SUB16(l,r)), kl+1));
       Y[j] = EXTRACT16(PSHR32(MULT16_16(rgain, ADD16(l,r)), kr+1));
@@ -442,7 +451,7 @@
 }
 
 /* Decide whether we should spread the pulses in the current frame */
-int spreading_decision(const CELTMode *m, celt_norm *X, int *average,
+int spreading_decision(const CELTMode *m, const celt_norm *X, int *average,
       int last_decision, int *hf_average, int *tapset_decision, int update_hf,
       int end, int C, int M)
 {
@@ -463,7 +472,7 @@
       {
          int j, N, tmp=0;
          int tcount[3] = {0,0,0};
-         celt_norm * OPUS_RESTRICT x = X+M*eBands[i]+c*N0;
+         const celt_norm * OPUS_RESTRICT x = X+M*eBands[i]+c*N0;
          N = M*(eBands[i+1]-eBands[i]);
          if (N<=8)
             continue;
@@ -483,7 +492,7 @@
 
          /* Only include four last bands (8 kHz and up) */
          if (i>m->nbEBands-4)
-            hf_sum += 32*(tcount[1]+tcount[0])/N;
+            hf_sum += celt_udiv(32*(tcount[1]+tcount[0]), N);
          tmp = (2*tcount[2] >= N) + (2*tcount[1] >= N) + (2*tcount[0] >= N);
          sum += tmp*256;
          nbBands++;
@@ -493,7 +502,7 @@
    if (update_hf)
    {
       if (hf_sum)
-         hf_sum /= C*(4-m->nbEBands+end);
+         hf_sum = celt_udiv(hf_sum, C*(4-m->nbEBands+end));
       *hf_average = (*hf_average+hf_sum)>>1;
       hf_sum = *hf_average;
       if (*tapset_decision==2)
@@ -509,7 +518,8 @@
    }
    /*printf("%d %d %d\n", hf_sum, *hf_average, *tapset_decision);*/
    celt_assert(nbBands>0); /* end has to be non-zero */
-   sum /= nbBands;
+   celt_assert(sum>=0);
+   sum = celt_udiv(sum, nbBands);
    /* Recursive averaging */
    sum = (sum+*average)>>1;
    *average = sum;
@@ -567,8 +577,7 @@
          for (j=0;j<N0;j++)
             tmp[i*N0+j] = X[j*stride+i];
    }
-   for (j=0;j<N;j++)
-      X[j] = tmp[j];
+   OPUS_COPY(X, tmp, N);
    RESTORE_STACK;
 }
 
@@ -591,8 +600,7 @@
          for (j=0;j<N0;j++)
             tmp[j*stride+i] = X[i*N0+j];
    }
-   for (j=0;j<N;j++)
-      X[j] = tmp[j];
+   OPUS_COPY(X, tmp, N);
    RESTORE_STACK;
 }
 
@@ -603,11 +611,11 @@
    for (i=0;i<stride;i++)
       for (j=0;j<N0;j++)
       {
-         celt_norm tmp1, tmp2;
-         tmp1 = MULT16_16_Q15(QCONST16(.70710678f,15), X[stride*2*j+i]);
-         tmp2 = MULT16_16_Q15(QCONST16(.70710678f,15), X[stride*(2*j+1)+i]);
-         X[stride*2*j+i] = tmp1 + tmp2;
-         X[stride*(2*j+1)+i] = tmp1 - tmp2;
+         opus_val32 tmp1, tmp2;
+         tmp1 = MULT16_16(QCONST16(.70710678f,15), X[stride*2*j+i]);
+         tmp2 = MULT16_16(QCONST16(.70710678f,15), X[stride*(2*j+1)+i]);
+         X[stride*2*j+i] = EXTRACT16(PSHR32(ADD32(tmp1, tmp2), 15));
+         X[stride*(2*j+1)+i] = EXTRACT16(PSHR32(SUB32(tmp1, tmp2), 15));
       }
 }
 
@@ -622,7 +630,8 @@
    /* The upper limit ensures that in a stereo split with itheta==16384, we'll
        always have enough bits left over to code at least one pulse in the
        side; otherwise it would collapse, since it doesn't get folded. */
-   qb = IMIN(b-pulse_cap-(4<<BITRES), (b+N2*offset)/N2);
+   qb = celt_sudiv(b+N2*offset, N2);
+   qb = IMIN(b-pulse_cap-(4<<BITRES), qb);
 
    qb = IMIN(8<<BITRES, qb);
 
@@ -647,6 +656,7 @@
    opus_int32 remaining_bits;
    const celt_ener *bandE;
    opus_uint32 seed;
+   int arch;
 };
 
 struct split_ctx {
@@ -698,7 +708,7 @@
          side and mid. With just that parameter, we can re-scale both
          mid and side because we know that 1) they have unit norm and
          2) they are orthogonal. */
-      itheta = stereo_itheta(X, Y, stereo, N);
+      itheta = stereo_itheta(X, Y, stereo, N, ctx->arch);
    }
    tell = ec_tell_frac(ec);
    if (qn!=1)
@@ -769,7 +779,8 @@
             ec_dec_update(ec, fl, fl+fs, ft);
          }
       }
-      itheta = (opus_int32)itheta*16384/qn;
+      celt_assert(itheta>=0);
+      itheta = celt_udiv((opus_int32)itheta*16384, qn);
       if (encode && stereo)
       {
          if (itheta==0)
@@ -1021,8 +1032,7 @@
             fill &= cm_mask;
             if (!fill)
             {
-               for (j=0;j<N;j++)
-                  X[j] = 0;
+               OPUS_CLEAR(X, N);
             } else {
                if (lowband == NULL)
                {
@@ -1046,7 +1056,7 @@
                   }
                   cm = fill;
                }
-               renormalise_vector(X, N, gain);
+               renormalise_vector(X, N, gain, ctx->arch);
             }
          }
       }
@@ -1084,7 +1094,7 @@
 
    longBlocks = B0==1;
 
-   N_B /= B;
+   N_B = celt_udiv(N_B, B);
 
    /* Special case for one sample */
    if (N==1)
@@ -1098,9 +1108,7 @@
 
    if (lowband_scratch && lowband && (recombine || ((N_B&1) == 0 && tf_change<0) || B0>1))
    {
-      int j;
-      for (j=0;j<N;j++)
-         lowband_scratch[j] = lowband[j];
+      OPUS_COPY(lowband_scratch, lowband, N);
       lowband = lowband_scratch;
    }
 
@@ -1340,7 +1348,7 @@
    if (resynth)
    {
       if (N!=2)
-         stereo_merge(X, Y, mid, N);
+         stereo_merge(X, Y, mid, N, ctx->arch);
       if (inv)
       {
          int j;
@@ -1353,9 +1361,11 @@
 
 
 void quant_all_bands(int encode, const CELTMode *m, int start, int end,
-      celt_norm *X_, celt_norm *Y_, unsigned char *collapse_masks, const celt_ener *bandE, int *pulses,
-      int shortBlocks, int spread, int dual_stereo, int intensity, int *tf_res,
-      opus_int32 total_bits, opus_int32 balance, ec_ctx *ec, int LM, int codedBands, opus_uint32 *seed)
+      celt_norm *X_, celt_norm *Y_, unsigned char *collapse_masks,
+      const celt_ener *bandE, int *pulses, int shortBlocks, int spread,
+      int dual_stereo, int intensity, int *tf_res, opus_int32 total_bits,
+      opus_int32 balance, ec_ctx *ec, int LM, int codedBands,
+      opus_uint32 *seed, int arch)
 {
    int i;
    opus_int32 remaining_bits;
@@ -1397,6 +1407,7 @@
    ctx.m = m;
    ctx.seed = *seed;
    ctx.spread = spread;
+   ctx.arch = arch;
    for (i=start;i<end;i++)
    {
       opus_int32 tell;
@@ -1428,7 +1439,7 @@
       ctx.remaining_bits = remaining_bits;
       if (i <= codedBands-1)
       {
-         curr_balance = balance / IMIN(3, codedBands-i);
+         curr_balance = celt_sudiv(balance, IMIN(3, codedBands-i));
          b = IMAX(0, IMIN(16383, IMIN(remaining_bits+1,pulses[i]+curr_balance)));
       } else {
          b = 0;
diff --git a/celt/bands.h b/celt/bands.h
index 96ba52a..e8bef4b 100644
--- a/celt/bands.h
+++ b/celt/bands.h
@@ -41,7 +41,7 @@
  * @param X Spectrum
  * @param bandE Square root of the energy for each band (returned)
  */
-void compute_band_energies(const CELTMode *m, const celt_sig *X, celt_ener *bandE, int end, int C, int M);
+void compute_band_energies(const CELTMode *m, const celt_sig *X, celt_ener *bandE, int end, int C, int LM);
 
 /*void compute_noise_energies(const CELTMode *m, const celt_sig *X, const opus_val16 *tonality, celt_ener *bandE);*/
 
@@ -59,14 +59,15 @@
  * @param bandE Square root of the energy for each band
  */
 void denormalise_bands(const CELTMode *m, const celt_norm * OPUS_RESTRICT X,
-      celt_sig * OPUS_RESTRICT freq, const opus_val16 *bandE, int start, int end, int C, int M);
+      celt_sig * OPUS_RESTRICT freq, const opus_val16 *bandE, int start,
+      int end, int M, int downsample, int silence);
 
 #define SPREAD_NONE       (0)
 #define SPREAD_LIGHT      (1)
 #define SPREAD_NORMAL     (2)
 #define SPREAD_AGGRESSIVE (3)
 
-int spreading_decision(const CELTMode *m, celt_norm *X, int *average,
+int spreading_decision(const CELTMode *m, const celt_norm *X, int *average,
       int last_decision, int *hf_average, int *tapset_decision, int update_hf,
       int end, int C, int M);
 
@@ -97,15 +98,20 @@
  * @param LM log2() of the number of 2.5 subframes in the frame
  * @param codedBands Last band to receive bits + 1
  * @param seed Random generator seed
+ * @param arch Run-time architecture (see opus_select_arch())
  */
 void quant_all_bands(int encode, const CELTMode *m, int start, int end,
-      celt_norm * X, celt_norm * Y, unsigned char *collapse_masks, const celt_ener *bandE, int *pulses,
-      int shortBlocks, int spread, int dual_stereo, int intensity, int *tf_res,
-      opus_int32 total_bits, opus_int32 balance, ec_ctx *ec, int M, int codedBands, opus_uint32 *seed);
+      celt_norm * X, celt_norm * Y, unsigned char *collapse_masks,
+      const celt_ener *bandE, int *pulses, int shortBlocks, int spread,
+      int dual_stereo, int intensity, int *tf_res, opus_int32 total_bits,
+      opus_int32 balance, ec_ctx *ec, int M, int codedBands, opus_uint32 *seed,
+      int arch);
 
-void anti_collapse(const CELTMode *m, celt_norm *X_, unsigned char *collapse_masks, int LM, int C, int size,
-      int start, int end, opus_val16 *logE, opus_val16 *prev1logE,
-      opus_val16 *prev2logE, int *pulses, opus_uint32 seed);
+void anti_collapse(const CELTMode *m, celt_norm *X_,
+      unsigned char *collapse_masks, int LM, int C, int size, int start,
+      int end, const opus_val16 *logE, const opus_val16 *prev1logE,
+      const opus_val16 *prev2logE, const int *pulses, opus_uint32 seed,
+      int arch);
 
 opus_uint32 celt_lcg_rand(opus_uint32 seed);
 
diff --git a/celt/celt.c b/celt/celt.c
index 3e0ce6e..b121c51 100644
--- a/celt/celt.c
+++ b/celt/celt.c
@@ -54,6 +54,10 @@
 #define PACKAGE_VERSION "unknown"
 #endif
 
+#if defined(MIPSr1_ASM)
+#include "mips/celt_mipsr1.h"
+#endif
+
 
 int resampling_factor(opus_int32 rate)
 {
@@ -85,8 +89,71 @@
    return ret;
 }
 
-#ifndef OVERRIDE_COMB_FILTER_CONST
-static void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N,
+#if !defined(OVERRIDE_COMB_FILTER_CONST) || defined(NON_STATIC_COMB_FILTER_CONST_C)
+/* This version should be faster on ARM: it computes five outputs per loop pass and rotates the roles of x0..x4 instead of copying taps */
+#ifdef OPUS_ARM_ASM
+#ifndef NON_STATIC_COMB_FILTER_CONST_C
+static
+#endif
+void comb_filter_const_c(opus_val32 *y, opus_val32 *x, int T, int N,
+      opus_val16 g10, opus_val16 g11, opus_val16 g12)
+{
+   opus_val32 x0, x1, x2, x3, x4;
+   int i;
+   x4 = SHL32(x[-T-2], 1); /* preload the four history taps x[-T-2] .. x[-T+1], each shifted left by one bit */
+   x3 = SHL32(x[-T-1], 1);
+   x2 = SHL32(x[-T], 1);
+   x1 = SHL32(x[-T+1], 1);
+   for (i=0;i<N-4;i+=5) /* five outputs per pass; after the fifth step x1..x4 line up again for i+5 */
+   {
+      opus_val32 t;
+      x0=SHL32(x[i-T+2],1); /* newest tap needed for y[i] */
+      t = MAC16_32_Q16(x[i], g10, x2); /* g10 weights the centre tap x[i-T] */
+      t = MAC16_32_Q16(t, g11, ADD32(x1,x3)); /* g11 weights the pair at distance 1 from the centre */
+      t = MAC16_32_Q16(t, g12, ADD32(x0,x4)); /* g12 weights the pair at distance 2 from the centre */
+      y[i] = t;
+      x4=SHL32(x[i-T+3],1); /* x4 is no longer needed: reuse it for the next newest sample */
+      t = MAC16_32_Q16(x[i+1], g10, x1);
+      t = MAC16_32_Q16(t, g11, ADD32(x0,x2));
+      t = MAC16_32_Q16(t, g12, ADD32(x4,x3));
+      y[i+1] = t;
+      x3=SHL32(x[i-T+4],1); /* same rotation, now recycling x3 */
+      t = MAC16_32_Q16(x[i+2], g10, x0);
+      t = MAC16_32_Q16(t, g11, ADD32(x4,x1));
+      t = MAC16_32_Q16(t, g12, ADD32(x3,x2));
+      y[i+2] = t;
+      x2=SHL32(x[i-T+5],1); /* recycling x2 */
+      t = MAC16_32_Q16(x[i+3], g10, x4);
+      t = MAC16_32_Q16(t, g11, ADD32(x3,x0));
+      t = MAC16_32_Q16(t, g12, ADD32(x2,x1));
+      y[i+3] = t;
+      x1=SHL32(x[i-T+6],1); /* recycling x1 */
+      t = MAC16_32_Q16(x[i+4], g10, x3);
+      t = MAC16_32_Q16(t, g11, ADD32(x2,x4));
+      t = MAC16_32_Q16(t, g12, ADD32(x1,x0));
+      y[i+4] = t;
+   }
+#ifdef CUSTOM_MODES
+   for (;i<N;i++) /* leftover samples (up to 4) when N is not a multiple of 5; only compiled with CUSTOM_MODES */
+   {
+      opus_val32 t;
+      x0=SHL32(x[i-T+2],1);
+      t = MAC16_32_Q16(x[i], g10, x2);
+      t = MAC16_32_Q16(t, g11, ADD32(x1,x3));
+      t = MAC16_32_Q16(t, g12, ADD32(x0,x4));
+      y[i] = t;
+      x4=x3; /* slide the tap history along by one sample */
+      x3=x2;
+      x2=x1;
+      x1=x0;
+   }
+#endif
+}
+#else
+#ifndef NON_STATIC_COMB_FILTER_CONST_C
+static
+#endif
+void comb_filter_const_c(opus_val32 *y, opus_val32 *x, int T, int N,
       opus_val16 g10, opus_val16 g11, opus_val16 g12)
 {
    opus_val32 x0, x1, x2, x3, x4;
@@ -110,10 +177,12 @@
 
 }
 #endif
+#endif
 
+#ifndef OVERRIDE_comb_filter
 void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N,
       opus_val16 g0, opus_val16 g1, int tapset0, int tapset1,
-      const opus_val16 *window, int overlap)
+      const opus_val16 *window, int overlap, int arch)
 {
    int i;
    /* printf ("%d %d %f %f\n", T0, T1, g0, g1); */
@@ -131,16 +200,19 @@
          OPUS_MOVE(y, x, N);
       return;
    }
-   g00 = MULT16_16_Q15(g0, gains[tapset0][0]);
-   g01 = MULT16_16_Q15(g0, gains[tapset0][1]);
-   g02 = MULT16_16_Q15(g0, gains[tapset0][2]);
-   g10 = MULT16_16_Q15(g1, gains[tapset1][0]);
-   g11 = MULT16_16_Q15(g1, gains[tapset1][1]);
-   g12 = MULT16_16_Q15(g1, gains[tapset1][2]);
+   g00 = MULT16_16_P15(g0, gains[tapset0][0]);
+   g01 = MULT16_16_P15(g0, gains[tapset0][1]);
+   g02 = MULT16_16_P15(g0, gains[tapset0][2]);
+   g10 = MULT16_16_P15(g1, gains[tapset1][0]);
+   g11 = MULT16_16_P15(g1, gains[tapset1][1]);
+   g12 = MULT16_16_P15(g1, gains[tapset1][2]);
    x1 = x[-T1+1];
    x2 = x[-T1  ];
    x3 = x[-T1-1];
    x4 = x[-T1-2];
+   /* If the filter didn't change, we don't need the overlap */
+   if (g0==g1 && T0==T1 && tapset0==tapset1)
+      overlap=0;
    for (i=0;i<overlap;i++)
    {
       opus_val16 f;
@@ -168,8 +240,9 @@
    }
 
    /* Compute the part with the constant filter. */
-   comb_filter_const(y+i, x+i, T1, N-i, g10, g11, g12);
+   comb_filter_const(y+i, x+i, T1, N-i, g10, g11, g12, arch);
 }
+#endif /* OVERRIDE_comb_filter */
 
 const signed char tf_select_table[4][8] = {
       {0, -1, 0, -1,    0,-1, 0,-1},
@@ -213,6 +286,9 @@
 const char *opus_get_version_string(void)
 {
     return "libopus " PACKAGE_VERSION
+    /* Applications may rely on the presence of this substring in the version
+       string to determine if they have a fixed-point or floating-point build
+       at runtime. */
 #ifdef FIXED_POINT
           "-fixed"
 #endif
diff --git a/celt/celt.h b/celt/celt.h
index 5deea1f..a423b95 100644
--- a/celt/celt.h
+++ b/celt/celt.h
@@ -134,7 +134,8 @@
 
 int celt_decoder_init(CELTDecoder *st, opus_int32 sampling_rate, int channels);
 
-int celt_decode_with_ec(OpusCustomDecoder * OPUS_RESTRICT st, const unsigned char *data, int len, opus_val16 * OPUS_RESTRICT pcm, int frame_size, ec_dec *dec);
+int celt_decode_with_ec(OpusCustomDecoder * OPUS_RESTRICT st, const unsigned char *data,
+      int len, opus_val16 * OPUS_RESTRICT pcm, int frame_size, ec_dec *dec, int accum);
 
 #define celt_encoder_ctl opus_custom_encoder_ctl
 #define celt_decoder_ctl opus_custom_decoder_ctl
@@ -200,15 +201,25 @@
 
 void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N,
       opus_val16 g0, opus_val16 g1, int tapset0, int tapset1,
-      const opus_val16 *window, int overlap);
+      const opus_val16 *window, int overlap, int arch);
+
+#ifdef NON_STATIC_COMB_FILTER_CONST_C
+void comb_filter_const_c(opus_val32 *y, opus_val32 *x, int T, int N,
+                         opus_val16 g10, opus_val16 g11, opus_val16 g12);
+#endif
+
+#ifndef OVERRIDE_COMB_FILTER_CONST
+# define comb_filter_const(y, x, T, N, g10, g11, g12, arch)		\
+    ((void)(arch),comb_filter_const_c(y, x, T, N, g10, g11, g12))
+#endif
 
 void init_caps(const CELTMode *m,int *cap,int LM,int C);
 
 #ifdef RESYNTH
-void deemphasis(celt_sig *in[], opus_val16 *pcm, int N, int C, int downsample, const opus_val16 *coef, celt_sig *mem, celt_sig * OPUS_RESTRICT scratch);
-
-void compute_inv_mdcts(const CELTMode *mode, int shortBlocks, celt_sig *X,
-      celt_sig * OPUS_RESTRICT out_mem[], int C, int LM);
+/* RESYNTH-only prototypes: parameter lists must match the non-static definitions in celt_decoder.c (accum, arch). */
+void deemphasis(celt_sig *in[], opus_val16 *pcm, int N, int C, int downsample, const opus_val16 *coef, celt_sig *mem, int accum);
+void celt_synthesis(const CELTMode *mode, celt_norm *X, celt_sig * out_syn[], opus_val16 *oldBandE, int start,
+      int effEnd, int C, int CC, int isTransient, int LM, int downsample, int silence, int arch);
 #endif
 
 #ifdef __cplusplus
diff --git a/celt/celt_decoder.c b/celt/celt_decoder.c
index 830398e..b688f2a 100644
--- a/celt/celt_decoder.c
+++ b/celt/celt_decoder.c
@@ -51,6 +51,9 @@
 #include "celt_lpc.h"
 #include "vq.h"
 
+#if defined(SMALL_FOOTPRINT) && defined(FIXED_POINT)
+#define NORM_ALIASING_HACK
+#endif
 /**********************************************************************/
 /*                                                                    */
 /*                             DECODER                                */
@@ -175,28 +178,24 @@
 }
 #endif /* CUSTOM_MODES */
 
-static OPUS_INLINE opus_val16 SIG2WORD16(celt_sig x)
-{
-#ifdef FIXED_POINT
-   x = PSHR32(x, SIG_SHIFT);
-   x = MAX32(x, -32768);
-   x = MIN32(x, 32767);
-   return EXTRACT16(x);
-#else
-   return (opus_val16)x;
-#endif
-}
 
 #ifndef RESYNTH
 static
 #endif
-void deemphasis(celt_sig *in[], opus_val16 *pcm, int N, int C, int downsample, const opus_val16 *coef, celt_sig *mem, celt_sig * OPUS_RESTRICT scratch)
+void deemphasis(celt_sig *in[], opus_val16 *pcm, int N, int C, int downsample, const opus_val16 *coef,
+      celt_sig *mem, int accum)
 {
    int c;
    int Nd;
    int apply_downsampling=0;
    opus_val16 coef0;
-
+   VARDECL(celt_sig, scratch);
+   SAVE_STACK;
+#ifndef FIXED_POINT
+   (void)accum;
+   celt_assert(accum==0);
+#endif
+   ALLOC(scratch, N, celt_sig);
    coef0 = coef[0];
    Nd = N/downsample;
    c=0; do {
@@ -234,11 +233,24 @@
          apply_downsampling=1;
       } else {
          /* Shortcut for the standard (non-custom modes) case */
-         for (j=0;j<N;j++)
+#ifdef FIXED_POINT
+         if (accum)
          {
-            celt_sig tmp = x[j] + m + VERY_SMALL;
-            m = MULT16_32_Q15(coef0, tmp);
-            y[j*C] = SCALEOUT(SIG2WORD16(tmp));
+            for (j=0;j<N;j++)
+            {
+               celt_sig tmp = x[j] + m + VERY_SMALL;
+               m = MULT16_32_Q15(coef0, tmp);
+               y[j*C] = SAT16(ADD32(y[j*C], SCALEOUT(SIG2WORD16(tmp))));
+            }
+         } else
+#endif
+         {
+            for (j=0;j<N;j++)
+            {
+               celt_sig tmp = x[j] + m + VERY_SMALL;
+               m = MULT16_32_Q15(coef0, tmp);
+               y[j*C] = SCALEOUT(SIG2WORD16(tmp));
+            }
          }
       }
       mem[c] = m;
@@ -246,41 +258,95 @@
       if (apply_downsampling)
       {
          /* Perform down-sampling */
-         for (j=0;j<Nd;j++)
-            y[j*C] = SCALEOUT(SIG2WORD16(scratch[j*downsample]));
+#ifdef FIXED_POINT
+         if (accum)
+         {
+            for (j=0;j<Nd;j++)
+               y[j*C] = SAT16(ADD32(y[j*C], SCALEOUT(SIG2WORD16(scratch[j*downsample]))));
+         } else
+#endif
+         {
+            for (j=0;j<Nd;j++)
+               y[j*C] = SCALEOUT(SIG2WORD16(scratch[j*downsample]));
+         }
       }
    } while (++c<C);
+   RESTORE_STACK;
 }
 
-/** Compute the IMDCT and apply window for all sub-frames and
-    all channels in a frame */
 #ifndef RESYNTH
 static
 #endif
-void compute_inv_mdcts(const CELTMode *mode, int shortBlocks, celt_sig *X,
-      celt_sig * OPUS_RESTRICT out_mem[], int C, int LM)
+void celt_synthesis(const CELTMode *mode, celt_norm *X, celt_sig * out_syn[],
+                    opus_val16 *oldBandE, int start, int effEnd, int C, int CC,
+                    int isTransient, int LM, int downsample,
+                    int silence, int arch)
 {
-   int b, c;
+   int c, i;
+   int M;
+   int b;
    int B;
-   int N;
+   int N, NB;
    int shift;
-   const int overlap = OVERLAP(mode);
+   int nbEBands;
+   int overlap;
+   VARDECL(celt_sig, freq);
+   SAVE_STACK;
 
-   if (shortBlocks)
+   overlap = mode->overlap;
+   nbEBands = mode->nbEBands;
+   N = mode->shortMdctSize<<LM;
+   ALLOC(freq, N, celt_sig); /**< Interleaved signal MDCTs */
+   M = 1<<LM;
+
+   if (isTransient)
    {
-      B = shortBlocks;
-      N = mode->shortMdctSize;
+      B = M;
+      NB = mode->shortMdctSize;
       shift = mode->maxLM;
    } else {
       B = 1;
-      N = mode->shortMdctSize<<LM;
+      NB = mode->shortMdctSize<<LM;
       shift = mode->maxLM-LM;
    }
-   c=0; do {
-      /* IMDCT on the interleaved the sub-frames, overlap-add is performed by the IMDCT */
+
+   if (CC==2&&C==1)
+   {
+      /* Copying a mono stream to two channels */
+      celt_sig *freq2;
+      denormalise_bands(mode, X, freq, oldBandE, start, effEnd, M,
+            downsample, silence);
+      /* Store a temporary copy in the output buffer because the IMDCT destroys its input. */
+      freq2 = out_syn[1]+overlap/2;
+      OPUS_COPY(freq2, freq, N);
       for (b=0;b<B;b++)
-         clt_mdct_backward(&mode->mdct, &X[b+c*N*B], out_mem[c]+N*b, mode->window, overlap, shift, B);
-   } while (++c<C);
+         clt_mdct_backward(&mode->mdct, &freq2[b], out_syn[0]+NB*b, mode->window, overlap, shift, B, arch);
+      for (b=0;b<B;b++)
+         clt_mdct_backward(&mode->mdct, &freq[b], out_syn[1]+NB*b, mode->window, overlap, shift, B, arch);
+   } else if (CC==1&&C==2)
+   {
+      /* Downmixing a stereo stream to mono */
+      celt_sig *freq2;
+      freq2 = out_syn[0]+overlap/2;
+      denormalise_bands(mode, X, freq, oldBandE, start, effEnd, M,
+            downsample, silence);
+      /* Use the output buffer as temp array before downmixing. */
+      denormalise_bands(mode, X+N, freq2, oldBandE+nbEBands, start, effEnd, M,
+            downsample, silence);
+      for (i=0;i<N;i++)
+         freq[i] = HALF32(ADD32(freq[i],freq2[i]));
+      for (b=0;b<B;b++)
+         clt_mdct_backward(&mode->mdct, &freq[b], out_syn[0]+NB*b, mode->window, overlap, shift, B, arch);
+   } else {
+      /* Normal case (mono or stereo) */
+      c=0; do {
+         denormalise_bands(mode, X+c*N, freq, oldBandE+c*nbEBands, start, effEnd, M,
+               downsample, silence);
+         for (b=0;b<B;b++)
+            clt_mdct_backward(&mode->mdct, &freq[b], out_syn[c]+NB*b, mode->window, overlap, shift, B, arch);
+      } while (++c<CC);
+   }
+   RESTORE_STACK;
 }
 
 static void tf_decode(int start, int end, int isTransient, int *tf_res, int LM, ec_dec *dec)
@@ -330,7 +396,23 @@
    pitch of 480 Hz. */
 #define PLC_PITCH_LAG_MIN (100)
 
-static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, opus_val16 * OPUS_RESTRICT pcm, int N, int LM)
+static int celt_plc_pitch_search(celt_sig *decode_mem[2], int C, int arch) /* pitch search used by the pitch-based PLC path; returns PLC_PITCH_LAG_MAX minus the index found by pitch_search() */
+{
+   int pitch_index;
+   VARDECL( opus_val16, lp_pitch_buf );
+   SAVE_STACK;
+   ALLOC( lp_pitch_buf, DECODE_BUFFER_SIZE>>1, opus_val16 ); /* scratch buffer of DECODE_BUFFER_SIZE/2 samples, released by RESTORE_STACK below */
+   pitch_downsample(decode_mem, lp_pitch_buf, /* presumably fills lp_pitch_buf with a downsampled mix of the C channels in decode_mem -- confirm against pitch.c */
+         DECODE_BUFFER_SIZE, C, arch);
+   pitch_search(lp_pitch_buf+(PLC_PITCH_LAG_MAX>>1), lp_pitch_buf, /* result is written through &pitch_index */
+         DECODE_BUFFER_SIZE-PLC_PITCH_LAG_MAX,
+         PLC_PITCH_LAG_MAX-PLC_PITCH_LAG_MIN, &pitch_index, arch);
+   pitch_index = PLC_PITCH_LAG_MAX-pitch_index; /* convert the search result into the value callers store as last_pitch_index */
+   RESTORE_STACK;
+   return pitch_index;
+}
+
+static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
 {
    int c;
    int i;
@@ -343,11 +425,9 @@
    int nbEBands;
    int overlap;
    int start;
-   int downsample;
    int loss_count;
    int noise_based;
    const opus_int16 *eBands;
-   VARDECL(celt_sig, scratch);
    SAVE_STACK;
 
    mode = st->mode;
@@ -367,40 +447,37 @@
 
    loss_count = st->loss_count;
    start = st->start;
-   downsample = st->downsample;
    noise_based = loss_count >= 5 || start != 0;
-   ALLOC(scratch, noise_based?N*C:N, celt_sig);
    if (noise_based)
    {
       /* Noise-based PLC/CNG */
-      celt_sig *freq;
+#ifdef NORM_ALIASING_HACK
+      celt_norm *X;
+#else
       VARDECL(celt_norm, X);
+#endif
       opus_uint32 seed;
-      opus_val16 *plcLogE;
       int end;
       int effEnd;
-
+      opus_val16 decay;
       end = st->end;
       effEnd = IMAX(start, IMIN(end, mode->effEBands));
 
-      /* Share the interleaved signal MDCT coefficient buffer with the
-         deemphasis scratch buffer. */
-      freq = scratch;
+#ifdef NORM_ALIASING_HACK
+      /* This is an ugly hack that breaks aliasing rules and would be easily broken,
+         but it saves almost 4kB of stack. */
+      X = (celt_norm*)(out_syn[C-1]+overlap/2);
+#else
       ALLOC(X, C*N, celt_norm);   /**< Interleaved normalised MDCTs */
+#endif
 
-      if (loss_count >= 5)
-         plcLogE = backgroundLogE;
-      else {
-         /* Energy decay */
-         opus_val16 decay = loss_count==0 ?
-               QCONST16(1.5f, DB_SHIFT) : QCONST16(.5f, DB_SHIFT);
-         c=0; do
-         {
-            for (i=start;i<end;i++)
-               oldBandE[c*nbEBands+i] -= decay;
-         } while (++c<C);
-         plcLogE = oldBandE;
-      }
+      /* Energy decay */
+      decay = loss_count==0 ? QCONST16(1.5f, DB_SHIFT) : QCONST16(.5f, DB_SHIFT);
+      c=0; do
+      {
+         for (i=start;i<end;i++)
+            oldBandE[c*nbEBands+i] = MAX16(backgroundLogE[c*nbEBands+i], oldBandE[c*nbEBands+i] - decay);
+      } while (++c<C);
       seed = st->rng;
       for (c=0;c<C;c++)
       {
@@ -416,25 +493,17 @@
                seed = celt_lcg_rand(seed);
                X[boffs+j] = (celt_norm)((opus_int32)seed>>20);
             }
-            renormalise_vector(X+boffs, blen, Q15ONE);
+            renormalise_vector(X+boffs, blen, Q15ONE, st->arch);
          }
       }
       st->rng = seed;
 
-      denormalise_bands(mode, X, freq, plcLogE, start, effEnd, C, 1<<LM);
-
-      c=0; do {
-         int bound = eBands[effEnd]<<LM;
-         if (downsample!=1)
-            bound = IMIN(bound, N/downsample);
-         for (i=bound;i<N;i++)
-            freq[c*N+i] = 0;
-      } while (++c<C);
       c=0; do {
          OPUS_MOVE(decode_mem[c], decode_mem[c]+N,
                DECODE_BUFFER_SIZE-N+(overlap>>1));
       } while (++c<C);
-      compute_inv_mdcts(mode, 0, freq, out_syn, C, LM);
+
+      celt_synthesis(mode, X, out_syn, oldBandE, start, effEnd, C, C, 0, LM, st->downsample, 0, st->arch);
    } else {
       /* Pitch-based PLC */
       const opus_val16 *window;
@@ -445,15 +514,7 @@
 
       if (loss_count == 0)
       {
-         VARDECL( opus_val16, lp_pitch_buf );
-         ALLOC( lp_pitch_buf, DECODE_BUFFER_SIZE>>1, opus_val16 );
-         pitch_downsample(decode_mem, lp_pitch_buf,
-               DECODE_BUFFER_SIZE, C, st->arch);
-         pitch_search(lp_pitch_buf+(PLC_PITCH_LAG_MAX>>1), lp_pitch_buf,
-               DECODE_BUFFER_SIZE-PLC_PITCH_LAG_MAX,
-               PLC_PITCH_LAG_MAX-PLC_PITCH_LAG_MIN, &pitch_index, st->arch);
-         pitch_index = PLC_PITCH_LAG_MAX-pitch_index;
-         st->last_pitch_index = pitch_index;
+         st->last_pitch_index = pitch_index = celt_plc_pitch_search(decode_mem, C, st->arch);
       } else {
          pitch_index = st->last_pitch_index;
          fade = QCONST16(.8f,15);
@@ -516,7 +577,7 @@
             }
             /* Compute the excitation for exc_length samples before the loss. */
             celt_fir(exc+MAX_PERIOD-exc_length, lpc+c*LPC_ORDER,
-                  exc+MAX_PERIOD-exc_length, exc_length, LPC_ORDER, lpc_mem);
+                  exc+MAX_PERIOD-exc_length, exc_length, LPC_ORDER, lpc_mem, st->arch);
          }
 
          /* Check if the waveform is decaying, and if so how fast.
@@ -583,7 +644,7 @@
                the signal domain. */
             celt_iir(buf+DECODE_BUFFER_SIZE-N, lpc+c*LPC_ORDER,
                   buf+DECODE_BUFFER_SIZE-N, extrapolation_len, LPC_ORDER,
-                  lpc_mem);
+                  lpc_mem, st->arch);
          }
 
          /* Check if the synthesis energy is higher than expected, which can
@@ -631,7 +692,7 @@
          comb_filter(etmp, buf+DECODE_BUFFER_SIZE,
               st->postfilter_period, st->postfilter_period, overlap,
               -st->postfilter_gain, -st->postfilter_gain,
-              st->postfilter_tapset, st->postfilter_tapset, NULL, 0);
+              st->postfilter_tapset, st->postfilter_tapset, NULL, 0, st->arch);
 
          /* Simulate TDAC on the concealed audio so that it blends with the
             MDCT of the next frame. */
@@ -644,22 +705,23 @@
       } while (++c<C);
    }
 
-   deemphasis(out_syn, pcm, N, C, downsample,
-         mode->preemph, st->preemph_memD, scratch);
-
    st->loss_count = loss_count+1;
 
    RESTORE_STACK;
 }
 
-int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *data, int len, opus_val16 * OPUS_RESTRICT pcm, int frame_size, ec_dec *dec)
+int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *data,
+      int len, opus_val16 * OPUS_RESTRICT pcm, int frame_size, ec_dec *dec, int accum)
 {
    int c, i, N;
    int spread_decision;
    opus_int32 bits;
    ec_dec _dec;
-   VARDECL(celt_sig, freq);
+#ifdef NORM_ALIASING_HACK
+   celt_norm *X;
+#else
    VARDECL(celt_norm, X);
+#endif
    VARDECL(int, fine_quant);
    VARDECL(int, pulses);
    VARDECL(int, cap);
@@ -677,6 +739,8 @@
    int intra_ener;
    const int CC = st->channels;
    int LM, M;
+   int start;
+   int end;
    int effEnd;
    int codedBands;
    int alloc_trim;
@@ -703,11 +767,10 @@
    nbEBands = mode->nbEBands;
    overlap = mode->overlap;
    eBands = mode->eBands;
+   start = st->start;
+   end = st->end;
    frame_size *= st->downsample;
 
-   c=0; do {
-      decode_mem[c] = st->_decode_mem + c*(DECODE_BUFFER_SIZE+overlap);
-   } while (++c<CC);
    lpc = (opus_val16*)(st->_decode_mem+(DECODE_BUFFER_SIZE+overlap)*CC);
    oldBandE = lpc+CC*LPC_ORDER;
    oldLogE = oldBandE + 2*nbEBands;
@@ -725,7 +788,7 @@
          if (data0<0)
             return OPUS_INVALID_PACKET;
       }
-      st->end = IMAX(1, mode->effEBands-2*(data0>>5));
+      st->end = end = IMAX(1, mode->effEBands-2*(data0>>5));
       LM = (data0>>3)&0x3;
       C = 1 + ((data0>>2)&0x1);
       data++;
@@ -752,14 +815,19 @@
       return OPUS_BAD_ARG;
 
    N = M*mode->shortMdctSize;
+   c=0; do {
+      decode_mem[c] = st->_decode_mem + c*(DECODE_BUFFER_SIZE+overlap);
+      out_syn[c] = decode_mem[c]+DECODE_BUFFER_SIZE-N;
+   } while (++c<CC);
 
-   effEnd = st->end;
+   effEnd = end;
    if (effEnd > mode->effEBands)
       effEnd = mode->effEBands;
 
    if (data == NULL || len<=1)
    {
-      celt_decode_lost(st, pcm, N, LM);
+      celt_decode_lost(st, N, LM);
+      deemphasis(out_syn, pcm, N, CC, st->downsample, mode->preemph, st->preemph_memD, accum);
       RESTORE_STACK;
       return frame_size/st->downsample;
    }
@@ -795,7 +863,7 @@
    postfilter_gain = 0;
    postfilter_pitch = 0;
    postfilter_tapset = 0;
-   if (st->start==0 && tell+16 <= total_bits)
+   if (start==0 && tell+16 <= total_bits)
    {
       if(ec_dec_bit_logp(dec, 1))
       {
@@ -826,11 +894,11 @@
    /* Decode the global flags (first symbols in the stream) */
    intra_ener = tell+3<=total_bits ? ec_dec_bit_logp(dec, 3) : 0;
    /* Get band energies */
-   unquant_coarse_energy(mode, st->start, st->end, oldBandE,
+   unquant_coarse_energy(mode, start, end, oldBandE,
          intra_ener, dec, C, LM);
 
    ALLOC(tf_res, nbEBands, int);
-   tf_decode(st->start, st->end, isTransient, tf_res, LM, dec);
+   tf_decode(start, end, isTransient, tf_res, LM, dec);
 
    tell = ec_tell(dec);
    spread_decision = SPREAD_NORMAL;
@@ -846,7 +914,7 @@
    dynalloc_logp = 6;
    total_bits<<=BITRES;
    tell = ec_tell_frac(dec);
-   for (i=st->start;i<st->end;i++)
+   for (i=start;i<end;i++)
    {
       int width, quanta;
       int dynalloc_loop_logp;
@@ -885,84 +953,62 @@
    ALLOC(pulses, nbEBands, int);
    ALLOC(fine_priority, nbEBands, int);
 
-   codedBands = compute_allocation(mode, st->start, st->end, offsets, cap,
+   codedBands = compute_allocation(mode, start, end, offsets, cap,
          alloc_trim, &intensity, &dual_stereo, bits, &balance, pulses,
          fine_quant, fine_priority, C, LM, dec, 0, 0, 0);
 
-   unquant_fine_energy(mode, st->start, st->end, oldBandE, fine_quant, dec, C);
+   unquant_fine_energy(mode, start, end, oldBandE, fine_quant, dec, C);
+
+   c=0; do {
+      OPUS_MOVE(decode_mem[c], decode_mem[c]+N, DECODE_BUFFER_SIZE-N+overlap/2);
+   } while (++c<CC);
 
    /* Decode fixed codebook */
    ALLOC(collapse_masks, C*nbEBands, unsigned char);
-   ALLOC(X, C*N, celt_norm);   /**< Interleaved normalised MDCTs */
 
-   quant_all_bands(0, mode, st->start, st->end, X, C==2 ? X+N : NULL, collapse_masks,
+#ifdef NORM_ALIASING_HACK
+   /* This is an ugly hack that breaks aliasing rules and would be easily broken,
+      but it saves almost 4kB of stack. */
+   X = (celt_norm*)(out_syn[CC-1]+overlap/2);
+#else
+   ALLOC(X, C*N, celt_norm);   /**< Interleaved normalised MDCTs */
+#endif
+
+   quant_all_bands(0, mode, start, end, X, C==2 ? X+N : NULL, collapse_masks,
          NULL, pulses, shortBlocks, spread_decision, dual_stereo, intensity, tf_res,
-         len*(8<<BITRES)-anti_collapse_rsv, balance, dec, LM, codedBands, &st->rng);
+         len*(8<<BITRES)-anti_collapse_rsv, balance, dec, LM, codedBands, &st->rng, st->arch);
 
    if (anti_collapse_rsv > 0)
    {
       anti_collapse_on = ec_dec_bits(dec, 1);
    }
 
-   unquant_energy_finalise(mode, st->start, st->end, oldBandE,
+   unquant_energy_finalise(mode, start, end, oldBandE,
          fine_quant, fine_priority, len*8-ec_tell(dec), dec, C);
 
    if (anti_collapse_on)
       anti_collapse(mode, X, collapse_masks, LM, C, N,
-            st->start, st->end, oldBandE, oldLogE, oldLogE2, pulses, st->rng);
-
-   ALLOC(freq, IMAX(CC,C)*N, celt_sig); /**< Interleaved signal MDCTs */
+            start, end, oldBandE, oldLogE, oldLogE2, pulses, st->rng, st->arch);
 
    if (silence)
    {
       for (i=0;i<C*nbEBands;i++)
          oldBandE[i] = -QCONST16(28.f,DB_SHIFT);
-      for (i=0;i<C*N;i++)
-         freq[i] = 0;
-   } else {
-      /* Synthesis */
-      denormalise_bands(mode, X, freq, oldBandE, st->start, effEnd, C, M);
-   }
-   c=0; do {
-      OPUS_MOVE(decode_mem[c], decode_mem[c]+N, DECODE_BUFFER_SIZE-N+overlap/2);
-   } while (++c<CC);
-
-   c=0; do {
-      int bound = M*eBands[effEnd];
-      if (st->downsample!=1)
-         bound = IMIN(bound, N/st->downsample);
-      for (i=bound;i<N;i++)
-         freq[c*N+i] = 0;
-   } while (++c<C);
-
-   c=0; do {
-      out_syn[c] = decode_mem[c]+DECODE_BUFFER_SIZE-N;
-   } while (++c<CC);
-
-   if (CC==2&&C==1)
-   {
-      for (i=0;i<N;i++)
-         freq[N+i] = freq[i];
-   }
-   if (CC==1&&C==2)
-   {
-      for (i=0;i<N;i++)
-         freq[i] = HALF32(ADD32(freq[i],freq[N+i]));
    }
 
-   /* Compute inverse MDCTs */
-   compute_inv_mdcts(mode, shortBlocks, freq, out_syn, CC, LM);
+   celt_synthesis(mode, X, out_syn, oldBandE, start, effEnd,
+                  C, CC, isTransient, LM, st->downsample, silence, st->arch);
 
    c=0; do {
       st->postfilter_period=IMAX(st->postfilter_period, COMBFILTER_MINPERIOD);
       st->postfilter_period_old=IMAX(st->postfilter_period_old, COMBFILTER_MINPERIOD);
       comb_filter(out_syn[c], out_syn[c], st->postfilter_period_old, st->postfilter_period, mode->shortMdctSize,
             st->postfilter_gain_old, st->postfilter_gain, st->postfilter_tapset_old, st->postfilter_tapset,
-            mode->window, overlap);
+            mode->window, overlap, st->arch);
       if (LM!=0)
          comb_filter(out_syn[c]+mode->shortMdctSize, out_syn[c]+mode->shortMdctSize, st->postfilter_period, postfilter_pitch, N-mode->shortMdctSize,
                st->postfilter_gain, postfilter_gain, st->postfilter_tapset, postfilter_tapset,
-               mode->window, overlap);
+               mode->window, overlap, st->arch);
 
    } while (++c<CC);
    st->postfilter_period_old = st->postfilter_period;
@@ -978,32 +1024,36 @@
       st->postfilter_tapset_old = st->postfilter_tapset;
    }
 
-   if (C==1) {
-      for (i=0;i<nbEBands;i++)
-         oldBandE[nbEBands+i]=oldBandE[i];
-   }
+   if (C==1)
+      OPUS_COPY(&oldBandE[nbEBands], oldBandE, nbEBands);
 
    /* In case start or end were to change */
    if (!isTransient)
    {
+      opus_val16 max_background_increase;
+      OPUS_COPY(oldLogE2, oldLogE, 2*nbEBands);
+      OPUS_COPY(oldLogE, oldBandE, 2*nbEBands);
+      /* In normal circumstances, we only allow the noise floor to increase by
+         up to 2.4 dB/second, but when we're in DTX, we allow up to 6 dB
+         increase for each update.*/
+      if (st->loss_count < 10)
+         max_background_increase = M*QCONST16(0.001f,DB_SHIFT);
+      else
+         max_background_increase = QCONST16(1.f,DB_SHIFT);
       for (i=0;i<2*nbEBands;i++)
-         oldLogE2[i] = oldLogE[i];
-      for (i=0;i<2*nbEBands;i++)
-         oldLogE[i] = oldBandE[i];
-      for (i=0;i<2*nbEBands;i++)
-         backgroundLogE[i] = MIN16(backgroundLogE[i] + M*QCONST16(0.001f,DB_SHIFT), oldBandE[i]);
+         backgroundLogE[i] = MIN16(backgroundLogE[i] + max_background_increase, oldBandE[i]);
    } else {
       for (i=0;i<2*nbEBands;i++)
          oldLogE[i] = MIN16(oldLogE[i], oldBandE[i]);
    }
    c=0; do
    {
-      for (i=0;i<st->start;i++)
+      for (i=0;i<start;i++)
       {
          oldBandE[c*nbEBands+i]=0;
          oldLogE[c*nbEBands+i]=oldLogE2[c*nbEBands+i]=-QCONST16(28.f,DB_SHIFT);
       }
-      for (i=st->end;i<nbEBands;i++)
+      for (i=end;i<nbEBands;i++)
       {
          oldBandE[c*nbEBands+i]=0;
          oldLogE[c*nbEBands+i]=oldLogE2[c*nbEBands+i]=-QCONST16(28.f,DB_SHIFT);
@@ -1011,8 +1061,7 @@
    } while (++c<2);
    st->rng = dec->rng;
 
-   /* We reuse freq[] as scratch space for the de-emphasis */
-   deemphasis(out_syn, pcm, N, CC, st->downsample, mode->preemph, st->preemph_memD, freq);
+   deemphasis(out_syn, pcm, N, CC, st->downsample, mode->preemph, st->preemph_memD, accum);
    st->loss_count = 0;
    RESTORE_STACK;
    if (ec_tell(dec) > 8*len)
@@ -1028,7 +1077,7 @@
 #ifdef FIXED_POINT
 int opus_custom_decode(CELTDecoder * OPUS_RESTRICT st, const unsigned char *data, int len, opus_int16 * OPUS_RESTRICT pcm, int frame_size)
 {
-   return celt_decode_with_ec(st, data, len, pcm, frame_size, NULL);
+   return celt_decode_with_ec(st, data, len, pcm, frame_size, NULL, 0); /* NULL: no ec_dec supplied; trailing 0: the accum argument */
 }
 
 #ifndef DISABLE_FLOAT_API
@@ -1045,7 +1094,7 @@
    N = frame_size;
 
    ALLOC(out, C*N, opus_int16);
-   ret=celt_decode_with_ec(st, data, len, out, frame_size, NULL);
+   ret=celt_decode_with_ec(st, data, len, out, frame_size, NULL, 0);
    if (ret>0)
       for (j=0;j<C*ret;j++)
          pcm[j]=out[j]*(1.f/32768.f);
@@ -1059,7 +1108,7 @@
 
 int opus_custom_decode_float(CELTDecoder * OPUS_RESTRICT st, const unsigned char *data, int len, float * OPUS_RESTRICT pcm, int frame_size)
 {
-   return celt_decode_with_ec(st, data, len, pcm, frame_size, NULL);
+   return celt_decode_with_ec(st, data, len, pcm, frame_size, NULL, 0); /* NULL: no ec_dec supplied; trailing 0: the accum argument */
 }
 
 int opus_custom_decode(CELTDecoder * OPUS_RESTRICT st, const unsigned char *data, int len, opus_int16 * OPUS_RESTRICT pcm, int frame_size)
@@ -1075,7 +1124,7 @@
    N = frame_size;
    ALLOC(out, C*N, celt_sig);
 
-   ret=celt_decode_with_ec(st, data, len, out, frame_size, NULL);
+   ret=celt_decode_with_ec(st, data, len, out, frame_size, NULL, 0);
 
    if (ret>0)
       for (j=0;j<C*ret;j++)
diff --git a/celt/celt_encoder.c b/celt/celt_encoder.c
index ffff077..41fbfd4 100644
--- a/celt/celt_encoder.c
+++ b/celt/celt_encoder.c
@@ -57,7 +57,6 @@
  */
 struct OpusCustomEncoder {
    const OpusCustomMode *mode;     /**< Mode used by the encoder */
-   int overlap;
    int channels;
    int stream_channels;
 
@@ -173,7 +172,6 @@
    OPUS_CLEAR((char*)st, opus_custom_encoder_get_size(mode, channels));
 
    st->mode = mode;
-   st->overlap = mode->overlap;
    st->stream_channels = st->channels = channels;
 
    st->upsample = 1;
@@ -276,8 +274,7 @@
       }
       /*printf("\n");*/
       /* First few samples are bad because we don't propagate the memory */
-      for (i=0;i<12;i++)
-         tmp[i] = 0;
+      OPUS_CLEAR(tmp, 12);
 
 #ifdef FIXED_POINT
       /* Normalize tmp to max range */
@@ -346,9 +343,9 @@
       {
          int id;
 #ifdef FIXED_POINT
-         id = IMAX(0,IMIN(127,MULT16_32_Q15(tmp[i],norm))); /* Do not round to nearest */
+         id = MAX32(0,MIN32(127,MULT16_32_Q15(tmp[i]+EPSILON,norm))); /* Do not round to nearest */
 #else
-         id = IMAX(0,IMIN(127,(int)floor(64*norm*tmp[i]))); /* Do not round to nearest */
+         id = (int)MAX32(0,MIN32(127,floor(64*norm*(tmp[i]+EPSILON)))); /* Do not round to nearest */
 #endif
          unmask += inv_table[id];
       }
@@ -366,7 +363,7 @@
    /* Arbitrary metric for VBR boost */
    tf_max = MAX16(0,celt_sqrt(27*mask_metric)-42);
    /* *tf_estimate = 1 + MIN16(1, sqrt(MAX16(0, tf_max-30))/20); */
-   *tf_estimate = celt_sqrt(MAX16(0, SHL32(MULT16_16(QCONST16(0.0069,14),MIN16(163,tf_max)),14)-QCONST32(0.139,28)));
+   *tf_estimate = celt_sqrt(MAX32(0, SHL32(MULT16_16(QCONST16(0.0069,14),MIN16(163,tf_max)),14)-QCONST32(0.139,28)));
    /*printf("%d %f\n", tf_max, mask_metric);*/
    RESTORE_STACK;
 #ifdef FUZZING
@@ -378,8 +375,8 @@
 
 /* Looks for sudden increases of energy to decide whether we need to patch
    the transient decision */
-int patch_transient_decision(opus_val16 *newE, opus_val16 *oldE, int nbEBands,
-      int end, int C)
+static int patch_transient_decision(opus_val16 *newE, opus_val16 *oldE, int nbEBands,
+      int start, int end, int C)
 {
    int i, c;
    opus_val32 mean_diff=0;
@@ -388,28 +385,28 @@
       avoid false detection caused by irrelevant bands */
    if (C==1)
    {
-      spread_old[0] = oldE[0];
-      for (i=1;i<end;i++)
+      spread_old[start] = oldE[start];
+      for (i=start+1;i<end;i++)
          spread_old[i] = MAX16(spread_old[i-1]-QCONST16(1.0f, DB_SHIFT), oldE[i]);
    } else {
-      spread_old[0] = MAX16(oldE[0],oldE[nbEBands]);
-      for (i=1;i<end;i++)
+      spread_old[start] = MAX16(oldE[start],oldE[start+nbEBands]);
+      for (i=start+1;i<end;i++)
          spread_old[i] = MAX16(spread_old[i-1]-QCONST16(1.0f, DB_SHIFT),
                                MAX16(oldE[i],oldE[i+nbEBands]));
    }
-   for (i=end-2;i>=0;i--)
+   for (i=end-2;i>=start;i--)
       spread_old[i] = MAX16(spread_old[i], spread_old[i+1]-QCONST16(1.0f, DB_SHIFT));
    /* Compute mean increase */
    c=0; do {
-      for (i=2;i<end-1;i++)
+      for (i=IMAX(2,start);i<end-1;i++)
       {
          opus_val16 x1, x2;
-         x1 = MAX16(0, newE[i]);
+         x1 = MAX16(0, newE[i + c*nbEBands]);
          x2 = MAX16(0, spread_old[i]);
          mean_diff = ADD32(mean_diff, EXTEND32(MAX16(0, SUB16(x1, x2))));
       }
    } while (++c<C);
-   mean_diff = DIV32(mean_diff, C*(end-3));
+   mean_diff = DIV32(mean_diff, C*(end-1-IMAX(2,start)));
    /*printf("%f %f %d\n", mean_diff, max_diff, count);*/
    return mean_diff > QCONST16(1.f, DB_SHIFT);
 }
@@ -417,9 +414,10 @@
 /** Apply window and compute the MDCT for all sub-frames and
     all channels in a frame */
 static void compute_mdcts(const CELTMode *mode, int shortBlocks, celt_sig * OPUS_RESTRICT in,
-                          celt_sig * OPUS_RESTRICT out, int C, int CC, int LM, int upsample)
+                          celt_sig * OPUS_RESTRICT out, int C, int CC, int LM, int upsample,
+                          int arch)
 {
-   const int overlap = OVERLAP(mode);
+   const int overlap = mode->overlap;
    int N;
    int B;
    int shift;
@@ -438,7 +436,9 @@
       for (b=0;b<B;b++)
       {
          /* Interleaving the sub-frames while doing the MDCTs */
-         clt_mdct_forward(&mode->mdct, in+c*(B*N+overlap)+b*N, &out[b+c*N*B], mode->window, overlap, shift, B);
+         clt_mdct_forward(&mode->mdct, in+c*(B*N+overlap)+b*N,
+                          &out[b+c*N*B], mode->window, overlap, shift, B,
+                          arch);
       }
    } while (++c<CC);
    if (CC==2&&C==1)
@@ -453,8 +453,7 @@
          int bound = B*N/upsample;
          for (i=0;i<bound;i++)
             out[c*B*N+i] *= upsample;
-         for (;i<B*N;i++)
-            out[c*B*N+i] = 0;
+         OPUS_CLEAR(&out[c*B*N+bound], B*N-bound);
       } while (++c<C);
    }
 }
@@ -469,26 +468,30 @@
    int Nu;
 
    coef0 = coef[0];
+   m = *mem;
 
+   /* Fast path for the normal 48kHz case and no clipping */
+   if (coef[1] == 0 && upsample == 1 && !clip)
+   {
+      for (i=0;i<N;i++)
+      {
+         opus_val16 x;
+         x = SCALEIN(pcmp[CC*i]);
+         /* Apply pre-emphasis */
+         inp[i] = SHL32(x, SIG_SHIFT) - m;
+         m = SHR32(MULT16_16(coef0, x), 15-SIG_SHIFT);
+      }
+      *mem = m;
+      return;
+   }
 
    Nu = N/upsample;
    if (upsample!=1)
    {
-      for (i=0;i<N;i++)
-         inp[i] = 0;
+      OPUS_CLEAR(inp, N);
    }
    for (i=0;i<Nu;i++)
-   {
-      celt_sig x;
-
-      x = SCALEIN(pcmp[CC*i]);
-#ifndef FIXED_POINT
-      /* Replace NaNs with zeros */
-      if (!(x==x))
-         x = 0;
-#endif
-      inp[i*upsample] = x;
-   }
+      inp[i*upsample] = SCALEIN(pcmp[CC*i]);
 
 #ifndef FIXED_POINT
    if (clip)
@@ -500,7 +503,6 @@
 #else
    (void)clip; /* Avoids a warning about clip being unused. */
 #endif
-   m = *mem;
 #ifdef CUSTOM_MODES
    if (coef[1] != 0)
    {
@@ -520,11 +522,11 @@
    {
       for (i=0;i<N;i++)
       {
-         celt_sig x;
-         x = SHL32(inp[i], SIG_SHIFT);
+         opus_val16 x;
+         x = inp[i];
          /* Apply pre-emphasis */
-         inp[i] = x + m;
-         m = - MULT16_32_Q15(coef0, x);
+         inp[i] = SHL32(x, SIG_SHIFT) - m;
+         m = SHR32(MULT16_16(coef0, x), 15-SIG_SHIFT);
       }
    }
    *mem = m;
@@ -575,15 +577,14 @@
    *tf_sum = 0;
    for (i=0;i<len;i++)
    {
-      int j, k, N;
+      int k, N;
       int narrow;
       opus_val32 L1, best_L1;
       int best_level=0;
       N = (m->eBands[i+1]-m->eBands[i])<<LM;
       /* band is too narrow to be split down to LM=-1 */
       narrow = (m->eBands[i+1]-m->eBands[i])==1;
-      for (j=0;j<N;j++)
-         tmp[j] = X[tf_chan*N0 + j+(m->eBands[i]<<LM)];
+      OPUS_COPY(tmp, &X[tf_chan*N0 + (m->eBands[i]<<LM)], N);
       /* Just add the right channel if we're in stereo */
       /*if (C==2)
          for (j=0;j<N;j++)
@@ -593,8 +594,7 @@
       /* Check the -1 case for transients */
       if (isTransient && !narrow)
       {
-         for (j=0;j<N;j++)
-            tmp_1[j] = tmp[j];
+         OPUS_COPY(tmp_1, tmp, N);
          haar1(tmp_1, N>>LM, 1<<LM);
          L1 = l1_metric(tmp_1, N, LM+1, bias);
          if (L1<best_L1)
@@ -754,12 +754,12 @@
 static int alloc_trim_analysis(const CELTMode *m, const celt_norm *X,
       const opus_val16 *bandLogE, int end, int LM, int C, int N0,
       AnalysisInfo *analysis, opus_val16 *stereo_saving, opus_val16 tf_estimate,
-      int intensity, opus_val16 surround_trim)
+      int intensity, opus_val16 surround_trim, int arch)
 {
    int i;
    opus_val32 diff=0;
    int c;
-   int trim_index = 5;
+   int trim_index;
    opus_val16 trim = QCONST16(5.f, 8);
    opus_val16 logXC, logXC2;
    if (C==2)
@@ -769,10 +769,9 @@
       /* Compute inter-channel correlation for low frequencies */
       for (i=0;i<8;i++)
       {
-         int j;
-         opus_val32 partial = 0;
-         for (j=m->eBands[i]<<LM;j<m->eBands[i+1]<<LM;j++)
-            partial = MAC16_16(partial, X[j], X[N0+j]);
+         opus_val32 partial;
+         partial = celt_inner_prod(&X[m->eBands[i]<<LM], &X[N0+(m->eBands[i]<<LM)],
+               (m->eBands[i+1]-m->eBands[i])<<LM, arch);
          sum = ADD16(sum, EXTRACT16(SHR32(partial, 18)));
       }
       sum = MULT16_16_Q15(QCONST16(1.f/8, 15), sum);
@@ -780,22 +779,13 @@
       minXC = sum;
       for (i=8;i<intensity;i++)
       {
-         int j;
-         opus_val32 partial = 0;
-         for (j=m->eBands[i]<<LM;j<m->eBands[i+1]<<LM;j++)
-            partial = MAC16_16(partial, X[j], X[N0+j]);
+         opus_val32 partial;
+         partial = celt_inner_prod(&X[m->eBands[i]<<LM], &X[N0+(m->eBands[i]<<LM)],
+               (m->eBands[i+1]-m->eBands[i])<<LM, arch);
          minXC = MIN16(minXC, ABS16(EXTRACT16(SHR32(partial, 18))));
       }
       minXC = MIN16(QCONST16(1.f, 10), ABS16(minXC));
       /*printf ("%f\n", sum);*/
-      if (sum > QCONST16(.995f,10))
-         trim_index-=4;
-      else if (sum > QCONST16(.92f,10))
-         trim_index-=3;
-      else if (sum > QCONST16(.85f,10))
-         trim_index-=2;
-      else if (sum > QCONST16(.8f,10))
-         trim_index-=1;
       /* mid-side savings estimations based on the LF average*/
       logXC = celt_log2(QCONST32(1.001f, 20)-MULT16_16(sum, sum));
       /* mid-side savings estimations based on min correlation */
@@ -819,14 +809,6 @@
    } while (++c<C);
    diff /= C*(end-1);
    /*printf("%f\n", diff);*/
-   if (diff > QCONST16(2.f, DB_SHIFT))
-      trim_index--;
-   if (diff > QCONST16(8.f, DB_SHIFT))
-      trim_index--;
-   if (diff < -QCONST16(4.f, DB_SHIFT))
-      trim_index++;
-   if (diff < -QCONST16(10.f, DB_SHIFT))
-      trim_index++;
    trim -= MAX16(-QCONST16(2.f, 8), MIN16(QCONST16(2.f, 8), SHR16(diff+QCONST16(1.f, DB_SHIFT),DB_SHIFT-8)/6 ));
    trim -= SHR16(surround_trim, DB_SHIFT-8);
    trim -= 2*SHR16(tf_estimate, 14-8);
@@ -836,6 +818,8 @@
       trim -= MAX16(-QCONST16(2.f, 8), MIN16(QCONST16(2.f, 8),
             (opus_val16)(QCONST16(2.f, 8)*(analysis->tonality_slope+.05f))));
    }
+#else
+   (void)analysis;
 #endif
 
 #ifdef FIXED_POINT
@@ -843,10 +827,7 @@
 #else
    trim_index = (int)floor(.5f+trim);
 #endif
-   if (trim_index<0)
-      trim_index = 0;
-   if (trim_index>10)
-      trim_index = 10;
+   trim_index = IMAX(0, IMIN(10, trim_index));
    /*printf("%d\n", trim_index);*/
 #ifdef FUZZING
    trim_index = rand()%11;
@@ -886,6 +867,66 @@
          > MULT16_32_Q15(m->eBands[13]<<(LM+1), sumLR);
 }
 
+#define MSWAP(a,b) do {opus_val16 tmp = a;a=b;b=tmp;} while(0)
+static opus_val16 median_of_5(const opus_val16 *x)
+{
+   /* Returns the median (third smallest value) of x[0..4]; x is only read.
+      x[0..1] and x[3..4] are each ordered into a (low, high) pair, the pair
+      with the smaller low value is moved first, and the result is then
+      picked by comparing against the middle input x[2].
+      Uses the MSWAP() macro defined just above this function. */
+   opus_val16 lo_a, hi_a, mid, lo_b, hi_b;
+   const int swap_a = x[0] > x[1];
+   const int swap_b = x[3] > x[4];
+
+   mid = x[2];
+   /* First pair, ordered so that lo_a <= hi_a. */
+   lo_a = swap_a ? x[1] : x[0];
+   hi_a = swap_a ? x[0] : x[1];
+   /* Second pair, ordered so that lo_b <= hi_b. */
+   lo_b = swap_b ? x[4] : x[3];
+   hi_b = swap_b ? x[3] : x[4];
+
+   /* Exchange the pairs as whole units so that lo_a <= lo_b; each pair
+      keeps its own low/high ordering. */
+   if (lo_a > lo_b)
+   {
+      MSWAP(lo_a, lo_b);
+      MSWAP(hi_a, hi_b);
+   }
+
+   /* lo_a is now no larger than hi_a, lo_b and hi_b, so the median is
+      chosen among mid, hi_a, lo_b and hi_b. */
+   if (mid > hi_a)
+   {
+      /* Here lo_a <= hi_a < mid. */
+      return (hi_a < lo_b) ? MIN16(mid, lo_b) : MIN16(hi_b, hi_a);
+   } else {
+      /* Here mid <= hi_a. */
+      return (mid < lo_b) ? MIN16(hi_a, lo_b) : MIN16(mid, hi_b);
+   }
+}
+
+static opus_val16 median_of_3(const opus_val16 *x)
+{
+   /* Returns the median (second smallest value) of x[0..2]; x is only
+      read. x[0] and x[1] are ordered first, then x[2] is placed relative
+      to that ordered pair. */
+   opus_val16 lo, hi, last;
+   const int swapped = x[0] > x[1];
+
+   lo = swapped ? x[1] : x[0];
+   hi = swapped ? x[0] : x[1];
+   last = x[2];
+
+   /* Both pair values are below x[2]: the larger of the pair is the median. */
+   if (hi < last)
+      return hi;
+   /* x[2] lies between the pair values (or ties with hi): x[2] itself;
+      otherwise x[2] is not above lo and lo is the median. */
+   return (lo < last) ? last : lo;
+}
+
 static opus_val16 dynalloc_analysis(const opus_val16 *bandLogE, const opus_val16 *bandLogE2,
       int nbEBands, int start, int end, int C, int *offsets, int lsb_depth, const opus_int16 *logN,
       int isTransient, int vbr, int constrained_vbr, const opus_int16 *eBands, int LM,
@@ -899,8 +940,7 @@
    SAVE_STACK;
    ALLOC(follower, C*nbEBands, opus_val16);
    ALLOC(noise_floor, C*nbEBands, opus_val16);
-   for (i=0;i<nbEBands;i++)
-      offsets[i] = 0;
+   OPUS_CLEAR(offsets, nbEBands);
    /* Dynamic allocation code */
    maxDepth=-QCONST16(31.9f, DB_SHIFT);
    for (i=0;i<end;i++)
@@ -922,7 +962,11 @@
       int last=0;
       c=0;do
       {
-         follower[c*nbEBands] = bandLogE2[c*nbEBands];
+         opus_val16 offset;
+         opus_val16 tmp;
+         opus_val16 *f;
+         f = &follower[c*nbEBands];
+         f[0] = bandLogE2[c*nbEBands];
          for (i=1;i<end;i++)
          {
             /* The last band to be at least 3 dB higher than the previous one
@@ -930,12 +974,26 @@
                bandlimited signals. */
             if (bandLogE2[c*nbEBands+i] > bandLogE2[c*nbEBands+i-1]+QCONST16(.5f,DB_SHIFT))
                last=i;
-            follower[c*nbEBands+i] = MIN16(follower[c*nbEBands+i-1]+QCONST16(1.5f,DB_SHIFT), bandLogE2[c*nbEBands+i]);
+            f[i] = MIN16(f[i-1]+QCONST16(1.5f,DB_SHIFT), bandLogE2[c*nbEBands+i]);
          }
          for (i=last-1;i>=0;i--)
-            follower[c*nbEBands+i] = MIN16(follower[c*nbEBands+i], MIN16(follower[c*nbEBands+i+1]+QCONST16(2.f,DB_SHIFT), bandLogE2[c*nbEBands+i]));
+            f[i] = MIN16(f[i], MIN16(f[i+1]+QCONST16(2.f,DB_SHIFT), bandLogE2[c*nbEBands+i]));
+
+         /* Combine with a median filter to avoid dynalloc triggering unnecessarily.
+            The "offset" value controls how conservative we are -- a higher offset
+            reduces the impact of the median filter and makes dynalloc use more bits. */
+         offset = QCONST16(1.f, DB_SHIFT);
+         for (i=2;i<end-2;i++)
+            f[i] = MAX16(f[i], median_of_5(&bandLogE2[c*nbEBands+i-2])-offset);
+         tmp = median_of_3(&bandLogE2[c*nbEBands])-offset;
+         f[0] = MAX16(f[0], tmp);
+         f[1] = MAX16(f[1], tmp);
+         tmp = median_of_3(&bandLogE2[c*nbEBands+end-3])-offset;
+         f[end-2] = MAX16(f[end-2], tmp);
+         f[end-1] = MAX16(f[end-1], tmp);
+
          for (i=0;i<end;i++)
-            follower[c*nbEBands+i] = MAX16(follower[c*nbEBands+i], noise_floor[i]);
+            f[i] = MAX16(f[i], noise_floor[i]);
       } while (++c<C);
       if (C==2)
       {
@@ -1016,9 +1074,11 @@
    opus_val16 pf_threshold;
    int pf_on;
    int qg;
+   int overlap;
    SAVE_STACK;
 
    mode = st->mode;
+   overlap = mode->overlap;
    ALLOC(_pre, CC*(N+COMBFILTER_MAXPERIOD), celt_sig);
 
    pre[0] = _pre;
@@ -1027,7 +1087,7 @@
 
    c=0; do {
       OPUS_COPY(pre[c], prefilter_mem+c*COMBFILTER_MAXPERIOD, COMBFILTER_MAXPERIOD);
-      OPUS_COPY(pre[c]+COMBFILTER_MAXPERIOD, in+c*(N+st->overlap)+st->overlap, N);
+      OPUS_COPY(pre[c]+COMBFILTER_MAXPERIOD, in+c*(N+overlap)+overlap, N);
    } while (++c<CC);
 
    if (enabled)
@@ -1044,7 +1104,7 @@
       pitch_index = COMBFILTER_MAXPERIOD-pitch_index;
 
       gain1 = remove_doubling(pitch_buf, COMBFILTER_MAXPERIOD, COMBFILTER_MINPERIOD,
-            N, &pitch_index, st->prefilter_period, st->prefilter_gain);
+            N, &pitch_index, st->prefilter_period, st->prefilter_gain, st->arch);
       if (pitch_index > COMBFILTER_MAXPERIOD-2)
          pitch_index = COMBFILTER_MAXPERIOD-2;
       gain1 = MULT16_16_Q15(QCONST16(.7f,15),gain1);
@@ -1100,18 +1160,18 @@
    /*printf("%d %f\n", pitch_index, gain1);*/
 
    c=0; do {
-      int offset = mode->shortMdctSize-st->overlap;
+      int offset = mode->shortMdctSize-overlap;
       st->prefilter_period=IMAX(st->prefilter_period, COMBFILTER_MINPERIOD);
-      OPUS_COPY(in+c*(N+st->overlap), st->in_mem+c*(st->overlap), st->overlap);
+      OPUS_COPY(in+c*(N+overlap), st->in_mem+c*(overlap), overlap);
       if (offset)
-         comb_filter(in+c*(N+st->overlap)+st->overlap, pre[c]+COMBFILTER_MAXPERIOD,
+         comb_filter(in+c*(N+overlap)+overlap, pre[c]+COMBFILTER_MAXPERIOD,
                st->prefilter_period, st->prefilter_period, offset, -st->prefilter_gain, -st->prefilter_gain,
-               st->prefilter_tapset, st->prefilter_tapset, NULL, 0);
+               st->prefilter_tapset, st->prefilter_tapset, NULL, 0, st->arch);
 
-      comb_filter(in+c*(N+st->overlap)+st->overlap+offset, pre[c]+COMBFILTER_MAXPERIOD+offset,
+      comb_filter(in+c*(N+overlap)+overlap+offset, pre[c]+COMBFILTER_MAXPERIOD+offset,
             st->prefilter_period, pitch_index, N-offset, -st->prefilter_gain, -gain1,
-            st->prefilter_tapset, prefilter_tapset, mode->window, st->overlap);
-      OPUS_COPY(st->in_mem+c*(st->overlap), in+c*(N+st->overlap)+N, st->overlap);
+            st->prefilter_tapset, prefilter_tapset, mode->window, overlap, st->arch);
+      OPUS_COPY(st->in_mem+c*(overlap), in+c*(N+overlap)+N, overlap);
 
       if (N>COMBFILTER_MAXPERIOD)
       {
@@ -1196,6 +1256,9 @@
       /*printf("%f %f ", analysis->tonality, tonal);*/
       target = tonal_target;
    }
+#else
+   (void)analysis;
+   (void)pitch_change;
 #endif
 
    if (has_surround_mask&&!lfe)
@@ -1273,6 +1336,8 @@
    int LM, M;
    int tf_select;
    int nbFilledBytes, nbAvailableBytes;
+   int start;
+   int end;
    int effEnd;
    int codedBands;
    int tf_sum;
@@ -1316,6 +1381,8 @@
    nbEBands = mode->nbEBands;
    overlap = mode->overlap;
    eBands = mode->eBands;
+   start = st->start;
+   end = st->end;
    tf_estimate = 0;
    if (nbCompressedBytes<2 || pcm==NULL)
    {
@@ -1335,8 +1402,8 @@
    M=1<<LM;
    N = M*mode->shortMdctSize;
 
-   prefilter_mem = st->in_mem+CC*(st->overlap);
-   oldBandE = (opus_val16*)(st->in_mem+CC*(st->overlap+COMBFILTER_MAXPERIOD));
+   prefilter_mem = st->in_mem+CC*(overlap);
+   oldBandE = (opus_val16*)(st->in_mem+CC*(overlap+COMBFILTER_MAXPERIOD));
    oldLogE = oldBandE + CC*nbEBands;
    oldLogE2 = oldLogE + CC*nbEBands;
 
@@ -1352,8 +1419,8 @@
 #ifdef CUSTOM_MODES
    if (st->signalling && enc==NULL)
    {
-      int tmp = (mode->effEBands-st->end)>>1;
-      st->end = IMAX(1, mode->effEBands-tmp);
+      int tmp = (mode->effEBands-end)>>1;
+      end = st->end = IMAX(1, mode->effEBands-tmp);
       compressed[0] = tmp<<5;
       compressed[0] |= LM<<3;
       compressed[0] |= (C==2)<<2;
@@ -1436,11 +1503,11 @@
    }
    total_bits = nbCompressedBytes*8;
 
-   effEnd = st->end;
+   effEnd = end;
    if (effEnd > mode->effEBands)
       effEnd = mode->effEBands;
 
-   ALLOC(in, CC*(N+st->overlap), celt_sig);
+   ALLOC(in, CC*(N+overlap), celt_sig);
 
    sample_max=MAX32(st->overlap_max, celt_maxabs16(pcm, C*(N-overlap)/st->upsample));
    st->overlap_max=celt_maxabs16(pcm+C*(N-overlap)/st->upsample, C*overlap/st->upsample);
@@ -1474,8 +1541,12 @@
       enc->nbits_total+=tell-ec_tell(enc);
    }
    c=0; do {
-      celt_preemphasis(pcm+c, in+c*(N+st->overlap)+st->overlap, N, CC, st->upsample,
-                  mode->preemph, st->preemph_memE+c, st->clip);
+      int need_clip=0;
+#ifndef FIXED_POINT
+      need_clip = st->clip && sample_max>65536.f;
+#endif
+      celt_preemphasis(pcm+c, in+c*(N+overlap)+overlap, N, CC, st->upsample,
+                  mode->preemph, st->preemph_memE+c, need_clip);
    } while (++c<CC);
 
 
@@ -1484,7 +1555,7 @@
    {
       int enabled;
       int qg;
-      enabled = ((st->lfe&&nbAvailableBytes>3) || nbAvailableBytes>12*C) && st->start==0 && !silence && !st->disable_pf
+      enabled = ((st->lfe&&nbAvailableBytes>3) || nbAvailableBytes>12*C) && start==0 && !silence && !st->disable_pf
             && st->complexity >= 5 && !(st->consec_transient && LM!=3 && st->variable_duration==OPUS_FRAMESIZE_VARIABLE);
 
       prefilter_tapset = st->tapset_decision;
@@ -1494,7 +1565,7 @@
          pitch_change = 1;
       if (pf_on==0)
       {
-         if(st->start==0 && tell+16<=total_bits)
+         if(start==0 && tell+16<=total_bits)
             ec_enc_bit_logp(enc, 0, 1);
       } else {
          /*This block is not gated by a total bits check only because
@@ -1515,7 +1586,7 @@
    shortBlocks = 0;
    if (st->complexity >= 1 && !st->lfe)
    {
-      isTransient = transient_analysis(in, N+st->overlap, CC,
+      isTransient = transient_analysis(in, N+overlap, CC,
             &tf_estimate, &tf_chan);
    }
    if (LM>0 && ec_tell(enc)+3<=total_bits)
@@ -1535,33 +1606,32 @@
    ALLOC(bandLogE2, C*nbEBands, opus_val16);
    if (secondMdct)
    {
-      compute_mdcts(mode, 0, in, freq, C, CC, LM, st->upsample);
-      compute_band_energies(mode, freq, bandE, effEnd, C, M);
-      amp2Log2(mode, effEnd, st->end, bandE, bandLogE2, C);
+      compute_mdcts(mode, 0, in, freq, C, CC, LM, st->upsample, st->arch);
+      compute_band_energies(mode, freq, bandE, effEnd, C, LM);
+      amp2Log2(mode, effEnd, end, bandE, bandLogE2, C);
       for (i=0;i<C*nbEBands;i++)
          bandLogE2[i] += HALF16(SHL16(LM, DB_SHIFT));
    }
 
-   compute_mdcts(mode, shortBlocks, in, freq, C, CC, LM, st->upsample);
+   compute_mdcts(mode, shortBlocks, in, freq, C, CC, LM, st->upsample, st->arch);
    if (CC==2&&C==1)
       tf_chan = 0;
-   compute_band_energies(mode, freq, bandE, effEnd, C, M);
+   compute_band_energies(mode, freq, bandE, effEnd, C, LM);
 
    if (st->lfe)
    {
-      for (i=2;i<st->end;i++)
+      for (i=2;i<end;i++)
       {
          bandE[i] = IMIN(bandE[i], MULT16_32_Q15(QCONST16(1e-4f,15),bandE[0]));
          bandE[i] = MAX32(bandE[i], EPSILON);
       }
    }
-   amp2Log2(mode, effEnd, st->end, bandE, bandLogE, C);
+   amp2Log2(mode, effEnd, end, bandE, bandLogE, C);
 
    ALLOC(surround_dynalloc, C*nbEBands, opus_val16);
-   for(i=0;i<st->end;i++)
-      surround_dynalloc[i] = 0;
+   OPUS_CLEAR(surround_dynalloc, end);
    /* This computes how much masking takes place between surround channels */
-   if (st->start==0&&st->energy_mask&&!st->lfe)
+   if (start==0&&st->energy_mask&&!st->lfe)
    {
       int mask_end;
       int midband;
@@ -1584,6 +1654,7 @@
             diff += MULT16_16(mask, 1+2*i-mask_end);
          }
       }
+      celt_assert(count>0);
       mask_avg = DIV32_16(mask_avg,count);
       mask_avg += QCONST16(.2f, DB_SHIFT);
       diff = diff*6/(C*(mask_end-1)*(mask_end+1)*mask_end);
@@ -1621,8 +1692,7 @@
                disabling masking. */
             mask_avg = 0;
             diff = 0;
-            for(i=0;i<mask_end;i++)
-               surround_dynalloc[i] = 0;
+            OPUS_CLEAR(surround_dynalloc, mask_end);
          } else {
             for(i=0;i<mask_end;i++)
                surround_dynalloc[i] = MAX16(0, surround_dynalloc[i]-QCONST16(.25f, DB_SHIFT));
@@ -1640,14 +1710,14 @@
       opus_val16 follow=-QCONST16(10.0f,DB_SHIFT);
       opus_val32 frame_avg=0;
       opus_val16 offset = shortBlocks?HALF16(SHL16(LM, DB_SHIFT)):0;
-      for(i=st->start;i<st->end;i++)
+      for(i=start;i<end;i++)
       {
          follow = MAX16(follow-QCONST16(1.f, DB_SHIFT), bandLogE[i]-offset);
          if (C==2)
             follow = MAX16(follow, bandLogE[i+nbEBands]-offset);
          frame_avg += follow;
       }
-      frame_avg /= (st->end-st->start);
+      frame_avg /= (end-start);
       temporal_vbr = SUB16(frame_avg,st->spec_avg);
       temporal_vbr = MIN16(QCONST16(3.f, DB_SHIFT), MAX16(-QCONST16(1.5f, DB_SHIFT), temporal_vbr));
       st->spec_avg += MULT16_16_Q15(QCONST16(.02f, 15), temporal_vbr);
@@ -1658,21 +1728,20 @@
 
    if (!secondMdct)
    {
-      for (i=0;i<C*nbEBands;i++)
-         bandLogE2[i] = bandLogE[i];
+      OPUS_COPY(bandLogE2, bandLogE, C*nbEBands);
    }
 
    /* Last chance to catch any transient we might have missed in the
       time-domain analysis */
    if (LM>0 && ec_tell(enc)+3<=total_bits && !isTransient && st->complexity>=5 && !st->lfe)
    {
-      if (patch_transient_decision(bandLogE, oldBandE, nbEBands, st->end, C))
+      if (patch_transient_decision(bandLogE, oldBandE, nbEBands, start, end, C))
       {
          isTransient = 1;
          shortBlocks = M;
-         compute_mdcts(mode, shortBlocks, in, freq, C, CC, LM, st->upsample);
-         compute_band_energies(mode, freq, bandE, effEnd, C, M);
-         amp2Log2(mode, effEnd, st->end, bandE, bandLogE, C);
+         compute_mdcts(mode, shortBlocks, in, freq, C, CC, LM, st->upsample, st->arch);
+         compute_band_energies(mode, freq, bandE, effEnd, C, LM);
+         amp2Log2(mode, effEnd, end, bandE, bandLogE, C);
          /* Compensate for the scaling of short vs long mdcts */
          for (i=0;i<C*nbEBands;i++)
             bandLogE2[i] += HALF16(SHL16(LM, DB_SHIFT));
@@ -1690,7 +1759,7 @@
 
    ALLOC(tf_res, nbEBands, int);
    /* Disable variable tf resolution for hybrid and at very low bitrate */
-   if (effectiveBytes>=15*C && st->start==0 && st->complexity>=2 && !st->lfe)
+   if (effectiveBytes>=15*C && start==0 && st->complexity>=2 && !st->lfe)
    {
       int lambda;
       if (effectiveBytes<40)
@@ -1703,22 +1772,22 @@
          lambda = 3;
       lambda*=2;
       tf_select = tf_analysis(mode, effEnd, isTransient, tf_res, lambda, X, N, LM, &tf_sum, tf_estimate, tf_chan);
-      for (i=effEnd;i<st->end;i++)
+      for (i=effEnd;i<end;i++)
          tf_res[i] = tf_res[effEnd-1];
    } else {
       tf_sum = 0;
-      for (i=0;i<st->end;i++)
+      for (i=0;i<end;i++)
          tf_res[i] = isTransient;
       tf_select=0;
    }
 
    ALLOC(error, C*nbEBands, opus_val16);
-   quant_coarse_energy(mode, st->start, st->end, effEnd, bandLogE,
+   quant_coarse_energy(mode, start, end, effEnd, bandLogE,
          oldBandE, total_bits, error, enc,
          C, LM, nbAvailableBytes, st->force_intra,
          &st->delayedIntra, st->complexity >= 4, st->loss_rate, st->lfe);
 
-   tf_encode(st->start, st->end, isTransient, tf_res, LM, tf_select, enc);
+   tf_encode(start, end, isTransient, tf_res, LM, tf_select, enc);
 
    if (ec_tell(enc)+4<=total_bits)
    {
@@ -1726,7 +1795,7 @@
       {
          st->tapset_decision = 0;
          st->spread_decision = SPREAD_NORMAL;
-      } else if (shortBlocks || st->complexity < 3 || nbAvailableBytes < 10*C || st->start != 0)
+      } else if (shortBlocks || st->complexity < 3 || nbAvailableBytes < 10*C || start != 0)
       {
          if (st->complexity == 0)
             st->spread_decision = SPREAD_NONE;
@@ -1760,7 +1829,7 @@
 
    ALLOC(offsets, nbEBands, int);
 
-   maxDepth = dynalloc_analysis(bandLogE, bandLogE2, nbEBands, st->start, st->end, C, offsets,
+   maxDepth = dynalloc_analysis(bandLogE, bandLogE2, nbEBands, start, end, C, offsets,
          st->lsb_depth, mode->logN, isTransient, st->vbr, st->constrained_vbr,
          eBands, LM, effectiveBytes, &tot_boost, st->lfe, surround_dynalloc);
    /* For LFE, everything interesting is in the first band */
@@ -1773,7 +1842,7 @@
    total_bits<<=BITRES;
    total_boost = 0;
    tell = ec_tell_frac(enc);
-   for (i=st->start;i<st->end;i++)
+   for (i=start;i<end;i++)
    {
       int width, quanta;
       int dynalloc_loop_logp;
@@ -1818,7 +1887,7 @@
 
       st->intensity = hysteresis_decision((opus_val16)(equiv_rate/1000),
             intensity_thresholds, intensity_histeresis, 21, st->intensity);
-      st->intensity = IMIN(st->end,IMAX(st->start, st->intensity));
+      st->intensity = IMIN(end,IMAX(start, st->intensity));
    }
 
    alloc_trim = 5;
@@ -1828,7 +1897,8 @@
          alloc_trim = 5;
       else
          alloc_trim = alloc_trim_analysis(mode, X, bandLogE,
-            st->end, LM, C, N, &st->analysis, &st->stereo_saving, tf_estimate, st->intensity, surround_trim);
+            end, LM, C, N, &st->analysis, &st->stereo_saving, tf_estimate,
+            st->intensity, surround_trim, st->arch);
       ec_enc_icdf(enc, alloc_trim, trim_icdf, 7);
       tell = ec_tell_frac(enc);
    }
@@ -1930,7 +2000,7 @@
    bits = (((opus_int32)nbCompressedBytes*8)<<BITRES) - ec_tell_frac(enc) - 1;
    anti_collapse_rsv = isTransient&&LM>=2&&bits>=((LM+2)<<BITRES) ? (1<<BITRES) : 0;
    bits -= anti_collapse_rsv;
-   signalBandwidth = st->end-1;
+   signalBandwidth = end-1;
 #ifndef DISABLE_FLOAT_API
    if (st->analysis.valid)
    {
@@ -1950,7 +2020,7 @@
 #endif
    if (st->lfe)
       signalBandwidth = 1;
-   codedBands = compute_allocation(mode, st->start, st->end, offsets, cap,
+   codedBands = compute_allocation(mode, start, end, offsets, cap,
          alloc_trim, &st->intensity, &dual_stereo, bits, &balance, pulses,
          fine_quant, fine_priority, C, LM, enc, 1, st->lastCodedBands, signalBandwidth);
    if (st->lastCodedBands)
@@ -1958,13 +2028,14 @@
    else
       st->lastCodedBands = codedBands;
 
-   quant_fine_energy(mode, st->start, st->end, oldBandE, error, fine_quant, enc, C);
+   quant_fine_energy(mode, start, end, oldBandE, error, fine_quant, enc, C);
 
    /* Residual quantisation */
    ALLOC(collapse_masks, C*nbEBands, unsigned char);
-   quant_all_bands(1, mode, st->start, st->end, X, C==2 ? X+N : NULL, collapse_masks,
-         bandE, pulses, shortBlocks, st->spread_decision, dual_stereo, st->intensity, tf_res,
-         nbCompressedBytes*(8<<BITRES)-anti_collapse_rsv, balance, enc, LM, codedBands, &st->rng);
+   quant_all_bands(1, mode, start, end, X, C==2 ? X+N : NULL, collapse_masks,
+         bandE, pulses, shortBlocks, st->spread_decision,
+         dual_stereo, st->intensity, tf_res, nbCompressedBytes*(8<<BITRES)-anti_collapse_rsv,
+         balance, enc, LM, codedBands, &st->rng, st->arch);
 
    if (anti_collapse_rsv > 0)
    {
@@ -1974,7 +2045,7 @@
 #endif
       ec_enc_bits(enc, anti_collapse_on, 1);
    }
-   quant_energy_finalise(mode, st->start, st->end, oldBandE, error, fine_quant, fine_priority, nbCompressedBytes*8-ec_tell(enc), enc, C);
+   quant_energy_finalise(mode, start, end, oldBandE, error, fine_quant, fine_priority, nbCompressedBytes*8-ec_tell(enc), enc, C);
 
    if (silence)
    {
@@ -1990,40 +2061,26 @@
       if (anti_collapse_on)
       {
          anti_collapse(mode, X, collapse_masks, LM, C, N,
-               st->start, st->end, oldBandE, oldLogE, oldLogE2, pulses, st->rng);
-      }
-
-      if (silence)
-      {
-         for (i=0;i<C*N;i++)
-            freq[i] = 0;
-      } else {
-         /* Synthesis */
-         denormalise_bands(mode, X, freq, oldBandE, st->start, effEnd, C, M);
+               start, end, oldBandE, oldLogE, oldLogE2, pulses, st->rng);
       }
 
       c=0; do {
          OPUS_MOVE(st->syn_mem[c], st->syn_mem[c]+N, 2*MAX_PERIOD-N+overlap/2);
       } while (++c<CC);
 
-      if (CC==2&&C==1)
-      {
-         for (i=0;i<N;i++)
-            freq[N+i] = freq[i];
-      }
-
       c=0; do {
          out_mem[c] = st->syn_mem[c]+2*MAX_PERIOD-N;
       } while (++c<CC);
 
-      compute_inv_mdcts(mode, shortBlocks, freq, out_mem, CC, LM);
+      celt_synthesis(mode, X, out_mem, oldBandE, start, effEnd,
+                     C, CC, isTransient, LM, st->upsample, silence, st->arch);
 
       c=0; do {
          st->prefilter_period=IMAX(st->prefilter_period, COMBFILTER_MINPERIOD);
          st->prefilter_period_old=IMAX(st->prefilter_period_old, COMBFILTER_MINPERIOD);
          comb_filter(out_mem[c], out_mem[c], st->prefilter_period_old, st->prefilter_period, mode->shortMdctSize,
                st->prefilter_gain_old, st->prefilter_gain, st->prefilter_tapset_old, st->prefilter_tapset,
-               mode->window, st->overlap);
+               mode->window, overlap, st->arch);
          if (LM!=0)
             comb_filter(out_mem[c]+mode->shortMdctSize, out_mem[c]+mode->shortMdctSize, st->prefilter_period, pitch_index, N-mode->shortMdctSize,
                   st->prefilter_gain, gain1, st->prefilter_tapset, prefilter_tapset,
@@ -2031,7 +2088,7 @@
       } while (++c<CC);
 
       /* We reuse freq[] as scratch space for the de-emphasis */
-      deemphasis(out_mem, (opus_val16*)pcm, N, CC, st->upsample, mode->preemph, st->preemph_memD, freq);
+      deemphasis(out_mem, (opus_val16*)pcm, N, CC, st->upsample, mode->preemph, st->preemph_memD);
       st->prefilter_period_old = st->prefilter_period;
       st->prefilter_gain_old = st->prefilter_gain;
       st->prefilter_tapset_old = st->prefilter_tapset;
@@ -2051,16 +2108,13 @@
 #endif
 
    if (CC==2&&C==1) {
-      for (i=0;i<nbEBands;i++)
-         oldBandE[nbEBands+i]=oldBandE[i];
+      OPUS_COPY(&oldBandE[nbEBands], oldBandE, nbEBands);
    }
 
    if (!isTransient)
    {
-      for (i=0;i<CC*nbEBands;i++)
-         oldLogE2[i] = oldLogE[i];
-      for (i=0;i<CC*nbEBands;i++)
-         oldLogE[i] = oldBandE[i];
+      OPUS_COPY(oldLogE2, oldLogE, CC*nbEBands);
+      OPUS_COPY(oldLogE, oldBandE, CC*nbEBands);
    } else {
       for (i=0;i<CC*nbEBands;i++)
          oldLogE[i] = MIN16(oldLogE[i], oldBandE[i]);
@@ -2068,12 +2122,12 @@
    /* In case start or end were to change */
    c=0; do
    {
-      for (i=0;i<st->start;i++)
+      for (i=0;i<start;i++)
       {
          oldBandE[c*nbEBands+i]=0;
          oldLogE[c*nbEBands+i]=oldLogE2[c*nbEBands+i]=-QCONST16(28.f,DB_SHIFT);
       }
-      for (i=st->end;i<nbEBands;i++)
+      for (i=end;i<nbEBands;i++)
       {
          oldBandE[c*nbEBands+i]=0;
          oldLogE[c*nbEBands+i]=oldLogE2[c*nbEBands+i]=-QCONST16(28.f,DB_SHIFT);
@@ -2274,7 +2328,7 @@
       {
          int i;
          opus_val16 *oldBandE, *oldLogE, *oldLogE2;
-         oldBandE = (opus_val16*)(st->in_mem+st->channels*(st->overlap+COMBFILTER_MAXPERIOD));
+         oldBandE = (opus_val16*)(st->in_mem+st->channels*(st->mode->overlap+COMBFILTER_MAXPERIOD));
          oldLogE = oldBandE + st->channels*st->mode->nbEBands;
          oldLogE2 = oldLogE + st->channels*st->mode->nbEBands;
          OPUS_CLEAR((char*)&st->ENCODER_RESET_START,
diff --git a/celt/celt_lpc.c b/celt/celt_lpc.c
index fa29d62..f02145a 100644
--- a/celt/celt_lpc.c
+++ b/celt/celt_lpc.c
@@ -88,12 +88,15 @@
 #endif
 }
 
-void celt_fir(const opus_val16 *_x,
+
+void celt_fir_c(
+         const opus_val16 *_x,
          const opus_val16 *num,
          opus_val16 *_y,
          int N,
          int ord,
-         opus_val16 *mem)
+         opus_val16 *mem,
+         int arch)
 {
    int i,j;
    VARDECL(opus_val16, rnum);
@@ -111,6 +114,7 @@
    for(i=0;i<ord;i++)
       mem[i] = _x[N-i-1];
 #ifdef SMALL_FOOTPRINT
+   (void)arch;
    for (i=0;i<N;i++)
    {
       opus_val32 sum = SHL32(EXTEND32(_x[i]), SIG_SHIFT);
@@ -124,7 +128,7 @@
    for (i=0;i<N-3;i+=4)
    {
       opus_val32 sum[4]={0,0,0,0};
-      xcorr_kernel(rnum, x+i, sum, ord);
+      xcorr_kernel(rnum, x+i, sum, ord, arch);
       _y[i  ] = SATURATE16(ADD32(EXTEND32(_x[i  ]), PSHR32(sum[0], SIG_SHIFT)));
       _y[i+1] = SATURATE16(ADD32(EXTEND32(_x[i+1]), PSHR32(sum[1], SIG_SHIFT)));
       _y[i+2] = SATURATE16(ADD32(EXTEND32(_x[i+2]), PSHR32(sum[2], SIG_SHIFT)));
@@ -146,10 +150,12 @@
          opus_val32 *_y,
          int N,
          int ord,
-         opus_val16 *mem)
+         opus_val16 *mem,
+         int arch)
 {
 #ifdef SMALL_FOOTPRINT
    int i,j;
+   (void)arch;
    for (i=0;i<N;i++)
    {
       opus_val32 sum = _x[i];
@@ -187,7 +193,7 @@
       sum[1]=_x[i+1];
       sum[2]=_x[i+2];
       sum[3]=_x[i+3];
-      xcorr_kernel(rden, y+i, sum, ord);
+      xcorr_kernel(rden, y+i, sum, ord, arch);
 
       /* Patch up the result to compensate for the fact that this is an IIR */
       y[i+ord  ] = -ROUND16(sum[0],SIG_SHIFT);
diff --git a/celt/celt_lpc.h b/celt/celt_lpc.h
index dc2a0a3..323459e 100644
--- a/celt/celt_lpc.h
+++ b/celt/celt_lpc.h
@@ -29,24 +29,37 @@
 #define PLC_H
 
 #include "arch.h"
+#include "cpu_support.h"
+
+#if defined(OPUS_X86_MAY_HAVE_SSE4_1)
+#include "x86/celt_lpc_sse.h"
+#endif
 
 #define LPC_ORDER 24
 
 void _celt_lpc(opus_val16 *_lpc, const opus_val32 *ac, int p);
 
-void celt_fir(const opus_val16 *x,
+void celt_fir_c(
+         const opus_val16 *x,
          const opus_val16 *num,
          opus_val16 *y,
          int N,
          int ord,
-         opus_val16 *mem);
+         opus_val16 *mem,
+         int arch);
+
+#if !defined(OVERRIDE_CELT_FIR)
+#define celt_fir(x, num, y, N, ord, mem, arch) \
+    (celt_fir_c(x, num, y, N, ord, mem, arch))
+#endif
 
 void celt_iir(const opus_val32 *x,
          const opus_val16 *den,
          opus_val32 *y,
          int N,
          int ord,
-         opus_val16 *mem);
+         opus_val16 *mem,
+         int arch);
 
 int _celt_autocorr(const opus_val16 *x, opus_val32 *ac,
          const opus_val16 *window, int overlap, int lag, int n, int arch);
diff --git a/celt/cpu_support.h b/celt/cpu_support.h
index d68dbe6..68fc606 100644
--- a/celt/cpu_support.h
+++ b/celt/cpu_support.h
@@ -31,7 +31,8 @@
 #include "opus_types.h"
 #include "opus_defines.h"
 
-#if defined(OPUS_HAVE_RTCD) && defined(OPUS_ARM_ASM)
+#if defined(OPUS_HAVE_RTCD) && \
+  (defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR))
 #include "arm/armcpu.h"
 
 /* We currently support 4 ARM variants:
@@ -42,6 +43,22 @@
  */
 #define OPUS_ARCHMASK 3
 
+#elif (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \
+  (defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)) || \
+  (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) || \
+  (defined(OPUS_X86_MAY_HAVE_AVX) && !defined(OPUS_X86_PRESUME_AVX))
+
+#include "x86/x86cpu.h"
+/* We currently support 5 x86 variants:
+ * arch[0] -> non-sse
+ * arch[1] -> sse
+ * arch[2] -> sse2
+ * arch[3] -> sse4.1
+ * arch[4] -> avx
+ */
+#define OPUS_ARCHMASK 7
+int opus_select_arch(void);
+
 #else
 #define OPUS_ARCHMASK 0
 
@@ -50,5 +67,4 @@
   return 0;
 }
 #endif
-
 #endif
diff --git a/celt/cwrs.c b/celt/cwrs.c
index ad980cc..2fa9f89 100644
--- a/celt/cwrs.c
+++ b/celt/cwrs.c
@@ -460,10 +460,12 @@
   ec_enc_uint(_enc,icwrs(_n,_y),CELT_PVQ_V(_n,_k));
 }
 
-static void cwrsi(int _n,int _k,opus_uint32 _i,int *_y){
+static opus_val32 cwrsi(int _n,int _k,opus_uint32 _i,int *_y){
   opus_uint32 p;
   int         s;
   int         k0;
+  opus_int16  val;
+  opus_val32  yy=0;
   celt_assert(_k>0);
   celt_assert(_n>1);
   while(_n>2){
@@ -487,7 +489,9 @@
       }
       else for(p=row[_k];p>_i;p=row[_k])_k--;
       _i-=p;
-      *_y++=(k0-_k+s)^s;
+      val=(k0-_k+s)^s;
+      *_y++=val;
+      yy=MAC16_16(yy,val,val);
     }
     /*Lots of dimensions case:*/
     else{
@@ -507,7 +511,9 @@
         do p=CELT_PVQ_U_ROW[--_k][_n];
         while(p>_i);
         _i-=p;
-        *_y++=(k0-_k+s)^s;
+        val=(k0-_k+s)^s;
+        *_y++=val;
+        yy=MAC16_16(yy,val,val);
       }
     }
     _n--;
@@ -519,14 +525,19 @@
   k0=_k;
   _k=(_i+1)>>1;
   if(_k)_i-=2*_k-1;
-  *_y++=(k0-_k+s)^s;
+  val=(k0-_k+s)^s;
+  *_y++=val;
+  yy=MAC16_16(yy,val,val);
   /*_n==1*/
   s=-(int)_i;
-  *_y=(_k+s)^s;
+  val=(_k+s)^s;
+  *_y=val;
+  yy=MAC16_16(yy,val,val);
+  return yy;
 }
 
-void decode_pulses(int *_y,int _n,int _k,ec_dec *_dec){
-  cwrsi(_n,_k,ec_dec_uint(_dec,CELT_PVQ_V(_n,_k)),_y);
+opus_val32 decode_pulses(int *_y,int _n,int _k,ec_dec *_dec){
+  return cwrsi(_n,_k,ec_dec_uint(_dec,CELT_PVQ_V(_n,_k)),_y);
 }
 
 #else /* SMALL_FOOTPRINT */
@@ -591,8 +602,10 @@
   _y: Returns the vector of pulses.
   _u: Must contain entries [0..._k+1] of row _n of U() on input.
       Its contents will be destructively modified.*/
-static void cwrsi(int _n,int _k,opus_uint32 _i,int *_y,opus_uint32 *_u){
+static opus_val32 cwrsi(int _n,int _k,opus_uint32 _i,int *_y,opus_uint32 *_u){
   int j;
+  opus_int16 val;
+  opus_val32 yy=0;
   celt_assert(_n>0);
   j=0;
   do{
@@ -607,10 +620,13 @@
     while(p>_i)p=_u[--_k];
     _i-=p;
     yj-=_k;
-    _y[j]=(yj+s)^s;
+    val=(yj+s)^s;
+    _y[j]=val;
+    yy=MAC16_16(yy,val,val);
     uprev(_u,_k+2,0);
   }
   while(++j<_n);
+  return yy;
 }
 
 /*Returns the index of the given combination of K elements chosen from a set
@@ -685,13 +701,15 @@
   RESTORE_STACK;
 }
 
-void decode_pulses(int *_y,int _n,int _k,ec_dec *_dec){
+opus_val32 decode_pulses(int *_y,int _n,int _k,ec_dec *_dec){
   VARDECL(opus_uint32,u);
+  opus_val32 ret;
   SAVE_STACK;
   celt_assert(_k>0);
   ALLOC(u,_k+2U,opus_uint32);
-  cwrsi(_n,_k,ec_dec_uint(_dec,ncwrs_urow(_n,_k,u)),_y,u);
+  ret = cwrsi(_n,_k,ec_dec_uint(_dec,ncwrs_urow(_n,_k,u)),_y,u);
   RESTORE_STACK;
+  return ret;
 }
 
 #endif /* SMALL_FOOTPRINT */
diff --git a/celt/cwrs.h b/celt/cwrs.h
index 7dfbd07..7cd4717 100644
--- a/celt/cwrs.h
+++ b/celt/cwrs.h
@@ -43,6 +43,6 @@
 
 void encode_pulses(const int *_y, int N, int K, ec_enc *enc);
 
-void decode_pulses(int *_y, int N, int K, ec_dec *dec);
+opus_val32 decode_pulses(int *_y, int N, int K, ec_dec *dec);
 
 #endif /* CWRS_H */
diff --git a/celt/entcode.c b/celt/entcode.c
index fa5d7c7..70f3201 100644
--- a/celt/entcode.c
+++ b/celt/entcode.c
@@ -62,6 +62,27 @@
 }
 #endif
 
+#if 1
+/* This is a faster version of ec_tell_frac() that takes advantage
+   of the low (1/8 bit) resolution to use just a linear function
+   followed by a lookup to determine the exact transition thresholds. */
+opus_uint32 ec_tell_frac(ec_ctx *_this){
+  static const unsigned correction[8] =
+    {35733, 38967, 42495, 46340,
+     50535, 55109, 60097, 65535};
+  opus_uint32 nbits;
+  opus_uint32 r;
+  int         l;
+  unsigned    b;
+  nbits=_this->nbits_total<<BITRES;
+  l=EC_ILOG(_this->rng);
+  r=_this->rng>>(l-16);
+  b = (r>>12)-8;
+  b += r>correction[b];
+  l = (l<<3)+b;
+  return nbits-l;
+}
+#else
 opus_uint32 ec_tell_frac(ec_ctx *_this){
   opus_uint32 nbits;
   opus_uint32 r;
@@ -91,3 +112,42 @@
   }
   return nbits-l;
 }
+#endif
+
+#ifdef USE_SMALL_DIV_TABLE
+/* SMALL_DIV_TABLE[i] = floor(2^32/(2*i+1)), except for i=0, which is stored as 0xFFFFFFFF. */
+const opus_uint32 SMALL_DIV_TABLE[129] = {
+   0xFFFFFFFF, 0x55555555, 0x33333333, 0x24924924,
+   0x1C71C71C, 0x1745D174, 0x13B13B13, 0x11111111,
+   0x0F0F0F0F, 0x0D79435E, 0x0C30C30C, 0x0B21642C,
+   0x0A3D70A3, 0x097B425E, 0x08D3DCB0, 0x08421084,
+   0x07C1F07C, 0x07507507, 0x06EB3E45, 0x06906906,
+   0x063E7063, 0x05F417D0, 0x05B05B05, 0x0572620A,
+   0x05397829, 0x05050505, 0x04D4873E, 0x04A7904A,
+   0x047DC11F, 0x0456C797, 0x04325C53, 0x04104104,
+   0x03F03F03, 0x03D22635, 0x03B5CC0E, 0x039B0AD1,
+   0x0381C0E0, 0x0369D036, 0x03531DEC, 0x033D91D2,
+   0x0329161F, 0x03159721, 0x03030303, 0x02F14990,
+   0x02E05C0B, 0x02D02D02, 0x02C0B02C, 0x02B1DA46,
+   0x02A3A0FD, 0x0295FAD4, 0x0288DF0C, 0x027C4597,
+   0x02702702, 0x02647C69, 0x02593F69, 0x024E6A17,
+   0x0243F6F0, 0x0239E0D5, 0x02302302, 0x0226B902,
+   0x021D9EAD, 0x0214D021, 0x020C49BA, 0x02040810,
+   0x01FC07F0, 0x01F44659, 0x01ECC07B, 0x01E573AC,
+   0x01DE5D6E, 0x01D77B65, 0x01D0CB58, 0x01CA4B30,
+   0x01C3F8F0, 0x01BDD2B8, 0x01B7D6C3, 0x01B20364,
+   0x01AC5701, 0x01A6D01A, 0x01A16D3F, 0x019C2D14,
+   0x01970E4F, 0x01920FB4, 0x018D3018, 0x01886E5F,
+   0x0183C977, 0x017F405F, 0x017AD220, 0x01767DCE,
+   0x01724287, 0x016E1F76, 0x016A13CD, 0x01661EC6,
+   0x01623FA7, 0x015E75BB, 0x015AC056, 0x01571ED3,
+   0x01539094, 0x01501501, 0x014CAB88, 0x0149539E,
+   0x01460CBC, 0x0142D662, 0x013FB013, 0x013C995A,
+   0x013991C2, 0x013698DF, 0x0133AE45, 0x0130D190,
+   0x012E025C, 0x012B404A, 0x01288B01, 0x0125E227,
+   0x01234567, 0x0120B470, 0x011E2EF3, 0x011BB4A4,
+   0x01194538, 0x0116E068, 0x011485F0, 0x0112358E,
+   0x010FEF01, 0x010DB20A, 0x010B7E6E, 0x010953F3,
+   0x01073260, 0x0105197F, 0x0103091B, 0x01010101
+};
+#endif
diff --git a/celt/entcode.h b/celt/entcode.h
index dd13e49..13d6c84 100644
--- a/celt/entcode.h
+++ b/celt/entcode.h
@@ -34,6 +34,12 @@
 # include <stddef.h>
 # include "ecintrin.h"
 
+extern const opus_uint32 SMALL_DIV_TABLE[129];
+
+#ifdef OPUS_ARM_ASM
+#define USE_SMALL_DIV_TABLE
+#endif
+
 /*OPT: ec_window must be at least 32 bits, but if you have fast arithmetic on a
    larger type, you can speed up the decoder by using it here.*/
 typedef opus_uint32           ec_window;
@@ -114,4 +120,33 @@
            rounding error is in the positive direction).*/
 opus_uint32 ec_tell_frac(ec_ctx *_this);
 
+/* Unsigned division n/d. The table-based path was tested exhaustively for all n and for 1<=d<=256 */
+static OPUS_INLINE opus_uint32 celt_udiv(opus_uint32 n, opus_uint32 d) {
+   celt_assert(d>0);
+#ifdef USE_SMALL_DIV_TABLE
+   if (d>256)
+      return n/d;
+   else {
+      opus_uint32 t, q;
+      t = EC_ILOG(d&-d);
+      q = (opus_uint64)SMALL_DIV_TABLE[d>>t]*(n>>(t-1))>>32;
+      return q+(n-q*d >= d);
+   }
+#else
+   return n/d;
+#endif
+}
+
+static OPUS_INLINE opus_int32 celt_sudiv(opus_int32 n, opus_int32 d) {
+   celt_assert(d>0);
+#ifdef USE_SMALL_DIV_TABLE
+   if (n<0)
+      return -(opus_int32)celt_udiv(-n, d);
+   else
+      return celt_udiv(n, d);
+#else
+   return n/d;
+#endif
+}
+
 #endif
diff --git a/celt/entdec.c b/celt/entdec.c
index 3c26468..0b3433e 100644
--- a/celt/entdec.c
+++ b/celt/entdec.c
@@ -138,7 +138,7 @@
 
 unsigned ec_decode(ec_dec *_this,unsigned _ft){
   unsigned s;
-  _this->ext=_this->rng/_ft;
+  _this->ext=celt_udiv(_this->rng,_ft);
   s=(unsigned)(_this->val/_this->ext);
   return _ft-EC_MINI(s+1,_ft);
 }
diff --git a/celt/entenc.c b/celt/entenc.c
index a7e34ec..f1750d2 100644
--- a/celt/entenc.c
+++ b/celt/entenc.c
@@ -98,7 +98,7 @@
   else _this->ext++;
 }
 
-static void ec_enc_normalize(ec_enc *_this){
+static OPUS_INLINE void ec_enc_normalize(ec_enc *_this){
   /*If the range is too small, output some bits and rescale it.*/
   while(_this->rng<=EC_CODE_BOT){
     ec_enc_carry_out(_this,(int)(_this->val>>EC_CODE_SHIFT));
@@ -127,7 +127,7 @@
 
 void ec_encode(ec_enc *_this,unsigned _fl,unsigned _fh,unsigned _ft){
   opus_uint32 r;
-  r=_this->rng/_ft;
+  r=celt_udiv(_this->rng,_ft);
   if(_fl>0){
     _this->val+=_this->rng-IMUL32(r,(_ft-_fl));
     _this->rng=IMUL32(r,(_fh-_fl));
diff --git a/celt/fixed_debug.h b/celt/fixed_debug.h
index 80bc949..d28227f 100644
--- a/celt/fixed_debug.h
+++ b/celt/fixed_debug.h
@@ -496,6 +496,7 @@
 
 #define MULT16_32_Q15(a,b) MULT16_32_QX(a,b,15)
 #define MAC16_32_Q15(c,a,b) (celt_mips-=2,ADD32((c),MULT16_32_Q15((a),(b))))
+#define MAC16_32_Q16(c,a,b) (celt_mips-=2,ADD32((c),MULT16_32_Q16((a),(b))))
 
 static OPUS_INLINE int SATURATE(int a, int b)
 {
@@ -767,6 +768,16 @@
    return res;
 }
 
+static OPUS_INLINE opus_val16 SIG2WORD16_generic(celt_sig x)
+{
+   x = PSHR32(x, SIG_SHIFT);
+   x = MAX32(x, -32768);
+   x = MIN32(x, 32767);
+   return EXTRACT16(x);
+}
+#define SIG2WORD16(x) (SIG2WORD16_generic(x))
+
+
 #undef PRINT_MIPS
 #define PRINT_MIPS(file) do {fprintf (file, "total complexity = %llu MIPS\n", celt_mips);} while (0);
 
diff --git a/celt/fixed_generic.h b/celt/fixed_generic.h
index ecf018a..ac67d37 100644
--- a/celt/fixed_generic.h
+++ b/celt/fixed_generic.h
@@ -113,7 +113,11 @@
 /** 16x32 multiply, followed by a 15-bit shift right and 32-bit add.
     b must fit in 31 bits.
     Result fits in 32 bits. */
-#define MAC16_32_Q15(c,a,b) ADD32(c,ADD32(MULT16_16((a),SHR((b),15)), SHR(MULT16_16((a),((b)&0x00007fff)),15)))
+#define MAC16_32_Q15(c,a,b) ADD32((c),ADD32(MULT16_16((a),SHR((b),15)), SHR(MULT16_16((a),((b)&0x00007fff)),15)))
+
+/** 16x32 multiplication, followed by a 16-bit shift right and 32-bit add.
+    Result fits in 32 bits */
+#define MAC16_32_Q16(c,a,b) ADD32((c),ADD32(MULT16_16((a),SHR((b),16)), SHR(MULT16_16SU((a),((b)&0x0000ffff)),16)))
 
 #define MULT16_16_Q11_32(a,b) (SHR(MULT16_16((a),(b)),11))
 #define MULT16_16_Q11(a,b) (SHR(MULT16_16((a),(b)),11))
@@ -131,4 +135,17 @@
 /** Divide a 32-bit value by a 32-bit value. Result fits in 32 bits */
 #define DIV32(a,b) (((opus_val32)(a))/((opus_val32)(b)))
 
+#if defined(MIPSr1_ASM)
+#include "mips/fixed_generic_mipsr1.h"
+#endif
+
+static OPUS_INLINE opus_val16 SIG2WORD16_generic(celt_sig x)
+{
+   x = PSHR32(x, SIG_SHIFT);
+   x = MAX32(x, -32768);
+   x = MIN32(x, 32767);
+   return EXTRACT16(x);
+}
+#define SIG2WORD16(x) (SIG2WORD16_generic(x))
+
 #endif
diff --git a/celt/float_cast.h b/celt/float_cast.h
index ede6574..ed5a39b 100644
--- a/celt/float_cast.h
+++ b/celt/float_cast.h
@@ -90,14 +90,14 @@
 #include <math.h>
 #define float2int(x) lrint(x)
 
-#elif (defined(_MSC_VER) && _MSC_VER >= 1400) && (defined (WIN64) || defined (_WIN64))
+#elif (defined(_MSC_VER) && _MSC_VER >= 1400) && defined (_M_X64)
         #include <xmmintrin.h>
 
         __inline long int float2int(float value)
         {
                 return _mm_cvtss_si32(_mm_load_ss(&value));
         }
-#elif (defined(_MSC_VER) && _MSC_VER >= 1400) && (defined (WIN32) || defined (_WIN32))
+#elif (defined(_MSC_VER) && _MSC_VER >= 1400) && defined (_M_IX86)
         #include <math.h>
 
         /*      Win32 doesn't seem to have these functions.
diff --git a/celt/kiss_fft.c b/celt/kiss_fft.c
index ad706c7..4ed37d2 100644
--- a/celt/kiss_fft.c
+++ b/celt/kiss_fft.c
@@ -47,64 +47,56 @@
 
 static void kf_bfly2(
                      kiss_fft_cpx * Fout,
-                     const size_t fstride,
-                     const kiss_fft_state *st,
                      int m,
-                     int N,
-                     int mm
+                     int N
                     )
 {
    kiss_fft_cpx * Fout2;
-   const kiss_twiddle_cpx * tw1;
-   int i,j;
-   kiss_fft_cpx * Fout_beg = Fout;
-   for (i=0;i<N;i++)
+   int i;
+   (void)m;
+#ifdef CUSTOM_MODES
+   if (m==1)
    {
-      Fout = Fout_beg + i*mm;
-      Fout2 = Fout + m;
-      tw1 = st->twiddles;
-      for(j=0;j<m;j++)
+      celt_assert(m==1);
+      for (i=0;i<N;i++)
       {
          kiss_fft_cpx t;
-         Fout->r = SHR32(Fout->r, 1);Fout->i = SHR32(Fout->i, 1);
-         Fout2->r = SHR32(Fout2->r, 1);Fout2->i = SHR32(Fout2->i, 1);
-         C_MUL (t,  *Fout2 , *tw1);
-         tw1 += fstride;
+         Fout2 = Fout + 1;
+         t = *Fout2;
          C_SUB( *Fout2 ,  *Fout , t );
          C_ADDTO( *Fout ,  t );
-         ++Fout2;
-         ++Fout;
+         Fout += 2;
       }
-   }
-}
-
-static void ki_bfly2(
-                     kiss_fft_cpx * Fout,
-                     const size_t fstride,
-                     const kiss_fft_state *st,
-                     int m,
-                     int N,
-                     int mm
-                    )
-{
-   kiss_fft_cpx * Fout2;
-   const kiss_twiddle_cpx * tw1;
-   kiss_fft_cpx t;
-   int i,j;
-   kiss_fft_cpx * Fout_beg = Fout;
-   for (i=0;i<N;i++)
+   } else
+#endif
    {
-      Fout = Fout_beg + i*mm;
-      Fout2 = Fout + m;
-      tw1 = st->twiddles;
-      for(j=0;j<m;j++)
+      opus_val16 tw;
+      tw = QCONST16(0.7071067812f, 15);
+      /* We know that m==4 here because the radix-2 is just after a radix-4 */
+      celt_assert(m==4);
+      for (i=0;i<N;i++)
       {
-         C_MULC (t,  *Fout2 , *tw1);
-         tw1 += fstride;
-         C_SUB( *Fout2 ,  *Fout , t );
-         C_ADDTO( *Fout ,  t );
-         ++Fout2;
-         ++Fout;
+         kiss_fft_cpx t;
+         Fout2 = Fout + 4;
+         t = Fout2[0];
+         C_SUB( Fout2[0] ,  Fout[0] , t );
+         C_ADDTO( Fout[0] ,  t );
+
+         t.r = S_MUL(Fout2[1].r+Fout2[1].i, tw);
+         t.i = S_MUL(Fout2[1].i-Fout2[1].r, tw);
+         C_SUB( Fout2[1] ,  Fout[1] , t );
+         C_ADDTO( Fout[1] ,  t );
+
+         t.r = Fout2[2].i;
+         t.i = -Fout2[2].r;
+         C_SUB( Fout2[2] ,  Fout[2] , t );
+         C_ADDTO( Fout[2] ,  t );
+
+         t.r = S_MUL(Fout2[3].i-Fout2[3].r, tw);
+         t.i = S_MUL(-Fout2[3].i-Fout2[3].r, tw);
+         C_SUB( Fout2[3] ,  Fout[3] , t );
+         C_ADDTO( Fout[3] ,  t );
+         Fout += 8;
       }
    }
 }
@@ -118,88 +110,66 @@
                      int mm
                     )
 {
-   const kiss_twiddle_cpx *tw1,*tw2,*tw3;
-   kiss_fft_cpx scratch[6];
-   const size_t m2=2*m;
-   const size_t m3=3*m;
-   int i, j;
+   int i;
 
-   kiss_fft_cpx * Fout_beg = Fout;
-   for (i=0;i<N;i++)
+   if (m==1)
    {
-      Fout = Fout_beg + i*mm;
-      tw3 = tw2 = tw1 = st->twiddles;
-      for (j=0;j<m;j++)
+      /* Degenerate case where all the twiddles are 1. */
+      for (i=0;i<N;i++)
       {
-         C_MUL4(scratch[0],Fout[m] , *tw1 );
-         C_MUL4(scratch[1],Fout[m2] , *tw2 );
-         C_MUL4(scratch[2],Fout[m3] , *tw3 );
+         kiss_fft_cpx scratch0, scratch1;
 
-         Fout->r = PSHR32(Fout->r, 2);
-         Fout->i = PSHR32(Fout->i, 2);
-         C_SUB( scratch[5] , *Fout, scratch[1] );
-         C_ADDTO(*Fout, scratch[1]);
-         C_ADD( scratch[3] , scratch[0] , scratch[2] );
-         C_SUB( scratch[4] , scratch[0] , scratch[2] );
-         C_SUB( Fout[m2], *Fout, scratch[3] );
-         tw1 += fstride;
-         tw2 += fstride*2;
-         tw3 += fstride*3;
-         C_ADDTO( *Fout , scratch[3] );
+         C_SUB( scratch0 , *Fout, Fout[2] );
+         C_ADDTO(*Fout, Fout[2]);
+         C_ADD( scratch1 , Fout[1] , Fout[3] );
+         C_SUB( Fout[2], *Fout, scratch1 );
+         C_ADDTO( *Fout , scratch1 );
+         C_SUB( scratch1 , Fout[1] , Fout[3] );
 
-         Fout[m].r = scratch[5].r + scratch[4].i;
-         Fout[m].i = scratch[5].i - scratch[4].r;
-         Fout[m3].r = scratch[5].r - scratch[4].i;
-         Fout[m3].i = scratch[5].i + scratch[4].r;
-         ++Fout;
+         Fout[1].r = scratch0.r + scratch1.i;
+         Fout[1].i = scratch0.i - scratch1.r;
+         Fout[3].r = scratch0.r - scratch1.i;
+         Fout[3].i = scratch0.i + scratch1.r;
+         Fout+=4;
+      }
+   } else {
+      int j;
+      kiss_fft_cpx scratch[6];
+      const kiss_twiddle_cpx *tw1,*tw2,*tw3;
+      const int m2=2*m;
+      const int m3=3*m;
+      kiss_fft_cpx * Fout_beg = Fout;
+      for (i=0;i<N;i++)
+      {
+         Fout = Fout_beg + i*mm;
+         tw3 = tw2 = tw1 = st->twiddles;
+         /* m is guaranteed to be a multiple of 4. */
+         for (j=0;j<m;j++)
+         {
+            C_MUL(scratch[0],Fout[m] , *tw1 );
+            C_MUL(scratch[1],Fout[m2] , *tw2 );
+            C_MUL(scratch[2],Fout[m3] , *tw3 );
+
+            C_SUB( scratch[5] , *Fout, scratch[1] );
+            C_ADDTO(*Fout, scratch[1]);
+            C_ADD( scratch[3] , scratch[0] , scratch[2] );
+            C_SUB( scratch[4] , scratch[0] , scratch[2] );
+            C_SUB( Fout[m2], *Fout, scratch[3] );
+            tw1 += fstride;
+            tw2 += fstride*2;
+            tw3 += fstride*3;
+            C_ADDTO( *Fout , scratch[3] );
+
+            Fout[m].r = scratch[5].r + scratch[4].i;
+            Fout[m].i = scratch[5].i - scratch[4].r;
+            Fout[m3].r = scratch[5].r - scratch[4].i;
+            Fout[m3].i = scratch[5].i + scratch[4].r;
+            ++Fout;
+         }
       }
    }
 }
 
-static void ki_bfly4(
-                     kiss_fft_cpx * Fout,
-                     const size_t fstride,
-                     const kiss_fft_state *st,
-                     int m,
-                     int N,
-                     int mm
-                    )
-{
-   const kiss_twiddle_cpx *tw1,*tw2,*tw3;
-   kiss_fft_cpx scratch[6];
-   const size_t m2=2*m;
-   const size_t m3=3*m;
-   int i, j;
-
-   kiss_fft_cpx * Fout_beg = Fout;
-   for (i=0;i<N;i++)
-   {
-      Fout = Fout_beg + i*mm;
-      tw3 = tw2 = tw1 = st->twiddles;
-      for (j=0;j<m;j++)
-      {
-         C_MULC(scratch[0],Fout[m] , *tw1 );
-         C_MULC(scratch[1],Fout[m2] , *tw2 );
-         C_MULC(scratch[2],Fout[m3] , *tw3 );
-
-         C_SUB( scratch[5] , *Fout, scratch[1] );
-         C_ADDTO(*Fout, scratch[1]);
-         C_ADD( scratch[3] , scratch[0] , scratch[2] );
-         C_SUB( scratch[4] , scratch[0] , scratch[2] );
-         C_SUB( Fout[m2], *Fout, scratch[3] );
-         tw1 += fstride;
-         tw2 += fstride*2;
-         tw3 += fstride*3;
-         C_ADDTO( *Fout , scratch[3] );
-
-         Fout[m].r = scratch[5].r - scratch[4].i;
-         Fout[m].i = scratch[5].i + scratch[4].r;
-         Fout[m3].r = scratch[5].r + scratch[4].i;
-         Fout[m3].i = scratch[5].i - scratch[4].r;
-         ++Fout;
-      }
-   }
-}
 
 #ifndef RADIX_TWO_ONLY
 
@@ -220,14 +190,19 @@
    kiss_twiddle_cpx epi3;
 
    kiss_fft_cpx * Fout_beg = Fout;
+#ifdef FIXED_POINT
+   epi3.r = -16384;
+   epi3.i = -28378;
+#else
    epi3 = st->twiddles[fstride*m];
+#endif
    for (i=0;i<N;i++)
    {
       Fout = Fout_beg + i*mm;
       tw1=tw2=st->twiddles;
+      /* For non-custom modes, m is guaranteed to be a multiple of 4. */
       k=m;
       do {
-         C_FIXDIV(*Fout,3); C_FIXDIV(Fout[m],3); C_FIXDIV(Fout[m2],3);
 
          C_MUL(scratch[1],Fout[m] , *tw1);
          C_MUL(scratch[2],Fout[m2] , *tw2);
@@ -255,56 +230,8 @@
    }
 }
 
-static void ki_bfly3(
-                     kiss_fft_cpx * Fout,
-                     const size_t fstride,
-                     const kiss_fft_state *st,
-                     int m,
-                     int N,
-                     int mm
-                    )
-{
-   int i, k;
-   const size_t m2 = 2*m;
-   const kiss_twiddle_cpx *tw1,*tw2;
-   kiss_fft_cpx scratch[5];
-   kiss_twiddle_cpx epi3;
 
-   kiss_fft_cpx * Fout_beg = Fout;
-   epi3 = st->twiddles[fstride*m];
-   for (i=0;i<N;i++)
-   {
-      Fout = Fout_beg + i*mm;
-      tw1=tw2=st->twiddles;
-      k=m;
-      do{
-
-         C_MULC(scratch[1],Fout[m] , *tw1);
-         C_MULC(scratch[2],Fout[m2] , *tw2);
-
-         C_ADD(scratch[3],scratch[1],scratch[2]);
-         C_SUB(scratch[0],scratch[1],scratch[2]);
-         tw1 += fstride;
-         tw2 += fstride*2;
-
-         Fout[m].r = Fout->r - HALF_OF(scratch[3].r);
-         Fout[m].i = Fout->i - HALF_OF(scratch[3].i);
-
-         C_MULBYSCALAR( scratch[0] , -epi3.i );
-
-         C_ADDTO(*Fout,scratch[3]);
-
-         Fout[m2].r = Fout[m].r + scratch[0].i;
-         Fout[m2].i = Fout[m].i - scratch[0].r;
-
-         Fout[m].r -= scratch[0].i;
-         Fout[m].i += scratch[0].r;
-
-         ++Fout;
-      }while(--k);
-   }
-}
-
+#ifndef OVERRIDE_kf_bfly5
 static void kf_bfly5(
                      kiss_fft_cpx * Fout,
                      const size_t fstride,
@@ -317,13 +244,19 @@
    kiss_fft_cpx *Fout0,*Fout1,*Fout2,*Fout3,*Fout4;
    int i, u;
    kiss_fft_cpx scratch[13];
-   const kiss_twiddle_cpx * twiddles = st->twiddles;
    const kiss_twiddle_cpx *tw;
    kiss_twiddle_cpx ya,yb;
    kiss_fft_cpx * Fout_beg = Fout;
 
-   ya = twiddles[fstride*m];
-   yb = twiddles[fstride*2*m];
+#ifdef FIXED_POINT
+   ya.r = 10126;
+   ya.i = -31164;
+   yb.r = -26510;
+   yb.i = -19261;
+#else
+   ya = st->twiddles[fstride*m];
+   yb = st->twiddles[fstride*2*m];
+#endif
    tw=st->twiddles;
 
    for (i=0;i<N;i++)
@@ -335,8 +268,8 @@
       Fout3=Fout0+3*m;
       Fout4=Fout0+4*m;
 
+      /* For non-custom modes, m is guaranteed to be a multiple of 4. */
       for ( u=0; u<m; ++u ) {
-         C_FIXDIV( *Fout0,5); C_FIXDIV( *Fout1,5); C_FIXDIV( *Fout2,5); C_FIXDIV( *Fout3,5); C_FIXDIV( *Fout4,5);
          scratch[0] = *Fout0;
 
          C_MUL(scratch[1] ,*Fout1, tw[u*fstride]);
@@ -373,74 +306,8 @@
       }
    }
 }
+#endif /* OVERRIDE_kf_bfly5 */
 
-static void ki_bfly5(
-                     kiss_fft_cpx * Fout,
-                     const size_t fstride,
-                     const kiss_fft_state *st,
-                     int m,
-                     int N,
-                     int mm
-                    )
-{
-   kiss_fft_cpx *Fout0,*Fout1,*Fout2,*Fout3,*Fout4;
-   int i, u;
-   kiss_fft_cpx scratch[13];
-   const kiss_twiddle_cpx * twiddles = st->twiddles;
-   const kiss_twiddle_cpx *tw;
-   kiss_twiddle_cpx ya,yb;
-   kiss_fft_cpx * Fout_beg = Fout;
-
-   ya = twiddles[fstride*m];
-   yb = twiddles[fstride*2*m];
-   tw=st->twiddles;
-
-   for (i=0;i<N;i++)
-   {
-      Fout = Fout_beg + i*mm;
-      Fout0=Fout;
-      Fout1=Fout0+m;
-      Fout2=Fout0+2*m;
-      Fout3=Fout0+3*m;
-      Fout4=Fout0+4*m;
-
-      for ( u=0; u<m; ++u ) {
-         scratch[0] = *Fout0;
-
-         C_MULC(scratch[1] ,*Fout1, tw[u*fstride]);
-         C_MULC(scratch[2] ,*Fout2, tw[2*u*fstride]);
-         C_MULC(scratch[3] ,*Fout3, tw[3*u*fstride]);
-         C_MULC(scratch[4] ,*Fout4, tw[4*u*fstride]);
-
-         C_ADD( scratch[7],scratch[1],scratch[4]);
-         C_SUB( scratch[10],scratch[1],scratch[4]);
-         C_ADD( scratch[8],scratch[2],scratch[3]);
-         C_SUB( scratch[9],scratch[2],scratch[3]);
-
-         Fout0->r += scratch[7].r + scratch[8].r;
-         Fout0->i += scratch[7].i + scratch[8].i;
-
-         scratch[5].r = scratch[0].r + S_MUL(scratch[7].r,ya.r) + S_MUL(scratch[8].r,yb.r);
-         scratch[5].i = scratch[0].i + S_MUL(scratch[7].i,ya.r) + S_MUL(scratch[8].i,yb.r);
-
-         scratch[6].r = -S_MUL(scratch[10].i,ya.i) - S_MUL(scratch[9].i,yb.i);
-         scratch[6].i =  S_MUL(scratch[10].r,ya.i) + S_MUL(scratch[9].r,yb.i);
-
-         C_SUB(*Fout1,scratch[5],scratch[6]);
-         C_ADD(*Fout4,scratch[5],scratch[6]);
-
-         scratch[11].r = scratch[0].r + S_MUL(scratch[7].r,yb.r) + S_MUL(scratch[8].r,ya.r);
-         scratch[11].i = scratch[0].i + S_MUL(scratch[7].i,yb.r) + S_MUL(scratch[8].i,ya.r);
-         scratch[12].r =  S_MUL(scratch[10].i,yb.i) - S_MUL(scratch[9].i,ya.i);
-         scratch[12].i = -S_MUL(scratch[10].r,yb.i) + S_MUL(scratch[9].r,ya.i);
-
-         C_ADD(*Fout2,scratch[11],scratch[12]);
-         C_SUB(*Fout3,scratch[11],scratch[12]);
-
-         ++Fout0;++Fout1;++Fout2;++Fout3;++Fout4;
-      }
-   }
-}
 
 #endif
 
@@ -488,6 +355,9 @@
 int kf_factor(int n,opus_int16 * facbuf)
 {
     int p=4;
+    int i;
+    int stages=0;
+    int nbak = n;
 
     /*factor out powers of 4, powers of 2, then any remaining primes */
     do {
@@ -509,9 +379,30 @@
         {
            return 0;
         }
-        *facbuf++ = p;
-        *facbuf++ = n;
+        facbuf[2*stages] = p;
+        if (p==2 && stages > 1)
+        {
+           facbuf[2*stages] = 4;
+           facbuf[2] = 2;
+        }
+        stages++;
     } while (n > 1);
+    n = nbak;
+    /* Reverse the order to get the radix 4 at the end, so we can use the
+       fast degenerate case. It turns out that reversing the order also
+       improves the noise behaviour. */
+    for (i=0;i<stages/2;i++)
+    {
+       int tmp;
+       tmp = facbuf[2*i];
+       facbuf[2*i] = facbuf[2*(stages-i-1)];
+       facbuf[2*(stages-i-1)] = tmp;
+    }
+    for (i=0;i<stages;i++)
+    {
+        n /= facbuf[2*i];
+        facbuf[2*i+1] = n;
+    }
     return 1;
 }
 
@@ -532,13 +423,19 @@
 #endif
 }
 
+int opus_fft_alloc_arch_c(kiss_fft_state *st) { /* Plain C arch-setup hook: there is no arch-specific state to create. */
+   (void)st; /* st is deliberately unused */
+   return 0; /* opus_fft_alloc_twiddles() jumps to its fail path on a non-zero result */
+}
+
 /*
  *
  * Allocates all necessary storage space for the fft and ifft.
  * The return value is a contiguous block of memory.  As such,
  * It can be freed with free().
  * */
-kiss_fft_state *opus_fft_alloc_twiddles(int nfft,void * mem,size_t * lenmem,  const kiss_fft_state *base)
+kiss_fft_state *opus_fft_alloc_twiddles(int nfft,void * mem,size_t * lenmem,
+                                        const kiss_fft_state *base, int arch)
 {
     kiss_fft_state *st=NULL;
     size_t memneeded = sizeof(struct kiss_fft_state); /* twiddle factors*/
@@ -555,14 +452,20 @@
         kiss_twiddle_cpx *twiddles;
 
         st->nfft=nfft;
-#ifndef FIXED_POINT
+#ifdef FIXED_POINT
+        st->scale_shift = celt_ilog2(st->nfft);
+        if (st->nfft == 1<<st->scale_shift)
+           st->scale = Q15ONE;
+        else
+           st->scale = (1073741824+st->nfft/2)/st->nfft>>(15-st->scale_shift);
+#else
         st->scale = 1.f/nfft;
 #endif
         if (base != NULL)
         {
            st->twiddles = base->twiddles;
            st->shift = 0;
-           while (nfft<<st->shift != base->nfft && st->shift < 32)
+           while (st->shift < 32 && nfft<<st->shift != base->nfft)
               st->shift++;
            if (st->shift>=32)
               goto fail;
@@ -581,22 +484,31 @@
         if (st->bitrev==NULL)
             goto fail;
         compute_bitrev_table(0, bitrev, 1,1, st->factors,st);
+
+        /* Initialize architecture specific fft parameters */
+        if (opus_fft_alloc_arch(st, arch))
+            goto fail;
     }
     return st;
 fail:
-    opus_fft_free(st);
+    opus_fft_free(st, arch);
     return NULL;
 }
 
-kiss_fft_state *opus_fft_alloc(int nfft,void * mem,size_t * lenmem )
+kiss_fft_state *opus_fft_alloc(int nfft,void * mem,size_t * lenmem, int arch) /* Convenience wrapper around opus_fft_alloc_twiddles() with no base state; arch is passed straight through. */
 {
-   return opus_fft_alloc_twiddles(nfft, mem, lenmem, NULL);
+   return opus_fft_alloc_twiddles(nfft, mem, lenmem, NULL, arch); /* base==NULL: twiddles are not borrowed from another state */
 }
 
-void opus_fft_free(const kiss_fft_state *cfg)
+void opus_fft_free_arch_c(kiss_fft_state *st) { /* Plain C arch-teardown hook: opus_fft_alloc_arch_c() creates nothing, so there is nothing to release. */
+   (void)st; /* st is deliberately unused */
+}
+
+void opus_fft_free(const kiss_fft_state *cfg, int arch)
 {
    if (cfg)
    {
+      opus_fft_free_arch((kiss_fft_state *)cfg, arch);
       opus_free((opus_int16*)cfg->bitrev);
       if (cfg->shift < 0)
          opus_free((kiss_twiddle_cpx*)cfg->twiddles);
@@ -606,7 +518,7 @@
 
 #endif /* CUSTOM_MODES */
 
-void opus_fft(const kiss_fft_state *st,const kiss_fft_cpx *fin,kiss_fft_cpx *fout)
+void opus_fft_impl(const kiss_fft_state *st,kiss_fft_cpx *fout)
 {
     int m2, m;
     int p;
@@ -618,17 +530,6 @@
     /* st->shift can be -1 */
     shift = st->shift>0 ? st->shift : 0;
 
-    celt_assert2 (fin != fout, "In-place FFT not supported");
-    /* Bit-reverse the input */
-    for (i=0;i<st->nfft;i++)
-    {
-       fout[st->bitrev[i]] = fin[i];
-#ifndef FIXED_POINT
-       fout[st->bitrev[i]].r *= st->scale;
-       fout[st->bitrev[i]].i *= st->scale;
-#endif
-    }
-
     fstride[0] = 1;
     L=0;
     do {
@@ -647,7 +548,7 @@
        switch (st->factors[2*i])
        {
        case 2:
-          kf_bfly2(fout,fstride[i]<<shift,st,m, fstride[i], m2);
+          kf_bfly2(fout, m, fstride[i]);
           break;
        case 4:
           kf_bfly4(fout,fstride[i]<<shift,st,m, fstride[i], m2);
@@ -665,55 +566,39 @@
     }
 }
 
-void opus_ifft(const kiss_fft_state *st,const kiss_fft_cpx *fin,kiss_fft_cpx *fout)
+void opus_fft_c(const kiss_fft_state *st,const kiss_fft_cpx *fin,kiss_fft_cpx *fout) /* Forward FFT, plain C version: scaled, bit-reversed copy of fin into fout, then opus_fft_impl() in place on fout. */
 {
-   int m2, m;
-   int p;
-   int L;
-   int fstride[MAXFACTORS];
    int i;
-   int shift;
+   opus_val16 scale;
+#ifdef FIXED_POINT
+   /* Allows us to scale with MULT16_32_Q16(), which is faster than
+      MULT16_32_Q15() on ARM. */
+   int scale_shift = st->scale_shift-1; /* one less than the stored shift; presumably compensates for using the Q16 multiply instead of Q15 -- confirm */
+#endif
+   scale = st->scale;
 
-   /* st->shift can be -1 */
-   shift = st->shift>0 ? st->shift : 0;
+   celt_assert2 (fin != fout, "In-place FFT not supported");
+   /* Bit-reverse the input, applying the scale factor to every sample on the way */
+   for (i=0;i<st->nfft;i++)
+   {
+      kiss_fft_cpx x = fin[i];
+      fout[st->bitrev[i]].r = SHR32(MULT16_32_Q16(scale, x.r), scale_shift); /* NOTE(review): scale_shift is declared only under FIXED_POINT; presumably SHR32() drops its shift argument in float builds -- confirm in arch.h */
+      fout[st->bitrev[i]].i = SHR32(MULT16_32_Q16(scale, x.i), scale_shift);
+   }
+   opus_fft_impl(st, fout); /* butterflies run in place on the reordered data */
+}
+
+
+void opus_ifft_c(const kiss_fft_state *st,const kiss_fft_cpx *fin,kiss_fft_cpx *fout) /* Inverse FFT, plain C version, built on the forward kernel: fout = conj(FFT(conj(fin))). No scale factor is applied in this function. */
+{
+   int i;
    celt_assert2 (fin != fout, "In-place FFT not supported");
    /* Bit-reverse the input */
    for (i=0;i<st->nfft;i++)
       fout[st->bitrev[i]] = fin[i];
-
-   fstride[0] = 1;
-   L=0;
-   do {
-      p = st->factors[2*L];
-      m = st->factors[2*L+1];
-      fstride[L+1] = fstride[L]*p;
-      L++;
-   } while(m!=1);
-   m = st->factors[2*L-1];
-   for (i=L-1;i>=0;i--)
-   {
-      if (i!=0)
-         m2 = st->factors[2*i-1];
-      else
-         m2 = 1;
-      switch (st->factors[2*i])
-      {
-      case 2:
-         ki_bfly2(fout,fstride[i]<<shift,st,m, fstride[i], m2);
-         break;
-      case 4:
-         ki_bfly4(fout,fstride[i]<<shift,st,m, fstride[i], m2);
-         break;
-#ifndef RADIX_TWO_ONLY
-      case 3:
-         ki_bfly3(fout,fstride[i]<<shift,st,m, fstride[i], m2);
-         break;
-      case 5:
-         ki_bfly5(fout,fstride[i]<<shift,st,m, fstride[i], m2);
-         break;
-#endif
-      }
-      m = m2;
-   }
+   for (i=0;i<st->nfft;i++) /* conjugate the bit-reversed input */
+      fout[i].i = -fout[i].i;
+   opus_fft_impl(st, fout); /* forward butterflies, in place on fout */
+   for (i=0;i<st->nfft;i++) /* conjugate the result to complete the inverse transform */
+      fout[i].i = -fout[i].i;
 }
-
diff --git a/celt/kiss_fft.h b/celt/kiss_fft.h
index 66332e3..bffa2bf 100644
--- a/celt/kiss_fft.h
+++ b/celt/kiss_fft.h
@@ -32,6 +32,7 @@
 #include <stdlib.h>
 #include <math.h>
 #include "arch.h"
+#include "cpu_support.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -77,17 +78,28 @@
  4*4*4*2
  */
 
+typedef struct arch_fft_state{ /* Arch-specific FFT data; kiss_fft_state refers to one through its arch_fft pointer. */
+   int is_supported; /* NOTE(review): presumably non-zero when an arch-specific FFT can handle this state -- confirm against the arch code (e.g. arm/fft_arm.h users) */
+   void *priv; /* opaque pointer; its owner and contents are not visible in this header -- TODO confirm */
+} arch_fft_state;
+
 typedef struct kiss_fft_state{
     int nfft;
-#ifndef FIXED_POINT
-    kiss_fft_scalar scale;
+    opus_val16 scale; /* scaling factor applied by the forward transform: 1.f/nfft in float builds; in fixed-point builds it is set together with scale_shift by opus_fft_alloc_twiddles() */
+#ifdef FIXED_POINT
+    int scale_shift; /* celt_ilog2(nfft); users shift right by scale_shift-1 after MULT16_32_Q16(scale, x) */
 #endif
     int shift;
     opus_int16 factors[2*MAXFACTORS];
     const opus_int16 *bitrev;
     const kiss_twiddle_cpx *twiddles;
+    arch_fft_state *arch_fft; /* arch-specific state; NOTE(review): never assigned by the generic C hooks, presumably set by an arch-specific alloc hook -- confirm */
 } kiss_fft_state;
 
+#if defined(HAVE_ARM_NE10)
+#include "arm/fft_arm.h"
+#endif
+
 /*typedef struct kiss_fft_state* kiss_fft_cfg;*/
 
 /**
@@ -113,9 +125,9 @@
  *      buffer size in *lenmem.
  * */
 
-kiss_fft_state *opus_fft_alloc_twiddles(int nfft,void * mem,size_t * lenmem, const kiss_fft_state *base);
+kiss_fft_state *opus_fft_alloc_twiddles(int nfft,void * mem,size_t * lenmem, const kiss_fft_state *base, int arch);
 
-kiss_fft_state *opus_fft_alloc(int nfft,void * mem,size_t * lenmem);
+kiss_fft_state *opus_fft_alloc(int nfft,void * mem,size_t * lenmem, int arch);
 
 /**
  * opus_fft(cfg,in_out_buf)
@@ -127,10 +139,59 @@
  * Note that each element is complex and can be accessed like
     f[k].r and f[k].i
  * */
-void opus_fft(const kiss_fft_state *cfg,const kiss_fft_cpx *fin,kiss_fft_cpx *fout);
-void opus_ifft(const kiss_fft_state *cfg,const kiss_fft_cpx *fin,kiss_fft_cpx *fout);
+void opus_fft_c(const kiss_fft_state *cfg,const kiss_fft_cpx *fin,kiss_fft_cpx *fout);
+void opus_ifft_c(const kiss_fft_state *cfg,const kiss_fft_cpx *fin,kiss_fft_cpx *fout);
 
-void opus_fft_free(const kiss_fft_state *cfg);
+void opus_fft_impl(const kiss_fft_state *st,kiss_fft_cpx *fout);
+void opus_ifft_impl(const kiss_fft_state *st,kiss_fft_cpx *fout);
+
+void opus_fft_free(const kiss_fft_state *cfg, int arch);
+
+
+void opus_fft_free_arch_c(kiss_fft_state *st);
+int opus_fft_alloc_arch_c(kiss_fft_state *st);
+
+#if !defined(OVERRIDE_OPUS_FFT)
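+/* Is run-time CPU detection enabled on this platform? If so, dispatch through function tables indexed by (arch)&OPUS_ARCHMASK; otherwise the macros call the plain C (_c) functions directly. */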
+#if defined(OPUS_HAVE_RTCD) && (defined(HAVE_ARM_NE10))
+
+extern int (*const OPUS_FFT_ALLOC_ARCH_IMPL[OPUS_ARCHMASK+1])(
+ kiss_fft_state *st);
+
+#define opus_fft_alloc_arch(_st, arch) \
+         ((*OPUS_FFT_ALLOC_ARCH_IMPL[(arch)&OPUS_ARCHMASK])(_st))
+
+extern void (*const OPUS_FFT_FREE_ARCH_IMPL[OPUS_ARCHMASK+1])(
+ kiss_fft_state *st);
+#define opus_fft_free_arch(_st, arch) \
+         ((*OPUS_FFT_FREE_ARCH_IMPL[(arch)&OPUS_ARCHMASK])(_st))
+
+extern void (*const OPUS_FFT[OPUS_ARCHMASK+1])(const kiss_fft_state *cfg,
+ const kiss_fft_cpx *fin, kiss_fft_cpx *fout);
+#define opus_fft(_cfg, _fin, _fout, arch) \
+   ((*OPUS_FFT[(arch)&OPUS_ARCHMASK])(_cfg, _fin, _fout))
+
+extern void (*const OPUS_IFFT[OPUS_ARCHMASK+1])(const kiss_fft_state *cfg,
+ const kiss_fft_cpx *fin, kiss_fft_cpx *fout);
+#define opus_ifft(_cfg, _fin, _fout, arch) \
+   ((*OPUS_IFFT[(arch)&OPUS_ARCHMASK])(_cfg, _fin, _fout))
+
+#else /* else for if defined(OPUS_HAVE_RTCD) && (defined(HAVE_ARM_NE10)) */
+
+#define opus_fft_alloc_arch(_st, arch) \
+         ((void)(arch), opus_fft_alloc_arch_c(_st))
+
+#define opus_fft_free_arch(_st, arch) \
+         ((void)(arch), opus_fft_free_arch_c(_st))
+
+#define opus_fft(_cfg, _fin, _fout, arch) \
+         ((void)(arch), opus_fft_c(_cfg, _fin, _fout))
+
+#define opus_ifft(_cfg, _fin, _fout, arch) \
+         ((void)(arch), opus_ifft_c(_cfg, _fin, _fout))
+
+#endif /* end if defined(OPUS_HAVE_RTCD) && (defined(HAVE_ARM_NE10)) */
+#endif /* end if !defined(OVERRIDE_OPUS_FFT) */
 
 #ifdef __cplusplus
 }
diff --git a/celt/mdct.c b/celt/mdct.c
index 90a214a..5315ad1 100644
--- a/celt/mdct.c
+++ b/celt/mdct.c
@@ -53,76 +53,100 @@
 #include "mathops.h"
 #include "stack_alloc.h"
 
+#if defined(MIPSr1_ASM)
+#include "mips/mdct_mipsr1.h"
+#endif
+
+
 #ifdef CUSTOM_MODES
 
-int clt_mdct_init(mdct_lookup *l,int N, int maxshift)
+int clt_mdct_init(mdct_lookup *l,int N, int maxshift, int arch) /* Sets up the FFT states and one trig table per shift level for an N-point MDCT; returns 1 on success and 0 if an allocation fails. */
 {
    int i;
-   int N4;
    kiss_twiddle_scalar *trig;
-#if defined(FIXED_POINT)
+   int shift; /* index of the MDCT size currently being tabulated (size N>>shift) */
    int N2=N>>1;
-#endif
    l->n = N;
-   N4 = N>>2;
    l->maxshift = maxshift;
    for (i=0;i<=maxshift;i++)
    {
       if (i==0)
-         l->kfft[i] = opus_fft_alloc(N>>2>>i, 0, 0);
+         l->kfft[i] = opus_fft_alloc(N>>2>>i, 0, 0, arch); /* largest FFT: allocated without a base state */
       else
-         l->kfft[i] = opus_fft_alloc_twiddles(N>>2>>i, 0, 0, l->kfft[0]);
+         l->kfft[i] = opus_fft_alloc_twiddles(N>>2>>i, 0, 0, l->kfft[0], arch); /* smaller FFTs are created with kfft[0] as their base state */
 #ifndef ENABLE_TI_DSPLIB55
       if (l->kfft[i]==NULL)
          return 0;
 #endif
    }
-   l->trig = trig = (kiss_twiddle_scalar*)opus_alloc((N4+1)*sizeof(kiss_twiddle_scalar));
+   l->trig = trig = (kiss_twiddle_scalar*)opus_alloc((N-(N2>>maxshift))*sizeof(kiss_twiddle_scalar)); /* one buffer for all levels: the loop below writes N2 entries for shift 0 and half as many for each further shift */
    if (l->trig==NULL)
      return 0;
-   /* We have enough points that sine isn't necessary */
+   for (shift=0;shift<=maxshift;shift++) /* one table per MDCT size */
+   {
+      /* Each level stores N2 cosine values; the MDCT reads both t[i] and t[N4+i] from it, so no separate sine table is kept */
 #if defined(FIXED_POINT)
-   for (i=0;i<=N4;i++)
-      trig[i] = TRIG_UPSCALE*celt_cos_norm(DIV32(ADD32(SHL32(EXTEND32(i),17),N2),N));
+#if 1 /* active fixed-point path; the #else branch below is compiled out */
+      for (i=0;i<N2;i++)
+         trig[i] = TRIG_UPSCALE*celt_cos_norm(DIV32(ADD32(SHL32(EXTEND32(i),17),N2+16384),N)); /* NOTE(review): presumably the fixed-point counterpart of cos(2*PI*(i+.125)/N) used in the float branch -- confirm against celt_cos_norm() */
 #else
-   for (i=0;i<=N4;i++)
-      trig[i] = (kiss_twiddle_scalar)cos(2*PI*i/N);
+      for (i=0;i<N2;i++)
+         trig[i] = (kiss_twiddle_scalar)MAX32(-32767,MIN32(32767,floor(.5+32768*cos(2*M_PI*(i+.125)/N)))); /* disabled alternative: rounds 32768*cos(...) and clamps it to [-32767, 32767] */
 #endif
+#else /* floating-point build */
+      for (i=0;i<N2;i++)
+         trig[i] = (kiss_twiddle_scalar)cos(2*PI*(i+.125)/N);
+#endif
+      trig += N2; /* the next level's table starts right after this one */
+      N2 >>= 1; /* the next level is half the size */
+      N >>= 1;
+   }
    return 1;
 }
 
-void clt_mdct_clear(mdct_lookup *l)
+void clt_mdct_clear(mdct_lookup *l, int arch) /* Frees every FFT state in l->kfft[0..maxshift] and then the trig table; arch is passed through to opus_fft_free(). */
 {
    int i;
    for (i=0;i<=l->maxshift;i++)
-      opus_fft_free(l->kfft[i]);
+      opus_fft_free(l->kfft[i], arch); /* opus_fft_free() ignores NULL entries */
    opus_free((kiss_twiddle_scalar*)l->trig);
 }
 
 #endif /* CUSTOM_MODES */
 
 /* Forward MDCT trashes the input array */
-void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out,
-      const opus_val16 *window, int overlap, int shift, int stride)
+#ifndef OVERRIDE_clt_mdct_forward
+void clt_mdct_forward_c(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out,
+      const opus_val16 *window, int overlap, int shift, int stride, int arch)
 {
    int i;
    int N, N2, N4;
-   kiss_twiddle_scalar sine;
    VARDECL(kiss_fft_scalar, f);
-   VARDECL(kiss_fft_scalar, f2);
+   VARDECL(kiss_fft_cpx, f2);
+   const kiss_fft_state *st = l->kfft[shift];
+   const kiss_twiddle_scalar *trig;
+   opus_val16 scale;
+#ifdef FIXED_POINT
+   /* Allows us to scale with MULT16_32_Q16(), which is faster than
+      MULT16_32_Q15() on ARM. */
+   int scale_shift = st->scale_shift-1;
+#endif
    SAVE_STACK;
+   (void)arch;
+   scale = st->scale;
+
    N = l->n;
-   N >>= shift;
+   trig = l->trig;
+   for (i=0;i<shift;i++)
+   {
+      N >>= 1;
+      trig += N;
+   }
    N2 = N>>1;
    N4 = N>>2;
+
    ALLOC(f, N2, kiss_fft_scalar);
-   ALLOC(f2, N2, kiss_fft_scalar);
-   /* sin(x) ~= x here */
-#ifdef FIXED_POINT
-   sine = TRIG_UPSCALE*(QCONST16(0.7853981f, 15)+N2)/N;
-#else
-   sine = (kiss_twiddle_scalar)2*PI*(.125f)/N;
-#endif
+   ALLOC(f2, N4, kiss_fft_cpx);
 
    /* Consider the input to be composed of four blocks: [a, b, c, d] */
    /* Window, shuffle, fold */
@@ -167,123 +191,131 @@
    /* Pre-rotation */
    {
       kiss_fft_scalar * OPUS_RESTRICT yp = f;
-      const kiss_twiddle_scalar *t = &l->trig[0];
+      const kiss_twiddle_scalar *t = &trig[0];
       for(i=0;i<N4;i++)
       {
+         kiss_fft_cpx yc;
+         kiss_twiddle_scalar t0, t1;
          kiss_fft_scalar re, im, yr, yi;
-         re = yp[0];
-         im = yp[1];
-         yr = -S_MUL(re,t[i<<shift])  -  S_MUL(im,t[(N4-i)<<shift]);
-         yi = -S_MUL(im,t[i<<shift])  +  S_MUL(re,t[(N4-i)<<shift]);
-         /* works because the cos is nearly one */
-         *yp++ = yr + S_MUL(yi,sine);
-         *yp++ = yi - S_MUL(yr,sine);
+         t0 = t[i];
+         t1 = t[N4+i];
+         re = *yp++;
+         im = *yp++;
+         yr = S_MUL(re,t0)  -  S_MUL(im,t1);
+         yi = S_MUL(im,t0)  +  S_MUL(re,t1);
+         yc.r = yr;
+         yc.i = yi;
+         yc.r = PSHR32(MULT16_32_Q16(scale, yc.r), scale_shift);
+         yc.i = PSHR32(MULT16_32_Q16(scale, yc.i), scale_shift);
+         f2[st->bitrev[i]] = yc;
       }
    }
 
-   /* N/4 complex FFT, down-scales by 4/N */
-   opus_fft(l->kfft[shift], (kiss_fft_cpx *)f, (kiss_fft_cpx *)f2);
+   /* N/4 complex FFT, does not downscale anymore */
+   opus_fft_impl(st, f2);
 
    /* Post-rotate */
    {
       /* Temp pointers to make it really clear to the compiler what we're doing */
-      const kiss_fft_scalar * OPUS_RESTRICT fp = f2;
+      const kiss_fft_cpx * OPUS_RESTRICT fp = f2;
       kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
       kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1);
-      const kiss_twiddle_scalar *t = &l->trig[0];
+      const kiss_twiddle_scalar *t = &trig[0];
       /* Temp pointers to make it really clear to the compiler what we're doing */
       for(i=0;i<N4;i++)
       {
          kiss_fft_scalar yr, yi;
-         yr = S_MUL(fp[1],t[(N4-i)<<shift]) + S_MUL(fp[0],t[i<<shift]);
-         yi = S_MUL(fp[0],t[(N4-i)<<shift]) - S_MUL(fp[1],t[i<<shift]);
-         /* works because the cos is nearly one */
-         *yp1 = yr - S_MUL(yi,sine);
-         *yp2 = yi + S_MUL(yr,sine);;
-         fp += 2;
+         yr = S_MUL(fp->i,t[N4+i]) - S_MUL(fp->r,t[i]);
+         yi = S_MUL(fp->r,t[N4+i]) + S_MUL(fp->i,t[i]);
+         *yp1 = yr;
+         *yp2 = yi;
+         fp++;
          yp1 += 2*stride;
          yp2 -= 2*stride;
       }
    }
    RESTORE_STACK;
 }
+#endif /* OVERRIDE_clt_mdct_forward */
 
-void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out,
-      const opus_val16 * OPUS_RESTRICT window, int overlap, int shift, int stride)
+#ifndef OVERRIDE_clt_mdct_backward
+void clt_mdct_backward_c(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out,
+      const opus_val16 * OPUS_RESTRICT window, int overlap, int shift, int stride, int arch)
 {
    int i;
    int N, N2, N4;
-   kiss_twiddle_scalar sine;
-   VARDECL(kiss_fft_scalar, f2);
-   SAVE_STACK;
+   const kiss_twiddle_scalar *trig;
+   (void) arch;
+
    N = l->n;
-   N >>= shift;
+   trig = l->trig;
+   for (i=0;i<shift;i++)
+   {
+      N >>= 1;
+      trig += N;
+   }
    N2 = N>>1;
    N4 = N>>2;
-   ALLOC(f2, N2, kiss_fft_scalar);
-   /* sin(x) ~= x here */
-#ifdef FIXED_POINT
-   sine = TRIG_UPSCALE*(QCONST16(0.7853981f, 15)+N2)/N;
-#else
-   sine = (kiss_twiddle_scalar)2*PI*(.125f)/N;
-#endif
 
    /* Pre-rotate */
    {
       /* Temp pointers to make it really clear to the compiler what we're doing */
       const kiss_fft_scalar * OPUS_RESTRICT xp1 = in;
       const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+stride*(N2-1);
-      kiss_fft_scalar * OPUS_RESTRICT yp = f2;
-      const kiss_twiddle_scalar *t = &l->trig[0];
+      kiss_fft_scalar * OPUS_RESTRICT yp = out+(overlap>>1);
+      const kiss_twiddle_scalar * OPUS_RESTRICT t = &trig[0];
+      const opus_int16 * OPUS_RESTRICT bitrev = l->kfft[shift]->bitrev;
       for(i=0;i<N4;i++)
       {
+         int rev;
          kiss_fft_scalar yr, yi;
-         yr = -S_MUL(*xp2, t[i<<shift]) + S_MUL(*xp1,t[(N4-i)<<shift]);
-         yi =  -S_MUL(*xp2, t[(N4-i)<<shift]) - S_MUL(*xp1,t[i<<shift]);
-         /* works because the cos is nearly one */
-         *yp++ = yr - S_MUL(yi,sine);
-         *yp++ = yi + S_MUL(yr,sine);
+         rev = *bitrev++;
+         yr = S_MUL(*xp2, t[i]) + S_MUL(*xp1, t[N4+i]);
+         yi = S_MUL(*xp1, t[i]) - S_MUL(*xp2, t[N4+i]);
+         /* We swap real and imag because we use an FFT instead of an IFFT. */
+         yp[2*rev+1] = yr;
+         yp[2*rev] = yi;
+         /* Storing the pre-rotation directly in the bitrev order. */
          xp1+=2*stride;
          xp2-=2*stride;
       }
    }
 
-   /* Inverse N/4 complex FFT. This one should *not* downscale even in fixed-point */
-   opus_ifft(l->kfft[shift], (kiss_fft_cpx *)f2, (kiss_fft_cpx *)(out+(overlap>>1)));
+   opus_fft_impl(l->kfft[shift], (kiss_fft_cpx*)(out+(overlap>>1)));
 
    /* Post-rotate and de-shuffle from both ends of the buffer at once to make
       it in-place. */
    {
-      kiss_fft_scalar * OPUS_RESTRICT yp0 = out+(overlap>>1);
-      kiss_fft_scalar * OPUS_RESTRICT yp1 = out+(overlap>>1)+N2-2;
-      const kiss_twiddle_scalar *t = &l->trig[0];
+      kiss_fft_scalar * yp0 = out+(overlap>>1);
+      kiss_fft_scalar * yp1 = out+(overlap>>1)+N2-2;
+      const kiss_twiddle_scalar *t = &trig[0];
       /* Loop to (N4+1)>>1 to handle odd N4. When N4 is odd, the
          middle pair will be computed twice. */
       for(i=0;i<(N4+1)>>1;i++)
       {
          kiss_fft_scalar re, im, yr, yi;
          kiss_twiddle_scalar t0, t1;
-         re = yp0[0];
-         im = yp0[1];
-         t0 = t[i<<shift];
-         t1 = t[(N4-i)<<shift];
+         /* We swap real and imag because we're using an FFT instead of an IFFT. */
+         re = yp0[1];
+         im = yp0[0];
+         t0 = t[i];
+         t1 = t[N4+i];
          /* We'd scale up by 2 here, but instead it's done when mixing the windows */
-         yr = S_MUL(re,t0) - S_MUL(im,t1);
-         yi = S_MUL(im,t0) + S_MUL(re,t1);
-         re = yp1[0];
-         im = yp1[1];
-         /* works because the cos is nearly one */
-         yp0[0] = -(yr - S_MUL(yi,sine));
-         yp1[1] = yi + S_MUL(yr,sine);
+         yr = S_MUL(re,t0) + S_MUL(im,t1);
+         yi = S_MUL(re,t1) - S_MUL(im,t0);
+         /* We swap real and imag because we're using an FFT instead of an IFFT. */
+         re = yp1[1];
+         im = yp1[0];
+         yp0[0] = yr;
+         yp1[1] = yi;
 
-         t0 = t[(N4-i-1)<<shift];
-         t1 = t[(i+1)<<shift];
+         t0 = t[(N4-i-1)];
+         t1 = t[(N2-i-1)];
          /* We'd scale up by 2 here, but instead it's done when mixing the windows */
-         yr = S_MUL(re,t0) - S_MUL(im,t1);
-         yi = S_MUL(im,t0) + S_MUL(re,t1);
-         /* works because the cos is nearly one */
-         yp1[0] = -(yr - S_MUL(yi,sine));
-         yp0[1] = yi + S_MUL(yr,sine);
+         yr = S_MUL(re,t0) + S_MUL(im,t1);
+         yi = S_MUL(re,t1) - S_MUL(im,t0);
+         yp1[0] = yr;
+         yp0[1] = yi;
          yp0 += 2;
          yp1 -= 2;
       }
@@ -307,5 +339,5 @@
          wp2--;
       }
    }
-   RESTORE_STACK;
 }
+#endif /* OVERRIDE_clt_mdct_backward */
diff --git a/celt/mdct.h b/celt/mdct.h
index d721821..160ae4e 100644
--- a/celt/mdct.h
+++ b/celt/mdct.h
@@ -53,18 +53,60 @@
    const kiss_twiddle_scalar * OPUS_RESTRICT trig;
 } mdct_lookup;
 
-int clt_mdct_init(mdct_lookup *l,int N, int maxshift);
-void clt_mdct_clear(mdct_lookup *l);
+#if defined(HAVE_ARM_NE10)
+#include "arm/mdct_arm.h"
+#endif
+
+
+int clt_mdct_init(mdct_lookup *l,int N, int maxshift, int arch);
+void clt_mdct_clear(mdct_lookup *l, int arch);
 
 /** Compute a forward MDCT and scale by 4/N, trashes the input array */
-void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in,
-      kiss_fft_scalar * OPUS_RESTRICT out,
-      const opus_val16 *window, int overlap, int shift, int stride);
+void clt_mdct_forward_c(const mdct_lookup *l, kiss_fft_scalar *in,
+                        kiss_fft_scalar * OPUS_RESTRICT out,
+                        const opus_val16 *window, int overlap,
+                        int shift, int stride, int arch);
 
 /** Compute a backward MDCT (no scaling) and performs weighted overlap-add
     (scales implicitly by 1/2) */
-void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in,
+void clt_mdct_backward_c(const mdct_lookup *l, kiss_fft_scalar *in,
       kiss_fft_scalar * OPUS_RESTRICT out,
-      const opus_val16 * OPUS_RESTRICT window, int overlap, int shift, int stride);
+      const opus_val16 * OPUS_RESTRICT window,
+      int overlap, int shift, int stride, int arch);
+
+#if !defined(OVERRIDE_OPUS_MDCT)
+/* Is run-time CPU detection enabled on this platform? */
+#if defined(OPUS_HAVE_RTCD) && defined(HAVE_ARM_NE10)
+/* Yes: both entry points dispatch through tables indexed by the masked arch value. */
+extern void (*const CLT_MDCT_FORWARD_IMPL[OPUS_ARCHMASK+1])(
+      const mdct_lookup *l, kiss_fft_scalar *in,
+      kiss_fft_scalar * OPUS_RESTRICT out, const opus_val16 *window,
+      int overlap, int shift, int stride, int arch);
+/* The table index uses the macro's own _arch argument, not whatever "arch" is in the caller's scope. */
+#define clt_mdct_forward(_l, _in, _out, _window, _overlap, _shift, _stride, _arch) \
+   ((*CLT_MDCT_FORWARD_IMPL[(_arch)&OPUS_ARCHMASK])(_l, _in, _out, \
+                                                    _window, _overlap, _shift, \
+                                                    _stride, _arch))
+
+extern void (*const CLT_MDCT_BACKWARD_IMPL[OPUS_ARCHMASK+1])(
+      const mdct_lookup *l, kiss_fft_scalar *in,
+      kiss_fft_scalar * OPUS_RESTRICT out, const opus_val16 *window,
+      int overlap, int shift, int stride, int arch);
+
+#define clt_mdct_backward(_l, _in, _out, _window, _overlap, _shift, _stride, _arch) \
+   ((*CLT_MDCT_BACKWARD_IMPL[(_arch)&OPUS_ARCHMASK])(_l, _in, _out, \
+                                                     _window, _overlap, _shift, \
+                                                     _stride, _arch))
+
+#else /* if defined(OPUS_HAVE_RTCD) && defined(HAVE_ARM_NE10) */
+/* No: both entry points map straight onto the portable C implementations. */
+#define clt_mdct_forward(_l, _in, _out, _window, _overlap, _shift, _stride, _arch) \
+   clt_mdct_forward_c(_l, _in, _out, _window, _overlap, _shift, _stride, _arch)
+
+#define clt_mdct_backward(_l, _in, _out, _window, _overlap, _shift, _stride, _arch) \
+   clt_mdct_backward_c(_l, _in, _out, _window, _overlap, _shift, _stride, _arch)
+
+#endif /* end if defined(OPUS_HAVE_RTCD) && defined(HAVE_ARM_NE10) */
+#endif /* end if !defined(OVERRIDE_OPUS_MDCT) */
 
 #endif
diff --git a/celt/mips/celt_mipsr1.h b/celt/mips/celt_mipsr1.h
new file mode 100644
index 0000000..e85661a
--- /dev/null
+++ b/celt/mips/celt_mipsr1.h
@@ -0,0 +1,151 @@
+/* Copyright (c) 2007-2008 CSIRO
+   Copyright (c) 2007-2010 Xiph.Org Foundation
+   Copyright (c) 2008 Gregory Maxwell
+   Written by Jean-Marc Valin and Gregory Maxwell */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef __CELT_MIPSR1_H__
+#define __CELT_MIPSR1_H__
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#define CELT_C
+
+#include "os_support.h"
+#include "mdct.h"
+#include <math.h>
+#include "celt.h"
+#include "pitch.h"
+#include "bands.h"
+#include "modes.h"
+#include "entcode.h"
+#include "quant_bands.h"
+#include "rate.h"
+#include "stack_alloc.h"
+#include "mathops.h"
+#include "float_cast.h"
+#include <stdarg.h>
+#include "celt_lpc.h"
+#include "vq.h"
+
+#define OVERRIDE_comb_filter /* marks comb_filter() as provided by this MIPS header */
+void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N,
+      opus_val16 g0, opus_val16 g1, int tapset0, int tapset1,
+      const opus_val16 *window, int overlap, int arch)
+{ /* y[i] = x[i] + 5-tap filtered history of x; lag/gain/tapset cross-fade from (T0,g0,tapset0) to (T1,g1,tapset1) over 'overlap' samples. */
+   int i;
+   opus_val32 x0, x1, x2, x3, x4;
+   opus_val16 g00, g01, g02, g10, g11, g12;
+   static const opus_val16 gains[3][3] = {
+         {QCONST16(0.3066406250f, 15), QCONST16(0.2170410156f, 15), QCONST16(0.1296386719f, 15)},
+         {QCONST16(0.4638671875f, 15), QCONST16(0.2680664062f, 15), QCONST16(0.f, 15)},
+         {QCONST16(0.7998046875f, 15), QCONST16(0.1000976562f, 15), QCONST16(0.f, 15)}};
+
+   /* arch is unused; the cast comes after every declaration so that no
+      declaration follows a statement (keeps the block valid C89). */
+   (void)arch;
+
+   if (g0==0 && g1==0)
+   {
+      /* OPT: Happens to work without the OPUS_MOVE(), but only because the current encoder already copies x to y */
+      if (x!=y)
+         OPUS_MOVE(y, x, N);
+      return;
+   }
+   /* g0k scale the old filter (T0, tapset0); g1k scale the new one (T1, tapset1). */
+   g00 = MULT16_16_P15(g0, gains[tapset0][0]);
+   g01 = MULT16_16_P15(g0, gains[tapset0][1]);
+   g02 = MULT16_16_P15(g0, gains[tapset0][2]);
+   g10 = MULT16_16_P15(g1, gains[tapset1][0]);
+   g11 = MULT16_16_P15(g1, gains[tapset1][1]);
+   g12 = MULT16_16_P15(g1, gains[tapset1][2]);
+   x1 = x[-T1+1];
+   x2 = x[-T1  ];
+   x3 = x[-T1-1];
+   x4 = x[-T1-2];
+   /* If the filter didn't change, we don't need the overlap */
+   if (g0==g1 && T0==T1 && tapset0==tapset1)
+      overlap=0;
+
+   for (i=0;i<overlap;i++)
+   {
+      opus_val16 f;
+      opus_val32 res;
+      f = MULT16_16_Q15(window[i],window[i]);
+      x0= x[i-T1+2];
+      /* NOTE(review): $ac1 is carried across separate asm statements with no clobber declared - confirm the compiler cannot touch it in between. */
+      asm volatile("MULT $ac1, %0, %1" : : "r" ((int)MULT16_16_Q15((Q15ONE-f),g00)), "r" ((int)x[i-T0]));
+
+      asm volatile("MADD $ac1, %0, %1" : : "r" ((int)MULT16_16_Q15((Q15ONE-f),g01)), "r" ((int)ADD32(x[i-T0-1],x[i-T0+1])));
+      asm volatile("MADD $ac1, %0, %1" : : "r" ((int)MULT16_16_Q15((Q15ONE-f),g02)), "r" ((int)ADD32(x[i-T0-2],x[i-T0+2])));
+      asm volatile("MADD $ac1, %0, %1" : : "r" ((int)MULT16_16_Q15(f,g10)), "r" ((int)x2));
+      asm volatile("MADD $ac1, %0, %1" : : "r" ((int)MULT16_16_Q15(f,g11)), "r" ((int)ADD32(x3,x1)));
+      asm volatile("MADD $ac1, %0, %1" : : "r" ((int)MULT16_16_Q15(f,g12)), "r" ((int)ADD32(x4,x0)));
+      /* Old-filter terms are weighted by (1-f), new-filter terms by f, with f = window[i]^2. */
+      asm volatile("EXTR.W %0,$ac1, %1" : "=r" (res): "i" (15));
+
+      y[i] = x[i] + res;
+
+      x4=x3;
+      x3=x2;
+      x2=x1;
+      x1=x0;
+   }
+   /* Reload the sliding window of new-filter history for the position where the cross-fade stopped. */
+   x4 = x[i-T1-2];
+   x3 = x[i-T1-1];
+   x2 = x[i-T1];
+   x1 = x[i-T1+1];
+
+   if (g1==0)
+   {
+      /* OPT: Happens to work without the OPUS_MOVE(), but only because the current encoder already copies x to y */
+      if (x!=y)
+         OPUS_MOVE(y+overlap, x+overlap, N-overlap);
+      return;
+   }
+   /* Past the overlap only the new filter (T1, g1, tapset1) is applied. */
+   for (i=overlap;i<N;i++)
+   {
+      opus_val32 res;
+      x0=x[i-T1+2];
+
+      asm volatile("MULT $ac1, %0, %1" : : "r" ((int)g10), "r" ((int)x2));
+
+      asm volatile("MADD $ac1, %0, %1" : : "r" ((int)g11), "r" ((int)ADD32(x3,x1)));
+      asm volatile("MADD $ac1, %0, %1" : : "r" ((int)g12), "r" ((int)ADD32(x4,x0)));
+      asm volatile("EXTR.W %0,$ac1, %1" : "=r" (res): "i" (15));
+      y[i] = x[i] + res;
+      x4=x3;
+      x3=x2;
+      x2=x1;
+      x1=x0;
+   }
+}
+
+#endif /* __CELT_MIPSR1_H__ */
diff --git a/celt/mips/fixed_generic_mipsr1.h b/celt/mips/fixed_generic_mipsr1.h
new file mode 100644
index 0000000..4a05efb
--- /dev/null
+++ b/celt/mips/fixed_generic_mipsr1.h
@@ -0,0 +1,126 @@
+/* Copyright (C) 2007-2009 Xiph.Org Foundation
+   Copyright (C) 2003-2008 Jean-Marc Valin
+   Copyright (C) 2007-2008 CSIRO */
+/**
+   @file fixed_generic_mipsr1.h
+   @brief MIPS DSP (MIPSr1) overrides for the generic fixed-point operations
+*/
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef CELT_FIXED_GENERIC_MIPSR1_H
+#define CELT_FIXED_GENERIC_MIPSR1_H
+
+#undef MULT16_32_Q15_ADD /* drop any earlier definition; the DSP version below replaces it */
+static inline int MULT16_32_Q15_ADD(int a, int b, int c, int d) { /* (a*b + c*d) >> 15 via accumulator $ac1 */
+    int sum_q15;
+    asm volatile("MULT $ac1, %0, %1" : : "r" (a), "r" (b));
+    asm volatile("madd $ac1, %0, %1" : : "r" (c), "r" (d));
+    asm volatile("EXTR.W %0,$ac1, %1" : "=r" (sum_q15) : "i" (15));
+    return sum_q15;
+}
+
+#undef MULT16_32_Q15_SUB /* drop any earlier definition; the DSP version below replaces it */
+static inline int MULT16_32_Q15_SUB(int a, int b, int c, int d) { /* (a*b - c*d) >> 15 via accumulator $ac1 */
+    int diff_q15;
+    asm volatile("MULT $ac1, %0, %1" : : "r" (a), "r" (b));
+    asm volatile("msub $ac1, %0, %1" : : "r" (c), "r" (d));
+    asm volatile("EXTR.W %0,$ac1, %1" : "=r" (diff_q15) : "i" (15));
+    return diff_q15;
+}
+
+#undef MULT16_16_Q15_ADD /* drop any earlier definition; the DSP version below replaces it */
+static inline int MULT16_16_Q15_ADD(int a, int b, int c, int d) { /* (a*b + c*d) >> 15 via accumulator $ac1 */
+    int sum_q15;
+    asm volatile("MULT $ac1, %0, %1" : : "r" (a), "r" (b));
+    asm volatile("madd $ac1, %0, %1" : : "r" (c), "r" (d));
+    asm volatile("EXTR.W %0,$ac1, %1" : "=r" (sum_q15) : "i" (15));
+    return sum_q15;
+}
+
+#undef MULT16_16_Q15_SUB /* drop any earlier definition; the DSP version below replaces it */
+static inline int MULT16_16_Q15_SUB(int a, int b, int c, int d) { /* (a*b - c*d) >> 15 via accumulator $ac1 */
+    int diff_q15;
+    asm volatile("MULT $ac1, %0, %1" : : "r" (a), "r" (b));
+    asm volatile("msub $ac1, %0, %1" : : "r" (c), "r" (d));
+    asm volatile("EXTR.W %0,$ac1, %1" : "=r" (diff_q15) : "i" (15));
+    return diff_q15;
+}
+
+
+#undef MULT16_32_Q16 /* drop any earlier definition; the DSP version below replaces it */
+static inline int MULT16_32_Q16(int a, int b)
+{ /* (a*b) >> 16, extracted from accumulator $ac1 without rounding */
+    int prod_q16;
+    asm volatile("MULT $ac1,%0, %1" : : "r" (a), "r" (b));
+    asm volatile("EXTR.W %0,$ac1, %1" : "=r" (prod_q16) : "i" (16));
+    return prod_q16;
+}
+
+#undef MULT16_32_P16 /* drop any earlier definition; the DSP version below replaces it */
+static inline int MULT16_32_P16(int a, int b)
+{ /* (a*b) >> 16 with rounding (EXTR_R.W) from accumulator $ac1 */
+    int prod_p16;
+    asm volatile("MULT $ac1, %0, %1" : : "r" (a), "r" (b));
+    asm volatile("EXTR_R.W %0,$ac1, %1" : "=r" (prod_p16) : "i" (16));
+    return prod_p16;
+}
+
+#undef MULT16_32_Q15 /* drop any earlier definition; the DSP version below replaces it */
+static inline int MULT16_32_Q15(int a, int b)
+{ /* (a*b) >> 15, extracted from accumulator $ac1 without rounding */
+    int prod_q15;
+    asm volatile("MULT $ac1, %0, %1" : : "r" (a), "r" (b));
+    asm volatile("EXTR.W %0,$ac1, %1" : "=r" (prod_q15) : "i" (15));
+    return prod_q15;
+}
+
+#undef MULT32_32_Q31 /* drop any earlier definition; the DSP version below replaces it */
+static inline int MULT32_32_Q31(int a, int b)
+{ /* (a*b) >> 31, extracted from accumulator $ac1 without rounding */
+    int prod_q31;
+    asm volatile("MULT $ac1, %0, %1" : : "r" (a), "r" (b));
+    asm volatile("EXTR.W %0,$ac1, %1" : "=r" (prod_q31) : "i" (31));
+    return prod_q31;
+}
+
+#undef PSHR32 /* drop any earlier definition; the DSP version below replaces it */
+static inline int PSHR32(int a, int shift)
+{ /* a >> shift with rounding, done by a single SHRAV_R.W */
+    int rounded;
+    asm volatile ("SHRAV_R.W %0, %1, %2" : "=r" (rounded) : "r" (a), "r" (shift));
+    return rounded;
+}
+
+#undef MULT16_16_P15 /* drop any earlier definition; the DSP version below replaces it */
+static inline int MULT16_16_P15(int a, int b)
+{ /* 32-bit product a*b, then >> 15 with rounding (SHRA_R.W) in place */
+    int prod_p15;
+    asm volatile ("mul %0, %1, %2" : "=r" (prod_p15) : "r" (a), "r" (b));
+    asm volatile ("SHRA_R.W %0, %1, %2" : "+r" (prod_p15) : "0" (prod_p15), "i" (15));
+    return prod_p15;
+}
+
+#endif /* CELT_FIXED_GENERIC_MIPSR1_H */
diff --git a/celt/mips/kiss_fft_mipsr1.h b/celt/mips/kiss_fft_mipsr1.h
new file mode 100644
index 0000000..400ca4d
--- /dev/null
+++ b/celt/mips/kiss_fft_mipsr1.h
@@ -0,0 +1,167 @@
+/*Copyright (c) 2013, Xiph.Org Foundation and contributors.
+
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+       this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright notice,
+       this list of conditions and the following disclaimer in the
+       documentation and/or other materials provided with the distribution.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+  POSSIBILITY OF SUCH DAMAGE.*/
+
+#ifndef KISS_FFT_MIPSR1_H
+#define KISS_FFT_MIPSR1_H
+
+#if !defined(KISS_FFT_GUTS_H)
+#error "This file should only be included from _kiss_fft_guts.h"
+#endif
+
+#ifdef FIXED_POINT
+/* Fixed-point kernels: products are summed in DSP accumulator $ac1, then extracted with a right shift of 15. */
+#undef S_MUL_ADD
+static inline int S_MUL_ADD(int a, int b, int c, int d) {
+    int m;
+    asm volatile("MULT $ac1, %0, %1" : : "r" ((int)a), "r" ((int)b));
+    asm volatile("madd $ac1, %0, %1" : : "r" ((int)c), "r" ((int)d));
+    asm volatile("EXTR.W %0,$ac1, %1" : "=r" (m): "i" (15));
+    return m;
+}
+
+#undef S_MUL_SUB
+static inline int S_MUL_SUB(int a, int b, int c, int d) {
+    int m;
+    asm volatile("MULT $ac1, %0, %1" : : "r" ((int)a), "r" ((int)b));
+    asm volatile("msub $ac1, %0, %1" : : "r" ((int)c), "r" ((int)d));
+    asm volatile("EXTR.W %0,$ac1, %1" : "=r" (m): "i" (15));
+    return m;
+}
+
+#undef C_MUL
+#   define C_MUL(m,a,b) (m=C_MUL_fun(a,b))
+static inline kiss_fft_cpx C_MUL_fun(kiss_fft_cpx a, kiss_twiddle_cpx b) {
+    kiss_fft_cpx m;
+    /* m = a*b: re = a.r*b.r - a.i*b.i, im = a.r*b.i + a.i*b.r, each >> 15 */
+    asm volatile("MULT $ac1, %0, %1" : : "r" ((int)a.r), "r" ((int)b.r));
+    asm volatile("msub $ac1, %0, %1" : : "r" ((int)a.i), "r" ((int)b.i));
+    asm volatile("EXTR.W %0,$ac1, %1" : "=r" (m.r): "i" (15));
+    asm volatile("MULT $ac1, %0, %1" : : "r" ((int)a.r), "r" ((int)b.i));
+    asm volatile("madd $ac1, %0, %1" : : "r" ((int)a.i), "r" ((int)b.r));
+    asm volatile("EXTR.W %0,$ac1, %1" : "=r" (m.i): "i" (15));
+
+    return m;
+}
+#undef C_MULC
+#   define C_MULC(m,a,b) (m=C_MULC_fun(a,b))
+static inline kiss_fft_cpx C_MULC_fun(kiss_fft_cpx a, kiss_twiddle_cpx b) {
+    kiss_fft_cpx m;
+    /* m = a*conj(b): re = a.r*b.r + a.i*b.i, im = a.i*b.r - a.r*b.i, each >> 15 */
+    asm volatile("MULT $ac1, %0, %1" : : "r" ((int)a.r), "r" ((int)b.r));
+    asm volatile("madd $ac1, %0, %1" : : "r" ((int)a.i), "r" ((int)b.i));
+    asm volatile("EXTR.W %0,$ac1, %1" : "=r" (m.r): "i" (15));
+    asm volatile("MULT $ac1, %0, %1" : : "r" ((int)a.i), "r" ((int)b.r));
+    asm volatile("msub $ac1, %0, %1" : : "r" ((int)a.r), "r" ((int)b.i));
+    asm volatile("EXTR.W %0,$ac1, %1" : "=r" (m.i): "i" (15));
+
+    return m;
+}
+
+#else /* !FIXED_POINT: kf_bfly5 below still needs these, so fall back to plain S_MUL arithmetic */
+#define S_MUL_ADD(a, b, c, d) (S_MUL(a,b)+S_MUL(c,d))
+#define S_MUL_SUB(a, b, c, d) (S_MUL(a,b)-S_MUL(c,d))
+#endif /* FIXED_POINT */
+
+#define OVERRIDE_kf_bfly5 /* marks kf_bfly5() as provided by this MIPS header */
+static void kf_bfly5(
+                     kiss_fft_cpx * Fout,
+                     const size_t fstride,
+                     const kiss_fft_state *st,
+                     int m,
+                     int N,
+                     int mm
+                    )
+{ /* In-place radix-5 butterfly: N groups spaced mm apart, each combining 5 inputs spaced m apart, m times. */
+   kiss_fft_cpx *Fout0,*Fout1,*Fout2,*Fout3,*Fout4;
+   int i, u;
+   kiss_fft_cpx scratch[13];
+
+   const kiss_twiddle_cpx *tw;
+   kiss_twiddle_cpx ya,yb;
+   kiss_fft_cpx * Fout_beg = Fout;
+   /* ya, yb: fixed point hard-codes the Q15 values of exp(-2*pi*i/5) and exp(-4*pi*i/5); float reads them from the twiddle table. */
+#ifdef FIXED_POINT
+   ya.r = 10126;
+   ya.i = -31164;
+   yb.r = -26510;
+   yb.i = -19261;
+#else
+   ya = st->twiddles[fstride*m];
+   yb = st->twiddles[fstride*2*m];
+#endif
+
+   tw=st->twiddles;
+
+   for (i=0;i<N;i++)
+   {
+      Fout = Fout_beg + i*mm;
+      Fout0=Fout;
+      Fout1=Fout0+m;
+      Fout2=Fout0+2*m;
+      Fout3=Fout0+3*m;
+      Fout4=Fout0+4*m;
+
+      /* For non-custom modes, m is guaranteed to be a multiple of 4. */
+      for ( u=0; u<m; ++u ) {
+         scratch[0] = *Fout0;
+         /* Keep the untouched first input in scratch[0]: *Fout0 is overwritten
+            below before the other four outputs are formed from it. */
+         C_MUL(scratch[1] ,*Fout1, tw[u*fstride]);
+         C_MUL(scratch[2] ,*Fout2, tw[2*u*fstride]);
+         C_MUL(scratch[3] ,*Fout3, tw[3*u*fstride]);
+         C_MUL(scratch[4] ,*Fout4, tw[4*u*fstride]);
+         /* Sums and differences of the twiddled pairs (1,4) and (2,3). */
+         C_ADD( scratch[7],scratch[1],scratch[4]);
+         C_SUB( scratch[10],scratch[1],scratch[4]);
+         C_ADD( scratch[8],scratch[2],scratch[3]);
+         C_SUB( scratch[9],scratch[2],scratch[3]);
+
+         Fout0->r += scratch[7].r + scratch[8].r;
+         Fout0->i += scratch[7].i + scratch[8].i;
+         scratch[5].r = scratch[0].r + S_MUL_ADD(scratch[7].r,ya.r,scratch[8].r,yb.r);
+         scratch[5].i = scratch[0].i + S_MUL_ADD(scratch[7].i,ya.r,scratch[8].i,yb.r);
+
+         scratch[6].r =  S_MUL_ADD(scratch[10].i,ya.i,scratch[9].i,yb.i);
+         scratch[6].i =  -S_MUL_ADD(scratch[10].r,ya.i,scratch[9].r,yb.i);
+         /* Outputs 1 and 4 are scratch[5] minus/plus scratch[6]. */
+         C_SUB(*Fout1,scratch[5],scratch[6]);
+         C_ADD(*Fout4,scratch[5],scratch[6]);
+
+         scratch[11].r = scratch[0].r + S_MUL_ADD(scratch[7].r,yb.r,scratch[8].r,ya.r);
+         scratch[11].i = scratch[0].i + S_MUL_ADD(scratch[7].i,yb.r,scratch[8].i,ya.r);
+
+         scratch[12].r =  S_MUL_SUB(scratch[9].i,ya.i,scratch[10].i,yb.i);
+         scratch[12].i =  S_MUL_SUB(scratch[10].r,yb.i,scratch[9].r,ya.i);
+         /* Outputs 2 and 3 are scratch[11] plus/minus scratch[12]. */
+         C_ADD(*Fout2,scratch[11],scratch[12]);
+         C_SUB(*Fout3,scratch[11],scratch[12]);
+
+         ++Fout0;++Fout1;++Fout2;++Fout3;++Fout4;
+      }
+   }
+}
+
+
+#endif /* KISS_FFT_MIPSR1_H */
diff --git a/celt/mips/mdct_mipsr1.h b/celt/mips/mdct_mipsr1.h
new file mode 100644
index 0000000..2934dab
--- /dev/null
+++ b/celt/mips/mdct_mipsr1.h
@@ -0,0 +1,288 @@
+/* Copyright (c) 2007-2008 CSIRO
+   Copyright (c) 2007-2008 Xiph.Org Foundation
+   Written by Jean-Marc Valin */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/* This is a simple MDCT implementation that uses a N/4 complex FFT
+   to do most of the work. It should be relatively straightforward to
+   plug in pretty much any FFT here.
+
+   This replaces the Vorbis FFT (and uses the exact same API), which
+   was a bit too messy and that was ending up duplicating code
+   (might as well use the same FFT everywhere).
+
+   The algorithm is similar to (and inspired from) Fabrice Bellard's
+   MDCT implementation in FFMPEG, but has differences in signs, ordering
+   and scaling in many places.
+*/
+#ifndef __MDCT_MIPSR1_H__
+#define __MDCT_MIPSR1_H__
+
+#ifndef SKIP_CONFIG_H
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+#endif
+
+#include "mdct.h"
+#include "kiss_fft.h"
+#include "_kiss_fft_guts.h"
+#include <math.h>
+#include "os_support.h"
+#include "mathops.h"
+#include "stack_alloc.h"
+
+/* Forward MDCT trashes the input array */
+#define OVERRIDE_clt_mdct_forward /* marks the forward MDCT as provided by this MIPS header */
+void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out,
+      const opus_val16 *window, int overlap, int shift, int stride, int arch)
+{ /* Window and fold 'in', pre-rotate, run an N/4 complex FFT, post-rotate into 'out' (written every 'stride' elements). */
+   int i;
+   int N, N2, N4;
+   VARDECL(kiss_fft_scalar, f);
+   VARDECL(kiss_fft_cpx, f2);
+   const kiss_fft_state *st = l->kfft[shift];
+   const kiss_twiddle_scalar *trig;
+   opus_val16 scale;
+#ifdef FIXED_POINT
+   /* Allows us to scale with MULT16_32_Q16(), which is faster than
+      MULT16_32_Q15() on ARM. */
+   int scale_shift = st->scale_shift-1;
+#endif
+   SAVE_STACK;
+
+   /* arch is unused; kept after SAVE_STACK, which may expand to a declaration. */
+   (void)arch;
+   scale = st->scale;
+   /* Halve N once per 'shift' step, advancing trig past each skipped larger-size table. */
+   N = l->n;
+   trig = l->trig;
+   for (i=0;i<shift;i++)
+   {
+      N >>= 1;
+      trig += N;
+   }
+   N2 = N>>1;
+   N4 = N>>2;
+
+   ALLOC(f, N2, kiss_fft_scalar);
+   ALLOC(f2, N4, kiss_fft_cpx);
+
+   /* Consider the input to be composed of four blocks: [a, b, c, d] */
+   /* Window, shuffle, fold */
+   {
+      /* Temp pointers to make it really clear to the compiler what we're doing */
+      const kiss_fft_scalar * OPUS_RESTRICT xp1 = in+(overlap>>1);
+      const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+N2-1+(overlap>>1);
+      kiss_fft_scalar * OPUS_RESTRICT yp = f;
+      const opus_val16 * OPUS_RESTRICT wp1 = window+(overlap>>1);
+      const opus_val16 * OPUS_RESTRICT wp2 = window+(overlap>>1)-1;
+      for(i=0;i<((overlap+3)>>2);i++)
+      {
+         /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/
+         *yp++ = S_MUL_ADD(*wp2, xp1[N2],*wp1,*xp2);
+         *yp++ = S_MUL_SUB(*wp1, *xp1,*wp2, xp2[-N2]);
+         xp1+=2;
+         xp2-=2;
+         wp1+=2;
+         wp2-=2;
+      }
+      wp1 = window;
+      wp2 = window+overlap-1;
+      for(;i<N4-((overlap+3)>>2);i++)
+      {
+         /* Real part arranged as a-bR, Imag part arranged as -c-dR */
+         *yp++ = *xp2;
+         *yp++ = *xp1;
+         xp1+=2;
+         xp2-=2;
+      }
+      for(;i<N4;i++)
+      {
+         /* Real part arranged as a-bR, Imag part arranged as -c-dR */
+         *yp++ = S_MUL_SUB(*wp2, *xp2, *wp1, xp1[-N2]);
+         *yp++ = S_MUL_ADD(*wp2, *xp1, *wp1, xp2[N2]);
+         xp1+=2;
+         xp2-=2;
+         wp1+=2;
+         wp2-=2;
+      }
+   }
+   /* Pre-rotation */
+   {
+      kiss_fft_scalar * OPUS_RESTRICT yp = f;
+      const kiss_twiddle_scalar *t = &trig[0];
+      for(i=0;i<N4;i++)
+      {
+         kiss_fft_cpx yc;
+         kiss_twiddle_scalar t0, t1;
+         kiss_fft_scalar re, im, yr, yi;
+         t0 = t[i];
+         t1 = t[N4+i];
+         re = *yp++;
+         im = *yp++;
+
+         yr = S_MUL_SUB(re,t0,im,t1);
+         yi = S_MUL_ADD(im,t0,re,t1);
+         /* Scale, then store straight into the FFT's bit-reversed input order. */
+         yc.r = yr;
+         yc.i = yi;
+         yc.r = PSHR32(MULT16_32_Q16(scale, yc.r), scale_shift);
+         yc.i = PSHR32(MULT16_32_Q16(scale, yc.i), scale_shift);
+         f2[st->bitrev[i]] = yc;
+      }
+   }
+
+   /* N/4 complex FFT, does not downscale anymore */
+   opus_fft_impl(st, f2);
+
+   /* Post-rotate */
+   {
+      /* Temp pointers to make it really clear to the compiler what we're doing */
+      const kiss_fft_cpx * OPUS_RESTRICT fp = f2;
+      kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
+      kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1);
+      const kiss_twiddle_scalar *t = &trig[0];
+      /* yp1 walks forward from the first output, yp2 backward from the last, 2*stride apart each step. */
+      for(i=0;i<N4;i++)
+      {
+         kiss_fft_scalar yr, yi;
+         yr = S_MUL_SUB(fp->i,t[N4+i] , fp->r,t[i]);
+         yi = S_MUL_ADD(fp->r,t[N4+i] ,fp->i,t[i]);
+         *yp1 = yr;
+         *yp2 = yi;
+         fp++;
+         yp1 += 2*stride;
+         yp2 -= 2*stride;
+      }
+   }
+   RESTORE_STACK;
+}
+
+#define OVERRIDE_clt_mdct_backward /* marks the backward MDCT as provided by this MIPS header */
+void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out,
+      const opus_val16 * OPUS_RESTRICT window, int overlap, int shift, int stride, int arch)
+{ /* Pre-rotate 'in' (read every 'stride' elements) into 'out', FFT in place, post-rotate, then window-mix the first 'overlap' outputs. */
+   int i;
+   int N, N2, N4;
+   const kiss_twiddle_scalar *trig;
+
+    (void)arch;
+   /* Halve N once per 'shift' step, advancing trig past each skipped larger-size table. */
+   N = l->n;
+   trig = l->trig;
+   for (i=0;i<shift;i++)
+   {
+      N >>= 1;
+      trig += N;
+   }
+   N2 = N>>1;
+   N4 = N>>2;
+
+   /* Pre-rotate */
+   {
+      /* Temp pointers to make it really clear to the compiler what we're doing */
+      const kiss_fft_scalar * OPUS_RESTRICT xp1 = in;
+      const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+stride*(N2-1);
+      kiss_fft_scalar * OPUS_RESTRICT yp = out+(overlap>>1);
+      const kiss_twiddle_scalar * OPUS_RESTRICT t = &trig[0];
+      const opus_int16 * OPUS_RESTRICT bitrev = l->kfft[shift]->bitrev;
+      for(i=0;i<N4;i++)
+      {
+         int rev;
+         kiss_fft_scalar yr, yi;
+         rev = *bitrev++;
+         yr = S_MUL_ADD(*xp2, t[i] , *xp1, t[N4+i]);
+         yi = S_MUL_SUB(*xp1, t[i] , *xp2, t[N4+i]);
+         /* We swap real and imag because we use an FFT instead of an IFFT. */
+         yp[2*rev+1] = yr;
+         yp[2*rev] = yi;
+         /* Storing the pre-rotation directly in the bitrev order. */
+         xp1+=2*stride;
+         xp2-=2*stride;
+      }
+   }
+   /* FFT in place on the N4 complex values stored at out+(overlap>>1). */
+   opus_fft_impl(l->kfft[shift], (kiss_fft_cpx*)(out+(overlap>>1)));
+
+   /* Post-rotate and de-shuffle from both ends of the buffer at once to make
+      it in-place. */
+   {
+      kiss_fft_scalar * OPUS_RESTRICT yp0 = out+(overlap>>1);
+      kiss_fft_scalar * OPUS_RESTRICT yp1 = out+(overlap>>1)+N2-2;
+      const kiss_twiddle_scalar *t = &trig[0];
+      /* Loop to (N4+1)>>1 to handle odd N4. When N4 is odd, the
+         middle pair will be computed twice. */
+      for(i=0;i<(N4+1)>>1;i++)
+      {
+         kiss_fft_scalar re, im, yr, yi;
+         kiss_twiddle_scalar t0, t1;
+         /* We swap real and imag because we're using an FFT instead of an IFFT. */
+         re = yp0[1];
+         im = yp0[0];
+         t0 = t[i];
+         t1 = t[N4+i];
+         /* We'd scale up by 2 here, but instead it's done when mixing the windows */
+         yr = S_MUL_ADD(re,t0 , im,t1);
+         yi = S_MUL_SUB(re,t1 , im,t0);
+         /* We swap real and imag because we're using an FFT instead of an IFFT. */
+         re = yp1[1];
+         im = yp1[0];
+         yp0[0] = yr;
+         yp1[1] = yi;
+         /* The back pair was read into re/im just before yp1[1] was overwritten above. */
+         t0 = t[(N4-i-1)];
+         t1 = t[(N2-i-1)];
+         /* We'd scale up by 2 here, but instead it's done when mixing the windows */
+         yr = S_MUL_ADD(re,t0,im,t1);
+         yi = S_MUL_SUB(re,t1,im,t0);
+         yp1[0] = yr;
+         yp0[1] = yi;
+         yp0 += 2;
+         yp1 -= 2;
+      }
+   }
+
+   /* Mirror on both sides for TDAC */
+   {
+      kiss_fft_scalar * OPUS_RESTRICT xp1 = out+overlap-1;
+      kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
+      const opus_val16 * OPUS_RESTRICT wp1 = window;
+      const opus_val16 * OPUS_RESTRICT wp2 = window+overlap-1;
+      /* Walk inward from both ends of out[0..overlap-1], mixing each pair with the window read from both ends. */
+      for(i = 0; i < overlap/2; i++)
+      {
+         kiss_fft_scalar x1, x2;
+         x1 = *xp1;
+         x2 = *yp1;
+         *yp1++ = MULT16_32_Q15(*wp2, x2) - MULT16_32_Q15(*wp1, x1);
+         *xp1-- = MULT16_32_Q15(*wp1, x2) + MULT16_32_Q15(*wp2, x1);
+         wp1++;
+         wp2--;
+      }
+   }
+}
+#endif /* __MDCT_MIPSR1_H__ */
diff --git a/celt/mips/pitch_mipsr1.h b/celt/mips/pitch_mipsr1.h
new file mode 100644
index 0000000..a9500af
--- /dev/null
+++ b/celt/mips/pitch_mipsr1.h
@@ -0,0 +1,161 @@
+/* Copyright (c) 2007-2008 CSIRO
+   Copyright (c) 2007-2009 Xiph.Org Foundation
+   Written by Jean-Marc Valin */
+/**
+   @file pitch_mipsr1.h
+   @brief Pitch analysis
+ */
+
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef PITCH_MIPSR1_H
+#define PITCH_MIPSR1_H
+
+#define OVERRIDE_DUAL_INNER_PROD
+static inline void dual_inner_prod(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
+      int N, opus_val32 *xy1, opus_val32 *xy2, int arch)
+{
+   int j;
+   opus_val32 xy01=0;
+   opus_val32 xy02=0;
+   /* Computes *xy1 = sum(x[j]*y01[j]) and *xy2 = sum(x[j]*y02[j]) for 0<=j<N. */
+   (void)arch;
+   /* Zero both accumulators by multiplying the zero register with itself. */
+   asm volatile("MULT $ac1, $0, $0");
+   asm volatile("MULT $ac2, $0, $0");
+   /* Unrolled by two: $ac1 accumulates x*y01 and $ac2 accumulates x*y02. */
+   for (j=0;j<N;j++)
+   {
+      asm volatile("MADD $ac1, %0, %1" : : "r" ((int)x[j]), "r" ((int)y01[j]));
+      asm volatile("MADD $ac2, %0, %1" : : "r" ((int)x[j]), "r" ((int)y02[j]));
+      if (++j>=N) break; /* odd N: never read x[N], y01[N] or y02[N] */
+      asm volatile("MADD $ac1, %0, %1" : : "r" ((int)x[j]), "r" ((int)y01[j]));
+      asm volatile("MADD $ac2, %0, %1" : : "r" ((int)x[j]), "r" ((int)y02[j]));
+   }
+   asm volatile ("mflo %0, $ac1": "=r"(xy01)); /* low word of each accumulator */
+   asm volatile ("mflo %0, $ac2": "=r"(xy02));
+   *xy1 = xy01;
+   *xy2 = xy02;
+}
+
+static inline void xcorr_kernel_mips(const opus_val16 * x,
+      const opus_val16 * y, opus_val32 sum[4], int len)
+{
+   int j;
+   opus_val16 y_0, y_1, y_2, y_3;
+   /* One 64-bit accumulator per output lag, seeded from the caller's sum[]. */
+   opus_int64 acc_0, acc_1, acc_2, acc_3;
+   acc_0 = (opus_int64)sum[0];
+   acc_1 = (opus_int64)sum[1];
+   acc_2 = (opus_int64)sum[2];
+   acc_3 = (opus_int64)sum[3];
+
+   y_3 = 0; /* only here so gcc does not warn about an uninitialized use */
+   y_0 = *y++;
+   y_1 = *y++;
+   y_2 = *y++;
+   for (j=0;j<len-3;j+=4)
+   {
+      /* Four x samples per pass; y_0..y_3 are reused in rotation. */
+      opus_val16 xj = *x++;
+      y_3 = *y++;
+
+      acc_0 = __builtin_mips_madd(acc_0, xj, y_0);
+      acc_1 = __builtin_mips_madd(acc_1, xj, y_1);
+      acc_2 = __builtin_mips_madd(acc_2, xj, y_2);
+      acc_3 = __builtin_mips_madd(acc_3, xj, y_3);
+
+      xj = *x++;
+      y_0 = *y++;
+
+      acc_0 = __builtin_mips_madd(acc_0, xj, y_1);
+      acc_1 = __builtin_mips_madd(acc_1, xj, y_2);
+      acc_2 = __builtin_mips_madd(acc_2, xj, y_3);
+      acc_3 = __builtin_mips_madd(acc_3, xj, y_0);
+
+      xj = *x++;
+      y_1 = *y++;
+
+      acc_0 = __builtin_mips_madd(acc_0, xj, y_2);
+      acc_1 = __builtin_mips_madd(acc_1, xj, y_3);
+      acc_2 = __builtin_mips_madd(acc_2, xj, y_0);
+      acc_3 = __builtin_mips_madd(acc_3, xj, y_1);
+
+      /* Fourth x sample of this pass. */
+      xj = *x++;
+      y_2 = *y++;
+
+      acc_0 = __builtin_mips_madd(acc_0, xj, y_3);
+      acc_1 = __builtin_mips_madd(acc_1, xj, y_0);
+      acc_2 = __builtin_mips_madd(acc_2, xj, y_1);
+      acc_3 = __builtin_mips_madd(acc_3, xj, y_2);
+   }
+   /* Up to three samples can remain; each test below consumes at most one. */
+   if (j++<len)
+   {
+      opus_val16 xj = *x++;
+      y_3 = *y++;
+
+      acc_0 = __builtin_mips_madd(acc_0, xj, y_0);
+      acc_1 = __builtin_mips_madd(acc_1, xj, y_1);
+      acc_2 = __builtin_mips_madd(acc_2, xj, y_2);
+      acc_3 = __builtin_mips_madd(acc_3, xj, y_3);
+   }
+
+   if (j++<len)
+   {
+      opus_val16 xj = *x++;
+      y_0 = *y++;
+
+      acc_0 = __builtin_mips_madd(acc_0, xj, y_1);
+      acc_1 = __builtin_mips_madd(acc_1, xj, y_2);
+      acc_2 = __builtin_mips_madd(acc_2, xj, y_3);
+      acc_3 = __builtin_mips_madd(acc_3, xj, y_0);
+   }
+
+   if (j<len)
+   {
+      opus_val16 xj = *x++;
+      y_1 = *y++;
+
+      acc_0 = __builtin_mips_madd(acc_0, xj, y_2);
+      acc_1 = __builtin_mips_madd(acc_1, xj, y_3);
+      acc_2 = __builtin_mips_madd(acc_2, xj, y_0);
+      acc_3 = __builtin_mips_madd(acc_3, xj, y_1);
+
+   }
+
+   sum[0] = (opus_val32)acc_0;
+   sum[1] = (opus_val32)acc_1;
+   sum[2] = (opus_val32)acc_2;
+   sum[3] = (opus_val32)acc_3;
+}
+
+#define OVERRIDE_XCORR_KERNEL
+#define xcorr_kernel(x, y, sum, len, arch) \
+    ((void)(arch), xcorr_kernel_mips(x, y, sum, len))
+
+#endif /* PITCH_MIPSR1_H */
diff --git a/celt/mips/vq_mipsr1.h b/celt/mips/vq_mipsr1.h
new file mode 100644
index 0000000..54cef86
--- /dev/null
+++ b/celt/mips/vq_mipsr1.h
@@ -0,0 +1,125 @@
+/* Copyright (c) 2007-2008 CSIRO
+   Copyright (c) 2007-2009 Xiph.Org Foundation
+   Written by Jean-Marc Valin */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef __VQ_MIPSR1_H__
+#define __VQ_MIPSR1_H__
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "mathops.h"
+#include "arch.h"
+
+static unsigned extract_collapse_mask(int *iy, int N, int B);
+static void normalise_residual(int * OPUS_RESTRICT iy, celt_norm * OPUS_RESTRICT X, int N, opus_val32 Ryy, opus_val16 gain);
+static void exp_rotation(celt_norm *X, int len, int dir, int stride, int K, int spread);
+static void renormalise_vector_mips(celt_norm *X, int N, opus_val16 gain, int arch);
+
+#define OVERRIDE_vq_exp_rotation1
+static void exp_rotation1(celt_norm *X, int len, int stride, opus_val16 c, opus_val16 s)
+{
+   int i;
+   opus_val16 ms;
+   /* Each pair (X[i], X[i+stride]) becomes (c*x1 - s*x2, c*x2 + s*x1), scaled
+      back with PSHR32(., 15): one forward sweep, then one backward sweep. */
+   ms = NEG16(s);
+   for (i=0;i<len-stride;i++)
+   {
+      celt_norm x1, x2;
+      x1 = X[i];
+      x2 = X[i+stride];
+      X[i+stride] = EXTRACT16(PSHR32(MAC16_16(MULT16_16(c, x2),  s, x1), 15));
+      X[i]        = EXTRACT16(PSHR32(MAC16_16(MULT16_16(c, x1), ms, x2), 15));
+   }
+   /* Backward sweep from index len-2*stride-1 down to 0. */
+   for (i=len-2*stride-1;i>=0;i--)
+   {
+      celt_norm x1, x2;
+      x1 = X[i];
+      x2 = X[i+stride];
+      X[i+stride] = EXTRACT16(PSHR32(MAC16_16(MULT16_16(c, x2),  s, x1), 15));
+      X[i]        = EXTRACT16(PSHR32(MAC16_16(MULT16_16(c, x1), ms, x2), 15));
+   }
+}
+
+#define OVERRIDE_renormalise_vector
+/* Route renormalise_vector() to the MIPS implementation defined below. */
+#define renormalise_vector(X, N, gain, arch) \
+ (renormalise_vector_mips(X, N, gain, arch))
+/* Rescales X[0..N-1] in place from the energy of X and gain; arch is unused. */
+void renormalise_vector_mips(celt_norm *X, int N, opus_val16 gain, int arch)
+{
+   int i;
+#ifdef FIXED_POINT
+   int k;
+#endif
+   opus_val32 E = EPSILON;
+   opus_val16 g;
+   opus_val32 t;
+   celt_norm *xptr = X;
+   int X0, X1;
+
+   (void)arch;
+   /* Clear accumulator $ac1, then preload its low word with E (= EPSILON). */
+   asm volatile("mult $ac1, $0, $0");
+   asm volatile("MTLO %0, $ac1" : :"r" (E));
+   /* Accumulate X[i]*X[i] into $ac1, two samples per iteration; the remaining
+      one or two samples are handled by the second loop. */
+   for (i=0;i<N-2;i+=2)
+   {
+      X0 = (int)*xptr++;
+      asm volatile("MADD $ac1, %0, %1" : : "r" (X0), "r" (X0));
+
+      X1 = (int)*xptr++;
+      asm volatile("MADD $ac1, %0, %1" : : "r" (X1), "r" (X1));
+   }
+   /* Remaining samples, one per iteration. */
+   for (;i<N;i++)
+   {
+      X0 = (int)*xptr++;
+      asm volatile("MADD $ac1, %0, %1" : : "r" (X0), "r" (X0));
+   }
+   /* Read the low 32 bits of the accumulated energy back into E. */
+   asm volatile("MFLO %0, $ac1" : "=r" (E));
+#ifdef FIXED_POINT
+   k = celt_ilog2(E)>>1;
+#endif
+   t = VSHR32(E, 2*(k-7));
+   g = MULT16_16_P15(celt_rsqrt_norm(t),gain);
+   /* NOTE(review): k exists only under FIXED_POINT yet is used here - confirm. */
+   xptr = X;
+   for (i=0;i<N;i++)
+   {
+      *xptr = EXTRACT16(PSHR32(MULT16_16(g, *xptr), k+1));
+      xptr++;
+   }
+   /* Nothing is returned: the result is X itself, rescaled in place. */
+}
+
+#endif /* __VQ_MIPSR1_H__ */
diff --git a/celt/modes.c b/celt/modes.c
index 42e68e1..911686e 100644
--- a/celt/modes.c
+++ b/celt/modes.c
@@ -37,6 +37,7 @@
 #include "os_support.h"
 #include "stack_alloc.h"
 #include "quant_bands.h"
+#include "cpu_support.h"
 
 static const opus_int16 eband5ms[] = {
 /*0  200 400 600 800  1k 1.2 1.4 1.6  2k 2.4 2.8 3.2  4k 4.8 5.6 6.8  8k 9.6 12k 15.6 */
@@ -229,6 +230,7 @@
    opus_val16 *window;
    opus_int16 *logN;
    int LM;
+   int arch = opus_select_arch();
    ALLOC_STACK;
 #if !defined(VAR_ARRAYS) && !defined(USE_ALLOCA)
    if (global_stack==NULL)
@@ -389,7 +391,7 @@
    compute_pulse_cache(mode, mode->maxLM);
 
    if (clt_mdct_init(&mode->mdct, 2*mode->shortMdctSize*mode->nbShortMdcts,
-           mode->maxLM) == 0)
+           mode->maxLM, arch) == 0)
       goto failure;
 
    if (error)
@@ -408,6 +410,8 @@
 #ifdef CUSTOM_MODES
 void opus_custom_mode_destroy(CELTMode *mode)
 {
+   int arch = opus_select_arch();
+
    if (mode == NULL)
       return;
 #ifndef CUSTOM_MODES_ONLY
@@ -431,7 +435,7 @@
    opus_free((opus_int16*)mode->cache.index);
    opus_free((unsigned char*)mode->cache.bits);
    opus_free((unsigned char*)mode->cache.caps);
-   clt_mdct_clear(&mode->mdct);
+   clt_mdct_clear(&mode->mdct, arch);
 
    opus_free((CELTMode *)mode);
 }
diff --git a/celt/modes.h b/celt/modes.h
index c8340f9..be813cc 100644
--- a/celt/modes.h
+++ b/celt/modes.h
@@ -39,14 +39,6 @@
 
 #define MAX_PERIOD 1024
 
-#ifndef OVERLAP
-#define OVERLAP(mode) ((mode)->overlap)
-#endif
-
-#ifndef FRAMESIZE
-#define FRAMESIZE(mode) ((mode)->mdctSize)
-#endif
-
 typedef struct {
    int size;
    const opus_int16 *index;
diff --git a/celt/os_support.h b/celt/os_support.h
index 5e47e3c..a217197 100644
--- a/celt/os_support.h
+++ b/celt/os_support.h
@@ -67,18 +67,18 @@
 }
 #endif
 
-/** Copy n bytes of memory from src to dst. The 0* term provides compile-time type checking  */
+/** Copy n elements from src to dst. The 0* term provides compile-time type checking  */
 #ifndef OVERRIDE_OPUS_COPY
 #define OPUS_COPY(dst, src, n) (memcpy((dst), (src), (n)*sizeof(*(dst)) + 0*((dst)-(src)) ))
 #endif
 
-/** Copy n bytes of memory from src to dst, allowing overlapping regions. The 0* term
+/** Copy n elements from src to dst, allowing overlapping regions. The 0* term
     provides compile-time type checking */
 #ifndef OVERRIDE_OPUS_MOVE
 #define OPUS_MOVE(dst, src, n) (memmove((dst), (src), (n)*sizeof(*(dst)) + 0*((dst)-(src)) ))
 #endif
 
-/** Set n elements of dst to zero, starting at address s */
+/** Set n elements of dst to zero */
 #ifndef OVERRIDE_OPUS_CLEAR
 #define OPUS_CLEAR(dst, n) (memset((dst), 0, (n)*sizeof(*(dst))))
 #endif
diff --git a/celt/pitch.c b/celt/pitch.c
index d2b3054..1d89cb0 100644
--- a/celt/pitch.c
+++ b/celt/pitch.c
@@ -214,25 +214,35 @@
    celt_fir5(x_lp, lpc2, x_lp, len>>1, mem);
 }
 
-#if 0 /* This is a simple version of the pitch correlation that should work
-         well on DSPs like Blackfin and TI C5x/C6x */
-
+/* Pure C implementation. */
 #ifdef FIXED_POINT
 opus_val32
 #else
 void
 #endif
-celt_pitch_xcorr(opus_val16 *x, opus_val16 *y, opus_val32 *xcorr, int len, int max_pitch)
+#if defined(OVERRIDE_PITCH_XCORR)
+celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y,
+      opus_val32 *xcorr, int len, int max_pitch)
+#else
+celt_pitch_xcorr(const opus_val16 *_x, const opus_val16 *_y,
+      opus_val32 *xcorr, int len, int max_pitch, int arch)
+#endif
 {
+
+#if 0 /* This is a simple version of the pitch correlation that should work
+         well on DSPs like Blackfin and TI C5x/C6x */
    int i, j;
 #ifdef FIXED_POINT
    opus_val32 maxcorr=1;
 #endif
+#if !defined(OVERRIDE_PITCH_XCORR)
+   (void)arch;
+#endif
    for (i=0;i<max_pitch;i++)
    {
       opus_val32 sum = 0;
       for (j=0;j<len;j++)
-         sum = MAC16_16(sum, x[j],y[i+j]);
+         sum = MAC16_16(sum, _x[j], _y[i+j]);
       xcorr[i] = sum;
 #ifdef FIXED_POINT
       maxcorr = MAX32(maxcorr, sum);
@@ -241,30 +251,25 @@
 #ifdef FIXED_POINT
    return maxcorr;
 #endif
-}
 
 #else /* Unrolled version of the pitch correlation -- runs faster on x86 and ARM */
-
-#ifdef FIXED_POINT
-opus_val32
-#else
-void
-#endif
-celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y, opus_val32 *xcorr, int len, int max_pitch)
-{
-   int i,j;
+   int i;
    /*The EDSP version requires that max_pitch is at least 1, and that _x is
       32-bit aligned.
      Since it's hard to put asserts in assembly, put them here.*/
-   celt_assert(max_pitch>0);
-   celt_assert((((unsigned char *)_x-(unsigned char *)NULL)&3)==0);
 #ifdef FIXED_POINT
    opus_val32 maxcorr=1;
 #endif
+   celt_assert(max_pitch>0);
+   celt_assert((((unsigned char *)_x-(unsigned char *)NULL)&3)==0);
    for (i=0;i<max_pitch-3;i+=4)
    {
       opus_val32 sum[4]={0,0,0,0};
-      xcorr_kernel(_x, _y+i, sum, len);
+#if defined(OVERRIDE_PITCH_XCORR)
+      xcorr_kernel_c(_x, _y+i, sum, len);
+#else
+      xcorr_kernel(_x, _y+i, sum, len, arch);
+#endif
       xcorr[i]=sum[0];
       xcorr[i+1]=sum[1];
       xcorr[i+2]=sum[2];
@@ -279,9 +284,12 @@
    /* In case max_pitch isn't a multiple of 4, do non-unrolled version. */
    for (;i<max_pitch;i++)
    {
-      opus_val32 sum = 0;
-      for (j=0;j<len;j++)
-         sum = MAC16_16(sum, _x[j],_y[i+j]);
+      opus_val32 sum;
+#if defined(OVERRIDE_PITCH_XCORR)
+      sum = celt_inner_prod_c(_x, _y+i, len);
+#else
+      sum = celt_inner_prod(_x, _y+i, len, arch);
+#endif
       xcorr[i] = sum;
 #ifdef FIXED_POINT
       maxcorr = MAX32(maxcorr, sum);
@@ -290,9 +298,9 @@
 #ifdef FIXED_POINT
    return maxcorr;
 #endif
+#endif
 }
 
-#endif
 void pitch_search(const opus_val16 * OPUS_RESTRICT x_lp, opus_val16 * OPUS_RESTRICT y,
                   int len, int max_pitch, int *pitch, int arch)
 {
@@ -361,12 +369,17 @@
 #endif
    for (i=0;i<max_pitch>>1;i++)
    {
-      opus_val32 sum=0;
+      opus_val32 sum;
       xcorr[i] = 0;
       if (abs(i-2*best_pitch[0])>2 && abs(i-2*best_pitch[1])>2)
          continue;
+#ifdef FIXED_POINT
+      sum = 0;
       for (j=0;j<len>>1;j++)
          sum += SHR32(MULT16_16(x_lp[j],y[i+j]), shift);
+#else
+      sum = celt_inner_prod_c(x_lp, y+i, len>>1);
+#endif
       xcorr[i] = MAX32(-1, sum);
 #ifdef FIXED_POINT
       maxcorr = MAX32(maxcorr, sum);
@@ -401,7 +414,7 @@
 
 static const int second_check[16] = {0, 0, 3, 2, 3, 2, 5, 2, 3, 2, 3, 2, 5, 2, 3, 2};
 opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod,
-      int N, int *T0_, int prev_period, opus_val16 prev_gain)
+      int N, int *T0_, int prev_period, opus_val16 prev_gain, int arch)
 {
    int k, i, T, T0;
    opus_val16 g, g0;
@@ -426,7 +439,7 @@
 
    T = T0 = *T0_;
    ALLOC(yy_lookup, maxperiod+1, opus_val32);
-   dual_inner_prod(x, x, x-T0, N, &xx, &xy);
+   dual_inner_prod(x, x, x-T0, N, &xx, &xy, arch);
    yy_lookup[0] = xx;
    yy=xx;
    for (i=1;i<=maxperiod;i++)
@@ -456,7 +469,7 @@
       opus_val16 g1;
       opus_val16 cont=0;
       opus_val16 thresh;
-      T1 = (2*T0+k)/(2*k);
+      T1 = celt_udiv(2*T0+k, 2*k);
       if (T1 < minperiod)
          break;
       /* Look for another strong correlation at T1b */
@@ -468,9 +481,9 @@
             T1b = T0+T1;
       } else
       {
-         T1b = (2*second_check[k]*T0+k)/(2*k);
+         T1b = celt_udiv(2*second_check[k]*T0+k, 2*k);
       }
-      dual_inner_prod(x, &x[-T1], &x[-T1b], N, &xy, &xy2);
+      dual_inner_prod(x, &x[-T1], &x[-T1b], N, &xy, &xy2, arch);
       xy += xy2;
       yy = yy_lookup[T1] + yy_lookup[T1b];
 #ifdef FIXED_POINT
@@ -513,13 +526,7 @@
       pg = SHR32(frac_div32(best_xy,best_yy+1),16);
 
    for (k=0;k<3;k++)
-   {
-      int T1 = T+k-1;
-      xy = 0;
-      for (i=0;i<N;i++)
-         xy = MAC16_16(xy, x[i], x[i-T1]);
-      xcorr[k] = xy;
-   }
+      xcorr[k] = celt_inner_prod(x, x-(T+k-1), N, arch);
    if ((xcorr[2]-xcorr[0]) > MULT16_32_Q15(QCONST16(.7f,15),xcorr[1]-xcorr[0]))
       offset = 1;
    else if ((xcorr[0]-xcorr[2]) > MULT16_32_Q15(QCONST16(.7f,15),xcorr[1]-xcorr[2]))
diff --git a/celt/pitch.h b/celt/pitch.h
index df317ec..65a77a6 100644
--- a/celt/pitch.h
+++ b/celt/pitch.h
@@ -37,11 +37,17 @@
 #include "modes.h"
 #include "cpu_support.h"
 
-#if defined(__SSE__) && !defined(FIXED_POINT)
+#if (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)) \
+  || ((defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)) && defined(FIXED_POINT))
 #include "x86/pitch_sse.h"
 #endif
 
-#if defined(OPUS_ARM_ASM) && defined(FIXED_POINT)
+#if defined(MIPSr1_ASM)
+#include "mips/pitch_mipsr1.h"
+#endif
+
+#if ((defined(OPUS_ARM_ASM) && defined(FIXED_POINT)) \
+  || defined(OPUS_ARM_MAY_HAVE_NEON_INTR))
 # include "arm/pitch_arm.h"
 #endif
 
@@ -52,12 +58,12 @@
                   int len, int max_pitch, int *pitch, int arch);
 
 opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod,
-      int N, int *T0, int prev_period, opus_val16 prev_gain);
+      int N, int *T0, int prev_period, opus_val16 prev_gain, int arch);
+
 
 /* OPT: This is the kernel you really want to optimize. It gets used a lot
    by the prefilter and by the PLC. */
-#ifndef OVERRIDE_XCORR_KERNEL
-static OPUS_INLINE void xcorr_kernel(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[4], int len)
+static OPUS_INLINE void xcorr_kernel_c(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[4], int len)
 {
    int j;
    opus_val16 y_0, y_1, y_2, y_3;
@@ -122,10 +128,14 @@
       sum[3] = MAC16_16(sum[3],tmp,y_1);
    }
 }
+
+#ifndef OVERRIDE_XCORR_KERNEL
+#define xcorr_kernel(x, y, sum, len, arch) \
+    ((void)(arch),xcorr_kernel_c(x, y, sum, len))
 #endif /* OVERRIDE_XCORR_KERNEL */
 
-#ifndef OVERRIDE_DUAL_INNER_PROD
-static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
+
+static OPUS_INLINE void dual_inner_prod_c(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
       int N, opus_val32 *xy1, opus_val32 *xy2)
 {
    int i;
@@ -139,8 +149,35 @@
    *xy1 = xy01;
    *xy2 = xy02;
 }
+
+#ifndef OVERRIDE_DUAL_INNER_PROD
+# define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) \
+    ((void)(arch),dual_inner_prod_c(x, y01, y02, N, xy1, xy2))
 #endif
 
+/*Plain C inner product of x and y over N samples. It is always compiled, so
+  callers can use it when vectorization and arch dispatch are not worthwhile.*/
+static OPUS_INLINE opus_val32 celt_inner_prod_c(const opus_val16 *x,
+      const opus_val16 *y, int N)
+{
+   int n=0;
+   opus_val32 acc=0;
+   for (;n<N;n++)
+      acc = MAC16_16(acc, x[n], y[n]);
+   return acc;
+}
+
+#if !defined(OVERRIDE_CELT_INNER_PROD)
+# define celt_inner_prod(x, y, N, arch) \
+    ((void)(arch),celt_inner_prod_c(x, y, N))
+#endif
+
+#ifdef NON_STATIC_COMB_FILTER_CONST_C
+void comb_filter_const_c(opus_val32 *y, opus_val32 *x, int T, int N,
+     opus_val16 g10, opus_val16 g11, opus_val16 g12);
+#endif
+
+
 #ifdef FIXED_POINT
 opus_val32
 #else
@@ -151,7 +188,9 @@
 
 #if !defined(OVERRIDE_PITCH_XCORR)
 /*Is run-time CPU detection enabled on this platform?*/
-# if defined(OPUS_HAVE_RTCD)
+# if defined(OPUS_HAVE_RTCD) && (defined(OPUS_ARM_ASM) \
+   || (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) \
+   && !defined(OPUS_ARM_PRESUME_NEON_INTR)))
 extern
 #  if defined(FIXED_POINT)
 opus_val32
@@ -161,12 +200,20 @@
 (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
       const opus_val16 *, opus_val32 *, int, int);
 
+#  define OVERRIDE_PITCH_XCORR
 #  define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
   ((*CELT_PITCH_XCORR_IMPL[(arch)&OPUS_ARCHMASK])(_x, _y, \
         xcorr, len, max_pitch))
 # else
-#  define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
-  ((void)(arch),celt_pitch_xcorr_c(_x, _y, xcorr, len, max_pitch))
+
+#ifdef FIXED_POINT
+opus_val32
+#else
+void
+#endif
+celt_pitch_xcorr(const opus_val16 *_x, const opus_val16 *_y,
+      opus_val32 *xcorr, int len, int max_pitch, int arch);
+
 # endif
 #endif
 
diff --git a/celt/quant_bands.c b/celt/quant_bands.c
index ac6952c..95076e0 100644
--- a/celt/quant_bands.c
+++ b/celt/quant_bands.c
@@ -292,7 +292,7 @@
 #endif
    }
    if (lfe)
-      max_decay=3;
+      max_decay = QCONST16(3.f,DB_SHIFT);
    enc_start_state = *enc;
 
    ALLOC(oldEBands_intra, C*m->nbEBands, opus_val16);
diff --git a/celt/rate.c b/celt/rate.c
index e13d839..b28d8fe 100644
--- a/celt/rate.c
+++ b/celt/rate.c
@@ -131,7 +131,7 @@
    for (i=0;i<nbEntries;i++)
    {
       unsigned char *ptr = bits+entryI[i];
-      opus_int16 tmp[MAX_PULSES+1];
+      opus_int16 tmp[CELT_MAX_PULSES+1];
       get_required_bits(tmp, entryN[i], get_pulses(entryK[i]), BITRES);
       for (j=1;j<=entryK[i];j++)
          ptr[j] = tmp[get_pulses(j)]-1;
@@ -333,7 +333,7 @@
       /*Figure out how many left-over bits we would be adding to this band.
         This can include bits we've stolen back from higher, skipped bands.*/
       left = total-psum;
-      percoeff = left/(m->eBands[codedBands]-m->eBands[start]);
+      percoeff = celt_udiv(left, m->eBands[codedBands]-m->eBands[start]);
       left -= (m->eBands[codedBands]-m->eBands[start])*percoeff;
       rem = IMAX(left-(m->eBands[j]-m->eBands[start]),0);
       band_width = m->eBands[codedBands]-m->eBands[j];
@@ -414,7 +414,7 @@
 
    /* Allocate the remaining bits */
    left = total-psum;
-   percoeff = left/(m->eBands[codedBands]-m->eBands[start]);
+   percoeff = celt_udiv(left, m->eBands[codedBands]-m->eBands[start]);
    left -= (m->eBands[codedBands]-m->eBands[start])*percoeff;
    for (j=start;j<codedBands;j++)
       bits[j] += ((int)percoeff*(m->eBands[j+1]-m->eBands[j]));
@@ -465,7 +465,8 @@
             offset += NClogN>>3;
 
          /* Divide with rounding */
-         ebits[j] = IMAX(0, (bits[j] + offset + (den<<(BITRES-1))) / (den<<BITRES));
+         ebits[j] = IMAX(0, (bits[j] + offset + (den<<(BITRES-1))));
+         ebits[j] = celt_udiv(ebits[j], den)>>BITRES;
 
          /* Make sure not to bust */
          if (C*ebits[j] > (bits[j]>>BITRES))
diff --git a/celt/rate.h b/celt/rate.h
index f1e0661..515f768 100644
--- a/celt/rate.h
+++ b/celt/rate.h
@@ -32,7 +32,7 @@
 #define MAX_PSEUDO 40
 #define LOG_MAX_PSEUDO 6
 
-#define MAX_PULSES 128
+#define CELT_MAX_PULSES 128
 
 #define MAX_FINE_BITS 8
 
diff --git a/celt/stack_alloc.h b/celt/stack_alloc.h
index 316a6ce..2b51c8d 100644
--- a/celt/stack_alloc.h
+++ b/celt/stack_alloc.h
@@ -116,9 +116,11 @@
 #else
 
 #ifdef CELT_C
+char *scratch_ptr=0;
 char *global_stack=0;
 #else
 extern char *global_stack;
+extern char *scratch_ptr;
 #endif /* CELT_C */
 
 #ifdef ENABLE_VALGRIND
@@ -140,8 +142,12 @@
 
 #define ALIGN(stack, size) ((stack) += ((size) - (long)(stack)) & ((size) - 1))
 #define PUSH(stack, size, type) (ALIGN((stack),sizeof(type)/sizeof(char)),(stack)+=(size)*(sizeof(type)/sizeof(char)),(type*)((stack)-(size)*(sizeof(type)/sizeof(char))))
+#if 0 /* Set this to 1 to instrument pseudostack usage */
+#define RESTORE_STACK (printf("%ld %s:%d\n", global_stack-scratch_ptr, __FILE__, __LINE__),global_stack = _saved_stack)
+#else
 #define RESTORE_STACK (global_stack = _saved_stack)
-#define ALLOC_STACK char *_saved_stack; (global_stack = (global_stack==0) ? opus_alloc_scratch(GLOBAL_STACK_SIZE) : global_stack); _saved_stack = global_stack;
+#endif
+#define ALLOC_STACK char *_saved_stack; (global_stack = (global_stack==0) ? (scratch_ptr=opus_alloc_scratch(GLOBAL_STACK_SIZE)) : global_stack); _saved_stack = global_stack;
 
 #endif /* ENABLE_VALGRIND */
 
diff --git a/celt/static_modes_fixed.h b/celt/static_modes_fixed.h
index 216df9e..8717d62 100644
--- a/celt/static_modes_fixed.h
+++ b/celt/static_modes_fixed.h
@@ -4,6 +4,11 @@
 #include "modes.h"
 #include "rate.h"
 
+#ifdef HAVE_ARM_NE10
+#define OVERRIDE_FFT 1
+#include "static_modes_fixed_arm_ne10.h"
+#endif
+
 #ifndef DEF_WINDOW120
 #define DEF_WINDOW120
 static const opus_val16 window120[120] = {
@@ -341,84 +346,84 @@
 #ifndef FFT_BITREV480
 #define FFT_BITREV480
 static const opus_int16 fft_bitrev480[480] = {
-0, 120, 240, 360, 30, 150, 270, 390, 60, 180, 300, 420, 90, 210, 330,
-450, 15, 135, 255, 375, 45, 165, 285, 405, 75, 195, 315, 435, 105, 225,
-345, 465, 5, 125, 245, 365, 35, 155, 275, 395, 65, 185, 305, 425, 95,
-215, 335, 455, 20, 140, 260, 380, 50, 170, 290, 410, 80, 200, 320, 440,
-110, 230, 350, 470, 10, 130, 250, 370, 40, 160, 280, 400, 70, 190, 310,
-430, 100, 220, 340, 460, 25, 145, 265, 385, 55, 175, 295, 415, 85, 205,
-325, 445, 115, 235, 355, 475, 1, 121, 241, 361, 31, 151, 271, 391, 61,
-181, 301, 421, 91, 211, 331, 451, 16, 136, 256, 376, 46, 166, 286, 406,
-76, 196, 316, 436, 106, 226, 346, 466, 6, 126, 246, 366, 36, 156, 276,
-396, 66, 186, 306, 426, 96, 216, 336, 456, 21, 141, 261, 381, 51, 171,
-291, 411, 81, 201, 321, 441, 111, 231, 351, 471, 11, 131, 251, 371, 41,
-161, 281, 401, 71, 191, 311, 431, 101, 221, 341, 461, 26, 146, 266, 386,
-56, 176, 296, 416, 86, 206, 326, 446, 116, 236, 356, 476, 2, 122, 242,
-362, 32, 152, 272, 392, 62, 182, 302, 422, 92, 212, 332, 452, 17, 137,
-257, 377, 47, 167, 287, 407, 77, 197, 317, 437, 107, 227, 347, 467, 7,
-127, 247, 367, 37, 157, 277, 397, 67, 187, 307, 427, 97, 217, 337, 457,
-22, 142, 262, 382, 52, 172, 292, 412, 82, 202, 322, 442, 112, 232, 352,
-472, 12, 132, 252, 372, 42, 162, 282, 402, 72, 192, 312, 432, 102, 222,
-342, 462, 27, 147, 267, 387, 57, 177, 297, 417, 87, 207, 327, 447, 117,
-237, 357, 477, 3, 123, 243, 363, 33, 153, 273, 393, 63, 183, 303, 423,
-93, 213, 333, 453, 18, 138, 258, 378, 48, 168, 288, 408, 78, 198, 318,
-438, 108, 228, 348, 468, 8, 128, 248, 368, 38, 158, 278, 398, 68, 188,
-308, 428, 98, 218, 338, 458, 23, 143, 263, 383, 53, 173, 293, 413, 83,
-203, 323, 443, 113, 233, 353, 473, 13, 133, 253, 373, 43, 163, 283, 403,
-73, 193, 313, 433, 103, 223, 343, 463, 28, 148, 268, 388, 58, 178, 298,
-418, 88, 208, 328, 448, 118, 238, 358, 478, 4, 124, 244, 364, 34, 154,
-274, 394, 64, 184, 304, 424, 94, 214, 334, 454, 19, 139, 259, 379, 49,
-169, 289, 409, 79, 199, 319, 439, 109, 229, 349, 469, 9, 129, 249, 369,
-39, 159, 279, 399, 69, 189, 309, 429, 99, 219, 339, 459, 24, 144, 264,
-384, 54, 174, 294, 414, 84, 204, 324, 444, 114, 234, 354, 474, 14, 134,
-254, 374, 44, 164, 284, 404, 74, 194, 314, 434, 104, 224, 344, 464, 29,
-149, 269, 389, 59, 179, 299, 419, 89, 209, 329, 449, 119, 239, 359, 479,
+0, 96, 192, 288, 384, 32, 128, 224, 320, 416, 64, 160, 256, 352, 448,
+8, 104, 200, 296, 392, 40, 136, 232, 328, 424, 72, 168, 264, 360, 456,
+16, 112, 208, 304, 400, 48, 144, 240, 336, 432, 80, 176, 272, 368, 464,
+24, 120, 216, 312, 408, 56, 152, 248, 344, 440, 88, 184, 280, 376, 472,
+4, 100, 196, 292, 388, 36, 132, 228, 324, 420, 68, 164, 260, 356, 452,
+12, 108, 204, 300, 396, 44, 140, 236, 332, 428, 76, 172, 268, 364, 460,
+20, 116, 212, 308, 404, 52, 148, 244, 340, 436, 84, 180, 276, 372, 468,
+28, 124, 220, 316, 412, 60, 156, 252, 348, 444, 92, 188, 284, 380, 476,
+1, 97, 193, 289, 385, 33, 129, 225, 321, 417, 65, 161, 257, 353, 449,
+9, 105, 201, 297, 393, 41, 137, 233, 329, 425, 73, 169, 265, 361, 457,
+17, 113, 209, 305, 401, 49, 145, 241, 337, 433, 81, 177, 273, 369, 465,
+25, 121, 217, 313, 409, 57, 153, 249, 345, 441, 89, 185, 281, 377, 473,
+5, 101, 197, 293, 389, 37, 133, 229, 325, 421, 69, 165, 261, 357, 453,
+13, 109, 205, 301, 397, 45, 141, 237, 333, 429, 77, 173, 269, 365, 461,
+21, 117, 213, 309, 405, 53, 149, 245, 341, 437, 85, 181, 277, 373, 469,
+29, 125, 221, 317, 413, 61, 157, 253, 349, 445, 93, 189, 285, 381, 477,
+2, 98, 194, 290, 386, 34, 130, 226, 322, 418, 66, 162, 258, 354, 450,
+10, 106, 202, 298, 394, 42, 138, 234, 330, 426, 74, 170, 266, 362, 458,
+18, 114, 210, 306, 402, 50, 146, 242, 338, 434, 82, 178, 274, 370, 466,
+26, 122, 218, 314, 410, 58, 154, 250, 346, 442, 90, 186, 282, 378, 474,
+6, 102, 198, 294, 390, 38, 134, 230, 326, 422, 70, 166, 262, 358, 454,
+14, 110, 206, 302, 398, 46, 142, 238, 334, 430, 78, 174, 270, 366, 462,
+22, 118, 214, 310, 406, 54, 150, 246, 342, 438, 86, 182, 278, 374, 470,
+30, 126, 222, 318, 414, 62, 158, 254, 350, 446, 94, 190, 286, 382, 478,
+3, 99, 195, 291, 387, 35, 131, 227, 323, 419, 67, 163, 259, 355, 451,
+11, 107, 203, 299, 395, 43, 139, 235, 331, 427, 75, 171, 267, 363, 459,
+19, 115, 211, 307, 403, 51, 147, 243, 339, 435, 83, 179, 275, 371, 467,
+27, 123, 219, 315, 411, 59, 155, 251, 347, 443, 91, 187, 283, 379, 475,
+7, 103, 199, 295, 391, 39, 135, 231, 327, 423, 71, 167, 263, 359, 455,
+15, 111, 207, 303, 399, 47, 143, 239, 335, 431, 79, 175, 271, 367, 463,
+23, 119, 215, 311, 407, 55, 151, 247, 343, 439, 87, 183, 279, 375, 471,
+31, 127, 223, 319, 415, 63, 159, 255, 351, 447, 95, 191, 287, 383, 479,
 };
 #endif
 
 #ifndef FFT_BITREV240
 #define FFT_BITREV240
 static const opus_int16 fft_bitrev240[240] = {
-0, 60, 120, 180, 15, 75, 135, 195, 30, 90, 150, 210, 45, 105, 165,
-225, 5, 65, 125, 185, 20, 80, 140, 200, 35, 95, 155, 215, 50, 110,
-170, 230, 10, 70, 130, 190, 25, 85, 145, 205, 40, 100, 160, 220, 55,
-115, 175, 235, 1, 61, 121, 181, 16, 76, 136, 196, 31, 91, 151, 211,
-46, 106, 166, 226, 6, 66, 126, 186, 21, 81, 141, 201, 36, 96, 156,
-216, 51, 111, 171, 231, 11, 71, 131, 191, 26, 86, 146, 206, 41, 101,
-161, 221, 56, 116, 176, 236, 2, 62, 122, 182, 17, 77, 137, 197, 32,
-92, 152, 212, 47, 107, 167, 227, 7, 67, 127, 187, 22, 82, 142, 202,
-37, 97, 157, 217, 52, 112, 172, 232, 12, 72, 132, 192, 27, 87, 147,
-207, 42, 102, 162, 222, 57, 117, 177, 237, 3, 63, 123, 183, 18, 78,
-138, 198, 33, 93, 153, 213, 48, 108, 168, 228, 8, 68, 128, 188, 23,
-83, 143, 203, 38, 98, 158, 218, 53, 113, 173, 233, 13, 73, 133, 193,
-28, 88, 148, 208, 43, 103, 163, 223, 58, 118, 178, 238, 4, 64, 124,
-184, 19, 79, 139, 199, 34, 94, 154, 214, 49, 109, 169, 229, 9, 69,
-129, 189, 24, 84, 144, 204, 39, 99, 159, 219, 54, 114, 174, 234, 14,
-74, 134, 194, 29, 89, 149, 209, 44, 104, 164, 224, 59, 119, 179, 239,
+0, 48, 96, 144, 192, 16, 64, 112, 160, 208, 32, 80, 128, 176, 224,
+4, 52, 100, 148, 196, 20, 68, 116, 164, 212, 36, 84, 132, 180, 228,
+8, 56, 104, 152, 200, 24, 72, 120, 168, 216, 40, 88, 136, 184, 232,
+12, 60, 108, 156, 204, 28, 76, 124, 172, 220, 44, 92, 140, 188, 236,
+1, 49, 97, 145, 193, 17, 65, 113, 161, 209, 33, 81, 129, 177, 225,
+5, 53, 101, 149, 197, 21, 69, 117, 165, 213, 37, 85, 133, 181, 229,
+9, 57, 105, 153, 201, 25, 73, 121, 169, 217, 41, 89, 137, 185, 233,
+13, 61, 109, 157, 205, 29, 77, 125, 173, 221, 45, 93, 141, 189, 237,
+2, 50, 98, 146, 194, 18, 66, 114, 162, 210, 34, 82, 130, 178, 226,
+6, 54, 102, 150, 198, 22, 70, 118, 166, 214, 38, 86, 134, 182, 230,
+10, 58, 106, 154, 202, 26, 74, 122, 170, 218, 42, 90, 138, 186, 234,
+14, 62, 110, 158, 206, 30, 78, 126, 174, 222, 46, 94, 142, 190, 238,
+3, 51, 99, 147, 195, 19, 67, 115, 163, 211, 35, 83, 131, 179, 227,
+7, 55, 103, 151, 199, 23, 71, 119, 167, 215, 39, 87, 135, 183, 231,
+11, 59, 107, 155, 203, 27, 75, 123, 171, 219, 43, 91, 139, 187, 235,
+15, 63, 111, 159, 207, 31, 79, 127, 175, 223, 47, 95, 143, 191, 239,
 };
 #endif
 
 #ifndef FFT_BITREV120
 #define FFT_BITREV120
 static const opus_int16 fft_bitrev120[120] = {
-0, 30, 60, 90, 15, 45, 75, 105, 5, 35, 65, 95, 20, 50, 80,
-110, 10, 40, 70, 100, 25, 55, 85, 115, 1, 31, 61, 91, 16, 46,
-76, 106, 6, 36, 66, 96, 21, 51, 81, 111, 11, 41, 71, 101, 26,
-56, 86, 116, 2, 32, 62, 92, 17, 47, 77, 107, 7, 37, 67, 97,
-22, 52, 82, 112, 12, 42, 72, 102, 27, 57, 87, 117, 3, 33, 63,
-93, 18, 48, 78, 108, 8, 38, 68, 98, 23, 53, 83, 113, 13, 43,
-73, 103, 28, 58, 88, 118, 4, 34, 64, 94, 19, 49, 79, 109, 9,
-39, 69, 99, 24, 54, 84, 114, 14, 44, 74, 104, 29, 59, 89, 119,
+0, 24, 48, 72, 96, 8, 32, 56, 80, 104, 16, 40, 64, 88, 112,
+4, 28, 52, 76, 100, 12, 36, 60, 84, 108, 20, 44, 68, 92, 116,
+1, 25, 49, 73, 97, 9, 33, 57, 81, 105, 17, 41, 65, 89, 113,
+5, 29, 53, 77, 101, 13, 37, 61, 85, 109, 21, 45, 69, 93, 117,
+2, 26, 50, 74, 98, 10, 34, 58, 82, 106, 18, 42, 66, 90, 114,
+6, 30, 54, 78, 102, 14, 38, 62, 86, 110, 22, 46, 70, 94, 118,
+3, 27, 51, 75, 99, 11, 35, 59, 83, 107, 19, 43, 67, 91, 115,
+7, 31, 55, 79, 103, 15, 39, 63, 87, 111, 23, 47, 71, 95, 119,
 };
 #endif
 
 #ifndef FFT_BITREV60
 #define FFT_BITREV60
 static const opus_int16 fft_bitrev60[60] = {
-0, 15, 30, 45, 5, 20, 35, 50, 10, 25, 40, 55, 1, 16, 31,
-46, 6, 21, 36, 51, 11, 26, 41, 56, 2, 17, 32, 47, 7, 22,
-37, 52, 12, 27, 42, 57, 3, 18, 33, 48, 8, 23, 38, 53, 13,
-28, 43, 58, 4, 19, 34, 49, 9, 24, 39, 54, 14, 29, 44, 59,
+0, 12, 24, 36, 48, 4, 16, 28, 40, 52, 8, 20, 32, 44, 56,
+1, 13, 25, 37, 49, 5, 17, 29, 41, 53, 9, 21, 33, 45, 57,
+2, 14, 26, 38, 50, 6, 18, 30, 42, 54, 10, 22, 34, 46, 58,
+3, 15, 27, 39, 51, 7, 19, 31, 43, 55, 11, 23, 35, 47, 59,
 };
 #endif
 
@@ -426,10 +431,17 @@
 #define FFT_STATE48000_960_0
 static const kiss_fft_state fft_state48000_960_0 = {
 480,    /* nfft */
+17476,    /* scale */
+8,      /* scale_shift */
 -1,     /* shift */
-{4, 120, 4, 30, 2, 15, 3, 5, 5, 1, 0, 0, 0, 0, 0, 0, }, /* factors */
+{5, 96, 3, 32, 4, 8, 2, 4, 4, 1, 0, 0, 0, 0, 0, 0, },    /* factors */
 fft_bitrev480,  /* bitrev */
 fft_twiddles48000_960,  /* bitrev */
+#ifdef OVERRIDE_FFT
+(arch_fft_state *)&cfg_arch_480,
+#else
+NULL,
+#endif
 };
 #endif
 
@@ -437,10 +449,17 @@
 #define FFT_STATE48000_960_1
 static const kiss_fft_state fft_state48000_960_1 = {
 240,    /* nfft */
+17476,    /* scale */
+7,      /* scale_shift */
 1,      /* shift */
-{4, 60, 4, 15, 3, 5, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, },   /* factors */
+{5, 48, 3, 16, 4, 4, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, },    /* factors */
 fft_bitrev240,  /* bitrev */
 fft_twiddles48000_960,  /* bitrev */
+#ifdef OVERRIDE_FFT
+(arch_fft_state *)&cfg_arch_240,
+#else
+NULL,
+#endif
 };
 #endif
 
@@ -448,10 +467,17 @@
 #define FFT_STATE48000_960_2
 static const kiss_fft_state fft_state48000_960_2 = {
 120,    /* nfft */
+17476,    /* scale */
+6,      /* scale_shift */
 2,      /* shift */
-{4, 30, 2, 15, 3, 5, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, },   /* factors */
+{5, 24, 3, 8, 2, 4, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, },    /* factors */
 fft_bitrev120,  /* bitrev */
 fft_twiddles48000_960,  /* bitrev */
+#ifdef OVERRIDE_FFT
+(arch_fft_state *)&cfg_arch_120,
+#else
+NULL,
+#endif
 };
 #endif
 
@@ -459,10 +485,17 @@
 #define FFT_STATE48000_960_3
 static const kiss_fft_state fft_state48000_960_3 = {
 60,     /* nfft */
+17476,    /* scale */
+5,      /* scale_shift */
 3,      /* shift */
-{4, 15, 3, 5, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },    /* factors */
+{5, 12, 3, 4, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },    /* factors */
 fft_bitrev60,   /* bitrev */
 fft_twiddles48000_960,  /* bitrev */
+#ifdef OVERRIDE_FFT
+(arch_fft_state *)&cfg_arch_60,
+#else
+NULL,
+#endif
 };
 #endif
 
@@ -470,104 +503,368 @@
 
 #ifndef MDCT_TWIDDLES960
 #define MDCT_TWIDDLES960
-static const opus_val16 mdct_twiddles960[481] = {
-32767, 32767, 32767, 32767, 32766,
-32763, 32762, 32759, 32757, 32753,
-32751, 32747, 32743, 32738, 32733,
-32729, 32724, 32717, 32711, 32705,
-32698, 32690, 32683, 32676, 32667,
-32658, 32650, 32640, 32631, 32620,
-32610, 32599, 32588, 32577, 32566,
-32554, 32541, 32528, 32515, 32502,
-32487, 32474, 32459, 32444, 32429,
-32413, 32397, 32381, 32364, 32348,
-32331, 32313, 32294, 32277, 32257,
-32239, 32219, 32200, 32180, 32159,
-32138, 32118, 32096, 32074, 32051,
-32029, 32006, 31984, 31960, 31936,
-31912, 31888, 31863, 31837, 31812,
-31786, 31760, 31734, 31707, 31679,
-31652, 31624, 31596, 31567, 31539,
-31508, 31479, 31450, 31419, 31388,
-31357, 31326, 31294, 31262, 31230,
-31198, 31164, 31131, 31097, 31063,
-31030, 30994, 30959, 30924, 30889,
-30853, 30816, 30779, 30743, 30705,
-30668, 30629, 30592, 30553, 30515,
-30475, 30435, 30396, 30356, 30315,
-30274, 30233, 30191, 30149, 30107,
-30065, 30022, 29979, 29936, 29891,
-29847, 29803, 29758, 29713, 29668,
-29622, 29577, 29529, 29483, 29436,
-29390, 29341, 29293, 29246, 29197,
-29148, 29098, 29050, 29000, 28949,
-28899, 28848, 28797, 28746, 28694,
-28642, 28590, 28537, 28485, 28432,
-28378, 28324, 28271, 28217, 28162,
-28106, 28051, 27995, 27940, 27884,
-27827, 27770, 27713, 27657, 27598,
-27540, 27481, 27423, 27365, 27305,
-27246, 27187, 27126, 27066, 27006,
-26945, 26883, 26822, 26760, 26698,
-26636, 26574, 26510, 26448, 26383,
-26320, 26257, 26191, 26127, 26062,
-25997, 25931, 25866, 25800, 25734,
-25667, 25601, 25533, 25466, 25398,
-25330, 25262, 25194, 25125, 25056,
-24987, 24917, 24848, 24778, 24707,
-24636, 24566, 24495, 24424, 24352,
-24280, 24208, 24135, 24063, 23990,
-23917, 23842, 23769, 23695, 23622,
-23546, 23472, 23398, 23322, 23246,
-23171, 23095, 23018, 22942, 22866,
-22788, 22711, 22634, 22557, 22478,
-22400, 22322, 22244, 22165, 22085,
-22006, 21927, 21846, 21766, 21687,
-21606, 21524, 21443, 21363, 21282,
-21199, 21118, 21035, 20954, 20870,
-20788, 20705, 20621, 20538, 20455,
-20371, 20286, 20202, 20118, 20034,
-19947, 19863, 19777, 19692, 19606,
-19520, 19434, 19347, 19260, 19174,
-19088, 18999, 18911, 18825, 18737,
-18648, 18560, 18472, 18384, 18294,
-18205, 18116, 18025, 17936, 17846,
-17757, 17666, 17576, 17485, 17395,
-17303, 17212, 17122, 17030, 16937,
-16846, 16755, 16662, 16569, 16477,
-16385, 16291, 16198, 16105, 16012,
-15917, 15824, 15730, 15636, 15541,
-15447, 15352, 15257, 15162, 15067,
-14973, 14875, 14781, 14685, 14589,
-14493, 14396, 14300, 14204, 14107,
-14010, 13914, 13815, 13718, 13621,
-13524, 13425, 13328, 13230, 13133,
-13033, 12935, 12836, 12738, 12638,
-12540, 12441, 12341, 12241, 12142,
-12044, 11943, 11843, 11744, 11643,
-11542, 11442, 11342, 11241, 11139,
-11039, 10939, 10836, 10736, 10635,
-10534, 10431, 10330, 10228, 10127,
-10024, 9921, 9820, 9718, 9614,
-9512, 9410, 9306, 9204, 9101,
-8998, 8895, 8791, 8689, 8585,
-8481, 8377, 8274, 8171, 8067,
-7962, 7858, 7753, 7650, 7545,
-7441, 7336, 7231, 7129, 7023,
-6917, 6813, 6709, 6604, 6498,
-6393, 6288, 6182, 6077, 5973,
-5867, 5760, 5656, 5549, 5445,
-5339, 5232, 5127, 5022, 4914,
-4809, 4703, 4596, 4490, 4384,
-4278, 4171, 4065, 3958, 3852,
-3745, 3640, 3532, 3426, 3318,
-3212, 3106, 2998, 2891, 2786,
-2679, 2570, 2465, 2358, 2251,
-2143, 2037, 1929, 1823, 1715,
-1609, 1501, 1393, 1287, 1180,
-1073, 964, 858, 751, 644,
-535, 429, 322, 214, 107,
-0, };
+static const opus_val16 mdct_twiddles960[1800] = {
+32767, 32767, 32767, 32766, 32765,
+32763, 32761, 32759, 32756, 32753,
+32750, 32746, 32742, 32738, 32733,
+32728, 32722, 32717, 32710, 32704,
+32697, 32690, 32682, 32674, 32666,
+32657, 32648, 32639, 32629, 32619,
+32609, 32598, 32587, 32576, 32564,
+32552, 32539, 32526, 32513, 32500,
+32486, 32472, 32457, 32442, 32427,
+32411, 32395, 32379, 32362, 32345,
+32328, 32310, 32292, 32274, 32255,
+32236, 32217, 32197, 32177, 32157,
+32136, 32115, 32093, 32071, 32049,
+32027, 32004, 31981, 31957, 31933,
+31909, 31884, 31859, 31834, 31809,
+31783, 31756, 31730, 31703, 31676,
+31648, 31620, 31592, 31563, 31534,
+31505, 31475, 31445, 31415, 31384,
+31353, 31322, 31290, 31258, 31226,
+31193, 31160, 31127, 31093, 31059,
+31025, 30990, 30955, 30920, 30884,
+30848, 30812, 30775, 30738, 30701,
+30663, 30625, 30587, 30548, 30509,
+30470, 30430, 30390, 30350, 30309,
+30269, 30227, 30186, 30144, 30102,
+30059, 30016, 29973, 29930, 29886,
+29842, 29797, 29752, 29707, 29662,
+29616, 29570, 29524, 29477, 29430,
+29383, 29335, 29287, 29239, 29190,
+29142, 29092, 29043, 28993, 28943,
+28892, 28842, 28791, 28739, 28688,
+28636, 28583, 28531, 28478, 28425,
+28371, 28317, 28263, 28209, 28154,
+28099, 28044, 27988, 27932, 27876,
+27820, 27763, 27706, 27648, 27591,
+27533, 27474, 27416, 27357, 27298,
+27238, 27178, 27118, 27058, 26997,
+26936, 26875, 26814, 26752, 26690,
+26628, 26565, 26502, 26439, 26375,
+26312, 26247, 26183, 26119, 26054,
+25988, 25923, 25857, 25791, 25725,
+25658, 25592, 25524, 25457, 25389,
+25322, 25253, 25185, 25116, 25047,
+24978, 24908, 24838, 24768, 24698,
+24627, 24557, 24485, 24414, 24342,
+24270, 24198, 24126, 24053, 23980,
+23907, 23834, 23760, 23686, 23612,
+23537, 23462, 23387, 23312, 23237,
+23161, 23085, 23009, 22932, 22856,
+22779, 22701, 22624, 22546, 22468,
+22390, 22312, 22233, 22154, 22075,
+21996, 21916, 21836, 21756, 21676,
+21595, 21515, 21434, 21352, 21271,
+21189, 21107, 21025, 20943, 20860,
+20777, 20694, 20611, 20528, 20444,
+20360, 20276, 20192, 20107, 20022,
+19937, 19852, 19767, 19681, 19595,
+19509, 19423, 19336, 19250, 19163,
+19076, 18988, 18901, 18813, 18725,
+18637, 18549, 18460, 18372, 18283,
+18194, 18104, 18015, 17925, 17835,
+17745, 17655, 17565, 17474, 17383,
+17292, 17201, 17110, 17018, 16927,
+16835, 16743, 16650, 16558, 16465,
+16372, 16279, 16186, 16093, 15999,
+15906, 15812, 15718, 15624, 15529,
+15435, 15340, 15245, 15150, 15055,
+14960, 14864, 14769, 14673, 14577,
+14481, 14385, 14288, 14192, 14095,
+13998, 13901, 13804, 13706, 13609,
+13511, 13414, 13316, 13218, 13119,
+13021, 12923, 12824, 12725, 12626,
+12527, 12428, 12329, 12230, 12130,
+12030, 11930, 11831, 11730, 11630,
+11530, 11430, 11329, 11228, 11128,
+11027, 10926, 10824, 10723, 10622,
+10520, 10419, 10317, 10215, 10113,
+10011, 9909, 9807, 9704, 9602,
+9499, 9397, 9294, 9191, 9088,
+8985, 8882, 8778, 8675, 8572,
+8468, 8364, 8261, 8157, 8053,
+7949, 7845, 7741, 7637, 7532,
+7428, 7323, 7219, 7114, 7009,
+6905, 6800, 6695, 6590, 6485,
+6380, 6274, 6169, 6064, 5958,
+5853, 5747, 5642, 5536, 5430,
+5325, 5219, 5113, 5007, 4901,
+4795, 4689, 4583, 4476, 4370,
+4264, 4157, 4051, 3945, 3838,
+3732, 3625, 3518, 3412, 3305,
+3198, 3092, 2985, 2878, 2771,
+2664, 2558, 2451, 2344, 2237,
+2130, 2023, 1916, 1809, 1702,
+1594, 1487, 1380, 1273, 1166,
+1059, 952, 844, 737, 630,
+523, 416, 308, 201, 94,
+-13, -121, -228, -335, -442,
+-550, -657, -764, -871, -978,
+-1086, -1193, -1300, -1407, -1514,
+-1621, -1728, -1835, -1942, -2049,
+-2157, -2263, -2370, -2477, -2584,
+-2691, -2798, -2905, -3012, -3118,
+-3225, -3332, -3439, -3545, -3652,
+-3758, -3865, -3971, -4078, -4184,
+-4290, -4397, -4503, -4609, -4715,
+-4821, -4927, -5033, -5139, -5245,
+-5351, -5457, -5562, -5668, -5774,
+-5879, -5985, -6090, -6195, -6301,
+-6406, -6511, -6616, -6721, -6826,
+-6931, -7036, -7140, -7245, -7349,
+-7454, -7558, -7663, -7767, -7871,
+-7975, -8079, -8183, -8287, -8390,
+-8494, -8597, -8701, -8804, -8907,
+-9011, -9114, -9217, -9319, -9422,
+-9525, -9627, -9730, -9832, -9934,
+-10037, -10139, -10241, -10342, -10444,
+-10546, -10647, -10748, -10850, -10951,
+-11052, -11153, -11253, -11354, -11455,
+-11555, -11655, -11756, -11856, -11955,
+-12055, -12155, -12254, -12354, -12453,
+-12552, -12651, -12750, -12849, -12947,
+-13046, -13144, -13242, -13340, -13438,
+-13536, -13633, -13731, -13828, -13925,
+-14022, -14119, -14216, -14312, -14409,
+-14505, -14601, -14697, -14793, -14888,
+-14984, -15079, -15174, -15269, -15364,
+-15459, -15553, -15647, -15741, -15835,
+-15929, -16023, -16116, -16210, -16303,
+-16396, -16488, -16581, -16673, -16766,
+-16858, -16949, -17041, -17133, -17224,
+-17315, -17406, -17497, -17587, -17678,
+-17768, -17858, -17948, -18037, -18127,
+-18216, -18305, -18394, -18483, -18571,
+-18659, -18747, -18835, -18923, -19010,
+-19098, -19185, -19271, -19358, -19444,
+-19531, -19617, -19702, -19788, -19873,
+-19959, -20043, -20128, -20213, -20297,
+-20381, -20465, -20549, -20632, -20715,
+-20798, -20881, -20963, -21046, -21128,
+-21210, -21291, -21373, -21454, -21535,
+-21616, -21696, -21776, -21856, -21936,
+-22016, -22095, -22174, -22253, -22331,
+-22410, -22488, -22566, -22643, -22721,
+-22798, -22875, -22951, -23028, -23104,
+-23180, -23256, -23331, -23406, -23481,
+-23556, -23630, -23704, -23778, -23852,
+-23925, -23998, -24071, -24144, -24216,
+-24288, -24360, -24432, -24503, -24574,
+-24645, -24716, -24786, -24856, -24926,
+-24995, -25064, -25133, -25202, -25270,
+-25339, -25406, -25474, -25541, -25608,
+-25675, -25742, -25808, -25874, -25939,
+-26005, -26070, -26135, -26199, -26264,
+-26327, -26391, -26455, -26518, -26581,
+-26643, -26705, -26767, -26829, -26891,
+-26952, -27013, -27073, -27133, -27193,
+-27253, -27312, -27372, -27430, -27489,
+-27547, -27605, -27663, -27720, -27777,
+-27834, -27890, -27946, -28002, -28058,
+-28113, -28168, -28223, -28277, -28331,
+-28385, -28438, -28491, -28544, -28596,
+-28649, -28701, -28752, -28803, -28854,
+-28905, -28955, -29006, -29055, -29105,
+-29154, -29203, -29251, -29299, -29347,
+-29395, -29442, -29489, -29535, -29582,
+-29628, -29673, -29719, -29764, -29808,
+-29853, -29897, -29941, -29984, -30027,
+-30070, -30112, -30154, -30196, -30238,
+-30279, -30320, -30360, -30400, -30440,
+-30480, -30519, -30558, -30596, -30635,
+-30672, -30710, -30747, -30784, -30821,
+-30857, -30893, -30929, -30964, -30999,
+-31033, -31068, -31102, -31135, -31168,
+-31201, -31234, -31266, -31298, -31330,
+-31361, -31392, -31422, -31453, -31483,
+-31512, -31541, -31570, -31599, -31627,
+-31655, -31682, -31710, -31737, -31763,
+-31789, -31815, -31841, -31866, -31891,
+-31915, -31939, -31963, -31986, -32010,
+-32032, -32055, -32077, -32099, -32120,
+-32141, -32162, -32182, -32202, -32222,
+-32241, -32260, -32279, -32297, -32315,
+-32333, -32350, -32367, -32383, -32399,
+-32415, -32431, -32446, -32461, -32475,
+-32489, -32503, -32517, -32530, -32542,
+-32555, -32567, -32579, -32590, -32601,
+-32612, -32622, -32632, -32641, -32651,
+-32659, -32668, -32676, -32684, -32692,
+-32699, -32706, -32712, -32718, -32724,
+-32729, -32734, -32739, -32743, -32747,
+-32751, -32754, -32757, -32760, -32762,
+-32764, -32765, -32767, -32767, -32767,
+32767, 32767, 32765, 32761, 32756,
+32750, 32742, 32732, 32722, 32710,
+32696, 32681, 32665, 32647, 32628,
+32608, 32586, 32562, 32538, 32512,
+32484, 32455, 32425, 32393, 32360,
+32326, 32290, 32253, 32214, 32174,
+32133, 32090, 32046, 32001, 31954,
+31906, 31856, 31805, 31753, 31700,
+31645, 31588, 31530, 31471, 31411,
+31349, 31286, 31222, 31156, 31089,
+31020, 30951, 30880, 30807, 30733,
+30658, 30582, 30504, 30425, 30345,
+30263, 30181, 30096, 30011, 29924,
+29836, 29747, 29656, 29564, 29471,
+29377, 29281, 29184, 29086, 28987,
+28886, 28784, 28681, 28577, 28471,
+28365, 28257, 28147, 28037, 27925,
+27812, 27698, 27583, 27467, 27349,
+27231, 27111, 26990, 26868, 26744,
+26620, 26494, 26367, 26239, 26110,
+25980, 25849, 25717, 25583, 25449,
+25313, 25176, 25038, 24900, 24760,
+24619, 24477, 24333, 24189, 24044,
+23898, 23751, 23602, 23453, 23303,
+23152, 22999, 22846, 22692, 22537,
+22380, 22223, 22065, 21906, 21746,
+21585, 21423, 21261, 21097, 20933,
+20767, 20601, 20434, 20265, 20096,
+19927, 19756, 19584, 19412, 19239,
+19065, 18890, 18714, 18538, 18361,
+18183, 18004, 17824, 17644, 17463,
+17281, 17098, 16915, 16731, 16546,
+16361, 16175, 15988, 15800, 15612,
+15423, 15234, 15043, 14852, 14661,
+14469, 14276, 14083, 13889, 13694,
+13499, 13303, 13107, 12910, 12713,
+12515, 12317, 12118, 11918, 11718,
+11517, 11316, 11115, 10913, 10710,
+10508, 10304, 10100, 9896, 9691,
+9486, 9281, 9075, 8869, 8662,
+8455, 8248, 8040, 7832, 7623,
+7415, 7206, 6996, 6787, 6577,
+6366, 6156, 5945, 5734, 5523,
+5311, 5100, 4888, 4675, 4463,
+4251, 4038, 3825, 3612, 3399,
+3185, 2972, 2758, 2544, 2330,
+2116, 1902, 1688, 1474, 1260,
+1045, 831, 617, 402, 188,
+-27, -241, -456, -670, -885,
+-1099, -1313, -1528, -1742, -1956,
+-2170, -2384, -2598, -2811, -3025,
+-3239, -3452, -3665, -3878, -4091,
+-4304, -4516, -4728, -4941, -5153,
+-5364, -5576, -5787, -5998, -6209,
+-6419, -6629, -6839, -7049, -7258,
+-7467, -7676, -7884, -8092, -8300,
+-8507, -8714, -8920, -9127, -9332,
+-9538, -9743, -9947, -10151, -10355,
+-10558, -10761, -10963, -11165, -11367,
+-11568, -11768, -11968, -12167, -12366,
+-12565, -12762, -12960, -13156, -13352,
+-13548, -13743, -13937, -14131, -14324,
+-14517, -14709, -14900, -15091, -15281,
+-15470, -15659, -15847, -16035, -16221,
+-16407, -16593, -16777, -16961, -17144,
+-17326, -17508, -17689, -17869, -18049,
+-18227, -18405, -18582, -18758, -18934,
+-19108, -19282, -19455, -19627, -19799,
+-19969, -20139, -20308, -20475, -20642,
+-20809, -20974, -21138, -21301, -21464,
+-21626, -21786, -21946, -22105, -22263,
+-22420, -22575, -22730, -22884, -23037,
+-23189, -23340, -23490, -23640, -23788,
+-23935, -24080, -24225, -24369, -24512,
+-24654, -24795, -24934, -25073, -25211,
+-25347, -25482, -25617, -25750, -25882,
+-26013, -26143, -26272, -26399, -26526,
+-26651, -26775, -26898, -27020, -27141,
+-27260, -27379, -27496, -27612, -27727,
+-27841, -27953, -28065, -28175, -28284,
+-28391, -28498, -28603, -28707, -28810,
+-28911, -29012, -29111, -29209, -29305,
+-29401, -29495, -29587, -29679, -29769,
+-29858, -29946, -30032, -30118, -30201,
+-30284, -30365, -30445, -30524, -30601,
+-30677, -30752, -30825, -30897, -30968,
+-31038, -31106, -31172, -31238, -31302,
+-31365, -31426, -31486, -31545, -31602,
+-31658, -31713, -31766, -31818, -31869,
+-31918, -31966, -32012, -32058, -32101,
+-32144, -32185, -32224, -32262, -32299,
+-32335, -32369, -32401, -32433, -32463,
+-32491, -32518, -32544, -32568, -32591,
+-32613, -32633, -32652, -32669, -32685,
+-32700, -32713, -32724, -32735, -32744,
+-32751, -32757, -32762, -32766, -32767,
+32767, 32764, 32755, 32741, 32720,
+32694, 32663, 32626, 32583, 32535,
+32481, 32421, 32356, 32286, 32209,
+32128, 32041, 31948, 31850, 31747,
+31638, 31523, 31403, 31278, 31148,
+31012, 30871, 30724, 30572, 30415,
+30253, 30086, 29913, 29736, 29553,
+29365, 29172, 28974, 28771, 28564,
+28351, 28134, 27911, 27684, 27452,
+27216, 26975, 26729, 26478, 26223,
+25964, 25700, 25432, 25159, 24882,
+24601, 24315, 24026, 23732, 23434,
+23133, 22827, 22517, 22204, 21886,
+21565, 21240, 20912, 20580, 20244,
+19905, 19563, 19217, 18868, 18516,
+18160, 17802, 17440, 17075, 16708,
+16338, 15964, 15588, 15210, 14829,
+14445, 14059, 13670, 13279, 12886,
+12490, 12093, 11693, 11291, 10888,
+10482, 10075, 9666, 9255, 8843,
+8429, 8014, 7597, 7180, 6760,
+6340, 5919, 5496, 5073, 4649,
+4224, 3798, 3372, 2945, 2517,
+2090, 1661, 1233, 804, 375,
+-54, -483, -911, -1340, -1768,
+-2197, -2624, -3052, -3479, -3905,
+-4330, -4755, -5179, -5602, -6024,
+-6445, -6865, -7284, -7702, -8118,
+-8533, -8946, -9358, -9768, -10177,
+-10584, -10989, -11392, -11793, -12192,
+-12589, -12984, -13377, -13767, -14155,
+-14541, -14924, -15305, -15683, -16058,
+-16430, -16800, -17167, -17531, -17892,
+-18249, -18604, -18956, -19304, -19649,
+-19990, -20329, -20663, -20994, -21322,
+-21646, -21966, -22282, -22595, -22904,
+-23208, -23509, -23806, -24099, -24387,
+-24672, -24952, -25228, -25499, -25766,
+-26029, -26288, -26541, -26791, -27035,
+-27275, -27511, -27741, -27967, -28188,
+-28405, -28616, -28823, -29024, -29221,
+-29412, -29599, -29780, -29957, -30128,
+-30294, -30455, -30611, -30761, -30906,
+-31046, -31181, -31310, -31434, -31552,
+-31665, -31773, -31875, -31972, -32063,
+-32149, -32229, -32304, -32373, -32437,
+-32495, -32547, -32594, -32635, -32671,
+-32701, -32726, -32745, -32758, -32766,
+32767, 32754, 32717, 32658, 32577,
+32473, 32348, 32200, 32029, 31837,
+31624, 31388, 31131, 30853, 30553,
+30232, 29891, 29530, 29148, 28746,
+28324, 27883, 27423, 26944, 26447,
+25931, 25398, 24847, 24279, 23695,
+23095, 22478, 21846, 21199, 20538,
+19863, 19174, 18472, 17757, 17030,
+16291, 15541, 14781, 14010, 13230,
+12441, 11643, 10837, 10024, 9204,
+8377, 7545, 6708, 5866, 5020,
+4171, 3319, 2464, 1608, 751,
+-107, -965, -1822, -2678, -3532,
+-4383, -5232, -6077, -6918, -7754,
+-8585, -9409, -10228, -11039, -11843,
+-12639, -13426, -14204, -14972, -15730,
+-16477, -17213, -17937, -18648, -19347,
+-20033, -20705, -21363, -22006, -22634,
+-23246, -23843, -24423, -24986, -25533,
+-26062, -26573, -27066, -27540, -27995,
+-28431, -28848, -29245, -29622, -29979,
+-30315, -30630, -30924, -31197, -31449,
+-31679, -31887, -32074, -32239, -32381,
+-32501, -32600, -32675, -32729, -32759,
+};
 #endif
 
 static const CELTMode mode48000_960_120 = {
diff --git a/celt/static_modes_fixed_arm_ne10.h b/celt/static_modes_fixed_arm_ne10.h
new file mode 100644
index 0000000..b8ef0ce
--- /dev/null
+++ b/celt/static_modes_fixed_arm_ne10.h
@@ -0,0 +1,388 @@
+/* The contents of this file was automatically generated by
+ * dump_mode_arm_ne10.c with arguments: 48000 960
+ * It contains static definitions for some pre-defined modes. */
+#include <NE10_init.h>
+
+#ifndef NE10_FFT_PARAMS48000_960
+#define NE10_FFT_PARAMS48000_960
+static const ne10_int32_t ne10_factors_480[64] = {
+4, 40, 4, 30, 2, 15, 5, 3, 3, 1, 1, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, };
+static const ne10_int32_t ne10_factors_240[64] = {
+3, 20, 4, 15, 5, 3, 3, 1, 1, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, };
+static const ne10_int32_t ne10_factors_120[64] = {
+3, 10, 2, 15, 5, 3, 3, 1, 1, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, };
+static const ne10_int32_t ne10_factors_60[64] = {
+2, 5, 5, 3, 3, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, };
+static const ne10_fft_cpx_int32_t ne10_twiddles_480[480] = {
+{0,0}, {2147483647,0}, {2147483647,0},
+{2147483647,0}, {1961823921,-873460313}, {1436946998,-1595891394},
+{2147483647,0}, {1436946998,-1595891394}, {-224473265,-2135719496},
+{2147483647,0}, {663608871,-2042378339}, {-1737350854,-1262259096},
+{2147483647,0}, {-224473265,-2135719496}, {-2100555935,446487152},
+{2147483647,0}, {2100555974,-446486968}, {1961823921,-873460313},
+{1737350743,-1262259248}, {1436946998,-1595891394}, {1073741769,-1859775424},
+{663608871,-2042378339}, {224473078,-2135719516}, {-224473265,-2135719496},
+{-663609049,-2042378281}, {-1073741932,-1859775330}, {-1436947137,-1595891268},
+{-1737350854,-1262259096}, {-1961823997,-873460141}, {-2100556013,-446486785},
+{2147483647,0}, {2144540595,-112390613}, {2135719506,-224473172},
+{2121044558,-335940465}, {2100555974,-446486968}, {2074309912,-555809682},
+{2042378310,-663608960}, {2004848691,-769589332}, {1961823921,-873460313},
+{1913421927,-974937199}, {1859775377,-1073741851}, {1801031311,-1169603450},
+{1737350743,-1262259248}, {1668908218,-1351455280}, {1595891331,-1436947067},
+{1518500216,-1518500282}, {1436946998,-1595891394}, {1351455207,-1668908277},
+{1262259172,-1737350799}, {1169603371,-1801031362}, {1073741769,-1859775424},
+{974937230,-1913421912}, {873460227,-1961823959}, {769589125,-2004848771},
+{663608871,-2042378339}, {555809715,-2074309903}, {446486876,-2100555994},
+{335940246,-2121044593}, {224473078,-2135719516}, {112390647,-2144540593},
+{2147483647,0}, {2135719506,-224473172}, {2100555974,-446486968},
+{2042378310,-663608960}, {1961823921,-873460313}, {1859775377,-1073741851},
+{1737350743,-1262259248}, {1595891331,-1436947067}, {1436946998,-1595891394},
+{1262259172,-1737350799}, {1073741769,-1859775424}, {873460227,-1961823959},
+{663608871,-2042378339}, {446486876,-2100555994}, {224473078,-2135719516},
+{-94,-2147483647}, {-224473265,-2135719496}, {-446487060,-2100555955},
+{-663609049,-2042378281}, {-873460398,-1961823883}, {-1073741932,-1859775330},
+{-1262259116,-1737350839}, {-1436947137,-1595891268}, {-1595891628,-1436946738},
+{-1737350854,-1262259096}, {-1859775343,-1073741910}, {-1961823997,-873460141},
+{-2042378447,-663608538}, {-2100556013,-446486785}, {-2135719499,-224473240},
+{2147483647,0}, {2121044558,-335940465}, {2042378310,-663608960},
+{1913421927,-974937199}, {1737350743,-1262259248}, {1518500216,-1518500282},
+{1262259172,-1737350799}, {974937230,-1913421912}, {663608871,-2042378339},
+{335940246,-2121044593}, {-94,-2147483647}, {-335940431,-2121044564},
+{-663609049,-2042378281}, {-974937397,-1913421827}, {-1262259116,-1737350839},
+{-1518500258,-1518500240}, {-1737350854,-1262259096}, {-1913422071,-974936918},
+{-2042378447,-663608538}, {-2121044568,-335940406}, {-2147483647,188},
+{-2121044509,335940777}, {-2042378331,663608895}, {-1913421900,974937252},
+{-1737350633,1262259400}, {-1518499993,1518500506}, {-1262258813,1737351059},
+{-974936606,1913422229}, {-663609179,2042378239}, {-335940566,2121044542},
+{2147483647,0}, {2147299667,-28109693}, {2146747758,-56214570},
+{2145828015,-84309815}, {2144540595,-112390613}, {2142885719,-140452154},
+{2140863671,-168489630}, {2138474797,-196498235}, {2135719506,-224473172},
+{2132598271,-252409646}, {2129111626,-280302871}, {2125260168,-308148068},
+{2121044558,-335940465}, {2116465518,-363675300}, {2111523833,-391347822},
+{2106220349,-418953288}, {2100555974,-446486968}, {2094531681,-473944146},
+{2088148500,-501320115}, {2081407525,-528610186}, {2074309912,-555809682},
+{2066856885,-582913912}, {2059049696,-609918325}, {2050889698,-636818231},
+{2042378310,-663608960}, {2033516972,-690285983}, {2024307180,-716844791},
+{2014750533,-743280770}, {2004848691,-769589332}, {1994603329,-795766029},
+{1984016179,-821806435}, {1973089077,-847706028}, {1961823921,-873460313},
+{1950222618,-899064934}, {1938287127,-924515564}, {1926019520,-949807783},
+{1913421927,-974937199}, {1900496481,-999899565}, {1887245364,-1024690661},
+{1873670877,-1049306180}, {1859775377,-1073741851}, {1845561215,-1097993541},
+{1831030826,-1122057097}, {1816186632,-1145928502}, {1801031311,-1169603450},
+{1785567394,-1193077993}, {1769797456,-1216348214}, {1753724345,-1239409914},
+{1737350743,-1262259248}, {1720679456,-1284892300}, {1703713340,-1307305194},
+{1686455222,-1329494189}, {1668908218,-1351455280}, {1651075255,-1373184807},
+{1632959307,-1394679144}, {1614563642,-1415934412}, {1595891331,-1436947067},
+{1576945572,-1457713510}, {1557729613,-1478230181}, {1538246655,-1498493658},
+{1518500216,-1518500282}, {1498493590,-1538246721}, {1478230113,-1557729677},
+{1457713441,-1576945636}, {1436946998,-1595891394}, {1415934341,-1614563704},
+{1394679073,-1632959368}, {1373184735,-1651075315}, {1351455207,-1668908277},
+{1329494115,-1686455280}, {1307305120,-1703713397}, {1284892225,-1720679512},
+{1262259172,-1737350799}, {1239409837,-1753724400}, {1216348136,-1769797510},
+{1193077915,-1785567446}, {1169603371,-1801031362}, {1145928423,-1816186682},
+{1122057017,-1831030875}, {1097993571,-1845561197}, {1073741769,-1859775424},
+{1049305987,-1873670985}, {1024690635,-1887245378}, {999899482,-1900496524},
+{974937230,-1913421912}, {949807699,-1926019561}, {924515422,-1938287195},
+{899064965,-1950222603}, {873460227,-1961823959}, {847705824,-1973089164},
+{821806407,-1984016190}, {795765941,-1994603364}, {769589125,-2004848771},
+{743280682,-2014750566}, {716844642,-2024307233}, {690286016,-2033516961},
+{663608871,-2042378339}, {636818019,-2050889764}, {609918296,-2059049705},
+{582913822,-2066856911}, {555809715,-2074309903}, {528610126,-2081407540},
+{501319962,-2088148536}, {473944148,-2094531680}, {446486876,-2100555994},
+{418953102,-2106220386}, {391347792,-2111523838}, {363675176,-2116465540},
+{335940246,-2121044593}, {308148006,-2125260177}, {280302715,-2129111646},
+{252409648,-2132598271}, {224473078,-2135719516}, {196498046,-2138474814},
+{168489600,-2140863674}, {140452029,-2142885728}, {112390647,-2144540593},
+{84309753,-2145828017}, {56214412,-2146747762}, {28109695,-2147299667},
+{2147483647,0}, {2146747758,-56214570}, {2144540595,-112390613},
+{2140863671,-168489630}, {2135719506,-224473172}, {2129111626,-280302871},
+{2121044558,-335940465}, {2111523833,-391347822}, {2100555974,-446486968},
+{2088148500,-501320115}, {2074309912,-555809682}, {2059049696,-609918325},
+{2042378310,-663608960}, {2024307180,-716844791}, {2004848691,-769589332},
+{1984016179,-821806435}, {1961823921,-873460313}, {1938287127,-924515564},
+{1913421927,-974937199}, {1887245364,-1024690661}, {1859775377,-1073741851},
+{1831030826,-1122057097}, {1801031311,-1169603450}, {1769797456,-1216348214},
+{1737350743,-1262259248}, {1703713340,-1307305194}, {1668908218,-1351455280},
+{1632959307,-1394679144}, {1595891331,-1436947067}, {1557729613,-1478230181},
+{1518500216,-1518500282}, {1478230113,-1557729677}, {1436946998,-1595891394},
+{1394679073,-1632959368}, {1351455207,-1668908277}, {1307305120,-1703713397},
+{1262259172,-1737350799}, {1216348136,-1769797510}, {1169603371,-1801031362},
+{1122057017,-1831030875}, {1073741769,-1859775424}, {1024690635,-1887245378},
+{974937230,-1913421912}, {924515422,-1938287195}, {873460227,-1961823959},
+{821806407,-1984016190}, {769589125,-2004848771}, {716844642,-2024307233},
+{663608871,-2042378339}, {609918296,-2059049705}, {555809715,-2074309903},
+{501319962,-2088148536}, {446486876,-2100555994}, {391347792,-2111523838},
+{335940246,-2121044593}, {280302715,-2129111646}, {224473078,-2135719516},
+{168489600,-2140863674}, {112390647,-2144540593}, {56214412,-2146747762},
+{-94,-2147483647}, {-56214600,-2146747757}, {-112390835,-2144540584},
+{-168489787,-2140863659}, {-224473265,-2135719496}, {-280302901,-2129111622},
+{-335940431,-2121044564}, {-391347977,-2111523804}, {-446487060,-2100555955},
+{-501320144,-2088148493}, {-555809896,-2074309855}, {-609918476,-2059049651},
+{-663609049,-2042378281}, {-716844819,-2024307170}, {-769589300,-2004848703},
+{-821806581,-1984016118}, {-873460398,-1961823883}, {-924515591,-1938287114},
+{-974937397,-1913421827}, {-1024690575,-1887245411}, {-1073741932,-1859775330},
+{-1122057395,-1831030643}, {-1169603421,-1801031330}, {-1216348291,-1769797403},
+{-1262259116,-1737350839}, {-1307305268,-1703713283}, {-1351455453,-1668908078},
+{-1394679021,-1632959413}, {-1436947137,-1595891268}, {-1478230435,-1557729372},
+{-1518500258,-1518500240}, {-1557729742,-1478230045}, {-1595891628,-1436946738},
+{-1632959429,-1394679001}, {-1668908417,-1351455035}, {-1703713298,-1307305248},
+{-1737350854,-1262259096}, {-1769797708,-1216347848}, {-1801031344,-1169603400},
+{-1831030924,-1122056937}, {-1859775343,-1073741910}, {-1887245423,-1024690552},
+{-1913422071,-974936918}, {-1938287125,-924515568}, {-1961823997,-873460141},
+{-1984016324,-821806084}, {-2004848713,-769589276}, {-2024307264,-716844553},
+{-2042378447,-663608538}, {-2059049731,-609918206}, {-2074309994,-555809377},
+{-2088148499,-501320119}, {-2100556013,-446486785}, {-2111523902,-391347448},
+{-2121044568,-335940406}, {-2129111659,-280302621}, {-2135719499,-224473240},
+{-2140863681,-168489506}, {-2144540612,-112390298}, {-2146747758,-56214574},
+{2147483647,0}, {2145828015,-84309815}, {2140863671,-168489630},
+{2132598271,-252409646}, {2121044558,-335940465}, {2106220349,-418953288},
+{2088148500,-501320115}, {2066856885,-582913912}, {2042378310,-663608960},
+{2014750533,-743280770}, {1984016179,-821806435}, {1950222618,-899064934},
+{1913421927,-974937199}, {1873670877,-1049306180}, {1831030826,-1122057097},
+{1785567394,-1193077993}, {1737350743,-1262259248}, {1686455222,-1329494189},
+{1632959307,-1394679144}, {1576945572,-1457713510}, {1518500216,-1518500282},
+{1457713441,-1576945636}, {1394679073,-1632959368}, {1329494115,-1686455280},
+{1262259172,-1737350799}, {1193077915,-1785567446}, {1122057017,-1831030875},
+{1049305987,-1873670985}, {974937230,-1913421912}, {899064965,-1950222603},
+{821806407,-1984016190}, {743280682,-2014750566}, {663608871,-2042378339},
+{582913822,-2066856911}, {501319962,-2088148536}, {418953102,-2106220386},
+{335940246,-2121044593}, {252409648,-2132598271}, {168489600,-2140863674},
+{84309753,-2145828017}, {-94,-2147483647}, {-84309940,-2145828010},
+{-168489787,-2140863659}, {-252409834,-2132598249}, {-335940431,-2121044564},
+{-418953286,-2106220349}, {-501320144,-2088148493}, {-582914003,-2066856860},
+{-663609049,-2042378281}, {-743280858,-2014750501}, {-821806581,-1984016118},
+{-899065136,-1950222525}, {-974937397,-1913421827}, {-1049306374,-1873670768},
+{-1122057395,-1831030643}, {-1193078284,-1785567199}, {-1262259116,-1737350839},
+{-1329494061,-1686455323}, {-1394679021,-1632959413}, {-1457713485,-1576945595},
+{-1518500258,-1518500240}, {-1576945613,-1457713466}, {-1632959429,-1394679001},
+{-1686455338,-1329494041}, {-1737350854,-1262259096}, {-1785567498,-1193077837},
+{-1831030924,-1122056937}, {-1873671031,-1049305905}, {-1913422071,-974936918},
+{-1950222750,-899064648}, {-1984016324,-821806084}, {-2014750687,-743280354},
+{-2042378447,-663608538}, {-2066856867,-582913978}, {-2088148499,-501320119},
+{-2106220354,-418953261}, {-2121044568,-335940406}, {-2132598282,-252409555},
+{-2140863681,-168489506}, {-2145828021,-84309659}, {-2147483647,188},
+{-2145828006,84310034}, {-2140863651,168489881}, {-2132598237,252409928},
+{-2121044509,335940777}, {-2106220281,418953629}, {-2088148411,501320484},
+{-2066856765,582914339}, {-2042378331,663608895}, {-2014750557,743280706},
+{-1984016181,821806431}, {-1950222593,899064989}, {-1913421900,974937252},
+{-1873670848,1049306232}, {-1831030728,1122057257}, {-1785567289,1193078149},
+{-1737350633,1262259400}, {-1686455106,1329494336}, {-1632959185,1394679287},
+{-1576945358,1457713742}, {-1518499993,1518500506}, {-1457713209,1576945850},
+{-1394678735,1632959656}, {-1329493766,1686455555}, {-1262258813,1737351059},
+{-1193077546,1785567692}, {-1122056638,1831031107}, {-1049305599,1873671202},
+{-974936606,1913422229}, {-899064330,1950222896}, {-821805761,1984016458},
+{-743280025,2014750808}, {-663609179,2042378239}, {-582914134,2066856823},
+{-501320277,2088148461}, {-418953420,2106220322}, {-335940566,2121044542},
+{-252409716,2132598263}, {-168489668,2140863668}, {-84309821,2145828015},
+};
+static const ne10_fft_cpx_int32_t ne10_twiddles_240[240] = {
+{0,0}, {2147483647,0}, {2147483647,0},
+{2147483647,0}, {1961823921,-873460313}, {1436946998,-1595891394},
+{2147483647,0}, {1436946998,-1595891394}, {-224473265,-2135719496},
+{2147483647,0}, {663608871,-2042378339}, {-1737350854,-1262259096},
+{2147483647,0}, {-224473265,-2135719496}, {-2100555935,446487152},
+{2147483647,0}, {2135719506,-224473172}, {2100555974,-446486968},
+{2042378310,-663608960}, {1961823921,-873460313}, {1859775377,-1073741851},
+{1737350743,-1262259248}, {1595891331,-1436947067}, {1436946998,-1595891394},
+{1262259172,-1737350799}, {1073741769,-1859775424}, {873460227,-1961823959},
+{663608871,-2042378339}, {446486876,-2100555994}, {224473078,-2135719516},
+{2147483647,0}, {2100555974,-446486968}, {1961823921,-873460313},
+{1737350743,-1262259248}, {1436946998,-1595891394}, {1073741769,-1859775424},
+{663608871,-2042378339}, {224473078,-2135719516}, {-224473265,-2135719496},
+{-663609049,-2042378281}, {-1073741932,-1859775330}, {-1436947137,-1595891268},
+{-1737350854,-1262259096}, {-1961823997,-873460141}, {-2100556013,-446486785},
+{2147483647,0}, {2042378310,-663608960}, {1737350743,-1262259248},
+{1262259172,-1737350799}, {663608871,-2042378339}, {-94,-2147483647},
+{-663609049,-2042378281}, {-1262259116,-1737350839}, {-1737350854,-1262259096},
+{-2042378447,-663608538}, {-2147483647,188}, {-2042378331,663608895},
+{-1737350633,1262259400}, {-1262258813,1737351059}, {-663609179,2042378239},
+{2147483647,0}, {2146747758,-56214570}, {2144540595,-112390613},
+{2140863671,-168489630}, {2135719506,-224473172}, {2129111626,-280302871},
+{2121044558,-335940465}, {2111523833,-391347822}, {2100555974,-446486968},
+{2088148500,-501320115}, {2074309912,-555809682}, {2059049696,-609918325},
+{2042378310,-663608960}, {2024307180,-716844791}, {2004848691,-769589332},
+{1984016179,-821806435}, {1961823921,-873460313}, {1938287127,-924515564},
+{1913421927,-974937199}, {1887245364,-1024690661}, {1859775377,-1073741851},
+{1831030826,-1122057097}, {1801031311,-1169603450}, {1769797456,-1216348214},
+{1737350743,-1262259248}, {1703713340,-1307305194}, {1668908218,-1351455280},
+{1632959307,-1394679144}, {1595891331,-1436947067}, {1557729613,-1478230181},
+{1518500216,-1518500282}, {1478230113,-1557729677}, {1436946998,-1595891394},
+{1394679073,-1632959368}, {1351455207,-1668908277}, {1307305120,-1703713397},
+{1262259172,-1737350799}, {1216348136,-1769797510}, {1169603371,-1801031362},
+{1122057017,-1831030875}, {1073741769,-1859775424}, {1024690635,-1887245378},
+{974937230,-1913421912}, {924515422,-1938287195}, {873460227,-1961823959},
+{821806407,-1984016190}, {769589125,-2004848771}, {716844642,-2024307233},
+{663608871,-2042378339}, {609918296,-2059049705}, {555809715,-2074309903},
+{501319962,-2088148536}, {446486876,-2100555994}, {391347792,-2111523838},
+{335940246,-2121044593}, {280302715,-2129111646}, {224473078,-2135719516},
+{168489600,-2140863674}, {112390647,-2144540593}, {56214412,-2146747762},
+{2147483647,0}, {2144540595,-112390613}, {2135719506,-224473172},
+{2121044558,-335940465}, {2100555974,-446486968}, {2074309912,-555809682},
+{2042378310,-663608960}, {2004848691,-769589332}, {1961823921,-873460313},
+{1913421927,-974937199}, {1859775377,-1073741851}, {1801031311,-1169603450},
+{1737350743,-1262259248}, {1668908218,-1351455280}, {1595891331,-1436947067},
+{1518500216,-1518500282}, {1436946998,-1595891394}, {1351455207,-1668908277},
+{1262259172,-1737350799}, {1169603371,-1801031362}, {1073741769,-1859775424},
+{974937230,-1913421912}, {873460227,-1961823959}, {769589125,-2004848771},
+{663608871,-2042378339}, {555809715,-2074309903}, {446486876,-2100555994},
+{335940246,-2121044593}, {224473078,-2135719516}, {112390647,-2144540593},
+{-94,-2147483647}, {-112390835,-2144540584}, {-224473265,-2135719496},
+{-335940431,-2121044564}, {-446487060,-2100555955}, {-555809896,-2074309855},
+{-663609049,-2042378281}, {-769589300,-2004848703}, {-873460398,-1961823883},
+{-974937397,-1913421827}, {-1073741932,-1859775330}, {-1169603421,-1801031330},
+{-1262259116,-1737350839}, {-1351455453,-1668908078}, {-1436947137,-1595891268},
+{-1518500258,-1518500240}, {-1595891628,-1436946738}, {-1668908417,-1351455035},
+{-1737350854,-1262259096}, {-1801031344,-1169603400}, {-1859775343,-1073741910},
+{-1913422071,-974936918}, {-1961823997,-873460141}, {-2004848713,-769589276},
+{-2042378447,-663608538}, {-2074309994,-555809377}, {-2100556013,-446486785},
+{-2121044568,-335940406}, {-2135719499,-224473240}, {-2144540612,-112390298},
+{2147483647,0}, {2140863671,-168489630}, {2121044558,-335940465},
+{2088148500,-501320115}, {2042378310,-663608960}, {1984016179,-821806435},
+{1913421927,-974937199}, {1831030826,-1122057097}, {1737350743,-1262259248},
+{1632959307,-1394679144}, {1518500216,-1518500282}, {1394679073,-1632959368},
+{1262259172,-1737350799}, {1122057017,-1831030875}, {974937230,-1913421912},
+{821806407,-1984016190}, {663608871,-2042378339}, {501319962,-2088148536},
+{335940246,-2121044593}, {168489600,-2140863674}, {-94,-2147483647},
+{-168489787,-2140863659}, {-335940431,-2121044564}, {-501320144,-2088148493},
+{-663609049,-2042378281}, {-821806581,-1984016118}, {-974937397,-1913421827},
+{-1122057395,-1831030643}, {-1262259116,-1737350839}, {-1394679021,-1632959413},
+{-1518500258,-1518500240}, {-1632959429,-1394679001}, {-1737350854,-1262259096},
+{-1831030924,-1122056937}, {-1913422071,-974936918}, {-1984016324,-821806084},
+{-2042378447,-663608538}, {-2088148499,-501320119}, {-2121044568,-335940406},
+{-2140863681,-168489506}, {-2147483647,188}, {-2140863651,168489881},
+{-2121044509,335940777}, {-2088148411,501320484}, {-2042378331,663608895},
+{-1984016181,821806431}, {-1913421900,974937252}, {-1831030728,1122057257},
+{-1737350633,1262259400}, {-1632959185,1394679287}, {-1518499993,1518500506},
+{-1394678735,1632959656}, {-1262258813,1737351059}, {-1122056638,1831031107},
+{-974936606,1913422229}, {-821805761,1984016458}, {-663609179,2042378239},
+{-501320277,2088148461}, {-335940566,2121044542}, {-168489668,2140863668},
+};
+static const ne10_fft_cpx_int32_t ne10_twiddles_120[120] = {
+{0,0}, {2147483647,0}, {2147483647,0},
+{2147483647,0}, {1961823921,-873460313}, {1436946998,-1595891394},
+{2147483647,0}, {1436946998,-1595891394}, {-224473265,-2135719496},
+{2147483647,0}, {663608871,-2042378339}, {-1737350854,-1262259096},
+{2147483647,0}, {-224473265,-2135719496}, {-2100555935,446487152},
+{2147483647,0}, {2100555974,-446486968}, {1961823921,-873460313},
+{1737350743,-1262259248}, {1436946998,-1595891394}, {1073741769,-1859775424},
+{663608871,-2042378339}, {224473078,-2135719516}, {-224473265,-2135719496},
+{-663609049,-2042378281}, {-1073741932,-1859775330}, {-1436947137,-1595891268},
+{-1737350854,-1262259096}, {-1961823997,-873460141}, {-2100556013,-446486785},
+{2147483647,0}, {2144540595,-112390613}, {2135719506,-224473172},
+{2121044558,-335940465}, {2100555974,-446486968}, {2074309912,-555809682},
+{2042378310,-663608960}, {2004848691,-769589332}, {1961823921,-873460313},
+{1913421927,-974937199}, {1859775377,-1073741851}, {1801031311,-1169603450},
+{1737350743,-1262259248}, {1668908218,-1351455280}, {1595891331,-1436947067},
+{1518500216,-1518500282}, {1436946998,-1595891394}, {1351455207,-1668908277},
+{1262259172,-1737350799}, {1169603371,-1801031362}, {1073741769,-1859775424},
+{974937230,-1913421912}, {873460227,-1961823959}, {769589125,-2004848771},
+{663608871,-2042378339}, {555809715,-2074309903}, {446486876,-2100555994},
+{335940246,-2121044593}, {224473078,-2135719516}, {112390647,-2144540593},
+{2147483647,0}, {2135719506,-224473172}, {2100555974,-446486968},
+{2042378310,-663608960}, {1961823921,-873460313}, {1859775377,-1073741851},
+{1737350743,-1262259248}, {1595891331,-1436947067}, {1436946998,-1595891394},
+{1262259172,-1737350799}, {1073741769,-1859775424}, {873460227,-1961823959},
+{663608871,-2042378339}, {446486876,-2100555994}, {224473078,-2135719516},
+{-94,-2147483647}, {-224473265,-2135719496}, {-446487060,-2100555955},
+{-663609049,-2042378281}, {-873460398,-1961823883}, {-1073741932,-1859775330},
+{-1262259116,-1737350839}, {-1436947137,-1595891268}, {-1595891628,-1436946738},
+{-1737350854,-1262259096}, {-1859775343,-1073741910}, {-1961823997,-873460141},
+{-2042378447,-663608538}, {-2100556013,-446486785}, {-2135719499,-224473240},
+{2147483647,0}, {2121044558,-335940465}, {2042378310,-663608960},
+{1913421927,-974937199}, {1737350743,-1262259248}, {1518500216,-1518500282},
+{1262259172,-1737350799}, {974937230,-1913421912}, {663608871,-2042378339},
+{335940246,-2121044593}, {-94,-2147483647}, {-335940431,-2121044564},
+{-663609049,-2042378281}, {-974937397,-1913421827}, {-1262259116,-1737350839},
+{-1518500258,-1518500240}, {-1737350854,-1262259096}, {-1913422071,-974936918},
+{-2042378447,-663608538}, {-2121044568,-335940406}, {-2147483647,188},
+{-2121044509,335940777}, {-2042378331,663608895}, {-1913421900,974937252},
+{-1737350633,1262259400}, {-1518499993,1518500506}, {-1262258813,1737351059},
+{-974936606,1913422229}, {-663609179,2042378239}, {-335940566,2121044542},
+};
+static const ne10_fft_cpx_int32_t ne10_twiddles_60[60] = {
+{0,0}, {2147483647,0}, {2147483647,0},
+{2147483647,0}, {1961823921,-873460313}, {1436946998,-1595891394},
+{2147483647,0}, {1436946998,-1595891394}, {-224473265,-2135719496},
+{2147483647,0}, {663608871,-2042378339}, {-1737350854,-1262259096},
+{2147483647,0}, {-224473265,-2135719496}, {-2100555935,446487152},
+{2147483647,0}, {2135719506,-224473172}, {2100555974,-446486968},
+{2042378310,-663608960}, {1961823921,-873460313}, {1859775377,-1073741851},
+{1737350743,-1262259248}, {1595891331,-1436947067}, {1436946998,-1595891394},
+{1262259172,-1737350799}, {1073741769,-1859775424}, {873460227,-1961823959},
+{663608871,-2042378339}, {446486876,-2100555994}, {224473078,-2135719516},
+{2147483647,0}, {2100555974,-446486968}, {1961823921,-873460313},
+{1737350743,-1262259248}, {1436946998,-1595891394}, {1073741769,-1859775424},
+{663608871,-2042378339}, {224473078,-2135719516}, {-224473265,-2135719496},
+{-663609049,-2042378281}, {-1073741932,-1859775330}, {-1436947137,-1595891268},
+{-1737350854,-1262259096}, {-1961823997,-873460141}, {-2100556013,-446486785},
+{2147483647,0}, {2042378310,-663608960}, {1737350743,-1262259248},
+{1262259172,-1737350799}, {663608871,-2042378339}, {-94,-2147483647},
+{-663609049,-2042378281}, {-1262259116,-1737350839}, {-1737350854,-1262259096},
+{-2042378447,-663608538}, {-2147483647,188}, {-2042378331,663608895},
+{-1737350633,1262259400}, {-1262258813,1737351059}, {-663609179,2042378239},
+};
+static const ne10_fft_state_int32_t ne10_fft_state_int32_t_480 = {
+120,
+(ne10_int32_t *)ne10_factors_480,
+(ne10_fft_cpx_int32_t *)ne10_twiddles_480,
+NULL,
+(ne10_fft_cpx_int32_t *)&ne10_twiddles_480[120],
+};
+static const arch_fft_state cfg_arch_480 = {
+1,
+(void *)&ne10_fft_state_int32_t_480,
+};
+
+static const ne10_fft_state_int32_t ne10_fft_state_int32_t_240 = {
+60,
+(ne10_int32_t *)ne10_factors_240,
+(ne10_fft_cpx_int32_t *)ne10_twiddles_240,
+NULL,
+(ne10_fft_cpx_int32_t *)&ne10_twiddles_240[60],
+};
+static const arch_fft_state cfg_arch_240 = {
+1,
+(void *)&ne10_fft_state_int32_t_240,
+};
+
+static const ne10_fft_state_int32_t ne10_fft_state_int32_t_120 = {
+30,
+(ne10_int32_t *)ne10_factors_120,
+(ne10_fft_cpx_int32_t *)ne10_twiddles_120,
+NULL,
+(ne10_fft_cpx_int32_t *)&ne10_twiddles_120[30],
+};
+static const arch_fft_state cfg_arch_120 = {
+1,
+(void *)&ne10_fft_state_int32_t_120,
+};
+
+static const ne10_fft_state_int32_t ne10_fft_state_int32_t_60 = {
+15,
+(ne10_int32_t *)ne10_factors_60,
+(ne10_fft_cpx_int32_t *)ne10_twiddles_60,
+NULL,
+(ne10_fft_cpx_int32_t *)&ne10_twiddles_60[15],
+};
+static const arch_fft_state cfg_arch_60 = {
+1,
+(void *)&ne10_fft_state_int32_t_60,
+};
+
+#endif  /* end NE10_FFT_PARAMS48000_960 */
diff --git a/celt/static_modes_float.h b/celt/static_modes_float.h
index 5d7e7b8..e102a38 100644
--- a/celt/static_modes_float.h
+++ b/celt/static_modes_float.h
@@ -4,6 +4,11 @@
 #include "modes.h"
 #include "rate.h"
 
+#ifdef HAVE_ARM_NE10
+#define OVERRIDE_FFT 1
+#include "static_modes_float_arm_ne10.h"
+#endif
+
 #ifndef DEF_WINDOW120
 #define DEF_WINDOW120
 static const opus_val16 window120[120] = {
@@ -341,84 +346,84 @@
 #ifndef FFT_BITREV480
 #define FFT_BITREV480
 static const opus_int16 fft_bitrev480[480] = {
-0, 120, 240, 360, 30, 150, 270, 390, 60, 180, 300, 420, 90, 210, 330,
-450, 15, 135, 255, 375, 45, 165, 285, 405, 75, 195, 315, 435, 105, 225,
-345, 465, 5, 125, 245, 365, 35, 155, 275, 395, 65, 185, 305, 425, 95,
-215, 335, 455, 20, 140, 260, 380, 50, 170, 290, 410, 80, 200, 320, 440,
-110, 230, 350, 470, 10, 130, 250, 370, 40, 160, 280, 400, 70, 190, 310,
-430, 100, 220, 340, 460, 25, 145, 265, 385, 55, 175, 295, 415, 85, 205,
-325, 445, 115, 235, 355, 475, 1, 121, 241, 361, 31, 151, 271, 391, 61,
-181, 301, 421, 91, 211, 331, 451, 16, 136, 256, 376, 46, 166, 286, 406,
-76, 196, 316, 436, 106, 226, 346, 466, 6, 126, 246, 366, 36, 156, 276,
-396, 66, 186, 306, 426, 96, 216, 336, 456, 21, 141, 261, 381, 51, 171,
-291, 411, 81, 201, 321, 441, 111, 231, 351, 471, 11, 131, 251, 371, 41,
-161, 281, 401, 71, 191, 311, 431, 101, 221, 341, 461, 26, 146, 266, 386,
-56, 176, 296, 416, 86, 206, 326, 446, 116, 236, 356, 476, 2, 122, 242,
-362, 32, 152, 272, 392, 62, 182, 302, 422, 92, 212, 332, 452, 17, 137,
-257, 377, 47, 167, 287, 407, 77, 197, 317, 437, 107, 227, 347, 467, 7,
-127, 247, 367, 37, 157, 277, 397, 67, 187, 307, 427, 97, 217, 337, 457,
-22, 142, 262, 382, 52, 172, 292, 412, 82, 202, 322, 442, 112, 232, 352,
-472, 12, 132, 252, 372, 42, 162, 282, 402, 72, 192, 312, 432, 102, 222,
-342, 462, 27, 147, 267, 387, 57, 177, 297, 417, 87, 207, 327, 447, 117,
-237, 357, 477, 3, 123, 243, 363, 33, 153, 273, 393, 63, 183, 303, 423,
-93, 213, 333, 453, 18, 138, 258, 378, 48, 168, 288, 408, 78, 198, 318,
-438, 108, 228, 348, 468, 8, 128, 248, 368, 38, 158, 278, 398, 68, 188,
-308, 428, 98, 218, 338, 458, 23, 143, 263, 383, 53, 173, 293, 413, 83,
-203, 323, 443, 113, 233, 353, 473, 13, 133, 253, 373, 43, 163, 283, 403,
-73, 193, 313, 433, 103, 223, 343, 463, 28, 148, 268, 388, 58, 178, 298,
-418, 88, 208, 328, 448, 118, 238, 358, 478, 4, 124, 244, 364, 34, 154,
-274, 394, 64, 184, 304, 424, 94, 214, 334, 454, 19, 139, 259, 379, 49,
-169, 289, 409, 79, 199, 319, 439, 109, 229, 349, 469, 9, 129, 249, 369,
-39, 159, 279, 399, 69, 189, 309, 429, 99, 219, 339, 459, 24, 144, 264,
-384, 54, 174, 294, 414, 84, 204, 324, 444, 114, 234, 354, 474, 14, 134,
-254, 374, 44, 164, 284, 404, 74, 194, 314, 434, 104, 224, 344, 464, 29,
-149, 269, 389, 59, 179, 299, 419, 89, 209, 329, 449, 119, 239, 359, 479,
+0, 96, 192, 288, 384, 32, 128, 224, 320, 416, 64, 160, 256, 352, 448,
+8, 104, 200, 296, 392, 40, 136, 232, 328, 424, 72, 168, 264, 360, 456,
+16, 112, 208, 304, 400, 48, 144, 240, 336, 432, 80, 176, 272, 368, 464,
+24, 120, 216, 312, 408, 56, 152, 248, 344, 440, 88, 184, 280, 376, 472,
+4, 100, 196, 292, 388, 36, 132, 228, 324, 420, 68, 164, 260, 356, 452,
+12, 108, 204, 300, 396, 44, 140, 236, 332, 428, 76, 172, 268, 364, 460,
+20, 116, 212, 308, 404, 52, 148, 244, 340, 436, 84, 180, 276, 372, 468,
+28, 124, 220, 316, 412, 60, 156, 252, 348, 444, 92, 188, 284, 380, 476,
+1, 97, 193, 289, 385, 33, 129, 225, 321, 417, 65, 161, 257, 353, 449,
+9, 105, 201, 297, 393, 41, 137, 233, 329, 425, 73, 169, 265, 361, 457,
+17, 113, 209, 305, 401, 49, 145, 241, 337, 433, 81, 177, 273, 369, 465,
+25, 121, 217, 313, 409, 57, 153, 249, 345, 441, 89, 185, 281, 377, 473,
+5, 101, 197, 293, 389, 37, 133, 229, 325, 421, 69, 165, 261, 357, 453,
+13, 109, 205, 301, 397, 45, 141, 237, 333, 429, 77, 173, 269, 365, 461,
+21, 117, 213, 309, 405, 53, 149, 245, 341, 437, 85, 181, 277, 373, 469,
+29, 125, 221, 317, 413, 61, 157, 253, 349, 445, 93, 189, 285, 381, 477,
+2, 98, 194, 290, 386, 34, 130, 226, 322, 418, 66, 162, 258, 354, 450,
+10, 106, 202, 298, 394, 42, 138, 234, 330, 426, 74, 170, 266, 362, 458,
+18, 114, 210, 306, 402, 50, 146, 242, 338, 434, 82, 178, 274, 370, 466,
+26, 122, 218, 314, 410, 58, 154, 250, 346, 442, 90, 186, 282, 378, 474,
+6, 102, 198, 294, 390, 38, 134, 230, 326, 422, 70, 166, 262, 358, 454,
+14, 110, 206, 302, 398, 46, 142, 238, 334, 430, 78, 174, 270, 366, 462,
+22, 118, 214, 310, 406, 54, 150, 246, 342, 438, 86, 182, 278, 374, 470,
+30, 126, 222, 318, 414, 62, 158, 254, 350, 446, 94, 190, 286, 382, 478,
+3, 99, 195, 291, 387, 35, 131, 227, 323, 419, 67, 163, 259, 355, 451,
+11, 107, 203, 299, 395, 43, 139, 235, 331, 427, 75, 171, 267, 363, 459,
+19, 115, 211, 307, 403, 51, 147, 243, 339, 435, 83, 179, 275, 371, 467,
+27, 123, 219, 315, 411, 59, 155, 251, 347, 443, 91, 187, 283, 379, 475,
+7, 103, 199, 295, 391, 39, 135, 231, 327, 423, 71, 167, 263, 359, 455,
+15, 111, 207, 303, 399, 47, 143, 239, 335, 431, 79, 175, 271, 367, 463,
+23, 119, 215, 311, 407, 55, 151, 247, 343, 439, 87, 183, 279, 375, 471,
+31, 127, 223, 319, 415, 63, 159, 255, 351, 447, 95, 191, 287, 383, 479,
 };
 #endif
 
 #ifndef FFT_BITREV240
 #define FFT_BITREV240
 static const opus_int16 fft_bitrev240[240] = {
-0, 60, 120, 180, 15, 75, 135, 195, 30, 90, 150, 210, 45, 105, 165,
-225, 5, 65, 125, 185, 20, 80, 140, 200, 35, 95, 155, 215, 50, 110,
-170, 230, 10, 70, 130, 190, 25, 85, 145, 205, 40, 100, 160, 220, 55,
-115, 175, 235, 1, 61, 121, 181, 16, 76, 136, 196, 31, 91, 151, 211,
-46, 106, 166, 226, 6, 66, 126, 186, 21, 81, 141, 201, 36, 96, 156,
-216, 51, 111, 171, 231, 11, 71, 131, 191, 26, 86, 146, 206, 41, 101,
-161, 221, 56, 116, 176, 236, 2, 62, 122, 182, 17, 77, 137, 197, 32,
-92, 152, 212, 47, 107, 167, 227, 7, 67, 127, 187, 22, 82, 142, 202,
-37, 97, 157, 217, 52, 112, 172, 232, 12, 72, 132, 192, 27, 87, 147,
-207, 42, 102, 162, 222, 57, 117, 177, 237, 3, 63, 123, 183, 18, 78,
-138, 198, 33, 93, 153, 213, 48, 108, 168, 228, 8, 68, 128, 188, 23,
-83, 143, 203, 38, 98, 158, 218, 53, 113, 173, 233, 13, 73, 133, 193,
-28, 88, 148, 208, 43, 103, 163, 223, 58, 118, 178, 238, 4, 64, 124,
-184, 19, 79, 139, 199, 34, 94, 154, 214, 49, 109, 169, 229, 9, 69,
-129, 189, 24, 84, 144, 204, 39, 99, 159, 219, 54, 114, 174, 234, 14,
-74, 134, 194, 29, 89, 149, 209, 44, 104, 164, 224, 59, 119, 179, 239,
+0, 48, 96, 144, 192, 16, 64, 112, 160, 208, 32, 80, 128, 176, 224,
+4, 52, 100, 148, 196, 20, 68, 116, 164, 212, 36, 84, 132, 180, 228,
+8, 56, 104, 152, 200, 24, 72, 120, 168, 216, 40, 88, 136, 184, 232,
+12, 60, 108, 156, 204, 28, 76, 124, 172, 220, 44, 92, 140, 188, 236,
+1, 49, 97, 145, 193, 17, 65, 113, 161, 209, 33, 81, 129, 177, 225,
+5, 53, 101, 149, 197, 21, 69, 117, 165, 213, 37, 85, 133, 181, 229,
+9, 57, 105, 153, 201, 25, 73, 121, 169, 217, 41, 89, 137, 185, 233,
+13, 61, 109, 157, 205, 29, 77, 125, 173, 221, 45, 93, 141, 189, 237,
+2, 50, 98, 146, 194, 18, 66, 114, 162, 210, 34, 82, 130, 178, 226,
+6, 54, 102, 150, 198, 22, 70, 118, 166, 214, 38, 86, 134, 182, 230,
+10, 58, 106, 154, 202, 26, 74, 122, 170, 218, 42, 90, 138, 186, 234,
+14, 62, 110, 158, 206, 30, 78, 126, 174, 222, 46, 94, 142, 190, 238,
+3, 51, 99, 147, 195, 19, 67, 115, 163, 211, 35, 83, 131, 179, 227,
+7, 55, 103, 151, 199, 23, 71, 119, 167, 215, 39, 87, 135, 183, 231,
+11, 59, 107, 155, 203, 27, 75, 123, 171, 219, 43, 91, 139, 187, 235,
+15, 63, 111, 159, 207, 31, 79, 127, 175, 223, 47, 95, 143, 191, 239,
 };
 #endif
 
 #ifndef FFT_BITREV120
 #define FFT_BITREV120
 static const opus_int16 fft_bitrev120[120] = {
-0, 30, 60, 90, 15, 45, 75, 105, 5, 35, 65, 95, 20, 50, 80,
-110, 10, 40, 70, 100, 25, 55, 85, 115, 1, 31, 61, 91, 16, 46,
-76, 106, 6, 36, 66, 96, 21, 51, 81, 111, 11, 41, 71, 101, 26,
-56, 86, 116, 2, 32, 62, 92, 17, 47, 77, 107, 7, 37, 67, 97,
-22, 52, 82, 112, 12, 42, 72, 102, 27, 57, 87, 117, 3, 33, 63,
-93, 18, 48, 78, 108, 8, 38, 68, 98, 23, 53, 83, 113, 13, 43,
-73, 103, 28, 58, 88, 118, 4, 34, 64, 94, 19, 49, 79, 109, 9,
-39, 69, 99, 24, 54, 84, 114, 14, 44, 74, 104, 29, 59, 89, 119,
+0, 24, 48, 72, 96, 8, 32, 56, 80, 104, 16, 40, 64, 88, 112,
+4, 28, 52, 76, 100, 12, 36, 60, 84, 108, 20, 44, 68, 92, 116,
+1, 25, 49, 73, 97, 9, 33, 57, 81, 105, 17, 41, 65, 89, 113,
+5, 29, 53, 77, 101, 13, 37, 61, 85, 109, 21, 45, 69, 93, 117,
+2, 26, 50, 74, 98, 10, 34, 58, 82, 106, 18, 42, 66, 90, 114,
+6, 30, 54, 78, 102, 14, 38, 62, 86, 110, 22, 46, 70, 94, 118,
+3, 27, 51, 75, 99, 11, 35, 59, 83, 107, 19, 43, 67, 91, 115,
+7, 31, 55, 79, 103, 15, 39, 63, 87, 111, 23, 47, 71, 95, 119,
 };
 #endif
 
 #ifndef FFT_BITREV60
 #define FFT_BITREV60
 static const opus_int16 fft_bitrev60[60] = {
-0, 15, 30, 45, 5, 20, 35, 50, 10, 25, 40, 55, 1, 16, 31,
-46, 6, 21, 36, 51, 11, 26, 41, 56, 2, 17, 32, 47, 7, 22,
-37, 52, 12, 27, 42, 57, 3, 18, 33, 48, 8, 23, 38, 53, 13,
-28, 43, 58, 4, 19, 34, 49, 9, 24, 39, 54, 14, 29, 44, 59,
+0, 12, 24, 36, 48, 4, 16, 28, 40, 52, 8, 20, 32, 44, 56,
+1, 13, 25, 37, 49, 5, 17, 29, 41, 53, 9, 21, 33, 45, 57,
+2, 14, 26, 38, 50, 6, 18, 30, 42, 54, 10, 22, 34, 46, 58,
+3, 15, 27, 39, 51, 7, 19, 31, 43, 55, 11, 23, 35, 47, 59,
 };
 #endif
 
@@ -428,9 +433,14 @@
 480,    /* nfft */
 0.002083333f,   /* scale */
 -1,     /* shift */
-{4, 120, 4, 30, 2, 15, 3, 5, 5, 1, 0, 0, 0, 0, 0, 0, }, /* factors */
+{5, 96, 3, 32, 4, 8, 2, 4, 4, 1, 0, 0, 0, 0, 0, 0, },   /* factors */
 fft_bitrev480,  /* bitrev */
 fft_twiddles48000_960,  /* bitrev */
+#ifdef OVERRIDE_FFT
+(arch_fft_state *)&cfg_arch_480,
+#else
+NULL,
+#endif
 };
 #endif
 
@@ -440,9 +450,14 @@
 240,    /* nfft */
 0.004166667f,   /* scale */
 1,      /* shift */
-{4, 60, 4, 15, 3, 5, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, },   /* factors */
+{5, 48, 3, 16, 4, 4, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, },    /* factors */
 fft_bitrev240,  /* bitrev */
 fft_twiddles48000_960,  /* bitrev */
+#ifdef OVERRIDE_FFT
+(arch_fft_state *)&cfg_arch_240,
+#else
+NULL,
+#endif
 };
 #endif
 
@@ -452,9 +467,14 @@
 120,    /* nfft */
 0.008333333f,   /* scale */
 2,      /* shift */
-{4, 30, 2, 15, 3, 5, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, },   /* factors */
+{5, 24, 3, 8, 2, 4, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, },    /* factors */
 fft_bitrev120,  /* bitrev */
 fft_twiddles48000_960,  /* bitrev */
+#ifdef OVERRIDE_FFT
+(arch_fft_state *)&cfg_arch_120,
+#else
+NULL,
+#endif
 };
 #endif
 
@@ -464,9 +484,14 @@
 60,     /* nfft */
 0.016666667f,   /* scale */
 3,      /* shift */
-{4, 15, 3, 5, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },    /* factors */
+{5, 12, 3, 4, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },    /* factors */
 fft_bitrev60,   /* bitrev */
 fft_twiddles48000_960,  /* bitrev */
+#ifdef OVERRIDE_FFT
+(arch_fft_state *)&cfg_arch_60,
+#else
+NULL,
+#endif
 };
 #endif
 
@@ -474,104 +499,368 @@
 
 #ifndef MDCT_TWIDDLES960
 #define MDCT_TWIDDLES960
-static const opus_val16 mdct_twiddles960[481] = {
-1.0000000f, 0.99999465f, 0.99997858f, 0.99995181f, 0.99991433f,
-0.99986614f, 0.99980724f, 0.99973764f, 0.99965732f, 0.99956631f,
-0.99946459f, 0.99935216f, 0.99922904f, 0.99909521f, 0.99895068f,
-0.99879546f, 0.99862953f, 0.99845292f, 0.99826561f, 0.99806761f,
-0.99785892f, 0.99763955f, 0.99740949f, 0.99716875f, 0.99691733f,
-0.99665524f, 0.99638247f, 0.99609903f, 0.99580493f, 0.99550016f,
-0.99518473f, 0.99485864f, 0.99452190f, 0.99417450f, 0.99381646f,
-0.99344778f, 0.99306846f, 0.99267850f, 0.99227791f, 0.99186670f,
-0.99144486f, 0.99101241f, 0.99056934f, 0.99011566f, 0.98965139f,
-0.98917651f, 0.98869104f, 0.98819498f, 0.98768834f, 0.98717112f,
-0.98664333f, 0.98610497f, 0.98555606f, 0.98499659f, 0.98442657f,
-0.98384600f, 0.98325491f, 0.98265328f, 0.98204113f, 0.98141846f,
-0.98078528f, 0.98014159f, 0.97948742f, 0.97882275f, 0.97814760f,
-0.97746197f, 0.97676588f, 0.97605933f, 0.97534232f, 0.97461487f,
-0.97387698f, 0.97312866f, 0.97236992f, 0.97160077f, 0.97082121f,
-0.97003125f, 0.96923091f, 0.96842019f, 0.96759909f, 0.96676764f,
-0.96592582f, 0.96507367f, 0.96421118f, 0.96333837f, 0.96245523f,
-0.96156180f, 0.96065806f, 0.95974403f, 0.95881973f, 0.95788517f,
-0.95694034f, 0.95598526f, 0.95501995f, 0.95404440f, 0.95305864f,
-0.95206267f, 0.95105651f, 0.95004016f, 0.94901364f, 0.94797697f,
-0.94693013f, 0.94587315f, 0.94480604f, 0.94372882f, 0.94264149f,
-0.94154406f, 0.94043656f, 0.93931897f, 0.93819133f, 0.93705365f,
-0.93590592f, 0.93474818f, 0.93358042f, 0.93240268f, 0.93121493f,
-0.93001722f, 0.92880955f, 0.92759193f, 0.92636438f, 0.92512690f,
-0.92387953f, 0.92262225f, 0.92135509f, 0.92007809f, 0.91879121f,
-0.91749449f, 0.91618795f, 0.91487161f, 0.91354545f, 0.91220952f,
-0.91086382f, 0.90950836f, 0.90814316f, 0.90676824f, 0.90538363f,
-0.90398929f, 0.90258528f, 0.90117161f, 0.89974828f, 0.89831532f,
-0.89687273f, 0.89542055f, 0.89395877f, 0.89248742f, 0.89100652f,
-0.88951606f, 0.88801610f, 0.88650661f, 0.88498764f, 0.88345918f,
-0.88192125f, 0.88037390f, 0.87881711f, 0.87725090f, 0.87567531f,
-0.87409035f, 0.87249599f, 0.87089232f, 0.86927933f, 0.86765699f,
-0.86602540f, 0.86438453f, 0.86273437f, 0.86107503f, 0.85940641f,
-0.85772862f, 0.85604161f, 0.85434547f, 0.85264014f, 0.85092572f,
-0.84920218f, 0.84746955f, 0.84572781f, 0.84397704f, 0.84221721f,
-0.84044838f, 0.83867056f, 0.83688375f, 0.83508799f, 0.83328325f,
-0.83146961f, 0.82964704f, 0.82781562f, 0.82597530f, 0.82412620f,
-0.82226820f, 0.82040144f, 0.81852589f, 0.81664154f, 0.81474847f,
-0.81284665f, 0.81093620f, 0.80901698f, 0.80708914f, 0.80515262f,
-0.80320752f, 0.80125378f, 0.79929149f, 0.79732067f, 0.79534125f,
-0.79335335f, 0.79135691f, 0.78935204f, 0.78733867f, 0.78531691f,
-0.78328674f, 0.78124818f, 0.77920122f, 0.77714595f, 0.77508232f,
-0.77301043f, 0.77093026f, 0.76884183f, 0.76674517f, 0.76464026f,
-0.76252720f, 0.76040593f, 0.75827656f, 0.75613907f, 0.75399349f,
-0.75183978f, 0.74967807f, 0.74750833f, 0.74533054f, 0.74314481f,
-0.74095112f, 0.73874950f, 0.73653993f, 0.73432251f, 0.73209718f,
-0.72986405f, 0.72762307f, 0.72537438f, 0.72311787f, 0.72085359f,
-0.71858162f, 0.71630192f, 0.71401459f, 0.71171956f, 0.70941701f,
-0.70710677f, 0.70478900f, 0.70246363f, 0.70013079f, 0.69779041f,
-0.69544260f, 0.69308738f, 0.69072466f, 0.68835458f, 0.68597709f,
-0.68359229f, 0.68120013f, 0.67880072f, 0.67639404f, 0.67398011f,
-0.67155892f, 0.66913059f, 0.66669509f, 0.66425240f, 0.66180265f,
-0.65934581f, 0.65688191f, 0.65441092f, 0.65193298f, 0.64944801f,
-0.64695613f, 0.64445727f, 0.64195160f, 0.63943902f, 0.63691954f,
-0.63439328f, 0.63186019f, 0.62932037f, 0.62677377f, 0.62422055f,
-0.62166055f, 0.61909394f, 0.61652065f, 0.61394081f, 0.61135435f,
-0.60876139f, 0.60616195f, 0.60355593f, 0.60094349f, 0.59832457f,
-0.59569929f, 0.59306758f, 0.59042957f, 0.58778523f, 0.58513460f,
-0.58247766f, 0.57981452f, 0.57714518f, 0.57446961f, 0.57178793f,
-0.56910013f, 0.56640624f, 0.56370623f, 0.56100023f, 0.55828818f,
-0.55557020f, 0.55284627f, 0.55011641f, 0.54738067f, 0.54463901f,
-0.54189157f, 0.53913828f, 0.53637921f, 0.53361450f, 0.53084398f,
-0.52806787f, 0.52528601f, 0.52249852f, 0.51970543f, 0.51690688f,
-0.51410279f, 0.51129310f, 0.50847793f, 0.50565732f, 0.50283139f,
-0.49999997f, 0.49716321f, 0.49432122f, 0.49147383f, 0.48862118f,
-0.48576340f, 0.48290042f, 0.48003216f, 0.47715876f, 0.47428025f,
-0.47139677f, 0.46850813f, 0.46561448f, 0.46271584f, 0.45981235f,
-0.45690383f, 0.45399042f, 0.45107214f, 0.44814915f, 0.44522124f,
-0.44228868f, 0.43935137f, 0.43640926f, 0.43346247f, 0.43051104f,
-0.42755511f, 0.42459449f, 0.42162932f, 0.41865964f, 0.41568558f,
-0.41270697f, 0.40972393f, 0.40673661f, 0.40374494f, 0.40074884f,
-0.39774844f, 0.39474390f, 0.39173501f, 0.38872193f, 0.38570469f,
-0.38268343f, 0.37965796f, 0.37662842f, 0.37359496f, 0.37055739f,
-0.36751585f, 0.36447038f, 0.36142122f, 0.35836797f, 0.35531089f,
-0.35225000f, 0.34918544f, 0.34611704f, 0.34304493f, 0.33996926f,
-0.33688983f, 0.33380680f, 0.33072019f, 0.32763015f, 0.32453650f,
-0.32143936f, 0.31833890f, 0.31523503f, 0.31212767f, 0.30901696f,
-0.30590306f, 0.30278577f, 0.29966524f, 0.29654150f, 0.29341470f,
-0.29028464f, 0.28715147f, 0.28401522f, 0.28087605f, 0.27773376f,
-0.27458861f, 0.27144052f, 0.26828940f, 0.26513541f, 0.26197859f,
-0.25881907f, 0.25565666f, 0.25249152f, 0.24932367f, 0.24615327f,
-0.24298012f, 0.23980436f, 0.23662604f, 0.23344530f, 0.23026206f,
-0.22707623f, 0.22388809f, 0.22069744f, 0.21750443f, 0.21430908f,
-0.21111156f, 0.20791165f, 0.20470953f, 0.20150520f, 0.19829884f,
-0.19509024f, 0.19187955f, 0.18866692f, 0.18545227f, 0.18223552f,
-0.17901681f, 0.17579631f, 0.17257380f, 0.16934945f, 0.16612328f,
-0.16289546f, 0.15966577f, 0.15643437f, 0.15320141f, 0.14996669f,
-0.14673037f, 0.14349260f, 0.14025329f, 0.13701235f, 0.13376995f,
-0.13052612f, 0.12728101f, 0.12403442f, 0.12078650f, 0.11753740f,
-0.11428693f, 0.11103523f, 0.10778234f, 0.10452842f, 0.10127326f,
-0.098017137f, 0.094759842f, 0.091501652f, 0.088242363f, 0.084982129f,
-0.081721103f, 0.078459084f, 0.075196224f, 0.071932560f, 0.068668243f,
-0.065403073f, 0.062137201f, 0.058870665f, 0.055603617f, 0.052335974f,
-0.049067651f, 0.045798921f, 0.042529582f, 0.039259788f, 0.035989573f,
-0.032719092f, 0.029448142f, 0.026176876f, 0.022905329f, 0.019633657f,
-0.016361655f, 0.013089478f, 0.0098171604f, 0.0065449764f, 0.0032724839f,
--4.3711390e-08f, };
+static const opus_val16 mdct_twiddles960[1800] = {
+0.99999994f, 0.99999321f, 0.99997580f, 0.99994773f, 0.99990886f,
+0.99985933f, 0.99979913f, 0.99972820f, 0.99964654f, 0.99955416f,
+0.99945110f, 0.99933738f, 0.99921292f, 0.99907774f, 0.99893188f,
+0.99877530f, 0.99860805f, 0.99843007f, 0.99824142f, 0.99804211f,
+0.99783206f, 0.99761140f, 0.99737996f, 0.99713790f, 0.99688518f,
+0.99662173f, 0.99634761f, 0.99606287f, 0.99576741f, 0.99546129f,
+0.99514455f, 0.99481714f, 0.99447906f, 0.99413031f, 0.99377096f,
+0.99340093f, 0.99302030f, 0.99262899f, 0.99222708f, 0.99181455f,
+0.99139136f, 0.99095762f, 0.99051321f, 0.99005818f, 0.98959261f,
+0.98911643f, 0.98862964f, 0.98813224f, 0.98762429f, 0.98710573f,
+0.98657662f, 0.98603696f, 0.98548669f, 0.98492593f, 0.98435456f,
+0.98377270f, 0.98318028f, 0.98257732f, 0.98196387f, 0.98133987f,
+0.98070538f, 0.98006040f, 0.97940493f, 0.97873890f, 0.97806245f,
+0.97737551f, 0.97667813f, 0.97597027f, 0.97525197f, 0.97452319f,
+0.97378403f, 0.97303438f, 0.97227436f, 0.97150391f, 0.97072303f,
+0.96993178f, 0.96913016f, 0.96831810f, 0.96749574f, 0.96666300f,
+0.96581990f, 0.96496642f, 0.96410263f, 0.96322852f, 0.96234411f,
+0.96144938f, 0.96054435f, 0.95962906f, 0.95870346f, 0.95776761f,
+0.95682150f, 0.95586514f, 0.95489854f, 0.95392174f, 0.95293468f,
+0.95193744f, 0.95093000f, 0.94991243f, 0.94888461f, 0.94784665f,
+0.94679856f, 0.94574034f, 0.94467193f, 0.94359344f, 0.94250488f,
+0.94140619f, 0.94029742f, 0.93917859f, 0.93804967f, 0.93691075f,
+0.93576175f, 0.93460274f, 0.93343377f, 0.93225473f, 0.93106574f,
+0.92986679f, 0.92865789f, 0.92743903f, 0.92621022f, 0.92497152f,
+0.92372292f, 0.92246443f, 0.92119598f, 0.91991776f, 0.91862965f,
+0.91733170f, 0.91602397f, 0.91470635f, 0.91337901f, 0.91204184f,
+0.91069490f, 0.90933824f, 0.90797186f, 0.90659571f, 0.90520984f,
+0.90381432f, 0.90240908f, 0.90099424f, 0.89956969f, 0.89813554f,
+0.89669174f, 0.89523834f, 0.89377540f, 0.89230281f, 0.89082074f,
+0.88932908f, 0.88782793f, 0.88631725f, 0.88479710f, 0.88326746f,
+0.88172835f, 0.88017982f, 0.87862182f, 0.87705445f, 0.87547767f,
+0.87389153f, 0.87229604f, 0.87069118f, 0.86907703f, 0.86745358f,
+0.86582077f, 0.86417878f, 0.86252749f, 0.86086690f, 0.85919720f,
+0.85751826f, 0.85583007f, 0.85413277f, 0.85242635f, 0.85071075f,
+0.84898609f, 0.84725231f, 0.84550947f, 0.84375757f, 0.84199661f,
+0.84022665f, 0.83844769f, 0.83665979f, 0.83486289f, 0.83305705f,
+0.83124226f, 0.82941860f, 0.82758605f, 0.82574469f, 0.82389444f,
+0.82203537f, 0.82016748f, 0.81829083f, 0.81640542f, 0.81451124f,
+0.81260836f, 0.81069672f, 0.80877650f, 0.80684757f, 0.80490994f,
+0.80296379f, 0.80100900f, 0.79904562f, 0.79707366f, 0.79509324f,
+0.79310423f, 0.79110676f, 0.78910083f, 0.78708643f, 0.78506362f,
+0.78303236f, 0.78099275f, 0.77894479f, 0.77688843f, 0.77482378f,
+0.77275085f, 0.77066964f, 0.76858020f, 0.76648247f, 0.76437658f,
+0.76226246f, 0.76014024f, 0.75800985f, 0.75587130f, 0.75372469f,
+0.75157005f, 0.74940729f, 0.74723655f, 0.74505776f, 0.74287105f,
+0.74067634f, 0.73847371f, 0.73626316f, 0.73404479f, 0.73181850f,
+0.72958434f, 0.72734243f, 0.72509271f, 0.72283524f, 0.72057003f,
+0.71829706f, 0.71601641f, 0.71372813f, 0.71143216f, 0.70912862f,
+0.70681745f, 0.70449871f, 0.70217246f, 0.69983864f, 0.69749737f,
+0.69514859f, 0.69279242f, 0.69042879f, 0.68805778f, 0.68567938f,
+0.68329364f, 0.68090063f, 0.67850029f, 0.67609268f, 0.67367786f,
+0.67125577f, 0.66882652f, 0.66639012f, 0.66394657f, 0.66149592f,
+0.65903819f, 0.65657341f, 0.65410155f, 0.65162271f, 0.64913690f,
+0.64664418f, 0.64414448f, 0.64163786f, 0.63912445f, 0.63660413f,
+0.63407701f, 0.63154310f, 0.62900239f, 0.62645501f, 0.62390089f,
+0.62134010f, 0.61877263f, 0.61619854f, 0.61361790f, 0.61103064f,
+0.60843682f, 0.60583651f, 0.60322970f, 0.60061646f, 0.59799677f,
+0.59537065f, 0.59273821f, 0.59009939f, 0.58745426f, 0.58480281f,
+0.58214509f, 0.57948118f, 0.57681108f, 0.57413477f, 0.57145232f,
+0.56876373f, 0.56606907f, 0.56336832f, 0.56066155f, 0.55794877f,
+0.55523002f, 0.55250537f, 0.54977477f, 0.54703826f, 0.54429591f,
+0.54154772f, 0.53879374f, 0.53603399f, 0.53326851f, 0.53049731f,
+0.52772039f, 0.52493787f, 0.52214974f, 0.51935595f, 0.51655668f,
+0.51375180f, 0.51094145f, 0.50812566f, 0.50530440f, 0.50247771f,
+0.49964568f, 0.49680826f, 0.49396557f, 0.49111754f, 0.48826426f,
+0.48540577f, 0.48254207f, 0.47967321f, 0.47679919f, 0.47392011f,
+0.47103590f, 0.46814668f, 0.46525243f, 0.46235323f, 0.45944905f,
+0.45653993f, 0.45362595f, 0.45070711f, 0.44778344f, 0.44485497f,
+0.44192174f, 0.43898380f, 0.43604112f, 0.43309379f, 0.43014181f,
+0.42718524f, 0.42422408f, 0.42125839f, 0.41828820f, 0.41531351f,
+0.41233435f, 0.40935081f, 0.40636289f, 0.40337059f, 0.40037400f,
+0.39737311f, 0.39436796f, 0.39135858f, 0.38834500f, 0.38532731f,
+0.38230544f, 0.37927949f, 0.37624949f, 0.37321547f, 0.37017745f,
+0.36713544f, 0.36408952f, 0.36103970f, 0.35798600f, 0.35492846f,
+0.35186714f, 0.34880206f, 0.34573323f, 0.34266070f, 0.33958447f,
+0.33650464f, 0.33342120f, 0.33033419f, 0.32724363f, 0.32414958f,
+0.32105204f, 0.31795108f, 0.31484672f, 0.31173897f, 0.30862790f,
+0.30551350f, 0.30239585f, 0.29927495f, 0.29615086f, 0.29302359f,
+0.28989318f, 0.28675964f, 0.28362307f, 0.28048345f, 0.27734083f,
+0.27419522f, 0.27104670f, 0.26789525f, 0.26474094f, 0.26158381f,
+0.25842386f, 0.25526115f, 0.25209570f, 0.24892756f, 0.24575676f,
+0.24258332f, 0.23940729f, 0.23622867f, 0.23304754f, 0.22986393f,
+0.22667783f, 0.22348931f, 0.22029841f, 0.21710514f, 0.21390954f,
+0.21071166f, 0.20751151f, 0.20430915f, 0.20110460f, 0.19789790f,
+0.19468907f, 0.19147816f, 0.18826519f, 0.18505022f, 0.18183327f,
+0.17861435f, 0.17539354f, 0.17217083f, 0.16894630f, 0.16571994f,
+0.16249183f, 0.15926196f, 0.15603039f, 0.15279715f, 0.14956227f,
+0.14632578f, 0.14308774f, 0.13984816f, 0.13660708f, 0.13336454f,
+0.13012058f, 0.12687522f, 0.12362850f, 0.12038045f, 0.11713112f,
+0.11388054f, 0.11062872f, 0.10737573f, 0.10412160f, 0.10086634f,
+0.097609997f, 0.094352618f, 0.091094226f, 0.087834857f, 0.084574550f,
+0.081313334f, 0.078051247f, 0.074788325f, 0.071524605f, 0.068260118f,
+0.064994894f, 0.061728980f, 0.058462404f, 0.055195201f, 0.051927410f,
+0.048659060f, 0.045390189f, 0.042120833f, 0.038851023f, 0.035580799f,
+0.032310195f, 0.029039243f, 0.025767982f, 0.022496443f, 0.019224664f,
+0.015952680f, 0.012680525f, 0.0094082337f, 0.0061358409f, 0.0028633832f,
+-0.00040910527f, -0.0036815894f, -0.0069540343f, -0.010226404f, -0.013498665f,
+-0.016770782f, -0.020042717f, -0.023314439f, -0.026585912f, -0.029857099f,
+-0.033127967f, -0.036398482f, -0.039668605f, -0.042938303f, -0.046207540f,
+-0.049476285f, -0.052744497f, -0.056012146f, -0.059279196f, -0.062545612f,
+-0.065811358f, -0.069076397f, -0.072340697f, -0.075604223f, -0.078866936f,
+-0.082128808f, -0.085389800f, -0.088649876f, -0.091909006f, -0.095167145f,
+-0.098424271f, -0.10168034f, -0.10493532f, -0.10818918f, -0.11144188f,
+-0.11469338f, -0.11794366f, -0.12119267f, -0.12444039f, -0.12768677f,
+-0.13093179f, -0.13417540f, -0.13741758f, -0.14065829f, -0.14389749f,
+-0.14713514f, -0.15037122f, -0.15360570f, -0.15683852f, -0.16006967f,
+-0.16329910f, -0.16652679f, -0.16975269f, -0.17297678f, -0.17619900f,
+-0.17941935f, -0.18263777f, -0.18585424f, -0.18906870f, -0.19228116f,
+-0.19549155f, -0.19869985f, -0.20190603f, -0.20511003f, -0.20831184f,
+-0.21151142f, -0.21470875f, -0.21790376f, -0.22109644f, -0.22428675f,
+-0.22747467f, -0.23066014f, -0.23384315f, -0.23702365f, -0.24020162f,
+-0.24337701f, -0.24654980f, -0.24971995f, -0.25288740f, -0.25605217f,
+-0.25921419f, -0.26237345f, -0.26552987f, -0.26868346f, -0.27183419f,
+-0.27498198f, -0.27812684f, -0.28126872f, -0.28440759f, -0.28754342f,
+-0.29067615f, -0.29380578f, -0.29693225f, -0.30005556f, -0.30317566f,
+-0.30629250f, -0.30940607f, -0.31251630f, -0.31562322f, -0.31872672f,
+-0.32182685f, -0.32492352f, -0.32801670f, -0.33110636f, -0.33419248f,
+-0.33727503f, -0.34035397f, -0.34342924f, -0.34650084f, -0.34956875f,
+-0.35263291f, -0.35569328f, -0.35874987f, -0.36180258f, -0.36485144f,
+-0.36789638f, -0.37093741f, -0.37397444f, -0.37700745f, -0.38003644f,
+-0.38306138f, -0.38608220f, -0.38909888f, -0.39211139f, -0.39511973f,
+-0.39812380f, -0.40112361f, -0.40411916f, -0.40711036f, -0.41009718f,
+-0.41307965f, -0.41605768f, -0.41903123f, -0.42200032f, -0.42496487f,
+-0.42792490f, -0.43088034f, -0.43383113f, -0.43677729f, -0.43971881f,
+-0.44265559f, -0.44558764f, -0.44851488f, -0.45143735f, -0.45435500f,
+-0.45726776f, -0.46017563f, -0.46307856f, -0.46597654f, -0.46886954f,
+-0.47175750f, -0.47464043f, -0.47751826f, -0.48039100f, -0.48325855f,
+-0.48612097f, -0.48897815f, -0.49183011f, -0.49467680f, -0.49751821f,
+-0.50035429f, -0.50318497f, -0.50601029f, -0.50883019f, -0.51164466f,
+-0.51445359f, -0.51725709f, -0.52005500f, -0.52284735f, -0.52563411f,
+-0.52841520f, -0.53119069f, -0.53396046f, -0.53672451f, -0.53948283f,
+-0.54223537f, -0.54498214f, -0.54772300f, -0.55045801f, -0.55318713f,
+-0.55591035f, -0.55862761f, -0.56133890f, -0.56404412f, -0.56674337f,
+-0.56943649f, -0.57212353f, -0.57480448f, -0.57747924f, -0.58014780f,
+-0.58281022f, -0.58546633f, -0.58811617f, -0.59075975f, -0.59339696f,
+-0.59602785f, -0.59865236f, -0.60127044f, -0.60388207f, -0.60648727f,
+-0.60908598f, -0.61167812f, -0.61426371f, -0.61684275f, -0.61941516f,
+-0.62198097f, -0.62454009f, -0.62709254f, -0.62963831f, -0.63217729f,
+-0.63470948f, -0.63723493f, -0.63975352f, -0.64226526f, -0.64477009f,
+-0.64726806f, -0.64975911f, -0.65224314f, -0.65472025f, -0.65719032f,
+-0.65965337f, -0.66210932f, -0.66455823f, -0.66700000f, -0.66943461f,
+-0.67186207f, -0.67428231f, -0.67669535f, -0.67910111f, -0.68149966f,
+-0.68389088f, -0.68627477f, -0.68865126f, -0.69102043f, -0.69338220f,
+-0.69573659f, -0.69808346f, -0.70042288f, -0.70275480f, -0.70507920f,
+-0.70739603f, -0.70970529f, -0.71200693f, -0.71430099f, -0.71658736f,
+-0.71886611f, -0.72113711f, -0.72340041f, -0.72565591f, -0.72790372f,
+-0.73014367f, -0.73237586f, -0.73460019f, -0.73681659f, -0.73902518f,
+-0.74122584f, -0.74341851f, -0.74560326f, -0.74778003f, -0.74994880f,
+-0.75210953f, -0.75426215f, -0.75640678f, -0.75854325f, -0.76067162f,
+-0.76279181f, -0.76490390f, -0.76700771f, -0.76910341f, -0.77119076f,
+-0.77326995f, -0.77534080f, -0.77740335f, -0.77945763f, -0.78150350f,
+-0.78354102f, -0.78557014f, -0.78759086f, -0.78960317f, -0.79160696f,
+-0.79360235f, -0.79558921f, -0.79756755f, -0.79953730f, -0.80149853f,
+-0.80345118f, -0.80539525f, -0.80733067f, -0.80925739f, -0.81117553f,
+-0.81308490f, -0.81498563f, -0.81687760f, -0.81876087f, -0.82063532f,
+-0.82250100f, -0.82435787f, -0.82620591f, -0.82804507f, -0.82987541f,
+-0.83169687f, -0.83350939f, -0.83531296f, -0.83710766f, -0.83889335f,
+-0.84067005f, -0.84243774f, -0.84419644f, -0.84594607f, -0.84768665f,
+-0.84941816f, -0.85114056f, -0.85285389f, -0.85455805f, -0.85625303f,
+-0.85793889f, -0.85961550f, -0.86128294f, -0.86294121f, -0.86459017f,
+-0.86622989f, -0.86786032f, -0.86948150f, -0.87109333f, -0.87269586f,
+-0.87428904f, -0.87587279f, -0.87744725f, -0.87901229f, -0.88056785f,
+-0.88211405f, -0.88365078f, -0.88517809f, -0.88669586f, -0.88820416f,
+-0.88970292f, -0.89119220f, -0.89267188f, -0.89414203f, -0.89560264f,
+-0.89705360f, -0.89849502f, -0.89992678f, -0.90134889f, -0.90276134f,
+-0.90416414f, -0.90555727f, -0.90694070f, -0.90831441f, -0.90967834f,
+-0.91103262f, -0.91237706f, -0.91371179f, -0.91503674f, -0.91635185f,
+-0.91765714f, -0.91895264f, -0.92023826f, -0.92151409f, -0.92277998f,
+-0.92403603f, -0.92528218f, -0.92651838f, -0.92774469f, -0.92896110f,
+-0.93016750f, -0.93136400f, -0.93255049f, -0.93372697f, -0.93489349f,
+-0.93604994f, -0.93719643f, -0.93833286f, -0.93945926f, -0.94057560f,
+-0.94168180f, -0.94277799f, -0.94386405f, -0.94494003f, -0.94600588f,
+-0.94706154f, -0.94810712f, -0.94914252f, -0.95016778f, -0.95118284f,
+-0.95218778f, -0.95318246f, -0.95416695f, -0.95514119f, -0.95610523f,
+-0.95705903f, -0.95800257f, -0.95893586f, -0.95985889f, -0.96077162f,
+-0.96167403f, -0.96256620f, -0.96344805f, -0.96431959f, -0.96518075f,
+-0.96603161f, -0.96687216f, -0.96770233f, -0.96852213f, -0.96933156f,
+-0.97013056f, -0.97091925f, -0.97169751f, -0.97246534f, -0.97322279f,
+-0.97396982f, -0.97470641f, -0.97543252f, -0.97614825f, -0.97685349f,
+-0.97754824f, -0.97823256f, -0.97890645f, -0.97956979f, -0.98022264f,
+-0.98086500f, -0.98149687f, -0.98211825f, -0.98272908f, -0.98332942f,
+-0.98391914f, -0.98449844f, -0.98506713f, -0.98562527f, -0.98617285f,
+-0.98670989f, -0.98723638f, -0.98775226f, -0.98825759f, -0.98875231f,
+-0.98923647f, -0.98971003f, -0.99017298f, -0.99062532f, -0.99106705f,
+-0.99149817f, -0.99191868f, -0.99232858f, -0.99272782f, -0.99311644f,
+-0.99349445f, -0.99386179f, -0.99421853f, -0.99456459f, -0.99489999f,
+-0.99522477f, -0.99553883f, -0.99584228f, -0.99613506f, -0.99641716f,
+-0.99668860f, -0.99694937f, -0.99719942f, -0.99743885f, -0.99766755f,
+-0.99788558f, -0.99809295f, -0.99828959f, -0.99847561f, -0.99865085f,
+-0.99881548f, -0.99896932f, -0.99911255f, -0.99924499f, -0.99936682f,
+-0.99947786f, -0.99957830f, -0.99966794f, -0.99974692f, -0.99981517f,
+-0.99987274f, -0.99991959f, -0.99995571f, -0.99998116f, -0.99999589f,
+0.99999964f, 0.99997288f, 0.99990326f, 0.99979085f, 0.99963558f,
+0.99943751f, 0.99919659f, 0.99891287f, 0.99858636f, 0.99821711f,
+0.99780506f, 0.99735034f, 0.99685282f, 0.99631262f, 0.99572974f,
+0.99510419f, 0.99443603f, 0.99372530f, 0.99297196f, 0.99217612f,
+0.99133772f, 0.99045694f, 0.98953366f, 0.98856801f, 0.98756003f,
+0.98650974f, 0.98541719f, 0.98428243f, 0.98310548f, 0.98188645f,
+0.98062533f, 0.97932225f, 0.97797716f, 0.97659022f, 0.97516143f,
+0.97369087f, 0.97217858f, 0.97062469f, 0.96902919f, 0.96739221f,
+0.96571374f, 0.96399397f, 0.96223283f, 0.96043050f, 0.95858705f,
+0.95670253f, 0.95477700f, 0.95281059f, 0.95080340f, 0.94875544f,
+0.94666684f, 0.94453770f, 0.94236809f, 0.94015813f, 0.93790787f,
+0.93561745f, 0.93328691f, 0.93091643f, 0.92850608f, 0.92605597f,
+0.92356616f, 0.92103678f, 0.91846794f, 0.91585976f, 0.91321236f,
+0.91052586f, 0.90780038f, 0.90503591f, 0.90223277f, 0.89939094f,
+0.89651060f, 0.89359182f, 0.89063478f, 0.88763964f, 0.88460642f,
+0.88153529f, 0.87842643f, 0.87527996f, 0.87209594f, 0.86887461f,
+0.86561602f, 0.86232042f, 0.85898781f, 0.85561842f, 0.85221243f,
+0.84876984f, 0.84529096f, 0.84177583f, 0.83822471f, 0.83463764f,
+0.83101481f, 0.82735640f, 0.82366252f, 0.81993335f, 0.81616908f,
+0.81236988f, 0.80853581f, 0.80466717f, 0.80076402f, 0.79682660f,
+0.79285502f, 0.78884947f, 0.78481019f, 0.78073722f, 0.77663082f,
+0.77249116f, 0.76831841f, 0.76411277f, 0.75987434f, 0.75560343f,
+0.75130010f, 0.74696463f, 0.74259710f, 0.73819780f, 0.73376691f,
+0.72930455f, 0.72481096f, 0.72028631f, 0.71573079f, 0.71114463f,
+0.70652801f, 0.70188117f, 0.69720417f, 0.69249737f, 0.68776089f,
+0.68299496f, 0.67819971f, 0.67337549f, 0.66852236f, 0.66364062f,
+0.65873051f, 0.65379208f, 0.64882571f, 0.64383155f, 0.63880974f,
+0.63376063f, 0.62868434f, 0.62358117f, 0.61845124f, 0.61329484f,
+0.60811216f, 0.60290343f, 0.59766883f, 0.59240872f, 0.58712316f,
+0.58181250f, 0.57647687f, 0.57111657f, 0.56573176f, 0.56032276f,
+0.55488980f, 0.54943299f, 0.54395270f, 0.53844911f, 0.53292239f,
+0.52737290f, 0.52180082f, 0.51620632f, 0.51058978f, 0.50495136f,
+0.49929130f, 0.49360985f, 0.48790723f, 0.48218375f, 0.47643960f,
+0.47067502f, 0.46489030f, 0.45908567f, 0.45326138f, 0.44741765f,
+0.44155475f, 0.43567297f, 0.42977250f, 0.42385364f, 0.41791660f,
+0.41196167f, 0.40598908f, 0.39999911f, 0.39399201f, 0.38796803f,
+0.38192743f, 0.37587047f, 0.36979741f, 0.36370850f, 0.35760403f,
+0.35148421f, 0.34534934f, 0.33919969f, 0.33303553f, 0.32685706f,
+0.32066461f, 0.31445843f, 0.30823877f, 0.30200592f, 0.29576012f,
+0.28950164f, 0.28323078f, 0.27694780f, 0.27065292f, 0.26434645f,
+0.25802869f, 0.25169984f, 0.24536023f, 0.23901010f, 0.23264973f,
+0.22627939f, 0.21989937f, 0.21350993f, 0.20711134f, 0.20070387f,
+0.19428782f, 0.18786344f, 0.18143101f, 0.17499080f, 0.16854310f,
+0.16208819f, 0.15562633f, 0.14915779f, 0.14268288f, 0.13620184f,
+0.12971498f, 0.12322257f, 0.11672486f, 0.11022217f, 0.10371475f,
+0.097202882f, 0.090686858f, 0.084166944f, 0.077643424f, 0.071116582f,
+0.064586692f, 0.058054037f, 0.051518895f, 0.044981543f, 0.038442269f,
+0.031901345f, 0.025359053f, 0.018815678f, 0.012271495f, 0.0057267868f,
+-0.00081816671f, -0.0073630852f, -0.013907688f, -0.020451695f, -0.026994826f,
+-0.033536803f, -0.040077340f, -0.046616159f, -0.053152986f, -0.059687532f,
+-0.066219524f, -0.072748676f, -0.079274714f, -0.085797355f, -0.092316322f,
+-0.098831341f, -0.10534211f, -0.11184838f, -0.11834986f, -0.12484626f,
+-0.13133731f, -0.13782275f, -0.14430228f, -0.15077563f, -0.15724251f,
+-0.16370267f, -0.17015581f, -0.17660165f, -0.18303993f, -0.18947038f,
+-0.19589271f, -0.20230664f, -0.20871192f, -0.21510825f, -0.22149536f,
+-0.22787298f, -0.23424086f, -0.24059868f, -0.24694622f, -0.25328314f,
+-0.25960925f, -0.26592422f, -0.27222782f, -0.27851975f, -0.28479972f,
+-0.29106751f, -0.29732284f, -0.30356544f, -0.30979502f, -0.31601134f,
+-0.32221413f, -0.32840309f, -0.33457801f, -0.34073856f, -0.34688455f,
+-0.35301566f, -0.35913166f, -0.36523229f, -0.37131724f, -0.37738630f,
+-0.38343921f, -0.38947567f, -0.39549544f, -0.40149832f, -0.40748394f,
+-0.41345215f, -0.41940263f, -0.42533514f, -0.43124944f, -0.43714526f,
+-0.44302234f, -0.44888046f, -0.45471936f, -0.46053877f, -0.46633846f,
+-0.47211814f, -0.47787762f, -0.48361665f, -0.48933494f, -0.49503228f,
+-0.50070840f, -0.50636309f, -0.51199609f, -0.51760709f, -0.52319598f,
+-0.52876246f, -0.53430629f, -0.53982723f, -0.54532504f, -0.55079949f,
+-0.55625033f, -0.56167740f, -0.56708032f, -0.57245898f, -0.57781315f,
+-0.58314258f, -0.58844697f, -0.59372622f, -0.59897995f, -0.60420811f,
+-0.60941035f, -0.61458647f, -0.61973625f, -0.62485951f, -0.62995601f,
+-0.63502556f, -0.64006782f, -0.64508271f, -0.65007001f, -0.65502942f,
+-0.65996075f, -0.66486382f, -0.66973841f, -0.67458433f, -0.67940134f,
+-0.68418926f, -0.68894786f, -0.69367695f, -0.69837630f, -0.70304573f,
+-0.70768511f, -0.71229410f, -0.71687263f, -0.72142041f, -0.72593731f,
+-0.73042315f, -0.73487765f, -0.73930067f, -0.74369204f, -0.74805158f,
+-0.75237900f, -0.75667429f, -0.76093709f, -0.76516730f, -0.76936477f,
+-0.77352923f, -0.77766061f, -0.78175867f, -0.78582323f, -0.78985411f,
+-0.79385114f, -0.79781419f, -0.80174309f, -0.80563760f, -0.80949765f,
+-0.81332302f, -0.81711352f, -0.82086903f, -0.82458937f, -0.82827437f,
+-0.83192390f, -0.83553779f, -0.83911592f, -0.84265804f, -0.84616417f,
+-0.84963393f, -0.85306740f, -0.85646427f, -0.85982448f, -0.86314780f,
+-0.86643422f, -0.86968350f, -0.87289548f, -0.87607014f, -0.87920725f,
+-0.88230664f, -0.88536829f, -0.88839203f, -0.89137769f, -0.89432514f,
+-0.89723432f, -0.90010506f, -0.90293723f, -0.90573072f, -0.90848541f,
+-0.91120118f, -0.91387796f, -0.91651553f, -0.91911387f, -0.92167282f,
+-0.92419231f, -0.92667222f, -0.92911243f, -0.93151283f, -0.93387336f,
+-0.93619382f, -0.93847424f, -0.94071442f, -0.94291431f, -0.94507378f,
+-0.94719279f, -0.94927126f, -0.95130903f, -0.95330608f, -0.95526224f,
+-0.95717752f, -0.95905179f, -0.96088499f, -0.96267700f, -0.96442777f,
+-0.96613729f, -0.96780539f, -0.96943200f, -0.97101706f, -0.97256058f,
+-0.97406244f, -0.97552258f, -0.97694093f, -0.97831738f, -0.97965199f,
+-0.98094457f, -0.98219514f, -0.98340368f, -0.98457009f, -0.98569429f,
+-0.98677629f, -0.98781598f, -0.98881340f, -0.98976845f, -0.99068111f,
+-0.99155134f, -0.99237907f, -0.99316430f, -0.99390697f, -0.99460709f,
+-0.99526459f, -0.99587947f, -0.99645168f, -0.99698120f, -0.99746799f,
+-0.99791211f, -0.99831343f, -0.99867201f, -0.99898779f, -0.99926084f,
+-0.99949104f, -0.99967843f, -0.99982297f, -0.99992472f, -0.99998361f,
+0.99999869f, 0.99989158f, 0.99961317f, 0.99916345f, 0.99854255f,
+0.99775058f, 0.99678761f, 0.99565387f, 0.99434954f, 0.99287480f,
+0.99122995f, 0.98941529f, 0.98743105f, 0.98527765f, 0.98295540f,
+0.98046476f, 0.97780609f, 0.97497988f, 0.97198665f, 0.96882683f,
+0.96550101f, 0.96200979f, 0.95835376f, 0.95453346f, 0.95054960f,
+0.94640291f, 0.94209403f, 0.93762374f, 0.93299282f, 0.92820197f,
+0.92325211f, 0.91814411f, 0.91287869f, 0.90745693f, 0.90187967f,
+0.89614785f, 0.89026248f, 0.88422459f, 0.87803519f, 0.87169534f,
+0.86520612f, 0.85856867f, 0.85178405f, 0.84485358f, 0.83777827f,
+0.83055943f, 0.82319832f, 0.81569612f, 0.80805415f, 0.80027372f,
+0.79235619f, 0.78430289f, 0.77611518f, 0.76779449f, 0.75934225f,
+0.75075996f, 0.74204898f, 0.73321080f, 0.72424710f, 0.71515924f,
+0.70594883f, 0.69661748f, 0.68716675f, 0.67759830f, 0.66791373f,
+0.65811473f, 0.64820296f, 0.63818014f, 0.62804794f, 0.61780810f,
+0.60746247f, 0.59701276f, 0.58646071f, 0.57580817f, 0.56505698f,
+0.55420899f, 0.54326600f, 0.53222996f, 0.52110273f, 0.50988621f,
+0.49858227f, 0.48719296f, 0.47572014f, 0.46416581f, 0.45253196f,
+0.44082057f, 0.42903364f, 0.41717321f, 0.40524128f, 0.39323992f,
+0.38117120f, 0.36903715f, 0.35683987f, 0.34458145f, 0.33226398f,
+0.31988961f, 0.30746040f, 0.29497850f, 0.28244606f, 0.26986524f,
+0.25723818f, 0.24456702f, 0.23185398f, 0.21910121f, 0.20631088f,
+0.19348522f, 0.18062639f, 0.16773662f, 0.15481812f, 0.14187308f,
+0.12890373f, 0.11591230f, 0.10290100f, 0.089872077f, 0.076827750f,
+0.063770257f, 0.050701842f, 0.037624735f, 0.024541186f, 0.011453429f,
+-0.0016362892f, -0.014725727f, -0.027812643f, -0.040894791f, -0.053969935f,
+-0.067035832f, -0.080090240f, -0.093130924f, -0.10615565f, -0.11916219f,
+-0.13214831f, -0.14511178f, -0.15805040f, -0.17096193f, -0.18384418f,
+-0.19669491f, -0.20951195f, -0.22229309f, -0.23503613f, -0.24773891f,
+-0.26039925f, -0.27301496f, -0.28558388f, -0.29810387f, -0.31057280f,
+-0.32298848f, -0.33534884f, -0.34765175f, -0.35989508f, -0.37207675f,
+-0.38419467f, -0.39624676f, -0.40823093f, -0.42014518f, -0.43198743f,
+-0.44375566f, -0.45544785f, -0.46706200f, -0.47859612f, -0.49004826f,
+-0.50141639f, -0.51269865f, -0.52389306f, -0.53499764f, -0.54601061f,
+-0.55693001f, -0.56775403f, -0.57848072f, -0.58910829f, -0.59963489f,
+-0.61005878f, -0.62037814f, -0.63059121f, -0.64069623f, -0.65069145f,
+-0.66057515f, -0.67034572f, -0.68000144f, -0.68954057f, -0.69896162f,
+-0.70826286f, -0.71744281f, -0.72649974f, -0.73543227f, -0.74423873f,
+-0.75291771f, -0.76146764f, -0.76988715f, -0.77817470f, -0.78632891f,
+-0.79434842f, -0.80223179f, -0.80997771f, -0.81758487f, -0.82505190f,
+-0.83237761f, -0.83956063f, -0.84659988f, -0.85349399f, -0.86024189f,
+-0.86684239f, -0.87329435f, -0.87959671f, -0.88574833f, -0.89174819f,
+-0.89759529f, -0.90328854f, -0.90882701f, -0.91420978f, -0.91943592f,
+-0.92450452f, -0.92941469f, -0.93416560f, -0.93875647f, -0.94318646f,
+-0.94745487f, -0.95156091f, -0.95550388f, -0.95928317f, -0.96289814f,
+-0.96634805f, -0.96963239f, -0.97275060f, -0.97570217f, -0.97848648f,
+-0.98110318f, -0.98355180f, -0.98583186f, -0.98794299f, -0.98988485f,
+-0.99165714f, -0.99325943f, -0.99469161f, -0.99595332f, -0.99704438f,
+-0.99796462f, -0.99871385f, -0.99929196f, -0.99969882f, -0.99993443f,
+0.99999464f, 0.99956632f, 0.99845290f, 0.99665523f, 0.99417448f,
+0.99101239f, 0.98717111f, 0.98265326f, 0.97746199f, 0.97160077f,
+0.96507365f, 0.95788515f, 0.95004016f, 0.94154406f, 0.93240267f,
+0.92262226f, 0.91220951f, 0.90117162f, 0.88951606f, 0.87725091f,
+0.86438453f, 0.85092574f, 0.83688372f, 0.82226819f, 0.80708915f,
+0.79135692f, 0.77508235f, 0.75827658f, 0.74095112f, 0.72311783f,
+0.70478898f, 0.68597710f, 0.66669506f, 0.64695615f, 0.62677377f,
+0.60616189f, 0.58513457f, 0.56370622f, 0.54189157f, 0.51970547f,
+0.49716324f, 0.47428027f, 0.45107225f, 0.42755505f, 0.40374488f,
+0.37965798f, 0.35531086f, 0.33072025f, 0.30590299f, 0.28087607f,
+0.25565663f, 0.23026201f, 0.20470956f, 0.17901683f, 0.15320139f,
+0.12728097f, 0.10127331f, 0.075196236f, 0.049067631f, 0.022905400f,
+-0.0032725304f, -0.029448219f, -0.055603724f, -0.081721120f, -0.10778251f,
+-0.13377003f, -0.15966587f, -0.18545228f, -0.21111161f, -0.23662624f,
+-0.26197869f, -0.28715160f, -0.31212771f, -0.33688989f, -0.36142120f,
+-0.38570482f, -0.40972409f, -0.43346253f, -0.45690393f, -0.48003218f,
+-0.50283146f, -0.52528608f, -0.54738069f, -0.56910020f, -0.59042966f,
+-0.61135447f, -0.63186026f, -0.65193301f, -0.67155898f, -0.69072473f,
+-0.70941705f, -0.72762316f, -0.74533063f, -0.76252723f, -0.77920127f,
+-0.79534131f, -0.81093621f, -0.82597536f, -0.84044844f, -0.85434550f,
+-0.86765707f, -0.88037395f, -0.89248747f, -0.90398932f, -0.91487163f,
+-0.92512697f, -0.93474823f, -0.94372886f, -0.95206273f, -0.95974404f,
+-0.96676767f, -0.97312868f, -0.97882277f, -0.98384601f, -0.98819500f,
+-0.99186671f, -0.99485862f, -0.99716878f, -0.99879545f, -0.99973762f,
+};
 #endif
 
 static const CELTMode mode48000_960_120 = {
diff --git a/celt/static_modes_float_arm_ne10.h b/celt/static_modes_float_arm_ne10.h
new file mode 100644
index 0000000..934a82a
--- /dev/null
+++ b/celt/static_modes_float_arm_ne10.h
@@ -0,0 +1,404 @@
+/* The contents of this file were automatically generated by
+ * dump_mode_arm_ne10.c with arguments: 48000 960
+ * It contains static definitions for some pre-defined modes. */
+#include <NE10_init.h>
+
+#ifndef NE10_FFT_PARAMS48000_960
+#define NE10_FFT_PARAMS48000_960
+static const ne10_int32_t ne10_factors_480[64] = {
+4, 40, 4, 30, 2, 15, 5, 3, 3, 1, 1, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, };
+static const ne10_int32_t ne10_factors_240[64] = {
+3, 20, 4, 15, 5, 3, 3, 1, 1, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, };
+static const ne10_int32_t ne10_factors_120[64] = {
+3, 10, 2, 15, 5, 3, 3, 1, 1, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, };
+static const ne10_int32_t ne10_factors_60[64] = {
+2, 5, 5, 3, 3, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, };
+static const ne10_fft_cpx_float32_t ne10_twiddles_480[480] = {
+{1.0000000f,0.0000000f}, {1.0000000f,-0.0000000f}, {1.0000000f,-0.0000000f},
+{1.0000000f,-0.0000000f}, {0.91354543f,-0.40673664f}, {0.66913056f,-0.74314487f},
+{1.0000000f,-0.0000000f}, {0.66913056f,-0.74314487f}, {-0.10452851f,-0.99452192f},
+{1.0000000f,-0.0000000f}, {0.30901697f,-0.95105654f}, {-0.80901700f,-0.58778518f},
+{1.0000000f,-0.0000000f}, {-0.10452851f,-0.99452192f}, {-0.97814757f,0.20791179f},
+{1.0000000f,-0.0000000f}, {0.97814763f,-0.20791170f}, {0.91354543f,-0.40673664f},
+{0.80901700f,-0.58778524f}, {0.66913056f,-0.74314487f}, {0.49999997f,-0.86602545f},
+{0.30901697f,-0.95105654f}, {0.10452842f,-0.99452192f}, {-0.10452851f,-0.99452192f},
+{-0.30901703f,-0.95105648f}, {-0.50000006f,-0.86602533f}, {-0.66913068f,-0.74314475f},
+{-0.80901700f,-0.58778518f}, {-0.91354549f,-0.40673658f}, {-0.97814763f,-0.20791161f},
+{1.0000000f,-0.0000000f}, {0.99862951f,-0.052335959f}, {0.99452192f,-0.10452846f},
+{0.98768836f,-0.15643448f}, {0.97814763f,-0.20791170f}, {0.96592581f,-0.25881904f},
+{0.95105648f,-0.30901700f}, {0.93358040f,-0.35836795f}, {0.91354543f,-0.40673664f},
+{0.89100653f,-0.45399052f}, {0.86602545f,-0.50000000f}, {0.83867055f,-0.54463905f},
+{0.80901700f,-0.58778524f}, {0.77714598f,-0.62932038f}, {0.74314475f,-0.66913062f},
+{0.70710677f,-0.70710683f}, {0.66913056f,-0.74314487f}, {0.62932038f,-0.77714598f},
+{0.58778524f,-0.80901700f}, {0.54463899f,-0.83867055f}, {0.49999997f,-0.86602545f},
+{0.45399052f,-0.89100653f}, {0.40673661f,-0.91354549f}, {0.35836786f,-0.93358046f},
+{0.30901697f,-0.95105654f}, {0.25881907f,-0.96592581f}, {0.20791166f,-0.97814763f},
+{0.15643437f,-0.98768836f}, {0.10452842f,-0.99452192f}, {0.052335974f,-0.99862951f},
+{1.0000000f,-0.0000000f}, {0.99452192f,-0.10452846f}, {0.97814763f,-0.20791170f},
+{0.95105648f,-0.30901700f}, {0.91354543f,-0.40673664f}, {0.86602545f,-0.50000000f},
+{0.80901700f,-0.58778524f}, {0.74314475f,-0.66913062f}, {0.66913056f,-0.74314487f},
+{0.58778524f,-0.80901700f}, {0.49999997f,-0.86602545f}, {0.40673661f,-0.91354549f},
+{0.30901697f,-0.95105654f}, {0.20791166f,-0.97814763f}, {0.10452842f,-0.99452192f},
+{-4.3711388e-08f,-1.0000000f}, {-0.10452851f,-0.99452192f}, {-0.20791174f,-0.97814757f},
+{-0.30901703f,-0.95105648f}, {-0.40673670f,-0.91354543f}, {-0.50000006f,-0.86602533f},
+{-0.58778518f,-0.80901700f}, {-0.66913068f,-0.74314475f}, {-0.74314493f,-0.66913044f},
+{-0.80901700f,-0.58778518f}, {-0.86602539f,-0.50000006f}, {-0.91354549f,-0.40673658f},
+{-0.95105654f,-0.30901679f}, {-0.97814763f,-0.20791161f}, {-0.99452192f,-0.10452849f},
+{1.0000000f,-0.0000000f}, {0.98768836f,-0.15643448f}, {0.95105648f,-0.30901700f},
+{0.89100653f,-0.45399052f}, {0.80901700f,-0.58778524f}, {0.70710677f,-0.70710683f},
+{0.58778524f,-0.80901700f}, {0.45399052f,-0.89100653f}, {0.30901697f,-0.95105654f},
+{0.15643437f,-0.98768836f}, {-4.3711388e-08f,-1.0000000f}, {-0.15643445f,-0.98768836f},
+{-0.30901703f,-0.95105648f}, {-0.45399061f,-0.89100647f}, {-0.58778518f,-0.80901700f},
+{-0.70710677f,-0.70710677f}, {-0.80901700f,-0.58778518f}, {-0.89100659f,-0.45399037f},
+{-0.95105654f,-0.30901679f}, {-0.98768836f,-0.15643445f}, {-1.0000000f,8.7422777e-08f},
+{-0.98768830f,0.15643461f}, {-0.95105654f,0.30901697f}, {-0.89100653f,0.45399055f},
+{-0.80901694f,0.58778536f}, {-0.70710665f,0.70710689f}, {-0.58778507f,0.80901712f},
+{-0.45399022f,0.89100665f}, {-0.30901709f,0.95105648f}, {-0.15643452f,0.98768830f},
+{1.0000000f,-0.0000000f}, {0.99991435f,-0.013089596f}, {0.99965733f,-0.026176950f},
+{0.99922901f,-0.039259817f}, {0.99862951f,-0.052335959f}, {0.99785894f,-0.065403134f},
+{0.99691731f,-0.078459099f}, {0.99580491f,-0.091501623f}, {0.99452192f,-0.10452846f},
+{0.99306846f,-0.11753740f}, {0.99144489f,-0.13052620f}, {0.98965138f,-0.14349262f},
+{0.98768836f,-0.15643448f}, {0.98555607f,-0.16934951f}, {0.98325491f,-0.18223552f},
+{0.98078525f,-0.19509032f}, {0.97814763f,-0.20791170f}, {0.97534233f,-0.22069745f},
+{0.97236991f,-0.23344538f}, {0.96923089f,-0.24615330f}, {0.96592581f,-0.25881904f},
+{0.96245521f,-0.27144045f}, {0.95881975f,-0.28401536f}, {0.95501995f,-0.29654160f},
+{0.95105648f,-0.30901700f}, {0.94693011f,-0.32143945f}, {0.94264150f,-0.33380687f},
+{0.93819129f,-0.34611708f}, {0.93358040f,-0.35836795f}, {0.92880952f,-0.37055743f},
+{0.92387956f,-0.38268346f}, {0.91879117f,-0.39474389f}, {0.91354543f,-0.40673664f},
+{0.90814316f,-0.41865975f}, {0.90258527f,-0.43051112f}, {0.89687270f,-0.44228873f},
+{0.89100653f,-0.45399052f}, {0.88498765f,-0.46561453f}, {0.87881708f,-0.47715878f},
+{0.87249601f,-0.48862126f}, {0.86602545f,-0.50000000f}, {0.85940641f,-0.51129311f},
+{0.85264015f,-0.52249855f}, {0.84572786f,-0.53361452f}, {0.83867055f,-0.54463905f},
+{0.83146960f,-0.55557024f}, {0.82412618f,-0.56640625f}, {0.81664151f,-0.57714522f},
+{0.80901700f,-0.58778524f}, {0.80125380f,-0.59832460f}, {0.79335332f,-0.60876143f},
+{0.78531694f,-0.61909395f}, {0.77714598f,-0.62932038f}, {0.76884180f,-0.63943899f},
+{0.76040596f,-0.64944810f}, {0.75183982f,-0.65934587f}, {0.74314475f,-0.66913062f},
+{0.73432249f,-0.67880076f}, {0.72537434f,-0.68835455f}, {0.71630192f,-0.69779050f},
+{0.70710677f,-0.70710683f}, {0.69779044f,-0.71630198f}, {0.68835455f,-0.72537440f},
+{0.67880070f,-0.73432255f}, {0.66913056f,-0.74314487f}, {0.65934581f,-0.75183982f},
+{0.64944804f,-0.76040596f}, {0.63943899f,-0.76884186f}, {0.62932038f,-0.77714598f},
+{0.61909395f,-0.78531694f}, {0.60876137f,-0.79335338f}, {0.59832460f,-0.80125386f},
+{0.58778524f,-0.80901700f}, {0.57714516f,-0.81664151f}, {0.56640625f,-0.82412618f},
+{0.55557019f,-0.83146960f}, {0.54463899f,-0.83867055f}, {0.53361452f,-0.84572786f},
+{0.52249849f,-0.85264015f}, {0.51129311f,-0.85940641f}, {0.49999997f,-0.86602545f},
+{0.48862118f,-0.87249601f}, {0.47715876f,-0.87881708f}, {0.46561447f,-0.88498765f},
+{0.45399052f,-0.89100653f}, {0.44228867f,-0.89687276f}, {0.43051103f,-0.90258533f},
+{0.41865975f,-0.90814316f}, {0.40673661f,-0.91354549f}, {0.39474380f,-0.91879129f},
+{0.38268343f,-0.92387956f}, {0.37055740f,-0.92880958f}, {0.35836786f,-0.93358046f},
+{0.34611705f,-0.93819135f}, {0.33380681f,-0.94264150f}, {0.32143947f,-0.94693011f},
+{0.30901697f,-0.95105654f}, {0.29654151f,-0.95501995f}, {0.28401533f,-0.95881975f},
+{0.27144039f,-0.96245527f}, {0.25881907f,-0.96592581f}, {0.24615327f,-0.96923089f},
+{0.23344530f,-0.97236991f}, {0.22069745f,-0.97534233f}, {0.20791166f,-0.97814763f},
+{0.19509023f,-0.98078531f}, {0.18223552f,-0.98325491f}, {0.16934945f,-0.98555607f},
+{0.15643437f,-0.98768836f}, {0.14349259f,-0.98965138f}, {0.13052613f,-0.99144489f},
+{0.11753740f,-0.99306846f}, {0.10452842f,-0.99452192f}, {0.091501534f,-0.99580491f},
+{0.078459084f,-0.99691731f}, {0.065403074f,-0.99785894f}, {0.052335974f,-0.99862951f},
+{0.039259788f,-0.99922901f}, {0.026176875f,-0.99965733f}, {0.013089597f,-0.99991435f},
+{1.0000000f,-0.0000000f}, {0.99965733f,-0.026176950f}, {0.99862951f,-0.052335959f},
+{0.99691731f,-0.078459099f}, {0.99452192f,-0.10452846f}, {0.99144489f,-0.13052620f},
+{0.98768836f,-0.15643448f}, {0.98325491f,-0.18223552f}, {0.97814763f,-0.20791170f},
+{0.97236991f,-0.23344538f}, {0.96592581f,-0.25881904f}, {0.95881975f,-0.28401536f},
+{0.95105648f,-0.30901700f}, {0.94264150f,-0.33380687f}, {0.93358040f,-0.35836795f},
+{0.92387956f,-0.38268346f}, {0.91354543f,-0.40673664f}, {0.90258527f,-0.43051112f},
+{0.89100653f,-0.45399052f}, {0.87881708f,-0.47715878f}, {0.86602545f,-0.50000000f},
+{0.85264015f,-0.52249855f}, {0.83867055f,-0.54463905f}, {0.82412618f,-0.56640625f},
+{0.80901700f,-0.58778524f}, {0.79335332f,-0.60876143f}, {0.77714598f,-0.62932038f},
+{0.76040596f,-0.64944810f}, {0.74314475f,-0.66913062f}, {0.72537434f,-0.68835455f},
+{0.70710677f,-0.70710683f}, {0.68835455f,-0.72537440f}, {0.66913056f,-0.74314487f},
+{0.64944804f,-0.76040596f}, {0.62932038f,-0.77714598f}, {0.60876137f,-0.79335338f},
+{0.58778524f,-0.80901700f}, {0.56640625f,-0.82412618f}, {0.54463899f,-0.83867055f},
+{0.52249849f,-0.85264015f}, {0.49999997f,-0.86602545f}, {0.47715876f,-0.87881708f},
+{0.45399052f,-0.89100653f}, {0.43051103f,-0.90258533f}, {0.40673661f,-0.91354549f},
+{0.38268343f,-0.92387956f}, {0.35836786f,-0.93358046f}, {0.33380681f,-0.94264150f},
+{0.30901697f,-0.95105654f}, {0.28401533f,-0.95881975f}, {0.25881907f,-0.96592581f},
+{0.23344530f,-0.97236991f}, {0.20791166f,-0.97814763f}, {0.18223552f,-0.98325491f},
+{0.15643437f,-0.98768836f}, {0.13052613f,-0.99144489f}, {0.10452842f,-0.99452192f},
+{0.078459084f,-0.99691731f}, {0.052335974f,-0.99862951f}, {0.026176875f,-0.99965733f},
+{-4.3711388e-08f,-1.0000000f}, {-0.026176963f,-0.99965733f}, {-0.052336060f,-0.99862951f},
+{-0.078459173f,-0.99691731f}, {-0.10452851f,-0.99452192f}, {-0.13052621f,-0.99144489f},
+{-0.15643445f,-0.98768836f}, {-0.18223560f,-0.98325491f}, {-0.20791174f,-0.97814757f},
+{-0.23344538f,-0.97236991f}, {-0.25881916f,-0.96592581f}, {-0.28401542f,-0.95881969f},
+{-0.30901703f,-0.95105648f}, {-0.33380687f,-0.94264150f}, {-0.35836795f,-0.93358040f},
+{-0.38268352f,-0.92387950f}, {-0.40673670f,-0.91354543f}, {-0.43051112f,-0.90258527f},
+{-0.45399061f,-0.89100647f}, {-0.47715873f,-0.87881708f}, {-0.50000006f,-0.86602533f},
+{-0.52249867f,-0.85264009f}, {-0.54463905f,-0.83867055f}, {-0.56640631f,-0.82412612f},
+{-0.58778518f,-0.80901700f}, {-0.60876143f,-0.79335332f}, {-0.62932050f,-0.77714586f},
+{-0.64944804f,-0.76040596f}, {-0.66913068f,-0.74314475f}, {-0.68835467f,-0.72537428f},
+{-0.70710677f,-0.70710677f}, {-0.72537446f,-0.68835449f}, {-0.74314493f,-0.66913044f},
+{-0.76040596f,-0.64944804f}, {-0.77714604f,-0.62932026f}, {-0.79335332f,-0.60876143f},
+{-0.80901700f,-0.58778518f}, {-0.82412624f,-0.56640613f}, {-0.83867055f,-0.54463899f},
+{-0.85264021f,-0.52249849f}, {-0.86602539f,-0.50000006f}, {-0.87881714f,-0.47715873f},
+{-0.89100659f,-0.45399037f}, {-0.90258527f,-0.43051112f}, {-0.91354549f,-0.40673658f},
+{-0.92387956f,-0.38268328f}, {-0.93358040f,-0.35836792f}, {-0.94264150f,-0.33380675f},
+{-0.95105654f,-0.30901679f}, {-0.95881975f,-0.28401530f}, {-0.96592587f,-0.25881892f},
+{-0.97236991f,-0.23344538f}, {-0.97814763f,-0.20791161f}, {-0.98325491f,-0.18223536f},
+{-0.98768836f,-0.15643445f}, {-0.99144489f,-0.13052608f}, {-0.99452192f,-0.10452849f},
+{-0.99691737f,-0.078459039f}, {-0.99862957f,-0.052335810f}, {-0.99965733f,-0.026176952f},
+{1.0000000f,-0.0000000f}, {0.99922901f,-0.039259817f}, {0.99691731f,-0.078459099f},
+{0.99306846f,-0.11753740f}, {0.98768836f,-0.15643448f}, {0.98078525f,-0.19509032f},
+{0.97236991f,-0.23344538f}, {0.96245521f,-0.27144045f}, {0.95105648f,-0.30901700f},
+{0.93819129f,-0.34611708f}, {0.92387956f,-0.38268346f}, {0.90814316f,-0.41865975f},
+{0.89100653f,-0.45399052f}, {0.87249601f,-0.48862126f}, {0.85264015f,-0.52249855f},
+{0.83146960f,-0.55557024f}, {0.80901700f,-0.58778524f}, {0.78531694f,-0.61909395f},
+{0.76040596f,-0.64944810f}, {0.73432249f,-0.67880076f}, {0.70710677f,-0.70710683f},
+{0.67880070f,-0.73432255f}, {0.64944804f,-0.76040596f}, {0.61909395f,-0.78531694f},
+{0.58778524f,-0.80901700f}, {0.55557019f,-0.83146960f}, {0.52249849f,-0.85264015f},
+{0.48862118f,-0.87249601f}, {0.45399052f,-0.89100653f}, {0.41865975f,-0.90814316f},
+{0.38268343f,-0.92387956f}, {0.34611705f,-0.93819135f}, {0.30901697f,-0.95105654f},
+{0.27144039f,-0.96245527f}, {0.23344530f,-0.97236991f}, {0.19509023f,-0.98078531f},
+{0.15643437f,-0.98768836f}, {0.11753740f,-0.99306846f}, {0.078459084f,-0.99691731f},
+{0.039259788f,-0.99922901f}, {-4.3711388e-08f,-1.0000000f}, {-0.039259877f,-0.99922901f},
+{-0.078459173f,-0.99691731f}, {-0.11753749f,-0.99306846f}, {-0.15643445f,-0.98768836f},
+{-0.19509032f,-0.98078525f}, {-0.23344538f,-0.97236991f}, {-0.27144048f,-0.96245521f},
+{-0.30901703f,-0.95105648f}, {-0.34611711f,-0.93819129f}, {-0.38268352f,-0.92387950f},
+{-0.41865984f,-0.90814310f}, {-0.45399061f,-0.89100647f}, {-0.48862135f,-0.87249595f},
+{-0.52249867f,-0.85264009f}, {-0.55557036f,-0.83146954f}, {-0.58778518f,-0.80901700f},
+{-0.61909389f,-0.78531694f}, {-0.64944804f,-0.76040596f}, {-0.67880076f,-0.73432249f},
+{-0.70710677f,-0.70710677f}, {-0.73432249f,-0.67880070f}, {-0.76040596f,-0.64944804f},
+{-0.78531694f,-0.61909389f}, {-0.80901700f,-0.58778518f}, {-0.83146966f,-0.55557019f},
+{-0.85264021f,-0.52249849f}, {-0.87249607f,-0.48862115f}, {-0.89100659f,-0.45399037f},
+{-0.90814322f,-0.41865960f}, {-0.92387956f,-0.38268328f}, {-0.93819135f,-0.34611690f},
+{-0.95105654f,-0.30901679f}, {-0.96245521f,-0.27144048f}, {-0.97236991f,-0.23344538f},
+{-0.98078531f,-0.19509031f}, {-0.98768836f,-0.15643445f}, {-0.99306846f,-0.11753736f},
+{-0.99691737f,-0.078459039f}, {-0.99922901f,-0.039259743f}, {-1.0000000f,8.7422777e-08f},
+{-0.99922901f,0.039259918f}, {-0.99691731f,0.078459218f}, {-0.99306846f,0.11753753f},
+{-0.98768830f,0.15643461f}, {-0.98078525f,0.19509049f}, {-0.97236985f,0.23344554f},
+{-0.96245515f,0.27144065f}, {-0.95105654f,0.30901697f}, {-0.93819135f,0.34611705f},
+{-0.92387956f,0.38268346f}, {-0.90814316f,0.41865975f}, {-0.89100653f,0.45399055f},
+{-0.87249601f,0.48862129f}, {-0.85264015f,0.52249861f}, {-0.83146960f,0.55557030f},
+{-0.80901694f,0.58778536f}, {-0.78531688f,0.61909401f}, {-0.76040590f,0.64944816f},
+{-0.73432243f,0.67880082f}, {-0.70710665f,0.70710689f}, {-0.67880058f,0.73432261f},
+{-0.64944792f,0.76040608f}, {-0.61909378f,0.78531706f}, {-0.58778507f,0.80901712f},
+{-0.55557001f,0.83146977f}, {-0.52249837f,0.85264033f}, {-0.48862100f,0.87249613f},
+{-0.45399022f,0.89100665f}, {-0.41865945f,0.90814328f}, {-0.38268313f,0.92387968f},
+{-0.34611672f,0.93819147f}, {-0.30901709f,0.95105648f}, {-0.27144054f,0.96245521f},
+{-0.23344545f,0.97236991f}, {-0.19509038f,0.98078525f}, {-0.15643452f,0.98768830f},
+{-0.11753743f,0.99306846f}, {-0.078459114f,0.99691731f}, {-0.039259821f,0.99922901f},
+};
+static const ne10_fft_cpx_float32_t ne10_twiddles_240[240] = {
+{1.0000000f,0.0000000f}, {1.0000000f,-0.0000000f}, {1.0000000f,-0.0000000f},
+{1.0000000f,-0.0000000f}, {0.91354543f,-0.40673664f}, {0.66913056f,-0.74314487f},
+{1.0000000f,-0.0000000f}, {0.66913056f,-0.74314487f}, {-0.10452851f,-0.99452192f},
+{1.0000000f,-0.0000000f}, {0.30901697f,-0.95105654f}, {-0.80901700f,-0.58778518f},
+{1.0000000f,-0.0000000f}, {-0.10452851f,-0.99452192f}, {-0.97814757f,0.20791179f},
+{1.0000000f,-0.0000000f}, {0.99452192f,-0.10452846f}, {0.97814763f,-0.20791170f},
+{0.95105648f,-0.30901700f}, {0.91354543f,-0.40673664f}, {0.86602545f,-0.50000000f},
+{0.80901700f,-0.58778524f}, {0.74314475f,-0.66913062f}, {0.66913056f,-0.74314487f},
+{0.58778524f,-0.80901700f}, {0.49999997f,-0.86602545f}, {0.40673661f,-0.91354549f},
+{0.30901697f,-0.95105654f}, {0.20791166f,-0.97814763f}, {0.10452842f,-0.99452192f},
+{1.0000000f,-0.0000000f}, {0.97814763f,-0.20791170f}, {0.91354543f,-0.40673664f},
+{0.80901700f,-0.58778524f}, {0.66913056f,-0.74314487f}, {0.49999997f,-0.86602545f},
+{0.30901697f,-0.95105654f}, {0.10452842f,-0.99452192f}, {-0.10452851f,-0.99452192f},
+{-0.30901703f,-0.95105648f}, {-0.50000006f,-0.86602533f}, {-0.66913068f,-0.74314475f},
+{-0.80901700f,-0.58778518f}, {-0.91354549f,-0.40673658f}, {-0.97814763f,-0.20791161f},
+{1.0000000f,-0.0000000f}, {0.95105648f,-0.30901700f}, {0.80901700f,-0.58778524f},
+{0.58778524f,-0.80901700f}, {0.30901697f,-0.95105654f}, {-4.3711388e-08f,-1.0000000f},
+{-0.30901703f,-0.95105648f}, {-0.58778518f,-0.80901700f}, {-0.80901700f,-0.58778518f},
+{-0.95105654f,-0.30901679f}, {-1.0000000f,8.7422777e-08f}, {-0.95105654f,0.30901697f},
+{-0.80901694f,0.58778536f}, {-0.58778507f,0.80901712f}, {-0.30901709f,0.95105648f},
+{1.0000000f,-0.0000000f}, {0.99965733f,-0.026176950f}, {0.99862951f,-0.052335959f},
+{0.99691731f,-0.078459099f}, {0.99452192f,-0.10452846f}, {0.99144489f,-0.13052620f},
+{0.98768836f,-0.15643448f}, {0.98325491f,-0.18223552f}, {0.97814763f,-0.20791170f},
+{0.97236991f,-0.23344538f}, {0.96592581f,-0.25881904f}, {0.95881975f,-0.28401536f},
+{0.95105648f,-0.30901700f}, {0.94264150f,-0.33380687f}, {0.93358040f,-0.35836795f},
+{0.92387956f,-0.38268346f}, {0.91354543f,-0.40673664f}, {0.90258527f,-0.43051112f},
+{0.89100653f,-0.45399052f}, {0.87881708f,-0.47715878f}, {0.86602545f,-0.50000000f},
+{0.85264015f,-0.52249855f}, {0.83867055f,-0.54463905f}, {0.82412618f,-0.56640625f},
+{0.80901700f,-0.58778524f}, {0.79335332f,-0.60876143f}, {0.77714598f,-0.62932038f},
+{0.76040596f,-0.64944810f}, {0.74314475f,-0.66913062f}, {0.72537434f,-0.68835455f},
+{0.70710677f,-0.70710683f}, {0.68835455f,-0.72537440f}, {0.66913056f,-0.74314487f},
+{0.64944804f,-0.76040596f}, {0.62932038f,-0.77714598f}, {0.60876137f,-0.79335338f},
+{0.58778524f,-0.80901700f}, {0.56640625f,-0.82412618f}, {0.54463899f,-0.83867055f},
+{0.52249849f,-0.85264015f}, {0.49999997f,-0.86602545f}, {0.47715876f,-0.87881708f},
+{0.45399052f,-0.89100653f}, {0.43051103f,-0.90258533f}, {0.40673661f,-0.91354549f},
+{0.38268343f,-0.92387956f}, {0.35836786f,-0.93358046f}, {0.33380681f,-0.94264150f},
+{0.30901697f,-0.95105654f}, {0.28401533f,-0.95881975f}, {0.25881907f,-0.96592581f},
+{0.23344530f,-0.97236991f}, {0.20791166f,-0.97814763f}, {0.18223552f,-0.98325491f},
+{0.15643437f,-0.98768836f}, {0.13052613f,-0.99144489f}, {0.10452842f,-0.99452192f},
+{0.078459084f,-0.99691731f}, {0.052335974f,-0.99862951f}, {0.026176875f,-0.99965733f},
+{1.0000000f,-0.0000000f}, {0.99862951f,-0.052335959f}, {0.99452192f,-0.10452846f},
+{0.98768836f,-0.15643448f}, {0.97814763f,-0.20791170f}, {0.96592581f,-0.25881904f},
+{0.95105648f,-0.30901700f}, {0.93358040f,-0.35836795f}, {0.91354543f,-0.40673664f},
+{0.89100653f,-0.45399052f}, {0.86602545f,-0.50000000f}, {0.83867055f,-0.54463905f},
+{0.80901700f,-0.58778524f}, {0.77714598f,-0.62932038f}, {0.74314475f,-0.66913062f},
+{0.70710677f,-0.70710683f}, {0.66913056f,-0.74314487f}, {0.62932038f,-0.77714598f},
+{0.58778524f,-0.80901700f}, {0.54463899f,-0.83867055f}, {0.49999997f,-0.86602545f},
+{0.45399052f,-0.89100653f}, {0.40673661f,-0.91354549f}, {0.35836786f,-0.93358046f},
+{0.30901697f,-0.95105654f}, {0.25881907f,-0.96592581f}, {0.20791166f,-0.97814763f},
+{0.15643437f,-0.98768836f}, {0.10452842f,-0.99452192f}, {0.052335974f,-0.99862951f},
+{-4.3711388e-08f,-1.0000000f}, {-0.052336060f,-0.99862951f}, {-0.10452851f,-0.99452192f},
+{-0.15643445f,-0.98768836f}, {-0.20791174f,-0.97814757f}, {-0.25881916f,-0.96592581f},
+{-0.30901703f,-0.95105648f}, {-0.35836795f,-0.93358040f}, {-0.40673670f,-0.91354543f},
+{-0.45399061f,-0.89100647f}, {-0.50000006f,-0.86602533f}, {-0.54463905f,-0.83867055f},
+{-0.58778518f,-0.80901700f}, {-0.62932050f,-0.77714586f}, {-0.66913068f,-0.74314475f},
+{-0.70710677f,-0.70710677f}, {-0.74314493f,-0.66913044f}, {-0.77714604f,-0.62932026f},
+{-0.80901700f,-0.58778518f}, {-0.83867055f,-0.54463899f}, {-0.86602539f,-0.50000006f},
+{-0.89100659f,-0.45399037f}, {-0.91354549f,-0.40673658f}, {-0.93358040f,-0.35836792f},
+{-0.95105654f,-0.30901679f}, {-0.96592587f,-0.25881892f}, {-0.97814763f,-0.20791161f},
+{-0.98768836f,-0.15643445f}, {-0.99452192f,-0.10452849f}, {-0.99862957f,-0.052335810f},
+{1.0000000f,-0.0000000f}, {0.99691731f,-0.078459099f}, {0.98768836f,-0.15643448f},
+{0.97236991f,-0.23344538f}, {0.95105648f,-0.30901700f}, {0.92387956f,-0.38268346f},
+{0.89100653f,-0.45399052f}, {0.85264015f,-0.52249855f}, {0.80901700f,-0.58778524f},
+{0.76040596f,-0.64944810f}, {0.70710677f,-0.70710683f}, {0.64944804f,-0.76040596f},
+{0.58778524f,-0.80901700f}, {0.52249849f,-0.85264015f}, {0.45399052f,-0.89100653f},
+{0.38268343f,-0.92387956f}, {0.30901697f,-0.95105654f}, {0.23344530f,-0.97236991f},
+{0.15643437f,-0.98768836f}, {0.078459084f,-0.99691731f}, {-4.3711388e-08f,-1.0000000f},
+{-0.078459173f,-0.99691731f}, {-0.15643445f,-0.98768836f}, {-0.23344538f,-0.97236991f},
+{-0.30901703f,-0.95105648f}, {-0.38268352f,-0.92387950f}, {-0.45399061f,-0.89100647f},
+{-0.52249867f,-0.85264009f}, {-0.58778518f,-0.80901700f}, {-0.64944804f,-0.76040596f},
+{-0.70710677f,-0.70710677f}, {-0.76040596f,-0.64944804f}, {-0.80901700f,-0.58778518f},
+{-0.85264021f,-0.52249849f}, {-0.89100659f,-0.45399037f}, {-0.92387956f,-0.38268328f},
+{-0.95105654f,-0.30901679f}, {-0.97236991f,-0.23344538f}, {-0.98768836f,-0.15643445f},
+{-0.99691737f,-0.078459039f}, {-1.0000000f,8.7422777e-08f}, {-0.99691731f,0.078459218f},
+{-0.98768830f,0.15643461f}, {-0.97236985f,0.23344554f}, {-0.95105654f,0.30901697f},
+{-0.92387956f,0.38268346f}, {-0.89100653f,0.45399055f}, {-0.85264015f,0.52249861f},
+{-0.80901694f,0.58778536f}, {-0.76040590f,0.64944816f}, {-0.70710665f,0.70710689f},
+{-0.64944792f,0.76040608f}, {-0.58778507f,0.80901712f}, {-0.52249837f,0.85264033f},
+{-0.45399022f,0.89100665f}, {-0.38268313f,0.92387968f}, {-0.30901709f,0.95105648f},
+{-0.23344545f,0.97236991f}, {-0.15643452f,0.98768830f}, {-0.078459114f,0.99691731f},
+};
+static const ne10_fft_cpx_float32_t ne10_twiddles_120[120] = {
+{1.0000000f,0.0000000f}, {1.0000000f,-0.0000000f}, {1.0000000f,-0.0000000f},
+{1.0000000f,-0.0000000f}, {0.91354543f,-0.40673664f}, {0.66913056f,-0.74314487f},
+{1.0000000f,-0.0000000f}, {0.66913056f,-0.74314487f}, {-0.10452851f,-0.99452192f},
+{1.0000000f,-0.0000000f}, {0.30901697f,-0.95105654f}, {-0.80901700f,-0.58778518f},
+{1.0000000f,-0.0000000f}, {-0.10452851f,-0.99452192f}, {-0.97814757f,0.20791179f},
+{1.0000000f,-0.0000000f}, {0.97814763f,-0.20791170f}, {0.91354543f,-0.40673664f},
+{0.80901700f,-0.58778524f}, {0.66913056f,-0.74314487f}, {0.49999997f,-0.86602545f},
+{0.30901697f,-0.95105654f}, {0.10452842f,-0.99452192f}, {-0.10452851f,-0.99452192f},
+{-0.30901703f,-0.95105648f}, {-0.50000006f,-0.86602533f}, {-0.66913068f,-0.74314475f},
+{-0.80901700f,-0.58778518f}, {-0.91354549f,-0.40673658f}, {-0.97814763f,-0.20791161f},
+{1.0000000f,-0.0000000f}, {0.99862951f,-0.052335959f}, {0.99452192f,-0.10452846f},
+{0.98768836f,-0.15643448f}, {0.97814763f,-0.20791170f}, {0.96592581f,-0.25881904f},
+{0.95105648f,-0.30901700f}, {0.93358040f,-0.35836795f}, {0.91354543f,-0.40673664f},
+{0.89100653f,-0.45399052f}, {0.86602545f,-0.50000000f}, {0.83867055f,-0.54463905f},
+{0.80901700f,-0.58778524f}, {0.77714598f,-0.62932038f}, {0.74314475f,-0.66913062f},
+{0.70710677f,-0.70710683f}, {0.66913056f,-0.74314487f}, {0.62932038f,-0.77714598f},
+{0.58778524f,-0.80901700f}, {0.54463899f,-0.83867055f}, {0.49999997f,-0.86602545f},
+{0.45399052f,-0.89100653f}, {0.40673661f,-0.91354549f}, {0.35836786f,-0.93358046f},
+{0.30901697f,-0.95105654f}, {0.25881907f,-0.96592581f}, {0.20791166f,-0.97814763f},
+{0.15643437f,-0.98768836f}, {0.10452842f,-0.99452192f}, {0.052335974f,-0.99862951f},
+{1.0000000f,-0.0000000f}, {0.99452192f,-0.10452846f}, {0.97814763f,-0.20791170f},
+{0.95105648f,-0.30901700f}, {0.91354543f,-0.40673664f}, {0.86602545f,-0.50000000f},
+{0.80901700f,-0.58778524f}, {0.74314475f,-0.66913062f}, {0.66913056f,-0.74314487f},
+{0.58778524f,-0.80901700f}, {0.49999997f,-0.86602545f}, {0.40673661f,-0.91354549f},
+{0.30901697f,-0.95105654f}, {0.20791166f,-0.97814763f}, {0.10452842f,-0.99452192f},
+{-4.3711388e-08f,-1.0000000f}, {-0.10452851f,-0.99452192f}, {-0.20791174f,-0.97814757f},
+{-0.30901703f,-0.95105648f}, {-0.40673670f,-0.91354543f}, {-0.50000006f,-0.86602533f},
+{-0.58778518f,-0.80901700f}, {-0.66913068f,-0.74314475f}, {-0.74314493f,-0.66913044f},
+{-0.80901700f,-0.58778518f}, {-0.86602539f,-0.50000006f}, {-0.91354549f,-0.40673658f},
+{-0.95105654f,-0.30901679f}, {-0.97814763f,-0.20791161f}, {-0.99452192f,-0.10452849f},
+{1.0000000f,-0.0000000f}, {0.98768836f,-0.15643448f}, {0.95105648f,-0.30901700f},
+{0.89100653f,-0.45399052f}, {0.80901700f,-0.58778524f}, {0.70710677f,-0.70710683f},
+{0.58778524f,-0.80901700f}, {0.45399052f,-0.89100653f}, {0.30901697f,-0.95105654f},
+{0.15643437f,-0.98768836f}, {-4.3711388e-08f,-1.0000000f}, {-0.15643445f,-0.98768836f},
+{-0.30901703f,-0.95105648f}, {-0.45399061f,-0.89100647f}, {-0.58778518f,-0.80901700f},
+{-0.70710677f,-0.70710677f}, {-0.80901700f,-0.58778518f}, {-0.89100659f,-0.45399037f},
+{-0.95105654f,-0.30901679f}, {-0.98768836f,-0.15643445f}, {-1.0000000f,8.7422777e-08f},
+{-0.98768830f,0.15643461f}, {-0.95105654f,0.30901697f}, {-0.89100653f,0.45399055f},
+{-0.80901694f,0.58778536f}, {-0.70710665f,0.70710689f}, {-0.58778507f,0.80901712f},
+{-0.45399022f,0.89100665f}, {-0.30901709f,0.95105648f}, {-0.15643452f,0.98768830f},
+};
+static const ne10_fft_cpx_float32_t ne10_twiddles_60[60] = {
+{1.0000000f,0.0000000f}, {1.0000000f,-0.0000000f}, {1.0000000f,-0.0000000f},
+{1.0000000f,-0.0000000f}, {0.91354543f,-0.40673664f}, {0.66913056f,-0.74314487f},
+{1.0000000f,-0.0000000f}, {0.66913056f,-0.74314487f}, {-0.10452851f,-0.99452192f},
+{1.0000000f,-0.0000000f}, {0.30901697f,-0.95105654f}, {-0.80901700f,-0.58778518f},
+{1.0000000f,-0.0000000f}, {-0.10452851f,-0.99452192f}, {-0.97814757f,0.20791179f},
+{1.0000000f,-0.0000000f}, {0.99452192f,-0.10452846f}, {0.97814763f,-0.20791170f},
+{0.95105648f,-0.30901700f}, {0.91354543f,-0.40673664f}, {0.86602545f,-0.50000000f},
+{0.80901700f,-0.58778524f}, {0.74314475f,-0.66913062f}, {0.66913056f,-0.74314487f},
+{0.58778524f,-0.80901700f}, {0.49999997f,-0.86602545f}, {0.40673661f,-0.91354549f},
+{0.30901697f,-0.95105654f}, {0.20791166f,-0.97814763f}, {0.10452842f,-0.99452192f},
+{1.0000000f,-0.0000000f}, {0.97814763f,-0.20791170f}, {0.91354543f,-0.40673664f},
+{0.80901700f,-0.58778524f}, {0.66913056f,-0.74314487f}, {0.49999997f,-0.86602545f},
+{0.30901697f,-0.95105654f}, {0.10452842f,-0.99452192f}, {-0.10452851f,-0.99452192f},
+{-0.30901703f,-0.95105648f}, {-0.50000006f,-0.86602533f}, {-0.66913068f,-0.74314475f},
+{-0.80901700f,-0.58778518f}, {-0.91354549f,-0.40673658f}, {-0.97814763f,-0.20791161f},
+{1.0000000f,-0.0000000f}, {0.95105648f,-0.30901700f}, {0.80901700f,-0.58778524f},
+{0.58778524f,-0.80901700f}, {0.30901697f,-0.95105654f}, {-4.3711388e-08f,-1.0000000f},
+{-0.30901703f,-0.95105648f}, {-0.58778518f,-0.80901700f}, {-0.80901700f,-0.58778518f},
+{-0.95105654f,-0.30901679f}, {-1.0000000f,8.7422777e-08f}, {-0.95105654f,0.30901697f},
+{-0.80901694f,0.58778536f}, {-0.58778507f,0.80901712f}, {-0.30901709f,0.95105648f},
+};
+static const ne10_fft_state_float32_t ne10_fft_state_float32_t_480 = { /* pre-built NE10 state, referenced by cfg_arch_480 */
+120, /* = 480/4; NOTE(review): presumably the length field as NE10 stores it - confirm against NE10_types.h */
+(ne10_int32_t *)ne10_factors_480, /* cast drops const from the static factor table */
+(ne10_fft_cpx_float32_t *)ne10_twiddles_480, /* cast drops const from the static twiddle table */
+NULL, /* NOTE(review): presumably the scratch-buffer pointer, left unset here - confirm */
+(ne10_fft_cpx_float32_t *)&ne10_twiddles_480[120], /* second pointer into the same table, starting at element 120 */
+/* is_forward_scaled = true */
+(ne10_int32_t) 1,
+/* is_backward_scaled = false */
+(ne10_int32_t) 0,
+};
+static const arch_fft_state cfg_arch_480 = { /* arch-specific FFT config wrapping the 480 NE10 state */
+1, /* NOTE(review): presumably an "is supported" flag - confirm against the arch_fft_state declaration */
+(void *)&ne10_fft_state_float32_t_480, /* type-erased pointer; the cast also drops const */
+};
+
+static const ne10_fft_state_float32_t ne10_fft_state_float32_t_240 = { /* pre-built NE10 state, referenced by cfg_arch_240 */
+60, /* = 240/4; NOTE(review): presumably the length field as NE10 stores it - confirm against NE10_types.h */
+(ne10_int32_t *)ne10_factors_240, /* cast drops const from the static factor table */
+(ne10_fft_cpx_float32_t *)ne10_twiddles_240, /* cast drops const from the static twiddle table */
+NULL, /* NOTE(review): presumably the scratch-buffer pointer, left unset here - confirm */
+(ne10_fft_cpx_float32_t *)&ne10_twiddles_240[60], /* second pointer into the same table, starting at element 60 */
+/* is_forward_scaled = true */
+(ne10_int32_t) 1,
+/* is_backward_scaled = false */
+(ne10_int32_t) 0,
+};
+static const arch_fft_state cfg_arch_240 = { /* arch-specific FFT config wrapping the 240 NE10 state */
+1, /* NOTE(review): presumably an "is supported" flag - confirm against the arch_fft_state declaration */
+(void *)&ne10_fft_state_float32_t_240, /* type-erased pointer; the cast also drops const */
+};
+
+static const ne10_fft_state_float32_t ne10_fft_state_float32_t_120 = { /* pre-built NE10 state, referenced by cfg_arch_120 */
+30, /* = 120/4; NOTE(review): presumably the length field as NE10 stores it - confirm against NE10_types.h */
+(ne10_int32_t *)ne10_factors_120, /* cast drops const from the static factor table */
+(ne10_fft_cpx_float32_t *)ne10_twiddles_120, /* cast drops const from the static twiddle table */
+NULL, /* NOTE(review): presumably the scratch-buffer pointer, left unset here - confirm */
+(ne10_fft_cpx_float32_t *)&ne10_twiddles_120[30], /* second pointer into the same table, starting at element 30 */
+/* is_forward_scaled = true */
+(ne10_int32_t) 1,
+/* is_backward_scaled = false */
+(ne10_int32_t) 0,
+};
+static const arch_fft_state cfg_arch_120 = { /* arch-specific FFT config wrapping the 120 NE10 state */
+1, /* NOTE(review): presumably an "is supported" flag - confirm against the arch_fft_state declaration */
+(void *)&ne10_fft_state_float32_t_120, /* type-erased pointer; the cast also drops const */
+};
+
+static const ne10_fft_state_float32_t ne10_fft_state_float32_t_60 = { /* pre-built NE10 state, referenced by cfg_arch_60 */
+15, /* = 60/4; NOTE(review): presumably the length field as NE10 stores it - confirm against NE10_types.h */
+(ne10_int32_t *)ne10_factors_60, /* cast drops const from the static factor table */
+(ne10_fft_cpx_float32_t *)ne10_twiddles_60, /* cast drops const from the static twiddle table */
+NULL, /* NOTE(review): presumably the scratch-buffer pointer, left unset here - confirm */
+(ne10_fft_cpx_float32_t *)&ne10_twiddles_60[15], /* second pointer into the same table, starting at element 15 */
+/* is_forward_scaled = true */
+(ne10_int32_t) 1,
+/* is_backward_scaled = false */
+(ne10_int32_t) 0,
+};
+static const arch_fft_state cfg_arch_60 = { /* arch-specific FFT config wrapping the 60 NE10 state */
+1, /* NOTE(review): presumably an "is supported" flag - confirm against the arch_fft_state declaration */
+(void *)&ne10_fft_state_float32_t_60, /* type-erased pointer; the cast also drops const */
+};
+
+#endif  /* end NE10_FFT_PARAMS48000_960 */
diff --git a/celt/tests/test_unit_cwrs32.c b/celt/tests/test_unit_cwrs32.c
index ac2a8d1..36dd8af 100644
--- a/celt/tests/test_unit_cwrs32.c
+++ b/celt/tests/test_unit_cwrs32.c
@@ -127,7 +127,7 @@
         cwrsi(n,k,i,y);
 #endif
         sy=0;
-        for(j=0;j<n;j++)sy+=ABS(y[j]);
+        for(j=0;j<n;j++)sy+=abs(y[j]);
         if(sy!=k){
           fprintf(stderr,"N=%d Pulse count mismatch in cwrsi (%d!=%d).\n",
            n,sy,k);
diff --git a/celt/tests/test_unit_dft.c b/celt/tests/test_unit_dft.c
index 7ff0be0..6166eb0 100644
--- a/celt/tests/test_unit_dft.c
+++ b/celt/tests/test_unit_dft.c
@@ -38,12 +38,29 @@
 #include <stdio.h>
 
 #define CELT_C
+#define TEST_UNIT_DFT_C
 #include "stack_alloc.h"
 #include "kiss_fft.h"
 #include "kiss_fft.c"
 #include "mathops.c"
 #include "entcode.c"
 
+#if defined(OPUS_X86_MAY_HAVE_SSE2) || defined(OPUS_X86_MAY_HAVE_SSE4_1)
+# include "x86/x86cpu.c"
+#elif defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
+# include "arm/armcpu.c"
+# include "celt_lpc.c"
+# include "pitch.c"
+# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
+#  include "arm/celt_neon_intr.c"
+#  if defined(HAVE_ARM_NE10)
+#   include "mdct.c"
+#   include "arm/celt_ne10_fft.c"
+#   include "arm/celt_ne10_mdct.c"
+#  endif
+# endif
+# include "arm/arm_celt_map.c"
+#endif
 
 #ifndef M_PI
 #define M_PI 3.141592653
@@ -92,13 +109,13 @@
     }
 }
 
-void test1d(int nfft,int isinverse)
+void test1d(int nfft,int isinverse,int arch)
 {
     size_t buflen = sizeof(kiss_fft_cpx)*nfft;
 
     kiss_fft_cpx  * in = (kiss_fft_cpx*)malloc(buflen);
     kiss_fft_cpx  * out= (kiss_fft_cpx*)malloc(buflen);
-    kiss_fft_state *cfg = opus_fft_alloc(nfft,0,0);
+    kiss_fft_state *cfg = opus_fft_alloc(nfft,0,0,arch);
     int k;
 
     for (k=0;k<nfft;++k) {
@@ -122,9 +139,9 @@
     /*for (k=0;k<nfft;++k) printf("%d %d ", in[k].r, in[k].i);printf("\n");*/
 
     if (isinverse)
-       opus_ifft(cfg,in,out);
+       opus_ifft(cfg,in,out, arch);
     else
-       opus_fft(cfg,in,out);
+       opus_fft(cfg,in,out, arch);
 
     /*for (k=0;k<nfft;++k) printf("%d %d ", out[k].r, out[k].i);printf("\n");*/
 
@@ -132,32 +149,40 @@
 
     free(in);
     free(out);
-    free(cfg);
+    opus_fft_free(cfg, arch);
 }
 
 int main(int argc,char ** argv)
 {
     ALLOC_STACK;
+    int arch = opus_select_arch();
+
     if (argc>1) {
         int k;
         for (k=1;k<argc;++k) {
-            test1d(atoi(argv[k]),0);
-            test1d(atoi(argv[k]),1);
+            test1d(atoi(argv[k]),0,arch);
+            test1d(atoi(argv[k]),1,arch);
         }
     }else{
-        test1d(32,0);
-        test1d(32,1);
-        test1d(128,0);
-        test1d(128,1);
-        test1d(256,0);
-        test1d(256,1);
+        test1d(32,0,arch);
+        test1d(32,1,arch);
+        test1d(128,0,arch);
+        test1d(128,1,arch);
+        test1d(256,0,arch);
+        test1d(256,1,arch);
 #ifndef RADIX_TWO_ONLY
-        test1d(36,0);
-        test1d(36,1);
-        test1d(50,0);
-        test1d(50,1);
-        test1d(120,0);
-        test1d(120,1);
+        test1d(36,0,arch);
+        test1d(36,1,arch);
+        test1d(50,0,arch);
+        test1d(50,1,arch);
+        test1d(60,0,arch);
+        test1d(60,1,arch);
+        test1d(120,0,arch);
+        test1d(120,1,arch);
+        test1d(240,0,arch);
+        test1d(240,1,arch);
+        test1d(480,0,arch);
+        test1d(480,1,arch);
 #endif
     }
     return ret;
diff --git a/celt/tests/test_unit_entropy.c b/celt/tests/test_unit_entropy.c
index bd83986..ff92658 100644
--- a/celt/tests/test_unit_entropy.c
+++ b/celt/tests/test_unit_entropy.c
@@ -66,10 +66,10 @@
   const char    *env_seed;
   ret=0;
   entropy=0;
-    if (_argc > 2) {
-	fprintf(stderr, "Usage: %s [<seed>]\n", _argv[0]);
-	return 1;
-    }
+  if (_argc > 2) {
+    fprintf(stderr, "Usage: %s [<seed>]\n", _argv[0]);
+    return 1;
+  }
   env_seed = getenv("SEED");
   if (_argc > 1)
     seed = atoi(_argv[1]);
diff --git a/celt/tests/test_unit_laplace.c b/celt/tests/test_unit_laplace.c
index b0f5935..22951e2 100644
--- a/celt/tests/test_unit_laplace.c
+++ b/celt/tests/test_unit_laplace.c
@@ -88,5 +88,6 @@
       }
    }
 
+   free(ptr);
    return ret;
 }
diff --git a/celt/tests/test_unit_mathops.c b/celt/tests/test_unit_mathops.c
index 4bb780e..fd3319d 100644
--- a/celt/tests/test_unit_mathops.c
+++ b/celt/tests/test_unit_mathops.c
@@ -36,6 +36,8 @@
 
 #define CELT_C
 
+#include <stdio.h>
+#include <math.h>
 #include "mathops.c"
 #include "entenc.c"
 #include "entdec.c"
@@ -45,8 +47,35 @@
 #include "laplace.c"
 #include "vq.c"
 #include "cwrs.c"
-#include <stdio.h>
-#include <math.h>
+#include "pitch.c"
+#include "celt_lpc.c"
+#include "celt.c"
+
+#if defined(OPUS_X86_MAY_HAVE_SSE) || defined(OPUS_X86_MAY_HAVE_SSE2) || defined(OPUS_X86_MAY_HAVE_SSE4_1)
+# if defined(OPUS_X86_MAY_HAVE_SSE)
+#  include "x86/pitch_sse.c"
+# endif
+# if defined(OPUS_X86_MAY_HAVE_SSE2)
+#  include "x86/pitch_sse2.c"
+# endif
+# if defined(OPUS_X86_MAY_HAVE_SSE4_1)
+#  include "x86/pitch_sse4_1.c"
+#  include "x86/celt_lpc_sse.c"
+# endif
+# include "x86/x86_celt_map.c"
+#elif defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
+# include "arm/armcpu.c"
+# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
+#  include "arm/celt_neon_intr.c"
+#  if defined(HAVE_ARM_NE10)
+#   include "kiss_fft.c"
+#   include "mdct.c"
+#   include "arm/celt_ne10_fft.c"
+#   include "arm/celt_ne10_mdct.c"
+#  endif
+# endif
+# include "arm/arm_celt_map.c"
+#endif
 
 #ifdef FIXED_POINT
 #define WORD "%d"
@@ -214,7 +243,7 @@
       float error2 = fabs(exp(0.6931471805599453094*x/1024.0)-celt_exp2(x)/65536.0);
       if (error1>0.0002&&error2>0.00004)
       {
-    	 fprintf (stderr, "celt_exp2 failed: x = "WORD", error1 = %f, error2 = %f\n", x,error1,error2);
+         fprintf (stderr, "celt_exp2 failed: x = "WORD", error1 = %f, error2 = %f\n", x,error1,error2);
          ret = 1;
       }
    }
diff --git a/celt/tests/test_unit_mdct.c b/celt/tests/test_unit_mdct.c
index ac8957f..8dbb9ca 100644
--- a/celt/tests/test_unit_mdct.c
+++ b/celt/tests/test_unit_mdct.c
@@ -46,6 +46,22 @@
 #include "mathops.c"
 #include "entcode.c"
 
+#if defined(OPUS_X86_MAY_HAVE_SSE2) || defined(OPUS_X86_MAY_HAVE_SSE4_1)
+# include "x86/x86cpu.c"
+#elif defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
+# include "arm/armcpu.c"
+# include "pitch.c"
+# include "celt_lpc.c"
+# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
+#  include "arm/celt_neon_intr.c"
+#  if defined(HAVE_ARM_NE10)
+#   include "arm/celt_ne10_fft.c"
+#   include "arm/celt_ne10_mdct.c"
+#  endif
+# endif
+# include "arm/arm_celt_map.c"
+#endif
+
 #ifndef M_PI
 #define M_PI 3.141592653
 #endif
@@ -112,7 +128,7 @@
 }
 
 
-void test1d(int nfft,int isinverse)
+void test1d(int nfft,int isinverse,int arch)
 {
     mdct_lookup cfg;
     size_t buflen = sizeof(kiss_fft_scalar)*nfft;
@@ -123,7 +139,7 @@
     opus_val16  * window= (opus_val16*)malloc(sizeof(opus_val16)*nfft/2);
     int k;
 
-    clt_mdct_init(&cfg, nfft, 0);
+    clt_mdct_init(&cfg, nfft, 0, arch);
     for (k=0;k<nfft;++k) {
         in[k] = (rand() % 32768) - 16384;
     }
@@ -150,60 +166,64 @@
     {
        for (k=0;k<nfft;++k)
           out[k] = 0;
-       clt_mdct_backward(&cfg,in,out, window, nfft/2, 0, 1);
+       clt_mdct_backward(&cfg,in,out, window, nfft/2, 0, 1, arch);
        /* apply TDAC because clt_mdct_backward() no longer does that */
        for (k=0;k<nfft/4;++k)
           out[nfft-k-1] = out[nfft/2+k];
        check_inv(in,out,nfft,isinverse);
     } else {
-       clt_mdct_forward(&cfg,in,out,window, nfft/2, 0, 1);
+       clt_mdct_forward(&cfg,in,out,window, nfft/2, 0, 1, arch);
        check(in_copy,out,nfft,isinverse);
     }
     /*for (k=0;k<nfft;++k) printf("%d %d ", out[k].r, out[k].i);printf("\n");*/
 
 
     free(in);
+    free(in_copy);
     free(out);
-    clt_mdct_clear(&cfg);
+    free(window);
+    clt_mdct_clear(&cfg, arch);
 }
 
 int main(int argc,char ** argv)
 {
     ALLOC_STACK;
+    int arch = opus_select_arch();
+
     if (argc>1) {
         int k;
         for (k=1;k<argc;++k) {
-            test1d(atoi(argv[k]),0);
-            test1d(atoi(argv[k]),1);
+            test1d(atoi(argv[k]),0,arch);
+            test1d(atoi(argv[k]),1,arch);
         }
     }else{
-        test1d(32,0);
-        test1d(32,1);
-        test1d(256,0);
-        test1d(256,1);
-        test1d(512,0);
-        test1d(512,1);
-        test1d(1024,0);
-        test1d(1024,1);
-        test1d(2048,0);
-        test1d(2048,1);
+        test1d(32,0,arch);
+        test1d(32,1,arch);
+        test1d(256,0,arch);
+        test1d(256,1,arch);
+        test1d(512,0,arch);
+        test1d(512,1,arch);
+        test1d(1024,0,arch);
+        test1d(1024,1,arch);
+        test1d(2048,0,arch);
+        test1d(2048,1,arch);
 #ifndef RADIX_TWO_ONLY
-        test1d(36,0);
-        test1d(36,1);
-        test1d(40,0);
-        test1d(40,1);
-        test1d(60,0);
-        test1d(60,1);
-        test1d(120,0);
-        test1d(120,1);
-        test1d(240,0);
-        test1d(240,1);
-        test1d(480,0);
-        test1d(480,1);
-        test1d(960,0);
-        test1d(960,1);
-        test1d(1920,0);
-        test1d(1920,1);
+        test1d(36,0,arch);
+        test1d(36,1,arch);
+        test1d(40,0,arch);
+        test1d(40,1,arch);
+        test1d(60,0,arch);
+        test1d(60,1,arch);
+        test1d(120,0,arch);
+        test1d(120,1,arch);
+        test1d(240,0,arch);
+        test1d(240,1,arch);
+        test1d(480,0,arch);
+        test1d(480,1,arch);
+        test1d(960,0,arch);
+        test1d(960,1,arch);
+        test1d(1920,0,arch);
+        test1d(1920,1,arch);
 #endif
     }
     return ret;
diff --git a/celt/tests/test_unit_rotation.c b/celt/tests/test_unit_rotation.c
index ce5f096..1080c20 100644
--- a/celt/tests/test_unit_rotation.c
+++ b/celt/tests/test_unit_rotation.c
@@ -44,7 +44,37 @@
 #include "entdec.c"
 #include "mathops.c"
 #include "bands.h"
+#include "pitch.c"
+#include "celt_lpc.c"
+#include "celt.c"
 #include <math.h>
+
+#if defined(OPUS_X86_MAY_HAVE_SSE) || defined(OPUS_X86_MAY_HAVE_SSE2) || defined(OPUS_X86_MAY_HAVE_SSE4_1)
+# if defined(OPUS_X86_MAY_HAVE_SSE)
+#  include "x86/pitch_sse.c"
+# endif
+# if defined(OPUS_X86_MAY_HAVE_SSE2)
+#  include "x86/pitch_sse2.c"
+# endif
+# if defined(OPUS_X86_MAY_HAVE_SSE4_1)
+#  include "x86/pitch_sse4_1.c"
+#  include "x86/celt_lpc_sse.c"
+# endif
+# include "x86/x86_celt_map.c"
+#elif defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
+# include "arm/armcpu.c"
+# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
+#  include "arm/celt_neon_intr.c"
+#  if defined(HAVE_ARM_NE10)
+#   include "kiss_fft.c"
+#   include "mdct.c"
+#   include "arm/celt_ne10_fft.c"
+#   include "arm/celt_ne10_mdct.c"
+#  endif
+# endif
+# include "arm/arm_celt_map.c"
+#endif
+
 #define MAX_SIZE 100
 
 int ret=0;
diff --git a/celt/vq.c b/celt/vq.c
index 98a0f36..f358396 100644
--- a/celt/vq.c
+++ b/celt/vq.c
@@ -37,19 +37,23 @@
 #include "os_support.h"
 #include "bands.h"
 #include "rate.h"
+#include "pitch.h"
 
+#ifndef OVERRIDE_vq_exp_rotation1
 static void exp_rotation1(celt_norm *X, int len, int stride, opus_val16 c, opus_val16 s)
 {
    int i;
+   opus_val16 ms;
    celt_norm *Xptr;
    Xptr = X;
+   ms = NEG16(s);
    for (i=0;i<len-stride;i++)
    {
       celt_norm x1, x2;
       x1 = Xptr[0];
       x2 = Xptr[stride];
-      Xptr[stride] = EXTRACT16(SHR32(MULT16_16(c,x2) + MULT16_16(s,x1), 15));
-      *Xptr++      = EXTRACT16(SHR32(MULT16_16(c,x1) - MULT16_16(s,x2), 15));
+      Xptr[stride] = EXTRACT16(PSHR32(MAC16_16(MULT16_16(c, x2),  s, x1), 15));
+      *Xptr++      = EXTRACT16(PSHR32(MAC16_16(MULT16_16(c, x1), ms, x2), 15));
    }
    Xptr = &X[len-2*stride-1];
    for (i=len-2*stride-1;i>=0;i--)
@@ -57,10 +61,11 @@
       celt_norm x1, x2;
       x1 = Xptr[0];
       x2 = Xptr[stride];
-      Xptr[stride] = EXTRACT16(SHR32(MULT16_16(c,x2) + MULT16_16(s,x1), 15));
-      *Xptr--      = EXTRACT16(SHR32(MULT16_16(c,x1) - MULT16_16(s,x2), 15));
+      Xptr[stride] = EXTRACT16(PSHR32(MAC16_16(MULT16_16(c, x2),  s, x1), 15));
+      *Xptr--      = EXTRACT16(PSHR32(MAC16_16(MULT16_16(c, x1), ms, x2), 15));
    }
 }
+#endif /* OVERRIDE_vq_exp_rotation1 */
 
 static void exp_rotation(celt_norm *X, int len, int dir, int stride, int K, int spread)
 {
@@ -91,7 +96,7 @@
    }
    /*NOTE: As a minor optimization, we could be passing around log2(B), not B, for both this and for
       extract_collapse_mask().*/
-   len /= stride;
+   len = celt_udiv(len, stride);
    for (i=0;i<stride;i++)
    {
       if (dir < 0)
@@ -140,13 +145,15 @@
       return 1;
    /*NOTE: As a minor optimization, we could be passing around log2(B), not B, for both this and for
       exp_rotation().*/
-   N0 = N/B;
+   N0 = celt_udiv(N, B);
    collapse_mask = 0;
    i=0; do {
       int j;
+      unsigned tmp=0;
       j=0; do {
-         collapse_mask |= (iy[i*N0+j]!=0)<<i;
+         tmp |= iy[i*N0+j];
       } while (++j<N0);
+      collapse_mask |= (tmp!=0)<<i;
    } while (++i<B);
    return collapse_mask;
 }
@@ -322,7 +329,6 @@
 unsigned alg_unquant(celt_norm *X, int N, int K, int spread, int B,
       ec_dec *dec, opus_val16 gain)
 {
-   int i;
    opus_val32 Ryy;
    unsigned collapse_mask;
    VARDECL(int, iy);
@@ -331,12 +337,7 @@
    celt_assert2(K>0, "alg_unquant() needs at least one pulse");
    celt_assert2(N>1, "alg_unquant() needs at least two dimensions");
    ALLOC(iy, N, int);
-   decode_pulses(iy, N, K, dec);
-   Ryy = 0;
-   i=0;
-   do {
-      Ryy = MAC16_16(Ryy, iy[i], iy[i]);
-   } while (++i < N);
+   Ryy = decode_pulses(iy, N, K, dec);
    normalise_residual(iy, X, N, Ryy, gain);
    exp_rotation(X, N, -1, B, K, spread);
    collapse_mask = extract_collapse_mask(iy, N, B);
@@ -344,21 +345,18 @@
    return collapse_mask;
 }
 
-void renormalise_vector(celt_norm *X, int N, opus_val16 gain)
+#ifndef OVERRIDE_renormalise_vector
+void renormalise_vector(celt_norm *X, int N, opus_val16 gain, int arch)
 {
    int i;
 #ifdef FIXED_POINT
    int k;
 #endif
-   opus_val32 E = EPSILON;
+   opus_val32 E;
    opus_val16 g;
    opus_val32 t;
-   celt_norm *xptr = X;
-   for (i=0;i<N;i++)
-   {
-      E = MAC16_16(E, *xptr, *xptr);
-      xptr++;
-   }
+   celt_norm *xptr;
+   E = EPSILON + celt_inner_prod(X, X, N, arch);
 #ifdef FIXED_POINT
    k = celt_ilog2(E)>>1;
 #endif
@@ -373,8 +371,9 @@
    }
    /*return celt_sqrt(E);*/
 }
+#endif /* OVERRIDE_renormalise_vector */
 
-int stereo_itheta(celt_norm *X, celt_norm *Y, int stereo, int N)
+int stereo_itheta(const celt_norm *X, const celt_norm *Y, int stereo, int N, int arch)
 {
    int i;
    int itheta;
@@ -393,14 +392,8 @@
          Eside = MAC16_16(Eside, s, s);
       }
    } else {
-      for (i=0;i<N;i++)
-      {
-         celt_norm m, s;
-         m = X[i];
-         s = Y[i];
-         Emid = MAC16_16(Emid, m, m);
-         Eside = MAC16_16(Eside, s, s);
-      }
+      Emid += celt_inner_prod(X, X, N, arch);
+      Eside += celt_inner_prod(Y, Y, N, arch);
    }
    mid = celt_sqrt(Emid);
    side = celt_sqrt(Eside);
diff --git a/celt/vq.h b/celt/vq.h
index ffdc69c..5cfcbe5 100644
--- a/celt/vq.h
+++ b/celt/vq.h
@@ -37,6 +37,11 @@
 #include "entdec.h"
 #include "modes.h"
 
+#if defined(MIPSr1_ASM)
+#include "mips/vq_mipsr1.h"
+#endif
+
+
 /** Algebraic pulse-vector quantiser. The signal x is replaced by the sum of
   * the pitch and a combination of pulses such that its norm is still equal
   * to 1. This is the function that will typically require the most CPU.
@@ -63,8 +68,8 @@
 unsigned alg_unquant(celt_norm *X, int N, int K, int spread, int B,
       ec_dec *dec, opus_val16 gain);
 
-void renormalise_vector(celt_norm *X, int N, opus_val16 gain);
+void renormalise_vector(celt_norm *X, int N, opus_val16 gain, int arch);
 
-int stereo_itheta(celt_norm *X, celt_norm *Y, int stereo, int N);
+int stereo_itheta(const celt_norm *X, const celt_norm *Y, int stereo, int N, int arch);
 
 #endif /* VQ_H */
diff --git a/celt/x86/celt_lpc_sse.c b/celt/x86/celt_lpc_sse.c
new file mode 100644
index 0000000..67e5592
--- /dev/null
+++ b/celt/x86/celt_lpc_sse.c
@@ -0,0 +1,132 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+   Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#include <smmintrin.h>
+#include "celt_lpc.h"
+#include "stack_alloc.h"
+#include "mathops.h"
+#include "pitch.h"
+#include "x86cpu.h"
+
+#if defined(FIXED_POINT)
+
+void celt_fir_sse4_1(const opus_val16 *_x, /* input samples; N values are read */
+         const opus_val16 *num,            /* filter taps; ord values are read */
+         opus_val16 *_y,                   /* output samples; N values are written */
+         int N,                            /* number of samples to filter */
+         int ord,                          /* number of taps / history samples */
+         opus_val16 *mem,                  /* ord history samples: read first, then overwritten from _x */
+         int arch)                         /* forwarded to xcorr_kernel() */
+{ /* Fixed-point FIR: _y[i] = sat16(_x[i] + (sum_t num[t]*in[i-1-t], scaled down by SIG_SHIFT)). */
+    int i,j;
+    VARDECL(opus_val16, rnum);
+    VARDECL(opus_val16, x);
+
+    __m128i vecNoA;
+    opus_int32 noA ;
+    SAVE_STACK;
+
+   ALLOC(rnum, ord, opus_val16);
+   ALLOC(x, N+ord, opus_val16); /* ord history samples followed by the N inputs */
+   for(i=0;i<ord;i++) /* rnum = taps in reverse order */
+      rnum[i] = num[ord-i-1];
+   for(i=0;i<ord;i++) /* history goes in front of the inputs, also reversed */
+      x[i] = mem[ord-i-1];
+
+   for (i=0;i<N-7;i+=8) /* copy the inputs after the history: 8 at a time... */
+   {
+       x[i+ord  ]=_x[i  ];
+       x[i+ord+1]=_x[i+1];
+       x[i+ord+2]=_x[i+2];
+       x[i+ord+3]=_x[i+3];
+       x[i+ord+4]=_x[i+4];
+       x[i+ord+5]=_x[i+5];
+       x[i+ord+6]=_x[i+6];
+       x[i+ord+7]=_x[i+7];
+   }
+
+   for (;i<N-3;i+=4) /* ...then 4 at a time... */
+   {
+       x[i+ord  ]=_x[i  ];
+       x[i+ord+1]=_x[i+1];
+       x[i+ord+2]=_x[i+2];
+       x[i+ord+3]=_x[i+3];
+   }
+
+   for (;i<N;i++) /* ...then whatever is left, one by one */
+         x[i+ord]=_x[i];
+
+   for(i=0;i<ord;i++) /* NOTE(review): reads _x[N-ord..N-1], so presumably N >= ord is required -- confirm */
+      mem[i] = _x[N-i-1]; /* mem now holds the last ord inputs, newest first */
+#ifdef SMALL_FOOTPRINT /* scalar-only path; noA and vecNoA are unused here */
+   for (i=0;i<N;i++)
+   {
+      opus_val32 sum = SHL32(EXTEND32(_x[i]), SIG_SHIFT);
+      for (j=0;j<ord;j++)
+      {
+         sum = MAC16_16(sum,rnum[j],x[i+j]);
+      }
+      _y[i] = SATURATE16(PSHR32(sum, SIG_SHIFT));
+   }
+#else
+   noA = EXTEND32(1) << SIG_SHIFT >> 1; /* rounding offset: half of 1<<SIG_SHIFT */
+   vecNoA = _mm_set_epi32(noA, noA, noA, noA);
+
+   for (i=0;i<N-3;i+=4) /* four outputs per iteration */
+   {
+      opus_val32 sums[4] = {0};
+      __m128i vecSum, vecX;
+
+      xcorr_kernel(rnum, x+i, sums, ord, arch); /* presumably sums[k] = sum_j rnum[j]*x[i+k+j], as in the scalar tail -- confirm */
+
+      vecSum = _mm_loadu_si128((__m128i *)sums);
+      vecSum = _mm_add_epi32(vecSum, vecNoA);       /* add the rounding offset */
+      vecSum = _mm_srai_epi32(vecSum, SIG_SHIFT);   /* arithmetic shift right by SIG_SHIFT */
+      vecX = OP_CVTEPI16_EPI32_M64(_x + i);         /* presumably widens four 16-bit inputs to 32 bits; macro defined elsewhere */
+      vecSum = _mm_add_epi32(vecSum, vecX);         /* add the input samples */
+      vecSum = _mm_packs_epi32(vecSum, vecSum);     /* narrow to 16 bits with signed saturation */
+      _mm_storel_epi64((__m128i *)(_y + i), vecSum); /* store the low 64 bits: four 16-bit outputs */
+   }
+   for (;i<N;i++) /* leftover outputs when N is not a multiple of 4 */
+   {
+      opus_val32 sum = 0;
+      for (j=0;j<ord;j++)
+         sum = MAC16_16(sum, rnum[j], x[i + j]);
+      _y[i] = SATURATE16(ADD32(EXTEND32(_x[i]), PSHR32(sum, SIG_SHIFT)));
+   }
+
+#endif
+   RESTORE_STACK;
+}
+
+#endif
diff --git a/celt/x86/celt_lpc_sse.h b/celt/x86/celt_lpc_sse.h
new file mode 100644
index 0000000..c5ec796
--- /dev/null
+++ b/celt/x86/celt_lpc_sse.h
@@ -0,0 +1,68 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+   Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef CELT_LPC_SSE_H
+#define CELT_LPC_SSE_H
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)
+#define OVERRIDE_CELT_FIR
+/* SSE4.1 implementation of celt_fir(); only declared for fixed-point builds. */
+void celt_fir_sse4_1(
+         const opus_val16 *x,
+         const opus_val16 *num,
+         opus_val16 *y,
+         int N,
+         int ord,
+         opus_val16 *mem,
+         int arch);
+
+#if defined(OPUS_X86_PRESUME_SSE4_1)
+#define celt_fir(x, num, y, N, ord, mem, arch) \
+    ((void)arch, celt_fir_sse4_1(x, num, y, N, ord, mem, arch))
+/* Above: SSE4.1 is presumed, so celt_fir() expands to a direct call. */
+#else
+/* Otherwise celt_fir() calls through a table indexed by (arch & OPUS_ARCHMASK). */
+extern void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])(
+         const opus_val16 *x,
+         const opus_val16 *num,
+         opus_val16 *y,
+         int N,
+         int ord,
+         opus_val16 *mem,
+         int arch);
+
+#  define celt_fir(x, num, y, N, ord, mem, arch) \
+    ((*CELT_FIR_IMPL[(arch) & OPUS_ARCHMASK])(x, num, y, N, ord, mem, arch))
+
+#endif
+#endif
+
+#endif
diff --git a/celt/x86/pitch_sse.c b/celt/x86/pitch_sse.c
new file mode 100644
index 0000000..20e7312
--- /dev/null
+++ b/celt/x86/pitch_sse.c
@@ -0,0 +1,185 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+   Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "macros.h"
+#include "celt_lpc.h"
+#include "stack_alloc.h"
+#include "mathops.h"
+#include "pitch.h"
+
+#if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)
+
+#include <xmmintrin.h>
+#include "arch.h"
+
+void xcorr_kernel_sse(const opus_val16 *x, const opus_val16 *y, opus_val32 sum[4], int len)
+{ /* Adds x[j]*y[j+k] for j = 0..len-1 into sum[k], k = 0..3; loads reach y[len+2]. */
+   int j;
+   __m128 xsum1, xsum2;
+   xsum1 = _mm_loadu_ps(sum); /* continue from the caller's running sums */
+   xsum2 = _mm_setzero_ps();  /* second accumulator, merged with xsum1 at the end */
+
+   for (j = 0; j < len-3; j += 4) /* four x samples per iteration */
+   {
+      __m128 x0 = _mm_loadu_ps(x+j);
+      __m128 yj = _mm_loadu_ps(y+j);   /* y[j..j+3] */
+      __m128 y3 = _mm_loadu_ps(y+j+3); /* y[j+3..j+6] */
+
+      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x00),yj)); /* x[j] * y[j..j+3] */
+      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x55),
+                                          _mm_shuffle_ps(yj,y3,0x49))); /* x[j+1] * y[j+1..j+4] */
+      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xaa),
+                                          _mm_shuffle_ps(yj,y3,0x9e))); /* x[j+2] * y[j+2..j+5] */
+      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xff),y3)); /* x[j+3] * y[j+3..j+6] */
+   }
+   if (j < len) /* up to three leftover x samples, one per nested branch */
+   {
+      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
+      if (++j < len)
+      {
+         xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
+         if (++j < len)
+         {
+            xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
+         }
+      }
+   }
+   _mm_storeu_ps(sum,_mm_add_ps(xsum1,xsum2)); /* merge both accumulators and write back */
+}
+
+
+void dual_inner_prod_sse(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
+      int N, opus_val32 *xy1, opus_val32 *xy2)
+{ /* One pass over x: *xy1 gets the dot product of x with y01, *xy2 that of x with y02 (N elements). */
+   int i;
+   __m128 xsum1, xsum2;
+   xsum1 = _mm_setzero_ps();
+   xsum2 = _mm_setzero_ps();
+   for (i=0;i<N-3;i+=4) /* four elements per iteration; x is loaded once for both products */
+   {
+      __m128 xi = _mm_loadu_ps(x+i);
+      __m128 y1i = _mm_loadu_ps(y01+i);
+      __m128 y2i = _mm_loadu_ps(y02+i);
+      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(xi, y1i));
+      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(xi, y2i));
+   }
+   /* Horizontal sum */
+   xsum1 = _mm_add_ps(xsum1, _mm_movehl_ps(xsum1, xsum1)); /* lanes 0,1 += lanes 2,3 */
+   xsum1 = _mm_add_ss(xsum1, _mm_shuffle_ps(xsum1, xsum1, 0x55)); /* lane 0 += lane 1 */
+   _mm_store_ss(xy1, xsum1); /* lane 0 now holds the vector-part total */
+   xsum2 = _mm_add_ps(xsum2, _mm_movehl_ps(xsum2, xsum2));
+   xsum2 = _mm_add_ss(xsum2, _mm_shuffle_ps(xsum2, xsum2, 0x55));
+   _mm_store_ss(xy2, xsum2);
+   for (;i<N;i++) /* up to three leftover elements, folded in with MAC16_16 (defined elsewhere) */
+   {
+      *xy1 = MAC16_16(*xy1, x[i], y01[i]);
+      *xy2 = MAC16_16(*xy2, x[i], y02[i]);
+   }
+}
+
+opus_val32 celt_inner_prod_sse(const opus_val16 *x, const opus_val16 *y,
+      int N)
+{ /* Returns the dot product of x and y over N elements. */
+   int i;
+   float xy;
+   __m128 sum;
+   sum = _mm_setzero_ps();
+   /* FIXME: We should probably go 8-way and use 2 sums. */
+   for (i=0;i<N-3;i+=4) /* four elements per iteration */
+   {
+      __m128 xi = _mm_loadu_ps(x+i);
+      __m128 yi = _mm_loadu_ps(y+i);
+      sum = _mm_add_ps(sum,_mm_mul_ps(xi, yi));
+   }
+   /* Horizontal sum */
+   sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum)); /* lanes 0,1 += lanes 2,3 */
+   sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55)); /* lane 0 += lane 1 */
+   _mm_store_ss(&xy, sum); /* xy = total of the vector part */
+   for (;i<N;i++) /* up to three leftover elements, folded in with MAC16_16 (defined elsewhere) */
+   {
+      xy = MAC16_16(xy, x[i], y[i]);
+   }
+   return xy;
+}
+
+void comb_filter_const_sse(opus_val32 *y, opus_val32 *x, int T, int N,
+      opus_val16 g10, opus_val16 g11, opus_val16 g12)
+{ /* y[i] = x[i] + g10*x[i-T] + g11*(x[i-T+1]+x[i-T-1]) + g12*(x[i-T+2]+x[i-T-2]), four i per step. */
+   int i;
+   __m128 x0v;
+   __m128 g10v, g11v, g12v;
+   g10v = _mm_load1_ps(&g10); /* broadcast each gain to all four lanes */
+   g11v = _mm_load1_ps(&g11);
+   g12v = _mm_load1_ps(&g12);
+   x0v = _mm_loadu_ps(&x[-T-2]); /* first window x[-T-2..-T+1]; later windows are carried over in the loop */
+   for (i=0;i<N-3;i+=4)
+   {
+      __m128 yi, yi2, x1v, x2v, x3v, x4v;
+      const opus_val32 *xp = &x[i-T-2]; /* x0v already holds xp[0..3] */
+      yi = _mm_loadu_ps(x+i);
+      x4v = _mm_loadu_ps(xp+4); /* xp[4..7] */
+#if 0
+      /* Slower version with all loads */
+      x1v = _mm_loadu_ps(xp+1);
+      x2v = _mm_loadu_ps(xp+2);
+      x3v = _mm_loadu_ps(xp+3);
+#else
+      x2v = _mm_shuffle_ps(x0v, x4v, 0x4e); /* xp[2..5], i.e. x[i-T..i-T+3] */
+      x1v = _mm_shuffle_ps(x0v, x2v, 0x99); /* xp[1..4] */
+      x3v = _mm_shuffle_ps(x2v, x4v, 0x99); /* xp[3..6] */
+#endif
+
+      yi = _mm_add_ps(yi, _mm_mul_ps(g10v,x2v)); /* centre tap */
+#if 0 /* Set to 1 to make it bit-exact with the non-SSE version */
+      yi = _mm_add_ps(yi, _mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)));
+      yi = _mm_add_ps(yi, _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v)));
+#else
+      /* Use partial sums */
+      yi2 = _mm_add_ps(_mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)),
+                       _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v)));
+      yi = _mm_add_ps(yi, yi2);
+#endif
+      x0v=x4v; /* this step's xp[4..7] is the next step's xp[0..3] */
+      _mm_storeu_ps(y+i, yi);
+   }
+#ifdef CUSTOM_MODES /* NOTE(review): without CUSTOM_MODES the last N%4 outputs are not written; presumably N is a multiple of 4 there -- confirm */
+   for (;i<N;i++)
+   {
+      y[i] = x[i]
+               + MULT16_32_Q15(g10,x[i-T])
+               + MULT16_32_Q15(g11,ADD32(x[i-T+1],x[i-T-1]))
+               + MULT16_32_Q15(g12,ADD32(x[i-T+2],x[i-T-2]));
+   }
+#endif
+}
+
+
+#endif
diff --git a/celt/x86/pitch_sse.h b/celt/x86/pitch_sse.h
index 695122a..d4cbeb8 100644
--- a/celt/x86/pitch_sse.h
+++ b/celt/x86/pitch_sse.h
@@ -1,4 +1,5 @@
-/* Copyright (c) 2013 Jean-Marc Valin and John Ridges */
+/* Copyright (c) 2013 Jean-Marc Valin and John Ridges
+   Copyright (c) 2014, Cisco Systems, INC MingXiang WeiZhou MinPeng YanWang*/
 /**
    @file pitch_sse.h
    @brief Pitch analysis
@@ -32,125 +33,160 @@
 #ifndef PITCH_SSE_H
 #define PITCH_SSE_H
 
-#include <xmmintrin.h>
-#include "arch.h"
+#if defined(HAVE_CONFIG_H)
+#include "config.h"
+#endif
+
+#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)
+void xcorr_kernel_sse4_1(
+                    const opus_int16 *x,
+                    const opus_int16 *y,
+                    opus_val32       sum[4],
+                    int              len);
+#endif
+
+#if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)
+void xcorr_kernel_sse(
+                    const opus_val16 *x,
+                    const opus_val16 *y,
+                    opus_val32       sum[4],
+                    int              len);
+#endif
+
+#if defined(OPUS_X86_PRESUME_SSE4_1) && defined(FIXED_POINT)
+#define OVERRIDE_XCORR_KERNEL
+#define xcorr_kernel(x, y, sum, len, arch) \
+    ((void)arch, xcorr_kernel_sse4_1(x, y, sum, len))
+
+#elif defined(OPUS_X86_PRESUME_SSE) && !defined(FIXED_POINT)
+#define OVERRIDE_XCORR_KERNEL
+#define xcorr_kernel(x, y, sum, len, arch) \
+    ((void)arch, xcorr_kernel_sse(x, y, sum, len))
+
+#elif (defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)) || (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT))
+
+extern void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
+                    const opus_val16 *x,
+                    const opus_val16 *y,
+                    opus_val32       sum[4],
+                    int              len);
 
 #define OVERRIDE_XCORR_KERNEL
-static OPUS_INLINE void xcorr_kernel(const opus_val16 *x, const opus_val16 *y, opus_val32 sum[4], int len)
-{
-   int j;
-   __m128 xsum1, xsum2;
-   xsum1 = _mm_loadu_ps(sum);
-   xsum2 = _mm_setzero_ps();
+#define xcorr_kernel(x, y, sum, len, arch) \
+    ((*XCORR_KERNEL_IMPL[(arch) & OPUS_ARCHMASK])(x, y, sum, len))
 
-   for (j = 0; j < len-3; j += 4)
-   {
-      __m128 x0 = _mm_loadu_ps(x+j);
-      __m128 yj = _mm_loadu_ps(y+j);
-      __m128 y3 = _mm_loadu_ps(y+j+3);
+#endif
 
-      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x00),yj));
-      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x55),
-                                          _mm_shuffle_ps(yj,y3,0x49)));
-      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xaa),
-                                          _mm_shuffle_ps(yj,y3,0x9e)));
-      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xff),y3));
-   }
-   if (j < len)
-   {
-      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
-      if (++j < len)
-      {
-         xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
-         if (++j < len)
-         {
-            xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
-         }
-      }
-   }
-   _mm_storeu_ps(sum,_mm_add_ps(xsum1,xsum2));
-}
+#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)
+opus_val32 celt_inner_prod_sse4_1(
+    const opus_int16 *x,
+    const opus_int16 *y,
+    int               N);
+#endif
+
+#if defined(OPUS_X86_MAY_HAVE_SSE2) && defined(FIXED_POINT)
+opus_val32 celt_inner_prod_sse2(
+    const opus_int16 *x,
+    const opus_int16 *y,
+    int               N);
+#endif
+
+#if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT) /* float SSE kernel: guard must match its definition in pitch_sse.c */
+opus_val32 celt_inner_prod_sse(
+    const opus_val16 *x,
+    const opus_val16 *y,
+    int               N);
+#endif
+
+
+#if defined(OPUS_X86_PRESUME_SSE4_1) && defined(FIXED_POINT) /* ISA presumed: celt_inner_prod() is a direct call */
+#define OVERRIDE_CELT_INNER_PROD
+#define celt_inner_prod(x, y, N, arch) \
+    ((void)(arch), celt_inner_prod_sse4_1(x, y, N))
+
+#elif defined(OPUS_X86_PRESUME_SSE2) && defined(FIXED_POINT) && !defined(OPUS_X86_MAY_HAVE_SSE4_1)
+#define OVERRIDE_CELT_INNER_PROD
+#define celt_inner_prod(x, y, N, arch) \
+    ((void)(arch), celt_inner_prod_sse2(x, y, N))
+
+#elif defined(OPUS_X86_PRESUME_SSE) && !defined(FIXED_POINT)
+#define OVERRIDE_CELT_INNER_PROD
+#define celt_inner_prod(x, y, N, arch) \
+    ((void)(arch), celt_inner_prod_sse(x, y, N))
+
+/* Otherwise call through a table indexed by (arch & OPUS_ARCHMASK). */
+#elif ((defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)) && defined(FIXED_POINT)) || \
+    (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT))
+
+extern opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
+                    const opus_val16 *x,
+                    const opus_val16 *y,
+                    int               N);
+
+#define OVERRIDE_CELT_INNER_PROD
+#define celt_inner_prod(x, y, N, arch) \
+    ((*CELT_INNER_PROD_IMPL[(arch) & OPUS_ARCHMASK])(x, y, N))
+
+#endif
+
+#if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)
 
 #define OVERRIDE_DUAL_INNER_PROD
-static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
-      int N, opus_val32 *xy1, opus_val32 *xy2)
-{
-   int i;
-   __m128 xsum1, xsum2;
-   xsum1 = _mm_setzero_ps();
-   xsum2 = _mm_setzero_ps();
-   for (i=0;i<N-3;i+=4)
-   {
-      __m128 xi = _mm_loadu_ps(x+i);
-      __m128 y1i = _mm_loadu_ps(y01+i);
-      __m128 y2i = _mm_loadu_ps(y02+i);
-      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(xi, y1i));
-      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(xi, y2i));
-   }
-   /* Horizontal sum */
-   xsum1 = _mm_add_ps(xsum1, _mm_movehl_ps(xsum1, xsum1));
-   xsum1 = _mm_add_ss(xsum1, _mm_shuffle_ps(xsum1, xsum1, 0x55));
-   _mm_store_ss(xy1, xsum1);
-   xsum2 = _mm_add_ps(xsum2, _mm_movehl_ps(xsum2, xsum2));
-   xsum2 = _mm_add_ss(xsum2, _mm_shuffle_ps(xsum2, xsum2, 0x55));
-   _mm_store_ss(xy2, xsum2);
-   for (;i<N;i++)
-   {
-      *xy1 = MAC16_16(*xy1, x[i], y01[i]);
-      *xy2 = MAC16_16(*xy2, x[i], y02[i]);
-   }
-}
-
 #define OVERRIDE_COMB_FILTER_CONST
-static OPUS_INLINE void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N,
-      opus_val16 g10, opus_val16 g11, opus_val16 g12)
-{
-   int i;
-   __m128 x0v;
-   __m128 g10v, g11v, g12v;
-   g10v = _mm_load1_ps(&g10);
-   g11v = _mm_load1_ps(&g11);
-   g12v = _mm_load1_ps(&g12);
-   x0v = _mm_loadu_ps(&x[-T-2]);
-   for (i=0;i<N-3;i+=4)
-   {
-      __m128 yi, yi2, x1v, x2v, x3v, x4v;
-      const opus_val32 *xp = &x[i-T-2];
-      yi = _mm_loadu_ps(x+i);
-      x4v = _mm_loadu_ps(xp+4);
-#if 0
-      /* Slower version with all loads */
-      x1v = _mm_loadu_ps(xp+1);
-      x2v = _mm_loadu_ps(xp+2);
-      x3v = _mm_loadu_ps(xp+3);
-#else
-      x2v = _mm_shuffle_ps(x0v, x4v, 0x4e);
-      x1v = _mm_shuffle_ps(x0v, x2v, 0x99);
-      x3v = _mm_shuffle_ps(x2v, x4v, 0x99);
-#endif
 
-      yi = _mm_add_ps(yi, _mm_mul_ps(g10v,x2v));
-#if 0 /* Set to 1 to make it bit-exact with the non-SSE version */
-      yi = _mm_add_ps(yi, _mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)));
-      yi = _mm_add_ps(yi, _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v)));
+#undef dual_inner_prod
+#undef comb_filter_const
+
+void dual_inner_prod_sse(const opus_val16 *x,
+	const opus_val16 *y01,
+	const opus_val16 *y02,
+	int               N,
+	opus_val32       *xy1,
+	opus_val32       *xy2);
+
+void comb_filter_const_sse(opus_val32 *y,
+	opus_val32 *x,
+	int         T,
+	int         N,
+	opus_val16  g10,
+	opus_val16  g11,
+	opus_val16  g12);
+
+
+#if defined(OPUS_X86_PRESUME_SSE)
+# define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) \
+    ((void)(arch),dual_inner_prod_sse(x, y01, y02, N, xy1, xy2))
+
+# define comb_filter_const(y, x, T, N, g10, g11, g12, arch) \
+    ((void)(arch),comb_filter_const_sse(y, x, T, N, g10, g11, g12))
 #else
-      /* Use partial sums */
-      yi2 = _mm_add_ps(_mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)),
-                       _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v)));
-      yi = _mm_add_ps(yi, yi2);
+
+extern void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
+              const opus_val16 *x,
+              const opus_val16 *y01,
+              const opus_val16 *y02,
+              int               N,
+              opus_val32       *xy1,
+              opus_val32       *xy2);
+
+#define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch)			\
+    ((*DUAL_INNER_PROD_IMPL[(arch) & OPUS_ARCHMASK])(x, y01, y02, N, xy1, xy2))
+
+extern void (*const COMB_FILTER_CONST_IMPL[OPUS_ARCHMASK + 1])(
+              opus_val32 *y,
+              opus_val32 *x,
+              int         T,
+              int         N,
+              opus_val16  g10,
+              opus_val16  g11,
+              opus_val16  g12);
+
+#define comb_filter_const(y, x, T, N, g10, g11, g12, arch)				\
+    ((*COMB_FILTER_CONST_IMPL[(arch) & OPUS_ARCHMASK])(y, x, T, N, g10, g11, g12))
+
+#define NON_STATIC_COMB_FILTER_CONST_C
+
 #endif
-      x0v=x4v;
-      _mm_storeu_ps(y+i, yi);
-   }
-#ifdef CUSTOM_MODES
-   for (;i<N;i++)
-   {
-      y[i] = x[i]
-               + MULT16_32_Q15(g10,x[i-T])
-               + MULT16_32_Q15(g11,ADD32(x[i-T+1],x[i-T-1]))
-               + MULT16_32_Q15(g12,ADD32(x[i-T+2],x[i-T-2]));
-   }
 #endif
-}
 
 #endif
diff --git a/celt/x86/pitch_sse2.c b/celt/x86/pitch_sse2.c
new file mode 100644
index 0000000..a0e7d1b
--- /dev/null
+++ b/celt/x86/pitch_sse2.c
@@ -0,0 +1,95 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+   Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <xmmintrin.h>
+#include <emmintrin.h>
+
+#include "macros.h"
+#include "celt_lpc.h"
+#include "stack_alloc.h"
+#include "mathops.h"
+#include "pitch.h"
+
+#if defined(OPUS_X86_MAY_HAVE_SSE2) && defined(FIXED_POINT)
+opus_val32 celt_inner_prod_sse2(const opus_val16 *x, const opus_val16 *y,
+      int N)
+{
+    opus_int  i, dataSize16;
+    opus_int32 sum;
+    /* Computes sum(x[i]*y[i]) over i in [0,N): SSE2 for blocks of 16 and 8, scalar for the rest. */
+    __m128i inVec1_76543210, inVec1_FEDCBA98, acc1;
+    __m128i inVec2_76543210, inVec2_FEDCBA98, acc2;
+
+    sum = 0;
+    dataSize16 = N & ~15; /* N rounded down to a multiple of 16 */
+
+    acc1 = _mm_setzero_si128();
+    acc2 = _mm_setzero_si128();
+    /* 16 samples per pass; acc1 and acc2 each hold four 32-bit partial sums. */
+    for (i=0;i<dataSize16;i+=16)
+    {
+        inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
+        inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));
+
+        inVec1_FEDCBA98 = _mm_loadu_si128((__m128i *)(&x[i + 8]));
+        inVec2_FEDCBA98 = _mm_loadu_si128((__m128i *)(&y[i + 8]));
+        /* madd: multiply the 16-bit pairs and add neighbouring products into 32-bit lanes. */
+        inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);
+        inVec1_FEDCBA98 = _mm_madd_epi16(inVec1_FEDCBA98, inVec2_FEDCBA98);
+
+        acc1 = _mm_add_epi32(acc1, inVec1_76543210);
+        acc2 = _mm_add_epi32(acc2, inVec1_FEDCBA98);
+    }
+
+    acc1 = _mm_add_epi32( acc1, acc2 );
+    /* One more vector of 8 samples if at least 8 remain. */
+    if (N - i >= 8)
+    {
+        inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
+        inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));
+
+        inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);
+
+        acc1 = _mm_add_epi32(acc1, inVec1_76543210);
+        i += 8;
+    }
+    /* Horizontal add: fold the high 64 bits onto the low, then 32-bit lane 1 onto lane 0. */
+    acc1 = _mm_add_epi32(acc1, _mm_unpackhi_epi64( acc1, acc1));
+    acc1 = _mm_add_epi32(acc1, _mm_shufflelo_epi16( acc1, 0x0E));
+    sum += _mm_cvtsi128_si32(acc1);
+    /* Scalar tail (fewer than 8 samples left). NOTE(review): silk_SMLABB presumably does sum + x*y -- confirm. */
+    for (;i<N;i++) {
+        sum = silk_SMLABB(sum, x[i], y[i]);
+    }
+
+    return sum;
+}
+#endif
diff --git a/celt/x86/pitch_sse4_1.c b/celt/x86/pitch_sse4_1.c
new file mode 100644
index 0000000..a092c68
--- /dev/null
+++ b/celt/x86/pitch_sse4_1.c
@@ -0,0 +1,195 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+   Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <xmmintrin.h>
+#include <emmintrin.h>
+
+#include "macros.h"
+#include "celt_lpc.h"
+#include "stack_alloc.h"
+#include "mathops.h"
+#include "pitch.h"
+
+#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)
+#include <smmintrin.h>
+#include "x86cpu.h"
+
+opus_val32 celt_inner_prod_sse4_1(const opus_val16 *x, const opus_val16 *y,
+      int N)
+{
+    opus_int  i, dataSize16;
+    opus_int32 sum;
+    __m128i inVec1_76543210, inVec1_FEDCBA98, acc1;
+    __m128i inVec2_76543210, inVec2_FEDCBA98, acc2;
+    __m128i inVec1_3210, inVec2_3210;
+    /* Computes sum(x[i]*y[i]) over i in [0,N) with vector steps of 16, 8 and 4 samples, then scalar. */
+    sum = 0;
+    dataSize16 = N & ~15; /* N rounded down to a multiple of 16 */
+
+    acc1 = _mm_setzero_si128();
+    acc2 = _mm_setzero_si128();
+    /* 16 samples per pass; acc1 and acc2 each hold four 32-bit partial sums. */
+    for (i=0;i<dataSize16;i+=16) {
+        inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
+        inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));
+
+        inVec1_FEDCBA98 = _mm_loadu_si128((__m128i *)(&x[i + 8]));
+        inVec2_FEDCBA98 = _mm_loadu_si128((__m128i *)(&y[i + 8]));
+        /* madd: multiply the 16-bit pairs and add neighbouring products into 32-bit lanes. */
+        inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);
+        inVec1_FEDCBA98 = _mm_madd_epi16(inVec1_FEDCBA98, inVec2_FEDCBA98);
+
+        acc1 = _mm_add_epi32(acc1, inVec1_76543210);
+        acc2 = _mm_add_epi32(acc2, inVec1_FEDCBA98);
+    }
+
+    acc1 = _mm_add_epi32(acc1, acc2);
+    /* One more vector of 8 samples if at least 8 remain. */
+    if (N - i >= 8)
+    {
+        inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
+        inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));
+
+        inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);
+
+        acc1 = _mm_add_epi32(acc1, inVec1_76543210);
+        i += 8;
+    }
+    /* Then 4 samples: sign-extend each to 32 bits and multiply lane by lane. */
+    if (N - i >= 4)
+    {
+        inVec1_3210 = OP_CVTEPI16_EPI32_M64(&x[i + 0]);
+        inVec2_3210 = OP_CVTEPI16_EPI32_M64(&y[i + 0]);
+
+        inVec1_3210 = _mm_mullo_epi32(inVec1_3210, inVec2_3210);
+
+        acc1 = _mm_add_epi32(acc1, inVec1_3210);
+        i += 4;
+    }
+    /* Horizontal add: fold the high 64 bits onto the low, then 32-bit lane 1 onto lane 0. */
+    acc1 = _mm_add_epi32(acc1, _mm_unpackhi_epi64(acc1, acc1));
+    acc1 = _mm_add_epi32(acc1, _mm_shufflelo_epi16(acc1, 0x0E));
+
+    sum += _mm_cvtsi128_si32(acc1);
+    /* Scalar tail (fewer than 4 samples left). NOTE(review): silk_SMLABB presumably does sum + x*y -- confirm. */
+    for (;i<N;i++)
+    {
+        sum = silk_SMLABB(sum, x[i], y[i]);
+    }
+
+    return sum;
+}
+
+void xcorr_kernel_sse4_1(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[ 4 ], int len)
+{
+    int j;
+    /* Adds to sum[k], k=0..3, the correlation of x[0..len-1] with y[k..k+len-1]; y is read up to y[len+2]. */
+    __m128i vecX, vecX0, vecX1, vecX2, vecX3;
+    __m128i vecY0, vecY1, vecY2, vecY3;
+    __m128i sum0, sum1, sum2, sum3, vecSum;
+    __m128i initSum;
+
+    celt_assert(len >= 3);
+
+    sum0 = _mm_setzero_si128();
+    sum1 = _mm_setzero_si128();
+    sum2 = _mm_setzero_si128();
+    sum3 = _mm_setzero_si128();
+    /* 8 samples per pass: each sumK accumulates madd(x[j..j+7], y[j+K..j+K+7]). */
+    for (j=0;j<(len-7);j+=8)
+    {
+        vecX = _mm_loadu_si128((__m128i *)(&x[j + 0]));
+        vecY0 = _mm_loadu_si128((__m128i *)(&y[j + 0]));
+        vecY1 = _mm_loadu_si128((__m128i *)(&y[j + 1]));
+        vecY2 = _mm_loadu_si128((__m128i *)(&y[j + 2]));
+        vecY3 = _mm_loadu_si128((__m128i *)(&y[j + 3]));
+
+        sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(vecX, vecY0));
+        sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(vecX, vecY1));
+        sum2 = _mm_add_epi32(sum2, _mm_madd_epi16(vecX, vecY2));
+        sum3 = _mm_add_epi32(sum3, _mm_madd_epi16(vecX, vecY3));
+    }
+    /* Reduce each accumulator so that its 32-bit lane 0 holds the total. */
+    sum0 = _mm_add_epi32(sum0, _mm_unpackhi_epi64( sum0, sum0));
+    sum0 = _mm_add_epi32(sum0, _mm_shufflelo_epi16( sum0, 0x0E));
+
+    sum1 = _mm_add_epi32(sum1, _mm_unpackhi_epi64( sum1, sum1));
+    sum1 = _mm_add_epi32(sum1, _mm_shufflelo_epi16( sum1, 0x0E));
+
+    sum2 = _mm_add_epi32(sum2, _mm_unpackhi_epi64( sum2, sum2));
+    sum2 = _mm_add_epi32(sum2, _mm_shufflelo_epi16( sum2, 0x0E));
+
+    sum3 = _mm_add_epi32(sum3, _mm_unpackhi_epi64( sum3, sum3));
+    sum3 = _mm_add_epi32(sum3, _mm_shufflelo_epi16( sum3, 0x0E));
+    /* Gather the four totals into one vector: lanes {sum0, sum1, sum2, sum3}. */
+    vecSum = _mm_unpacklo_epi64(_mm_unpacklo_epi32(sum0, sum1),
+          _mm_unpacklo_epi32(sum2, sum3));
+    /* 4 samples per pass: broadcast each x[j+m] and multiply it by y[j+m..j+m+3]. */
+    for (;j<(len-3);j+=4)
+    {
+        vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]);
+        vecX0 = _mm_shuffle_epi32(vecX, 0x00);
+        vecX1 = _mm_shuffle_epi32(vecX, 0x55);
+        vecX2 = _mm_shuffle_epi32(vecX, 0xaa);
+        vecX3 = _mm_shuffle_epi32(vecX, 0xff);
+
+        vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
+        vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]);
+        vecY2 = OP_CVTEPI16_EPI32_M64(&y[j + 2]);
+        vecY3 = OP_CVTEPI16_EPI32_M64(&y[j + 3]);
+
+        sum0 = _mm_mullo_epi32(vecX0, vecY0);
+        sum1 = _mm_mullo_epi32(vecX1, vecY1);
+        sum2 = _mm_mullo_epi32(vecX2, vecY2);
+        sum3 = _mm_mullo_epi32(vecX3, vecY3);
+
+        sum0 = _mm_add_epi32(sum0, sum1);
+        sum2 = _mm_add_epi32(sum2, sum3);
+        vecSum = _mm_add_epi32(vecSum, sum0);
+        vecSum = _mm_add_epi32(vecSum, sum2);
+    }
+    /* Remaining samples (at most 3), one at a time. */
+    for (;j<len;j++)
+    {
+        /* Broadcast x[j] alone: a 64-bit load here would read x[j+1..j+3], past the end of x. */
+        vecX0 = _mm_set1_epi32(x[j]);
+
+        vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
+
+        sum0 = _mm_mullo_epi32(vecX0, vecY0);
+        vecSum = _mm_add_epi32(vecSum, sum0);
+    }
+    /* Accumulate into the caller's running sums. */
+    initSum = _mm_loadu_si128((__m128i *)(&sum[0]));
+    initSum = _mm_add_epi32(initSum, vecSum);
+    _mm_storeu_si128((__m128i *)sum, initSum);
+}
+#endif
diff --git a/celt/x86/x86_celt_map.c b/celt/x86/x86_celt_map.c
new file mode 100644
index 0000000..8e5e449
--- /dev/null
+++ b/celt/x86/x86_celt_map.c
@@ -0,0 +1,155 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+   Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#if defined(HAVE_CONFIG_H)
+#include "config.h"
+#endif
+
+#include "x86/x86cpu.h"
+#include "celt_lpc.h"
+#include "pitch.h"
+#include "pitch_sse.h"
+
+#if defined(OPUS_HAVE_RTCD)
+
+# if defined(FIXED_POINT)
+
+#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)
+/* Fixed-point tables, one slot per level returned by opus_select_arch(): 0 none, 1 SSE, 2 SSE2, 3 SSE4.1, 4 AVX. */
+void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])(
+         const opus_val16 *x,
+         const opus_val16 *num,
+         opus_val16       *y,
+         int              N,
+         int              ord,
+         opus_val16       *mem,
+         int              arch
+) = {
+  celt_fir_c,                /* non-sse */
+  celt_fir_c,                /* sse: no SSE version, falls back to C */
+  celt_fir_c,                /* sse2: no SSE2 version, falls back to C */
+  MAY_HAVE_SSE4_1(celt_fir), /* sse4.1  */
+  MAY_HAVE_SSE4_1(celt_fir)  /* avx  */
+};
+
+void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
+         const opus_val16 *x,
+         const opus_val16 *y,
+         opus_val32       sum[4],
+         int              len
+) = {
+  xcorr_kernel_c,                /* non-sse */
+  xcorr_kernel_c,                /* sse: falls back to C */
+  xcorr_kernel_c,                /* sse2: falls back to C */
+  MAY_HAVE_SSE4_1(xcorr_kernel), /* sse4.1  */
+  MAY_HAVE_SSE4_1(xcorr_kernel)  /* avx  */
+};
+
+#endif
+
+#if (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) ||  \
+	(!defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2))
+/* Built when celt_inner_prod is chosen at run time: SSE4.1 not presumed, or no SSE4.1 at all and SSE2 not presumed. */
+opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
+         const opus_val16 *x,
+         const opus_val16 *y,
+         int              N
+) = {
+  celt_inner_prod_c,                /* non-sse */
+  celt_inner_prod_c,                /* sse: falls back to C */
+  MAY_HAVE_SSE2(celt_inner_prod),   /* sse2 */
+  MAY_HAVE_SSE4_1(celt_inner_prod), /* sse4.1  */
+  MAY_HAVE_SSE4_1(celt_inner_prod)  /* avx  */
+};
+
+#endif
+
+# else
+
+#if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)
+/* Non-FIXED_POINT tables: slot 0 is the C code, every level from SSE upward (slots 1..4) uses the SSE version. */
+void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
+         const opus_val16 *x,
+         const opus_val16 *y,
+         opus_val32       sum[4],
+         int              len
+) = {
+  xcorr_kernel_c,                /* non-sse */
+  MAY_HAVE_SSE(xcorr_kernel),
+  MAY_HAVE_SSE(xcorr_kernel),
+  MAY_HAVE_SSE(xcorr_kernel),
+  MAY_HAVE_SSE(xcorr_kernel)
+};
+
+opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
+         const opus_val16 *x,
+         const opus_val16 *y,
+         int              N
+) = {
+  celt_inner_prod_c,                /* non-sse */
+  MAY_HAVE_SSE(celt_inner_prod),
+  MAY_HAVE_SSE(celt_inner_prod),
+  MAY_HAVE_SSE(celt_inner_prod),
+  MAY_HAVE_SSE(celt_inner_prod)
+};
+
+void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
+                    const opus_val16 *x,
+                    const opus_val16 *y01,
+                    const opus_val16 *y02,
+                    int               N,
+                    opus_val32       *xy1,
+                    opus_val32       *xy2
+) = {
+  dual_inner_prod_c,                /* non-sse */
+  MAY_HAVE_SSE(dual_inner_prod),
+  MAY_HAVE_SSE(dual_inner_prod),
+  MAY_HAVE_SSE(dual_inner_prod),
+  MAY_HAVE_SSE(dual_inner_prod)
+};
+
+void (*const COMB_FILTER_CONST_IMPL[OPUS_ARCHMASK + 1])(
+              opus_val32 *y,
+              opus_val32 *x,
+              int         T,
+              int         N,
+              opus_val16  g10,
+              opus_val16  g11,
+              opus_val16  g12
+) = {
+  comb_filter_const_c,                /* non-sse */
+  MAY_HAVE_SSE(comb_filter_const),
+  MAY_HAVE_SSE(comb_filter_const),
+  MAY_HAVE_SSE(comb_filter_const),
+  MAY_HAVE_SSE(comb_filter_const)
+};
+
+
+#endif
+
+#endif
+#endif
diff --git a/celt/x86/x86cpu.c b/celt/x86/x86cpu.c
new file mode 100644
index 0000000..555a576
--- /dev/null
+++ b/celt/x86/x86cpu.c
@@ -0,0 +1,157 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+   Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "cpu_support.h"
+#include "macros.h"
+#include "main.h"
+#include "pitch.h"
+#include "x86cpu.h"
+
+#if (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \
+  (defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)) || \
+  (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) || \
+  (defined(OPUS_X86_MAY_HAVE_AVX) && !defined(OPUS_X86_PRESUME_AVX))
+
+
+#if defined(_MSC_VER)
+/* MSVC: wrap the __cpuid intrinsic; CPUInfo receives the four result values for leaf InfoType. */
+#include <intrin.h>
+static _inline void cpuid(unsigned int CPUInfo[4], unsigned int InfoType)
+{
+	__cpuid((int*)CPUInfo, InfoType);
+}
+
+#else
+/* Other compilers: inline asm (CPU_INFO_BY_ASM) or <cpuid.h> (CPU_INFO_BY_C); with neither, CPUInfo is left untouched. */
+#if defined(CPU_INFO_BY_C)
+#include <cpuid.h>
+#endif
+
+static void cpuid(unsigned int CPUInfo[4], unsigned int InfoType)
+{
+#if defined(CPU_INFO_BY_ASM)
+#if defined(__i386__) && defined(__PIC__)
+/* %ebx is PIC register in 32-bit, so mustn't clobber it. */
+    __asm__ __volatile__ (
+        "xchg %%ebx, %1\n"
+        "cpuid\n"
+        "xchg %%ebx, %1\n":
+        "=a" (CPUInfo[0]),
+        "=r" (CPUInfo[1]), /* receives ebx via the second xchg, which also restores the saved ebx */
+        "=c" (CPUInfo[2]),
+        "=d" (CPUInfo[3]) :
+        "0" (InfoType) /* leaf number goes in through eax (operand 0) */
+    );
+#else
+    __asm__ __volatile__ (
+        "cpuid":
+        "=a" (CPUInfo[0]),
+        "=b" (CPUInfo[1]),
+        "=c" (CPUInfo[2]),
+        "=d" (CPUInfo[3]) :
+        "0" (InfoType) /* leaf number goes in through eax (operand 0) */
+    );
+#endif
+#elif defined(CPU_INFO_BY_C)
+    __get_cpuid(InfoType, &(CPUInfo[0]), &(CPUInfo[1]), &(CPUInfo[2]), &(CPUInfo[3])); /* return value is not checked */
+#endif
+}
+
+#endif
+
+typedef struct CPU_Feature{ /* one 0/1 flag per instruction set, filled in by opus_cpu_feature_check() */
+    /*  SIMD: 128-bit */
+    int HW_SSE;   /* CPUID leaf 1, EDX bit 25 */
+    int HW_SSE2;  /* CPUID leaf 1, EDX bit 26 */
+    int HW_SSE41; /* CPUID leaf 1, ECX bit 19 */
+    /*  SIMD: 256-bit */
+    int HW_AVX;   /* CPUID leaf 1, ECX bit 28 */
+} CPU_Feature;
+
+static void opus_cpu_feature_check(CPU_Feature *cpu_feature)
+{
+    /* Register order matches cpuid(): eax, ebx, ecx, edx. */
+    unsigned int regs[4] = {0};
+    unsigned int edx = 0;
+    unsigned int ecx = 0;
+
+    /* Leaf 0: eax reports the highest basic leaf this CPU implements. */
+    cpuid(regs, 0);
+
+    /* Only consult leaf 1 when it exists; otherwise every flag stays 0. */
+    if (regs[0] >= 1)
+    {
+        cpuid(regs, 1);
+        edx = regs[3];
+        ecx = regs[2];
+    }
+    cpu_feature->HW_SSE = (edx & (1 << 25)) != 0;
+    cpu_feature->HW_SSE2 = (edx & (1 << 26)) != 0;
+    cpu_feature->HW_SSE41 = (ecx & (1 << 19)) != 0;
+    cpu_feature->HW_AVX = (ecx & (1 << 28)) != 0;
+}
+
+int opus_select_arch(void)
+{
+    /* Returns how many of SSE, SSE2, SSE4.1 and AVX the CPU reports,
+       counted in that order and stopping at the first one that is
+       missing:
+         0 = no SSE, 1 = SSE, 2 = SSE2, 3 = SSE4.1, 4 = AVX.
+       A later instruction set is therefore never selected unless all
+       of the earlier ones are present as well.
+    */
+    CPU_Feature caps;
+    int present[4];
+    int level;
+
+    opus_cpu_feature_check(&caps);
+
+    /* Lay the flags out in ladder order so that one loop can walk them. */
+    present[0] = caps.HW_SSE;
+    present[1] = caps.HW_SSE2;
+    present[2] = caps.HW_SSE41;
+    present[3] = caps.HW_AVX;
+
+    /* Climb until a rung is missing or the top of the ladder is reached. */
+    level = 0;
+    while (level < 4)
+    {
+        if (!present[level])
+        {
+            break;
+        }
+        level++;
+    }
+
+    return level;
+}
+
+#endif
diff --git a/celt/x86/x86cpu.h b/celt/x86/x86cpu.h
new file mode 100644
index 0000000..04fd48a
--- /dev/null
+++ b/celt/x86/x86cpu.h
@@ -0,0 +1,93 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+   Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#if !defined(X86CPU_H)
+# define X86CPU_H
+
+# if defined(OPUS_X86_MAY_HAVE_SSE) /* MAY_HAVE_<isa>(name): name_<isa> when OPUS_X86_MAY_HAVE_<isa> is defined, else name_c */
+#  define MAY_HAVE_SSE(name) name ## _sse
+# else
+#  define MAY_HAVE_SSE(name) name ## _c
+# endif
+
+# if defined(OPUS_X86_MAY_HAVE_SSE2)
+#  define MAY_HAVE_SSE2(name) name ## _sse2
+# else
+#  define MAY_HAVE_SSE2(name) name ## _c
+# endif
+
+# if defined(OPUS_X86_MAY_HAVE_SSE4_1)
+#  define MAY_HAVE_SSE4_1(name) name ## _sse4_1
+# else
+#  define MAY_HAVE_SSE4_1(name) name ## _c
+# endif
+
+# if defined(OPUS_X86_MAY_HAVE_AVX)
+#  define MAY_HAVE_AVX(name) name ## _avx
+# else
+#  define MAY_HAVE_AVX(name) name ## _c
+# endif
+/* Run-time CPU detection, declared only for RTCD builds; the definition is in x86cpu.c. */
+# if defined(OPUS_HAVE_RTCD)
+int opus_select_arch(void);
+# endif
+
+/*gcc appears to emit MOVDQA's to load the argument of an _mm_cvtepi8_epi32()
+  or _mm_cvtepi16_epi32() when optimizations are disabled, even though the
+  actual PMOVSXWD instruction takes an m32 or m64. Unlike a normal memory
+  reference, these require 16-byte alignment and load a full 16 bytes (instead
+  of 4 or 8), possibly reading out of bounds.
+
+  We can insert an explicit MOVD or MOVQ using _mm_cvtsi32_si128() or
+  _mm_loadl_epi64(), which should have the same semantics as an m32 or m64
+  reference in the PMOVSXWD instruction itself, but gcc is not smart enough to
+  optimize this out when optimizations ARE enabled.
+
+  Clang, in contrast, requires us to do this always for _mm_cvtepi8_epi32
+  (which is fair, since technically the compiler is always allowed to do the
+  dereference before invoking the function implementing the intrinsic).
+  However, it is smart enough to eliminate the extra MOVD instruction.
+  The same reasoning applies to _mm_cvtepi16_epi32, so clang also gets the
+  explicit MOVQ there, even though it does *not* optimize the extra MOVQ out. */
+
+# if defined(__clang__) || !defined(__OPTIMIZE__)
+#  define OP_CVTEPI8_EPI32_M32(x) \
+ (_mm_cvtepi8_epi32(_mm_cvtsi32_si128(*(int *)(x))))
+# else
+#  define OP_CVTEPI8_EPI32_M32(x) \
+ (_mm_cvtepi8_epi32(*(__m128i *)(x)))
+# endif
+
+# if defined(__clang__) || !defined(__OPTIMIZE__)
+#  define OP_CVTEPI16_EPI32_M64(x) \
+ (_mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i *)(x))))
+# else
+#  define OP_CVTEPI16_EPI32_M64(x) \
+ (_mm_cvtepi16_epi32(*(__m128i *)(x)))
+# endif
+
+#endif
diff --git a/celt_headers.mk b/celt_headers.mk
index 8811e16..0eca6e6 100644
--- a/celt_headers.mk
+++ b/celt_headers.mk
@@ -24,16 +24,28 @@
 celt/os_support.h \
 celt/pitch.h \
 celt/celt_lpc.h \
+celt/x86/celt_lpc_sse.h \
 celt/quant_bands.h \
 celt/rate.h \
 celt/stack_alloc.h \
 celt/vq.h \
 celt/static_modes_float.h \
 celt/static_modes_fixed.h \
+celt/static_modes_float_arm_ne10.h \
+celt/static_modes_fixed_arm_ne10.h \
 celt/arm/armcpu.h \
 celt/arm/fixed_armv4.h \
 celt/arm/fixed_armv5e.h \
 celt/arm/kiss_fft_armv4.h \
 celt/arm/kiss_fft_armv5e.h \
 celt/arm/pitch_arm.h \
-celt/x86/pitch_sse.h
+celt/arm/fft_arm.h \
+celt/arm/mdct_arm.h \
+celt/mips/celt_mipsr1.h \
+celt/mips/fixed_generic_mipsr1.h \
+celt/mips/kiss_fft_mipsr1.h \
+celt/mips/mdct_mipsr1.h \
+celt/mips/pitch_mipsr1.h \
+celt/mips/vq_mipsr1.h \
+celt/x86/pitch_sse.h \
+celt/x86/x86cpu.h
diff --git a/celt_sources.mk b/celt_sources.mk
index 2bbe770..2ffe99a 100644
--- a/celt_sources.mk
+++ b/celt_sources.mk
@@ -17,6 +17,15 @@
 celt/rate.c \
 celt/vq.c
 
+CELT_SOURCES_SSE = celt/x86/x86cpu.c \
+celt/x86/x86_celt_map.c \
+celt/x86/pitch_sse.c
+# SSE2 sources: pitch_sse2.c provides celt_inner_prod_sse2 (FIXED_POINT builds only).
+CELT_SOURCES_SSE2 = celt/x86/pitch_sse2.c
+# SSE4.1 sources: pitch_sse4_1.c provides celt_inner_prod_sse4_1 and xcorr_kernel_sse4_1.
+CELT_SOURCES_SSE4_1 = celt/x86/celt_lpc_sse.c \
+celt/x86/pitch_sse4_1.c
+
 CELT_SOURCES_ARM = \
 celt/arm/armcpu.c \
 celt/arm/arm_celt_map.c
@@ -26,3 +35,10 @@
 
 CELT_AM_SOURCES_ARM_ASM = \
 celt/arm/armopts.s.in
+
+CELT_SOURCES_ARM_NEON_INTR = \
+celt/arm/celt_neon_intr.c
+# NOTE(review): presumably the FFT/MDCT wrappers around the NE10 library, judging by the file names -- confirm.
+CELT_SOURCES_ARM_NE10= \
+celt/arm/celt_ne10_fft.c \
+celt/arm/celt_ne10_mdct.c
diff --git a/compile b/compile
index 862a14e..a85b723 100755
--- a/compile
+++ b/compile
@@ -1,10 +1,9 @@
 #! /bin/sh
 # Wrapper for compilers which do not understand '-c -o'.
 
-scriptversion=2012-03-05.13; # UTC
+scriptversion=2012-10-14.11; # UTC
 
-# Copyright (C) 1999, 2000, 2003, 2004, 2005, 2009, 2010, 2012 Free
-# Software Foundation, Inc.
+# Copyright (C) 1999-2014 Free Software Foundation, Inc.
 # Written by Tom Tromey <tromey@cygnus.com>.
 #
 # This program is free software; you can redistribute it and/or modify
@@ -113,6 +112,11 @@
       lib=$dir/$lib.lib
       break
     fi
+    if test -f "$dir/lib$lib.a"; then
+      found=yes
+      lib=$dir/lib$lib.a
+      break
+    fi
   done
   IFS=$save_IFS
 
diff --git a/config.guess b/config.guess
index d622a44..dbfb978 100755
--- a/config.guess
+++ b/config.guess
@@ -1,14 +1,12 @@
 #! /bin/sh
 # Attempt to guess a canonical system name.
-#   Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
-#   2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
-#   2011, 2012 Free Software Foundation, Inc.
+#   Copyright 1992-2015 Free Software Foundation, Inc.
 
-timestamp='2012-02-10'
+timestamp='2015-01-01'
 
 # This file is free software; you can redistribute it and/or modify it
 # under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
+# the Free Software Foundation; either version 3 of the License, or
 # (at your option) any later version.
 #
 # This program is distributed in the hope that it will be useful, but
@@ -22,19 +20,17 @@
 # As a special exception to the GNU General Public License, if you
 # distribute this file as part of a program that contains a
 # configuration script generated by Autoconf, you may include it under
-# the same distribution terms that you use for the rest of that program.
-
-
-# Originally written by Per Bothner.  Please send patches (context
-# diff format) to <config-patches@gnu.org> and include a ChangeLog
-# entry.
+# the same distribution terms that you use for the rest of that
+# program.  This Exception is an additional permission under section 7
+# of the GNU General Public License, version 3 ("GPLv3").
 #
-# This script attempts to guess a canonical system name similar to
-# config.sub.  If it succeeds, it prints the system name on stdout, and
-# exits with 0.  Otherwise, it exits with 1.
+# Originally written by Per Bothner; maintained since 2000 by Ben Elliston.
 #
 # You can get the latest version of this script from:
 # http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess;hb=HEAD
+#
+# Please send patches to <config-patches@gnu.org>.
+
 
 me=`echo "$0" | sed -e 's,.*/,,'`
 
@@ -54,9 +50,7 @@
 GNU config.guess ($timestamp)
 
 Originally written by Per Bothner.
-Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
-2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012
-Free Software Foundation, Inc.
+Copyright 1992-2015 Free Software Foundation, Inc.
 
 This is free software; see the source for copying conditions.  There is NO
 warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."
@@ -138,6 +132,27 @@
 UNAME_SYSTEM=`(uname -s) 2>/dev/null`  || UNAME_SYSTEM=unknown
 UNAME_VERSION=`(uname -v) 2>/dev/null` || UNAME_VERSION=unknown
 
+case "${UNAME_SYSTEM}" in
+Linux|GNU|GNU/*)
+	# If the system lacks a compiler, then just pick glibc.
+	# We could probably try harder.
+	LIBC=gnu
+
+	eval $set_cc_for_build
+	cat <<-EOF > $dummy.c
+	#include <features.h>
+	#if defined(__UCLIBC__)
+	LIBC=uclibc
+	#elif defined(__dietlibc__)
+	LIBC=dietlibc
+	#else
+	LIBC=gnu
+	#endif
+	EOF
+	eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^LIBC' | sed 's, ,,g'`
+	;;
+esac
+
 # Note: order is significant - the case branches are not exclusive.
 
 case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
@@ -200,6 +215,10 @@
 	# CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM is used.
 	echo "${machine}-${os}${release}"
 	exit ;;
+    *:Bitrig:*:*)
+	UNAME_MACHINE_ARCH=`arch | sed 's/Bitrig.//'`
+	echo ${UNAME_MACHINE_ARCH}-unknown-bitrig${UNAME_RELEASE}
+	exit ;;
     *:OpenBSD:*:*)
 	UNAME_MACHINE_ARCH=`arch | sed 's/OpenBSD.//'`
 	echo ${UNAME_MACHINE_ARCH}-unknown-openbsd${UNAME_RELEASE}
@@ -302,7 +321,7 @@
     arm:RISC*:1.[012]*:*|arm:riscix:1.[012]*:*)
 	echo arm-acorn-riscix${UNAME_RELEASE}
 	exit ;;
-    arm:riscos:*:*|arm:RISCOS:*:*)
+    arm*:riscos:*:*|arm*:RISCOS:*:*)
 	echo arm-unknown-riscos
 	exit ;;
     SR2?01:HI-UX/MPP:*:* | SR8000:HI-UX/MPP:*:*)
@@ -560,8 +579,9 @@
 	else
 		IBM_ARCH=powerpc
 	fi
-	if [ -x /usr/bin/oslevel ] ; then
-		IBM_REV=`/usr/bin/oslevel`
+	if [ -x /usr/bin/lslpp ] ; then
+		IBM_REV=`/usr/bin/lslpp -Lqc bos.rte.libc |
+			   awk -F: '{ print $3 }' | sed s/[0-9]*$/0/`
 	else
 		IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE}
 	fi
@@ -801,10 +821,13 @@
     i*:CYGWIN*:*)
 	echo ${UNAME_MACHINE}-pc-cygwin
 	exit ;;
+    *:MINGW64*:*)
+	echo ${UNAME_MACHINE}-pc-mingw64
+	exit ;;
     *:MINGW*:*)
 	echo ${UNAME_MACHINE}-pc-mingw32
 	exit ;;
-    i*:MSYS*:*)
+    *:MSYS*:*)
 	echo ${UNAME_MACHINE}-pc-msys
 	exit ;;
     i*:windows32*:*)
@@ -852,21 +875,21 @@
 	exit ;;
     *:GNU:*:*)
 	# the GNU system
-	echo `echo ${UNAME_MACHINE}|sed -e 's,[-/].*$,,'`-unknown-gnu`echo ${UNAME_RELEASE}|sed -e 's,/.*$,,'`
+	echo `echo ${UNAME_MACHINE}|sed -e 's,[-/].*$,,'`-unknown-${LIBC}`echo ${UNAME_RELEASE}|sed -e 's,/.*$,,'`
 	exit ;;
     *:GNU/*:*:*)
 	# other systems with GNU libc and userland
-	echo ${UNAME_MACHINE}-unknown-`echo ${UNAME_SYSTEM} | sed 's,^[^/]*/,,' | tr '[A-Z]' '[a-z]'``echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`-gnu
+	echo ${UNAME_MACHINE}-unknown-`echo ${UNAME_SYSTEM} | sed 's,^[^/]*/,,' | tr '[A-Z]' '[a-z]'``echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`-${LIBC}
 	exit ;;
     i*86:Minix:*:*)
 	echo ${UNAME_MACHINE}-pc-minix
 	exit ;;
     aarch64:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
 	exit ;;
     aarch64_be:Linux:*:*)
 	UNAME_MACHINE=aarch64_be
-	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
 	exit ;;
     alpha:Linux:*:*)
 	case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' < /proc/cpuinfo` in
@@ -879,59 +902,54 @@
 	  EV68*) UNAME_MACHINE=alphaev68 ;;
 	esac
 	objdump --private-headers /bin/sh | grep -q ld.so.1
-	if test "$?" = 0 ; then LIBC="libc1" ; else LIBC="" ; fi
-	echo ${UNAME_MACHINE}-unknown-linux-gnu${LIBC}
+	if test "$?" = 0 ; then LIBC="gnulibc1" ; fi
+	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+	exit ;;
+    arc:Linux:*:* | arceb:Linux:*:*)
+	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
 	exit ;;
     arm*:Linux:*:*)
 	eval $set_cc_for_build
 	if echo __ARM_EABI__ | $CC_FOR_BUILD -E - 2>/dev/null \
 	    | grep -q __ARM_EABI__
 	then
-	    echo ${UNAME_MACHINE}-unknown-linux-gnu
+	    echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
 	else
 	    if echo __ARM_PCS_VFP | $CC_FOR_BUILD -E - 2>/dev/null \
 		| grep -q __ARM_PCS_VFP
 	    then
-		echo ${UNAME_MACHINE}-unknown-linux-gnueabi
+		echo ${UNAME_MACHINE}-unknown-linux-${LIBC}eabi
 	    else
-		echo ${UNAME_MACHINE}-unknown-linux-gnueabihf
+		echo ${UNAME_MACHINE}-unknown-linux-${LIBC}eabihf
 	    fi
 	fi
 	exit ;;
     avr32*:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
 	exit ;;
     cris:Linux:*:*)
-	echo ${UNAME_MACHINE}-axis-linux-gnu
+	echo ${UNAME_MACHINE}-axis-linux-${LIBC}
 	exit ;;
     crisv32:Linux:*:*)
-	echo ${UNAME_MACHINE}-axis-linux-gnu
+	echo ${UNAME_MACHINE}-axis-linux-${LIBC}
 	exit ;;
     frv:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
 	exit ;;
     hexagon:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
 	exit ;;
     i*86:Linux:*:*)
-	LIBC=gnu
-	eval $set_cc_for_build
-	sed 's/^	//' << EOF >$dummy.c
-	#ifdef __dietlibc__
-	LIBC=dietlibc
-	#endif
-EOF
-	eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^LIBC'`
-	echo "${UNAME_MACHINE}-pc-linux-${LIBC}"
+	echo ${UNAME_MACHINE}-pc-linux-${LIBC}
 	exit ;;
     ia64:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
 	exit ;;
     m32r*:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
 	exit ;;
     m68*:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
 	exit ;;
     mips:Linux:*:* | mips64:Linux:*:*)
 	eval $set_cc_for_build
@@ -950,54 +968,63 @@
 	#endif
 EOF
 	eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^CPU'`
-	test x"${CPU}" != x && { echo "${CPU}-unknown-linux-gnu"; exit; }
+	test x"${CPU}" != x && { echo "${CPU}-unknown-linux-${LIBC}"; exit; }
 	;;
-    or32:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-gnu
+    openrisc*:Linux:*:*)
+	echo or1k-unknown-linux-${LIBC}
+	exit ;;
+    or32:Linux:*:* | or1k*:Linux:*:*)
+	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
 	exit ;;
     padre:Linux:*:*)
-	echo sparc-unknown-linux-gnu
+	echo sparc-unknown-linux-${LIBC}
 	exit ;;
     parisc64:Linux:*:* | hppa64:Linux:*:*)
-	echo hppa64-unknown-linux-gnu
+	echo hppa64-unknown-linux-${LIBC}
 	exit ;;
     parisc:Linux:*:* | hppa:Linux:*:*)
 	# Look for CPU level
 	case `grep '^cpu[^a-z]*:' /proc/cpuinfo 2>/dev/null | cut -d' ' -f2` in
-	  PA7*) echo hppa1.1-unknown-linux-gnu ;;
-	  PA8*) echo hppa2.0-unknown-linux-gnu ;;
-	  *)    echo hppa-unknown-linux-gnu ;;
+	  PA7*) echo hppa1.1-unknown-linux-${LIBC} ;;
+	  PA8*) echo hppa2.0-unknown-linux-${LIBC} ;;
+	  *)    echo hppa-unknown-linux-${LIBC} ;;
 	esac
 	exit ;;
     ppc64:Linux:*:*)
-	echo powerpc64-unknown-linux-gnu
+	echo powerpc64-unknown-linux-${LIBC}
 	exit ;;
     ppc:Linux:*:*)
-	echo powerpc-unknown-linux-gnu
+	echo powerpc-unknown-linux-${LIBC}
+	exit ;;
+    ppc64le:Linux:*:*)
+	echo powerpc64le-unknown-linux-${LIBC}
+	exit ;;
+    ppcle:Linux:*:*)
+	echo powerpcle-unknown-linux-${LIBC}
 	exit ;;
     s390:Linux:*:* | s390x:Linux:*:*)
-	echo ${UNAME_MACHINE}-ibm-linux
+	echo ${UNAME_MACHINE}-ibm-linux-${LIBC}
 	exit ;;
     sh64*:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
 	exit ;;
     sh*:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
 	exit ;;
     sparc:Linux:*:* | sparc64:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
 	exit ;;
     tile*:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
 	exit ;;
     vax:Linux:*:*)
-	echo ${UNAME_MACHINE}-dec-linux-gnu
+	echo ${UNAME_MACHINE}-dec-linux-${LIBC}
 	exit ;;
     x86_64:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
 	exit ;;
     xtensa*:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
 	exit ;;
     i*86:DYNIX/ptx:4*:*)
 	# ptx 4.0 does uname -s correctly, with DYNIX/ptx in there.
@@ -1201,6 +1228,9 @@
     BePC:Haiku:*:*)	# Haiku running on Intel PC compatible.
 	echo i586-pc-haiku
 	exit ;;
+    x86_64:Haiku:*:*)
+	echo x86_64-unknown-haiku
+	exit ;;
     SX-4:SUPER-UX:*:*)
 	echo sx4-nec-superux${UNAME_RELEASE}
 	exit ;;
@@ -1227,19 +1257,31 @@
 	exit ;;
     *:Darwin:*:*)
 	UNAME_PROCESSOR=`uname -p` || UNAME_PROCESSOR=unknown
-	case $UNAME_PROCESSOR in
-	    i386)
-		eval $set_cc_for_build
-		if [ "$CC_FOR_BUILD" != 'no_compiler_found' ]; then
-		  if (echo '#ifdef __LP64__'; echo IS_64BIT_ARCH; echo '#endif') | \
-		      (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | \
-		      grep IS_64BIT_ARCH >/dev/null
-		  then
-		      UNAME_PROCESSOR="x86_64"
-		  fi
-		fi ;;
-	    unknown) UNAME_PROCESSOR=powerpc ;;
-	esac
+	eval $set_cc_for_build
+	if test "$UNAME_PROCESSOR" = unknown ; then
+	    UNAME_PROCESSOR=powerpc
+	fi
+	if test `echo "$UNAME_RELEASE" | sed -e 's/\..*//'` -le 10 ; then
+	    if [ "$CC_FOR_BUILD" != 'no_compiler_found' ]; then
+		if (echo '#ifdef __LP64__'; echo IS_64BIT_ARCH; echo '#endif') | \
+		    (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | \
+		    grep IS_64BIT_ARCH >/dev/null
+		then
+		    case $UNAME_PROCESSOR in
+			i386) UNAME_PROCESSOR=x86_64 ;;
+			powerpc) UNAME_PROCESSOR=powerpc64 ;;
+		    esac
+		fi
+	    fi
+	elif test "$UNAME_PROCESSOR" = i386 ; then
+	    # Avoid executing cc on OS X 10.9, as it ships with a stub
+	    # that puts up a graphical alert prompting to install
+	    # developer tools.  Any system running Mac OS X 10.7 or
+	    # later (Darwin 11 and later) is required to have a 64-bit
+	    # processor. This is not true of the ARM version of Darwin
+	    # that Apple uses in portable devices.
+	    UNAME_PROCESSOR=x86_64
+	fi
 	echo ${UNAME_PROCESSOR}-apple-darwin${UNAME_RELEASE}
 	exit ;;
     *:procnto*:*:* | *:QNX:[0123456789]*:*)
@@ -1256,7 +1298,7 @@
     NEO-?:NONSTOP_KERNEL:*:*)
 	echo neo-tandem-nsk${UNAME_RELEASE}
 	exit ;;
-    NSE-?:NONSTOP_KERNEL:*:*)
+    NSE-*:NONSTOP_KERNEL:*:*)
 	echo nse-tandem-nsk${UNAME_RELEASE}
 	exit ;;
     NSR-?:NONSTOP_KERNEL:*:*)
@@ -1330,157 +1372,6 @@
 	exit ;;
 esac
 
-#echo '(No uname command or uname output not recognized.)' 1>&2
-#echo "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" 1>&2
-
-eval $set_cc_for_build
-cat >$dummy.c <<EOF
-#ifdef _SEQUENT_
-# include <sys/types.h>
-# include <sys/utsname.h>
-#endif
-main ()
-{
-#if defined (sony)
-#if defined (MIPSEB)
-  /* BFD wants "bsd" instead of "newsos".  Perhaps BFD should be changed,
-     I don't know....  */
-  printf ("mips-sony-bsd\n"); exit (0);
-#else
-#include <sys/param.h>
-  printf ("m68k-sony-newsos%s\n",
-#ifdef NEWSOS4
-	"4"
-#else
-	""
-#endif
-	); exit (0);
-#endif
-#endif
-
-#if defined (__arm) && defined (__acorn) && defined (__unix)
-  printf ("arm-acorn-riscix\n"); exit (0);
-#endif
-
-#if defined (hp300) && !defined (hpux)
-  printf ("m68k-hp-bsd\n"); exit (0);
-#endif
-
-#if defined (NeXT)
-#if !defined (__ARCHITECTURE__)
-#define __ARCHITECTURE__ "m68k"
-#endif
-  int version;
-  version=`(hostinfo | sed -n 's/.*NeXT Mach \([0-9]*\).*/\1/p') 2>/dev/null`;
-  if (version < 4)
-    printf ("%s-next-nextstep%d\n", __ARCHITECTURE__, version);
-  else
-    printf ("%s-next-openstep%d\n", __ARCHITECTURE__, version);
-  exit (0);
-#endif
-
-#if defined (MULTIMAX) || defined (n16)
-#if defined (UMAXV)
-  printf ("ns32k-encore-sysv\n"); exit (0);
-#else
-#if defined (CMU)
-  printf ("ns32k-encore-mach\n"); exit (0);
-#else
-  printf ("ns32k-encore-bsd\n"); exit (0);
-#endif
-#endif
-#endif
-
-#if defined (__386BSD__)
-  printf ("i386-pc-bsd\n"); exit (0);
-#endif
-
-#if defined (sequent)
-#if defined (i386)
-  printf ("i386-sequent-dynix\n"); exit (0);
-#endif
-#if defined (ns32000)
-  printf ("ns32k-sequent-dynix\n"); exit (0);
-#endif
-#endif
-
-#if defined (_SEQUENT_)
-    struct utsname un;
-
-    uname(&un);
-
-    if (strncmp(un.version, "V2", 2) == 0) {
-	printf ("i386-sequent-ptx2\n"); exit (0);
-    }
-    if (strncmp(un.version, "V1", 2) == 0) { /* XXX is V1 correct? */
-	printf ("i386-sequent-ptx1\n"); exit (0);
-    }
-    printf ("i386-sequent-ptx\n"); exit (0);
-
-#endif
-
-#if defined (vax)
-# if !defined (ultrix)
-#  include <sys/param.h>
-#  if defined (BSD)
-#   if BSD == 43
-      printf ("vax-dec-bsd4.3\n"); exit (0);
-#   else
-#    if BSD == 199006
-      printf ("vax-dec-bsd4.3reno\n"); exit (0);
-#    else
-      printf ("vax-dec-bsd\n"); exit (0);
-#    endif
-#   endif
-#  else
-    printf ("vax-dec-bsd\n"); exit (0);
-#  endif
-# else
-    printf ("vax-dec-ultrix\n"); exit (0);
-# endif
-#endif
-
-#if defined (alliant) && defined (i860)
-  printf ("i860-alliant-bsd\n"); exit (0);
-#endif
-
-  exit (1);
-}
-EOF
-
-$CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null && SYSTEM_NAME=`$dummy` &&
-	{ echo "$SYSTEM_NAME"; exit; }
-
-# Apollos put the system type in the environment.
-
-test -d /usr/apollo && { echo ${ISP}-apollo-${SYSTYPE}; exit; }
-
-# Convex versions that predate uname can use getsysinfo(1)
-
-if [ -x /usr/convex/getsysinfo ]
-then
-    case `getsysinfo -f cpu_type` in
-    c1*)
-	echo c1-convex-bsd
-	exit ;;
-    c2*)
-	if getsysinfo -f scalar_acc
-	then echo c32-convex-bsd
-	else echo c2-convex-bsd
-	fi
-	exit ;;
-    c34*)
-	echo c34-convex-bsd
-	exit ;;
-    c38*)
-	echo c38-convex-bsd
-	exit ;;
-    c4*)
-	echo c4-convex-bsd
-	exit ;;
-    esac
-fi
-
 cat >&2 <<EOF
 $0: unable to guess system type
 
diff --git a/config.h.in b/config.h.in
index 36f4fb7..8a26ea0 100644
--- a/config.h.in
+++ b/config.h.in
@@ -1,5 +1,11 @@
 /* config.h.in.  Generated from configure.ac by autoheader.  */
 
+/* Get CPU Info by asm method */
+#undef CPU_INFO_BY_ASM
+
+/* Get CPU Info by c method */
+#undef CPU_INFO_BY_C
+
 /* Custom modes */
 #undef CUSTOM_MODES
 
@@ -24,6 +30,9 @@
 /* Define to 1 if you have the <alloca.h> header file. */
 #undef HAVE_ALLOCA_H
 
+/* NE10 library is installed on host. Make sure it is on target! */
+#undef HAVE_ARM_NE10
+
 /* Define to 1 if you have the <dlfcn.h> header file. */
 #undef HAVE_DLFCN_H
 
@@ -67,9 +76,6 @@
    */
 #undef LT_OBJDIR
 
-/* Define to 1 if your C compiler doesn't accept -c and -o together. */
-#undef NO_MINUS_C_MINUS_O
-
 /* Make use of ARM asm optimization */
 #undef OPUS_ARM_ASM
 
@@ -94,6 +100,9 @@
 /* Define if compiler supports NEON instructions */
 #undef OPUS_ARM_MAY_HAVE_NEON
 
+/* Compiler supports ARMv7 Neon Intrinsics */
+#undef OPUS_ARM_MAY_HAVE_NEON_INTR
+
 /* Define if binary requires EDSP instruction support */
 #undef OPUS_ARM_PRESUME_EDSP
 
@@ -103,12 +112,39 @@
 /* Define if binary requires NEON instruction support */
 #undef OPUS_ARM_PRESUME_NEON
 
+/* Define if binary requires NEON intrinsics support */
+#undef OPUS_ARM_PRESUME_NEON_INTR
+
 /* This is a build of OPUS */
 #undef OPUS_BUILD
 
 /* Use run-time CPU capabilities detection */
 #undef OPUS_HAVE_RTCD
 
+/* Compiler supports X86 AVX Intrinsics */
+#undef OPUS_X86_MAY_HAVE_AVX
+
+/* Compiler supports X86 SSE Intrinsics */
+#undef OPUS_X86_MAY_HAVE_SSE
+
+/* Compiler supports X86 SSE2 Intrinsics */
+#undef OPUS_X86_MAY_HAVE_SSE2
+
+/* Compiler supports X86 SSE4.1 Intrinsics */
+#undef OPUS_X86_MAY_HAVE_SSE4_1
+
+/* Define if binary requires AVX intrinsics support */
+#undef OPUS_X86_PRESUME_AVX
+
+/* Define if binary requires SSE intrinsics support */
+#undef OPUS_X86_PRESUME_SSE
+
+/* Define if binary requires SSE2 intrinsics support */
+#undef OPUS_X86_PRESUME_SSE2
+
+/* Define if binary requires SSE4.1 intrinsics support */
+#undef OPUS_X86_PRESUME_SSE4_1
+
 /* Define to the address where bug reports for this package should be sent. */
 #undef PACKAGE_BUGREPORT
 
diff --git a/config.sub b/config.sub
index c894da4..6467c95 100755
--- a/config.sub
+++ b/config.sub
@@ -1,24 +1,18 @@
 #! /bin/sh
 # Configuration validation subroutine script.
-#   Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
-#   2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
-#   2011, 2012 Free Software Foundation, Inc.
+#   Copyright 1992-2015 Free Software Foundation, Inc.
 
-timestamp='2012-02-10'
+timestamp='2015-01-01'
 
-# This file is (in principle) common to ALL GNU software.
-# The presence of a machine in this file suggests that SOME GNU software
-# can handle that machine.  It does not imply ALL GNU software can.
-#
-# This file is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
+# This file is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
 # (at your option) any later version.
 #
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
 #
 # You should have received a copy of the GNU General Public License
 # along with this program; if not, see <http://www.gnu.org/licenses/>.
@@ -26,11 +20,12 @@
 # As a special exception to the GNU General Public License, if you
 # distribute this file as part of a program that contains a
 # configuration script generated by Autoconf, you may include it under
-# the same distribution terms that you use for the rest of that program.
+# the same distribution terms that you use for the rest of that
+# program.  This Exception is an additional permission under section 7
+# of the GNU General Public License, version 3 ("GPLv3").
 
 
-# Please send patches to <config-patches@gnu.org>.  Submit a context
-# diff and a properly formatted GNU ChangeLog entry.
+# Please send patches to <config-patches@gnu.org>.
 #
 # Configuration subroutine to validate and canonicalize a configuration type.
 # Supply the specified configuration type as an argument.
@@ -73,9 +68,7 @@
 version="\
 GNU config.sub ($timestamp)
 
-Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
-2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012
-Free Software Foundation, Inc.
+Copyright 1992-2015 Free Software Foundation, Inc.
 
 This is free software; see the source for copying conditions.  There is NO
 warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."
@@ -123,7 +116,7 @@
 maybe_os=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\2/'`
 case $maybe_os in
   nto-qnx* | linux-gnu* | linux-android* | linux-dietlibc | linux-newlib* | \
-  linux-uclibc* | uclinux-uclibc* | uclinux-gnu* | kfreebsd*-gnu* | \
+  linux-musl* | linux-uclibc* | uclinux-uclibc* | uclinux-gnu* | kfreebsd*-gnu* | \
   knetbsd*-gnu* | netbsd*-gnu* | \
   kopensolaris*-gnu* | \
   storm-chaos* | os2-emx* | rtmk-nova*)
@@ -156,7 +149,7 @@
 	-convergent* | -ncr* | -news | -32* | -3600* | -3100* | -hitachi* |\
 	-c[123]* | -convex* | -sun | -crds | -omron* | -dg | -ultra | -tti* | \
 	-harris | -dolphin | -highlevel | -gould | -cbm | -ns | -masscomp | \
-	-apple | -axis | -knuth | -cray | -microblaze)
+	-apple | -axis | -knuth | -cray | -microblaze*)
 		os=
 		basic_machine=$1
 		;;
@@ -225,6 +218,12 @@
 	-isc*)
 		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
 		;;
+	-lynx*178)
+		os=-lynxos178
+		;;
+	-lynx*5)
+		os=-lynxos5
+		;;
 	-lynx*)
 		os=-lynxos
 		;;
@@ -253,21 +252,24 @@
 	| alpha | alphaev[4-8] | alphaev56 | alphaev6[78] | alphapca5[67] \
 	| alpha64 | alpha64ev[4-8] | alpha64ev56 | alpha64ev6[78] | alpha64pca5[67] \
 	| am33_2.0 \
-	| arc | arm | arm[bl]e | arme[lb] | armv[2345] | armv[345][lb] | avr | avr32 \
-        | be32 | be64 \
+	| arc | arceb \
+	| arm | arm[bl]e | arme[lb] | armv[2-8] | armv[3-8][lb] | armv7[arm] \
+	| avr | avr32 \
+	| be32 | be64 \
 	| bfin \
-	| c4x | clipper \
+	| c4x | c8051 | clipper \
 	| d10v | d30v | dlx | dsp16xx \
 	| epiphany \
-	| fido | fr30 | frv \
+	| fido | fr30 | frv | ft32 \
 	| h8300 | h8500 | hppa | hppa1.[01] | hppa2.0 | hppa2.0[nw] | hppa64 \
 	| hexagon \
 	| i370 | i860 | i960 | ia64 \
 	| ip2k | iq2000 \
+	| k1om \
 	| le32 | le64 \
 	| lm32 \
 	| m32c | m32r | m32rle | m68000 | m68k | m88k \
-	| maxq | mb | microblaze | mcore | mep | metag \
+	| maxq | mb | microblaze | microblazeel | mcore | mep | metag \
 	| mips | mipsbe | mipseb | mipsel | mipsle \
 	| mips16 \
 	| mips64 | mips64el \
@@ -281,23 +283,26 @@
 	| mips64vr5900 | mips64vr5900el \
 	| mipsisa32 | mipsisa32el \
 	| mipsisa32r2 | mipsisa32r2el \
+	| mipsisa32r6 | mipsisa32r6el \
 	| mipsisa64 | mipsisa64el \
 	| mipsisa64r2 | mipsisa64r2el \
+	| mipsisa64r6 | mipsisa64r6el \
 	| mipsisa64sb1 | mipsisa64sb1el \
 	| mipsisa64sr71k | mipsisa64sr71kel \
+	| mipsr5900 | mipsr5900el \
 	| mipstx39 | mipstx39el \
 	| mn10200 | mn10300 \
 	| moxie \
 	| mt \
 	| msp430 \
 	| nds32 | nds32le | nds32be \
-	| nios | nios2 \
+	| nios | nios2 | nios2eb | nios2el \
 	| ns16k | ns32k \
-	| open8 \
-	| or32 \
+	| open8 | or1k | or1knd | or32 \
 	| pdp10 | pdp11 | pj | pjl \
 	| powerpc | powerpc64 | powerpc64le | powerpcle \
 	| pyramid \
+	| riscv32 | riscv64 \
 	| rl78 | rx \
 	| score \
 	| sh | sh[1234] | sh[24]a | sh[24]aeb | sh[23]e | sh[34]eb | sheb | shbe | shle | sh[1234]le | sh3ele \
@@ -308,6 +313,7 @@
 	| tahoe | tic4x | tic54x | tic55x | tic6x | tic80 | tron \
 	| ubicom32 \
 	| v850 | v850e | v850e1 | v850e2 | v850es | v850e2v3 \
+	| visium \
 	| we32k \
 	| x86 | xc16x | xstormy16 | xtensa \
 	| z8k | z80)
@@ -322,7 +328,10 @@
 	c6x)
 		basic_machine=tic6x-unknown
 		;;
-	m6811 | m68hc11 | m6812 | m68hc12 | m68hcs12x | picochip)
+	leon|leon[3-9])
+		basic_machine=sparc-$basic_machine
+		;;
+	m6811 | m68hc11 | m6812 | m68hc12 | m68hcs12x | nvptx | picochip)
 		basic_machine=$basic_machine-unknown
 		os=-none
 		;;
@@ -364,13 +373,13 @@
 	| aarch64-* | aarch64_be-* \
 	| alpha-* | alphaev[4-8]-* | alphaev56-* | alphaev6[78]-* \
 	| alpha64-* | alpha64ev[4-8]-* | alpha64ev56-* | alpha64ev6[78]-* \
-	| alphapca5[67]-* | alpha64pca5[67]-* | arc-* \
+	| alphapca5[67]-* | alpha64pca5[67]-* | arc-* | arceb-* \
 	| arm-*  | armbe-* | armle-* | armeb-* | armv*-* \
 	| avr-* | avr32-* \
 	| be32-* | be64-* \
 	| bfin-* | bs2000-* \
 	| c[123]* | c30-* | [cjt]90-* | c4x-* \
-	| clipper-* | craynv-* | cydra-* \
+	| c8051-* | clipper-* | craynv-* | cydra-* \
 	| d10v-* | d30v-* | dlx-* \
 	| elxsi-* \
 	| f30[01]-* | f700-* | fido-* | fr30-* | frv-* | fx80-* \
@@ -379,11 +388,13 @@
 	| hexagon-* \
 	| i*86-* | i860-* | i960-* | ia64-* \
 	| ip2k-* | iq2000-* \
+	| k1om-* \
 	| le32-* | le64-* \
 	| lm32-* \
 	| m32c-* | m32r-* | m32rle-* \
 	| m68000-* | m680[012346]0-* | m68360-* | m683?2-* | m68k-* \
-	| m88110-* | m88k-* | maxq-* | mcore-* | metag-* | microblaze-* \
+	| m88110-* | m88k-* | maxq-* | mcore-* | metag-* \
+	| microblaze-* | microblazeel-* \
 	| mips-* | mipsbe-* | mipseb-* | mipsel-* | mipsle-* \
 	| mips16-* \
 	| mips64-* | mips64el-* \
@@ -397,18 +408,22 @@
 	| mips64vr5900-* | mips64vr5900el-* \
 	| mipsisa32-* | mipsisa32el-* \
 	| mipsisa32r2-* | mipsisa32r2el-* \
+	| mipsisa32r6-* | mipsisa32r6el-* \
 	| mipsisa64-* | mipsisa64el-* \
 	| mipsisa64r2-* | mipsisa64r2el-* \
+	| mipsisa64r6-* | mipsisa64r6el-* \
 	| mipsisa64sb1-* | mipsisa64sb1el-* \
 	| mipsisa64sr71k-* | mipsisa64sr71kel-* \
+	| mipsr5900-* | mipsr5900el-* \
 	| mipstx39-* | mipstx39el-* \
 	| mmix-* \
 	| mt-* \
 	| msp430-* \
 	| nds32-* | nds32le-* | nds32be-* \
-	| nios-* | nios2-* \
+	| nios-* | nios2-* | nios2eb-* | nios2el-* \
 	| none-* | np1-* | ns16k-* | ns32k-* \
 	| open8-* \
+	| or1k*-* \
 	| orion-* \
 	| pdp10-* | pdp11-* | pj-* | pjl-* | pn-* | power-* \
 	| powerpc-* | powerpc64-* | powerpc64le-* | powerpcle-* \
@@ -426,6 +441,7 @@
 	| ubicom32-* \
 	| v850-* | v850e-* | v850e1-* | v850es-* | v850e2-* | v850e2v3-* \
 	| vax-* \
+	| visium-* \
 	| we32k-* \
 	| x86-* | x86_64-* | xc16x-* | xps100-* \
 	| xstormy16-* | xtensa*-* \
@@ -763,6 +779,9 @@
 		basic_machine=m68k-isi
 		os=-sysv
 		;;
+	leon-*|leon[3-9]-*)
+		basic_machine=sparc-`echo $basic_machine | sed 's/-.*//'`
+		;;
 	m68knommu)
 		basic_machine=m68k-unknown
 		os=-linux
@@ -782,11 +801,15 @@
 		basic_machine=ns32k-utek
 		os=-sysv
 		;;
-	microblaze)
+	microblaze*)
 		basic_machine=microblaze-xilinx
 		;;
+	mingw64)
+		basic_machine=x86_64-pc
+		os=-mingw64
+		;;
 	mingw32)
-		basic_machine=i386-pc
+		basic_machine=i686-pc
 		os=-mingw32
 		;;
 	mingw32ce)
@@ -814,6 +837,10 @@
 		basic_machine=powerpc-unknown
 		os=-morphos
 		;;
+	moxiebox)
+		basic_machine=moxie-unknown
+		os=-moxiebox
+		;;
 	msdos)
 		basic_machine=i386-pc
 		os=-msdos
@@ -822,7 +849,7 @@
 		basic_machine=`echo $basic_machine | sed -e 's/ms1-/mt-/'`
 		;;
 	msys)
-		basic_machine=i386-pc
+		basic_machine=i686-pc
 		os=-msys
 		;;
 	mvs)
@@ -998,7 +1025,7 @@
 		;;
 	ppc64)	basic_machine=powerpc64-unknown
 		;;
-	ppc64-*) basic_machine=powerpc64-`echo $basic_machine | sed 's/^[^-]*-//'`
+	ppc64-* | ppc64p7-*) basic_machine=powerpc64-`echo $basic_machine | sed 's/^[^-]*-//'`
 		;;
 	ppc64le | powerpc64little | ppc64-le | powerpc64-little)
 		basic_machine=powerpc64le-unknown
@@ -1013,7 +1040,11 @@
 		basic_machine=i586-unknown
 		os=-pw32
 		;;
-	rdos)
+	rdos | rdos64)
+		basic_machine=x86_64-pc
+		os=-rdos
+		;;
+	rdos32)
 		basic_machine=i386-pc
 		os=-rdos
 		;;
@@ -1340,29 +1371,29 @@
 	-gnu* | -bsd* | -mach* | -minix* | -genix* | -ultrix* | -irix* \
 	      | -*vms* | -sco* | -esix* | -isc* | -aix* | -cnk* | -sunos | -sunos[34]*\
 	      | -hpux* | -unos* | -osf* | -luna* | -dgux* | -auroraux* | -solaris* \
-	      | -sym* | -kopensolaris* \
+	      | -sym* | -kopensolaris* | -plan9* \
 	      | -amigaos* | -amigados* | -msdos* | -newsos* | -unicos* | -aof* \
 	      | -aos* | -aros* \
 	      | -nindy* | -vxsim* | -vxworks* | -ebmon* | -hms* | -mvs* \
 	      | -clix* | -riscos* | -uniplus* | -iris* | -rtu* | -xenix* \
 	      | -hiux* | -386bsd* | -knetbsd* | -mirbsd* | -netbsd* \
-	      | -openbsd* | -solidbsd* \
+	      | -bitrig* | -openbsd* | -solidbsd* \
 	      | -ekkobsd* | -kfreebsd* | -freebsd* | -riscix* | -lynxos* \
 	      | -bosx* | -nextstep* | -cxux* | -aout* | -elf* | -oabi* \
 	      | -ptx* | -coff* | -ecoff* | -winnt* | -domain* | -vsta* \
 	      | -udi* | -eabi* | -lites* | -ieee* | -go32* | -aux* \
 	      | -chorusos* | -chorusrdb* | -cegcc* \
 	      | -cygwin* | -msys* | -pe* | -psos* | -moss* | -proelf* | -rtems* \
-	      | -mingw32* | -linux-gnu* | -linux-android* \
-	      | -linux-newlib* | -linux-uclibc* \
-	      | -uxpv* | -beos* | -mpeix* | -udk* \
+	      | -mingw32* | -mingw64* | -linux-gnu* | -linux-android* \
+	      | -linux-newlib* | -linux-musl* | -linux-uclibc* \
+	      | -uxpv* | -beos* | -mpeix* | -udk* | -moxiebox* \
 	      | -interix* | -uwin* | -mks* | -rhapsody* | -darwin* | -opened* \
 	      | -openstep* | -oskit* | -conix* | -pw32* | -nonstopux* \
 	      | -storm-chaos* | -tops10* | -tenex* | -tops20* | -its* \
 	      | -os2* | -vos* | -palmos* | -uclinux* | -nucleus* \
 	      | -morphos* | -superux* | -rtmk* | -rtmk-nova* | -windiss* \
 	      | -powermax* | -dnix* | -nx6 | -nx7 | -sei* | -dragonfly* \
-	      | -skyos* | -haiku* | -rdos* | -toppers* | -drops* | -es*)
+	      | -skyos* | -haiku* | -rdos* | -toppers* | -drops* | -es* | -tirtos*)
 	# Remember, each alternative MUST END IN *, to match a version number.
 		;;
 	-qnx*)
@@ -1486,9 +1517,6 @@
 	-aros*)
 		os=-aros
 		;;
-	-kaos*)
-		os=-kaos
-		;;
 	-zvmoe)
 		os=-zvmoe
 		;;
@@ -1537,6 +1565,12 @@
 	c4x-* | tic4x-*)
 		os=-coff
 		;;
+	c8051-*)
+		os=-elf
+		;;
+	hexagon-*)
+		os=-elf
+		;;
 	tic54x-*)
 		os=-coff
 		;;
diff --git a/configure b/configure
index c6cb40a..4ea7327 100755
--- a/configure
+++ b/configure
@@ -1,13 +1,11 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.68 for opus 1.1.
+# Generated by GNU Autoconf 2.69 for opus 1.1.2.
 #
 # Report bugs to <opus@xiph.org>.
 #
 #
-# Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001,
-# 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Free Software
-# Foundation, Inc.
+# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
 #
 #
 # This configure script is free software; the Free Software Foundation
@@ -136,6 +134,31 @@
 # CDPATH.
 (unset CDPATH) >/dev/null 2>&1 && unset CDPATH
 
+# Use a proper internal environment variable to ensure we don't fall
+  # into an infinite loop, continuously re-executing ourselves.
+  if test x"${_as_can_reexec}" != xno && test "x$CONFIG_SHELL" != x; then
+    _as_can_reexec=no; export _as_can_reexec;
+    # We cannot yet assume a decent shell, so we have to provide a
+# neutralization value for shells without unset; and this also
+# works around shells that cannot unset nonexistent variables.
+# Preserve -v and -x to the replacement shell.
+BASH_ENV=/dev/null
+ENV=/dev/null
+(unset BASH_ENV) >/dev/null 2>&1 && unset BASH_ENV ENV
+case $- in # ((((
+  *v*x* | *x*v* ) as_opts=-vx ;;
+  *v* ) as_opts=-v ;;
+  *x* ) as_opts=-x ;;
+  * ) as_opts= ;;
+esac
+exec $CONFIG_SHELL $as_opts "$as_myself" ${1+"$@"}
+# Admittedly, this is quite paranoid, since all the known shells bail
+# out after a failed `exec'.
+$as_echo "$0: could not re-execute with $CONFIG_SHELL" >&2
+as_fn_exit 255
+  fi
+  # We don't want this to propagate to other subprocesses.
+          { _as_can_reexec=; unset _as_can_reexec;}
 if test "x$CONFIG_SHELL" = x; then
   as_bourne_compatible="if test -n \"\${ZSH_VERSION+set}\" && (emulate sh) >/dev/null 2>&1; then :
   emulate sh
@@ -169,7 +192,8 @@
 else
   exitcode=1; echo positional parameters were not saved.
 fi
-test x\$exitcode = x0 || exit 1"
+test x\$exitcode = x0 || exit 1
+test -x / || exit 1"
   as_suggested="  as_lineno_1=";as_suggested=$as_suggested$LINENO;as_suggested=$as_suggested" as_lineno_1a=\$LINENO
   as_lineno_2=";as_suggested=$as_suggested$LINENO;as_suggested=$as_suggested" as_lineno_2a=\$LINENO
   eval 'test \"x\$as_lineno_1'\$as_run'\" != \"x\$as_lineno_2'\$as_run'\" &&
@@ -222,21 +246,25 @@
 
 
       if test "x$CONFIG_SHELL" != x; then :
-  # We cannot yet assume a decent shell, so we have to provide a
-	# neutralization value for shells without unset; and this also
-	# works around shells that cannot unset nonexistent variables.
-	# Preserve -v and -x to the replacement shell.
-	BASH_ENV=/dev/null
-	ENV=/dev/null
-	(unset BASH_ENV) >/dev/null 2>&1 && unset BASH_ENV ENV
-	export CONFIG_SHELL
-	case $- in # ((((
-	  *v*x* | *x*v* ) as_opts=-vx ;;
-	  *v* ) as_opts=-v ;;
-	  *x* ) as_opts=-x ;;
-	  * ) as_opts= ;;
-	esac
-	exec "$CONFIG_SHELL" $as_opts "$as_myself" ${1+"$@"}
+  export CONFIG_SHELL
+             # We cannot yet assume a decent shell, so we have to provide a
+# neutralization value for shells without unset; and this also
+# works around shells that cannot unset nonexistent variables.
+# Preserve -v and -x to the replacement shell.
+BASH_ENV=/dev/null
+ENV=/dev/null
+(unset BASH_ENV) >/dev/null 2>&1 && unset BASH_ENV ENV
+case $- in # ((((
+  *v*x* | *x*v* ) as_opts=-vx ;;
+  *v* ) as_opts=-v ;;
+  *x* ) as_opts=-x ;;
+  * ) as_opts= ;;
+esac
+exec $CONFIG_SHELL $as_opts "$as_myself" ${1+"$@"}
+# Admittedly, this is quite paranoid, since all the known shells bail
+# out after a failed `exec'.
+$as_echo "$0: could not re-execute with $CONFIG_SHELL" >&2
+exit 255
 fi
 
     if test x$as_have_required = xno; then :
@@ -339,6 +367,14 @@
 
 
 } # as_fn_mkdir_p
+
+# as_fn_executable_p FILE
+# -----------------------
+# Test if FILE is an executable regular file.
+as_fn_executable_p ()
+{
+  test -f "$1" && test -x "$1"
+} # as_fn_executable_p
 # as_fn_append VAR VALUE
 # ----------------------
 # Append the text in VALUE to the end of the definition contained in VAR. Take
@@ -460,6 +496,10 @@
   chmod +x "$as_me.lineno" ||
     { $as_echo "$as_me: error: cannot create $as_me.lineno; rerun with a POSIX shell" >&2; as_fn_exit 1; }
 
+  # If we had to re-execute with $CONFIG_SHELL, we're ensured to have
+  # already done that, so ensure we don't try to do so again and fall
+  # in an infinite loop.  This has already happened in practice.
+  _as_can_reexec=no; export _as_can_reexec
   # Don't try to exec as it changes $[0], causing all sort of problems
   # (the dirname of $[0] is not the place where we might find the
   # original and so on.  Autoconf is especially sensitive to this).
@@ -494,16 +534,16 @@
     # ... but there are two gotchas:
     # 1) On MSYS, both `ln -s file dir' and `ln file dir' fail.
     # 2) DJGPP < 2.04 has no symlinks; `ln -s' creates a wrapper executable.
-    # In both cases, we have to default to `cp -p'.
+    # In both cases, we have to default to `cp -pR'.
     ln -s conf$$.file conf$$.dir 2>/dev/null && test ! -f conf$$.exe ||
-      as_ln_s='cp -p'
+      as_ln_s='cp -pR'
   elif ln conf$$.file conf$$ 2>/dev/null; then
     as_ln_s=ln
   else
-    as_ln_s='cp -p'
+    as_ln_s='cp -pR'
   fi
 else
-  as_ln_s='cp -p'
+  as_ln_s='cp -pR'
 fi
 rm -f conf$$ conf$$.exe conf$$.dir/conf$$.file conf$$.file
 rmdir conf$$.dir 2>/dev/null
@@ -515,28 +555,8 @@
   as_mkdir_p=false
 fi
 
-if test -x / >/dev/null 2>&1; then
-  as_test_x='test -x'
-else
-  if ls -dL / >/dev/null 2>&1; then
-    as_ls_L_option=L
-  else
-    as_ls_L_option=
-  fi
-  as_test_x='
-    eval sh -c '\''
-      if test -d "$1"; then
-	test -d "$1/.";
-      else
-	case $1 in #(
-	-*)set "./$1";;
-	esac;
-	case `ls -ld'$as_ls_L_option' "$1" 2>/dev/null` in #((
-	???[sx]*):;;*)false;;esac;fi
-    '\'' sh
-  '
-fi
-as_executable_p=$as_test_x
+as_test_x='test -x'
+as_executable_p=as_fn_executable_p
 
 # Sed expression to map a string onto a valid CPP name.
 as_tr_cpp="eval sed 'y%*$as_cr_letters%P$as_cr_LETTERS%;s%[^_$as_cr_alnum]%_%g'"
@@ -570,8 +590,8 @@
 # Identity of this package.
 PACKAGE_NAME='opus'
 PACKAGE_TARNAME='opus'
-PACKAGE_VERSION='1.1'
-PACKAGE_STRING='opus 1.1'
+PACKAGE_VERSION='1.1.2'
+PACKAGE_STRING='opus 1.1.2'
 PACKAGE_BUGREPORT='opus@xiph.org'
 PACKAGE_URL=''
 
@@ -623,8 +643,34 @@
 HAVE_DOXYGEN_TRUE
 HAVE_DOXYGEN
 OPUS_HAVE_RTCD
+HAVE_ARM_NE10_FALSE
+HAVE_ARM_NE10_TRUE
+OPUS_ARM_NEON_INTR_FALSE
+OPUS_ARM_NEON_INTR_TRUE
 CPU_ARM_FALSE
 CPU_ARM_TRUE
+OPUS_X86_AVX_CFLAGS
+OPUS_X86_SSE4_1_CFLAGS
+OPUS_X86_SSE2_CFLAGS
+OPUS_X86_SSE_CFLAGS
+NE10_LIBS
+NE10_CFLAGS
+HAVE_ARM_NE10
+OPUS_ARM_NEON_INTR_CFLAGS
+ARM_NEON_INTR_CFLAGS
+X86_AVX_CFLAGS
+X86_SSE4_1_CFLAGS
+X86_SSE2_CFLAGS
+X86_SSE_CFLAGS
+HAVE_AVX_FALSE
+HAVE_AVX_TRUE
+HAVE_SSE4_1_FALSE
+HAVE_SSE4_1_TRUE
+HAVE_SSE2_FALSE
+HAVE_SSE2_TRUE
+HAVE_SSE_FALSE
+HAVE_SSE_TRUE
+ARM2GNU_PARAMS
 OPUS_ARM_MAY_HAVE_NEON
 OPUS_ARM_MAY_HAVE_MEDIA
 OPUS_ARM_MAY_HAVE_EDSP
@@ -783,6 +829,10 @@
 enable_float_approx
 enable_asm
 enable_rtcd
+enable_intrinsics
+with_NE10
+with_NE10_libraries
+with_NE10_includes
 enable_assertions
 enable_fuzzing
 enable_doc
@@ -798,7 +848,12 @@
 CPPFLAGS
 CPP
 CCAS
-CCASFLAGS'
+CCASFLAGS
+X86_SSE_CFLAGS
+X86_SSE2_CFLAGS
+X86_SSE4_1_CFLAGS
+X86_AVX_CFLAGS
+ARM_NEON_INTR_CFLAGS'
 
 
 # Initialize some variables set by options.
@@ -1254,8 +1309,6 @@
 if test "x$host_alias" != x; then
   if test "x$build_alias" = x; then
     cross_compiling=maybe
-    $as_echo "$as_me: WARNING: if you wanted to set the --build type, don't use --host.
-    If a cross compiler is detected then cross compile mode will be used" >&2
   elif test "x$build_alias" != "x$host_alias"; then
     cross_compiling=yes
   fi
@@ -1341,7 +1394,7 @@
   # Omit some internal or obsolete options to make the list less imposing.
   # This message is too long to be a string in the A/UX 3.1 sh.
   cat <<_ACEOF
-\`configure' configures opus 1.1 to adapt to many kinds of systems.
+\`configure' configures opus 1.1.2 to adapt to many kinds of systems.
 
 Usage: $0 [OPTION]... [VAR=VALUE]...
 
@@ -1411,7 +1464,7 @@
 
 if test -n "$ac_init_help"; then
   case $ac_init_help in
-     short | recursive ) echo "Configuration of opus 1.1:";;
+     short | recursive ) echo "Configuration of opus 1.1.2:";;
    esac
   cat <<\_ACEOF
 
@@ -1419,16 +1472,19 @@
   --disable-option-checking  ignore unrecognized --enable/--with options
   --disable-FEATURE       do not include FEATURE (same as --enable-FEATURE=no)
   --enable-FEATURE[=ARG]  include FEATURE [ARG=yes]
-  --enable-silent-rules          less verbose build output (undo: `make V=1')
-  --disable-silent-rules         verbose build output (undo: `make V=0')
-  --disable-maintainer-mode  disable make rules and dependencies not useful
-			  (and sometimes confusing) to the casual installer
+  --enable-silent-rules   less verbose build output (undo: "make V=1")
+  --disable-silent-rules  verbose build output (undo: "make V=0")
+  --disable-maintainer-mode
+                          disable make rules and dependencies not useful (and
+                          sometimes confusing) to the casual installer
   --enable-shared[=PKGS]  build shared libraries [default=yes]
   --enable-static[=PKGS]  build static libraries [default=yes]
   --enable-fast-install[=PKGS]
                           optimize for fast installation [default=yes]
-  --disable-dependency-tracking  speeds up one-time build
-  --enable-dependency-tracking   do not reject slow dependency extractors
+  --enable-dependency-tracking
+                          do not reject slow dependency extractors
+  --disable-dependency-tracking
+                          speeds up one-time build
   --disable-libtool-lock  avoid locking (might break parallel builds)
   --enable-fixed-point    compile without floating point (for machines without
                           a fast enough FPU)
@@ -1440,6 +1496,8 @@
   --enable-float-approx   enable fast approximations for floating point
   --disable-asm           Disable assembly optimizations
   --disable-rtcd          Disable run-time CPU capabilities detection
+  --enable-intrinsics     Enable intrinsics optimizations for ARM(float)
+                          X86(fixed)
   --enable-assertions     enable additional software error checking
   --enable-fuzzing        causes the encoder to make random decisions
   --disable-doc           Do not build API documentation
@@ -1454,6 +1512,13 @@
   --with-gnu-ld           assume the C compiler uses GNU ld [default=no]
   --with-sysroot=DIR Search for dependent libraries within DIR
                         (or the compiler's sysroot if not specified).
+  --with-NE10=PFX         Prefix where libNE10 is installed (optional)
+  --with-NE10-libraries=DIR
+                          Directory where libNE10 library is installed
+                          (optional)
+  --with-NE10-includes=DIR
+                          Directory where libNE10 header files are installed
+                          (optional)
 
 Some influential environment variables:
   CC          C compiler command
@@ -1466,6 +1531,17 @@
   CPP         C preprocessor
   CCAS        assembler compiler command (defaults to CC)
   CCASFLAGS   assembler compiler flags (defaults to CFLAGS)
+  X86_SSE_CFLAGS
+              C compiler flags to compile SSE intrinsics [default=-msse]
+  X86_SSE2_CFLAGS
+              C compiler flags to compile SSE2 intrinsics [default=-msse2]
+  X86_SSE4_1_CFLAGS
+              C compiler flags to compile SSE4.1 intrinsics [default=-msse4.1]
+  X86_AVX_CFLAGS
+              C compiler flags to compile AVX intrinsics [default=-mavx]
+  ARM_NEON_INTR_CFLAGS
+              C compiler flags to compile ARM NEON intrinsics
+              [default=-mfpu=neon / -mfpu=neon -mfloat-abi=softfp]
 
 Use these variables to override the choices made by `configure' or to help
 it to find libraries and programs with nonstandard names/locations.
@@ -1533,10 +1609,10 @@
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
   cat <<\_ACEOF
-opus configure 1.1
-generated by GNU Autoconf 2.68
+opus configure 1.1.2
+generated by GNU Autoconf 2.69
 
-Copyright (C) 2010 Free Software Foundation, Inc.
+Copyright (C) 2012 Free Software Foundation, Inc.
 This configure script is free software; the Free Software Foundation
 gives unlimited permission to copy, distribute and modify it.
 _ACEOF
@@ -1612,7 +1688,7 @@
 	 test ! -s conftest.err
        } && test -s conftest$ac_exeext && {
 	 test "$cross_compiling" = yes ||
-	 $as_test_x conftest$ac_exeext
+	 test -x conftest$ac_exeext
        }; then :
   ac_retval=0
 else
@@ -1902,8 +1978,8 @@
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
 
-It was created by opus $as_me 1.1, which was
-generated by GNU Autoconf 2.68.  Invocation command line was
+It was created by opus $as_me 1.1.2, which was
+generated by GNU Autoconf 2.69.  Invocation command line was
 
   $ $0 $@
 
@@ -2259,10 +2335,10 @@
   enableval=$enable_silent_rules;
 fi
 
-case $enable_silent_rules in
-yes) AM_DEFAULT_VERBOSITY=0;;
-no)  AM_DEFAULT_VERBOSITY=1;;
-*)   AM_DEFAULT_VERBOSITY=0;;
+case $enable_silent_rules in # (((
+  yes) AM_DEFAULT_VERBOSITY=0;;
+   no) AM_DEFAULT_VERBOSITY=1;;
+    *) AM_DEFAULT_VERBOSITY=0;;
 esac
 am_make=${MAKE-make}
 { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $am_make supports nested variables" >&5
@@ -2296,14 +2372,14 @@
 
 # For libtool.
 OPUS_LT_CURRENT=5
-OPUS_LT_REVISION=0
+OPUS_LT_REVISION=2
 OPUS_LT_AGE=5
 
 
 
 
 
-am__api_version='1.11'
+am__api_version='1.15'
 
 ac_aux_dir=
 for ac_dir in "$srcdir" "$srcdir/.." "$srcdir/../.."; do
@@ -2371,7 +2447,7 @@
     # by default.
     for ac_prog in ginstall scoinst install; do
       for ac_exec_ext in '' $ac_executable_extensions; do
-	if { test -f "$as_dir/$ac_prog$ac_exec_ext" && $as_test_x "$as_dir/$ac_prog$ac_exec_ext"; }; then
+	if as_fn_executable_p "$as_dir/$ac_prog$ac_exec_ext"; then
 	  if test $ac_prog = install &&
 	    grep dspmsg "$as_dir/$ac_prog$ac_exec_ext" >/dev/null 2>&1; then
 	    # AIX install.  It has an incompatible calling convention.
@@ -2429,9 +2505,6 @@
 
 { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether build environment is sane" >&5
 $as_echo_n "checking whether build environment is sane... " >&6; }
-# Just in case
-sleep 1
-echo timestamp > conftest.file
 # Reject unsafe characters in $srcdir or the absolute working directory
 # name.  Accept space and tab only in the latter.
 am_lf='
@@ -2442,32 +2515,40 @@
 esac
 case $srcdir in
   *[\\\"\#\$\&\'\`$am_lf\ \	]*)
-    as_fn_error $? "unsafe srcdir value: \`$srcdir'" "$LINENO" 5;;
+    as_fn_error $? "unsafe srcdir value: '$srcdir'" "$LINENO" 5;;
 esac
 
-# Do `set' in a subshell so we don't clobber the current shell's
+# Do 'set' in a subshell so we don't clobber the current shell's
 # arguments.  Must try -L first in case configure is actually a
 # symlink; some systems play weird games with the mod time of symlinks
 # (eg FreeBSD returns the mod time of the symlink's containing
 # directory).
 if (
-   set X `ls -Lt "$srcdir/configure" conftest.file 2> /dev/null`
-   if test "$*" = "X"; then
-      # -L didn't work.
-      set X `ls -t "$srcdir/configure" conftest.file`
-   fi
-   rm -f conftest.file
-   if test "$*" != "X $srcdir/configure conftest.file" \
-      && test "$*" != "X conftest.file $srcdir/configure"; then
+   am_has_slept=no
+   for am_try in 1 2; do
+     echo "timestamp, slept: $am_has_slept" > conftest.file
+     set X `ls -Lt "$srcdir/configure" conftest.file 2> /dev/null`
+     if test "$*" = "X"; then
+	# -L didn't work.
+	set X `ls -t "$srcdir/configure" conftest.file`
+     fi
+     if test "$*" != "X $srcdir/configure conftest.file" \
+	&& test "$*" != "X conftest.file $srcdir/configure"; then
 
-      # If neither matched, then we have a broken ls.  This can happen
-      # if, for instance, CONFIG_SHELL is bash and it inherits a
-      # broken ls alias from the environment.  This has actually
-      # happened.  Such a system could not be considered "sane".
-      as_fn_error $? "ls -t appears to fail.  Make sure there is not a broken
-alias in your environment" "$LINENO" 5
-   fi
-
+	# If neither matched, then we have a broken ls.  This can happen
+	# if, for instance, CONFIG_SHELL is bash and it inherits a
+	# broken ls alias from the environment.  This has actually
+	# happened.  Such a system could not be considered "sane".
+	as_fn_error $? "ls -t appears to fail.  Make sure there is not a broken
+  alias in your environment" "$LINENO" 5
+     fi
+     if test "$2" = conftest.file || test $am_try -eq 2; then
+       break
+     fi
+     # Just in case.
+     sleep 1
+     am_has_slept=yes
+   done
    test "$2" = conftest.file
    )
 then
@@ -2479,6 +2560,16 @@
 fi
 { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
+# If we didn't sleep, we still need to ensure time stamps of config.status and
+# generated files are strictly newer.
+am_sleep_pid=
+if grep 'slept: no' conftest.file >/dev/null 2>&1; then
+  ( sleep 1 ) &
+  am_sleep_pid=$!
+fi
+
+rm -f conftest.file
+
 test "$program_prefix" != NONE &&
   program_transform_name="s&^&$program_prefix&;$program_transform_name"
 # Use a double $ so make ignores it.
@@ -2489,8 +2580,8 @@
 ac_script='s/[\\$]/&&/g;s/;s,x,x,$//'
 program_transform_name=`$as_echo "$program_transform_name" | sed "$ac_script"`
 
-# expand $ac_aux_dir to an absolute path
-am_aux_dir=`cd $ac_aux_dir && pwd`
+# Expand $ac_aux_dir to an absolute path.
+am_aux_dir=`cd "$ac_aux_dir" && pwd`
 
 if test x"${MISSING+set}" != xset; then
   case $am_aux_dir in
@@ -2501,15 +2592,15 @@
   esac
 fi
 # Use eval to expand $SHELL
-if eval "$MISSING --run true"; then
-  am_missing_run="$MISSING --run "
+if eval "$MISSING --is-lightweight"; then
+  am_missing_run="$MISSING "
 else
   am_missing_run=
-  { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: \`missing' script is too old or missing" >&5
-$as_echo "$as_me: WARNING: \`missing' script is too old or missing" >&2;}
+  { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: 'missing' script is too old or missing" >&5
+$as_echo "$as_me: WARNING: 'missing' script is too old or missing" >&2;}
 fi
 
-if test x"${install_sh}" != xset; then
+if test x"${install_sh+set}" != xset; then
   case $am_aux_dir in
   *\ * | *\	*)
     install_sh="\${SHELL} '$am_aux_dir/install-sh'" ;;
@@ -2518,10 +2609,10 @@
   esac
 fi
 
-# Installed binaries are usually stripped using `strip' when the user
-# run `make install-strip'.  However `strip' might not be the right
+# Installed binaries are usually stripped using 'strip' when the user
+# run "make install-strip".  However 'strip' might not be the right
 # tool to use in cross-compilation environments, therefore Automake
-# will honor the `STRIP' environment variable to overrule this program.
+# will honor the 'STRIP' environment variable to overrule this program.
 if test "$cross_compiling" != no; then
   if test -n "$ac_tool_prefix"; then
   # Extract the first word of "${ac_tool_prefix}strip", so it can be a program name with args.
@@ -2540,7 +2631,7 @@
   IFS=$as_save_IFS
   test -z "$as_dir" && as_dir=.
     for ac_exec_ext in '' $ac_executable_extensions; do
-  if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
     ac_cv_prog_STRIP="${ac_tool_prefix}strip"
     $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
     break 2
@@ -2580,7 +2671,7 @@
   IFS=$as_save_IFS
   test -z "$as_dir" && as_dir=.
     for ac_exec_ext in '' $ac_executable_extensions; do
-  if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
     ac_cv_prog_ac_ct_STRIP="strip"
     $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
     break 2
@@ -2631,7 +2722,7 @@
   test -z "$as_dir" && as_dir=.
     for ac_prog in mkdir gmkdir; do
 	 for ac_exec_ext in '' $ac_executable_extensions; do
-	   { test -f "$as_dir/$ac_prog$ac_exec_ext" && $as_test_x "$as_dir/$ac_prog$ac_exec_ext"; } || continue
+	   as_fn_executable_p "$as_dir/$ac_prog$ac_exec_ext" || continue
 	   case `"$as_dir/$ac_prog$ac_exec_ext" --version 2>&1` in #(
 	     'mkdir (GNU coreutils) '* | \
 	     'mkdir (coreutils) '* | \
@@ -2660,12 +2751,6 @@
 { $as_echo "$as_me:${as_lineno-$LINENO}: result: $MKDIR_P" >&5
 $as_echo "$MKDIR_P" >&6; }
 
-mkdir_p="$MKDIR_P"
-case $mkdir_p in
-  [\\/$]* | ?:[\\/]*) ;;
-  */*) mkdir_p="\$(top_builddir)/$mkdir_p" ;;
-esac
-
 for ac_prog in gawk mawk nawk awk
 do
   # Extract the first word of "$ac_prog", so it can be a program name with args.
@@ -2684,7 +2769,7 @@
   IFS=$as_save_IFS
   test -z "$as_dir" && as_dir=.
     for ac_exec_ext in '' $ac_executable_extensions; do
-  if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
     ac_cv_prog_AWK="$ac_prog"
     $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
     break 2
@@ -2770,7 +2855,7 @@
 
 # Define the identity of the package.
  PACKAGE='opus'
- VERSION='1.1'
+ VERSION='1.1.2'
 
 
 # Some tools Automake needs.
@@ -2789,12 +2874,22 @@
 
 MAKEINFO=${MAKEINFO-"${am_missing_run}makeinfo"}
 
-# We need awk for the "check" target.  The system "awk" is bad on
-# some platforms.
+# For better backward compatibility.  To be removed once Automake 1.9.x
+# dies out for good.  For more background, see:
+# <http://lists.gnu.org/archive/html/automake/2012-07/msg00001.html>
+# <http://lists.gnu.org/archive/html/automake/2012-07/msg00014.html>
+mkdir_p='$(MKDIR_P)'
+
+# We need awk for the "check" target (and possibly the TAP driver).  The
+# system "awk" is bad on some platforms.
 # Always define AMTAR for backward compatibility.  Yes, it's still used
 # in the wild :-(  We should find a proper way to deprecate it ...
 AMTAR='$${TAR-tar}'
 
+
+# We'll loop over all known methods to create a tar archive until one works.
+_am_tools='gnutar  pax cpio none'
+
 am__tar='$${TAR-tar} chof - "$$tardir"' am__untar='$${TAR-tar} xf -'
 
 
@@ -2802,6 +2897,49 @@
 
 
 
+# POSIX will say in a future version that running "rm -f" with no argument
+# is OK; and we want to be able to make that assumption in our Makefile
+# recipes.  So use an aggressive probe to check that the usage we want is
+# actually supported "in the wild" to an acceptable degree.
+# See automake bug#10828.
+# To make any issue more visible, cause the running configure to be aborted
+# by default if the 'rm' program in use doesn't match our expectations; the
+# user can still override this though.
+if rm -f && rm -fr && rm -rf; then : OK; else
+  cat >&2 <<'END'
+Oops!
+
+Your 'rm' program seems unable to run without file operands specified
+on the command line, even when the '-f' option is present.  This is contrary
+to the behaviour of most rm programs out there, and not conforming with
+the upcoming POSIX standard: <http://austingroupbugs.net/view.php?id=542>
+
+Please tell bug-automake@gnu.org about your system, including the value
+of your $PATH and any error possibly output before this message.  This
+can help us improve future automake versions.
+
+END
+  if test x"$ACCEPT_INFERIOR_RM_PROGRAM" = x"yes"; then
+    echo 'Configuration will proceed anyway, since you have set the' >&2
+    echo 'ACCEPT_INFERIOR_RM_PROGRAM variable to "yes"' >&2
+    echo >&2
+  else
+    cat >&2 <<'END'
+Aborting the configuration process, to ensure you take notice of the issue.
+
+You can download and install GNU coreutils to get an 'rm' implementation
+that behaves properly: <http://www.gnu.org/software/coreutils/>.
+
+If you want to complete the configuration process using your problematic
+'rm' anyway, export the environment variable ACCEPT_INFERIOR_RM_PROGRAM
+to "yes", and re-run configure.
+
+END
+    as_fn_error $? "Your 'rm' program is bad, sorry." "$LINENO" 5
+  fi
+fi
+
+
 { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether to enable maintainer-specific portions of Makefiles" >&5
 $as_echo_n "checking whether to enable maintainer-specific portions of Makefiles... " >&6; }
     # Check whether --enable-maintainer-mode was given.
@@ -3015,7 +3153,7 @@
 _am_result=none
 # First try GNU make style include.
 echo "include confinc" > confmf
-# Ignore all kinds of additional output from `make'.
+# Ignore all kinds of additional output from 'make'.
 case `$am_make -s -f confmf 2> /dev/null` in #(
 *the\ am__doit\ target*)
   am__include=include
@@ -3081,7 +3219,7 @@
   IFS=$as_save_IFS
   test -z "$as_dir" && as_dir=.
     for ac_exec_ext in '' $ac_executable_extensions; do
-  if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
     ac_cv_prog_CC="${ac_tool_prefix}gcc"
     $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
     break 2
@@ -3121,7 +3259,7 @@
   IFS=$as_save_IFS
   test -z "$as_dir" && as_dir=.
     for ac_exec_ext in '' $ac_executable_extensions; do
-  if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
     ac_cv_prog_ac_ct_CC="gcc"
     $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
     break 2
@@ -3174,7 +3312,7 @@
   IFS=$as_save_IFS
   test -z "$as_dir" && as_dir=.
     for ac_exec_ext in '' $ac_executable_extensions; do
-  if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
     ac_cv_prog_CC="${ac_tool_prefix}cc"
     $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
     break 2
@@ -3215,7 +3353,7 @@
   IFS=$as_save_IFS
   test -z "$as_dir" && as_dir=.
     for ac_exec_ext in '' $ac_executable_extensions; do
-  if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
     if test "$as_dir/$ac_word$ac_exec_ext" = "/usr/ucb/cc"; then
        ac_prog_rejected=yes
        continue
@@ -3273,7 +3411,7 @@
   IFS=$as_save_IFS
   test -z "$as_dir" && as_dir=.
     for ac_exec_ext in '' $ac_executable_extensions; do
-  if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
     ac_cv_prog_CC="$ac_tool_prefix$ac_prog"
     $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
     break 2
@@ -3317,7 +3455,7 @@
   IFS=$as_save_IFS
   test -z "$as_dir" && as_dir=.
     for ac_exec_ext in '' $ac_executable_extensions; do
-  if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
     ac_cv_prog_ac_ct_CC="$ac_prog"
     $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
     break 2
@@ -3763,8 +3901,7 @@
 /* end confdefs.h.  */
 #include <stdarg.h>
 #include <stdio.h>
-#include <sys/types.h>
-#include <sys/stat.h>
+struct stat;
 /* Most of the following tests are stolen from RCS 5.7's src/conf.sh.  */
 struct buf { int x; };
 FILE * (*rcsopen) (struct buf *, struct stat *, int);
@@ -3849,6 +3986,65 @@
 ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
 ac_compiler_gnu=$ac_cv_c_compiler_gnu
 
+ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $CC understands -c and -o together" >&5
+$as_echo_n "checking whether $CC understands -c and -o together... " >&6; }
+if ${am_cv_prog_cc_c_o+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+  # Make sure it works both with $CC and with simple cc.
+  # Following AC_PROG_CC_C_O, we do the test twice because some
+  # compilers refuse to overwrite an existing .o file with -o,
+  # though they will create one.
+  am_cv_prog_cc_c_o=yes
+  for am_i in 1 2; do
+    if { echo "$as_me:$LINENO: $CC -c conftest.$ac_ext -o conftest2.$ac_objext" >&5
+   ($CC -c conftest.$ac_ext -o conftest2.$ac_objext) >&5 2>&5
+   ac_status=$?
+   echo "$as_me:$LINENO: \$? = $ac_status" >&5
+   (exit $ac_status); } \
+         && test -f conftest2.$ac_objext; then
+      : OK
+    else
+      am_cv_prog_cc_c_o=no
+      break
+    fi
+  done
+  rm -f core conftest*
+  unset am_i
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $am_cv_prog_cc_c_o" >&5
+$as_echo "$am_cv_prog_cc_c_o" >&6; }
+if test "$am_cv_prog_cc_c_o" != yes; then
+   # Losing compiler, so override with the script.
+   # FIXME: It is wrong to rewrite CC.
+   # But if we don't then we get into trouble of one sort or another.
+   # A longer-term fix would be to have automake use am__CC in this case,
+   # and then we could set am__CC="\$(top_srcdir)/compile \$(CC)"
+   CC="$am_aux_dir/compile $CC"
+fi
+ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+
+
 depcc="$CC"   am_compiler_list=
 
 { $as_echo "$as_me:${as_lineno-$LINENO}: checking dependency style of $depcc" >&5
@@ -3860,8 +4056,8 @@
   # We make a subdir and do the tests there.  Otherwise we can end up
   # making bogus files that we don't know about and never remove.  For
   # instance it was reported that on HP-UX the gcc test will end up
-  # making a dummy file named `D' -- because `-MD' means `put the output
-  # in D'.
+  # making a dummy file named 'D' -- because '-MD' means "put the output
+  # in D".
   rm -rf conftest.dir
   mkdir conftest.dir
   # Copy depcomp to subdir because otherwise we won't find it if we're
@@ -3896,16 +4092,16 @@
     : > sub/conftest.c
     for i in 1 2 3 4 5 6; do
       echo '#include "conftst'$i'.h"' >> sub/conftest.c
-      # Using `: > sub/conftst$i.h' creates only sub/conftst1.h with
-      # Solaris 8's {/usr,}/bin/sh.
-      touch sub/conftst$i.h
+      # Using ": > sub/conftst$i.h" creates only sub/conftst1.h with
+      # Solaris 10 /bin/sh.
+      echo '/* dummy */' > sub/conftst$i.h
     done
     echo "${am__include} ${am__quote}sub/conftest.Po${am__quote}" > confmf
 
-    # We check with `-c' and `-o' for the sake of the "dashmstdout"
+    # We check with '-c' and '-o' for the sake of the "dashmstdout"
     # mode.  It turns out that the SunPro C++ compiler does not properly
-    # handle `-M -o', and we need to detect this.  Also, some Intel
-    # versions had trouble with output in subdirs
+    # handle '-M -o', and we need to detect this.  Also, some Intel
+    # versions had trouble with output in subdirs.
     am__obj=sub/conftest.${OBJEXT-o}
     am__minus_obj="-o $am__obj"
     case $depmode in
@@ -3914,8 +4110,8 @@
       test "$am__universal" = false || continue
       ;;
     nosideeffect)
-      # after this tag, mechanisms are not by side-effect, so they'll
-      # only be used when explicitly requested
+      # After this tag, mechanisms are not by side-effect, so they'll
+      # only be used when explicitly requested.
       if test "x$enable_dependency_tracking" = xyes; then
 	continue
       else
@@ -3923,7 +4119,7 @@
       fi
       ;;
     msvc7 | msvc7msys | msvisualcpp | msvcmsys)
-      # This compiler won't grok `-c -o', but also, the minuso test has
+      # This compiler won't grok '-c -o', but also, the minuso test has
       # not run yet.  These depmodes are late enough in the game, and
       # so weak that their functioning should not be impacted.
       am__obj=conftest.${OBJEXT-o}
@@ -3999,7 +4195,7 @@
     for ac_prog in sed gsed; do
     for ac_exec_ext in '' $ac_executable_extensions; do
       ac_path_SED="$as_dir/$ac_prog$ac_exec_ext"
-      { test -f "$ac_path_SED" && $as_test_x "$ac_path_SED"; } || continue
+      as_fn_executable_p "$ac_path_SED" || continue
 # Check for GNU ac_path_SED and select it if it is found.
   # Check for GNU $ac_path_SED
 case `"$ac_path_SED" --version 2>&1` in
@@ -4075,7 +4271,7 @@
     for ac_prog in grep ggrep; do
     for ac_exec_ext in '' $ac_executable_extensions; do
       ac_path_GREP="$as_dir/$ac_prog$ac_exec_ext"
-      { test -f "$ac_path_GREP" && $as_test_x "$ac_path_GREP"; } || continue
+      as_fn_executable_p "$ac_path_GREP" || continue
 # Check for GNU ac_path_GREP and select it if it is found.
   # Check for GNU $ac_path_GREP
 case `"$ac_path_GREP" --version 2>&1` in
@@ -4141,7 +4337,7 @@
     for ac_prog in egrep; do
     for ac_exec_ext in '' $ac_executable_extensions; do
       ac_path_EGREP="$as_dir/$ac_prog$ac_exec_ext"
-      { test -f "$ac_path_EGREP" && $as_test_x "$ac_path_EGREP"; } || continue
+      as_fn_executable_p "$ac_path_EGREP" || continue
 # Check for GNU ac_path_EGREP and select it if it is found.
   # Check for GNU $ac_path_EGREP
 case `"$ac_path_EGREP" --version 2>&1` in
@@ -4208,7 +4404,7 @@
     for ac_prog in fgrep; do
     for ac_exec_ext in '' $ac_executable_extensions; do
       ac_path_FGREP="$as_dir/$ac_prog$ac_exec_ext"
-      { test -f "$ac_path_FGREP" && $as_test_x "$ac_path_FGREP"; } || continue
+      as_fn_executable_p "$ac_path_FGREP" || continue
 # Check for GNU ac_path_FGREP and select it if it is found.
   # Check for GNU $ac_path_FGREP
 case `"$ac_path_FGREP" --version 2>&1` in
@@ -4464,7 +4660,7 @@
   IFS=$as_save_IFS
   test -z "$as_dir" && as_dir=.
     for ac_exec_ext in '' $ac_executable_extensions; do
-  if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
     ac_cv_prog_DUMPBIN="$ac_tool_prefix$ac_prog"
     $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
     break 2
@@ -4508,7 +4704,7 @@
   IFS=$as_save_IFS
   test -z "$as_dir" && as_dir=.
     for ac_exec_ext in '' $ac_executable_extensions; do
-  if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
     ac_cv_prog_ac_ct_DUMPBIN="$ac_prog"
     $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
     break 2
@@ -4932,7 +5128,7 @@
   IFS=$as_save_IFS
   test -z "$as_dir" && as_dir=.
     for ac_exec_ext in '' $ac_executable_extensions; do
-  if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
     ac_cv_prog_OBJDUMP="${ac_tool_prefix}objdump"
     $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
     break 2
@@ -4972,7 +5168,7 @@
   IFS=$as_save_IFS
   test -z "$as_dir" && as_dir=.
     for ac_exec_ext in '' $ac_executable_extensions; do
-  if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
     ac_cv_prog_ac_ct_OBJDUMP="objdump"
     $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
     break 2
@@ -5278,7 +5474,7 @@
   IFS=$as_save_IFS
   test -z "$as_dir" && as_dir=.
     for ac_exec_ext in '' $ac_executable_extensions; do
-  if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
     ac_cv_prog_DLLTOOL="${ac_tool_prefix}dlltool"
     $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
     break 2
@@ -5318,7 +5514,7 @@
   IFS=$as_save_IFS
   test -z "$as_dir" && as_dir=.
     for ac_exec_ext in '' $ac_executable_extensions; do
-  if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
     ac_cv_prog_ac_ct_DLLTOOL="dlltool"
     $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
     break 2
@@ -5422,7 +5618,7 @@
   IFS=$as_save_IFS
   test -z "$as_dir" && as_dir=.
     for ac_exec_ext in '' $ac_executable_extensions; do
-  if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
     ac_cv_prog_AR="$ac_tool_prefix$ac_prog"
     $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
     break 2
@@ -5466,7 +5662,7 @@
   IFS=$as_save_IFS
   test -z "$as_dir" && as_dir=.
     for ac_exec_ext in '' $ac_executable_extensions; do
-  if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
     ac_cv_prog_ac_ct_AR="$ac_prog"
     $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
     break 2
@@ -5591,7 +5787,7 @@
   IFS=$as_save_IFS
   test -z "$as_dir" && as_dir=.
     for ac_exec_ext in '' $ac_executable_extensions; do
-  if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
     ac_cv_prog_STRIP="${ac_tool_prefix}strip"
     $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
     break 2
@@ -5631,7 +5827,7 @@
   IFS=$as_save_IFS
   test -z "$as_dir" && as_dir=.
     for ac_exec_ext in '' $ac_executable_extensions; do
-  if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
     ac_cv_prog_ac_ct_STRIP="strip"
     $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
     break 2
@@ -5690,7 +5886,7 @@
   IFS=$as_save_IFS
   test -z "$as_dir" && as_dir=.
     for ac_exec_ext in '' $ac_executable_extensions; do
-  if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
     ac_cv_prog_RANLIB="${ac_tool_prefix}ranlib"
     $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
     break 2
@@ -5730,7 +5926,7 @@
   IFS=$as_save_IFS
   test -z "$as_dir" && as_dir=.
     for ac_exec_ext in '' $ac_executable_extensions; do
-  if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
     ac_cv_prog_ac_ct_RANLIB="ranlib"
     $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
     break 2
@@ -6222,7 +6418,7 @@
   rm -rf conftest*
   ;;
 
-x86_64-*kfreebsd*-gnu|x86_64-*linux*|ppc*-*linux*|powerpc*-*linux*| \
+x86_64-*kfreebsd*-gnu|x86_64-*linux*|powerpc*-*linux*| \
 s390*-*linux*|s390*-*tpf*|sparc*-*linux*)
   # Find out which ABI we are using.
   echo 'int i;' > conftest.$ac_ext
@@ -6240,7 +6436,10 @@
 	  x86_64-*linux*)
 	    LD="${LD-ld} -m elf_i386"
 	    ;;
-	  ppc64-*linux*|powerpc64-*linux*)
+	  powerpc64le-*linux*)
+	    LD="${LD-ld} -m elf32lppclinux"
+	    ;;
+	  powerpc64-*linux*)
 	    LD="${LD-ld} -m elf32ppclinux"
 	    ;;
 	  s390x-*linux*)
@@ -6259,7 +6458,10 @@
 	  x86_64-*linux*)
 	    LD="${LD-ld} -m elf_x86_64"
 	    ;;
-	  ppc*-*linux*|powerpc*-*linux*)
+	  powerpcle-*linux*)
+	    LD="${LD-ld} -m elf64lppc"
+	    ;;
+	  powerpc-*linux*)
 	    LD="${LD-ld} -m elf64ppc"
 	    ;;
 	  s390*-*linux*|s390*-*tpf*)
@@ -6379,7 +6581,7 @@
   IFS=$as_save_IFS
   test -z "$as_dir" && as_dir=.
     for ac_exec_ext in '' $ac_executable_extensions; do
-  if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
     ac_cv_prog_MANIFEST_TOOL="${ac_tool_prefix}mt"
     $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
     break 2
@@ -6419,7 +6621,7 @@
   IFS=$as_save_IFS
   test -z "$as_dir" && as_dir=.
     for ac_exec_ext in '' $ac_executable_extensions; do
-  if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
     ac_cv_prog_ac_ct_MANIFEST_TOOL="mt"
     $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
     break 2
@@ -6499,7 +6701,7 @@
   IFS=$as_save_IFS
   test -z "$as_dir" && as_dir=.
     for ac_exec_ext in '' $ac_executable_extensions; do
-  if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
     ac_cv_prog_DSYMUTIL="${ac_tool_prefix}dsymutil"
     $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
     break 2
@@ -6539,7 +6741,7 @@
   IFS=$as_save_IFS
   test -z "$as_dir" && as_dir=.
     for ac_exec_ext in '' $ac_executable_extensions; do
-  if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
     ac_cv_prog_ac_ct_DSYMUTIL="dsymutil"
     $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
     break 2
@@ -6591,7 +6793,7 @@
   IFS=$as_save_IFS
   test -z "$as_dir" && as_dir=.
     for ac_exec_ext in '' $ac_executable_extensions; do
-  if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
     ac_cv_prog_NMEDIT="${ac_tool_prefix}nmedit"
     $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
     break 2
@@ -6631,7 +6833,7 @@
   IFS=$as_save_IFS
   test -z "$as_dir" && as_dir=.
     for ac_exec_ext in '' $ac_executable_extensions; do
-  if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
     ac_cv_prog_ac_ct_NMEDIT="nmedit"
     $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
     break 2
@@ -6683,7 +6885,7 @@
   IFS=$as_save_IFS
   test -z "$as_dir" && as_dir=.
     for ac_exec_ext in '' $ac_executable_extensions; do
-  if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
     ac_cv_prog_LIPO="${ac_tool_prefix}lipo"
     $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
     break 2
@@ -6723,7 +6925,7 @@
   IFS=$as_save_IFS
   test -z "$as_dir" && as_dir=.
     for ac_exec_ext in '' $ac_executable_extensions; do
-  if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
     ac_cv_prog_ac_ct_LIPO="lipo"
     $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
     break 2
@@ -6775,7 +6977,7 @@
   IFS=$as_save_IFS
   test -z "$as_dir" && as_dir=.
     for ac_exec_ext in '' $ac_executable_extensions; do
-  if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
     ac_cv_prog_OTOOL="${ac_tool_prefix}otool"
     $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
     break 2
@@ -6815,7 +7017,7 @@
   IFS=$as_save_IFS
   test -z "$as_dir" && as_dir=.
     for ac_exec_ext in '' $ac_executable_extensions; do
-  if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
     ac_cv_prog_ac_ct_OTOOL="otool"
     $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
     break 2
@@ -6867,7 +7069,7 @@
   IFS=$as_save_IFS
   test -z "$as_dir" && as_dir=.
     for ac_exec_ext in '' $ac_executable_extensions; do
-  if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
     ac_cv_prog_OTOOL64="${ac_tool_prefix}otool64"
     $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
     break 2
@@ -6907,7 +7109,7 @@
   IFS=$as_save_IFS
   test -z "$as_dir" && as_dir=.
     for ac_exec_ext in '' $ac_executable_extensions; do
-  if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
     ac_cv_prog_ac_ct_OTOOL64="otool64"
     $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
     break 2
@@ -11398,131 +11600,6 @@
 # Only expand once:
 
 
-if test "x$CC" != xcc; then
-  { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $CC and cc understand -c and -o together" >&5
-$as_echo_n "checking whether $CC and cc understand -c and -o together... " >&6; }
-else
-  { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether cc understands -c and -o together" >&5
-$as_echo_n "checking whether cc understands -c and -o together... " >&6; }
-fi
-set dummy $CC; ac_cc=`$as_echo "$2" |
-		      sed 's/[^a-zA-Z0-9_]/_/g;s/^[0-9]/_/'`
-if eval \${ac_cv_prog_cc_${ac_cc}_c_o+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
-
-int
-main ()
-{
-
-  ;
-  return 0;
-}
-_ACEOF
-# Make sure it works both with $CC and with simple cc.
-# We do the test twice because some compilers refuse to overwrite an
-# existing .o file with -o, though they will create one.
-ac_try='$CC -c conftest.$ac_ext -o conftest2.$ac_objext >&5'
-rm -f conftest2.*
-if { { case "(($ac_try" in
-  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
-  *) ac_try_echo=$ac_try;;
-esac
-eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
-$as_echo "$ac_try_echo"; } >&5
-  (eval "$ac_try") 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; } &&
-   test -f conftest2.$ac_objext && { { case "(($ac_try" in
-  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
-  *) ac_try_echo=$ac_try;;
-esac
-eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
-$as_echo "$ac_try_echo"; } >&5
-  (eval "$ac_try") 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; };
-then
-  eval ac_cv_prog_cc_${ac_cc}_c_o=yes
-  if test "x$CC" != xcc; then
-    # Test first that cc exists at all.
-    if { ac_try='cc -c conftest.$ac_ext >&5'
-  { { case "(($ac_try" in
-  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
-  *) ac_try_echo=$ac_try;;
-esac
-eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
-$as_echo "$ac_try_echo"; } >&5
-  (eval "$ac_try") 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; }; then
-      ac_try='cc -c conftest.$ac_ext -o conftest2.$ac_objext >&5'
-      rm -f conftest2.*
-      if { { case "(($ac_try" in
-  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
-  *) ac_try_echo=$ac_try;;
-esac
-eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
-$as_echo "$ac_try_echo"; } >&5
-  (eval "$ac_try") 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; } &&
-	 test -f conftest2.$ac_objext && { { case "(($ac_try" in
-  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
-  *) ac_try_echo=$ac_try;;
-esac
-eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
-$as_echo "$ac_try_echo"; } >&5
-  (eval "$ac_try") 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; };
-      then
-	# cc works too.
-	:
-      else
-	# cc exists but doesn't like -o.
-	eval ac_cv_prog_cc_${ac_cc}_c_o=no
-      fi
-    fi
-  fi
-else
-  eval ac_cv_prog_cc_${ac_cc}_c_o=no
-fi
-rm -f core conftest*
-
-fi
-if eval test \$ac_cv_prog_cc_${ac_cc}_c_o = yes; then
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
-else
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
-
-$as_echo "#define NO_MINUS_C_MINUS_O 1" >>confdefs.h
-
-fi
-
-# FIXME: we rely on the cache variable name because
-# there is no other way.
-set dummy $CC
-am_cc=`echo $2 | sed 's/[^a-zA-Z0-9_]/_/g;s/^[0-9]/_/'`
-eval am_t=\$ac_cv_prog_cc_${am_cc}_c_o
-if test "$am_t" != yes; then
-   # Losing compiler, so override with the script.
-   # FIXME: It is wrong to rewrite CC.
-   # But if we don't then we get into trouble of one sort or another.
-   # A longer-term fix would be to have automake use am__CC in this case,
-   # and then we could set am__CC="\$(top_srcdir)/compile \$(CC)"
-   CC="$am_aux_dir/compile $CC"
-fi
-
 
 
    { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $CC option to accept ISO C99" >&5
@@ -11671,7 +11748,7 @@
   return 0;
 }
 _ACEOF
-for ac_arg in '' -std=gnu99 -std=c99 -c99 -AC99 -xc99=all -qlanglvl=extc99
+for ac_arg in '' -std=gnu99 -std=c99 -c99 -AC99 -D_STDC_C99= -qlanglvl=extc99
 do
   CC="$ac_save_CC $ac_arg"
   if ac_fn_c_try_compile "$LINENO"; then :
@@ -11713,11 +11790,11 @@
 int
 main ()
 {
-/* FIXME: Include the comments suggested by Paul. */
+
 #ifndef __cplusplus
-  /* Ultrix mips cc rejects this.  */
+  /* Ultrix mips cc rejects this sort of thing.  */
   typedef int charset[2];
-  const charset cs;
+  const charset cs = { 0, 0 };
   /* SunOS 4.1.1 cc rejects this.  */
   char const *const *pcpcc;
   char **ppc;
@@ -11734,8 +11811,9 @@
   ++pcpcc;
   ppc = (char**) pcpcc;
   pcpcc = (char const *const *) ppc;
-  { /* SCO 3.2v4 cc rejects this.  */
-    char *t;
+  { /* SCO 3.2v4 cc rejects this sort of thing.  */
+    char tx;
+    char *t = &tx;
     char const *s = 0 ? (char *) 0 : (char const *) 0;
 
     *t++ = 0;
@@ -11751,10 +11829,10 @@
     iptr p = 0;
     ++p;
   }
-  { /* AIX XL C 1.02.0.0 rejects this saying
+  { /* AIX XL C 1.02.0.0 rejects this sort of thing, saying
        "k.c", line 2.27: 1506-025 (S) Operand must be a modifiable lvalue. */
-    struct s { int j; const int *ap[3]; };
-    struct s *b; b->j = 5;
+    struct s { int j; const int *ap[3]; } bx;
+    struct s *b = &bx; b->j = 5;
   }
   { /* ULTRIX-32 V3.1 (Rev 9) vcc rejects this */
     const int foo = 10;
@@ -11843,8 +11921,8 @@
   # We make a subdir and do the tests there.  Otherwise we can end up
   # making bogus files that we don't know about and never remove.  For
   # instance it was reported that on HP-UX the gcc test will end up
-  # making a dummy file named `D' -- because `-MD' means `put the output
-  # in D'.
+  # making a dummy file named 'D' -- because '-MD' means "put the output
+  # in D".
   rm -rf conftest.dir
   mkdir conftest.dir
   # Copy depcomp to subdir because otherwise we won't find it if we're
@@ -11877,16 +11955,16 @@
     : > sub/conftest.c
     for i in 1 2 3 4 5 6; do
       echo '#include "conftst'$i'.h"' >> sub/conftest.c
-      # Using `: > sub/conftst$i.h' creates only sub/conftst1.h with
-      # Solaris 8's {/usr,}/bin/sh.
-      touch sub/conftst$i.h
+      # Using ": > sub/conftst$i.h" creates only sub/conftst1.h with
+      # Solaris 10 /bin/sh.
+      echo '/* dummy */' > sub/conftst$i.h
     done
     echo "${am__include} ${am__quote}sub/conftest.Po${am__quote}" > confmf
 
-    # We check with `-c' and `-o' for the sake of the "dashmstdout"
+    # We check with '-c' and '-o' for the sake of the "dashmstdout"
     # mode.  It turns out that the SunPro C++ compiler does not properly
-    # handle `-M -o', and we need to detect this.  Also, some Intel
-    # versions had trouble with output in subdirs
+    # handle '-M -o', and we need to detect this.  Also, some Intel
+    # versions had trouble with output in subdirs.
     am__obj=sub/conftest.${OBJEXT-o}
     am__minus_obj="-o $am__obj"
     case $depmode in
@@ -11895,8 +11973,8 @@
       test "$am__universal" = false || continue
       ;;
     nosideeffect)
-      # after this tag, mechanisms are not by side-effect, so they'll
-      # only be used when explicitly requested
+      # After this tag, mechanisms are not by side-effect, so they'll
+      # only be used when explicitly requested.
       if test "x$enable_dependency_tracking" = xyes; then
 	continue
       else
@@ -11904,7 +11982,7 @@
       fi
       ;;
     msvc7 | msvc7msys | msvisualcpp | msvcmsys)
-      # This compiler won't grok `-c -o', but also, the minuso test has
+      # This compiler won't grok '-c -o', but also, the minuso test has
       # not run yet.  These depmodes are late enough in the game, and
       # so weak that their functioning should not be impacted.
       am__obj=conftest.${OBJEXT-o}
@@ -12363,12 +12441,20 @@
 fi
 
 
+# Check whether --enable-intrinsics was given.
+if test "${enable_intrinsics+set}" = set; then :
+  enableval=$enable_intrinsics;
+else
+  enable_intrinsics=no
+fi
+
+
 rtcd_support=no
 cpu_arm=no
 
 if test x"${enable_asm}" = x"yes"; then :
 
-    inline_optimization="No ASM for your platform, please send patches"
+    inline_optimization="No inline ASM for your platform, please send patches"
     case $host_cpu in
       arm*)
                 if test "$enable_float" != "yes"; then :
@@ -12549,7 +12635,7 @@
   IFS=$as_save_IFS
   test -z "$as_dir" && as_dir=.
     for ac_exec_ext in '' $ac_executable_extensions; do
-  if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
     ac_cv_prog_HAVE_PERL="yes"
     $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
     break 2
@@ -12779,6 +12865,31 @@
   rtcd_support="no"
 
 fi
+                { $as_echo "$as_me:${as_lineno-$LINENO}: checking for apple style tools" >&5
+$as_echo_n "checking for apple style tools... " >&6; }
+                cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+#ifndef __APPLE__
+#error 1
+#endif
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_cpp "$LINENO"; then :
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }; ARM2GNU_PARAMS="--apple"
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }; ARM2GNU_PARAMS=""
+fi
+rm -f conftest.err conftest.i conftest.$ac_ext
+
 
 else
 
@@ -12799,6 +12910,855 @@
 
 fi
 
+ if test x"${inline_optimization%% *}" = x"ARM"; then
+  OPUS_ARM_INLINE_ASM_TRUE=
+  OPUS_ARM_INLINE_ASM_FALSE='#'
+else
+  OPUS_ARM_INLINE_ASM_TRUE='#'
+  OPUS_ARM_INLINE_ASM_FALSE=
+fi
+
+ if test x"${asm_optimization%% *}" = x"ARM"; then
+  OPUS_ARM_EXTERNAL_ASM_TRUE=
+  OPUS_ARM_EXTERNAL_ASM_FALSE='#'
+else
+  OPUS_ARM_EXTERNAL_ASM_TRUE='#'
+  OPUS_ARM_EXTERNAL_ASM_FALSE=
+fi
+
+
+ if false; then
+  HAVE_SSE_TRUE=
+  HAVE_SSE_FALSE='#'
+else
+  HAVE_SSE_TRUE='#'
+  HAVE_SSE_FALSE=
+fi
+
+ if false; then
+  HAVE_SSE2_TRUE=
+  HAVE_SSE2_FALSE='#'
+else
+  HAVE_SSE2_TRUE='#'
+  HAVE_SSE2_FALSE=
+fi
+
+ if false; then
+  HAVE_SSE4_1_TRUE=
+  HAVE_SSE4_1_FALSE='#'
+else
+  HAVE_SSE4_1_TRUE='#'
+  HAVE_SSE4_1_FALSE=
+fi
+
+ if false; then
+  HAVE_AVX_TRUE=
+  HAVE_AVX_FALSE='#'
+else
+  HAVE_AVX_TRUE='#'
+  HAVE_AVX_FALSE=
+fi
+
+
+
+
+
+
+
+# With GCC on ARM32 softfp architectures (e.g. Android, or older Ubuntu) you need to specify
+# -mfloat-abi=softfp for -mfpu=neon to work.  However, on ARM32 hardfp architectures (e.g. newer Ubuntu),
+# this option will break things.
+
+# As a heuristic, if host matches arm*eabi* but not arm*hf*, it's probably soft-float.
+
+
+case $host in #(
+  arm*hf*) :
+    RESOLVED_DEFAULT_ARM_NEON_INTR_CFLAGS="-mfpu=neon" ;; #(
+  arm*eabi*) :
+    RESOLVED_DEFAULT_ARM_NEON_INTR_CFLAGS="-mfpu=neon -mfloat-abi=softfp" ;; #(
+  *) :
+    RESOLVED_DEFAULT_ARM_NEON_INTR_CFLAGS="-mfpu=neon" ;;
+esac
+
+
+
+
+
+
+
+if ${X86_SSE_CFLAGS+:} false; then :
+
+else
+  X86_SSE_CFLAGS="-msse"
+fi
+if ${X86_SSE2_CFLAGS+:} false; then :
+
+else
+  X86_SSE2_CFLAGS="-msse2"
+fi
+if ${X86_SSE4_1_CFLAGS+:} false; then :
+
+else
+  X86_SSE4_1_CFLAGS="-msse4.1"
+fi
+if ${X86_AVX_CFLAGS+:} false; then :
+
+else
+  X86_AVX_CFLAGS="-mavx"
+fi
+if ${ARM_NEON_INTR_CFLAGS+:} false; then :
+
+else
+  ARM_NEON_INTR_CFLAGS="$RESOLVED_DEFAULT_ARM_NEON_INTR_CFLAGS"
+fi
+
+
+
+if test x"$enable_intrinsics" = x"yes"; then :
+
+   intrinsics_support=""
+   case $host_cpu in #(
+  arm*) :
+
+      cpu_arm=yes
+
+   { $as_echo "$as_me:${as_lineno-$LINENO}: checking if compiler supports ARM Neon intrinsics" >&5
+$as_echo_n "checking if compiler supports ARM Neon intrinsics... " >&6; }
+   cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <arm_neon.h>
+
+int
+main ()
+{
+
+            static float32x4_t A0, A1, SUMM;
+            SUMM = vmlaq_f32(SUMM, A0, A1);
+
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+
+        OPUS_ARM_MAY_HAVE_NEON_INTR=1
+        OPUS_ARM_PRESUME_NEON_INTR=1
+        { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+
+else
+
+        OPUS_ARM_PRESUME_NEON_INTR=0
+        { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+        { $as_echo "$as_me:${as_lineno-$LINENO}: checking if compiler supports ARM Neon intrinsics with $ARM_NEON_INTR_CFLAGS" >&5
+$as_echo_n "checking if compiler supports ARM Neon intrinsics with $ARM_NEON_INTR_CFLAGS... " >&6; }
+        save_CFLAGS="$CFLAGS"; CFLAGS="$ARM_NEON_INTR_CFLAGS $CFLAGS"
+        cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <arm_neon.h>
+
+int
+main ()
+{
+
+            static float32x4_t A0, A1, SUMM;
+            SUMM = vmlaq_f32(SUMM, A0, A1);
+
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+
+           { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+           OPUS_ARM_MAY_HAVE_NEON_INTR=1
+
+else
+
+           { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+           OPUS_ARM_MAY_HAVE_NEON_INTR=0
+
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+        CFLAGS="$save_CFLAGS"
+
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+
+      if test x"$OPUS_ARM_MAY_HAVE_NEON_INTR" = x"1" && test x"$OPUS_ARM_PRESUME_NEON_INTR" != x"1"; then :
+
+             OPUS_ARM_NEON_INTR_CFLAGS="$ARM_NEON_INTR_CFLAGS"
+
+
+
+fi
+
+      if test x"$OPUS_ARM_MAY_HAVE_NEON_INTR" = x"1"; then :
+
+
+$as_echo "#define OPUS_ARM_MAY_HAVE_NEON_INTR 1" >>confdefs.h
+
+         intrinsics_support="$intrinsics_support (Neon_Intrinsics)"
+
+         if test x"$enable_rtcd" != x"no" && test x"$OPUS_ARM_PRESUME_NEON_INTR" != x"1"; then : # honour --disable-rtcd; NEON not presumed => needs run-time detection
+  rtcd_support="$rtcd_support (ARMv7_Neon_Intrinsics)"
+fi
+
+         if test x"$OPUS_ARM_PRESUME_NEON_INTR" = x"1"; then :
+
+$as_echo "#define OPUS_ARM_PRESUME_NEON_INTR 1" >>confdefs.h
+
+fi
+
+
+
+# Check whether --with-NE10 was given.
+if test "${with_NE10+set}" = set; then :
+  withval=$with_NE10; NE10_prefix="$withval"
+else
+  NE10_prefix=""
+fi
+
+
+# Check whether --with-NE10-libraries was given.
+if test "${with_NE10_libraries+set}" = set; then :
+  withval=$with_NE10_libraries; NE10_libraries="$withval"
+else
+  NE10_libraries=""
+fi
+
+
+# Check whether --with-NE10-includes was given.
+if test "${with_NE10_includes+set}" = set; then :
+  withval=$with_NE10_includes; NE10_includes="$withval"
+else
+  NE10_includes=""
+fi
+
+
+      if test "x$NE10_libraries" != "x" ; then
+         NE10_LIBS="-L$NE10_libraries"
+      elif test "x$NE10_prefix" = "xno" || test "x$NE10_prefix" = "xyes" ; then
+         NE10_LIBS=""
+      elif test "x$NE10_prefix" != "x" ; then
+         NE10_LIBS="-L$NE10_prefix/lib"
+      elif test "x$prefix" != "xNONE" ; then
+         NE10_LIBS="-L$prefix/lib"
+      fi
+
+      if test "x$NE10_prefix" != "xno" ; then
+         NE10_LIBS="$NE10_LIBS -lNE10"
+      fi
+
+      if test "x$NE10_includes" != "x" ; then  # explicit --with-NE10-includes wins
+         NE10_CFLAGS="-I$NE10_includes"
+      elif test "x$NE10_prefix" = "xno" || test "x$NE10_prefix" = "xyes" ; then  # bare --with/--without: no extra path
+         NE10_CFLAGS=""
+      elif test "x$NE10_prefix" != "x" ; then  # --with-NE10=DIR: headers under DIR/include (mirrors NE10_LIBS)
+         NE10_CFLAGS="-I$NE10_prefix/include"
+      elif test "x$prefix" != "xNONE"; then  # fall back to the install prefix
+         NE10_CFLAGS="-I$prefix/include"
+      fi
+
+      { $as_echo "$as_me:${as_lineno-$LINENO}: checking for NE10" >&5
+$as_echo_n "checking for NE10... " >&6; }
+      save_CFLAGS="$CFLAGS"; CFLAGS="$CFLAGS $NE10_CFLAGS"  # keep the user's flags (sysroot, -march, float ABI) for the probe
+      save_LIBS="$LIBS"; LIBS="$NE10_LIBS $LIBM"  # both are restored from save_* after the link test
+      cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+            #include <NE10_init.h>
+
+int
+main ()
+{
+
+                  ne10_fft_cfg_float32_t cfg;
+                  cfg = ne10_fft_alloc_c2c_float32_neon(480);
+
+
+  ;
+  return 0;
+}
+
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+
+            HAVE_ARM_NE10=1
+            { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+
+else
+
+            HAVE_ARM_NE10=0
+            { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+            NE10_CFLAGS=""
+            NE10_LIBS=""
+
+
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+      CFLAGS="$save_CFLAGS"; LIBS="$save_LIBS"
+      #Now we know if libNE10 is installed or not
+      if test x"$HAVE_ARM_NE10" = x"1"; then :
+
+
+$as_echo "#define HAVE_ARM_NE10 1" >>confdefs.h
+
+
+
+
+
+
+fi
+
+
+         if test x"$NE10_LIBS" != x""; then :
+
+              intrinsics_support="$intrinsics_support (NE10)"
+              if test x"$enable_rtcd" != x"no" \
+               && test x"$OPUS_ARM_PRESUME_NEON_INTR" != x"1"; then : # honour --disable-rtcd; NEON not presumed => NE10 path is selected at run time
+  rtcd_support="$rtcd_support (NE10)"
+fi
+
+fi
+
+         if test x"$rtcd_support" = x""; then :
+  rtcd_support=no
+fi
+
+         if test x"$intrinsics_support" = x""; then :
+  intrinsics_support=no
+else
+  intrinsics_support="arm$intrinsics_support"
+fi
+
+else
+
+         { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: Compiler does not support ARM intrinsics" >&5
+$as_echo "$as_me: WARNING: Compiler does not support ARM intrinsics" >&2;}
+         intrinsics_support=no
+
+fi
+    ;; #(
+  i?86|x86_64) :
+
+
+   { $as_echo "$as_me:${as_lineno-$LINENO}: checking if compiler supports SSE intrinsics" >&5
+$as_echo_n "checking if compiler supports SSE intrinsics... " >&6; }
+   cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <xmmintrin.h>
+
+int
+main ()
+{
+
+             static __m128 mtest;
+             mtest = _mm_setzero_ps();
+
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+
+        OPUS_X86_MAY_HAVE_SSE=1
+        OPUS_X86_PRESUME_SSE=1
+        { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+
+else
+
+        OPUS_X86_PRESUME_SSE=0
+        { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+        { $as_echo "$as_me:${as_lineno-$LINENO}: checking if compiler supports SSE intrinsics with $X86_SSE_CFLAGS" >&5
+$as_echo_n "checking if compiler supports SSE intrinsics with $X86_SSE_CFLAGS... " >&6; }
+        save_CFLAGS="$CFLAGS"; CFLAGS="$X86_SSE_CFLAGS $CFLAGS"
+        cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <xmmintrin.h>
+
+int
+main ()
+{
+
+             static __m128 mtest;
+             mtest = _mm_setzero_ps();
+
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+
+           { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+           OPUS_X86_MAY_HAVE_SSE=1
+
+else
+
+           { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+           OPUS_X86_MAY_HAVE_SSE=0
+
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+        CFLAGS="$save_CFLAGS"
+
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+
+      if test x"$OPUS_X86_MAY_HAVE_SSE" = x"1" && test x"$OPUS_X86_PRESUME_SSE" != x"1"; then :
+
+             OPUS_X86_SSE_CFLAGS="$X86_SSE_CFLAGS"
+
+
+
+fi
+
+   { $as_echo "$as_me:${as_lineno-$LINENO}: checking if compiler supports SSE2 intrinsics" >&5
+$as_echo_n "checking if compiler supports SSE2 intrinsics... " >&6; }
+   cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <emmintrin.h>
+
+int
+main ()
+{
+
+             static __m128i mtest;
+             mtest = _mm_setzero_si128();
+
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+
+        OPUS_X86_MAY_HAVE_SSE2=1
+        OPUS_X86_PRESUME_SSE2=1
+        { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+
+else
+
+        OPUS_X86_PRESUME_SSE2=0
+        { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+        { $as_echo "$as_me:${as_lineno-$LINENO}: checking if compiler supports SSE2 intrinsics with $X86_SSE2_CFLAGS" >&5
+$as_echo_n "checking if compiler supports SSE2 intrinsics with $X86_SSE2_CFLAGS... " >&6; }
+        save_CFLAGS="$CFLAGS"; CFLAGS="$X86_SSE2_CFLAGS $CFLAGS"
+        cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <emmintrin.h>
+
+int
+main ()
+{
+
+             static __m128i mtest;
+             mtest = _mm_setzero_si128();
+
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+
+           { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+           OPUS_X86_MAY_HAVE_SSE2=1
+
+else
+
+           { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+           OPUS_X86_MAY_HAVE_SSE2=0
+
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+        CFLAGS="$save_CFLAGS"
+
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+
+      if test x"$OPUS_X86_MAY_HAVE_SSE2" = x"1" && test x"$OPUS_X86_PRESUME_SSE2" != x"1"; then :
+
+             OPUS_X86_SSE2_CFLAGS="$X86_SSE2_CFLAGS"
+
+
+
+fi
+
+   { $as_echo "$as_me:${as_lineno-$LINENO}: checking if compiler supports SSE4.1 intrinsics" >&5
+$as_echo_n "checking if compiler supports SSE4.1 intrinsics... " >&6; }
+   cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <smmintrin.h>
+
+int
+main ()
+{
+
+            static __m128i mtest;
+            mtest = _mm_setzero_si128();
+            mtest = _mm_cmpeq_epi64(mtest, mtest);
+
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+
+        OPUS_X86_MAY_HAVE_SSE4_1=1
+        OPUS_X86_PRESUME_SSE4_1=1
+        { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+
+else
+
+        OPUS_X86_PRESUME_SSE4_1=0
+        { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+        { $as_echo "$as_me:${as_lineno-$LINENO}: checking if compiler supports SSE4.1 intrinsics with $X86_SSE4_1_CFLAGS" >&5
+$as_echo_n "checking if compiler supports SSE4.1 intrinsics with $X86_SSE4_1_CFLAGS... " >&6; }
+        save_CFLAGS="$CFLAGS"; CFLAGS="$X86_SSE4_1_CFLAGS $CFLAGS"
+        cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <smmintrin.h>
+
+int
+main ()
+{
+
+            static __m128i mtest;
+            mtest = _mm_setzero_si128();
+            mtest = _mm_cmpeq_epi64(mtest, mtest);
+
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+
+           { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+           OPUS_X86_MAY_HAVE_SSE4_1=1
+
+else
+
+           { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+           OPUS_X86_MAY_HAVE_SSE4_1=0
+
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+        CFLAGS="$save_CFLAGS"
+
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+
+      if test x"$OPUS_X86_MAY_HAVE_SSE4_1" = x"1" && test x"$OPUS_X86_PRESUME_SSE4_1" != x"1"; then :
+
+             OPUS_X86_SSE4_1_CFLAGS="$X86_SSE4_1_CFLAGS"
+
+
+
+fi
+
+   { $as_echo "$as_me:${as_lineno-$LINENO}: checking if compiler supports AVX intrinsics" >&5
+$as_echo_n "checking if compiler supports AVX intrinsics... " >&6; }
+   cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <immintrin.h>
+
+int
+main ()
+{
+
+            static __m256 mtest;
+            mtest = _mm256_setzero_ps();
+
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+
+        OPUS_X86_MAY_HAVE_AVX=1
+        OPUS_X86_PRESUME_AVX=1
+        { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+
+else
+
+        OPUS_X86_PRESUME_AVX=0
+        { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+        { $as_echo "$as_me:${as_lineno-$LINENO}: checking if compiler supports AVX intrinsics with $X86_AVX_CFLAGS" >&5
+$as_echo_n "checking if compiler supports AVX intrinsics with $X86_AVX_CFLAGS... " >&6; }
+        save_CFLAGS="$CFLAGS"; CFLAGS="$X86_AVX_CFLAGS $CFLAGS"
+        cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <immintrin.h>
+
+int
+main ()
+{
+
+            static __m256 mtest;
+            mtest = _mm256_setzero_ps();
+
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+
+           { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+           OPUS_X86_MAY_HAVE_AVX=1
+
+else
+
+           { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+           OPUS_X86_MAY_HAVE_AVX=0
+
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+        CFLAGS="$save_CFLAGS"
+
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+
+      if test x"$OPUS_X86_MAY_HAVE_AVX" = x"1" && test x"$OPUS_X86_PRESUME_AVX" != x"1"; then :
+
+             OPUS_X86_AVX_CFLAGS="$X86_AVX_CFLAGS"
+
+
+
+fi
+         if test x"$rtcd_support" = x"no"; then :
+  rtcd_support=""
+fi
+         if test x"$OPUS_X86_MAY_HAVE_SSE" = x"1"; then :
+
+
+$as_echo "#define OPUS_X86_MAY_HAVE_SSE 1" >>confdefs.h
+
+            intrinsics_support="$intrinsics_support SSE"
+
+            if test x"$OPUS_X86_PRESUME_SSE" = x"1"; then :
+
+$as_echo "#define OPUS_X86_PRESUME_SSE 1" >>confdefs.h
+
+else
+  rtcd_support="$rtcd_support SSE"
+fi
+
+else
+
+            { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: Compiler does not support SSE intrinsics" >&5
+$as_echo "$as_me: WARNING: Compiler does not support SSE intrinsics" >&2;}
+
+fi
+
+         if test x"$OPUS_X86_MAY_HAVE_SSE2" = x"1"; then :
+
+
+$as_echo "#define OPUS_X86_MAY_HAVE_SSE2 1" >>confdefs.h
+
+            intrinsics_support="$intrinsics_support SSE2"
+
+            if test x"$OPUS_X86_PRESUME_SSE2" = x"1"; then :
+
+$as_echo "#define OPUS_X86_PRESUME_SSE2 1" >>confdefs.h
+
+else
+  rtcd_support="$rtcd_support SSE2"
+fi
+
+else
+
+            { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: Compiler does not support SSE2 intrinsics" >&5
+$as_echo "$as_me: WARNING: Compiler does not support SSE2 intrinsics" >&2;}
+
+fi
+
+         if test x"$OPUS_X86_MAY_HAVE_SSE4_1" = x"1"; then :
+
+
+$as_echo "#define OPUS_X86_MAY_HAVE_SSE4_1 1" >>confdefs.h
+
+            intrinsics_support="$intrinsics_support SSE4.1"
+
+            if test x"$OPUS_X86_PRESUME_SSE4_1" = x"1"; then :
+
+$as_echo "#define OPUS_X86_PRESUME_SSE4_1 1" >>confdefs.h
+
+else
+  rtcd_support="$rtcd_support SSE4.1"
+fi
+
+else
+
+            { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: Compiler does not support SSE4.1 intrinsics" >&5
+$as_echo "$as_me: WARNING: Compiler does not support SSE4.1 intrinsics" >&2;}
+
+fi
+         if test x"$OPUS_X86_MAY_HAVE_AVX" = x"1"; then :
+
+
+$as_echo "#define OPUS_X86_MAY_HAVE_AVX 1" >>confdefs.h
+
+            intrinsics_support="$intrinsics_support AVX"
+
+            if test x"$OPUS_X86_PRESUME_AVX" = x"1"; then :
+
+$as_echo "#define OPUS_X86_PRESUME_AVX 1" >>confdefs.h
+
+else
+  rtcd_support="$rtcd_support AVX"
+fi
+
+else
+
+            { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: Compiler does not support AVX intrinsics" >&5
+$as_echo "$as_me: WARNING: Compiler does not support AVX intrinsics" >&2;}
+
+fi
+
+         if test x"$intrinsics_support" = x""; then :
+  intrinsics_support=no
+else
+  intrinsics_support="x86$intrinsics_support"
+
+fi
+         if test x"$rtcd_support" = x""; then :
+  rtcd_support=no
+elif rtcd_support="x86$rtcd_support"; then :
+
+fi
+
+    if test x"$enable_rtcd" = x"yes" && test x"$rtcd_support" != x""; then :
+
+            get_cpuid_by_asm="no"
+            { $as_echo "$as_me:${as_lineno-$LINENO}: checking How to get X86 CPU Info" >&5
+$as_echo_n "checking How to get X86 CPU Info... " >&6; }
+            cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+                 #include <stdio.h>
+
+int
+main ()
+{
+
+                 unsigned int CPUInfo0;
+                 unsigned int CPUInfo1;
+                 unsigned int CPUInfo2;
+                 unsigned int CPUInfo3;
+                 unsigned int InfoType;
+                 __asm__ __volatile__ (
+                 "cpuid":
+                 "=a" (CPUInfo0),
+                 "=b" (CPUInfo1),
+                 "=c" (CPUInfo2),
+                 "=d" (CPUInfo3) :
+                 "a" (InfoType), "c" (0)
+                );
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  get_cpuid_by_asm="yes"
+             { $as_echo "$as_me:${as_lineno-$LINENO}: result: Inline Assembly" >&5
+$as_echo "Inline Assembly" >&6; }
+
+$as_echo "#define CPU_INFO_BY_ASM 1" >>confdefs.h
+
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+                 #include <cpuid.h>
+
+int
+main ()
+{
+
+                 unsigned int CPUInfo0;
+                 unsigned int CPUInfo1;
+                 unsigned int CPUInfo2;
+                 unsigned int CPUInfo3;
+                 unsigned int InfoType;
+                 __get_cpuid(InfoType, &CPUInfo0, &CPUInfo1, &CPUInfo2, &CPUInfo3);
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: C method" >&5
+$as_echo "C method" >&6; }
+
+$as_echo "#define CPU_INFO_BY_C 1" >>confdefs.h
+
+else
+  as_fn_error $? "no supported Get CPU Info method, please disable intrinsics" "$LINENO" 5
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+fi
+    ;; #(
+  *) :
+
+      { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: No intrinsics support for your architecture" >&5
+$as_echo "$as_me: WARNING: No intrinsics support for your architecture" >&2;}
+      intrinsics_support="no"
+    ;;
+esac
+
+else
+
+   intrinsics_support="no"
+
+fi
+
  if test "$cpu_arm" = "yes"; then
   CPU_ARM_TRUE=
   CPU_ARM_FALSE='#'
@@ -12807,20 +13767,52 @@
   CPU_ARM_FALSE=
 fi
 
- if test x"${inline_optimization:0:3}" = x"ARM"; then
-  OPUS_ARM_INLINE_ASM_TRUE=
-  OPUS_ARM_INLINE_ASM_FALSE='#'
+ if test x"$OPUS_ARM_MAY_HAVE_NEON_INTR" = x"1"; then
+  OPUS_ARM_NEON_INTR_TRUE=
+  OPUS_ARM_NEON_INTR_FALSE='#'
 else
-  OPUS_ARM_INLINE_ASM_TRUE='#'
-  OPUS_ARM_INLINE_ASM_FALSE=
+  OPUS_ARM_NEON_INTR_TRUE='#'
+  OPUS_ARM_NEON_INTR_FALSE=
 fi
 
- if test x"${asm_optimization:0:3}" = x"ARM"; then
-  OPUS_ARM_EXTERNAL_ASM_TRUE=
-  OPUS_ARM_EXTERNAL_ASM_FALSE='#'
+ if test x"$HAVE_ARM_NE10" = x"1"; then
+  HAVE_ARM_NE10_TRUE=
+  HAVE_ARM_NE10_FALSE='#'
 else
-  OPUS_ARM_EXTERNAL_ASM_TRUE='#'
-  OPUS_ARM_EXTERNAL_ASM_FALSE=
+  HAVE_ARM_NE10_TRUE='#'
+  HAVE_ARM_NE10_FALSE=
+fi
+
+ if test x"$OPUS_X86_MAY_HAVE_SSE" = x"1"; then
+  HAVE_SSE_TRUE=
+  HAVE_SSE_FALSE='#'
+else
+  HAVE_SSE_TRUE='#'
+  HAVE_SSE_FALSE=
+fi
+
+ if test x"$OPUS_X86_MAY_HAVE_SSE2" = x"1"; then
+  HAVE_SSE2_TRUE=
+  HAVE_SSE2_FALSE='#'
+else
+  HAVE_SSE2_TRUE='#'
+  HAVE_SSE2_FALSE=
+fi
+
+ if test x"$OPUS_X86_MAY_HAVE_SSE4_1" = x"1"; then
+  HAVE_SSE4_1_TRUE=
+  HAVE_SSE4_1_FALSE='#'
+else
+  HAVE_SSE4_1_TRUE='#'
+  HAVE_SSE4_1_FALSE=
+fi
+
+ if test x"$OPUS_X86_MAY_HAVE_AVX" = x"1"; then
+  HAVE_AVX_TRUE=
+  HAVE_AVX_FALSE='#'
+else
+  HAVE_AVX_TRUE='#'
+  HAVE_AVX_FALSE=
 fi
 
 
@@ -12900,7 +13892,7 @@
   IFS=$as_save_IFS
   test -z "$as_dir" && as_dir=.
     for ac_exec_ext in '' $ac_executable_extensions; do
-  if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
     ac_cv_prog_HAVE_DOXYGEN="yes"
     $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
     break 2
@@ -13151,6 +14143,14 @@
 LTLIBOBJS=$ac_ltlibobjs
 
 
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking that generated files are newer than configure" >&5
+$as_echo_n "checking that generated files are newer than configure... " >&6; }
+   if test -n "$am_sleep_pid"; then
+     # Hide warnings about reused PIDs.
+     wait $am_sleep_pid 2>/dev/null
+   fi
+   { $as_echo "$as_me:${as_lineno-$LINENO}: result: done" >&5
+$as_echo "done" >&6; }
  if test -n "$EXEEXT"; then
   am__EXEEXT_TRUE=
   am__EXEEXT_FALSE='#'
@@ -13195,10 +14195,6 @@
   as_fn_error $? "conditional \"OPUS_ARM_EXTERNAL_ASM\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
 fi
-if test -z "${CPU_ARM_TRUE}" && test -z "${CPU_ARM_FALSE}"; then
-  as_fn_error $? "conditional \"CPU_ARM\" was never defined.
-Usually this means the macro was only invoked conditionally." "$LINENO" 5
-fi
 if test -z "${OPUS_ARM_INLINE_ASM_TRUE}" && test -z "${OPUS_ARM_INLINE_ASM_FALSE}"; then
   as_fn_error $? "conditional \"OPUS_ARM_INLINE_ASM\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
@@ -13207,6 +14203,50 @@
   as_fn_error $? "conditional \"OPUS_ARM_EXTERNAL_ASM\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
 fi
+if test -z "${HAVE_SSE_TRUE}" && test -z "${HAVE_SSE_FALSE}"; then
+  as_fn_error $? "conditional \"HAVE_SSE\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
+if test -z "${HAVE_SSE2_TRUE}" && test -z "${HAVE_SSE2_FALSE}"; then
+  as_fn_error $? "conditional \"HAVE_SSE2\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
+if test -z "${HAVE_SSE4_1_TRUE}" && test -z "${HAVE_SSE4_1_FALSE}"; then
+  as_fn_error $? "conditional \"HAVE_SSE4_1\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
+if test -z "${HAVE_AVX_TRUE}" && test -z "${HAVE_AVX_FALSE}"; then
+  as_fn_error $? "conditional \"HAVE_AVX\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
+if test -z "${CPU_ARM_TRUE}" && test -z "${CPU_ARM_FALSE}"; then
+  as_fn_error $? "conditional \"CPU_ARM\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
+if test -z "${OPUS_ARM_NEON_INTR_TRUE}" && test -z "${OPUS_ARM_NEON_INTR_FALSE}"; then
+  as_fn_error $? "conditional \"OPUS_ARM_NEON_INTR\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
+if test -z "${HAVE_ARM_NE10_TRUE}" && test -z "${HAVE_ARM_NE10_FALSE}"; then
+  as_fn_error $? "conditional \"HAVE_ARM_NE10\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
+if test -z "${HAVE_SSE_TRUE}" && test -z "${HAVE_SSE_FALSE}"; then
+  as_fn_error $? "conditional \"HAVE_SSE\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
+if test -z "${HAVE_SSE2_TRUE}" && test -z "${HAVE_SSE2_FALSE}"; then
+  as_fn_error $? "conditional \"HAVE_SSE2\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
+if test -z "${HAVE_SSE4_1_TRUE}" && test -z "${HAVE_SSE4_1_FALSE}"; then
+  as_fn_error $? "conditional \"HAVE_SSE4_1\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
+if test -z "${HAVE_AVX_TRUE}" && test -z "${HAVE_AVX_FALSE}"; then
+  as_fn_error $? "conditional \"HAVE_AVX\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
 if test -z "${HAVE_DOXYGEN_TRUE}" && test -z "${HAVE_DOXYGEN_FALSE}"; then
   as_fn_error $? "conditional \"HAVE_DOXYGEN\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
@@ -13513,16 +14553,16 @@
     # ... but there are two gotchas:
     # 1) On MSYS, both `ln -s file dir' and `ln file dir' fail.
     # 2) DJGPP < 2.04 has no symlinks; `ln -s' creates a wrapper executable.
-    # In both cases, we have to default to `cp -p'.
+    # In both cases, we have to default to `cp -pR'.
     ln -s conf$$.file conf$$.dir 2>/dev/null && test ! -f conf$$.exe ||
-      as_ln_s='cp -p'
+      as_ln_s='cp -pR'
   elif ln conf$$.file conf$$ 2>/dev/null; then
     as_ln_s=ln
   else
-    as_ln_s='cp -p'
+    as_ln_s='cp -pR'
   fi
 else
-  as_ln_s='cp -p'
+  as_ln_s='cp -pR'
 fi
 rm -f conf$$ conf$$.exe conf$$.dir/conf$$.file conf$$.file
 rmdir conf$$.dir 2>/dev/null
@@ -13582,28 +14622,16 @@
   as_mkdir_p=false
 fi
 
-if test -x / >/dev/null 2>&1; then
-  as_test_x='test -x'
-else
-  if ls -dL / >/dev/null 2>&1; then
-    as_ls_L_option=L
-  else
-    as_ls_L_option=
-  fi
-  as_test_x='
-    eval sh -c '\''
-      if test -d "$1"; then
-	test -d "$1/.";
-      else
-	case $1 in #(
-	-*)set "./$1";;
-	esac;
-	case `ls -ld'$as_ls_L_option' "$1" 2>/dev/null` in #((
-	???[sx]*):;;*)false;;esac;fi
-    '\'' sh
-  '
-fi
-as_executable_p=$as_test_x
+
+# as_fn_executable_p FILE
+# -----------------------
+# Test if FILE is an executable regular file.
+as_fn_executable_p ()
+{
+  test -f "$1" && test -x "$1"
+} # as_fn_executable_p
+as_test_x='test -x'
+as_executable_p=as_fn_executable_p
 
 # Sed expression to map a string onto a valid CPP name.
 as_tr_cpp="eval sed 'y%*$as_cr_letters%P$as_cr_LETTERS%;s%[^_$as_cr_alnum]%_%g'"
@@ -13624,8 +14652,8 @@
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by opus $as_me 1.1, which was
-generated by GNU Autoconf 2.68.  Invocation command line was
+This file was extended by opus $as_me 1.1.2, which was
+generated by GNU Autoconf 2.69.  Invocation command line was
 
   CONFIG_FILES    = $CONFIG_FILES
   CONFIG_HEADERS  = $CONFIG_HEADERS
@@ -13690,11 +14718,11 @@
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-opus config.status 1.1
-configured by $0, generated by GNU Autoconf 2.68,
+opus config.status 1.1.2
+configured by $0, generated by GNU Autoconf 2.69,
   with options \\"\$ac_cs_config\\"
 
-Copyright (C) 2010 Free Software Foundation, Inc.
+Copyright (C) 2012 Free Software Foundation, Inc.
 This config.status script is free software; the Free Software Foundation
 gives unlimited permission to copy, distribute and modify it."
 
@@ -13785,7 +14813,7 @@
 _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 if \$ac_cs_recheck; then
-  set X '$SHELL' '$0' $ac_configure_args \$ac_configure_extra_args --no-create --no-recursion
+  set X $SHELL '$0' $ac_configure_args \$ac_configure_extra_args --no-create --no-recursion
   shift
   \$as_echo "running CONFIG_SHELL=$SHELL \$*" >&6
   CONFIG_SHELL='$SHELL'
@@ -14701,7 +15729,7 @@
 
   case $ac_file$ac_mode in
     "depfiles":C) test x"$AMDEP_TRUE" != x"" || {
-  # Autoconf 2.62 quotes --file arguments for eval, but not when files
+  # Older Autoconf quotes --file arguments for eval, but not when files
   # are listed without --file.  Let's play safe and only enable the eval
   # if we detect the quoting.
   case $CONFIG_FILES in
@@ -14714,7 +15742,7 @@
     # Strip MF so we end up with the name of the file.
     mf=`echo "$mf" | sed -e 's/:.*$//'`
     # Check whether this is an Automake generated Makefile or not.
-    # We used to match only the files named `Makefile.in', but
+    # We used to match only the files named 'Makefile.in', but
     # some people rename them; so instead we look at the file content.
     # Grep'ing the first line is not enough: some people post-process
     # each Makefile.in and add a new line on top of each file to say so.
@@ -14748,21 +15776,19 @@
       continue
     fi
     # Extract the definition of DEPDIR, am__include, and am__quote
-    # from the Makefile without running `make'.
+    # from the Makefile without running 'make'.
     DEPDIR=`sed -n 's/^DEPDIR = //p' < "$mf"`
     test -z "$DEPDIR" && continue
     am__include=`sed -n 's/^am__include = //p' < "$mf"`
-    test -z "am__include" && continue
+    test -z "$am__include" && continue
     am__quote=`sed -n 's/^am__quote = //p' < "$mf"`
-    # When using ansi2knr, U may be empty or an underscore; expand it
-    U=`sed -n 's/^U = //p' < "$mf"`
     # Find all dependency output files, they are included files with
     # $(DEPDIR) in their names.  We invoke sed twice because it is the
     # simplest approach to changing $(DEPDIR) to its actual value in the
     # expansion.
     for file in `sed -n "
       s/^$am__include $am__quote\(.*(DEPDIR).*\)$am__quote"'$/\1/p' <"$mf" | \
-	 sed -e 's/\$(DEPDIR)/'"$DEPDIR"'/g' -e 's/\$U/'"$U"'/g'`; do
+	 sed -e 's/\$(DEPDIR)/'"$DEPDIR"'/g'`; do
       # Make sure the directory exists.
       test -f "$dirpart/$file" && continue
       fdir=`$as_dirname -- "$file" ||
@@ -15481,6 +16507,7 @@
       Fixed point debugging: ......... ${enable_fixed_point_debug}
       Inline Assembly Optimizations: . ${inline_optimization}
       External Assembly Optimizations: ${asm_optimization}
+      Intrinsics Optimizations.......: ${intrinsics_support}
       Run-time CPU detection: ........ ${rtcd_support}
       Custom modes: .................. ${enable_custom_modes}
       Assertion checking: ............ ${enable_assertions}
@@ -15510,6 +16537,7 @@
       Fixed point debugging: ......... ${enable_fixed_point_debug}
       Inline Assembly Optimizations: . ${inline_optimization}
       External Assembly Optimizations: ${asm_optimization}
+      Intrinsics Optimizations.......: ${intrinsics_support}
       Run-time CPU detection: ........ ${rtcd_support}
       Custom modes: .................. ${enable_custom_modes}
       Assertion checking: ............ ${enable_assertions}
diff --git a/configure.ac b/configure.ac
index 0ba4a80..a67aa37 100644
--- a/configure.ac
+++ b/configure.ac
@@ -23,7 +23,7 @@
 # For libtool.
 dnl Please update these for releases.
 OPUS_LT_CURRENT=5
-OPUS_LT_REVISION=0
+OPUS_LT_REVISION=2
 OPUS_LT_AGE=5
 
 AC_SUBST(OPUS_LT_CURRENT)
@@ -189,11 +189,15 @@
     [AS_HELP_STRING([--disable-rtcd], [Disable run-time CPU capabilities detection])],,
     [enable_rtcd=yes])
 
+AC_ARG_ENABLE([intrinsics],
+    [AS_HELP_STRING([--enable-intrinsics], [Enable intrinsics optimizations for ARM(float) X86(fixed)])],,
+    [enable_intrinsics=no])
+
 rtcd_support=no
 cpu_arm=no
 
 AS_IF([test x"${enable_asm}" = x"yes"],[
-    inline_optimization="No ASM for your platform, please send patches"
+    inline_optimization="No inline ASM for your platform, please send patches"
     case $host_cpu in
       arm*)
         dnl Currently we only have asm for fixed-point
@@ -317,6 +321,14 @@
                     [rtcd_support=ARM"$rtcd_support"],
                     [rtcd_support="no"]
                 )
+                AC_MSG_CHECKING([for apple style tools])
+                AC_PREPROC_IFELSE([AC_LANG_PROGRAM([
+#ifndef __APPLE__
+#error 1
+#endif],[])],
+                    [AC_MSG_RESULT([yes]); ARM2GNU_PARAMS="--apple"],
+                    [AC_MSG_RESULT([no]); ARM2GNU_PARAMS=""])
+                AC_SUBST(ARM2GNU_PARAMS)
             ],
             [
                 AC_MSG_WARN(
@@ -331,11 +343,371 @@
    asm_optimization="disabled"
 ])
 
-AM_CONDITIONAL([CPU_ARM], [test "$cpu_arm" = "yes"])
 AM_CONDITIONAL([OPUS_ARM_INLINE_ASM],
-    [test x"${inline_optimization:0:3}" = x"ARM"])
+    [test x"${inline_optimization%% *}" = x"ARM"])
 AM_CONDITIONAL([OPUS_ARM_EXTERNAL_ASM],
-    [test x"${asm_optimization:0:3}" = x"ARM"])
+    [test x"${asm_optimization%% *}" = x"ARM"])
+
+AM_CONDITIONAL([HAVE_SSE], [false])
+AM_CONDITIONAL([HAVE_SSE2], [false])
+AM_CONDITIONAL([HAVE_SSE4_1], [false])
+AM_CONDITIONAL([HAVE_AVX], [false])
+
+m4_define([DEFAULT_X86_SSE_CFLAGS], [-msse])
+m4_define([DEFAULT_X86_SSE2_CFLAGS], [-msse2])
+m4_define([DEFAULT_X86_SSE4_1_CFLAGS], [-msse4.1])
+m4_define([DEFAULT_X86_AVX_CFLAGS], [-mavx])
+m4_define([DEFAULT_ARM_NEON_INTR_CFLAGS], [-mfpu=neon])
+# With GCC on ARM32 softfp architectures (e.g. Android, or older Ubuntu) you need to specify
+# -mfloat-abi=softfp for -mfpu=neon to work.  However, on ARM32 hardfp architectures (e.g. newer Ubuntu),
+# this option will break things.
+
+# As a heuristic, if host matches arm*eabi* but not arm*hf*, it's probably soft-float.
+m4_define([DEFAULT_ARM_NEON_SOFTFP_INTR_CFLAGS], [-mfpu=neon -mfloat-abi=softfp])
+
+AS_CASE([$host],
+	[arm*hf*], [AS_VAR_SET([RESOLVED_DEFAULT_ARM_NEON_INTR_CFLAGS], "DEFAULT_ARM_NEON_INTR_CFLAGS")],
+	[arm*eabi*], [AS_VAR_SET([RESOLVED_DEFAULT_ARM_NEON_INTR_CFLAGS], "DEFAULT_ARM_NEON_SOFTFP_INTR_CFLAGS")],
+	[AS_VAR_SET([RESOLVED_DEFAULT_ARM_NEON_INTR_CFLAGS], "DEFAULT_ARM_NEON_INTR_CFLAGS")])
+
+AC_ARG_VAR([X86_SSE_CFLAGS], [C compiler flags to compile SSE intrinsics @<:@default=]DEFAULT_X86_SSE_CFLAGS[@:>@])
+AC_ARG_VAR([X86_SSE2_CFLAGS], [C compiler flags to compile SSE2 intrinsics @<:@default=]DEFAULT_X86_SSE2_CFLAGS[@:>@])
+AC_ARG_VAR([X86_SSE4_1_CFLAGS], [C compiler flags to compile SSE4.1 intrinsics @<:@default=]DEFAULT_X86_SSE4_1_CFLAGS[@:>@])
+AC_ARG_VAR([X86_AVX_CFLAGS], [C compiler flags to compile AVX intrinsics @<:@default=]DEFAULT_X86_AVX_CFLAGS[@:>@])
+AC_ARG_VAR([ARM_NEON_INTR_CFLAGS], [C compiler flags to compile ARM NEON intrinsics @<:@default=]DEFAULT_ARM_NEON_INTR_CFLAGS / DEFAULT_ARM_NEON_SOFTFP_INTR_CFLAGS[@:>@])
+
+AS_VAR_SET_IF([X86_SSE_CFLAGS], [], [AS_VAR_SET([X86_SSE_CFLAGS], "DEFAULT_X86_SSE_CFLAGS")])
+AS_VAR_SET_IF([X86_SSE2_CFLAGS], [], [AS_VAR_SET([X86_SSE2_CFLAGS], "DEFAULT_X86_SSE2_CFLAGS")])
+AS_VAR_SET_IF([X86_SSE4_1_CFLAGS], [], [AS_VAR_SET([X86_SSE4_1_CFLAGS], "DEFAULT_X86_SSE4_1_CFLAGS")])
+AS_VAR_SET_IF([X86_AVX_CFLAGS], [], [AS_VAR_SET([X86_AVX_CFLAGS], "DEFAULT_X86_AVX_CFLAGS")])
+AS_VAR_SET_IF([ARM_NEON_INTR_CFLAGS], [], [AS_VAR_SET([ARM_NEON_INTR_CFLAGS], ["$RESOLVED_DEFAULT_ARM_NEON_INTR_CFLAGS"])])
+dnl OPUS_PATH_NE10: adds the --with-NE10* options, link-tests libNE10 and sets HAVE_ARM_NE10, NE10_CFLAGS and NE10_LIBS.
+AC_DEFUN([OPUS_PATH_NE10],
+   [
+      AC_ARG_WITH(NE10,
+                  AS_HELP_STRING([--with-NE10=PFX],[Prefix where libNE10 is installed (optional)]),
+                  NE10_prefix="$withval", NE10_prefix="")
+      AC_ARG_WITH(NE10-libraries,
+                  AS_HELP_STRING([--with-NE10-libraries=DIR],
+                        [Directory where libNE10 library is installed (optional)]),
+                  NE10_libraries="$withval", NE10_libraries="")
+      AC_ARG_WITH(NE10-includes,
+                  AS_HELP_STRING([--with-NE10-includes=DIR],
+                                 [Directory where libNE10 header files are installed (optional)]),
+                  NE10_includes="$withval", NE10_includes="")
+      # Library search path: an explicit library dir wins over the NE10 prefix and the install prefix.
+      if test "x$NE10_libraries" != "x" ; then
+         NE10_LIBS="-L$NE10_libraries"
+      elif test "x$NE10_prefix" = "xno" || test "x$NE10_prefix" = "xyes" ; then
+         NE10_LIBS=""
+      elif test "x$NE10_prefix" != "x" ; then
+         NE10_LIBS="-L$NE10_prefix/lib"
+      elif test "x$prefix" != "xNONE" ; then
+         NE10_LIBS="-L$prefix/lib"
+      fi
+      # Request the library itself unless the NE10 prefix was given as no.
+      if test "x$NE10_prefix" != "xno" ; then
+         NE10_LIBS="$NE10_LIBS -lNE10"
+      fi
+      # Header search path: same precedence as the library search path above.
+      if test "x$NE10_includes" != "x" ; then
+         NE10_CFLAGS="-I$NE10_includes"
+      elif test "x$NE10_prefix" = "xno" || test "x$NE10_prefix" = "xyes" ; then
+         NE10_CFLAGS=""
+      elif test "x$NE10_prefix" != "x" ; then
+         NE10_CFLAGS="-I$NE10_prefix/include"
+      elif test "x$prefix" != "xNONE"; then
+         NE10_CFLAGS="-I$prefix/include"
+      fi
+      # Link a tiny program against NE10 with the flags computed above and restore CFLAGS and LIBS afterwards.
+      AC_MSG_CHECKING(for NE10)
+      save_CFLAGS="$CFLAGS"; CFLAGS="$NE10_CFLAGS"
+      save_LIBS="$LIBS"; LIBS="$NE10_LIBS $LIBM"
+      AC_LINK_IFELSE(
+         [
+            AC_LANG_PROGRAM(
+               [[#include <NE10_init.h>
+               ]],
+               [[
+                  ne10_fft_cfg_float32_t cfg;
+                  cfg = ne10_fft_alloc_c2c_float32_neon(480);
+               ]]
+            )
+         ],[
+            HAVE_ARM_NE10=1
+            AC_MSG_RESULT([yes])
+         ],[
+            HAVE_ARM_NE10=0
+            AC_MSG_RESULT([no])
+            NE10_CFLAGS=""
+            NE10_LIBS=""
+         ]
+      )
+      CFLAGS="$save_CFLAGS"; LIBS="$save_LIBS"
+      # Now we know whether libNE10 is installed or not
+      AS_IF([test x"$HAVE_ARM_NE10" = x"1"],
+         [
+            AC_DEFINE([HAVE_ARM_NE10], 1, [NE10 library is installed on host. Make sure it is on target!])
+            AC_SUBST(HAVE_ARM_NE10)
+            AC_SUBST(NE10_CFLAGS)
+            AC_SUBST(NE10_LIBS)
+         ]
+      )
+   ]
+)
+
+AS_IF([test x"$enable_intrinsics" = x"yes"],[
+   intrinsics_support=""
+   AS_CASE([$host_cpu],
+   [arm*],
+   [
+      cpu_arm=yes
+      OPUS_CHECK_INTRINSICS(
+         [ARM Neon],
+         [$ARM_NEON_INTR_CFLAGS],
+         [OPUS_ARM_MAY_HAVE_NEON_INTR],
+         [OPUS_ARM_PRESUME_NEON_INTR],
+         [[#include <arm_neon.h>
+         ]],
+         [[
+            static float32x4_t A0, A1, SUMM;
+            SUMM = vmlaq_f32(SUMM, A0, A1);
+         ]]
+      )
+      AS_IF([test x"$OPUS_ARM_MAY_HAVE_NEON_INTR" = x"1" && test x"$OPUS_ARM_PRESUME_NEON_INTR" != x"1"],
+          [
+             OPUS_ARM_NEON_INTR_CFLAGS="$ARM_NEON_INTR_CFLAGS"
+             AC_SUBST([OPUS_ARM_NEON_INTR_CFLAGS])
+          ]
+      )
+
+      AS_IF([test x"$OPUS_ARM_MAY_HAVE_NEON_INTR" = x"1"],
+      [
+         AC_DEFINE([OPUS_ARM_MAY_HAVE_NEON_INTR], 1, [Compiler supports ARMv7 Neon Intrinsics])
+         intrinsics_support="$intrinsics_support (Neon_Intrinsics)"
+         # List NEON under run-time detection only when RTCD is enabled and NEON is not presumed.
+         AS_IF([test x"$enable_rtcd" = x"yes" && test x"$OPUS_ARM_PRESUME_NEON_INTR" != x"1"],
+            [rtcd_support="$rtcd_support (ARMv7_Neon_Intrinsics)"])
+
+         AS_IF([test x"$OPUS_ARM_PRESUME_NEON_INTR" = x"1"],
+            [AC_DEFINE([OPUS_ARM_PRESUME_NEON_INTR], 1, [Define if binary requires NEON intrinsics support])])
+         # NE10 is optional and is reported next to the NEON intrinsics when its link test leaves NE10_LIBS set.
+         OPUS_PATH_NE10()
+         AS_IF([test x"$NE10_LIBS" != x""],
+         [
+              intrinsics_support="$intrinsics_support (NE10)"
+              AS_IF([test x"$enable_rtcd" = x"yes" \
+               && test x"$OPUS_ARM_PRESUME_NEON_INTR" != x"1"],
+                 [rtcd_support="$rtcd_support (NE10)"])
+         ])
+
+         AS_IF([test x"$rtcd_support" = x""],
+            [rtcd_support=no])
+
+         AS_IF([test x"$intrinsics_support" = x""],
+            [intrinsics_support=no],
+            [intrinsics_support="arm$intrinsics_support"])
+      ],
+      [
+         AC_MSG_WARN([Compiler does not support ARM intrinsics])
+         intrinsics_support=no
+      ])
+   ],
+   [i?86|x86_64],
+   [
+      OPUS_CHECK_INTRINSICS(
+         [SSE],
+         [$X86_SSE_CFLAGS],
+         [OPUS_X86_MAY_HAVE_SSE],
+         [OPUS_X86_PRESUME_SSE],
+         [[#include <xmmintrin.h>
+         ]],
+         [[
+             static __m128 mtest;
+             mtest = _mm_setzero_ps();
+         ]]
+      )
+      AS_IF([test x"$OPUS_X86_MAY_HAVE_SSE" = x"1" && test x"$OPUS_X86_PRESUME_SSE" != x"1"],
+          [
+             OPUS_X86_SSE_CFLAGS="$X86_SSE_CFLAGS"
+             AC_SUBST([OPUS_X86_SSE_CFLAGS])
+          ]
+      )
+      OPUS_CHECK_INTRINSICS(
+         [SSE2],
+         [$X86_SSE2_CFLAGS],
+         [OPUS_X86_MAY_HAVE_SSE2],
+         [OPUS_X86_PRESUME_SSE2],
+         [[#include <emmintrin.h>
+         ]],
+         [[
+             static __m128i mtest;
+             mtest = _mm_setzero_si128();
+         ]]
+      )
+      AS_IF([test x"$OPUS_X86_MAY_HAVE_SSE2" = x"1" && test x"$OPUS_X86_PRESUME_SSE2" != x"1"],
+          [
+             OPUS_X86_SSE2_CFLAGS="$X86_SSE2_CFLAGS"
+             AC_SUBST([OPUS_X86_SSE2_CFLAGS])
+          ]
+      )
+      OPUS_CHECK_INTRINSICS(
+         [SSE4.1],
+         [$X86_SSE4_1_CFLAGS],
+         [OPUS_X86_MAY_HAVE_SSE4_1],
+         [OPUS_X86_PRESUME_SSE4_1],
+         [[#include <smmintrin.h>
+         ]],
+         [[
+            static __m128i mtest;
+            mtest = _mm_setzero_si128();
+            mtest = _mm_cmpeq_epi64(mtest, mtest);
+         ]]
+      )
+      AS_IF([test x"$OPUS_X86_MAY_HAVE_SSE4_1" = x"1" && test x"$OPUS_X86_PRESUME_SSE4_1" != x"1"],
+          [
+             OPUS_X86_SSE4_1_CFLAGS="$X86_SSE4_1_CFLAGS"
+             AC_SUBST([OPUS_X86_SSE4_1_CFLAGS])
+          ]
+      )
+      OPUS_CHECK_INTRINSICS(
+         [AVX],
+         [$X86_AVX_CFLAGS],
+         [OPUS_X86_MAY_HAVE_AVX],
+         [OPUS_X86_PRESUME_AVX],
+         [[#include <immintrin.h>
+         ]],
+         [[
+            static __m256 mtest;
+            mtest = _mm256_setzero_ps();
+         ]]
+      )
+      AS_IF([test x"$OPUS_X86_MAY_HAVE_AVX" = x"1" && test x"$OPUS_X86_PRESUME_AVX" != x"1"],
+          [
+             OPUS_X86_AVX_CFLAGS="$X86_AVX_CFLAGS"
+             AC_SUBST([OPUS_X86_AVX_CFLAGS])
+          ]
+      )
+         AS_IF([test x"$rtcd_support" = x"no"], [rtcd_support=""])
+         AS_IF([test x"$OPUS_X86_MAY_HAVE_SSE" = x"1"],
+         [
+            AC_DEFINE([OPUS_X86_MAY_HAVE_SSE], 1, [Compiler supports X86 SSE Intrinsics])
+            intrinsics_support="$intrinsics_support SSE"
+
+            AS_IF([test x"$OPUS_X86_PRESUME_SSE" = x"1"],
+               [AC_DEFINE([OPUS_X86_PRESUME_SSE], 1, [Define if binary requires SSE intrinsics support])],
+               [rtcd_support="$rtcd_support SSE"])
+         ],
+         [
+            AC_MSG_WARN([Compiler does not support SSE intrinsics])
+         ])
+
+         AS_IF([test x"$OPUS_X86_MAY_HAVE_SSE2" = x"1"],
+         [
+            AC_DEFINE([OPUS_X86_MAY_HAVE_SSE2], 1, [Compiler supports X86 SSE2 Intrinsics])
+            intrinsics_support="$intrinsics_support SSE2"
+
+            AS_IF([test x"$OPUS_X86_PRESUME_SSE2" = x"1"],
+               [AC_DEFINE([OPUS_X86_PRESUME_SSE2], 1, [Define if binary requires SSE2 intrinsics support])],
+               [rtcd_support="$rtcd_support SSE2"])
+         ],
+         [
+            AC_MSG_WARN([Compiler does not support SSE2 intrinsics])
+         ])
+
+         AS_IF([test x"$OPUS_X86_MAY_HAVE_SSE4_1" = x"1"],
+         [
+            AC_DEFINE([OPUS_X86_MAY_HAVE_SSE4_1], 1, [Compiler supports X86 SSE4.1 Intrinsics])
+            intrinsics_support="$intrinsics_support SSE4.1"
+
+            AS_IF([test x"$OPUS_X86_PRESUME_SSE4_1" = x"1"],
+               [AC_DEFINE([OPUS_X86_PRESUME_SSE4_1], 1, [Define if binary requires SSE4.1 intrinsics support])],
+               [rtcd_support="$rtcd_support SSE4.1"])
+         ],
+         [
+            AC_MSG_WARN([Compiler does not support SSE4.1 intrinsics])
+         ])
+         AS_IF([test x"$OPUS_X86_MAY_HAVE_AVX" = x"1"],
+         [
+            AC_DEFINE([OPUS_X86_MAY_HAVE_AVX], 1, [Compiler supports X86 AVX Intrinsics])
+            intrinsics_support="$intrinsics_support AVX"
+
+            AS_IF([test x"$OPUS_X86_PRESUME_AVX" = x"1"],
+               [AC_DEFINE([OPUS_X86_PRESUME_AVX], 1, [Define if binary requires AVX intrinsics support])],
+               [rtcd_support="$rtcd_support AVX"])
+         ],
+         [
+            AC_MSG_WARN([Compiler does not support AVX intrinsics])
+         ])
+
+         AS_IF([test x"$intrinsics_support" = x""],
+            [intrinsics_support=no],
+            [intrinsics_support="x86$intrinsics_support"]
+         )
+         # Prefix the list of run-time detected features with x86 or report no when the list is empty.
+         AS_IF([test x"$rtcd_support" = x""],
+            [rtcd_support=no],
+            [rtcd_support="x86$rtcd_support"])
+    # Probe for a CPUID access method only when RTCD is enabled and some x86 feature still needs run-time detection.
+    AS_IF([test x"$enable_rtcd" = x"yes" && test x"$rtcd_support" != x"no"],[
+            get_cpuid_by_asm="no"
+            AC_MSG_CHECKING([How to get X86 CPU Info])
+            AC_LINK_IFELSE([AC_LANG_PROGRAM([[
+                 #include <stdio.h>
+            ]],[[
+                 unsigned int CPUInfo0;
+                 unsigned int CPUInfo1;
+                 unsigned int CPUInfo2;
+                 unsigned int CPUInfo3;
+                 unsigned int InfoType;
+                 __asm__ __volatile__ (
+                 "cpuid":
+                 "=a" (CPUInfo0),
+                 "=b" (CPUInfo1),
+                 "=c" (CPUInfo2),
+                 "=d" (CPUInfo3) :
+                 "a" (InfoType), "c" (0)
+                );
+            ]])],
+            [get_cpuid_by_asm="yes"
+             AC_MSG_RESULT([Inline Assembly])
+             AC_DEFINE([CPU_INFO_BY_ASM], [1], [Get CPU Info by asm method])],
+             [AC_LINK_IFELSE([AC_LANG_PROGRAM([[
+                 #include <cpuid.h>
+            ]],[[
+                 unsigned int CPUInfo0;
+                 unsigned int CPUInfo1;
+                 unsigned int CPUInfo2;
+                 unsigned int CPUInfo3;
+                 unsigned int InfoType;
+                 __get_cpuid(InfoType, &CPUInfo0, &CPUInfo1, &CPUInfo2, &CPUInfo3);
+            ]])],
+            [AC_MSG_RESULT([C method])
+             AC_DEFINE([CPU_INFO_BY_C], [1], [Get CPU Info by c method])],
+            [AC_MSG_ERROR([no supported Get CPU Info method, please disable intrinsics])])])])
+   ],
+   [
+      AC_MSG_WARN([No intrinsics support for your architecture])
+      intrinsics_support="no"
+   ])
+],
+[
+   intrinsics_support="no"
+])
+
+AM_CONDITIONAL([CPU_ARM], [test "$cpu_arm" = "yes"])
+AM_CONDITIONAL([OPUS_ARM_NEON_INTR],
+    [test x"$OPUS_ARM_MAY_HAVE_NEON_INTR" = x"1"])
+AM_CONDITIONAL([HAVE_ARM_NE10],
+    [test x"$HAVE_ARM_NE10" = x"1"])
+AM_CONDITIONAL([HAVE_SSE],
+    [test x"$OPUS_X86_MAY_HAVE_SSE" = x"1"])
+AM_CONDITIONAL([HAVE_SSE2],
+    [test x"$OPUS_X86_MAY_HAVE_SSE2" = x"1"])
+AM_CONDITIONAL([HAVE_SSE4_1],
+    [test x"$OPUS_X86_MAY_HAVE_SSE4_1" = x"1"])
+AM_CONDITIONAL([HAVE_AVX],
+    [test x"$OPUS_X86_MAY_HAVE_AVX" = x"1"])
 
 AS_IF([test x"$enable_rtcd" = x"yes"],[
     AS_IF([test x"$rtcd_support" != x"no"],[
@@ -443,6 +815,7 @@
       Fixed point debugging: ......... ${enable_fixed_point_debug}
       Inline Assembly Optimizations: . ${inline_optimization}
       External Assembly Optimizations: ${asm_optimization}
+      Intrinsics Optimizations.......: ${intrinsics_support}
       Run-time CPU detection: ........ ${rtcd_support}
       Custom modes: .................. ${enable_custom_modes}
       Assertion checking: ............ ${enable_assertions}
diff --git a/convert_android_asm.sh b/convert_android_asm.sh
new file mode 100755
index 0000000..ea3d198
--- /dev/null
+++ b/convert_android_asm.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+# Converts each *.s file under the current directory with arm2gnu.pl into *_gnu.s.
+set -e
+ASM_CONVERTER="./celt/arm/arm2gnu.pl"
+
+if [[ ! -x "${ASM_CONVERTER}" ]]; then
+  echo "This script should be run from external/libopus." >&2
+  exit 1
+fi
+
+while IFS= read -r file; do
+  # This check is required because the ASM conversion script doesn't seem to be
+  # idempotent.
+  if [[ ! "${file}" =~ .*_gnu\.s$ ]]; then
+    gnu_file="${file%.s}_gnu.s"
+    ${ASM_CONVERTER} "${file}" > "${gnu_file}"
+    # The ASM conversion script replaces includes with *-gnu.S. So, replace
+    # occurrences of "*-gnu.S" with "*_gnu.s".
+    sed -i "s/-gnu\.S/_gnu\.s/g" "${gnu_file}"
+    rm -f "${file}"
+  fi
+done < <(find . -iname '*.s')
+
+# Generate armopts_gnu.s from armopts.s.in
+sed \
+  -e "s/@OPUS_ARM_MAY_HAVE_EDSP@/1/g" \
+  -e "s/@OPUS_ARM_MAY_HAVE_MEDIA@/1/g" \
+  -e "s/@OPUS_ARM_MAY_HAVE_NEON@/1/g" \
+  -e "s/@OPUS_ARM_MAY_HAVE_NEON_INTR@/1/g" \
+  celt/arm/armopts.s.in > celt/arm/armopts.s.temp
+${ASM_CONVERTER} "celt/arm/armopts.s.temp" > "celt/arm/armopts_gnu.s"
+rm "celt/arm/armopts.s.temp"
+echo "Converted all ASM files and generated armopts_gnu.s successfully."
diff --git a/depcomp b/depcomp
index 25a39e6..fc98710 100755
--- a/depcomp
+++ b/depcomp
@@ -1,10 +1,9 @@
 #! /bin/sh
 # depcomp - compile a program generating dependencies as side-effects
 
-scriptversion=2012-03-27.16; # UTC
+scriptversion=2013-05-30.07; # UTC
 
-# Copyright (C) 1999, 2000, 2003, 2004, 2005, 2006, 2007, 2009, 2010,
-# 2011, 2012 Free Software Foundation, Inc.
+# Copyright (C) 1999-2014 Free Software Foundation, Inc.
 
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -28,9 +27,9 @@
 
 case $1 in
   '')
-     echo "$0: No command.  Try '$0 --help' for more information." 1>&2
-     exit 1;
-     ;;
+    echo "$0: No command.  Try '$0 --help' for more information." 1>&2
+    exit 1;
+    ;;
   -h | --h*)
     cat <<\EOF
 Usage: depcomp [--help] [--version] PROGRAM [ARGS]
@@ -57,11 +56,65 @@
     ;;
 esac
 
+# Get the directory component of the given path, and save it in the
+# global variable '$dir'.  Note that this directory component will
+# be either empty or ending with a '/' character.  This is deliberate.
+set_dir_from ()
+{
+  case $1 in
+    */*) dir=`echo "$1" | sed -e 's|/[^/]*$|/|'`;;
+      *) dir=;;
+  esac
+}
+
+# Get the suffix-stripped basename of the given path, and save it in the
+# global variable '$base'.
+set_base_from ()
+{
+  base=`echo "$1" | sed -e 's|^.*/||' -e 's/\.[^.]*$//'`
+}
+
+# If no dependency file was actually created by the compiler invocation,
+# we still have to create a dummy depfile, to avoid errors with the
+# Makefile "include basename.Plo" scheme.
+make_dummy_depfile ()
+{
+  echo "#dummy" > "$depfile"
+}
+
+# Factor out some common post-processing of the generated depfile.
+# Requires the auxiliary global variable '$tmpdepfile' to be set.
+aix_post_process_depfile ()
+{
+  # If the compiler actually managed to produce a dependency file,
+  # post-process it.
+  if test -f "$tmpdepfile"; then
+    # Each line is of the form 'foo.o: dependency.h'.
+    # Do two passes, one to just change these to
+    #   $object: dependency.h
+    # and one to simply output
+    #   dependency.h:
+    # which is needed to avoid the deleted-header problem.
+    { sed -e "s,^.*\.[$lower]*:,$object:," < "$tmpdepfile"
+      sed -e "s,^.*\.[$lower]*:[$tab ]*,," -e 's,$,:,' < "$tmpdepfile"
+    } > "$depfile"
+    rm -f "$tmpdepfile"
+  else
+    make_dummy_depfile
+  fi
+}
+
 # A tabulation character.
 tab='	'
 # A newline character.
 nl='
 '
+# Character ranges might be problematic outside the C locale.
+# These definitions help.
+upper=ABCDEFGHIJKLMNOPQRSTUVWXYZ
+lower=abcdefghijklmnopqrstuvwxyz
+digits=0123456789
+alpha=${upper}${lower}
 
 if test -z "$depmode" || test -z "$source" || test -z "$object"; then
   echo "depcomp: Variables source, object and depmode must be set" 1>&2
@@ -75,6 +128,9 @@
 
 rm -f "$tmpdepfile"
 
+# Avoid interferences from the environment.
+gccflag= dashmflag=
+
 # Some modes work just like other modes, but use different flags.  We
 # parameterize here, but still list the modes in the big case below,
 # to make depend.m4 easier to write.  Note that we *cannot* use a case
@@ -86,32 +142,32 @@
 fi
 
 if test "$depmode" = dashXmstdout; then
-   # This is just like dashmstdout with a different argument.
-   dashmflag=-xM
-   depmode=dashmstdout
+  # This is just like dashmstdout with a different argument.
+  dashmflag=-xM
+  depmode=dashmstdout
 fi
 
 cygpath_u="cygpath -u -f -"
 if test "$depmode" = msvcmsys; then
-   # This is just like msvisualcpp but w/o cygpath translation.
-   # Just convert the backslash-escaped backslashes to single forward
-   # slashes to satisfy depend.m4
-   cygpath_u='sed s,\\\\,/,g'
-   depmode=msvisualcpp
+  # This is just like msvisualcpp but w/o cygpath translation.
+  # Just convert the backslash-escaped backslashes to single forward
+  # slashes to satisfy depend.m4
+  cygpath_u='sed s,\\\\,/,g'
+  depmode=msvisualcpp
 fi
 
 if test "$depmode" = msvc7msys; then
-   # This is just like msvc7 but w/o cygpath translation.
-   # Just convert the backslash-escaped backslashes to single forward
-   # slashes to satisfy depend.m4
-   cygpath_u='sed s,\\\\,/,g'
-   depmode=msvc7
+  # This is just like msvc7 but w/o cygpath translation.
+  # Just convert the backslash-escaped backslashes to single forward
+  # slashes to satisfy depend.m4
+  cygpath_u='sed s,\\\\,/,g'
+  depmode=msvc7
 fi
 
 if test "$depmode" = xlc; then
-   # IBM C/C++ Compilers xlc/xlC can output gcc-like dependency informations.
-   gccflag=-qmakedep=gcc,-MF
-   depmode=gcc
+  # IBM C/C++ Compilers xlc/xlC can output gcc-like dependency information.
+  gccflag=-qmakedep=gcc,-MF
+  depmode=gcc
 fi
 
 case "$depmode" in
@@ -134,8 +190,7 @@
   done
   "$@"
   stat=$?
-  if test $stat -eq 0; then :
-  else
+  if test $stat -ne 0; then
     rm -f "$tmpdepfile"
     exit $stat
   fi
@@ -143,13 +198,17 @@
   ;;
 
 gcc)
+## Note that this doesn't just cater to obsolete pre-3.x GCC compilers,
+## but also to in-use compilers like IBM xlc/xlC and the HP C compiler
+## (see the conditional assignment to $gccflag above).
 ## There are various ways to get dependency output from gcc.  Here's
 ## why we pick this rather obscure method:
 ## - Don't want to use -MD because we'd like the dependencies to end
 ##   up in a subdir.  Having to rename by hand is ugly.
 ##   (We might end up doing this anyway to support other compilers.)
 ## - The DEPENDENCIES_OUTPUT environment variable makes gcc act like
-##   -MM, not -M (despite what the docs say).
+##   -MM, not -M (despite what the docs say).  Also, it might not be
+##   supported by the other compilers which use the 'gcc' depmode.
 ## - Using -M directly means running the compiler twice (even worse
 ##   than renaming).
   if test -z "$gccflag"; then
@@ -157,15 +216,14 @@
   fi
   "$@" -Wp,"$gccflag$tmpdepfile"
   stat=$?
-  if test $stat -eq 0; then :
-  else
+  if test $stat -ne 0; then
     rm -f "$tmpdepfile"
     exit $stat
   fi
   rm -f "$depfile"
   echo "$object : \\" > "$depfile"
-  alpha=ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
-## The second -e expression handles DOS-style file names with drive letters.
+  # The second -e expression handles DOS-style file names with drive
+  # letters.
   sed -e 's/^[^:]*: / /' \
       -e 's/^['$alpha']:\/[^:]*: / /' < "$tmpdepfile" >> "$depfile"
 ## This next piece of magic avoids the "deleted header file" problem.
@@ -174,15 +232,15 @@
 ## typically no way to rebuild the header).  We avoid this by adding
 ## dummy dependencies for each header file.  Too bad gcc doesn't do
 ## this for us directly.
-  tr ' ' "$nl" < "$tmpdepfile" |
 ## Some versions of gcc put a space before the ':'.  On the theory
 ## that the space means something, we add a space to the output as
 ## well.  hp depmode also adds that space, but also prefixes the VPATH
 ## to the object.  Take care to not repeat it in the output.
 ## Some versions of the HPUX 10.20 sed can't process this invocation
 ## correctly.  Breaking it into two sed invocations is a workaround.
-    sed -e 's/^\\$//' -e '/^$/d' -e "s|.*$object$||" -e '/:$/d' \
-      | sed -e 's/$/ :/' >> "$depfile"
+  tr ' ' "$nl" < "$tmpdepfile" \
+    | sed -e 's/^\\$//' -e '/^$/d' -e "s|.*$object$||" -e '/:$/d' \
+    | sed -e 's/$/ :/' >> "$depfile"
   rm -f "$tmpdepfile"
   ;;
 
@@ -200,8 +258,7 @@
     "$@" -MDupdate "$tmpdepfile"
   fi
   stat=$?
-  if test $stat -eq 0; then :
-  else
+  if test $stat -ne 0; then
     rm -f "$tmpdepfile"
     exit $stat
   fi
@@ -209,7 +266,6 @@
 
   if test -f "$tmpdepfile"; then  # yes, the sourcefile depend on other files
     echo "$object : \\" > "$depfile"
-
     # Clip off the initial element (the dependent).  Don't try to be
     # clever and replace this with sed code, as IRIX sed won't handle
     # lines with more than a fixed number of characters (4096 in
@@ -217,19 +273,15 @@
     # the IRIX cc adds comments like '#:fec' to the end of the
     # dependency line.
     tr ' ' "$nl" < "$tmpdepfile" \
-    | sed -e 's/^.*\.o://' -e 's/#.*$//' -e '/^$/ d' | \
-    tr "$nl" ' ' >> "$depfile"
+      | sed -e 's/^.*\.o://' -e 's/#.*$//' -e '/^$/ d' \
+      | tr "$nl" ' ' >> "$depfile"
     echo >> "$depfile"
-
     # The second pass generates a dummy entry for each header file.
     tr ' ' "$nl" < "$tmpdepfile" \
-   | sed -e 's/^.*\.o://' -e 's/#.*$//' -e '/^$/ d' -e 's/$/:/' \
-   >> "$depfile"
+      | sed -e 's/^.*\.o://' -e 's/#.*$//' -e '/^$/ d' -e 's/$/:/' \
+      >> "$depfile"
   else
-    # The sourcefile does not contain any dependencies, so just
-    # store a dummy comment line, to avoid errors with the Makefile
-    # "include basename.Plo" scheme.
-    echo "#dummy" > "$depfile"
+    make_dummy_depfile
   fi
   rm -f "$tmpdepfile"
   ;;
@@ -247,9 +299,8 @@
   # current directory.  Also, the AIX compiler puts '$object:' at the
   # start of each line; $object doesn't have directory information.
   # Version 6 uses the directory in both cases.
-  dir=`echo "$object" | sed -e 's|/[^/]*$|/|'`
-  test "x$dir" = "x$object" && dir=
-  base=`echo "$object" | sed -e 's|^.*/||' -e 's/\.o$//' -e 's/\.lo$//'`
+  set_dir_from "$object"
+  set_base_from "$object"
   if test "$libtool" = yes; then
     tmpdepfile1=$dir$base.u
     tmpdepfile2=$base.u
@@ -262,9 +313,7 @@
     "$@" -M
   fi
   stat=$?
-
-  if test $stat -eq 0; then :
-  else
+  if test $stat -ne 0; then
     rm -f "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3"
     exit $stat
   fi
@@ -273,65 +322,113 @@
   do
     test -f "$tmpdepfile" && break
   done
-  if test -f "$tmpdepfile"; then
-    # Each line is of the form 'foo.o: dependent.h'.
-    # Do two passes, one to just change these to
-    # '$object: dependent.h' and one to simply 'dependent.h:'.
-    sed -e "s,^.*\.[a-z]*:,$object:," < "$tmpdepfile" > "$depfile"
-    sed -e 's,^.*\.[a-z]*:['"$tab"' ]*,,' -e 's,$,:,' < "$tmpdepfile" >> "$depfile"
-  else
-    # The sourcefile does not contain any dependencies, so just
-    # store a dummy comment line, to avoid errors with the Makefile
-    # "include basename.Plo" scheme.
-    echo "#dummy" > "$depfile"
-  fi
-  rm -f "$tmpdepfile"
+  aix_post_process_depfile
   ;;
 
-icc)
-  # Intel's C compiler anf tcc (Tiny C Compiler) understand '-MD -MF file'.
-  # However on
-  #    $CC -MD -MF foo.d -c -o sub/foo.o sub/foo.c
-  # ICC 7.0 will fill foo.d with something like
-  #    foo.o: sub/foo.c
-  #    foo.o: sub/foo.h
-  # which is wrong.  We want
-  #    sub/foo.o: sub/foo.c
-  #    sub/foo.o: sub/foo.h
-  #    sub/foo.c:
-  #    sub/foo.h:
-  # ICC 7.1 will output
-  #    foo.o: sub/foo.c sub/foo.h
-  # and will wrap long lines using '\':
-  #    foo.o: sub/foo.c ... \
-  #     sub/foo.h ... \
-  #     ...
-  # tcc 0.9.26 (FIXME still under development at the moment of writing)
-  # will emit a similar output, but also prepend the continuation lines
-  # with horizontal tabulation characters.
+tcc)
+  # tcc (Tiny C Compiler) understands '-MD -MF file' since version 0.9.26
+  # FIXME: That version is still under development at the moment of writing.
+  #        Make sure that this statement remains true also for stable, released
+  #        versions.
+  # It will wrap lines (doesn't matter whether long or short) with a
+  # trailing '\', as in:
+  #
+  #   foo.o : \
+  #    foo.c \
+  #    foo.h \
+  #
+  # It will put a trailing '\' even on the last line, and will use leading
+  # spaces rather than leading tabs (at least since its commit 0394caf7
+  # "Emit spaces for -MD").
   "$@" -MD -MF "$tmpdepfile"
   stat=$?
-  if test $stat -eq 0; then :
-  else
+  if test $stat -ne 0; then
     rm -f "$tmpdepfile"
     exit $stat
   fi
   rm -f "$depfile"
-  # Each line is of the form 'foo.o: dependent.h',
-  # or 'foo.o: dep1.h dep2.h \', or ' dep3.h dep4.h \'.
+  # Each non-empty line is of the form 'foo.o : \' or ' dep.h \'.
+  # We have to change lines of the first kind to '$object: \'.
+  sed -e "s|.*:|$object :|" < "$tmpdepfile" > "$depfile"
+  # And for each line of the second kind, we have to emit a 'dep.h:'
+  # dummy dependency, to avoid the deleted-header problem.
+  sed -n -e 's|^  *\(.*\) *\\$|\1:|p' < "$tmpdepfile" >> "$depfile"
+  rm -f "$tmpdepfile"
+  ;;
+
+## The order of this option in the case statement is important, since the
+## shell code in configure will try each of these formats in the order
+## listed in this file.  A plain '-MD' option would be understood by many
+## compilers, so we must ensure this comes after the gcc and icc options.
+pgcc)
+  # Portland's C compiler understands '-MD'.
+  # Will always output deps to 'file.d' where file is the root name of the
+  # source file under compilation, even if file resides in a subdirectory.
+  # The object file name does not affect the name of the '.d' file.
+  # pgcc 10.2 will output
+  #    foo.o: sub/foo.c sub/foo.h
+  # and will wrap long lines using '\' :
+  #    foo.o: sub/foo.c ... \
+  #     sub/foo.h ... \
+  #     ...
+  set_dir_from "$object"
+  # Use the source, not the object, to determine the base name, since
+  # that's sadly what pgcc will do too.
+  set_base_from "$source"
+  tmpdepfile=$base.d
+
+  # For projects that build the same source file twice into different object
+  # files, the pgcc approach of using the *source* file root name can cause
+  # problems in parallel builds.  Use a locking strategy to avoid stomping on
+  # the same $tmpdepfile.
+  lockdir=$base.d-lock
+  trap "
+    echo '$0: caught signal, cleaning up...' >&2
+    rmdir '$lockdir'
+    exit 1
+  " 1 2 13 15
+  numtries=100
+  i=$numtries
+  while test $i -gt 0; do
+    # mkdir is a portable test-and-set.
+    if mkdir "$lockdir" 2>/dev/null; then
+      # This process acquired the lock.
+      "$@" -MD
+      stat=$?
+      # Release the lock.
+      rmdir "$lockdir"
+      break
+    else
+      # If the lock is being held by a different process, wait
+      # until the winning process is done or we timeout.
+      while test -d "$lockdir" && test $i -gt 0; do
+        sleep 1
+        i=`expr $i - 1`
+      done
+    fi
+    i=`expr $i - 1`
+  done
+  trap - 1 2 13 15
+  if test $i -le 0; then
+    echo "$0: failed to acquire lock after $numtries attempts" >&2
+    echo "$0: check lockdir '$lockdir'" >&2
+    exit 1
+  fi
+
+  if test $stat -ne 0; then
+    rm -f "$tmpdepfile"
+    exit $stat
+  fi
+  rm -f "$depfile"
+  # Each line is of the form `foo.o: dependent.h',
+  # or `foo.o: dep1.h dep2.h \', or ` dep3.h dep4.h \'.
   # Do two passes, one to just change these to
-  # '$object: dependent.h' and one to simply 'dependent.h:'.
-  sed -e "s/^[ $tab][ $tab]*/  /" -e "s,^[^:]*:,$object :," \
-    < "$tmpdepfile" > "$depfile"
-  sed '
-    s/[ '"$tab"'][ '"$tab"']*/ /g
-    s/^ *//
-    s/ *\\*$//
-    s/^[^:]*: *//
-    /^$/d
-    /:$/d
-    s/$/ :/
-  ' < "$tmpdepfile" >> "$depfile"
+  # `$object: dependent.h' and one to simply `dependent.h:'.
+  sed "s,^[^:]*:,$object :," < "$tmpdepfile" > "$depfile"
+  # Some versions of the HPUX 10.20 sed can't process this invocation
+  # correctly.  Breaking it into two sed invocations is a workaround.
+  sed 's,^[^:]*: \(.*\)$,\1,;s/^\\$//;/^$/d;/:$/d' < "$tmpdepfile" \
+    | sed -e 's/$/ :/' >> "$depfile"
   rm -f "$tmpdepfile"
   ;;
 
@@ -342,9 +439,8 @@
   # 'foo.d', which lands next to the object file, wherever that
   # happens to be.
   # Much of this is similar to the tru64 case; see comments there.
-  dir=`echo "$object" | sed -e 's|/[^/]*$|/|'`
-  test "x$dir" = "x$object" && dir=
-  base=`echo "$object" | sed -e 's|^.*/||' -e 's/\.o$//' -e 's/\.lo$//'`
+  set_dir_from  "$object"
+  set_base_from "$object"
   if test "$libtool" = yes; then
     tmpdepfile1=$dir$base.d
     tmpdepfile2=$dir.libs/$base.d
@@ -355,8 +451,7 @@
     "$@" +Maked
   fi
   stat=$?
-  if test $stat -eq 0; then :
-  else
+  if test $stat -ne 0; then
      rm -f "$tmpdepfile1" "$tmpdepfile2"
      exit $stat
   fi
@@ -366,76 +461,61 @@
     test -f "$tmpdepfile" && break
   done
   if test -f "$tmpdepfile"; then
-    sed -e "s,^.*\.[a-z]*:,$object:," "$tmpdepfile" > "$depfile"
+    sed -e "s,^.*\.[$lower]*:,$object:," "$tmpdepfile" > "$depfile"
     # Add 'dependent.h:' lines.
     sed -ne '2,${
-	       s/^ *//
-	       s/ \\*$//
-	       s/$/:/
-	       p
-	     }' "$tmpdepfile" >> "$depfile"
+               s/^ *//
+               s/ \\*$//
+               s/$/:/
+               p
+             }' "$tmpdepfile" >> "$depfile"
   else
-    echo "#dummy" > "$depfile"
+    make_dummy_depfile
   fi
   rm -f "$tmpdepfile" "$tmpdepfile2"
   ;;
 
 tru64)
-   # The Tru64 compiler uses -MD to generate dependencies as a side
-   # effect.  'cc -MD -o foo.o ...' puts the dependencies into 'foo.o.d'.
-   # At least on Alpha/Redhat 6.1, Compaq CCC V6.2-504 seems to put
-   # dependencies in 'foo.d' instead, so we check for that too.
-   # Subdirectories are respected.
-   dir=`echo "$object" | sed -e 's|/[^/]*$|/|'`
-   test "x$dir" = "x$object" && dir=
-   base=`echo "$object" | sed -e 's|^.*/||' -e 's/\.o$//' -e 's/\.lo$//'`
+  # The Tru64 compiler uses -MD to generate dependencies as a side
+  # effect.  'cc -MD -o foo.o ...' puts the dependencies into 'foo.o.d'.
+  # At least on Alpha/Redhat 6.1, Compaq CCC V6.2-504 seems to put
+  # dependencies in 'foo.d' instead, so we check for that too.
+  # Subdirectories are respected.
+  set_dir_from  "$object"
+  set_base_from "$object"
 
-   if test "$libtool" = yes; then
-      # With Tru64 cc, shared objects can also be used to make a
-      # static library.  This mechanism is used in libtool 1.4 series to
-      # handle both shared and static libraries in a single compilation.
-      # With libtool 1.4, dependencies were output in $dir.libs/$base.lo.d.
-      #
-      # With libtool 1.5 this exception was removed, and libtool now
-      # generates 2 separate objects for the 2 libraries.  These two
-      # compilations output dependencies in $dir.libs/$base.o.d and
-      # in $dir$base.o.d.  We have to check for both files, because
-      # one of the two compilations can be disabled.  We should prefer
-      # $dir$base.o.d over $dir.libs/$base.o.d because the latter is
-      # automatically cleaned when .libs/ is deleted, while ignoring
-      # the former would cause a distcleancheck panic.
-      tmpdepfile1=$dir.libs/$base.lo.d   # libtool 1.4
-      tmpdepfile2=$dir$base.o.d          # libtool 1.5
-      tmpdepfile3=$dir.libs/$base.o.d    # libtool 1.5
-      tmpdepfile4=$dir.libs/$base.d      # Compaq CCC V6.2-504
-      "$@" -Wc,-MD
-   else
-      tmpdepfile1=$dir$base.o.d
-      tmpdepfile2=$dir$base.d
-      tmpdepfile3=$dir$base.d
-      tmpdepfile4=$dir$base.d
-      "$@" -MD
-   fi
+  if test "$libtool" = yes; then
+    # Libtool generates 2 separate objects for the 2 libraries.  These
+    # two compilations output dependencies in $dir.libs/$base.o.d and
+    # in $dir$base.o.d.  We have to check for both files, because
+    # one of the two compilations can be disabled.  We should prefer
+    # $dir$base.o.d over $dir.libs/$base.o.d because the latter is
+    # automatically cleaned when .libs/ is deleted, while ignoring
+    # the former would cause a distcleancheck panic.
+    tmpdepfile1=$dir$base.o.d          # libtool 1.5
+    tmpdepfile2=$dir.libs/$base.o.d    # Likewise.
+    tmpdepfile3=$dir.libs/$base.d      # Compaq CCC V6.2-504
+    "$@" -Wc,-MD
+  else
+    tmpdepfile1=$dir$base.d
+    tmpdepfile2=$dir$base.d
+    tmpdepfile3=$dir$base.d
+    "$@" -MD
+  fi
 
-   stat=$?
-   if test $stat -eq 0; then :
-   else
-      rm -f "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3" "$tmpdepfile4"
-      exit $stat
-   fi
+  stat=$?
+  if test $stat -ne 0; then
+    rm -f "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3"
+    exit $stat
+  fi
 
-   for tmpdepfile in "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3" "$tmpdepfile4"
-   do
-     test -f "$tmpdepfile" && break
-   done
-   if test -f "$tmpdepfile"; then
-      sed -e "s,^.*\.[a-z]*:,$object:," < "$tmpdepfile" > "$depfile"
-      sed -e 's,^.*\.[a-z]*:['"$tab"' ]*,,' -e 's,$,:,' < "$tmpdepfile" >> "$depfile"
-   else
-      echo "#dummy" > "$depfile"
-   fi
-   rm -f "$tmpdepfile"
-   ;;
+  for tmpdepfile in "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3"
+  do
+    test -f "$tmpdepfile" && break
+  done
+  # Same post-processing that is required for AIX mode.
+  aix_post_process_depfile
+  ;;
 
 msvc7)
   if test "$libtool" = yes; then
@@ -446,8 +526,7 @@
   "$@" $showIncludes > "$tmpdepfile"
   stat=$?
   grep -v '^Note: including file: ' "$tmpdepfile"
-  if test "$stat" = 0; then :
-  else
+  if test $stat -ne 0; then
     rm -f "$tmpdepfile"
     exit $stat
   fi
@@ -473,6 +552,7 @@
   G
   p
 }' >> "$depfile"
+  echo >> "$depfile" # make sure the fragment doesn't end with a backslash
   rm -f "$tmpdepfile"
   ;;
 
@@ -524,13 +604,14 @@
   # in the target name.  This is to cope with DOS-style filenames:
   # a dependency such as 'c:/foo/bar' could be seen as target 'c' otherwise.
   "$@" $dashmflag |
-    sed 's:^['"$tab"' ]*[^:'"$tab"' ][^:][^:]*\:['"$tab"' ]*:'"$object"'\: :' > "$tmpdepfile"
+    sed "s|^[$tab ]*[^:$tab ][^:][^:]*:[$tab ]*|$object: |" > "$tmpdepfile"
   rm -f "$depfile"
   cat < "$tmpdepfile" > "$depfile"
-  tr ' ' "$nl" < "$tmpdepfile" | \
-## Some versions of the HPUX 10.20 sed can't process this invocation
-## correctly.  Breaking it into two sed invocations is a workaround.
-    sed -e 's/^\\$//' -e '/^$/d' -e '/:$/d' | sed -e 's/$/ :/' >> "$depfile"
+  # Some versions of the HPUX 10.20 sed can't process this sed invocation
+  # correctly.  Breaking it into two sed invocations is a workaround.
+  tr ' ' "$nl" < "$tmpdepfile" \
+    | sed -e 's/^\\$//' -e '/^$/d' -e '/:$/d' \
+    | sed -e 's/$/ :/' >> "$depfile"
   rm -f "$tmpdepfile"
   ;;
 
@@ -583,10 +664,12 @@
   # makedepend may prepend the VPATH from the source file name to the object.
   # No need to regex-escape $object, excess matching of '.' is harmless.
   sed "s|^.*\($object *:\)|\1|" "$tmpdepfile" > "$depfile"
-  sed '1,2d' "$tmpdepfile" | tr ' ' "$nl" | \
-## Some versions of the HPUX 10.20 sed can't process this invocation
-## correctly.  Breaking it into two sed invocations is a workaround.
-    sed -e 's/^\\$//' -e '/^$/d' -e '/:$/d' | sed -e 's/$/ :/' >> "$depfile"
+  # Some versions of the HPUX 10.20 sed can't process the last invocation
+  # correctly.  Breaking it into two sed invocations is a workaround.
+  sed '1,2d' "$tmpdepfile" \
+    | tr ' ' "$nl" \
+    | sed -e 's/^\\$//' -e '/^$/d' -e '/:$/d' \
+    | sed -e 's/$/ :/' >> "$depfile"
   rm -f "$tmpdepfile" "$tmpdepfile".bak
   ;;
 
@@ -622,10 +705,10 @@
     esac
   done
 
-  "$@" -E |
-    sed -n -e '/^# [0-9][0-9]* "\([^"]*\)".*/ s:: \1 \\:p' \
-       -e '/^#line [0-9][0-9]* "\([^"]*\)".*/ s:: \1 \\:p' |
-    sed '$ s: \\$::' > "$tmpdepfile"
+  "$@" -E \
+    | sed -n -e '/^# [0-9][0-9]* "\([^"]*\)".*/ s:: \1 \\:p' \
+             -e '/^#line [0-9][0-9]* "\([^"]*\)".*/ s:: \1 \\:p' \
+    | sed '$ s: \\$::' > "$tmpdepfile"
   rm -f "$depfile"
   echo "$object : \\" > "$depfile"
   cat < "$tmpdepfile" >> "$depfile"
@@ -657,15 +740,15 @@
       shift
       ;;
     "-Gm"|"/Gm"|"-Gi"|"/Gi"|"-ZI"|"/ZI")
-	set fnord "$@"
-	shift
-	shift
-	;;
+        set fnord "$@"
+        shift
+        shift
+        ;;
     *)
-	set fnord "$@" "$arg"
-	shift
-	shift
-	;;
+        set fnord "$@" "$arg"
+        shift
+        shift
+        ;;
     esac
   done
   "$@" -E 2>/dev/null |
diff --git a/doc/Doxyfile.in b/doc/Doxyfile.in
index 8a63d5e..0c016f3 100644
--- a/doc/Doxyfile.in
+++ b/doc/Doxyfile.in
@@ -611,7 +611,7 @@
 # with spaces.
 
 INPUT                  = @top_srcdir@/include/opus.h \
-			 @top_srcdir@/include/opus_types.h   \
+                         @top_srcdir@/include/opus_types.h   \
                          @top_srcdir@/include/opus_defines.h \
                          @top_srcdir@/include/opus_multistream.h \
                          @top_srcdir@/include/opus_custom.h
diff --git a/doc/Makefile.in b/doc/Makefile.in
index 7f76fd6..0e308aa 100644
--- a/doc/Makefile.in
+++ b/doc/Makefile.in
@@ -1,9 +1,8 @@
-# Makefile.in generated by automake 1.11.6 from Makefile.am.
+# Makefile.in generated by automake 1.15 from Makefile.am.
 # @configure_input@
 
-# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
-# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
-# Foundation, Inc.
+# Copyright (C) 1994-2014 Free Software Foundation, Inc.
+
 # This Makefile.in is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
 # with or without modifications, as long as this notice is preserved.
@@ -15,23 +14,61 @@
 
 @SET_MAKE@
 VPATH = @srcdir@
-am__make_dryrun = \
-  { \
-    am__dry=no; \
+am__is_gnu_make = { \
+  if test -z '$(MAKELEVEL)'; then \
+    false; \
+  elif test -n '$(MAKE_HOST)'; then \
+    true; \
+  elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \
+    true; \
+  else \
+    false; \
+  fi; \
+}
+am__make_running_with_option = \
+  case $${target_option-} in \
+      ?) ;; \
+      *) echo "am__make_running_with_option: internal error: invalid" \
+              "target option '$${target_option-}' specified" >&2; \
+         exit 1;; \
+  esac; \
+  has_opt=no; \
+  sane_makeflags=$$MAKEFLAGS; \
+  if $(am__is_gnu_make); then \
+    sane_makeflags=$$MFLAGS; \
+  else \
     case $$MAKEFLAGS in \
       *\\[\ \	]*) \
-        echo 'am--echo: ; @echo "AM"  OK' | $(MAKE) -f - 2>/dev/null \
-          | grep '^AM OK$$' >/dev/null || am__dry=yes;; \
-      *) \
-        for am__flg in $$MAKEFLAGS; do \
-          case $$am__flg in \
-            *=*|--*) ;; \
-            *n*) am__dry=yes; break;; \
-          esac; \
-        done;; \
+        bs=\\; \
+        sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \
+          | sed "s/$$bs$$bs[$$bs $$bs	]*//g"`;; \
     esac; \
-    test $$am__dry = yes; \
-  }
+  fi; \
+  skip_next=no; \
+  strip_trailopt () \
+  { \
+    flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \
+  }; \
+  for flg in $$sane_makeflags; do \
+    test $$skip_next = yes && { skip_next=no; continue; }; \
+    case $$flg in \
+      *=*|--*) continue;; \
+        -*I) strip_trailopt 'I'; skip_next=yes;; \
+      -*I?*) strip_trailopt 'I';; \
+        -*O) strip_trailopt 'O'; skip_next=yes;; \
+      -*O?*) strip_trailopt 'O';; \
+        -*l) strip_trailopt 'l'; skip_next=yes;; \
+      -*l?*) strip_trailopt 'l';; \
+      -[dEDm]) skip_next=yes;; \
+      -[JT]) skip_next=yes;; \
+    esac; \
+    case $$flg in \
+      *$$target_option*) has_opt=yes; break;; \
+    esac; \
+  done; \
+  test $$has_opt = yes
+am__make_dryrun = (target_option=n; $(am__make_running_with_option))
+am__make_keepgoing = (target_option=k; $(am__make_running_with_option))
 pkgdatadir = $(datadir)/@PACKAGE@
 pkgincludedir = $(includedir)/@PACKAGE@
 pkglibdir = $(libdir)/@PACKAGE@
@@ -51,25 +88,31 @@
 build_triplet = @build@
 host_triplet = @host@
 subdir = doc
-DIST_COMMON = $(srcdir)/Doxyfile.in $(srcdir)/Makefile.am \
-	$(srcdir)/Makefile.in
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
 am__aclocal_m4_deps = $(top_srcdir)/m4/as-gcc-inline-assembly.m4 \
 	$(top_srcdir)/m4/libtool.m4 $(top_srcdir)/m4/ltoptions.m4 \
 	$(top_srcdir)/m4/ltsugar.m4 $(top_srcdir)/m4/ltversion.m4 \
-	$(top_srcdir)/m4/lt~obsolete.m4 $(top_srcdir)/configure.ac
+	$(top_srcdir)/m4/lt~obsolete.m4 \
+	$(top_srcdir)/m4/opus-intrinsics.m4 $(top_srcdir)/configure.ac
 am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
 	$(ACLOCAL_M4)
+DIST_COMMON = $(srcdir)/Makefile.am $(am__DIST_COMMON)
 mkinstalldirs = $(install_sh) -d
 CONFIG_HEADER = $(top_builddir)/config.h
 CONFIG_CLEAN_FILES = Doxyfile
 CONFIG_CLEAN_VPATH_FILES =
+AM_V_P = $(am__v_P_@AM_V@)
+am__v_P_ = $(am__v_P_@AM_DEFAULT_V@)
+am__v_P_0 = false
+am__v_P_1 = :
 AM_V_GEN = $(am__v_GEN_@AM_V@)
 am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@)
-am__v_GEN_0 = @echo "  GEN   " $@;
+am__v_GEN_0 = @echo "  GEN     " $@;
+am__v_GEN_1 = 
 AM_V_at = $(am__v_at_@AM_V@)
 am__v_at_ = $(am__v_at_@AM_DEFAULT_V@)
 am__v_at_0 = @
+am__v_at_1 = 
 SOURCES =
 DIST_SOURCES =
 am__can_run_installinfo = \
@@ -77,11 +120,15 @@
     n|no|NO) false;; \
     *) (install-info --version) >/dev/null 2>&1;; \
   esac
+am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP)
+am__DIST_COMMON = $(srcdir)/Doxyfile.in $(srcdir)/Makefile.in TODO
 DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
 ACLOCAL = @ACLOCAL@
 AMTAR = @AMTAR@
 AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
 AR = @AR@
+ARM2GNU_PARAMS = @ARM2GNU_PARAMS@
+ARM_NEON_INTR_CFLAGS = @ARM_NEON_INTR_CFLAGS@
 AUTOCONF = @AUTOCONF@
 AUTOHEADER = @AUTOHEADER@
 AUTOMAKE = @AUTOMAKE@
@@ -107,6 +154,7 @@
 EXEEXT = @EXEEXT@
 FGREP = @FGREP@
 GREP = @GREP@
+HAVE_ARM_NE10 = @HAVE_ARM_NE10@
 HAVE_DOXYGEN = @HAVE_DOXYGEN@
 HAVE_PERL = @HAVE_PERL@
 INSTALL = @INSTALL@
@@ -127,6 +175,8 @@
 MAKEINFO = @MAKEINFO@
 MANIFEST_TOOL = @MANIFEST_TOOL@
 MKDIR_P = @MKDIR_P@
+NE10_CFLAGS = @NE10_CFLAGS@
+NE10_LIBS = @NE10_LIBS@
 NM = @NM@
 NMEDIT = @NMEDIT@
 OBJDUMP = @OBJDUMP@
@@ -134,10 +184,15 @@
 OPUS_ARM_MAY_HAVE_EDSP = @OPUS_ARM_MAY_HAVE_EDSP@
 OPUS_ARM_MAY_HAVE_MEDIA = @OPUS_ARM_MAY_HAVE_MEDIA@
 OPUS_ARM_MAY_HAVE_NEON = @OPUS_ARM_MAY_HAVE_NEON@
+OPUS_ARM_NEON_INTR_CFLAGS = @OPUS_ARM_NEON_INTR_CFLAGS@
 OPUS_HAVE_RTCD = @OPUS_HAVE_RTCD@
 OPUS_LT_AGE = @OPUS_LT_AGE@
 OPUS_LT_CURRENT = @OPUS_LT_CURRENT@
 OPUS_LT_REVISION = @OPUS_LT_REVISION@
+OPUS_X86_AVX_CFLAGS = @OPUS_X86_AVX_CFLAGS@
+OPUS_X86_SSE2_CFLAGS = @OPUS_X86_SSE2_CFLAGS@
+OPUS_X86_SSE4_1_CFLAGS = @OPUS_X86_SSE4_1_CFLAGS@
+OPUS_X86_SSE_CFLAGS = @OPUS_X86_SSE_CFLAGS@
 OTOOL = @OTOOL@
 OTOOL64 = @OTOOL64@
 PACKAGE = @PACKAGE@
@@ -155,6 +210,10 @@
 SHELL = @SHELL@
 STRIP = @STRIP@
 VERSION = @VERSION@
+X86_AVX_CFLAGS = @X86_AVX_CFLAGS@
+X86_SSE2_CFLAGS = @X86_SSE2_CFLAGS@
+X86_SSE4_1_CFLAGS = @X86_SSE4_1_CFLAGS@
+X86_SSE_CFLAGS = @X86_SSE_CFLAGS@
 abs_builddir = @abs_builddir@
 abs_srcdir = @abs_srcdir@
 abs_top_builddir = @abs_top_builddir@
@@ -234,7 +293,6 @@
 	echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu doc/Makefile'; \
 	$(am__cd) $(top_srcdir) && \
 	  $(AUTOMAKE) --gnu doc/Makefile
-.PRECIOUS: Makefile
 Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
 	@case '$?' in \
 	  *config.status*) \
@@ -260,11 +318,11 @@
 
 clean-libtool:
 	-rm -rf .libs _libs
-tags: TAGS
-TAGS:
+tags TAGS:
 
-ctags: CTAGS
-CTAGS:
+ctags CTAGS:
+
+cscope cscopelist:
 
 
 distdir: $(DISTFILES)
@@ -332,9 +390,9 @@
 maintainer-clean-generic:
 	@echo "This command is intended for maintainers to use"
 	@echo "it deletes files that may require special tools to rebuild."
+@HAVE_DOXYGEN_FALSE@clean-local:
 @HAVE_DOXYGEN_FALSE@uninstall-local:
 @HAVE_DOXYGEN_FALSE@install-data-local:
-@HAVE_DOXYGEN_FALSE@clean-local:
 clean: clean-am
 
 clean-am: clean-generic clean-libtool clean-local mostlyclean-am
@@ -404,17 +462,19 @@
 .MAKE: install-am install-strip
 
 .PHONY: all all-am all-local check check-am clean clean-generic \
-	clean-libtool clean-local distclean distclean-generic \
-	distclean-libtool distdir dvi dvi-am html html-am info info-am \
-	install install-am install-data install-data-am \
-	install-data-local install-dvi install-dvi-am install-exec \
-	install-exec-am install-html install-html-am install-info \
-	install-info-am install-man install-pdf install-pdf-am \
-	install-ps install-ps-am install-strip installcheck \
-	installcheck-am installdirs maintainer-clean \
+	clean-libtool clean-local cscopelist-am ctags-am distclean \
+	distclean-generic distclean-libtool distdir dvi dvi-am html \
+	html-am info info-am install install-am install-data \
+	install-data-am install-data-local install-dvi install-dvi-am \
+	install-exec install-exec-am install-html install-html-am \
+	install-info install-info-am install-man install-pdf \
+	install-pdf-am install-ps install-ps-am install-strip \
+	installcheck installcheck-am installdirs maintainer-clean \
 	maintainer-clean-generic mostlyclean mostlyclean-generic \
-	mostlyclean-libtool pdf pdf-am ps ps-am uninstall uninstall-am \
-	uninstall-local
+	mostlyclean-libtool pdf pdf-am ps ps-am tags-am uninstall \
+	uninstall-am uninstall-local
+
+.PRECIOUS: Makefile
 
 
 @HAVE_DOXYGEN_TRUE@all-local: doxygen-build.stamp
diff --git a/doc/TODO b/doc/TODO
new file mode 100644
index 0000000..9e1c2d5
--- /dev/null
+++ b/doc/TODO
@@ -0,0 +1,7 @@
+define audio bandwidth as frequency range
+
+repeat padding recommendation
+
+ptime: refer to RFC
+
+Opus does not provide any confidentiality or integrity protection
diff --git a/doc/customdoxygen.css b/doc/customdoxygen.css
index 4bce44d..7004778 100644
--- a/doc/customdoxygen.css
+++ b/doc/customdoxygen.css
@@ -1,62 +1,62 @@
 /* The standard CSS for doxygen */
 
 body, table, div, p, dl {
-	font-family: Lucida Grande, Verdana, Geneva, Arial, sans-serif;
-	font-size: 13px;
-	line-height: 1.3;
+        font-family: Lucida Grande, Verdana, Geneva, Arial, sans-serif;
+        font-size: 13px;
+        line-height: 1.3;
 }
 
 /* @group Heading Levels */
 
 h1 {
-	font-size: 150%;
+        font-size: 150%;
 }
 
 .title {
-	font-size: 150%;
-	font-weight: bold;
-	margin: 10px 2px;
+        font-size: 150%;
+        font-weight: bold;
+        margin: 10px 2px;
 }
 
 h2 {
-	font-size: 120%;
+        font-size: 120%;
 }
 
 h3 {
-	font-size: 100%;
+        font-size: 100%;
 }
 
 dt {
-	font-weight: bold;
+        font-weight: bold;
 }
 
 div.multicol {
-	-moz-column-gap: 1em;
-	-webkit-column-gap: 1em;
-	-moz-column-count: 3;
-	-webkit-column-count: 3;
+        -moz-column-gap: 1em;
+        -webkit-column-gap: 1em;
+        -moz-column-count: 3;
+        -webkit-column-count: 3;
 }
 
 p.startli, p.startdd, p.starttd {
-	margin-top: 2px;
+        margin-top: 2px;
 }
 
 p.endli {
-	margin-bottom: 0px;
+        margin-bottom: 0px;
 }
 
 p.enddd {
-	margin-bottom: 4px;
+        margin-bottom: 4px;
 }
 
 p.endtd {
-	margin-bottom: 2px;
+        margin-bottom: 2px;
 }
 
 /* @end */
 
 caption {
-	font-weight: bold;
+        font-weight: bold;
 }
 
 span.legend {
@@ -70,45 +70,45 @@
 }
 
 div.qindex, div.navtab{
-	background-color: #F1F1F1;
-	border: 1px solid #BDBDBD;
-	text-align: center;
+        background-color: #F1F1F1;
+        border: 1px solid #BDBDBD;
+        text-align: center;
 }
 
 div.qindex, div.navpath {
-	width: 100%;
-	line-height: 140%;
+        width: 100%;
+        line-height: 140%;
 }
 
 div.navtab {
-	margin-right: 15px;
+        margin-right: 15px;
 }
 
 /* @group Link Styling */
 
 a {
-	color: #646464;
-	font-weight: normal;
-	text-decoration: none;
+        color: #646464;
+        font-weight: normal;
+        text-decoration: none;
 }
 
 .contents a:visited {
-	color: #747474;
+        color: #747474;
 }
 
 a:hover {
-	text-decoration: underline;
+        text-decoration: underline;
 }
 
 a.qindex {
-	font-weight: bold;
+        font-weight: bold;
 }
 
 a.qindexHL {
-	font-weight: bold;
-	background-color: #B8B8B8;
-	color: #ffffff;
-	border: 1px double #A8A8A8;
+        font-weight: bold;
+        background-color: #B8B8B8;
+        color: #ffffff;
+        border: 1px double #A8A8A8;
 }
 
 .contents a.qindexHL:visited {
@@ -116,181 +116,181 @@
 }
 
 a.el {
-	font-weight: bold;
+        font-weight: bold;
 }
 
 a.elRef {
 }
 
 a.code, a.code:visited {
-	color: #4665A2; 
+        color: #4665A2;
 }
 
 a.codeRef, a.codeRef:visited {
-	color: #4665A2; 
+        color: #4665A2;
 }
 
 /* @end */
 
 dl.el {
-	margin-left: -1cm;
+        margin-left: -1cm;
 }
 
 .fragment {
-	font-family: monospace, fixed;
-	font-size: 105%;
+        font-family: monospace, fixed;
+        font-size: 105%;
 }
 
 pre.fragment {
-	border: 1px solid #D5D5D5;
-	background-color: #FCFCFC;
-	padding: 4px 6px;
-	margin: 4px 8px 4px 2px;
-	overflow: auto;
-	word-wrap: break-word;
-	font-size:  9pt;
-	line-height: 125%;
+        border: 1px solid #D5D5D5;
+        background-color: #FCFCFC;
+        padding: 4px 6px;
+        margin: 4px 8px 4px 2px;
+        overflow: auto;
+        word-wrap: break-word;
+        font-size:  9pt;
+        line-height: 125%;
 }
 
 div.ah {
-	background-color: black;
-	font-weight: bold;
-	color: #ffffff;
-	margin-bottom: 3px;
-	margin-top: 3px;
-	padding: 0.2em;
-	border: solid thin #333;
-	border-radius: 0.5em;
-	-webkit-border-radius: .5em;
-	-moz-border-radius: .5em;
-	box-shadow: 2px 2px 3px #999;
-	-webkit-box-shadow: 2px 2px 3px #999;
-	-moz-box-shadow: rgba(0, 0, 0, 0.15) 2px 2px 2px;
-	background-image: -webkit-gradient(linear, left top, left bottom, from(#eee), to(#000),color-stop(0.3, #444));
-	background-image: -moz-linear-gradient(center top, #eee 0%, #444 40%, #000);
+        background-color: black;
+        font-weight: bold;
+        color: #ffffff;
+        margin-bottom: 3px;
+        margin-top: 3px;
+        padding: 0.2em;
+        border: solid thin #333;
+        border-radius: 0.5em;
+        -webkit-border-radius: .5em;
+        -moz-border-radius: .5em;
+        box-shadow: 2px 2px 3px #999;
+        -webkit-box-shadow: 2px 2px 3px #999;
+        -moz-box-shadow: rgba(0, 0, 0, 0.15) 2px 2px 2px;
+        background-image: -webkit-gradient(linear, left top, left bottom, from(#eee), to(#000),color-stop(0.3, #444));
+        background-image: -moz-linear-gradient(center top, #eee 0%, #444 40%, #000);
 }
 
 div.groupHeader {
-	margin-left: 16px;
-	margin-top: 12px;
-	font-weight: bold;
+        margin-left: 16px;
+        margin-top: 12px;
+        font-weight: bold;
 }
 
 div.groupText {
-	margin-left: 16px;
-	font-style: italic;
+        margin-left: 16px;
+        font-style: italic;
 }
 
 body {
-	background-color: white;
-	color: black;
+        background-color: white;
+        color: black;
         margin: 0;
 }
 
 div.contents {
-	margin-top: 10px;
-	margin-left: 8px;
-	margin-right: 8px;
+        margin-top: 10px;
+        margin-left: 8px;
+        margin-right: 8px;
 }
 
 td.indexkey {
-	background-color: #F1F1F1;
-	font-weight: bold;
-	border: 1px solid #D5D5D5;
-	margin: 2px 0px 2px 0;
-	padding: 2px 10px;
+        background-color: #F1F1F1;
+        font-weight: bold;
+        border: 1px solid #D5D5D5;
+        margin: 2px 0px 2px 0;
+        padding: 2px 10px;
         white-space: nowrap;
         vertical-align: top;
 }
 
 td.indexvalue {
-	background-color: #F1F1F1;
-	border: 1px solid #D5D5D5;
-	padding: 2px 10px;
-	margin: 2px 0px;
+        background-color: #F1F1F1;
+        border: 1px solid #D5D5D5;
+        padding: 2px 10px;
+        margin: 2px 0px;
 }
 
 tr.memlist {
-	background-color: #F2F2F2;
+        background-color: #F2F2F2;
 }
 
 p.formulaDsp {
-	text-align: center;
+        text-align: center;
 }
 
 img.formulaDsp {
-	
+
 }
 
 img.formulaInl {
-	vertical-align: middle;
+        vertical-align: middle;
 }
 
 div.center {
-	text-align: center;
+        text-align: center;
         margin-top: 0px;
         margin-bottom: 0px;
         padding: 0px;
 }
 
 div.center img {
-	border: 0px;
+        border: 0px;
 }
 
 address.footer {
-	text-align: right;
-	padding-right: 12px;
+        text-align: right;
+        padding-right: 12px;
 }
 
 img.footer {
-	border: 0px;
-	vertical-align: middle;
+        border: 0px;
+        vertical-align: middle;
 }
 
 /* @group Code Colorization */
 
 span.keyword {
-	color: #008000
+        color: #008000
 }
 
 span.keywordtype {
-	color: #604020
+        color: #604020
 }
 
 span.keywordflow {
-	color: #e08000
+        color: #e08000
 }
 
 span.comment {
-	color: #800000
+        color: #800000
 }
 
 span.preprocessor {
-	color: #806020
+        color: #806020
 }
 
 span.stringliteral {
-	color: #002080
+        color: #002080
 }
 
 span.charliteral {
-	color: #008080
+        color: #008080
 }
 
-span.vhdldigit { 
-	color: #ff00ff 
+span.vhdldigit {
+        color: #ff00ff
 }
 
-span.vhdlchar { 
-	color: #000000 
+span.vhdlchar {
+        color: #000000
 }
 
-span.vhdlkeyword { 
-	color: #700070 
+span.vhdlkeyword {
+        color: #700070
 }
 
-span.vhdllogic { 
-	color: #ff0000 
+span.vhdllogic {
+        color: #ff0000
 }
 
 blockquote {
@@ -304,71 +304,71 @@
 
 /*
 .search {
-	color: #003399;
-	font-weight: bold;
+        color: #003399;
+        font-weight: bold;
 }
 
 form.search {
-	margin-bottom: 0px;
-	margin-top: 0px;
+        margin-bottom: 0px;
+        margin-top: 0px;
 }
 
 input.search {
-	font-size: 75%;
-	color: #000080;
-	font-weight: normal;
-	background-color: #e8eef2;
+        font-size: 75%;
+        color: #000080;
+        font-weight: normal;
+        background-color: #e8eef2;
 }
 */
 
 td.tiny {
-	font-size: 75%;
+        font-size: 75%;
 }
 
 .dirtab {
-	padding: 4px;
-	border-collapse: collapse;
-	border: 1px solid #BDBDBD;
+        padding: 4px;
+        border-collapse: collapse;
+        border: 1px solid #BDBDBD;
 }
 
 th.dirtab {
-	background: #F1F1F1;
-	font-weight: bold;
+        background: #F1F1F1;
+        font-weight: bold;
 }
 
 hr {
-	height: 0px;
-	border: none;
-	border-top: 1px solid #7A7A7A;
+        height: 0px;
+        border: none;
+        border-top: 1px solid #7A7A7A;
 }
 
 hr.footer {
-	height: 1px;
+        height: 1px;
 }
 
 /* @group Member Descriptions */
 
 table.memberdecls {
-	border-spacing: 0px;
-	padding: 0px;
+        border-spacing: 0px;
+        padding: 0px;
 }
 
 .mdescLeft, .mdescRight,
 .memItemLeft, .memItemRight,
 .memTemplItemLeft, .memTemplItemRight, .memTemplParams {
-	background-color: #FAFAFA;
-	border: none;
-	margin: 4px;
-	padding: 1px 0 0 8px;
+        background-color: #FAFAFA;
+        border: none;
+        margin: 4px;
+        padding: 1px 0 0 8px;
 }
 
 .mdescLeft, .mdescRight {
-	padding: 0px 8px 4px 8px;
-	color: #555;
+        padding: 0px 8px 4px 8px;
+        color: #555;
 }
 
 .memItemLeft, .memItemRight, .memTemplParams {
-	border-top: 1px solid #D5D5D5;
+        border-top: 1px solid #D5D5D5;
 }
 
 .memItemLeft, .memTemplItemLeft {
@@ -376,11 +376,11 @@
 }
 
 .memItemRight {
-	width: 100%;
+        width: 100%;
 }
 
 .memTemplParams {
-	color: #747474;
+        color: #747474;
         white-space: nowrap;
 }
 
@@ -391,29 +391,29 @@
 /* Styles for detailed member documentation */
 
 .memtemplate {
-	font-size: 80%;
-	color: #747474;
-	font-weight: normal;
-	margin-left: 9px;
+        font-size: 80%;
+        color: #747474;
+        font-weight: normal;
+        margin-left: 9px;
 }
 
 .memnav {
-	background-color: #F1F1F1;
-	border: 1px solid #BDBDBD;
-	text-align: center;
-	margin: 2px;
-	margin-right: 15px;
-	padding: 2px;
+        background-color: #F1F1F1;
+        border: 1px solid #BDBDBD;
+        text-align: center;
+        margin: 2px;
+        margin-right: 15px;
+        padding: 2px;
 }
 
 .mempage {
-	width: 100%;
+        width: 100%;
 }
 
 .memitem {
-	padding: 0;
-	margin-bottom: 10px;
-	margin-right: 5px;
+        padding: 0;
+        margin-bottom: 10px;
+        margin-right: 5px;
 }
 
 .memname {
@@ -449,9 +449,9 @@
 }
 
 .memdoc, dl.reflist dd {
-        border-bottom: 1px solid #C0C0C0;      
-        border-left: 1px solid #C0C0C0;      
-        border-right: 1px solid #C0C0C0; 
+        border-bottom: 1px solid #C0C0C0;
+        border-left: 1px solid #C0C0C0;
+        border-right: 1px solid #C0C0C0;
         padding: 2px 5px;
         background-color: #FCFCFC;
         border-top-width: 0;
@@ -481,35 +481,35 @@
 }
 
 .paramkey {
-	text-align: right;
+        text-align: right;
 }
 
 .paramtype {
-	white-space: nowrap;
+        white-space: nowrap;
 }
 
 .paramname {
-	color: #602020;
-	white-space: nowrap;
+        color: #602020;
+        white-space: nowrap;
 }
 .paramname em {
-	font-style: normal;
+        font-style: normal;
 }
 
 .params, .retval, .exception, .tparams {
         border-spacing: 6px 2px;
-}       
+}
 
 .params .paramname, .retval .paramname {
         font-weight: bold;
         vertical-align: top;
 }
-        
+
 .params .paramtype {
         font-style: italic;
         vertical-align: top;
-}       
-        
+}
+
 .params .paramdir {
         font-family: "courier new",courier,monospace;
         vertical-align: top;
@@ -525,22 +525,22 @@
 /* for the tree view */
 
 .ftvtree {
-	font-family: sans-serif;
-	margin: 0px;
+        font-family: sans-serif;
+        margin: 0px;
 }
 
 /* these are for tree view when used as main index */
 
 .directory {
-	font-size: 9pt;
-	font-weight: bold;
-	margin: 5px;
+        font-size: 9pt;
+        font-weight: bold;
+        margin: 5px;
 }
 
 .directory h3 {
-	margin: 0px;
-	margin-top: 1em;
-	font-size: 11pt;
+        margin: 0px;
+        margin-top: 1em;
+        font-size: 11pt;
 }
 
 /*
@@ -552,62 +552,62 @@
 
 /*
 .directory h3.swap {
-	height: 61px;
-	background-repeat: no-repeat;
-	background-image: url("yourimage.gif");
+        height: 61px;
+        background-repeat: no-repeat;
+        background-image: url("yourimage.gif");
 }
 .directory h3.swap span {
-	display: none;
+        display: none;
 }
 */
 
 .directory > h3 {
-	margin-top: 0;
+        margin-top: 0;
 }
 
 .directory p {
-	margin: 0px;
-	white-space: nowrap;
+        margin: 0px;
+        white-space: nowrap;
 }
 
 .directory div {
-	display: none;
-	margin: 0px;
+        display: none;
+        margin: 0px;
 }
 
 .directory img {
-	vertical-align: -30%;
+        vertical-align: -30%;
 }
 
 /* these are for tree view when not used as main index */
 
 .directory-alt {
-	font-size: 100%;
-	font-weight: bold;
+        font-size: 100%;
+        font-weight: bold;
 }
 
 .directory-alt h3 {
-	margin: 0px;
-	margin-top: 1em;
-	font-size: 11pt;
+        margin: 0px;
+        margin-top: 1em;
+        font-size: 11pt;
 }
 
 .directory-alt > h3 {
-	margin-top: 0;
+        margin-top: 0;
 }
 
 .directory-alt p {
-	margin: 0px;
-	white-space: nowrap;
+        margin: 0px;
+        white-space: nowrap;
 }
 
 .directory-alt div {
-	display: none;
-	margin: 0px;
+        display: none;
+        margin: 0px;
 }
 
 .directory-alt img {
-	vertical-align: -30%;
+        vertical-align: -30%;
 }
 
 /* @end */
@@ -617,27 +617,27 @@
 }
 
 address {
-	font-style: normal;
-	color: #464646;
+        font-style: normal;
+        color: #464646;
 }
 
 table.doxtable {
-	border-collapse:collapse;
+        border-collapse:collapse;
         margin-top: 4px;
         margin-bottom: 4px;
 }
 
 table.doxtable td, table.doxtable th {
-	border: 1px solid #4A4A4A;
-	padding: 3px 7px 2px;
+        border: 1px solid #4A4A4A;
+        padding: 3px 7px 2px;
 }
 
 table.doxtable th {
-	background-color: #5B5B5B;
-	color: #FFFFFF;
-	font-size: 110%;
-	padding-bottom: 4px;
-	padding-top: 5px;
+        background-color: #5B5B5B;
+        color: #FFFFFF;
+        font-size: 110%;
+        padding-bottom: 4px;
+        padding-top: 5px;
 }
 
 table.fieldtable {
@@ -693,52 +693,52 @@
 
 
 .tabsearch {
-	top: 0px;
-	left: 10px;
-	height: 36px;
-	background-image: url('tab_b.png');
-	z-index: 101;
-	overflow: hidden;
-	font-size: 13px;
+        top: 0px;
+        left: 10px;
+        height: 36px;
+        background-image: url('tab_b.png');
+        z-index: 101;
+        overflow: hidden;
+        font-size: 13px;
 }
 
 .navpath ul
 {
-	font-size: 11px;
-	background-image:url('tab_b.png');
-	background-repeat:repeat-x;
-	height:30px;
-	line-height:30px;
-	color:#ABABAB;
-	border:solid 1px #D3D3D3;
-	overflow:hidden;
-	margin:0px;
-	padding:0px;
+        font-size: 11px;
+        background-image:url('tab_b.png');
+        background-repeat:repeat-x;
+        height:30px;
+        line-height:30px;
+        color:#ABABAB;
+        border:solid 1px #D3D3D3;
+        overflow:hidden;
+        margin:0px;
+        padding:0px;
 }
 
 .navpath li
 {
-	list-style-type:none;
-	float:left;
-	padding-left:10px;
-	padding-right:15px;
-	background-image:url('bc_s.png');
-	background-repeat:no-repeat;
-	background-position:right;
-	color:#595959;
+        list-style-type:none;
+        float:left;
+        padding-left:10px;
+        padding-right:15px;
+        background-image:url('bc_s.png');
+        background-repeat:no-repeat;
+        background-position:right;
+        color:#595959;
 }
 
 .navpath li.navelem a
 {
-	height:32px;
-	display:block;
-	text-decoration: none;
-	outline: none;
+        height:32px;
+        display:block;
+        text-decoration: none;
+        outline: none;
 }
 
 .navpath li.navelem a:hover
 {
-	color:#929292;
+        color:#929292;
 }
 
 .navpath li.footer
@@ -757,44 +757,44 @@
 
 div.summary
 {
-	float: right;
-	font-size: 8pt;
-	padding-right: 5px;
-	width: 50%;
-	text-align: right;
-}       
+        float: right;
+        font-size: 8pt;
+        padding-right: 5px;
+        width: 50%;
+        text-align: right;
+}
 
 div.summary a
 {
-	white-space: nowrap;
+        white-space: nowrap;
 }
 
 div.ingroups
 {
-	margin-left: 5px;
-	font-size: 8pt;
-	padding-left: 5px;
-	width: 50%;
-	text-align: left;
+        margin-left: 5px;
+        font-size: 8pt;
+        padding-left: 5px;
+        width: 50%;
+        text-align: left;
 }
 
 div.ingroups a
 {
-	white-space: nowrap;
+        white-space: nowrap;
 }
 
 div.header
 {
         background-image:url('nav_h.png');
         background-repeat:repeat-x;
-	background-color: #FAFAFA;
-	margin:  0px;
-	border-bottom: 1px solid #D5D5D5;
+        background-color: #FAFAFA;
+        margin:  0px;
+        border-bottom: 1px solid #D5D5D5;
 }
 
 div.headertitle
 {
-	padding: 5px 5px 5px 7px;
+        padding: 5px 5px 5px 7px;
 }
 
 dl
@@ -845,49 +845,49 @@
 }
 
 dl.section dd {
-	margin-bottom: 6px;
+        margin-bottom: 6px;
 }
 
 
 #projectlogo
 {
-	text-align: center;
-	vertical-align: bottom;
-	border-collapse: separate;
+        text-align: center;
+        vertical-align: bottom;
+        border-collapse: separate;
 }
- 
+
 #projectlogo img
-{ 
-	border: 0px none;
+{
+        border: 0px none;
 }
- 
+
 #projectname
 {
-	font: 300% Tahoma, Arial,sans-serif;
-	margin: 0px;
-	padding: 2px 0px;
+        font: 300% Tahoma, Arial,sans-serif;
+        margin: 0px;
+        padding: 2px 0px;
 }
-    
+
 #projectbrief
 {
-	font: 120% Tahoma, Arial,sans-serif;
-	margin: 0px;
-	padding: 0px;
+        font: 120% Tahoma, Arial,sans-serif;
+        margin: 0px;
+        padding: 0px;
 }
 
 #projectnumber
 {
-	font: 100% Tahoma, Arial,sans-serif;
-	margin: 0px;
-	padding: 0px;
+        font: 100% Tahoma, Arial,sans-serif;
+        margin: 0px;
+        padding: 0px;
 }
 
 #titlearea
 {
-	padding: 0px;
-	margin: 0px;
-	width: 100%;
-	border-bottom: 1px solid #848484;
+        padding: 0px;
+        margin: 0px;
+        width: 100%;
+        border-bottom: 1px solid #848484;
 }
 
 .image
@@ -907,12 +907,12 @@
 
 .caption
 {
-	font-weight: bold;
+        font-weight: bold;
 }
 
 div.zoom
 {
-	border: 1px solid #AFAFAF;
+        border: 1px solid #AFAFAF;
 }
 
 dl.citelist {
@@ -953,7 +953,7 @@
 
 div.toc h3 {
         font: bold 12px/1.2 Arial,FreeSans,sans-serif;
-	color: #747474;
+        color: #747474;
         border-bottom: 0 none;
         margin: 0;
 }
@@ -962,7 +962,7 @@
         list-style: none outside none;
         border: medium none;
         padding: 0px;
-}       
+}
 
 div.toc li.level1 {
         margin-left: 0px;
@@ -1009,4 +1009,3 @@
     word-wrap: break-word; /* IE 5.5+ */
   }
 }
-
diff --git a/doc/opus_logo.svg b/doc/opus_logo.svg
index 97112af..db2879e 100644
--- a/doc/opus_logo.svg
+++ b/doc/opus_logo.svg
@@ -59,11 +59,11 @@
    y1="95.107399"
    x2="194.53169"
    y2="9.9475983e-14">
-	<stop
+        <stop
    offset="0.0056"
    style="stop-color:#8E8E8E"
    id="stop7" />
-	<stop
+        <stop
    offset="1"
    style="stop-color:#B5B5B5"
    id="stop9" />
@@ -76,11 +76,11 @@
    y1="116.208"
    x2="229.61819"
    y2="164.46291">
-	<stop
+        <stop
    offset="0.0056"
    style="stop-color:#494748"
    id="stop14" />
-	<stop
+        <stop
    offset="1"
    style="stop-color:#000000"
    id="stop16" />
@@ -93,11 +93,11 @@
    y1="115.4395"
    x2="43.9897"
    y2="165.2314">
-	<stop
+        <stop
    offset="0.0056"
    style="stop-color:#494748"
    id="stop21" />
-	<stop
+        <stop
    offset="1"
    style="stop-color:#000000"
    id="stop23" />
@@ -110,11 +110,11 @@
    y1="115.7188"
    x2="311.2847"
    y2="165.2822">
-	<stop
+        <stop
    offset="0.0056"
    style="stop-color:#494748"
    id="stop28" />
-	<stop
+        <stop
    offset="1"
    style="stop-color:#000000"
    id="stop30" />
@@ -127,11 +127,11 @@
    y1="115.5791"
    x2="129.1987"
    y2="204.4863">
-	<stop
+        <stop
    offset="0.0056"
    style="stop-color:#494748"
    id="stop35" />
-	<stop
+        <stop
    offset="1"
    style="stop-color:#000000"
    id="stop37" />
diff --git a/doc/trivial_example.c b/doc/trivial_example.c
index 7a0fc56..c65dfe0 100644
--- a/doc/trivial_example.c
+++ b/doc/trivial_example.c
@@ -88,7 +88,7 @@
    fin = fopen(inFile, "r");
    if (fin==NULL)
    {
-      fprintf(stderr, "failed to open file: %s\n", strerror(errno));
+      fprintf(stderr, "failed to open input file: %s\n", strerror(errno));
       return EXIT_FAILURE;
    }
 
@@ -104,7 +104,7 @@
    fout = fopen(outFile, "w");
    if (fout==NULL)
    {
-      fprintf(stderr, "failed to open file: %s\n", strerror(errno));
+      fprintf(stderr, "failed to open output file: %s\n", strerror(errno));
       return EXIT_FAILURE;
    }
 
@@ -138,7 +138,7 @@
       frame_size = opus_decode(decoder, cbits, nbBytes, out, MAX_FRAME_SIZE, 0);
       if (frame_size<0)
       {
-         fprintf(stderr, "decoder failed: %s\n", opus_strerror(err));
+         fprintf(stderr, "decoder failed: %s\n", opus_strerror(frame_size));
          return EXIT_FAILURE;
       }
 
diff --git a/include/opus.h b/include/opus.h
index 93a53a2..b0bdf6f 100644
--- a/include/opus.h
+++ b/include/opus.h
@@ -616,7 +616,10 @@
   * merged. Splitting valid Opus packets is always guaranteed to succeed,
   * whereas merging valid packets only succeeds if all frames have the same
   * mode, bandwidth, and frame size, and when the total duration of the merged
-  * packet is no more than 120 ms.
+  * packet is no more than 120 ms. The 120 ms limit comes from the
+  * specification and limits decoder memory requirements at a point where
+  * framing overhead becomes negligible.
+  *
   * The repacketizer currently only operates on elementary Opus
   * streams. It will not manipualte multistream packets successfully, except in
   * the degenerate case where they consist of data from a single stream.
diff --git a/include/opus_defines.h b/include/opus_defines.h
index 265089f..647ed5d 100644
--- a/include/opus_defines.h
+++ b/include/opus_defines.h
@@ -46,7 +46,7 @@
 #define OPUS_OK                0
 /** One or more invalid/out of range arguments @hideinitializer*/
 #define OPUS_BAD_ARG          -1
-/** The mode struct passed is invalid @hideinitializer*/
+/** Not enough bytes allocated in the buffer @hideinitializer*/
 #define OPUS_BUFFER_TOO_SMALL -2
 /** An internal error was detected @hideinitializer*/
 #define OPUS_INTERNAL_ERROR   -3
@@ -274,7 +274,6 @@
 /** Enables or disables variable bitrate (VBR) in the encoder.
   * The configured bitrate may not be met exactly because frames must
   * be an integer number of bytes in length.
-  * @warning Only the MDCT mode of Opus can provide hard CBR behavior.
   * @see OPUS_GET_VBR
   * @see OPUS_SET_VBR_CONSTRAINT
   * @param[in] x <tt>opus_int32</tt>: Allowed values:
@@ -454,14 +453,6 @@
   * @hideinitializer */
 #define OPUS_GET_APPLICATION(x) OPUS_GET_APPLICATION_REQUEST, __opus_check_int_ptr(x)
 
-/** Gets the sampling rate the encoder or decoder was initialized with.
-  * This simply returns the <code>Fs</code> value passed to opus_encoder_init()
-  * or opus_decoder_init().
-  * @param[out] x <tt>opus_int32 *</tt>: Sampling rate of encoder or decoder.
-  * @hideinitializer
-  */
-#define OPUS_GET_SAMPLE_RATE(x) OPUS_GET_SAMPLE_RATE_REQUEST, __opus_check_int_ptr(x)
-
 /** Gets the total samples of delay added by the entire codec.
   * This can be queried by the encoder and then the provided number of samples can be
   * skipped on from the start of the decoder's output to provide time aligned input
@@ -498,9 +489,9 @@
 #define OPUS_GET_INBAND_FEC(x) OPUS_GET_INBAND_FEC_REQUEST, __opus_check_int_ptr(x)
 
 /** Configures the encoder's expected packet loss percentage.
-  * Higher values with trigger progressively more loss resistant behavior in the encoder
-  * at the expense of quality at a given bitrate in the lossless case, but greater quality
-  * under loss.
+  * Higher values trigger progressively more loss resistant behavior in the encoder
+  * at the expense of quality at a given bitrate in the absence of packet loss, but
+  * greater quality under loss.
   * @see OPUS_GET_PACKET_LOSS_PERC
   * @param[in] x <tt>opus_int32</tt>:   Loss percentage in the range 0-100, inclusive (default: 0).
   * @hideinitializer */
@@ -532,7 +523,19 @@
   * @hideinitializer */
 #define OPUS_GET_DTX(x) OPUS_GET_DTX_REQUEST, __opus_check_int_ptr(x)
 /** Configures the depth of signal being encoded.
+  *
   * This is a hint which helps the encoder identify silence and near-silence.
+  * It represents the number of significant bits of linear intensity below
+  * which the signal contains ignorable quantization or other noise.
+  *
+  * For example, OPUS_SET_LSB_DEPTH(14) would be an appropriate setting
+  * for G.711 u-law input. OPUS_SET_LSB_DEPTH(16) would be appropriate
+  * for 16-bit linear pcm input with opus_encode_float().
+  *
+  * When using opus_encode() instead of opus_encode_float(), or when libopus
+  * is compiled for fixed-point, the encoder uses the minimum of the value
+  * set here and the value 16.
+  *
   * @see OPUS_GET_LSB_DEPTH
   * @param[in] x <tt>opus_int32</tt>: Input precision in bits, between 8 and 24
   *                                   (default: 24).
@@ -545,11 +548,6 @@
   * @hideinitializer */
 #define OPUS_GET_LSB_DEPTH(x) OPUS_GET_LSB_DEPTH_REQUEST, __opus_check_int_ptr(x)
 
-/** Gets the duration (in samples) of the last packet successfully decoded or concealed.
-  * @param[out] x <tt>opus_int32 *</tt>: Number of samples (at current sampling rate).
-  * @hideinitializer */
-#define OPUS_GET_LAST_PACKET_DURATION(x) OPUS_GET_LAST_PACKET_DURATION_REQUEST, __opus_check_int_ptr(x)
-
 /** Configures the encoder's use of variable duration frames.
   * When variable duration is enabled, the encoder is free to use a shorter frame
   * size than the one requested in the opus_encode*() call.
@@ -558,12 +556,12 @@
   * packet. The part of the audio that was not encoded needs to be resent to the
   * encoder for the next call. Do not use this option unless you <b>really</b>
   * know what you are doing.
-  * @see OPUS_GET_EXPERT_VARIABLE_DURATION
+  * @see OPUS_GET_EXPERT_FRAME_DURATION
   * @param[in] x <tt>opus_int32</tt>: Allowed values:
   * <dl>
   * <dt>OPUS_FRAMESIZE_ARG</dt><dd>Select frame size from the argument (default).</dd>
   * <dt>OPUS_FRAMESIZE_2_5_MS</dt><dd>Use 2.5 ms frames.</dd>
-  * <dt>OPUS_FRAMESIZE_5_MS</dt><dd>Use 2.5 ms frames.</dd>
+  * <dt>OPUS_FRAMESIZE_5_MS</dt><dd>Use 5 ms frames.</dd>
   * <dt>OPUS_FRAMESIZE_10_MS</dt><dd>Use 10 ms frames.</dd>
   * <dt>OPUS_FRAMESIZE_20_MS</dt><dd>Use 20 ms frames.</dd>
   * <dt>OPUS_FRAMESIZE_40_MS</dt><dd>Use 40 ms frames.</dd>
@@ -573,12 +571,12 @@
   * @hideinitializer */
 #define OPUS_SET_EXPERT_FRAME_DURATION(x) OPUS_SET_EXPERT_FRAME_DURATION_REQUEST, __opus_check_int(x)
 /** Gets the encoder's configured use of variable duration frames.
-  * @see OPUS_SET_EXPERT_VARIABLE_DURATION
+  * @see OPUS_SET_EXPERT_FRAME_DURATION
   * @param[out] x <tt>opus_int32 *</tt>: Returns one of the following values:
   * <dl>
   * <dt>OPUS_FRAMESIZE_ARG</dt><dd>Select frame size from the argument (default).</dd>
   * <dt>OPUS_FRAMESIZE_2_5_MS</dt><dd>Use 2.5 ms frames.</dd>
-  * <dt>OPUS_FRAMESIZE_5_MS</dt><dd>Use 2.5 ms frames.</dd>
+  * <dt>OPUS_FRAMESIZE_5_MS</dt><dd>Use 5 ms frames.</dd>
   * <dt>OPUS_FRAMESIZE_10_MS</dt><dd>Use 10 ms frames.</dd>
   * <dt>OPUS_FRAMESIZE_20_MS</dt><dd>Use 20 ms frames.</dd>
   * <dt>OPUS_FRAMESIZE_40_MS</dt><dd>Use 40 ms frames.</dd>
@@ -589,10 +587,22 @@
 #define OPUS_GET_EXPERT_FRAME_DURATION(x) OPUS_GET_EXPERT_FRAME_DURATION_REQUEST, __opus_check_int_ptr(x)
 
 /** If set to 1, disables almost all use of prediction, making frames almost
-    completely independent. This reduces quality. (default : 0)
+  * completely independent. This reduces quality.
+  * @see OPUS_GET_PREDICTION_DISABLED
+  * @param[in] x <tt>opus_int32</tt>: Allowed values:
+  * <dl>
+  * <dt>0</dt><dd>Enable prediction (default).</dd>
+  * <dt>1</dt><dd>Disable prediction.</dd>
+  * </dl>
   * @hideinitializer */
 #define OPUS_SET_PREDICTION_DISABLED(x) OPUS_SET_PREDICTION_DISABLED_REQUEST, __opus_check_int(x)
 /** Gets the encoder's configured prediction status.
+  * @see OPUS_SET_PREDICTION_DISABLED
+  * @param[out] x <tt>opus_int32 *</tt>: Returns one of the following values:
+  * <dl>
+  * <dt>0</dt><dd>Prediction enabled (default).</dd>
+  * <dt>1</dt><dd>Prediction disabled.</dd>
+  * </dl>
   * @hideinitializer */
 #define OPUS_GET_PREDICTION_DISABLED(x) OPUS_GET_PREDICTION_DISABLED_REQUEST, __opus_check_int_ptr(x)
 
@@ -649,18 +659,6 @@
   * @hideinitializer */
 #define OPUS_GET_FINAL_RANGE(x) OPUS_GET_FINAL_RANGE_REQUEST, __opus_check_uint_ptr(x)
 
-/** Gets the pitch of the last decoded frame, if available.
-  * This can be used for any post-processing algorithm requiring the use of pitch,
-  * e.g. time stretching/shortening. If the last frame was not voiced, or if the
-  * pitch was not coded in the frame, then zero is returned.
-  *
-  * This CTL is only implemented for decoder instances.
-  *
-  * @param[out] x <tt>opus_int32 *</tt>: pitch period at 48 kHz (or 0 if not available)
-  *
-  * @hideinitializer */
-#define OPUS_GET_PITCH(x) OPUS_GET_PITCH_REQUEST, __opus_check_int_ptr(x)
-
 /** Gets the encoder's configured bandpass or the decoder's last bandpass.
   * @see OPUS_SET_BANDWIDTH
   * @param[out] x <tt>opus_int32 *</tt>: Returns one of the following values:
@@ -675,6 +673,14 @@
   * @hideinitializer */
 #define OPUS_GET_BANDWIDTH(x) OPUS_GET_BANDWIDTH_REQUEST, __opus_check_int_ptr(x)
 
+/** Gets the sampling rate the encoder or decoder was initialized with.
+  * This simply returns the <code>Fs</code> value passed to opus_encoder_init()
+  * or opus_decoder_init().
+  * @param[out] x <tt>opus_int32 *</tt>: Sampling rate of encoder or decoder.
+  * @hideinitializer
+  */
+#define OPUS_GET_SAMPLE_RATE(x) OPUS_GET_SAMPLE_RATE_REQUEST, __opus_check_int_ptr(x)
+
 /**@}*/
 
 /** @defgroup opus_decoderctls Decoder related CTLs
@@ -699,6 +705,23 @@
   * @hideinitializer */
 #define OPUS_GET_GAIN(x) OPUS_GET_GAIN_REQUEST, __opus_check_int_ptr(x)
 
+/** Gets the duration (in samples) of the last packet successfully decoded or concealed.
+  * @param[out] x <tt>opus_int32 *</tt>: Number of samples (at current sampling rate).
+  * @hideinitializer */
+#define OPUS_GET_LAST_PACKET_DURATION(x) OPUS_GET_LAST_PACKET_DURATION_REQUEST, __opus_check_int_ptr(x)
+
+/** Gets the pitch of the last decoded frame, if available.
+  * This can be used for any post-processing algorithm requiring the use of pitch,
+  * e.g. time stretching/shortening. If the last frame was not voiced, or if the
+  * pitch was not coded in the frame, then zero is returned.
+  *
+  * This CTL is only implemented for decoder instances.
+  *
+  * @param[out] x <tt>opus_int32 *</tt>: pitch period at 48 kHz (or 0 if not available)
+  *
+  * @hideinitializer */
+#define OPUS_GET_PITCH(x) OPUS_GET_PITCH_REQUEST, __opus_check_int_ptr(x)
+
 /**@}*/
 
 /** @defgroup opus_libinfo Opus library information functions
@@ -714,6 +737,10 @@
 
 /** Gets the libopus version string.
   *
+  * Applications may look for the substring "-fixed" in the version string to
+  * determine whether they have a fixed-point or floating-point build at
+  * runtime.
+  *
   * @returns Version string
   */
 OPUS_EXPORT const char *opus_get_version_string(void);
diff --git a/include/opus_multistream.h b/include/opus_multistream.h
index ae59979..47e0390 100644
--- a/include/opus_multistream.h
+++ b/include/opus_multistream.h
@@ -111,9 +111,9 @@
   * duration, can be computed without any special negotiation.
   *
   * The format for multistream Opus packets is defined in the
-  * <a href="http://tools.ietf.org/html/draft-terriberry-oggopus">Ogg
+  * <a href="https://tools.ietf.org/html/draft-ietf-codec-oggopus">Ogg
   * encapsulation specification</a> and is based on the self-delimited Opus
-  * framing described in Appendix B of <a href="http://tools.ietf.org/html/rfc6716">RFC 6716</a>.
+  * framing described in Appendix B of <a href="https://tools.ietf.org/html/rfc6716">RFC 6716</a>.
   * Normal Opus packets are just a degenerate case of multistream Opus packets,
   * and can be encoded or decoded with the multistream API by setting
   * <code>streams</code> to <code>1</code> when initializing the encoder or
@@ -140,7 +140,7 @@
   *
   * The output channels specified by the encoder
   * should use the
-  * <a href="http://www.xiph.org/vorbis/doc/Vorbis_I_spec.html#x1-800004.3.9">Vorbis
+  * <a href="https://www.xiph.org/vorbis/doc/Vorbis_I_spec.html#x1-810004.3.9">Vorbis
   * channel ordering</a>. A decoder may wish to apply an additional permutation
   * to the mapping the encoder used to achieve a different output channel
   * order (e.g. for outputing in WAV order).
diff --git a/install-sh b/install-sh
index a9244eb..0b0fdcb 100755
--- a/install-sh
+++ b/install-sh
@@ -1,7 +1,7 @@
 #!/bin/sh
 # install - install a program, script, or datafile
 
-scriptversion=2011-01-19.21; # UTC
+scriptversion=2013-12-25.23; # UTC
 
 # This originates from X11R5 (mit/util/scripts/install.sh), which was
 # later released in X11R6 (xc/config/util/install.sh) with the
@@ -35,25 +35,21 @@
 # FSF changes to this file are in the public domain.
 #
 # Calling this script install-sh is preferred over install.sh, to prevent
-# `make' implicit rules from creating a file called install from it
+# 'make' implicit rules from creating a file called install from it
 # when there is no Makefile.
 #
 # This script is compatible with the BSD install script, but was written
 # from scratch.
 
+tab='	'
 nl='
 '
-IFS=" ""	$nl"
+IFS=" $tab$nl"
 
-# set DOITPROG to echo to test this script
+# Set DOITPROG to "echo" to test this script.
 
-# Don't use :- since 4.3BSD and earlier shells don't like it.
 doit=${DOITPROG-}
-if test -z "$doit"; then
-  doit_exec=exec
-else
-  doit_exec=$doit
-fi
+doit_exec=${doit:-exec}
 
 # Put in absolute file names if you don't have them in your path;
 # or use environment vars.
@@ -68,17 +64,6 @@
 rmprog=${RMPROG-rm}
 stripprog=${STRIPPROG-strip}
 
-posix_glob='?'
-initialize_posix_glob='
-  test "$posix_glob" != "?" || {
-    if (set -f) 2>/dev/null; then
-      posix_glob=
-    else
-      posix_glob=:
-    fi
-  }
-'
-
 posix_mkdir=
 
 # Desired mode of installed file.
@@ -97,7 +82,7 @@
 dst_arg=
 
 copy_on_change=false
-no_target_directory=
+is_target_a_directory=possibly
 
 usage="\
 Usage: $0 [OPTION]... [-T] SRCFILE DSTFILE
@@ -137,46 +122,57 @@
     -d) dir_arg=true;;
 
     -g) chgrpcmd="$chgrpprog $2"
-	shift;;
+        shift;;
 
     --help) echo "$usage"; exit $?;;
 
     -m) mode=$2
-	case $mode in
-	  *' '* | *'	'* | *'
-'*	  | *'*'* | *'?'* | *'['*)
-	    echo "$0: invalid mode: $mode" >&2
-	    exit 1;;
-	esac
-	shift;;
+        case $mode in
+          *' '* | *"$tab"* | *"$nl"* | *'*'* | *'?'* | *'['*)
+            echo "$0: invalid mode: $mode" >&2
+            exit 1;;
+        esac
+        shift;;
 
     -o) chowncmd="$chownprog $2"
-	shift;;
+        shift;;
 
     -s) stripcmd=$stripprog;;
 
-    -t) dst_arg=$2
-	# Protect names problematic for `test' and other utilities.
-	case $dst_arg in
-	  -* | [=\(\)!]) dst_arg=./$dst_arg;;
-	esac
-	shift;;
+    -t)
+        is_target_a_directory=always
+        dst_arg=$2
+        # Protect names problematic for 'test' and other utilities.
+        case $dst_arg in
+          -* | [=\(\)!]) dst_arg=./$dst_arg;;
+        esac
+        shift;;
 
-    -T) no_target_directory=true;;
+    -T) is_target_a_directory=never;;
 
     --version) echo "$0 $scriptversion"; exit $?;;
 
-    --)	shift
-	break;;
+    --) shift
+        break;;
 
-    -*)	echo "$0: invalid option: $1" >&2
-	exit 1;;
+    -*) echo "$0: invalid option: $1" >&2
+        exit 1;;
 
     *)  break;;
   esac
   shift
 done
 
+# We allow the use of options -d and -T together, by making -d
+# take precedence; this is for compatibility with GNU install.
+
+if test -n "$dir_arg"; then
+  if test -n "$dst_arg"; then
+    echo "$0: target directory not allowed when installing a directory." >&2
+    exit 1
+  fi
+fi
+
 if test $# -ne 0 && test -z "$dir_arg$dst_arg"; then
   # When -d is used, all remaining arguments are directories to create.
   # When -t is used, the destination is already specified.
@@ -190,7 +186,7 @@
     fi
     shift # arg
     dst_arg=$arg
-    # Protect names problematic for `test' and other utilities.
+    # Protect names problematic for 'test' and other utilities.
     case $dst_arg in
       -* | [=\(\)!]) dst_arg=./$dst_arg;;
     esac
@@ -202,12 +198,21 @@
     echo "$0: no input file specified." >&2
     exit 1
   fi
-  # It's OK to call `install-sh -d' without argument.
+  # It's OK to call 'install-sh -d' without argument.
   # This can happen when creating conditional directories.
   exit 0
 fi
 
 if test -z "$dir_arg"; then
+  if test $# -gt 1 || test "$is_target_a_directory" = always; then
+    if test ! -d "$dst_arg"; then
+      echo "$0: $dst_arg: Is not a directory." >&2
+      exit 1
+    fi
+  fi
+fi
+
+if test -z "$dir_arg"; then
   do_exit='(exit $ret); exit $ret'
   trap "ret=129; $do_exit" 1
   trap "ret=130; $do_exit" 2
@@ -223,16 +228,16 @@
 
     *[0-7])
       if test -z "$stripcmd"; then
-	u_plus_rw=
+        u_plus_rw=
       else
-	u_plus_rw='% 200'
+        u_plus_rw='% 200'
       fi
       cp_umask=`expr '(' 777 - $mode % 1000 ')' $u_plus_rw`;;
     *)
       if test -z "$stripcmd"; then
-	u_plus_rw=
+        u_plus_rw=
       else
-	u_plus_rw=,u+rw
+        u_plus_rw=,u+rw
       fi
       cp_umask=$mode$u_plus_rw;;
   esac
@@ -240,7 +245,7 @@
 
 for src
 do
-  # Protect names problematic for `test' and other utilities.
+  # Protect names problematic for 'test' and other utilities.
   case $src in
     -* | [=\(\)!]) src=./$src;;
   esac
@@ -269,41 +274,15 @@
     # If destination is a directory, append the input filename; won't work
     # if double slashes aren't ignored.
     if test -d "$dst"; then
-      if test -n "$no_target_directory"; then
-	echo "$0: $dst_arg: Is a directory" >&2
-	exit 1
+      if test "$is_target_a_directory" = never; then
+        echo "$0: $dst_arg: Is a directory" >&2
+        exit 1
       fi
       dstdir=$dst
       dst=$dstdir/`basename "$src"`
       dstdir_status=0
     else
-      # Prefer dirname, but fall back on a substitute if dirname fails.
-      dstdir=`
-	(dirname "$dst") 2>/dev/null ||
-	expr X"$dst" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \
-	     X"$dst" : 'X\(//\)[^/]' \| \
-	     X"$dst" : 'X\(//\)$' \| \
-	     X"$dst" : 'X\(/\)' \| . 2>/dev/null ||
-	echo X"$dst" |
-	    sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{
-		   s//\1/
-		   q
-		 }
-		 /^X\(\/\/\)[^/].*/{
-		   s//\1/
-		   q
-		 }
-		 /^X\(\/\/\)$/{
-		   s//\1/
-		   q
-		 }
-		 /^X\(\/\).*/{
-		   s//\1/
-		   q
-		 }
-		 s/.*/./; q'
-      `
-
+      dstdir=`dirname "$dst"`
       test -d "$dstdir"
       dstdir_status=$?
     fi
@@ -314,74 +293,74 @@
   if test $dstdir_status != 0; then
     case $posix_mkdir in
       '')
-	# Create intermediate dirs using mode 755 as modified by the umask.
-	# This is like FreeBSD 'install' as of 1997-10-28.
-	umask=`umask`
-	case $stripcmd.$umask in
-	  # Optimize common cases.
-	  *[2367][2367]) mkdir_umask=$umask;;
-	  .*0[02][02] | .[02][02] | .[02]) mkdir_umask=22;;
+        # Create intermediate dirs using mode 755 as modified by the umask.
+        # This is like FreeBSD 'install' as of 1997-10-28.
+        umask=`umask`
+        case $stripcmd.$umask in
+          # Optimize common cases.
+          *[2367][2367]) mkdir_umask=$umask;;
+          .*0[02][02] | .[02][02] | .[02]) mkdir_umask=22;;
 
-	  *[0-7])
-	    mkdir_umask=`expr $umask + 22 \
-	      - $umask % 100 % 40 + $umask % 20 \
-	      - $umask % 10 % 4 + $umask % 2
-	    `;;
-	  *) mkdir_umask=$umask,go-w;;
-	esac
+          *[0-7])
+            mkdir_umask=`expr $umask + 22 \
+              - $umask % 100 % 40 + $umask % 20 \
+              - $umask % 10 % 4 + $umask % 2
+            `;;
+          *) mkdir_umask=$umask,go-w;;
+        esac
 
-	# With -d, create the new directory with the user-specified mode.
-	# Otherwise, rely on $mkdir_umask.
-	if test -n "$dir_arg"; then
-	  mkdir_mode=-m$mode
-	else
-	  mkdir_mode=
-	fi
+        # With -d, create the new directory with the user-specified mode.
+        # Otherwise, rely on $mkdir_umask.
+        if test -n "$dir_arg"; then
+          mkdir_mode=-m$mode
+        else
+          mkdir_mode=
+        fi
 
-	posix_mkdir=false
-	case $umask in
-	  *[123567][0-7][0-7])
-	    # POSIX mkdir -p sets u+wx bits regardless of umask, which
-	    # is incompatible with FreeBSD 'install' when (umask & 300) != 0.
-	    ;;
-	  *)
-	    tmpdir=${TMPDIR-/tmp}/ins$RANDOM-$$
-	    trap 'ret=$?; rmdir "$tmpdir/d" "$tmpdir" 2>/dev/null; exit $ret' 0
+        posix_mkdir=false
+        case $umask in
+          *[123567][0-7][0-7])
+            # POSIX mkdir -p sets u+wx bits regardless of umask, which
+            # is incompatible with FreeBSD 'install' when (umask & 300) != 0.
+            ;;
+          *)
+            tmpdir=${TMPDIR-/tmp}/ins$RANDOM-$$
+            trap 'ret=$?; rmdir "$tmpdir/d" "$tmpdir" 2>/dev/null; exit $ret' 0
 
-	    if (umask $mkdir_umask &&
-		exec $mkdirprog $mkdir_mode -p -- "$tmpdir/d") >/dev/null 2>&1
-	    then
-	      if test -z "$dir_arg" || {
-		   # Check for POSIX incompatibilities with -m.
-		   # HP-UX 11.23 and IRIX 6.5 mkdir -m -p sets group- or
-		   # other-writeable bit of parent directory when it shouldn't.
-		   # FreeBSD 6.1 mkdir -m -p sets mode of existing directory.
-		   ls_ld_tmpdir=`ls -ld "$tmpdir"`
-		   case $ls_ld_tmpdir in
-		     d????-?r-*) different_mode=700;;
-		     d????-?--*) different_mode=755;;
-		     *) false;;
-		   esac &&
-		   $mkdirprog -m$different_mode -p -- "$tmpdir" && {
-		     ls_ld_tmpdir_1=`ls -ld "$tmpdir"`
-		     test "$ls_ld_tmpdir" = "$ls_ld_tmpdir_1"
-		   }
-		 }
-	      then posix_mkdir=:
-	      fi
-	      rmdir "$tmpdir/d" "$tmpdir"
-	    else
-	      # Remove any dirs left behind by ancient mkdir implementations.
-	      rmdir ./$mkdir_mode ./-p ./-- 2>/dev/null
-	    fi
-	    trap '' 0;;
-	esac;;
+            if (umask $mkdir_umask &&
+                exec $mkdirprog $mkdir_mode -p -- "$tmpdir/d") >/dev/null 2>&1
+            then
+              if test -z "$dir_arg" || {
+                   # Check for POSIX incompatibilities with -m.
+                   # HP-UX 11.23 and IRIX 6.5 mkdir -m -p sets group- or
+                   # other-writable bit of parent directory when it shouldn't.
+                   # FreeBSD 6.1 mkdir -m -p sets mode of existing directory.
+                   ls_ld_tmpdir=`ls -ld "$tmpdir"`
+                   case $ls_ld_tmpdir in
+                     d????-?r-*) different_mode=700;;
+                     d????-?--*) different_mode=755;;
+                     *) false;;
+                   esac &&
+                   $mkdirprog -m$different_mode -p -- "$tmpdir" && {
+                     ls_ld_tmpdir_1=`ls -ld "$tmpdir"`
+                     test "$ls_ld_tmpdir" = "$ls_ld_tmpdir_1"
+                   }
+                 }
+              then posix_mkdir=:
+              fi
+              rmdir "$tmpdir/d" "$tmpdir"
+            else
+              # Remove any dirs left behind by ancient mkdir implementations.
+              rmdir ./$mkdir_mode ./-p ./-- 2>/dev/null
+            fi
+            trap '' 0;;
+        esac;;
     esac
 
     if
       $posix_mkdir && (
-	umask $mkdir_umask &&
-	$doit_exec $mkdirprog $mkdir_mode -p -- "$dstdir"
+        umask $mkdir_umask &&
+        $doit_exec $mkdirprog $mkdir_mode -p -- "$dstdir"
       )
     then :
     else
@@ -391,53 +370,51 @@
       # directory the slow way, step by step, checking for races as we go.
 
       case $dstdir in
-	/*) prefix='/';;
-	[-=\(\)!]*) prefix='./';;
-	*)  prefix='';;
+        /*) prefix='/';;
+        [-=\(\)!]*) prefix='./';;
+        *)  prefix='';;
       esac
 
-      eval "$initialize_posix_glob"
-
       oIFS=$IFS
       IFS=/
-      $posix_glob set -f
+      set -f
       set fnord $dstdir
       shift
-      $posix_glob set +f
+      set +f
       IFS=$oIFS
 
       prefixes=
 
       for d
       do
-	test X"$d" = X && continue
+        test X"$d" = X && continue
 
-	prefix=$prefix$d
-	if test -d "$prefix"; then
-	  prefixes=
-	else
-	  if $posix_mkdir; then
-	    (umask=$mkdir_umask &&
-	     $doit_exec $mkdirprog $mkdir_mode -p -- "$dstdir") && break
-	    # Don't fail if two instances are running concurrently.
-	    test -d "$prefix" || exit 1
-	  else
-	    case $prefix in
-	      *\'*) qprefix=`echo "$prefix" | sed "s/'/'\\\\\\\\''/g"`;;
-	      *) qprefix=$prefix;;
-	    esac
-	    prefixes="$prefixes '$qprefix'"
-	  fi
-	fi
-	prefix=$prefix/
+        prefix=$prefix$d
+        if test -d "$prefix"; then
+          prefixes=
+        else
+          if $posix_mkdir; then
+            (umask=$mkdir_umask &&
+             $doit_exec $mkdirprog $mkdir_mode -p -- "$dstdir") && break
+            # Don't fail if two instances are running concurrently.
+            test -d "$prefix" || exit 1
+          else
+            case $prefix in
+              *\'*) qprefix=`echo "$prefix" | sed "s/'/'\\\\\\\\''/g"`;;
+              *) qprefix=$prefix;;
+            esac
+            prefixes="$prefixes '$qprefix'"
+          fi
+        fi
+        prefix=$prefix/
       done
 
       if test -n "$prefixes"; then
-	# Don't fail if two instances are running concurrently.
-	(umask $mkdir_umask &&
-	 eval "\$doit_exec \$mkdirprog $prefixes") ||
-	  test -d "$dstdir" || exit 1
-	obsolete_mkdir_used=true
+        # Don't fail if two instances are running concurrently.
+        (umask $mkdir_umask &&
+         eval "\$doit_exec \$mkdirprog $prefixes") ||
+          test -d "$dstdir" || exit 1
+        obsolete_mkdir_used=true
       fi
     fi
   fi
@@ -472,15 +449,12 @@
 
     # If -C, don't bother to copy if it wouldn't change the file.
     if $copy_on_change &&
-       old=`LC_ALL=C ls -dlL "$dst"	2>/dev/null` &&
-       new=`LC_ALL=C ls -dlL "$dsttmp"	2>/dev/null` &&
-
-       eval "$initialize_posix_glob" &&
-       $posix_glob set -f &&
+       old=`LC_ALL=C ls -dlL "$dst"     2>/dev/null` &&
+       new=`LC_ALL=C ls -dlL "$dsttmp"  2>/dev/null` &&
+       set -f &&
        set X $old && old=:$2:$4:$5:$6 &&
        set X $new && new=:$2:$4:$5:$6 &&
-       $posix_glob set +f &&
-
+       set +f &&
        test "$old" = "$new" &&
        $cmpprog "$dst" "$dsttmp" >/dev/null 2>&1
     then
@@ -493,24 +467,24 @@
       # to itself, or perhaps because mv is so ancient that it does not
       # support -f.
       {
-	# Now remove or move aside any old file at destination location.
-	# We try this two ways since rm can't unlink itself on some
-	# systems and the destination file might be busy for other
-	# reasons.  In this case, the final cleanup might fail but the new
-	# file should still install successfully.
-	{
-	  test ! -f "$dst" ||
-	  $doit $rmcmd -f "$dst" 2>/dev/null ||
-	  { $doit $mvcmd -f "$dst" "$rmtmp" 2>/dev/null &&
-	    { $doit $rmcmd -f "$rmtmp" 2>/dev/null; :; }
-	  } ||
-	  { echo "$0: cannot unlink or rename $dst" >&2
-	    (exit 1); exit 1
-	  }
-	} &&
+        # Now remove or move aside any old file at destination location.
+        # We try this two ways since rm can't unlink itself on some
+        # systems and the destination file might be busy for other
+        # reasons.  In this case, the final cleanup might fail but the new
+        # file should still install successfully.
+        {
+          test ! -f "$dst" ||
+          $doit $rmcmd -f "$dst" 2>/dev/null ||
+          { $doit $mvcmd -f "$dst" "$rmtmp" 2>/dev/null &&
+            { $doit $rmcmd -f "$rmtmp" 2>/dev/null; :; }
+          } ||
+          { echo "$0: cannot unlink or rename $dst" >&2
+            (exit 1); exit 1
+          }
+        } &&
 
-	# Now rename the file to the real destination.
-	$doit $mvcmd "$dsttmp" "$dst"
+        # Now rename the file to the real destination.
+        $doit $mvcmd "$dsttmp" "$dst"
       }
     fi || exit 1
 
diff --git a/m4/libtool.m4 b/m4/libtool.m4
index 56666f0..f12cfdf 100644
--- a/m4/libtool.m4
+++ b/m4/libtool.m4
@@ -1312,7 +1312,7 @@
   rm -rf conftest*
   ;;
 
-x86_64-*kfreebsd*-gnu|x86_64-*linux*|ppc*-*linux*|powerpc*-*linux*| \
+x86_64-*kfreebsd*-gnu|x86_64-*linux*|powerpc*-*linux*| \
 s390*-*linux*|s390*-*tpf*|sparc*-*linux*)
   # Find out which ABI we are using.
   echo 'int i;' > conftest.$ac_ext
@@ -1326,7 +1326,10 @@
 	  x86_64-*linux*)
 	    LD="${LD-ld} -m elf_i386"
 	    ;;
-	  ppc64-*linux*|powerpc64-*linux*)
+	  powerpc64le-*linux*)
+	    LD="${LD-ld} -m elf32lppclinux"
+	    ;;
+	  powerpc64-*linux*)
 	    LD="${LD-ld} -m elf32ppclinux"
 	    ;;
 	  s390x-*linux*)
@@ -1345,7 +1348,10 @@
 	  x86_64-*linux*)
 	    LD="${LD-ld} -m elf_x86_64"
 	    ;;
-	  ppc*-*linux*|powerpc*-*linux*)
+	  powerpcle-*linux*)
+	    LD="${LD-ld} -m elf64lppc"
+	    ;;
+	  powerpc-*linux*)
 	    LD="${LD-ld} -m elf64ppc"
 	    ;;
 	  s390*-*linux*|s390*-*tpf*)
diff --git a/m4/opus-intrinsics.m4 b/m4/opus-intrinsics.m4
new file mode 100644
index 0000000..b93ddd3
--- /dev/null
+++ b/m4/opus-intrinsics.m4
@@ -0,0 +1,29 @@
+dnl opus-intrinsics.m4
+dnl macro for testing for support for compiler intrinsics, either by default or with a compiler flag
+
+dnl OPUS_CHECK_INTRINSICS(NAME-OF-INTRINSICS, COMPILER-FLAG-FOR-INTRINSICS, VAR-IF-PRESENT, VAR-IF-DEFAULT, TEST-PROGRAM-HEADER, TEST-PROGRAM-BODY)
+AC_DEFUN([OPUS_CHECK_INTRINSICS],
+[
+   AC_MSG_CHECKING([if compiler supports $1 intrinsics])
+   AC_LINK_IFELSE(
+     [AC_LANG_PROGRAM($5, $6)],
+     [
+        $3=1
+        $4=1
+        AC_MSG_RESULT([yes])
+      ],[
+        $4=0
+        AC_MSG_RESULT([no])
+        AC_MSG_CHECKING([if compiler supports $1 intrinsics with $2])
+        save_CFLAGS="$CFLAGS"; CFLAGS="$2 $CFLAGS"
+        AC_LINK_IFELSE([AC_LANG_PROGRAM($5, $6)],
+        [
+           AC_MSG_RESULT([yes])
+           $3=1
+        ],[
+           AC_MSG_RESULT([no])
+           $3=0
+        ])
+        CFLAGS="$save_CFLAGS"
+     ])
+])
diff --git a/missing b/missing
index 86a8fc3..f62bbae 100755
--- a/missing
+++ b/missing
@@ -1,11 +1,10 @@
 #! /bin/sh
-# Common stub for a few missing GNU programs while installing.
+# Common wrapper for a few potentially missing GNU programs.
 
-scriptversion=2012-01-06.13; # UTC
+scriptversion=2013-10-28.13; # UTC
 
-# Copyright (C) 1996, 1997, 1999, 2000, 2002, 2003, 2004, 2005, 2006,
-# 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
-# Originally by Fran,cois Pinard <pinard@iro.umontreal.ca>, 1996.
+# Copyright (C) 1996-2014 Free Software Foundation, Inc.
+# Originally written by Fran,cois Pinard <pinard@iro.umontreal.ca>, 1996.
 
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -26,68 +25,40 @@
 # the same distribution terms that you use for the rest of that program.
 
 if test $# -eq 0; then
-  echo 1>&2 "Try \`$0 --help' for more information"
+  echo 1>&2 "Try '$0 --help' for more information"
   exit 1
 fi
 
-run=:
-sed_output='s/.* --output[ =]\([^ ]*\).*/\1/p'
-sed_minuso='s/.* -o \([^ ]*\).*/\1/p'
-
-# In the cases where this matters, `missing' is being run in the
-# srcdir already.
-if test -f configure.ac; then
-  configure_ac=configure.ac
-else
-  configure_ac=configure.in
-fi
-
-msg="missing on your system"
-
 case $1 in
---run)
-  # Try to run requested program, and just exit if it succeeds.
-  run=
-  shift
-  "$@" && exit 0
-  # Exit code 63 means version mismatch.  This often happens
-  # when the user try to use an ancient version of a tool on
-  # a file that requires a minimum version.  In this case we
-  # we should proceed has if the program had been absent, or
-  # if --run hadn't been passed.
-  if test $? = 63; then
-    run=:
-    msg="probably too old"
-  fi
-  ;;
+
+  --is-lightweight)
+    # Used by our autoconf macros to check whether the available missing
+    # script is modern enough.
+    exit 0
+    ;;
+
+  --run)
+    # Back-compat with the calling convention used by older automake.
+    shift
+    ;;
 
   -h|--h|--he|--hel|--help)
     echo "\
 $0 [OPTION]... PROGRAM [ARGUMENT]...
 
-Handle \`PROGRAM [ARGUMENT]...' for when PROGRAM is missing, or return an
-error status if there is no known handling for PROGRAM.
+Run 'PROGRAM [ARGUMENT]...', returning a proper advice when this fails due
+to PROGRAM being missing or too old.
 
 Options:
   -h, --help      display this help and exit
   -v, --version   output version information and exit
-  --run           try to run the given command, and emulate it if it fails
 
 Supported PROGRAM values:
-  aclocal      touch file \`aclocal.m4'
-  autoconf     touch file \`configure'
-  autoheader   touch file \`config.h.in'
-  autom4te     touch the output file, or create a stub one
-  automake     touch all \`Makefile.in' files
-  bison        create \`y.tab.[ch]', if possible, from existing .[ch]
-  flex         create \`lex.yy.c', if possible, from existing .c
-  help2man     touch the output file
-  lex          create \`lex.yy.c', if possible, from existing .c
-  makeinfo     touch the output file
-  yacc         create \`y.tab.[ch]', if possible, from existing .[ch]
+  aclocal   autoconf  autoheader   autom4te  automake  makeinfo
+  bison     yacc      flex         lex       help2man
 
-Version suffixes to PROGRAM as well as the prefixes \`gnu-', \`gnu', and
-\`g' are ignored when checking the name.
+Version suffixes to PROGRAM as well as the prefixes 'gnu-', 'gnu', and
+'g' are ignored when checking the name.
 
 Send bug reports to <bug-automake@gnu.org>."
     exit $?
@@ -99,228 +70,141 @@
     ;;
 
   -*)
-    echo 1>&2 "$0: Unknown \`$1' option"
-    echo 1>&2 "Try \`$0 --help' for more information"
+    echo 1>&2 "$0: unknown '$1' option"
+    echo 1>&2 "Try '$0 --help' for more information"
     exit 1
     ;;
 
 esac
 
-# normalize program name to check for.
-program=`echo "$1" | sed '
-  s/^gnu-//; t
-  s/^gnu//; t
-  s/^g//; t'`
+# Run the given program, remember its exit status.
+"$@"; st=$?
 
-# Now exit if we have it, but it failed.  Also exit now if we
-# don't have it and --version was passed (most likely to detect
-# the program).  This is about non-GNU programs, so use $1 not
-# $program.
-case $1 in
-  lex*|yacc*)
-    # Not GNU programs, they don't have --version.
+# If it succeeded, we are done.
+test $st -eq 0 && exit 0
+
+# Also exit now if it failed (or wasn't found) and '--version' or '--help'
+# was passed; such an option is most likely passed to detect whether the
+# program is present and works.
+case $2 in --version|--help) exit $st;; esac
+
+# Exit code 63 means version mismatch.  This often happens when the user
+# tries to use an ancient version of a tool on a file that requires a
+# minimum version.
+if test $st -eq 63; then
+  msg="probably too old"
+elif test $st -eq 127; then
+  # Program was missing.
+  msg="missing on your system"
+else
+  # Program was found and executed, but failed.  Give up.
+  exit $st
+fi
+
+perl_URL=http://www.perl.org/
+flex_URL=http://flex.sourceforge.net/
+gnu_software_URL=http://www.gnu.org/software
+
+program_details ()
+{
+  case $1 in
+    aclocal|automake)
+      echo "The '$1' program is part of the GNU Automake package:"
+      echo "<$gnu_software_URL/automake>"
+      echo "It also requires GNU Autoconf, GNU m4 and Perl in order to run:"
+      echo "<$gnu_software_URL/autoconf>"
+      echo "<$gnu_software_URL/m4/>"
+      echo "<$perl_URL>"
+      ;;
+    autoconf|autom4te|autoheader)
+      echo "The '$1' program is part of the GNU Autoconf package:"
+      echo "<$gnu_software_URL/autoconf/>"
+      echo "It also requires GNU m4 and Perl in order to run:"
+      echo "<$gnu_software_URL/m4/>"
+      echo "<$perl_URL>"
+      ;;
+  esac
+}
+
+give_advice ()
+{
+  # Normalize program name to check for.
+  normalized_program=`echo "$1" | sed '
+    s/^gnu-//; t
+    s/^gnu//; t
+    s/^g//; t'`
+
+  printf '%s\n' "'$1' is $msg."
+
+  configure_deps="'configure.ac' or m4 files included by 'configure.ac'"
+  case $normalized_program in
+    autoconf*)
+      echo "You should only need it if you modified 'configure.ac',"
+      echo "or m4 files included by it."
+      program_details 'autoconf'
+      ;;
+    autoheader*)
+      echo "You should only need it if you modified 'acconfig.h' or"
+      echo "$configure_deps."
+      program_details 'autoheader'
+      ;;
+    automake*)
+      echo "You should only need it if you modified 'Makefile.am' or"
+      echo "$configure_deps."
+      program_details 'automake'
+      ;;
+    aclocal*)
+      echo "You should only need it if you modified 'acinclude.m4' or"
+      echo "$configure_deps."
+      program_details 'aclocal'
+      ;;
+   autom4te*)
+      echo "You might have modified some maintainer files that require"
+      echo "the 'autom4te' program to be rebuilt."
+      program_details 'autom4te'
+      ;;
+    bison*|yacc*)
+      echo "You should only need it if you modified a '.y' file."
+      echo "You may want to install the GNU Bison package:"
+      echo "<$gnu_software_URL/bison/>"
+      ;;
+    lex*|flex*)
+      echo "You should only need it if you modified a '.l' file."
+      echo "You may want to install the Fast Lexical Analyzer package:"
+      echo "<$flex_URL>"
+      ;;
+    help2man*)
+      echo "You should only need it if you modified a dependency" \
+           "of a man page."
+      echo "You may want to install the GNU Help2man package:"
+      echo "<$gnu_software_URL/help2man/>"
     ;;
+    makeinfo*)
+      echo "You should only need it if you modified a '.texi' file, or"
+      echo "any other file indirectly affecting the aspect of the manual."
+      echo "You might want to install the Texinfo package:"
+      echo "<$gnu_software_URL/texinfo/>"
+      echo "The spurious makeinfo call might also be the consequence of"
+      echo "using a buggy 'make' (AIX, DU, IRIX), in which case you might"
+      echo "want to install GNU make:"
+      echo "<$gnu_software_URL/make/>"
+      ;;
+    *)
+      echo "You might have modified some files without having the proper"
+      echo "tools for further handling them.  Check the 'README' file, it"
+      echo "often tells you about the needed prerequisites for installing"
+      echo "this package.  You may also peek at any GNU archive site, in"
+      echo "case some other package contains this missing '$1' program."
+      ;;
+  esac
+}
 
-  *)
-    if test -z "$run" && ($1 --version) > /dev/null 2>&1; then
-       # We have it, but it failed.
-       exit 1
-    elif test "x$2" = "x--version" || test "x$2" = "x--help"; then
-       # Could not run --version or --help.  This is probably someone
-       # running `$TOOL --version' or `$TOOL --help' to check whether
-       # $TOOL exists and not knowing $TOOL uses missing.
-       exit 1
-    fi
-    ;;
-esac
+give_advice "$1" | sed -e '1s/^/WARNING: /' \
+                       -e '2,$s/^/         /' >&2
 
-# If it does not exist, or fails to run (possibly an outdated version),
-# try to emulate it.
-case $program in
-  aclocal*)
-    echo 1>&2 "\
-WARNING: \`$1' is $msg.  You should only need it if
-         you modified \`acinclude.m4' or \`${configure_ac}'.  You might want
-         to install the \`Automake' and \`Perl' packages.  Grab them from
-         any GNU archive site."
-    touch aclocal.m4
-    ;;
-
-  autoconf*)
-    echo 1>&2 "\
-WARNING: \`$1' is $msg.  You should only need it if
-         you modified \`${configure_ac}'.  You might want to install the
-         \`Autoconf' and \`GNU m4' packages.  Grab them from any GNU
-         archive site."
-    touch configure
-    ;;
-
-  autoheader*)
-    echo 1>&2 "\
-WARNING: \`$1' is $msg.  You should only need it if
-         you modified \`acconfig.h' or \`${configure_ac}'.  You might want
-         to install the \`Autoconf' and \`GNU m4' packages.  Grab them
-         from any GNU archive site."
-    files=`sed -n 's/^[ ]*A[CM]_CONFIG_HEADER(\([^)]*\)).*/\1/p' ${configure_ac}`
-    test -z "$files" && files="config.h"
-    touch_files=
-    for f in $files; do
-      case $f in
-      *:*) touch_files="$touch_files "`echo "$f" |
-				       sed -e 's/^[^:]*://' -e 's/:.*//'`;;
-      *) touch_files="$touch_files $f.in";;
-      esac
-    done
-    touch $touch_files
-    ;;
-
-  automake*)
-    echo 1>&2 "\
-WARNING: \`$1' is $msg.  You should only need it if
-         you modified \`Makefile.am', \`acinclude.m4' or \`${configure_ac}'.
-         You might want to install the \`Automake' and \`Perl' packages.
-         Grab them from any GNU archive site."
-    find . -type f -name Makefile.am -print |
-	   sed 's/\.am$/.in/' |
-	   while read f; do touch "$f"; done
-    ;;
-
-  autom4te*)
-    echo 1>&2 "\
-WARNING: \`$1' is needed, but is $msg.
-         You might have modified some files without having the
-         proper tools for further handling them.
-         You can get \`$1' as part of \`Autoconf' from any GNU
-         archive site."
-
-    file=`echo "$*" | sed -n "$sed_output"`
-    test -z "$file" && file=`echo "$*" | sed -n "$sed_minuso"`
-    if test -f "$file"; then
-	touch $file
-    else
-	test -z "$file" || exec >$file
-	echo "#! /bin/sh"
-	echo "# Created by GNU Automake missing as a replacement of"
-	echo "#  $ $@"
-	echo "exit 0"
-	chmod +x $file
-	exit 1
-    fi
-    ;;
-
-  bison*|yacc*)
-    echo 1>&2 "\
-WARNING: \`$1' $msg.  You should only need it if
-         you modified a \`.y' file.  You may need the \`Bison' package
-         in order for those modifications to take effect.  You can get
-         \`Bison' from any GNU archive site."
-    rm -f y.tab.c y.tab.h
-    if test $# -ne 1; then
-        eval LASTARG=\${$#}
-	case $LASTARG in
-	*.y)
-	    SRCFILE=`echo "$LASTARG" | sed 's/y$/c/'`
-	    if test -f "$SRCFILE"; then
-	         cp "$SRCFILE" y.tab.c
-	    fi
-	    SRCFILE=`echo "$LASTARG" | sed 's/y$/h/'`
-	    if test -f "$SRCFILE"; then
-	         cp "$SRCFILE" y.tab.h
-	    fi
-	  ;;
-	esac
-    fi
-    if test ! -f y.tab.h; then
-	echo >y.tab.h
-    fi
-    if test ! -f y.tab.c; then
-	echo 'main() { return 0; }' >y.tab.c
-    fi
-    ;;
-
-  lex*|flex*)
-    echo 1>&2 "\
-WARNING: \`$1' is $msg.  You should only need it if
-         you modified a \`.l' file.  You may need the \`Flex' package
-         in order for those modifications to take effect.  You can get
-         \`Flex' from any GNU archive site."
-    rm -f lex.yy.c
-    if test $# -ne 1; then
-        eval LASTARG=\${$#}
-	case $LASTARG in
-	*.l)
-	    SRCFILE=`echo "$LASTARG" | sed 's/l$/c/'`
-	    if test -f "$SRCFILE"; then
-	         cp "$SRCFILE" lex.yy.c
-	    fi
-	  ;;
-	esac
-    fi
-    if test ! -f lex.yy.c; then
-	echo 'main() { return 0; }' >lex.yy.c
-    fi
-    ;;
-
-  help2man*)
-    echo 1>&2 "\
-WARNING: \`$1' is $msg.  You should only need it if
-	 you modified a dependency of a manual page.  You may need the
-	 \`Help2man' package in order for those modifications to take
-	 effect.  You can get \`Help2man' from any GNU archive site."
-
-    file=`echo "$*" | sed -n "$sed_output"`
-    test -z "$file" && file=`echo "$*" | sed -n "$sed_minuso"`
-    if test -f "$file"; then
-	touch $file
-    else
-	test -z "$file" || exec >$file
-	echo ".ab help2man is required to generate this page"
-	exit $?
-    fi
-    ;;
-
-  makeinfo*)
-    echo 1>&2 "\
-WARNING: \`$1' is $msg.  You should only need it if
-         you modified a \`.texi' or \`.texinfo' file, or any other file
-         indirectly affecting the aspect of the manual.  The spurious
-         call might also be the consequence of using a buggy \`make' (AIX,
-         DU, IRIX).  You might want to install the \`Texinfo' package or
-         the \`GNU make' package.  Grab either from any GNU archive site."
-    # The file to touch is that specified with -o ...
-    file=`echo "$*" | sed -n "$sed_output"`
-    test -z "$file" && file=`echo "$*" | sed -n "$sed_minuso"`
-    if test -z "$file"; then
-      # ... or it is the one specified with @setfilename ...
-      infile=`echo "$*" | sed 's/.* \([^ ]*\) *$/\1/'`
-      file=`sed -n '
-	/^@setfilename/{
-	  s/.* \([^ ]*\) *$/\1/
-	  p
-	  q
-	}' $infile`
-      # ... or it is derived from the source name (dir/f.texi becomes f.info)
-      test -z "$file" && file=`echo "$infile" | sed 's,.*/,,;s,.[^.]*$,,'`.info
-    fi
-    # If the file does not exist, the user really needs makeinfo;
-    # let's fail without touching anything.
-    test -f $file || exit 1
-    touch $file
-    ;;
-
-  *)
-    echo 1>&2 "\
-WARNING: \`$1' is needed, and is $msg.
-         You might have modified some files without having the
-         proper tools for further handling them.  Check the \`README' file,
-         it often tells you about the needed prerequisites for installing
-         this package.  You may also peek at any GNU archive site, in case
-         some other package would contain this missing \`$1' program."
-    exit 1
-    ;;
-esac
-
-exit 0
+# Propagate the correct exit status (expected to be 127 for a program
+# not found, 63 for a program that failed due to version mismatch).
+exit $st
 
 # Local variables:
 # eval: (add-hook 'write-file-hooks 'time-stamp)
diff --git a/package_version b/package_version
index 8eaa996..342e6af 100644
--- a/package_version
+++ b/package_version
@@ -1 +1 @@
-PACKAGE_VERSION="1.1"
+PACKAGE_VERSION="1.1.2"
diff --git a/silk/A2NLSF.c b/silk/A2NLSF.c
index 74b1b95..b6e9e5f 100644
--- a/silk/A2NLSF.c
+++ b/silk/A2NLSF.c
@@ -71,8 +71,23 @@
 
     y32 = p[ dd ];                                  /* Q16 */
     x_Q16 = silk_LSHIFT( x, 4 );
-    for( n = dd - 1; n >= 0; n-- ) {
-        y32 = silk_SMLAWW( p[ n ], y32, x_Q16 );    /* Q16 */
+
+    if ( opus_likely( 8 == dd ) )
+    {
+        y32 = silk_SMLAWW( p[ 7 ], y32, x_Q16 );
+        y32 = silk_SMLAWW( p[ 6 ], y32, x_Q16 );
+        y32 = silk_SMLAWW( p[ 5 ], y32, x_Q16 );
+        y32 = silk_SMLAWW( p[ 4 ], y32, x_Q16 );
+        y32 = silk_SMLAWW( p[ 3 ], y32, x_Q16 );
+        y32 = silk_SMLAWW( p[ 2 ], y32, x_Q16 );
+        y32 = silk_SMLAWW( p[ 1 ], y32, x_Q16 );
+        y32 = silk_SMLAWW( p[ 0 ], y32, x_Q16 );
+    }
+    else
+    {
+        for( n = dd - 1; n >= 0; n-- ) {
+            y32 = silk_SMLAWW( p[ n ], y32, x_Q16 );    /* Q16 */
+        }
     }
     return y32;
 }
diff --git a/silk/API.h b/silk/API.h
index f0601bc..0131acb 100644
--- a/silk/API.h
+++ b/silk/API.h
@@ -111,7 +111,8 @@
     opus_int                        newPacketFlag,      /* I    Indicates first decoder call for this packet    */
     ec_dec                          *psRangeDec,        /* I/O  Compressor data structure                       */
     opus_int16                      *samplesOut,        /* O    Decoded output speech vector                    */
-    opus_int32                      *nSamplesOut        /* O    Number of samples decoded                       */
+    opus_int32                      *nSamplesOut,       /* O    Number of samples decoded                       */
+    int                             arch                /* I    Run-time architecture                           */
 );
 
 #if 0
diff --git a/silk/CNG.c b/silk/CNG.c
index 8481d95..61787c2 100644
--- a/silk/CNG.c
+++ b/silk/CNG.c
@@ -34,7 +34,7 @@
 
 /* Generates excitation for CNG LPC synthesis */
 static OPUS_INLINE void silk_CNG_exc(
-    opus_int32                       residual_Q10[],     /* O    CNG residual signal Q10                     */
+    opus_int32                       exc_Q10[],          /* O    CNG excitation signal Q10                   */
     opus_int32                       exc_buf_Q14[],      /* I    Random samples buffer Q10                   */
     opus_int32                       Gain_Q16,           /* I    Gain to apply                               */
     opus_int                         length,             /* I    Length                                      */
@@ -55,7 +55,7 @@
         idx = (opus_int)( silk_RSHIFT( seed, 24 ) & exc_mask );
         silk_assert( idx >= 0 );
         silk_assert( idx <= CNG_BUF_MASK_MAX );
-        residual_Q10[ i ] = (opus_int16)silk_SAT16( silk_SMULWW( exc_buf_Q14[ idx ], Gain_Q16 >> 4 ) );
+        exc_Q10[ i ] = (opus_int16)silk_SAT16( silk_SMULWW( exc_buf_Q14[ idx ], Gain_Q16 >> 4 ) );
     }
     *rand_seed = seed;
 }
@@ -85,7 +85,7 @@
 )
 {
     opus_int   i, subfr;
-    opus_int32 sum_Q6, max_Gain_Q16;
+    opus_int32 sum_Q6, max_Gain_Q16, gain_Q16;
     opus_int16 A_Q12[ MAX_LPC_ORDER ];
     silk_CNG_struct *psCNG = &psDec->sCNG;
     SAVE_STACK;
@@ -125,11 +125,20 @@
     /* Add CNG when packet is lost or during DTX */
     if( psDec->lossCnt ) {
         VARDECL( opus_int32, CNG_sig_Q10 );
-
         ALLOC( CNG_sig_Q10, length + MAX_LPC_ORDER, opus_int32 );
 
         /* Generate CNG excitation */
-        silk_CNG_exc( CNG_sig_Q10 + MAX_LPC_ORDER, psCNG->CNG_exc_buf_Q14, psCNG->CNG_smth_Gain_Q16, length, &psCNG->rand_seed );
+        gain_Q16 = silk_SMULWW( psDec->sPLC.randScale_Q14, psDec->sPLC.prevGain_Q16[1] );
+        if( gain_Q16 >= (1 << 21) || psCNG->CNG_smth_Gain_Q16 > (1 << 23) ) {
+            gain_Q16 = silk_SMULTT( gain_Q16, gain_Q16 );
+            gain_Q16 = silk_SUB_LSHIFT32(silk_SMULTT( psCNG->CNG_smth_Gain_Q16, psCNG->CNG_smth_Gain_Q16 ), gain_Q16, 5 );
+            gain_Q16 = silk_LSHIFT32( silk_SQRT_APPROX( gain_Q16 ), 16 );
+        } else {
+            gain_Q16 = silk_SMULWW( gain_Q16, gain_Q16 );
+            gain_Q16 = silk_SUB_LSHIFT32(silk_SMULWW( psCNG->CNG_smth_Gain_Q16, psCNG->CNG_smth_Gain_Q16 ), gain_Q16, 5 );
+            gain_Q16 = silk_LSHIFT32( silk_SQRT_APPROX( gain_Q16 ), 8 );
+        }
+        silk_CNG_exc( CNG_sig_Q10 + MAX_LPC_ORDER, psCNG->CNG_exc_buf_Q14, gain_Q16, length, &psCNG->rand_seed );
 
         /* Convert CNG NLSF to filter representation */
         silk_NLSF2A( A_Q12, psCNG->CNG_smth_NLSF_Q15, psDec->LPC_order );
@@ -162,7 +171,7 @@
             /* Update states */
             CNG_sig_Q10[ MAX_LPC_ORDER + i ] = silk_ADD_LSHIFT( CNG_sig_Q10[ MAX_LPC_ORDER + i ], sum_Q6, 4 );
 
-            frame[ i ] = silk_ADD_SAT16( frame[ i ], silk_RSHIFT_ROUND( sum_Q6, 6 ) );
+            frame[ i ] = silk_ADD_SAT16( frame[ i ], silk_RSHIFT_ROUND( CNG_sig_Q10[ MAX_LPC_ORDER + i ], 10 ) );
         }
         silk_memcpy( psCNG->CNG_synth_state, &CNG_sig_Q10[ length ], MAX_LPC_ORDER * sizeof( opus_int32 ) );
     } else {
diff --git a/silk/LPC_analysis_filter.c b/silk/LPC_analysis_filter.c
index 9d1f16c..2090667 100644
--- a/silk/LPC_analysis_filter.c
+++ b/silk/LPC_analysis_filter.c
@@ -44,7 +44,8 @@
     const opus_int16            *in,                /* I    Input signal                                                */
     const opus_int16            *B,                 /* I    MA prediction coefficients, Q12 [order]                     */
     const opus_int32            len,                /* I    Signal length                                               */
-    const opus_int32            d                   /* I    Filter order                                                */
+    const opus_int32            d,                  /* I    Filter order                                                */
+    int                         arch                /* I    Run-time architecture                                       */
 )
 {
     opus_int   j;
@@ -69,11 +70,12 @@
     for (j=0;j<d;j++) {
         mem[ j ] = in[ d - j - 1 ];
     }
-    celt_fir( in + d, num, out + d, len - d, d, mem );
+    celt_fir( in + d, num, out + d, len - d, d, mem, arch );
     for ( j = 0; j < d; j++ ) {
         out[ j ] = 0;
     }
 #else
+    (void)arch;
     for( ix = d; ix < len; ix++ ) {
         in_ptr = &in[ ix - 1 ];
 
diff --git a/silk/NLSF_del_dec_quant.c b/silk/NLSF_del_dec_quant.c
index 504dbbd..c3b9efc 100644
--- a/silk/NLSF_del_dec_quant.c
+++ b/silk/NLSF_del_dec_quant.c
@@ -56,6 +56,28 @@
     opus_int32       RD_max_Q25[       NLSF_QUANT_DEL_DEC_STATES ];
     const opus_uint8 *rates_Q5;
 
+    opus_int out0_Q10_table[2 * NLSF_QUANT_MAX_AMPLITUDE_EXT];
+    opus_int out1_Q10_table[2 * NLSF_QUANT_MAX_AMPLITUDE_EXT];
+
+    for (i = -NLSF_QUANT_MAX_AMPLITUDE_EXT; i <= NLSF_QUANT_MAX_AMPLITUDE_EXT-1; i++)
+    {
+        out0_Q10 = silk_LSHIFT( i, 10 );
+        out1_Q10 = silk_ADD16( out0_Q10, 1024 );
+        if( i > 0 ) {
+            out0_Q10 = silk_SUB16( out0_Q10, SILK_FIX_CONST( NLSF_QUANT_LEVEL_ADJ, 10 ) );
+            out1_Q10 = silk_SUB16( out1_Q10, SILK_FIX_CONST( NLSF_QUANT_LEVEL_ADJ, 10 ) );
+        } else if( i == 0 ) {
+            out1_Q10 = silk_SUB16( out1_Q10, SILK_FIX_CONST( NLSF_QUANT_LEVEL_ADJ, 10 ) );
+        } else if( i == -1 ) {
+            out0_Q10 = silk_ADD16( out0_Q10, SILK_FIX_CONST( NLSF_QUANT_LEVEL_ADJ, 10 ) );
+        } else {
+            out0_Q10 = silk_ADD16( out0_Q10, SILK_FIX_CONST( NLSF_QUANT_LEVEL_ADJ, 10 ) );
+            out1_Q10 = silk_ADD16( out1_Q10, SILK_FIX_CONST( NLSF_QUANT_LEVEL_ADJ, 10 ) );
+        }
+        out0_Q10_table[ i + NLSF_QUANT_MAX_AMPLITUDE_EXT ] = silk_SMULWB( (opus_int32)out0_Q10, quant_step_size_Q16 );
+        out1_Q10_table[ i + NLSF_QUANT_MAX_AMPLITUDE_EXT ] = silk_SMULWB( (opus_int32)out1_Q10, quant_step_size_Q16 );
+    }
+
     silk_assert( (NLSF_QUANT_DEL_DEC_STATES & (NLSF_QUANT_DEL_DEC_STATES-1)) == 0 );     /* must be power of two */
 
     nStates = 1;
@@ -73,21 +95,9 @@
             ind[ j ][ i ] = (opus_int8)ind_tmp;
 
             /* compute outputs for ind_tmp and ind_tmp + 1 */
-            out0_Q10 = silk_LSHIFT( ind_tmp, 10 );
-            out1_Q10 = silk_ADD16( out0_Q10, 1024 );
-            if( ind_tmp > 0 ) {
-                out0_Q10 = silk_SUB16( out0_Q10, SILK_FIX_CONST( NLSF_QUANT_LEVEL_ADJ, 10 ) );
-                out1_Q10 = silk_SUB16( out1_Q10, SILK_FIX_CONST( NLSF_QUANT_LEVEL_ADJ, 10 ) );
-            } else if( ind_tmp == 0 ) {
-                out1_Q10 = silk_SUB16( out1_Q10, SILK_FIX_CONST( NLSF_QUANT_LEVEL_ADJ, 10 ) );
-            } else if( ind_tmp == -1 ) {
-                out0_Q10 = silk_ADD16( out0_Q10, SILK_FIX_CONST( NLSF_QUANT_LEVEL_ADJ, 10 ) );
-            } else {
-                out0_Q10 = silk_ADD16( out0_Q10, SILK_FIX_CONST( NLSF_QUANT_LEVEL_ADJ, 10 ) );
-                out1_Q10 = silk_ADD16( out1_Q10, SILK_FIX_CONST( NLSF_QUANT_LEVEL_ADJ, 10 ) );
-            }
-            out0_Q10  = silk_SMULWB( (opus_int32)out0_Q10, quant_step_size_Q16 );
-            out1_Q10  = silk_SMULWB( (opus_int32)out1_Q10, quant_step_size_Q16 );
+            out0_Q10 = out0_Q10_table[ ind_tmp + NLSF_QUANT_MAX_AMPLITUDE_EXT ];
+            out1_Q10 = out1_Q10_table[ ind_tmp + NLSF_QUANT_MAX_AMPLITUDE_EXT ];
+
             out0_Q10  = silk_ADD16( out0_Q10, pred_Q10 );
             out1_Q10  = silk_ADD16( out1_Q10, pred_Q10 );
             prev_out_Q10[ j           ] = out0_Q10;
diff --git a/silk/NSQ.c b/silk/NSQ.c
index cf5b3fd..a065884 100644
--- a/silk/NSQ.c
+++ b/silk/NSQ.c
@@ -46,6 +46,7 @@
     const opus_int      signal_type             /* I    Signal type                     */
 );
 
+#if !defined(OPUS_X86_MAY_HAVE_SSE4_1)
 static OPUS_INLINE void silk_noise_shape_quantizer(
     silk_nsq_state      *NSQ,                   /* I/O  NSQ state                       */
     opus_int            signalType,             /* I    Signal type                     */
@@ -67,8 +68,10 @@
     opus_int            shapingLPCOrder,        /* I    Noise shaping AR filter order   */
     opus_int            predictLPCOrder         /* I    Prediction filter order         */
 );
+#endif
 
-void silk_NSQ(
+void silk_NSQ_c
+(
     const silk_encoder_state    *psEncC,                                    /* I/O  Encoder State                   */
     silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */
     SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */
@@ -141,7 +144,7 @@
                 silk_assert( start_idx > 0 );
 
                 silk_LPC_analysis_filter( &sLTP[ start_idx ], &NSQ->xq[ start_idx + k * psEncC->subfr_length ],
-                    A_Q12, psEncC->ltp_mem_length - start_idx, psEncC->predictLPCOrder );
+                    A_Q12, psEncC->ltp_mem_length - start_idx, psEncC->predictLPCOrder, psEncC->arch );
 
                 NSQ->rewhite_flag = 1;
                 NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;
@@ -172,7 +175,11 @@
 /***********************************/
 /* silk_noise_shape_quantizer  */
 /***********************************/
-static OPUS_INLINE void silk_noise_shape_quantizer(
+
+#if !defined(OPUS_X86_MAY_HAVE_SSE4_1)
+static OPUS_INLINE
+#endif
+void silk_noise_shape_quantizer(
     silk_nsq_state      *NSQ,                   /* I/O  NSQ state                       */
     opus_int            signalType,             /* I    Signal type                     */
     const opus_int32    x_sc_Q10[],             /* I                                    */
diff --git a/silk/NSQ_del_dec.c b/silk/NSQ_del_dec.c
index 522be40..aff560c 100644
--- a/silk/NSQ_del_dec.c
+++ b/silk/NSQ_del_dec.c
@@ -57,6 +57,9 @@
 
 typedef NSQ_sample_struct  NSQ_sample_pair[ 2 ];
 
+#if defined(MIPSr1_ASM)
+#include "mips/NSQ_del_dec_mipsr1.h"
+#endif
 static OPUS_INLINE void silk_nsq_del_dec_scale_states(
     const silk_encoder_state *psEncC,               /* I    Encoder State                       */
     silk_nsq_state      *NSQ,                       /* I/O  NSQ state                           */
@@ -106,7 +109,7 @@
     opus_int            decisionDelay           /* I                                        */
 );
 
-void silk_NSQ_del_dec(
+void silk_NSQ_del_dec_c(
     const silk_encoder_state    *psEncC,                                    /* I/O  Encoder State                   */
     silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */
     SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */
@@ -244,7 +247,7 @@
                 silk_assert( start_idx > 0 );
 
                 silk_LPC_analysis_filter( &sLTP[ start_idx ], &NSQ->xq[ start_idx + k * psEncC->subfr_length ],
-                    A_Q12, psEncC->ltp_mem_length - start_idx, psEncC->predictLPCOrder );
+                    A_Q12, psEncC->ltp_mem_length - start_idx, psEncC->predictLPCOrder, psEncC->arch );
 
                 NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;
                 NSQ->rewhite_flag = 1;
@@ -303,6 +306,7 @@
 /******************************************/
 /* Noise shape quantizer for one subframe */
 /******************************************/
+#ifndef OVERRIDE_silk_noise_shape_quantizer_del_dec
 static OPUS_INLINE void silk_noise_shape_quantizer_del_dec(
     silk_nsq_state      *NSQ,                   /* I/O  NSQ state                           */
     NSQ_del_dec_struct  psDelDec[],             /* I/O  Delayed decision states             */
@@ -629,6 +633,7 @@
     }
     RESTORE_STACK;
 }
+#endif /* OVERRIDE_silk_noise_shape_quantizer_del_dec */
 
 static OPUS_INLINE void silk_nsq_del_dec_scale_states(
     const silk_encoder_state *psEncC,               /* I    Encoder State                       */
diff --git a/silk/PLC.c b/silk/PLC.c
index 01f4001..34a94bc 100644
--- a/silk/PLC.c
+++ b/silk/PLC.c
@@ -46,7 +46,8 @@
 static OPUS_INLINE void silk_PLC_conceal(
     silk_decoder_state                  *psDec,             /* I/O Decoder state        */
     silk_decoder_control                *psDecCtrl,         /* I/O Decoder control      */
-    opus_int16                          frame[]             /* O LPC residual signal    */
+    opus_int16                          frame[],            /* O LPC residual signal    */
+    int                                 arch                /* I  Run-time architecture */
 );
 
 
@@ -65,7 +66,8 @@
     silk_decoder_state                  *psDec,             /* I/O Decoder state        */
     silk_decoder_control                *psDecCtrl,         /* I/O Decoder control      */
     opus_int16                          frame[],            /* I/O  signal              */
-    opus_int                            lost                /* I Loss flag              */
+    opus_int                            lost,               /* I Loss flag              */
+    int                                 arch                /* I Run-time architecture  */
 )
 {
     /* PLC control function */
@@ -78,7 +80,7 @@
         /****************************/
         /* Generate Signal          */
         /****************************/
-        silk_PLC_conceal( psDec, psDecCtrl, frame );
+        silk_PLC_conceal( psDec, psDecCtrl, frame, arch );
 
         psDec->lossCnt++;
     } else {
@@ -165,10 +167,35 @@
     psPLC->nb_subfr = psDec->nb_subfr;
 }
 
+static OPUS_INLINE void silk_PLC_energy(opus_int32 *energy1, opus_int *shift1, opus_int32 *energy2, opus_int *shift2,
+      const opus_int32 *exc_Q14, const opus_int32 *prevGain_Q10, int subfr_length, int nb_subfr)
+{
+    int i, k;
+    VARDECL( opus_int16, exc_buf );
+    opus_int16 *exc_buf_ptr;
+    SAVE_STACK;
+    ALLOC( exc_buf, 2*subfr_length, opus_int16 ); /* scratch holding two scaled subframes back to back */
+    /* Scale subframes nb_subfr-2 and nb_subfr-1 of the previous excitation into exc_buf: */
+    /* subframe k is multiplied by prevGain_Q10[ k ], shifted right by 8 and saturated to 16 bits */
+    exc_buf_ptr = exc_buf;
+    for( k = 0; k < 2; k++ ) {
+        for( i = 0; i < subfr_length; i++ ) {
+            exc_buf_ptr[ i ] = (opus_int16)silk_SAT16( silk_RSHIFT(
+                silk_SMULWW( exc_Q14[ i + ( k + nb_subfr - 2 ) * subfr_length ], prevGain_Q10[ k ] ), 8 ) );
+        }
+        exc_buf_ptr += subfr_length;
+    }
+    /* Measure each scaled subframe (results go to energy1/shift1 and energy2/shift2); the caller picks the lower-energy one as random noise generator */
+    silk_sum_sqr_shift( energy1, shift1, exc_buf,                  subfr_length );
+    silk_sum_sqr_shift( energy2, shift2, &exc_buf[ subfr_length ], subfr_length );
+    RESTORE_STACK;
+}
+
 static OPUS_INLINE void silk_PLC_conceal(
     silk_decoder_state                  *psDec,             /* I/O Decoder state        */
     silk_decoder_control                *psDecCtrl,         /* I/O Decoder control      */
-    opus_int16                          frame[]             /* O LPC residual signal    */
+    opus_int16                          frame[],            /* O LPC residual signal    */
+    int                                 arch                /* I Run-time architecture  */
 )
 {
     opus_int   i, j, k;
@@ -177,19 +204,26 @@
     opus_int32 energy1, energy2, *rand_ptr, *pred_lag_ptr;
     opus_int32 LPC_pred_Q10, LTP_pred_Q12;
     opus_int16 rand_scale_Q14;
-    opus_int16 *B_Q14, *exc_buf_ptr;
+    opus_int16 *B_Q14;
     opus_int32 *sLPC_Q14_ptr;
-    VARDECL( opus_int16, exc_buf );
     opus_int16 A_Q12[ MAX_LPC_ORDER ];
+#ifdef SMALL_FOOTPRINT
+    opus_int16 *sLTP;
+#else
     VARDECL( opus_int16, sLTP );
+#endif
     VARDECL( opus_int32, sLTP_Q14 );
     silk_PLC_struct *psPLC = &psDec->sPLC;
     opus_int32 prevGain_Q10[2];
     SAVE_STACK;
 
-    ALLOC( exc_buf, 2*psPLC->subfr_length, opus_int16 );
-    ALLOC( sLTP, psDec->ltp_mem_length, opus_int16 );
     ALLOC( sLTP_Q14, psDec->ltp_mem_length + psDec->frame_length, opus_int32 );
+#ifdef SMALL_FOOTPRINT
+    /* Ugly hack that breaks aliasing rules to save stack: put sLTP at the very end of sLTP_Q14. */
+    sLTP = ((opus_int16*)&sLTP_Q14[psDec->ltp_mem_length + psDec->frame_length])-psDec->ltp_mem_length;
+#else
+    ALLOC( sLTP, psDec->ltp_mem_length, opus_int16 );
+#endif
 
     prevGain_Q10[0] = silk_RSHIFT( psPLC->prevGain_Q16[ 0 ], 6);
     prevGain_Q10[1] = silk_RSHIFT( psPLC->prevGain_Q16[ 1 ], 6);
@@ -198,19 +232,7 @@
        silk_memset( psPLC->prevLPC_Q12, 0, sizeof( psPLC->prevLPC_Q12 ) );
     }
 
-    /* Find random noise component */
-    /* Scale previous excitation signal */
-    exc_buf_ptr = exc_buf;
-    for( k = 0; k < 2; k++ ) {
-        for( i = 0; i < psPLC->subfr_length; i++ ) {
-            exc_buf_ptr[ i ] = (opus_int16)silk_SAT16( silk_RSHIFT(
-                silk_SMULWW( psDec->exc_Q14[ i + ( k + psPLC->nb_subfr - 2 ) * psPLC->subfr_length ], prevGain_Q10[ k ] ), 8 ) );
-        }
-        exc_buf_ptr += psPLC->subfr_length;
-    }
-    /* Find the subframe with lowest energy of the last two and use that as random noise generator */
-    silk_sum_sqr_shift( &energy1, &shift1, exc_buf,                         psPLC->subfr_length );
-    silk_sum_sqr_shift( &energy2, &shift2, &exc_buf[ psPLC->subfr_length ], psPLC->subfr_length );
+    silk_PLC_energy(&energy1, &shift1, &energy2, &shift2, psDec->exc_Q14, prevGain_Q10, psDec->subfr_length, psDec->nb_subfr);
 
     if( silk_RSHIFT( energy1, shift2 ) < silk_RSHIFT( energy2, shift1 ) ) {
         /* First sub-frame has lowest energy */
@@ -270,7 +292,7 @@
     /* Rewhiten LTP state */
     idx = psDec->ltp_mem_length - lag - psDec->LPC_order - LTP_ORDER / 2;
     silk_assert( idx > 0 );
-    silk_LPC_analysis_filter( &sLTP[ idx ], &psDec->outBuf[ idx ], A_Q12, psDec->ltp_mem_length - idx, psDec->LPC_order );
+    silk_LPC_analysis_filter( &sLTP[ idx ], &psDec->outBuf[ idx ], A_Q12, psDec->ltp_mem_length - idx, psDec->LPC_order, arch );
     /* Scale LTP state */
     inv_gain_Q30 = silk_INVERSE32_varQ( psPLC->prevGain_Q16[ 1 ], 46 );
     inv_gain_Q30 = silk_min( inv_gain_Q30, silk_int32_MAX >> 1 );
diff --git a/silk/PLC.h b/silk/PLC.h
index f1e2ecc..6438f51 100644
--- a/silk/PLC.h
+++ b/silk/PLC.h
@@ -48,7 +48,8 @@
     silk_decoder_state                  *psDec,             /* I/O Decoder state        */
     silk_decoder_control                *psDecCtrl,         /* I/O Decoder control      */
     opus_int16                          frame[],            /* I/O  signal              */
-    opus_int                            lost                /* I Loss flag              */
+    opus_int                            lost,               /* I Loss flag              */
+    int                                 arch                /* I Run-time architecture  */
 );
 
 void silk_PLC_glue_frames(
diff --git a/silk/SigProc_FIX.h b/silk/SigProc_FIX.h
index 1b58057..b632994 100644
--- a/silk/SigProc_FIX.h
+++ b/silk/SigProc_FIX.h
@@ -41,7 +41,11 @@
 #include "typedef.h"
 #include "resampler_structs.h"
 #include "macros.h"
+#include "cpu_support.h"
 
+#if defined(OPUS_X86_MAY_HAVE_SSE4_1)
+#include "x86/SigProc_FIX_sse.h"
+#endif
 
 /********************************************************************/
 /*                    SIGNAL PROCESSING FUNCTIONS                   */
@@ -108,7 +112,8 @@
     const opus_int16            *in,                /* I    Input signal                                                */
     const opus_int16            *B,                 /* I    MA prediction coefficients, Q12 [order]                     */
     const opus_int32            len,                /* I    Signal length                                               */
-    const opus_int32            d                   /* I    Filter order                                                */
+    const opus_int32            d,                  /* I    Filter order                                                */
+    int                         arch                /* I    Run-time architecture                                       */
 );
 
 /* Chirp (bandwidth expand) LP AR filter */
@@ -303,7 +308,7 @@
 );
 
 /* Compute reflection coefficients from input signal */
-void silk_burg_modified(
+void silk_burg_modified_c(
     opus_int32                  *res_nrg,           /* O    Residual energy                                             */
     opus_int                    *res_nrg_Q,         /* O    Residual energy Q value                                     */
     opus_int32                  A_Q16[],            /* O    Prediction coefficients (length order)                      */
@@ -335,12 +340,15 @@
 /********************************************************************/
 
 /*    return sum( inVec1[i] * inVec2[i] ) */
+
 opus_int32 silk_inner_prod_aligned(
     const opus_int16 *const     inVec1,             /*    I input vector 1                                              */
     const opus_int16 *const     inVec2,             /*    I input vector 2                                              */
-    const opus_int              len                 /*    I vector lengths                                              */
+    const opus_int              len,                /*    I vector lengths                                              */
+    int                         arch                /*    I Run-time architecture                                       */
 );
 
+
 opus_int32 silk_inner_prod_aligned_scale(
     const opus_int16 *const     inVec1,             /*    I input vector 1                                              */
     const opus_int16 *const     inVec2,             /*    I input vector 2                                              */
@@ -348,7 +356,7 @@
     const opus_int              len                 /*    I vector lengths                                              */
 );
 
-opus_int64 silk_inner_prod16_aligned_64(
+opus_int64 silk_inner_prod16_aligned_64_c(
     const opus_int16            *inVec1,            /*    I input vector 1                                              */
     const opus_int16            *inVec2,            /*    I input vector 2                                              */
     const opus_int              len                 /*    I vector lengths                                              */
@@ -575,6 +583,14 @@
 /* the following seems faster on x86 */
 #define silk_SMMUL(a32, b32)                (opus_int32)silk_RSHIFT64(silk_SMULL((a32), (b32)), 32)
 
+#if !defined(OPUS_X86_MAY_HAVE_SSE4_1)
+#define silk_burg_modified(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch) \
+    ((void)(arch), silk_burg_modified_c(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch))
+
+#define silk_inner_prod16_aligned_64(inVec1, inVec2, len, arch) \
+    ((void)(arch),silk_inner_prod16_aligned_64_c(inVec1, inVec2, len))
+#endif
+
 #include "Inlines.h"
 #include "MacroCount.h"
 #include "MacroDebug.h"
@@ -587,6 +603,11 @@
 #include "arm/SigProc_FIX_armv5e.h"
 #endif
 
+#if defined(MIPSr1_ASM)
+#include "mips/sigproc_fix_mipsr1.h"
+#endif
+
+
 #ifdef  __cplusplus
 }
 #endif
diff --git a/silk/VAD.c b/silk/VAD.c
index a809098..0a782af 100644
--- a/silk/VAD.c
+++ b/silk/VAD.c
@@ -33,10 +33,12 @@
 #include "stack_alloc.h"
 
 /* Silk VAD noise level estimation */
+# if !defined(OPUS_X86_MAY_HAVE_SSE4_1)
 static OPUS_INLINE void silk_VAD_GetNoiseLevels(
     const opus_int32             pX[ VAD_N_BANDS ], /* I    subband energies                            */
     silk_VAD_state              *psSilk_VAD         /* I/O  Pointer to Silk VAD state                   */
 );
+#endif
 
 /**********************************/
 /* Initialization of the Silk VAD */
@@ -77,7 +79,7 @@
 /***************************************/
 /* Get the speech activity level in Q8 */
 /***************************************/
-opus_int silk_VAD_GetSA_Q8(                                     /* O    Return value, 0 if success                  */
+opus_int silk_VAD_GetSA_Q8_c(                                   /* O    Return value, 0 if success                  */
     silk_encoder_state          *psEncC,                        /* I/O  Encoder state                               */
     const opus_int16            pIn[]                           /* I    PCM input                                   */
 )
@@ -296,7 +298,10 @@
 /**************************/
 /* Noise level estimation */
 /**************************/
-static OPUS_INLINE void silk_VAD_GetNoiseLevels(
+# if  !defined(OPUS_X86_MAY_HAVE_SSE4_1)
+static OPUS_INLINE
+#endif
+void silk_VAD_GetNoiseLevels(
     const opus_int32            pX[ VAD_N_BANDS ],  /* I    subband energies                            */
     silk_VAD_state              *psSilk_VAD         /* I/O  Pointer to Silk VAD state                   */
 )
diff --git a/silk/VQ_WMat_EC.c b/silk/VQ_WMat_EC.c
index 13d5d34..7983f1d 100644
--- a/silk/VQ_WMat_EC.c
+++ b/silk/VQ_WMat_EC.c
@@ -32,7 +32,7 @@
 #include "main.h"
 
 /* Entropy constrained matrix-weighted VQ, hard-coded to 5-element vectors, for a single input data vector */
-void silk_VQ_WMat_EC(
+void silk_VQ_WMat_EC_c(
     opus_int8                   *ind,                           /* O    index of best codebook vector               */
     opus_int32                  *rate_dist_Q14,                 /* O    best weighted quant error + mu * rate       */
     opus_int                    *gain_Q7,                       /* O    sum of absolute LTP coefficients            */
@@ -55,7 +55,7 @@
     *rate_dist_Q14 = silk_int32_MAX;
     cb_row_Q7 = cb_Q7;
     for( k = 0; k < L; k++ ) {
-	    gain_tmp_Q7 = cb_gain_Q7[k];
+        gain_tmp_Q7 = cb_gain_Q7[k];
 
         diff_Q14[ 0 ] = in_Q14[ 0 ] - silk_LSHIFT( cb_row_Q7[ 0 ], 7 );
         diff_Q14[ 1 ] = in_Q14[ 1 ] - silk_LSHIFT( cb_row_Q7[ 1 ], 7 );
@@ -66,8 +66,8 @@
         /* Weighted rate */
         sum1_Q14 = silk_SMULBB( mu_Q9, cl_Q5[ k ] );
 
-		/* Penalty for too large gain */
-		sum1_Q14 = silk_ADD_LSHIFT32( sum1_Q14, silk_max( silk_SUB32( gain_tmp_Q7, max_gain_Q7 ), 0 ), 10 );
+        /* Penalty for too large gain */
+        sum1_Q14 = silk_ADD_LSHIFT32( sum1_Q14, silk_max( silk_SUB32( gain_tmp_Q7, max_gain_Q7 ), 0 ), 10 );
 
         silk_assert( sum1_Q14 >= 0 );
 
@@ -111,7 +111,7 @@
         if( sum1_Q14 < *rate_dist_Q14 ) {
             *rate_dist_Q14 = sum1_Q14;
             *ind = (opus_int8)k;
-			*gain_Q7 = gain_tmp_Q7;
+            *gain_Q7 = gain_tmp_Q7;
         }
 
         /* Go to next cbk vector */
diff --git a/silk/code_signs.c b/silk/code_signs.c
index 0419ea2..dfd1dca 100644
--- a/silk/code_signs.c
+++ b/silk/code_signs.c
@@ -74,7 +74,7 @@
 /* Decodes signs of excitation */
 void silk_decode_signs(
     ec_dec                      *psRangeDec,                        /* I/O  Compressor data structure                   */
-    opus_int                    pulses[],                           /* I/O  pulse signal                                */
+    opus_int16                  pulses[],                           /* I/O  pulse signal                                */
     opus_int                    length,                             /* I    length of input                             */
     const opus_int              signalType,                         /* I    Signal type                                 */
     const opus_int              quantOffsetType,                    /* I    Quantization offset type                    */
@@ -83,7 +83,7 @@
 {
     opus_int         i, j, p;
     opus_uint8       icdf[ 2 ];
-    opus_int         *q_ptr;
+    opus_int16       *q_ptr;
     const opus_uint8 *icdf_ptr;
 
     icdf[ 1 ] = 0;
diff --git a/silk/control_SNR.c b/silk/control_SNR.c
index f04e69f..cee87eb 100644
--- a/silk/control_SNR.c
+++ b/silk/control_SNR.c
@@ -70,11 +70,6 @@
                 break;
             }
         }
-
-        /* Reduce coding quality whenever LBRR is enabled, to free up some bits */
-        if( psEncC->LBRR_enabled ) {
-            psEncC->SNR_dB_Q7 = silk_SMLABB( psEncC->SNR_dB_Q7, 12 - psEncC->LBRR_GainIncreases, SILK_FIX_CONST( -0.25, 7 ) );
-        }
     }
 
     return ret;
diff --git a/silk/control_codec.c b/silk/control_codec.c
index 1f674bd..044eea3 100644
--- a/silk/control_codec.c
+++ b/silk/control_codec.c
@@ -397,9 +397,10 @@
     const opus_int32            TargetRate_bps      /* I                        */
 )
 {
-    opus_int   ret = SILK_NO_ERROR;
+    opus_int   LBRR_in_previous_packet, ret = SILK_NO_ERROR;
     opus_int32 LBRR_rate_thres_bps;
 
+    LBRR_in_previous_packet = psEncC->LBRR_enabled;
     psEncC->LBRR_enabled = 0;
     if( psEncC->useInBandFEC && psEncC->PacketLoss_perc > 0 ) {
         if( psEncC->fs_kHz == 8 ) {
@@ -413,8 +414,13 @@
 
         if( TargetRate_bps > LBRR_rate_thres_bps ) {
             /* Set gain increase for coding LBRR excitation */
+            if( LBRR_in_previous_packet == 0 ) {
+                /* Previous packet did not have LBRR, and was therefore coded at a higher bitrate */
+                psEncC->LBRR_GainIncreases = 7;
+            } else {
+                psEncC->LBRR_GainIncreases = silk_max_int( 7 - silk_SMULWB( (opus_int32)psEncC->PacketLoss_perc, SILK_FIX_CONST( 0.4, 16 ) ), 2 );
+            }
             psEncC->LBRR_enabled = 1;
-            psEncC->LBRR_GainIncreases = silk_max_int( 7 - silk_SMULWB( (opus_int32)psEncC->PacketLoss_perc, SILK_FIX_CONST( 0.4, 16 ) ), 2 );
         }
     }
 
diff --git a/silk/dec_API.c b/silk/dec_API.c
index 4cbcf71..b7d8ed4 100644
--- a/silk/dec_API.c
+++ b/silk/dec_API.c
@@ -31,6 +31,7 @@
 #include "API.h"
 #include "main.h"
 #include "stack_alloc.h"
+#include "os_support.h"
 
 /************************/
 /* Decoder Super Struct */
@@ -84,13 +85,15 @@
     opus_int                        newPacketFlag,      /* I    Indicates first decoder call for this packet    */
     ec_dec                          *psRangeDec,        /* I/O  Compressor data structure                       */
     opus_int16                      *samplesOut,        /* O    Decoded output speech vector                    */
-    opus_int32                      *nSamplesOut        /* O    Number of samples decoded                       */
+    opus_int32                      *nSamplesOut,       /* O    Number of samples decoded                       */
+    int                             arch                /* I    Run-time architecture                           */
 )
 {
     opus_int   i, n, decode_only_middle = 0, ret = SILK_NO_ERROR;
     opus_int32 nSamplesOutDec, LBRR_symbol;
     opus_int16 *samplesOut1_tmp[ 2 ];
-    VARDECL( opus_int16, samplesOut1_tmp_storage );
+    VARDECL( opus_int16, samplesOut1_tmp_storage1 );
+    VARDECL( opus_int16, samplesOut1_tmp_storage2 );
     VARDECL( opus_int16, samplesOut2_tmp );
     opus_int32 MS_pred_Q13[ 2 ] = { 0 };
     opus_int16 *resample_out_ptr;
@@ -98,6 +101,7 @@
     silk_decoder_state *channel_state = psDec->channel_state;
     opus_int has_side;
     opus_int stereo_to_mono;
+    int delay_stack_alloc;
     SAVE_STACK;
 
     silk_assert( decControl->nChannelsInternal == 1 || decControl->nChannelsInternal == 2 );
@@ -196,7 +200,7 @@
             for( i = 0; i < channel_state[ 0 ].nFramesPerPacket; i++ ) {
                 for( n = 0; n < decControl->nChannelsInternal; n++ ) {
                     if( channel_state[ n ].LBRR_flags[ i ] ) {
-                        opus_int pulses[ MAX_FRAME_LENGTH ];
+                        opus_int16 pulses[ MAX_FRAME_LENGTH ];
                         opus_int condCoding;
 
                         if( decControl->nChannelsInternal == 2 && n == 0 ) {
@@ -251,13 +255,22 @@
         psDec->channel_state[ 1 ].first_frame_after_reset = 1;
     }
 
-    ALLOC( samplesOut1_tmp_storage,
-           decControl->nChannelsInternal*(
-               channel_state[ 0 ].frame_length + 2 ),
+    /* Check if the temp buffer fits into the output PCM buffer. If it fits,
+       we can delay allocating the temp buffer until after the SILK peak stack
+       usage. We need to use a < and not a <= because of the two extra samples. */
+    delay_stack_alloc = decControl->internalSampleRate*decControl->nChannelsInternal
+          < decControl->API_sampleRate*decControl->nChannelsAPI;
+    ALLOC( samplesOut1_tmp_storage1, delay_stack_alloc ? ALLOC_NONE
+           : decControl->nChannelsInternal*(channel_state[ 0 ].frame_length + 2 ),
            opus_int16 );
-    samplesOut1_tmp[ 0 ] = samplesOut1_tmp_storage;
-    samplesOut1_tmp[ 1 ] = samplesOut1_tmp_storage
-                           + channel_state[ 0 ].frame_length + 2;
+    if ( delay_stack_alloc )
+    {
+       samplesOut1_tmp[ 0 ] = samplesOut;
+       samplesOut1_tmp[ 1 ] = samplesOut + channel_state[ 0 ].frame_length + 2;
+    } else {
+       samplesOut1_tmp[ 0 ] = samplesOut1_tmp_storage1;
+       samplesOut1_tmp[ 1 ] = samplesOut1_tmp_storage1 + channel_state[ 0 ].frame_length + 2;
+    }
 
     if( lostFlag == FLAG_DECODE_NORMAL ) {
         has_side = !decode_only_middle;
@@ -284,7 +297,7 @@
             } else {
                 condCoding = CODE_CONDITIONALLY;
             }
-            ret += silk_decode_frame( &channel_state[ n ], psRangeDec, &samplesOut1_tmp[ n ][ 2 ], &nSamplesOutDec, lostFlag, condCoding);
+            ret += silk_decode_frame( &channel_state[ n ], psRangeDec, &samplesOut1_tmp[ n ][ 2 ], &nSamplesOutDec, lostFlag, condCoding, arch);
         } else {
             silk_memset( &samplesOut1_tmp[ n ][ 2 ], 0, nSamplesOutDec * sizeof( opus_int16 ) );
         }
@@ -312,6 +325,15 @@
         resample_out_ptr = samplesOut;
     }
 
+    ALLOC( samplesOut1_tmp_storage2, delay_stack_alloc
+           ? decControl->nChannelsInternal*(channel_state[ 0 ].frame_length + 2 )
+           : ALLOC_NONE,
+           opus_int16 );
+    if ( delay_stack_alloc ) {
+       OPUS_COPY(samplesOut1_tmp_storage2, samplesOut, decControl->nChannelsInternal*(channel_state[ 0 ].frame_length + 2));
+       samplesOut1_tmp[ 0 ] = samplesOut1_tmp_storage2;
+       samplesOut1_tmp[ 1 ] = samplesOut1_tmp_storage2 + channel_state[ 0 ].frame_length + 2;
+    }
     for( n = 0; n < silk_min( decControl->nChannelsAPI, decControl->nChannelsInternal ); n++ ) {
 
         /* Resample decoded signal to API_sampleRate */
diff --git a/silk/decode_core.c b/silk/decode_core.c
index a820bf1..b88991e 100644
--- a/silk/decode_core.c
+++ b/silk/decode_core.c
@@ -39,7 +39,8 @@
     silk_decoder_state          *psDec,                         /* I/O  Decoder state                               */
     silk_decoder_control        *psDecCtrl,                     /* I    Decoder control                             */
     opus_int16                  xq[],                           /* O    Decoded speech                              */
-    const opus_int              pulses[ MAX_FRAME_LENGTH ]      /* I    Pulse signal                                */
+    const opus_int16            pulses[ MAX_FRAME_LENGTH ],     /* I    Pulse signal                                */
+    int                         arch                            /* I    Run-time architecture                       */
 )
 {
     opus_int   i, k, lag = 0, start_idx, sLTP_buf_idx, NLSF_interpolation_flag, signalType;
@@ -147,7 +148,7 @@
                 }
 
                 silk_LPC_analysis_filter( &sLTP[ start_idx ], &psDec->outBuf[ start_idx + k * psDec->subfr_length ],
-                    A_Q12, psDec->ltp_mem_length - start_idx, psDec->LPC_order );
+                    A_Q12, psDec->ltp_mem_length - start_idx, psDec->LPC_order, arch );
 
                 /* After rewhitening the LTP state is unscaled */
                 if( k == 0 ) {
diff --git a/silk/decode_frame.c b/silk/decode_frame.c
index abc00a3..a605d95 100644
--- a/silk/decode_frame.c
+++ b/silk/decode_frame.c
@@ -42,18 +42,16 @@
     opus_int16                  pOut[],                         /* O    Pointer to output speech frame              */
     opus_int32                  *pN,                            /* O    Pointer to size of output frame             */
     opus_int                    lostFlag,                       /* I    0: no loss, 1 loss, 2 decode fec            */
-    opus_int                    condCoding                      /* I    The type of conditional coding to use       */
+    opus_int                    condCoding,                     /* I    The type of conditional coding to use       */
+    int                         arch                            /* I    Run-time architecture                       */
 )
 {
     VARDECL( silk_decoder_control, psDecCtrl );
     opus_int         L, mv_len, ret = 0;
-    VARDECL( opus_int, pulses );
     SAVE_STACK;
 
     L = psDec->frame_length;
     ALLOC( psDecCtrl, 1, silk_decoder_control );
-    ALLOC( pulses, (L + SHELL_CODEC_FRAME_LENGTH - 1) &
-                   ~(SHELL_CODEC_FRAME_LENGTH - 1), opus_int );
     psDecCtrl->LTP_scale_Q14 = 0;
 
     /* Safety checks */
@@ -62,6 +60,9 @@
     if(   lostFlag == FLAG_DECODE_NORMAL ||
         ( lostFlag == FLAG_DECODE_LBRR && psDec->LBRR_flags[ psDec->nFramesDecoded ] == 1 ) )
     {
+        VARDECL( opus_int16, pulses );
+        ALLOC( pulses, (L + SHELL_CODEC_FRAME_LENGTH - 1) &
+                       ~(SHELL_CODEC_FRAME_LENGTH - 1), opus_int16 );
         /*********************************************/
         /* Decode quantization indices of side info  */
         /*********************************************/
@@ -81,12 +82,12 @@
         /********************************************************/
         /* Run inverse NSQ                                      */
         /********************************************************/
-        silk_decode_core( psDec, psDecCtrl, pOut, pulses );
+        silk_decode_core( psDec, psDecCtrl, pOut, pulses, arch );
 
         /********************************************************/
         /* Update PLC state                                     */
         /********************************************************/
-        silk_PLC( psDec, psDecCtrl, pOut, 0 );
+        silk_PLC( psDec, psDecCtrl, pOut, 0, arch );
 
         psDec->lossCnt = 0;
         psDec->prevSignalType = psDec->indices.signalType;
@@ -96,7 +97,7 @@
         psDec->first_frame_after_reset = 0;
     } else {
         /* Handle packet loss by extrapolation */
-        silk_PLC( psDec, psDecCtrl, pOut, 1 );
+        silk_PLC( psDec, psDecCtrl, pOut, 1, arch );
     }
 
     /*************************/
@@ -107,16 +108,16 @@
     silk_memmove( psDec->outBuf, &psDec->outBuf[ psDec->frame_length ], mv_len * sizeof(opus_int16) );
     silk_memcpy( &psDec->outBuf[ mv_len ], pOut, psDec->frame_length * sizeof( opus_int16 ) );
 
-    /****************************************************************/
-    /* Ensure smooth connection of extrapolated and good frames     */
-    /****************************************************************/
-    silk_PLC_glue_frames( psDec, pOut, L );
-
     /************************************************/
     /* Comfort noise generation / estimation        */
     /************************************************/
     silk_CNG( psDec, psDecCtrl, pOut, L );
 
+    /****************************************************************/
+    /* Ensure smooth connection of extrapolated and good frames     */
+    /****************************************************************/
+    silk_PLC_glue_frames( psDec, pOut, L );
+
     /* Update some decoder state variables */
     psDec->lagPrev = psDecCtrl->pitchL[ psDec->nb_subfr - 1 ];
 
diff --git a/silk/decode_pulses.c b/silk/decode_pulses.c
index e8a87c2..d6bbec9 100644
--- a/silk/decode_pulses.c
+++ b/silk/decode_pulses.c
@@ -36,7 +36,7 @@
 /*********************************************/
 void silk_decode_pulses(
     ec_dec                      *psRangeDec,                    /* I/O  Compressor data structure                   */
-    opus_int                    pulses[],                       /* O    Excitation signal                           */
+    opus_int16                  pulses[],                       /* O    Excitation signal                           */
     const opus_int              signalType,                     /* I    Sigtype                                     */
     const opus_int              quantOffsetType,                /* I    quantOffsetType                             */
     const opus_int              frame_length                    /* I    Frame length                                */
@@ -44,7 +44,7 @@
 {
     opus_int   i, j, k, iter, abs_q, nLS, RateLevelIndex;
     opus_int   sum_pulses[ MAX_NB_SHELL_BLOCKS ], nLshifts[ MAX_NB_SHELL_BLOCKS ];
-    opus_int   *pulses_ptr;
+    opus_int16 *pulses_ptr;
     const opus_uint8 *cdf_ptr;
 
     /*********************/
@@ -69,9 +69,9 @@
         sum_pulses[ i ] = ec_dec_icdf( psRangeDec, cdf_ptr, 8 );
 
         /* LSB indication */
-        while( sum_pulses[ i ] == MAX_PULSES + 1 ) {
+        while( sum_pulses[ i ] == SILK_MAX_PULSES + 1 ) {
             nLshifts[ i ]++;
-            /* When we've already got 10 LSBs, we shift the table to not allow (MAX_PULSES + 1) */
+            /* When we've already got 10 LSBs, we shift the table to not allow (SILK_MAX_PULSES + 1) */
             sum_pulses[ i ] = ec_dec_icdf( psRangeDec,
                     silk_pulses_per_block_iCDF[ N_RATE_LEVELS - 1] + ( nLshifts[ i ] == 10 ), 8 );
         }
@@ -84,7 +84,7 @@
         if( sum_pulses[ i ] > 0 ) {
             silk_shell_decoder( &pulses[ silk_SMULBB( i, SHELL_CODEC_FRAME_LENGTH ) ], psRangeDec, sum_pulses[ i ] );
         } else {
-            silk_memset( &pulses[ silk_SMULBB( i, SHELL_CODEC_FRAME_LENGTH ) ], 0, SHELL_CODEC_FRAME_LENGTH * sizeof( opus_int ) );
+            silk_memset( &pulses[ silk_SMULBB( i, SHELL_CODEC_FRAME_LENGTH ) ], 0, SHELL_CODEC_FRAME_LENGTH * sizeof( pulses[0] ) );
         }
     }
 
diff --git a/silk/define.h b/silk/define.h
index c47aca9..19c9b00 100644
--- a/silk/define.h
+++ b/silk/define.h
@@ -169,7 +169,7 @@
 #define N_RATE_LEVELS                           10
 
 /* Maximum sum of pulses per shell coding frame */
-#define MAX_PULSES                              16
+#define SILK_MAX_PULSES                         16
 
 #define MAX_MATRIX_SIZE                         MAX_LPC_ORDER /* Max of LPC Order and LTP order */
 
diff --git a/silk/enc_API.c b/silk/enc_API.c
index 43739ef..f806028 100644
--- a/silk/enc_API.c
+++ b/silk/enc_API.c
@@ -165,7 +165,7 @@
     psEnc->state_Fxx[ 0 ].sCmn.nFramesEncoded = psEnc->state_Fxx[ 1 ].sCmn.nFramesEncoded = 0;
 
     /* Check values in encoder control structure */
-    if( ( ret = check_control_input( encControl ) != 0 ) ) {
+    if( ( ret = check_control_input( encControl ) ) != 0 ) {
         silk_assert( 0 );
         RESTORE_STACK;
         return ret;
@@ -376,26 +376,33 @@
                 for( n = 0; n < encControl->nChannelsInternal; n++ ) {
                     silk_memset( psEnc->state_Fxx[ n ].sCmn.LBRR_flags, 0, sizeof( psEnc->state_Fxx[ n ].sCmn.LBRR_flags ) );
                 }
+
+                psEnc->nBitsUsedLBRR = ec_tell( psRangeEnc );
             }
 
             silk_HP_variable_cutoff( psEnc->state_Fxx );
 
             /* Total target bits for packet */
             nBits = silk_DIV32_16( silk_MUL( encControl->bitRate, encControl->payloadSize_ms ), 1000 );
-            /* Subtract half of the bits already used */
+            /* Subtract bits used for LBRR */
             if( !prefillFlag ) {
-                nBits -= ec_tell( psRangeEnc ) >> 1;
+                nBits -= psEnc->nBitsUsedLBRR;
             }
             /* Divide by number of uncoded frames left in packet */
-            nBits = silk_DIV32_16( nBits, psEnc->state_Fxx[ 0 ].sCmn.nFramesPerPacket - psEnc->state_Fxx[ 0 ].sCmn.nFramesEncoded );
+            nBits = silk_DIV32_16( nBits, psEnc->state_Fxx[ 0 ].sCmn.nFramesPerPacket );
             /* Convert to bits/second */
             if( encControl->payloadSize_ms == 10 ) {
                 TargetRate_bps = silk_SMULBB( nBits, 100 );
             } else {
                 TargetRate_bps = silk_SMULBB( nBits, 50 );
             }
-            /* Subtract fraction of bits in excess of target in previous packets */
+            /* Subtract fraction of bits in excess of target in previous frames and packets */
             TargetRate_bps -= silk_DIV32_16( silk_MUL( psEnc->nBitsExceeded, 1000 ), BITRESERVOIR_DECAY_TIME_MS );
+            if( !prefillFlag && psEnc->state_Fxx[ 0 ].sCmn.nFramesEncoded > 0 ) {
+                /* Compare actual vs target bits so far in this packet */
+                opus_int32 bitsBalance = ec_tell( psRangeEnc ) - psEnc->nBitsUsedLBRR - nBits * psEnc->state_Fxx[ 0 ].sCmn.nFramesEncoded;
+                TargetRate_bps -= silk_DIV32_16( silk_MUL( bitsBalance, 1000 ), BITRESERVOIR_DECAY_TIME_MS );
+            }
             /* Never exceed input bitrate */
             TargetRate_bps = silk_LIMIT( TargetRate_bps, encControl->bitRate, 5000 );
 
diff --git a/silk/encode_pulses.c b/silk/encode_pulses.c
index a450143..ab00264 100644
--- a/silk/encode_pulses.c
+++ b/silk/encode_pulses.c
@@ -142,7 +142,7 @@
         sumBits_Q5 = silk_rate_levels_BITS_Q5[ signalType >> 1 ][ k ];
         for( i = 0; i < iter; i++ ) {
             if( nRshifts[ i ] > 0 ) {
-                sumBits_Q5 += nBits_ptr[ MAX_PULSES + 1 ];
+                sumBits_Q5 += nBits_ptr[ SILK_MAX_PULSES + 1 ];
             } else {
                 sumBits_Q5 += nBits_ptr[ sum_pulses[ i ] ];
             }
@@ -162,9 +162,9 @@
         if( nRshifts[ i ] == 0 ) {
             ec_enc_icdf( psRangeEnc, sum_pulses[ i ], cdf_ptr, 8 );
         } else {
-            ec_enc_icdf( psRangeEnc, MAX_PULSES + 1, cdf_ptr, 8 );
+            ec_enc_icdf( psRangeEnc, SILK_MAX_PULSES + 1, cdf_ptr, 8 );
             for( k = 0; k < nRshifts[ i ] - 1; k++ ) {
-                ec_enc_icdf( psRangeEnc, MAX_PULSES + 1, silk_pulses_per_block_iCDF[ N_RATE_LEVELS - 1 ], 8 );
+                ec_enc_icdf( psRangeEnc, SILK_MAX_PULSES + 1, silk_pulses_per_block_iCDF[ N_RATE_LEVELS - 1 ], 8 );
             }
             ec_enc_icdf( psRangeEnc, sum_pulses[ i ], silk_pulses_per_block_iCDF[ N_RATE_LEVELS - 1 ], 8 );
         }
diff --git a/silk/fixed/LTP_analysis_filter_FIX.c b/silk/fixed/LTP_analysis_filter_FIX.c
index a941908..5574e70 100644
--- a/silk/fixed/LTP_analysis_filter_FIX.c
+++ b/silk/fixed/LTP_analysis_filter_FIX.c
@@ -45,7 +45,7 @@
     const opus_int16 *x_ptr, *x_lag_ptr;
     opus_int16   Btmp_Q14[ LTP_ORDER ];
     opus_int16   *LTP_res_ptr;
-    opus_int     k, i, j;
+    opus_int     k, i;
     opus_int32   LTP_est;
 
     x_ptr = x;
@@ -53,9 +53,12 @@
     for( k = 0; k < nb_subfr; k++ ) {
 
         x_lag_ptr = x_ptr - pitchL[ k ];
-        for( i = 0; i < LTP_ORDER; i++ ) {
-            Btmp_Q14[ i ] = LTPCoef_Q14[ k * LTP_ORDER + i ];
-        }
+
+        Btmp_Q14[ 0 ] = LTPCoef_Q14[ k * LTP_ORDER ];
+        Btmp_Q14[ 1 ] = LTPCoef_Q14[ k * LTP_ORDER + 1 ];
+        Btmp_Q14[ 2 ] = LTPCoef_Q14[ k * LTP_ORDER + 2 ];
+        Btmp_Q14[ 3 ] = LTPCoef_Q14[ k * LTP_ORDER + 3 ];
+        Btmp_Q14[ 4 ] = LTPCoef_Q14[ k * LTP_ORDER + 4 ];
 
         /* LTP analysis FIR filter */
         for( i = 0; i < subfr_length + pre_length; i++ ) {
@@ -63,9 +66,11 @@
 
             /* Long-term prediction */
             LTP_est = silk_SMULBB( x_lag_ptr[ LTP_ORDER / 2 ], Btmp_Q14[ 0 ] );
-            for( j = 1; j < LTP_ORDER; j++ ) {
-                LTP_est = silk_SMLABB_ovflw( LTP_est, x_lag_ptr[ LTP_ORDER / 2 - j ], Btmp_Q14[ j ] );
-            }
+            LTP_est = silk_SMLABB_ovflw( LTP_est, x_lag_ptr[ 1 ], Btmp_Q14[ 1 ] );
+            LTP_est = silk_SMLABB_ovflw( LTP_est, x_lag_ptr[ 0 ], Btmp_Q14[ 2 ] );
+            LTP_est = silk_SMLABB_ovflw( LTP_est, x_lag_ptr[ -1 ], Btmp_Q14[ 3 ] );
+            LTP_est = silk_SMLABB_ovflw( LTP_est, x_lag_ptr[ -2 ], Btmp_Q14[ 4 ] );
+
             LTP_est = silk_RSHIFT_ROUND( LTP_est, 14 ); /* round and -> Q0*/
 
             /* Subtract long-term prediction */
diff --git a/silk/fixed/burg_modified_FIX.c b/silk/fixed/burg_modified_FIX.c
index db34829..4878553 100644
--- a/silk/fixed/burg_modified_FIX.c
+++ b/silk/fixed/burg_modified_FIX.c
@@ -42,7 +42,7 @@
 #define MAX_RSHIFTS                 (32 - QA)
 
 /* Compute reflection coefficients from input signal */
-void silk_burg_modified(
+void silk_burg_modified_c(
     opus_int32                  *res_nrg,           /* O    Residual energy                                             */
     opus_int                    *res_nrg_Q,         /* O    Residual energy Q value                                     */
     opus_int32                  A_Q16[],            /* O    Prediction coefficients (length order)                      */
@@ -54,7 +54,7 @@
     int                         arch                /* I    Run-time architecture                                       */
 )
 {
-    opus_int         k, n, s, lz, rshifts, rshifts_extra, reached_max_gain;
+    opus_int         k, n, s, lz, rshifts, reached_max_gain;
     opus_int32       C0, num, nrg, rc_Q31, invGain_Q30, Atmp_QA, Atmp1, tmp1, tmp2, x1, x2;
     const opus_int16 *x_ptr;
     opus_int32       C_first_row[ SILK_MAX_ORDER_LPC ];
@@ -63,27 +63,23 @@
     opus_int32       CAf[ SILK_MAX_ORDER_LPC + 1 ];
     opus_int32       CAb[ SILK_MAX_ORDER_LPC + 1 ];
     opus_int32       xcorr[ SILK_MAX_ORDER_LPC ];
+    opus_int64       C0_64;
 
     silk_assert( subfr_length * nb_subfr <= MAX_FRAME_SIZE );
 
     /* Compute autocorrelations, added over subframes */
-    silk_sum_sqr_shift( &C0, &rshifts, x, nb_subfr * subfr_length );
-    if( rshifts > MAX_RSHIFTS ) {
-        C0 = silk_LSHIFT32( C0, rshifts - MAX_RSHIFTS );
-        silk_assert( C0 > 0 );
-        rshifts = MAX_RSHIFTS;
+    C0_64 = silk_inner_prod16_aligned_64( x, x, subfr_length*nb_subfr, arch );
+    lz = silk_CLZ64(C0_64);
+    rshifts = 32 + 1 + N_BITS_HEAD_ROOM - lz;
+    if (rshifts > MAX_RSHIFTS) rshifts = MAX_RSHIFTS;
+    if (rshifts < MIN_RSHIFTS) rshifts = MIN_RSHIFTS;
+
+    if (rshifts > 0) {
+        C0 = (opus_int32)silk_RSHIFT64(C0_64, rshifts );
     } else {
-        lz = silk_CLZ32( C0 ) - 1;
-        rshifts_extra = N_BITS_HEAD_ROOM - lz;
-        if( rshifts_extra > 0 ) {
-            rshifts_extra = silk_min( rshifts_extra, MAX_RSHIFTS - rshifts );
-            C0 = silk_RSHIFT32( C0, rshifts_extra );
-        } else {
-            rshifts_extra = silk_max( rshifts_extra, MIN_RSHIFTS - rshifts );
-            C0 = silk_LSHIFT32( C0, -rshifts_extra );
-        }
-        rshifts += rshifts_extra;
+        C0 = silk_LSHIFT32((opus_int32)C0_64, -rshifts );
     }
+
     CAb[ 0 ] = CAf[ 0 ] = C0 + silk_SMMUL( SILK_FIX_CONST( FIND_LPC_COND_FAC, 32 ), C0 ) + 1;                                /* Q(-rshifts) */
     silk_memset( C_first_row, 0, SILK_MAX_ORDER_LPC * sizeof( opus_int32 ) );
     if( rshifts > 0 ) {
@@ -91,7 +87,7 @@
             x_ptr = x + s * subfr_length;
             for( n = 1; n < D + 1; n++ ) {
                 C_first_row[ n - 1 ] += (opus_int32)silk_RSHIFT64(
-                    silk_inner_prod16_aligned_64( x_ptr, x_ptr + n, subfr_length - n ), rshifts );
+                    silk_inner_prod16_aligned_64( x_ptr, x_ptr + n, subfr_length - n, arch ), rshifts );
             }
         }
     } else {
@@ -252,12 +248,12 @@
         if( rshifts > 0 ) {
             for( s = 0; s < nb_subfr; s++ ) {
                 x_ptr = x + s * subfr_length;
-                C0 -= (opus_int32)silk_RSHIFT64( silk_inner_prod16_aligned_64( x_ptr, x_ptr, D ), rshifts );
+                C0 -= (opus_int32)silk_RSHIFT64( silk_inner_prod16_aligned_64( x_ptr, x_ptr, D, arch ), rshifts );
             }
         } else {
             for( s = 0; s < nb_subfr; s++ ) {
                 x_ptr = x + s * subfr_length;
-                C0 -= silk_LSHIFT32( silk_inner_prod_aligned( x_ptr, x_ptr, D ), -rshifts );
+                C0 -= silk_LSHIFT32( silk_inner_prod_aligned( x_ptr, x_ptr, D, arch), -rshifts);
             }
         }
         /* Approximate residual energy */
diff --git a/silk/fixed/corrMatrix_FIX.c b/silk/fixed/corrMatrix_FIX.c
index c617270..c1d437c 100644
--- a/silk/fixed/corrMatrix_FIX.c
+++ b/silk/fixed/corrMatrix_FIX.c
@@ -42,7 +42,8 @@
     const opus_int                  L,                                      /* I    Length of vectors                                                           */
     const opus_int                  order,                                  /* I    Max lag for correlation                                                     */
     opus_int32                      *Xt,                                    /* O    Pointer to X'*t correlation vector [order]                                  */
-    const opus_int                  rshifts                                 /* I    Right shifts of correlations                                                */
+    const opus_int                  rshifts,                                /* I    Right shifts of correlations                                                */
+    int                             arch                                    /* I    Run-time architecture                                                       */
 )
 {
     opus_int         lag, i;
@@ -65,7 +66,7 @@
     } else {
         silk_assert( rshifts == 0 );
         for( lag = 0; lag < order; lag++ ) {
-            Xt[ lag ] = silk_inner_prod_aligned( ptr1, ptr2, L ); /* X[:,lag]'*t */
+            Xt[ lag ] = silk_inner_prod_aligned( ptr1, ptr2, L, arch ); /* X[:,lag]'*t */
             ptr1--; /* Go to next column of X */
         }
     }
@@ -78,7 +79,8 @@
     const opus_int                  order,                                  /* I    Max lag for correlation                                                     */
     const opus_int                  head_room,                              /* I    Desired headroom                                                            */
     opus_int32                      *XX,                                    /* O    Pointer to X'*X correlation matrix [ order x order ]                        */
-    opus_int                        *rshifts                                /* I/O  Right shifts of correlations                                                */
+    opus_int                        *rshifts,                               /* I/O  Right shifts of correlations                                                */
+    int                             arch                                    /* I    Run-time architecture                                                       */
 )
 {
     opus_int         i, j, lag, rshifts_local, head_room_rshifts;
@@ -138,7 +140,7 @@
     } else {
         for( lag = 1; lag < order; lag++ ) {
             /* Inner product of column 0 and column lag: X[:,0]'*X[:,lag] */
-            energy = silk_inner_prod_aligned( ptr1, ptr2, L );
+            energy = silk_inner_prod_aligned( ptr1, ptr2, L, arch );
             matrix_ptr( XX, lag, 0, order ) = energy;
             matrix_ptr( XX, 0, lag, order ) = energy;
             /* Calculate remaining off diagonal: X[:,j]'*X[:,j + lag] */
diff --git a/silk/fixed/encode_frame_FIX.c b/silk/fixed/encode_frame_FIX.c
index b490986..5ef44b0 100644
--- a/silk/fixed/encode_frame_FIX.c
+++ b/silk/fixed/encode_frame_FIX.c
@@ -48,7 +48,7 @@
     /****************************/
     /* Voice Activity Detection */
     /****************************/
-    silk_VAD_GetSA_Q8( &psEnc->sCmn, psEnc->sCmn.inputBuf + 1 );
+    silk_VAD_GetSA_Q8( &psEnc->sCmn, psEnc->sCmn.inputBuf + 1, psEnc->sCmn.arch );
 
     /**************************************************/
     /* Convert speech activity into VAD and DTX flags */
@@ -196,11 +196,13 @@
                 if( psEnc->sCmn.nStatesDelayedDecision > 1 || psEnc->sCmn.warping_Q16 > 0 ) {
                     silk_NSQ_del_dec( &psEnc->sCmn, &psEnc->sCmn.sNSQ, &psEnc->sCmn.indices, xfw_Q3, psEnc->sCmn.pulses,
                            sEncCtrl.PredCoef_Q12[ 0 ], sEncCtrl.LTPCoef_Q14, sEncCtrl.AR2_Q13, sEncCtrl.HarmShapeGain_Q14,
-                           sEncCtrl.Tilt_Q14, sEncCtrl.LF_shp_Q14, sEncCtrl.Gains_Q16, sEncCtrl.pitchL, sEncCtrl.Lambda_Q10, sEncCtrl.LTP_scale_Q14 );
+                           sEncCtrl.Tilt_Q14, sEncCtrl.LF_shp_Q14, sEncCtrl.Gains_Q16, sEncCtrl.pitchL, sEncCtrl.Lambda_Q10, sEncCtrl.LTP_scale_Q14,
+                           psEnc->sCmn.arch );
                 } else {
                     silk_NSQ( &psEnc->sCmn, &psEnc->sCmn.sNSQ, &psEnc->sCmn.indices, xfw_Q3, psEnc->sCmn.pulses,
                             sEncCtrl.PredCoef_Q12[ 0 ], sEncCtrl.LTPCoef_Q14, sEncCtrl.AR2_Q13, sEncCtrl.HarmShapeGain_Q14,
-                            sEncCtrl.Tilt_Q14, sEncCtrl.LF_shp_Q14, sEncCtrl.Gains_Q16, sEncCtrl.pitchL, sEncCtrl.Lambda_Q10, sEncCtrl.LTP_scale_Q14 );
+                            sEncCtrl.Tilt_Q14, sEncCtrl.LF_shp_Q14, sEncCtrl.Gains_Q16, sEncCtrl.pitchL, sEncCtrl.Lambda_Q10, sEncCtrl.LTP_scale_Q14,
+                            psEnc->sCmn.arch);
                 }
 
                 /****************************************/
@@ -287,7 +289,7 @@
             for( i = 0; i < psEnc->sCmn.nb_subfr; i++ ) {
                 sEncCtrl.Gains_Q16[ i ] = silk_LSHIFT_SAT32( silk_SMULWB( sEncCtrl.GainsUnq_Q16[ i ], gainMult_Q8 ), 8 );
             }
- 
+
             /* Quantize gains */
             psEnc->sShape.LastGainIndex = sEncCtrl.lastGainIndexPrev;
             silk_gains_quant( psEnc->sCmn.indices.GainsIndices, sEncCtrl.Gains_Q16,
@@ -371,12 +373,12 @@
             silk_NSQ_del_dec( &psEnc->sCmn, &sNSQ_LBRR, psIndices_LBRR, xfw_Q3,
                 psEnc->sCmn.pulses_LBRR[ psEnc->sCmn.nFramesEncoded ], psEncCtrl->PredCoef_Q12[ 0 ], psEncCtrl->LTPCoef_Q14,
                 psEncCtrl->AR2_Q13, psEncCtrl->HarmShapeGain_Q14, psEncCtrl->Tilt_Q14, psEncCtrl->LF_shp_Q14,
-                psEncCtrl->Gains_Q16, psEncCtrl->pitchL, psEncCtrl->Lambda_Q10, psEncCtrl->LTP_scale_Q14 );
+                psEncCtrl->Gains_Q16, psEncCtrl->pitchL, psEncCtrl->Lambda_Q10, psEncCtrl->LTP_scale_Q14, psEnc->sCmn.arch );
         } else {
             silk_NSQ( &psEnc->sCmn, &sNSQ_LBRR, psIndices_LBRR, xfw_Q3,
                 psEnc->sCmn.pulses_LBRR[ psEnc->sCmn.nFramesEncoded ], psEncCtrl->PredCoef_Q12[ 0 ], psEncCtrl->LTPCoef_Q14,
                 psEncCtrl->AR2_Q13, psEncCtrl->HarmShapeGain_Q14, psEncCtrl->Tilt_Q14, psEncCtrl->LF_shp_Q14,
-                psEncCtrl->Gains_Q16, psEncCtrl->pitchL, psEncCtrl->Lambda_Q10, psEncCtrl->LTP_scale_Q14 );
+                psEncCtrl->Gains_Q16, psEncCtrl->pitchL, psEncCtrl->Lambda_Q10, psEncCtrl->LTP_scale_Q14, psEnc->sCmn.arch );
         }
 
         /* Restore original gains */
diff --git a/silk/fixed/find_LPC_FIX.c b/silk/fixed/find_LPC_FIX.c
index 783d32e..e11cdc8 100644
--- a/silk/fixed/find_LPC_FIX.c
+++ b/silk/fixed/find_LPC_FIX.c
@@ -95,7 +95,7 @@
             silk_NLSF2A( a_tmp_Q12, NLSF0_Q15, psEncC->predictLPCOrder );
 
             /* Calculate residual energy with NLSF interpolation */
-            silk_LPC_analysis_filter( LPC_res, x, a_tmp_Q12, 2 * subfr_length, psEncC->predictLPCOrder );
+            silk_LPC_analysis_filter( LPC_res, x, a_tmp_Q12, 2 * subfr_length, psEncC->predictLPCOrder, psEncC->arch );
 
             silk_sum_sqr_shift( &res_nrg0, &rshift0, LPC_res + psEncC->predictLPCOrder,                subfr_length - psEncC->predictLPCOrder );
             silk_sum_sqr_shift( &res_nrg1, &rshift1, LPC_res + psEncC->predictLPCOrder + subfr_length, subfr_length - psEncC->predictLPCOrder );
diff --git a/silk/fixed/find_LTP_FIX.c b/silk/fixed/find_LTP_FIX.c
index 8c4d703..1314a28 100644
--- a/silk/fixed/find_LTP_FIX.c
+++ b/silk/fixed/find_LTP_FIX.c
@@ -50,7 +50,8 @@
     const opus_int                  subfr_length,                           /* I    subframe length                                                             */
     const opus_int                  nb_subfr,                               /* I    number of subframes                                                         */
     const opus_int                  mem_offset,                             /* I    number of samples in LTP memory                                             */
-    opus_int                        corr_rshifts[ MAX_NB_SUBFR ]            /* O    right shifts applied to correlations                                        */
+    opus_int                        corr_rshifts[ MAX_NB_SUBFR ],           /* O    right shifts applied to correlations                                        */
+    int                             arch                                    /* I    Run-time architecture                                                       */
 )
 {
     opus_int   i, k, lshift;
@@ -84,10 +85,10 @@
             rr_shifts += ( LTP_CORRS_HEAD_ROOM - LZs );
         }
         corr_rshifts[ k ] = rr_shifts;
-        silk_corrMatrix_FIX( lag_ptr, subfr_length, LTP_ORDER, LTP_CORRS_HEAD_ROOM, WLTP_ptr, &corr_rshifts[ k ] );  /* WLTP_fix_ptr in Q( -corr_rshifts[ k ] ) */
+        silk_corrMatrix_FIX( lag_ptr, subfr_length, LTP_ORDER, LTP_CORRS_HEAD_ROOM, WLTP_ptr, &corr_rshifts[ k ], arch );  /* WLTP_fix_ptr in Q( -corr_rshifts[ k ] ) */
 
         /* The correlation vector always has lower max abs value than rr and/or RR so head room is assured */
-        silk_corrVector_FIX( lag_ptr, r_ptr, subfr_length, LTP_ORDER, Rr, corr_rshifts[ k ] );  /* Rr_fix_ptr   in Q( -corr_rshifts[ k ] ) */
+        silk_corrVector_FIX( lag_ptr, r_ptr, subfr_length, LTP_ORDER, Rr, corr_rshifts[ k ], arch );  /* Rr_fix_ptr   in Q( -corr_rshifts[ k ] ) */
         if( corr_rshifts[ k ] > rr_shifts ) {
             rr[ k ] = silk_RSHIFT( rr[ k ], corr_rshifts[ k ] - rr_shifts ); /* rr[ k ] in Q( -corr_rshifts[ k ] ) */
         }
diff --git a/silk/fixed/find_pitch_lags_FIX.c b/silk/fixed/find_pitch_lags_FIX.c
index 620f8dc..b8440a8 100644
--- a/silk/fixed/find_pitch_lags_FIX.c
+++ b/silk/fixed/find_pitch_lags_FIX.c
@@ -112,7 +112,7 @@
     /*****************************************/
     /* LPC analysis filtering                */
     /*****************************************/
-    silk_LPC_analysis_filter( res, x_buf, A_Q12, buf_len, psEnc->sCmn.pitchEstimationLPCOrder );
+    silk_LPC_analysis_filter( res, x_buf, A_Q12, buf_len, psEnc->sCmn.pitchEstimationLPCOrder, psEnc->sCmn.arch );
 
     if( psEnc->sCmn.indices.signalType != TYPE_NO_VOICE_ACTIVITY && psEnc->sCmn.first_frame_after_reset == 0 ) {
         /* Threshold for pitch estimator */
diff --git a/silk/fixed/find_pred_coefs_FIX.c b/silk/fixed/find_pred_coefs_FIX.c
index 5c22f82..d308e9c 100644
--- a/silk/fixed/find_pred_coefs_FIX.c
+++ b/silk/fixed/find_pred_coefs_FIX.c
@@ -89,11 +89,12 @@
         /* LTP analysis */
         silk_find_LTP_FIX( psEncCtrl->LTPCoef_Q14, WLTP, &psEncCtrl->LTPredCodGain_Q7,
             res_pitch, psEncCtrl->pitchL, Wght_Q15, psEnc->sCmn.subfr_length,
-            psEnc->sCmn.nb_subfr, psEnc->sCmn.ltp_mem_length, LTP_corrs_rshift );
+            psEnc->sCmn.nb_subfr, psEnc->sCmn.ltp_mem_length, LTP_corrs_rshift, psEnc->sCmn.arch );
 
         /* Quantize LTP gain parameters */
         silk_quant_LTP_gains( psEncCtrl->LTPCoef_Q14, psEnc->sCmn.indices.LTPIndex, &psEnc->sCmn.indices.PERIndex,
-            &psEnc->sCmn.sum_log_gain_Q7, WLTP, psEnc->sCmn.mu_LTP_Q9, psEnc->sCmn.LTPQuantLowComplexity, psEnc->sCmn.nb_subfr);
+            &psEnc->sCmn.sum_log_gain_Q7, WLTP, psEnc->sCmn.mu_LTP_Q9, psEnc->sCmn.LTPQuantLowComplexity, psEnc->sCmn.nb_subfr,
+            psEnc->sCmn.arch);
 
         /* Control LTP scaling */
         silk_LTP_scale_ctrl_FIX( psEnc, psEncCtrl, condCoding );
@@ -118,16 +119,16 @@
 
         silk_memset( psEncCtrl->LTPCoef_Q14, 0, psEnc->sCmn.nb_subfr * LTP_ORDER * sizeof( opus_int16 ) );
         psEncCtrl->LTPredCodGain_Q7 = 0;
-		psEnc->sCmn.sum_log_gain_Q7 = 0;
+        psEnc->sCmn.sum_log_gain_Q7 = 0;
     }
 
     /* Limit on total predictive coding gain */
     if( psEnc->sCmn.first_frame_after_reset ) {
         minInvGain_Q30 = SILK_FIX_CONST( 1.0f / MAX_PREDICTION_POWER_GAIN_AFTER_RESET, 30 );
-    } else {        
+    } else {
         minInvGain_Q30 = silk_log2lin( silk_SMLAWB( 16 << 7, (opus_int32)psEncCtrl->LTPredCodGain_Q7, SILK_FIX_CONST( 1.0 / 3, 16 ) ) );      /* Q16 */
-        minInvGain_Q30 = silk_DIV32_varQ( minInvGain_Q30, 
-            silk_SMULWW( SILK_FIX_CONST( MAX_PREDICTION_POWER_GAIN, 0 ), 
+        minInvGain_Q30 = silk_DIV32_varQ( minInvGain_Q30,
+            silk_SMULWW( SILK_FIX_CONST( MAX_PREDICTION_POWER_GAIN, 0 ),
                 silk_SMLAWB( SILK_FIX_CONST( 0.25, 18 ), SILK_FIX_CONST( 0.75, 18 ), psEncCtrl->coding_quality_Q14 ) ), 14 );
     }
 
@@ -139,7 +140,7 @@
 
     /* Calculate residual energy using quantized LPC coefficients */
     silk_residual_energy_FIX( psEncCtrl->ResNrg, psEncCtrl->ResNrgQ, LPC_in_pre, psEncCtrl->PredCoef_Q12, local_gains,
-        psEnc->sCmn.subfr_length, psEnc->sCmn.nb_subfr, psEnc->sCmn.predictLPCOrder );
+        psEnc->sCmn.subfr_length, psEnc->sCmn.nb_subfr, psEnc->sCmn.predictLPCOrder, psEnc->sCmn.arch );
 
     /* Copy to prediction struct for use in next frame for interpolation */
     silk_memcpy( psEnc->sCmn.prev_NLSFq_Q15, NLSF_Q15, sizeof( psEnc->sCmn.prev_NLSFq_Q15 ) );
diff --git a/silk/fixed/main_FIX.h b/silk/fixed/main_FIX.h
index a56ca07..375b5eb 100644
--- a/silk/fixed/main_FIX.h
+++ b/silk/fixed/main_FIX.h
@@ -97,6 +97,17 @@
     const opus_int16                x[]                                     /* I    Speech signal                                                               */
 );
 
+void silk_warped_LPC_analysis_filter_FIX_c( /* NOTE(review): "_c" suffix suggests the portable C version that platform headers may override - confirm */
+          opus_int32            state[],                    /* I/O  State [order + 1]                   */
+          opus_int32            res_Q2[],                   /* O    Residual signal [length]            */
+    const opus_int16            coef_Q13[],                 /* I    Coefficients [order]                */
+    const opus_int16            input[],                    /* I    Input signal [length]               */
+    const opus_int16            lambda_Q16,                 /* I    Warping factor                      */
+    const opus_int              length,                     /* I    Length of input signal              */
+    const opus_int              order                       /* I    Filter order (even)                 */
+);
+
+
 /**************************/
 /* Noise shaping analysis */
 /**************************/
@@ -166,7 +177,8 @@
     const opus_int                  subfr_length,                           /* I    subframe length                                                             */
     const opus_int                  nb_subfr,                               /* I    number of subframes                                                         */
     const opus_int                  mem_offset,                             /* I    number of samples in LTP memory                                             */
-    opus_int                        corr_rshifts[ MAX_NB_SUBFR ]            /* O    right shifts applied to correlations                                        */
+    opus_int                        corr_rshifts[ MAX_NB_SUBFR ],           /* O    right shifts applied to correlations                                        */
+    int                             arch                                    /* I    Run-time architecture                                                       */
 );
 
 void silk_LTP_analysis_filter_FIX(
@@ -190,7 +202,8 @@
     const opus_int32                gains[ MAX_NB_SUBFR ],                  /* I    Quantization gains                                                          */
     const opus_int                  subfr_length,                           /* I    Subframe length                                                             */
     const opus_int                  nb_subfr,                               /* I    Number of subframes                                                         */
-    const opus_int                  LPC_order                               /* I    LPC order                                                                   */
+    const opus_int                  LPC_order,                              /* I    LPC order                                                                   */
+    int                             arch                                    /* I    Run-time architecture                                                       */
 );
 
 /* Residual energy: nrg = wxx - 2 * wXx * c + c' * wXX * c */
@@ -220,7 +233,8 @@
     const opus_int                  order,                                  /* I    Max lag for correlation                                                     */
     const opus_int                  head_room,                              /* I    Desired headroom                                                            */
     opus_int32                      *XX,                                    /* O    Pointer to X'*X correlation matrix [ order x order ]                        */
-    opus_int                        *rshifts                                /* I/O  Right shifts of correlations                                                */
+    opus_int                        *rshifts,                               /* I/O  Right shifts of correlations                                                */
+    int                              arch                                   /* I    Run-time architecture                                                       */
 );
 
 /* Calculates correlation vector X'*t */
@@ -230,7 +244,8 @@
     const opus_int                  L,                                      /* I    Length of vectors                                                           */
     const opus_int                  order,                                  /* I    Max lag for correlation                                                     */
     opus_int32                      *Xt,                                    /* O    Pointer to X'*t correlation vector [order]                                  */
-    const opus_int                  rshifts                                 /* I    Right shifts of correlations                                                */
+    const opus_int                  rshifts,                                /* I    Right shifts of correlations                                                */
+    int                             arch                                    /* I    Run-time architecture                                                       */
 );
 
 /* Add noise to matrix diagonal */
diff --git a/silk/fixed/mips/noise_shape_analysis_FIX_mipsr1.h b/silk/fixed/mips/noise_shape_analysis_FIX_mipsr1.h
new file mode 100644
index 0000000..c30481e
--- /dev/null
+++ b/silk/fixed/mips/noise_shape_analysis_FIX_mipsr1.h
@@ -0,0 +1,336 @@
+/***********************************************************************
+Copyright (c) 2006-2011, Skype Limited. All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+- Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+- Neither the name of Internet Society, IETF or IETF Trust, nor the
+names of specific contributors, may be used to endorse or promote
+products derived from this software without specific prior written
+permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+***********************************************************************/
+
+
+/* silk_noise_shape_analysis_FIX, as provided by this *_mipsr1.h header:       */
+/* computes noise shaping AR coefficients and initial gain values, plus the    */
+/* low-frequency shaping, tilt and harmonic shaping controls, per subframe.    */
+#define OVERRIDE_silk_noise_shape_analysis_FIX /* NOTE(review): presumably makes the generic C file skip its own definition - confirm */
+
+void silk_noise_shape_analysis_FIX(
+    silk_encoder_state_FIX          *psEnc,                                 /* I/O  Encoder state FIX                                                           */
+    silk_encoder_control_FIX        *psEncCtrl,                             /* I/O  Encoder control FIX                                                         */
+    const opus_int16                *pitch_res,                             /* I    LPC residual from pitch analysis                                            */
+    const opus_int16                *x,                                     /* I    Input signal [ frame_length + la_shape ]                                    */
+    int                              arch                                   /* I    Run-time architecture                                                       */
+)
+{
+    silk_shape_state_FIX *psShapeSt = &psEnc->sShape;
+    opus_int     k, i, nSamples, Qnrg, b_Q14, warping_Q16, scale = 0;
+    opus_int32   SNR_adj_dB_Q7, HarmBoost_Q16, HarmShapeGain_Q16, Tilt_Q16, tmp32;
+    opus_int32   nrg, pre_nrg_Q30, log_energy_Q7, log_energy_prev_Q7, energy_variation_Q7;
+    opus_int32   delta_Q16, BWExp1_Q16, BWExp2_Q16, gain_mult_Q16, gain_add_Q16, strength_Q16, b_Q8;
+    opus_int32   auto_corr[     MAX_SHAPE_LPC_ORDER + 1 ];
+    opus_int32   refl_coef_Q16[ MAX_SHAPE_LPC_ORDER ];
+    opus_int32   AR1_Q24[       MAX_SHAPE_LPC_ORDER ];
+    opus_int32   AR2_Q24[       MAX_SHAPE_LPC_ORDER ];
+    VARDECL( opus_int16, x_windowed );
+    const opus_int16 *x_ptr, *pitch_res_ptr;
+    SAVE_STACK;
+
+    /* Point to start of first LPC analysis block */
+    x_ptr = x - psEnc->sCmn.la_shape;
+
+    /****************/
+    /* GAIN CONTROL */
+    /****************/
+    SNR_adj_dB_Q7 = psEnc->sCmn.SNR_dB_Q7;
+
+    /* Input quality: sum of the two lowest VAD band qualities (Q15) shifted right by 2, i.e. their average expressed in Q14 */
+    psEncCtrl->input_quality_Q14 = ( opus_int )silk_RSHIFT( (opus_int32)psEnc->sCmn.input_quality_bands_Q15[ 0 ]
+        + psEnc->sCmn.input_quality_bands_Q15[ 1 ], 2 );
+
+    /* Coding quality level, between 0.0_Q0 and 1.0_Q0, but in Q14 */
+    psEncCtrl->coding_quality_Q14 = silk_RSHIFT( silk_sigm_Q15( silk_RSHIFT_ROUND( SNR_adj_dB_Q7 -
+        SILK_FIX_CONST( 20.0, 7 ), 4 ) ), 1 );
+
+    /* Reduce coding SNR during low speech activity (only when useCBR == 0) */
+    if( psEnc->sCmn.useCBR == 0 ) {
+        b_Q8 = SILK_FIX_CONST( 1.0, 8 ) - psEnc->sCmn.speech_activity_Q8;
+        b_Q8 = silk_SMULWB( silk_LSHIFT( b_Q8, 8 ), b_Q8 );
+        SNR_adj_dB_Q7 = silk_SMLAWB( SNR_adj_dB_Q7,
+            silk_SMULBB( SILK_FIX_CONST( -BG_SNR_DECR_dB, 7 ) >> ( 4 + 1 ), b_Q8 ),                                       /* Q11*/
+            silk_SMULWB( SILK_FIX_CONST( 1.0, 14 ) + psEncCtrl->input_quality_Q14, psEncCtrl->coding_quality_Q14 ) );     /* Q12*/
+    }
+
+    if( psEnc->sCmn.indices.signalType == TYPE_VOICED ) {
+        /* Reduce gains for periodic signals: adjust the SNR target by HARM_SNR_INCR_dB scaled by LTPCorr_Q15 */
+        SNR_adj_dB_Q7 = silk_SMLAWB( SNR_adj_dB_Q7, SILK_FIX_CONST( HARM_SNR_INCR_dB, 8 ), psEnc->LTPCorr_Q15 );
+    } else {
+        /* For unvoiced signals and low-quality input, adjust the quality slower than SNR_dB setting */
+        SNR_adj_dB_Q7 = silk_SMLAWB( SNR_adj_dB_Q7,
+            silk_SMLAWB( SILK_FIX_CONST( 6.0, 9 ), -SILK_FIX_CONST( 0.4, 18 ), psEnc->sCmn.SNR_dB_Q7 ),
+            SILK_FIX_CONST( 1.0, 14 ) - psEncCtrl->input_quality_Q14 );
+    }
+
+    /*************************/
+    /* SPARSENESS PROCESSING */
+    /*************************/
+    /* Set quantizer offset */
+    if( psEnc->sCmn.indices.signalType == TYPE_VOICED ) {
+        /* Initially set to 0; may be overruled in process_gains(..) */
+        psEnc->sCmn.indices.quantOffsetType = 0;
+        psEncCtrl->sparseness_Q8 = 0;
+    } else {
+        /* Sparseness measure, based on relative fluctuations of energy per 2 milliseconds */
+        nSamples = silk_LSHIFT( psEnc->sCmn.fs_kHz, 1 ); /* 2 * fs_kHz samples = one 2 ms block */
+        energy_variation_Q7 = 0;
+        log_energy_prev_Q7  = 0;
+        pitch_res_ptr = pitch_res;
+        for( k = 0; k < silk_SMULBB( SUB_FRAME_LENGTH_MS, psEnc->sCmn.nb_subfr ) / 2; k++ ) {
+            silk_sum_sqr_shift( &nrg, &scale, pitch_res_ptr, nSamples );
+            nrg += silk_RSHIFT( nSamples, scale );           /* Q(-scale)*/
+
+            log_energy_Q7 = silk_lin2log( nrg );
+            if( k > 0 ) {
+                energy_variation_Q7 += silk_abs( log_energy_Q7 - log_energy_prev_Q7 );
+            }
+            log_energy_prev_Q7 = log_energy_Q7;
+            pitch_res_ptr += nSamples;
+        }
+
+        psEncCtrl->sparseness_Q8 = silk_RSHIFT( silk_sigm_Q15( silk_SMULWB( energy_variation_Q7 -
+            SILK_FIX_CONST( 5.0, 7 ), SILK_FIX_CONST( 0.1, 16 ) ) ), 7 );
+
+        /* Set quantization offset depending on sparseness measure */
+        if( psEncCtrl->sparseness_Q8 > SILK_FIX_CONST( SPARSENESS_THRESHOLD_QNT_OFFSET, 8 ) ) {
+            psEnc->sCmn.indices.quantOffsetType = 0;
+        } else {
+            psEnc->sCmn.indices.quantOffsetType = 1;
+        }
+
+        /* Increase coding SNR for sparse signals */
+        SNR_adj_dB_Q7 = silk_SMLAWB( SNR_adj_dB_Q7, SILK_FIX_CONST( SPARSE_SNR_INCR_dB, 15 ), psEncCtrl->sparseness_Q8 - SILK_FIX_CONST( 0.5, 8 ) );
+    }
+
+    /*******************************/
+    /* Control bandwidth expansion */
+    /*******************************/
+    /* More BWE for signals with high prediction gain */
+    strength_Q16 = silk_SMULWB( psEncCtrl->predGain_Q16, SILK_FIX_CONST( FIND_PITCH_WHITE_NOISE_FRACTION, 16 ) );
+    BWExp1_Q16 = BWExp2_Q16 = silk_DIV32_varQ( SILK_FIX_CONST( BANDWIDTH_EXPANSION, 16 ),
+        silk_SMLAWW( SILK_FIX_CONST( 1.0, 16 ), strength_Q16, strength_Q16 ), 16 );
+    delta_Q16  = silk_SMULWB( SILK_FIX_CONST( 1.0, 16 ) - silk_SMULBB( 3, psEncCtrl->coding_quality_Q14 ),
+        SILK_FIX_CONST( LOW_RATE_BANDWIDTH_EXPANSION_DELTA, 16 ) );
+    BWExp1_Q16 = silk_SUB32( BWExp1_Q16, delta_Q16 );
+    BWExp2_Q16 = silk_ADD32( BWExp2_Q16, delta_Q16 );
+    /* BWExp1 will be applied after BWExp2, so make it relative */
+    BWExp1_Q16 = silk_DIV32_16( silk_LSHIFT( BWExp1_Q16, 14 ), silk_RSHIFT( BWExp2_Q16, 2 ) );
+
+    if( psEnc->sCmn.warping_Q16 > 0 ) {
+        /* Slightly more warping in analysis will move quantization noise up in frequency, where it's better masked */
+        warping_Q16 = silk_SMLAWB( psEnc->sCmn.warping_Q16, (opus_int32)psEncCtrl->coding_quality_Q14, SILK_FIX_CONST( 0.01, 18 ) );
+    } else {
+        warping_Q16 = 0;
+    }
+
+    /********************************************/
+    /* Compute noise shaping AR coefs and gains */
+    /********************************************/
+    ALLOC( x_windowed, psEnc->sCmn.shapeWinLength, opus_int16 );
+    for( k = 0; k < psEnc->sCmn.nb_subfr; k++ ) {
+        /* Apply window: sine slope followed by flat part followed by cosine slope */
+        opus_int shift, slope_part, flat_part;
+        flat_part = psEnc->sCmn.fs_kHz * 3; /* 3 ms worth of samples */
+        slope_part = silk_RSHIFT( psEnc->sCmn.shapeWinLength - flat_part, 1 ); /* remainder of the window, split evenly between the two slopes */
+
+        silk_apply_sine_window( x_windowed, x_ptr, 1, slope_part );
+        shift = slope_part;
+        silk_memcpy( x_windowed + shift, x_ptr + shift, flat_part * sizeof(opus_int16) );
+        shift += flat_part;
+        silk_apply_sine_window( x_windowed + shift, x_ptr + shift, 2, slope_part );
+
+        /* Update pointer: next LPC analysis block */
+        x_ptr += psEnc->sCmn.subfr_length;
+
+        if( psEnc->sCmn.warping_Q16 > 0 ) {
+            /* Calculate warped auto correlation */
+            silk_warped_autocorrelation_FIX( auto_corr, &scale, x_windowed, warping_Q16, psEnc->sCmn.shapeWinLength, psEnc->sCmn.shapingLPCOrder );
+        } else {
+            /* Calculate regular auto correlation */
+            silk_autocorr( auto_corr, &scale, x_windowed, psEnc->sCmn.shapeWinLength, psEnc->sCmn.shapingLPCOrder + 1, arch );
+        }
+
+        /* Add white noise, as a fraction of energy */
+        auto_corr[0] = silk_ADD32( auto_corr[0], silk_max_32( silk_SMULWB( silk_RSHIFT( auto_corr[ 0 ], 4 ),
+            SILK_FIX_CONST( SHAPE_WHITE_NOISE_FRACTION, 20 ) ), 1 ) );
+
+        /* Calculate the reflection coefficients using schur */
+        nrg = silk_schur64( refl_coef_Q16, auto_corr, psEnc->sCmn.shapingLPCOrder );
+        silk_assert( nrg >= 0 );
+
+        /* Convert reflection coefficients to prediction coefficients */
+        silk_k2a_Q16( AR2_Q24, refl_coef_Q16, psEnc->sCmn.shapingLPCOrder );
+
+        Qnrg = -scale;          /* range: -12...30*/
+        silk_assert( Qnrg >= -12 );
+        silk_assert( Qnrg <=  30 );
+
+        /* Make sure that Qnrg is an even number */
+        if( Qnrg & 1 ) {
+            Qnrg -= 1;
+            nrg >>= 1;
+        }
+
+        tmp32 = silk_SQRT_APPROX( nrg );
+        Qnrg >>= 1;             /* range: -6...15*/
+
+        psEncCtrl->Gains_Q16[ k ] = (silk_LSHIFT32( silk_LIMIT( (tmp32), silk_RSHIFT32( silk_int32_MIN, (16 - Qnrg) ), \
+                            silk_RSHIFT32( silk_int32_MAX, (16 - Qnrg) ) ), (16 - Qnrg) )); /* clamp, then shift left by (16 - Qnrg): saturating conversion of tmp32 to Q16 */
+
+        if( psEnc->sCmn.warping_Q16 > 0 ) {
+            /* Adjust gain for warping */
+            gain_mult_Q16 = warped_gain( AR2_Q24, warping_Q16, psEnc->sCmn.shapingLPCOrder );
+            silk_assert( psEncCtrl->Gains_Q16[ k ] >= 0 );
+            if ( silk_SMULWW( silk_RSHIFT_ROUND( psEncCtrl->Gains_Q16[ k ], 1 ), gain_mult_Q16 ) >= ( silk_int32_MAX >> 1 ) ) {
+               psEncCtrl->Gains_Q16[ k ] = silk_int32_MAX; /* half-gain product already reaches int32_MAX / 2: saturate instead of overflowing */
+            } else {
+               psEncCtrl->Gains_Q16[ k ] = silk_SMULWW( psEncCtrl->Gains_Q16[ k ], gain_mult_Q16 );
+            }
+        }
+
+        /* Bandwidth expansion for synthesis filter shaping */
+        silk_bwexpander_32( AR2_Q24, psEnc->sCmn.shapingLPCOrder, BWExp2_Q16 );
+
+        /* Compute noise shaping filter coefficients */
+        silk_memcpy( AR1_Q24, AR2_Q24, psEnc->sCmn.shapingLPCOrder * sizeof( opus_int32 ) );
+
+        /* Bandwidth expansion for analysis filter shaping */
+        silk_assert( BWExp1_Q16 <= SILK_FIX_CONST( 1.0, 16 ) );
+        silk_bwexpander_32( AR1_Q24, psEnc->sCmn.shapingLPCOrder, BWExp1_Q16 );
+
+        /* Ratio of prediction gains, in energy domain */
+        pre_nrg_Q30 = silk_LPC_inverse_pred_gain_Q24( AR2_Q24, psEnc->sCmn.shapingLPCOrder );
+        nrg         = silk_LPC_inverse_pred_gain_Q24( AR1_Q24, psEnc->sCmn.shapingLPCOrder );
+
+        /*psEncCtrl->GainsPre[ k ] = 1.0f - 0.7f * ( 1.0f - pre_nrg / nrg ) = 0.3f + 0.7f * pre_nrg / nrg;*/
+        pre_nrg_Q30 = silk_LSHIFT32( silk_SMULWB( pre_nrg_Q30, SILK_FIX_CONST( 0.7, 15 ) ), 1 );
+        psEncCtrl->GainsPre_Q14[ k ] = ( opus_int ) SILK_FIX_CONST( 0.3, 14 ) + silk_DIV32_varQ( pre_nrg_Q30, nrg, 14 );
+
+        /* Convert to monic warped prediction coefficients and limit absolute values */
+        limit_warped_coefs( AR2_Q24, AR1_Q24, warping_Q16, SILK_FIX_CONST( 3.999, 24 ), psEnc->sCmn.shapingLPCOrder );
+
+        /* Convert from Q24 to Q13 and store in int16 */
+        for( i = 0; i < psEnc->sCmn.shapingLPCOrder; i++ ) {
+            psEncCtrl->AR1_Q13[ k * MAX_SHAPE_LPC_ORDER + i ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( AR1_Q24[ i ], 11 ) );
+            psEncCtrl->AR2_Q13[ k * MAX_SHAPE_LPC_ORDER + i ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( AR2_Q24[ i ], 11 ) );
+        }
+    }
+
+    /*****************/
+    /* Gain tweaking */
+    /*****************/
+    /* Increase gains during low speech activity and put lower limit on gains */
+    gain_mult_Q16 = silk_log2lin( -silk_SMLAWB( -SILK_FIX_CONST( 16.0, 7 ), SNR_adj_dB_Q7, SILK_FIX_CONST( 0.16, 16 ) ) );
+    gain_add_Q16  = silk_log2lin(  silk_SMLAWB(  SILK_FIX_CONST( 16.0, 7 ), SILK_FIX_CONST( MIN_QGAIN_DB, 7 ), SILK_FIX_CONST( 0.16, 16 ) ) );
+    silk_assert( gain_mult_Q16 > 0 );
+    for( k = 0; k < psEnc->sCmn.nb_subfr; k++ ) {
+        psEncCtrl->Gains_Q16[ k ] = silk_SMULWW( psEncCtrl->Gains_Q16[ k ], gain_mult_Q16 );
+        silk_assert( psEncCtrl->Gains_Q16[ k ] >= 0 );
+        psEncCtrl->Gains_Q16[ k ] = silk_ADD_POS_SAT32( psEncCtrl->Gains_Q16[ k ], gain_add_Q16 );
+    }
+
+    gain_mult_Q16 = SILK_FIX_CONST( 1.0, 16 ) + silk_RSHIFT_ROUND( silk_MLA( SILK_FIX_CONST( INPUT_TILT, 26 ),
+        psEncCtrl->coding_quality_Q14, SILK_FIX_CONST( HIGH_RATE_INPUT_TILT, 12 ) ), 10 );
+    for( k = 0; k < psEnc->sCmn.nb_subfr; k++ ) {
+        psEncCtrl->GainsPre_Q14[ k ] = silk_SMULWB( gain_mult_Q16, psEncCtrl->GainsPre_Q14[ k ] );
+    }
+
+    /************************************************/
+    /* Control low-frequency shaping and noise tilt */
+    /************************************************/
+    /* Less low frequency shaping for noisy inputs */
+    strength_Q16 = silk_MUL( SILK_FIX_CONST( LOW_FREQ_SHAPING, 4 ), silk_SMLAWB( SILK_FIX_CONST( 1.0, 12 ),
+        SILK_FIX_CONST( LOW_QUALITY_LOW_FREQ_SHAPING_DECR, 13 ), psEnc->sCmn.input_quality_bands_Q15[ 0 ] - SILK_FIX_CONST( 1.0, 15 ) ) );
+    strength_Q16 = silk_RSHIFT( silk_MUL( strength_Q16, psEnc->sCmn.speech_activity_Q8 ), 8 );
+    if( psEnc->sCmn.indices.signalType == TYPE_VOICED ) {
+        /* Reduce low frequencies quantization noise for periodic signals, depending on pitch lag */
+        /*f = 400; freqz([1, -0.98 + 2e-4 * f], [1, -0.97 + 7e-4 * f], 2^12, Fs); axis([0, 1000, -10, 1])*/
+        opus_int fs_kHz_inv = silk_DIV32_16( SILK_FIX_CONST( 0.2, 14 ), psEnc->sCmn.fs_kHz );
+        for( k = 0; k < psEnc->sCmn.nb_subfr; k++ ) {
+            b_Q14 = fs_kHz_inv + silk_DIV32_16( SILK_FIX_CONST( 3.0, 14 ), psEncCtrl->pitchL[ k ] );
+            /* Pack two coefficients in one int32 */
+            psEncCtrl->LF_shp_Q14[ k ]  = silk_LSHIFT( SILK_FIX_CONST( 1.0, 14 ) - b_Q14 - silk_SMULWB( strength_Q16, b_Q14 ), 16 ); /* upper 16 bits */
+            psEncCtrl->LF_shp_Q14[ k ] |= (opus_uint16)( b_Q14 - SILK_FIX_CONST( 1.0, 14 ) ); /* lower 16 bits */
+        }
+        silk_assert( SILK_FIX_CONST( HARM_HP_NOISE_COEF, 24 ) < SILK_FIX_CONST( 0.5, 24 ) ); /* Guarantees that second argument to SMULWB() is within range of an opus_int16*/
+        Tilt_Q16 = - SILK_FIX_CONST( HP_NOISE_COEF, 16 ) -
+            silk_SMULWB( SILK_FIX_CONST( 1.0, 16 ) - SILK_FIX_CONST( HP_NOISE_COEF, 16 ),
+                silk_SMULWB( SILK_FIX_CONST( HARM_HP_NOISE_COEF, 24 ), psEnc->sCmn.speech_activity_Q8 ) );
+    } else {
+        b_Q14 = silk_DIV32_16( 21299, psEnc->sCmn.fs_kHz ); /* 1.3_Q0 = 21299_Q14*/
+        /* Pack two coefficients in one int32 */
+        psEncCtrl->LF_shp_Q14[ 0 ]  = silk_LSHIFT( SILK_FIX_CONST( 1.0, 14 ) - b_Q14 -
+            silk_SMULWB( strength_Q16, silk_SMULWB( SILK_FIX_CONST( 0.6, 16 ), b_Q14 ) ), 16 ); /* upper 16 bits */
+        psEncCtrl->LF_shp_Q14[ 0 ] |= (opus_uint16)( b_Q14 - SILK_FIX_CONST( 1.0, 14 ) ); /* lower 16 bits */
+        for( k = 1; k < psEnc->sCmn.nb_subfr; k++ ) {
+            psEncCtrl->LF_shp_Q14[ k ] = psEncCtrl->LF_shp_Q14[ 0 ];
+        }
+        Tilt_Q16 = -SILK_FIX_CONST( HP_NOISE_COEF, 16 );
+    }
+
+    /****************************/
+    /* HARMONIC SHAPING CONTROL */
+    /****************************/
+    /* Control boosting of harmonic frequencies */
+    HarmBoost_Q16 = silk_SMULWB( silk_SMULWB( SILK_FIX_CONST( 1.0, 17 ) - silk_LSHIFT( psEncCtrl->coding_quality_Q14, 3 ),
+        psEnc->LTPCorr_Q15 ), SILK_FIX_CONST( LOW_RATE_HARMONIC_BOOST, 16 ) );
+
+    /* More harmonic boost for noisy input signals */
+    HarmBoost_Q16 = silk_SMLAWB( HarmBoost_Q16,
+        SILK_FIX_CONST( 1.0, 16 ) - silk_LSHIFT( psEncCtrl->input_quality_Q14, 2 ), SILK_FIX_CONST( LOW_INPUT_QUALITY_HARMONIC_BOOST, 16 ) );
+
+    if( USE_HARM_SHAPING && psEnc->sCmn.indices.signalType == TYPE_VOICED ) {
+        /* More harmonic noise shaping for high bitrates or noisy input */
+        HarmShapeGain_Q16 = silk_SMLAWB( SILK_FIX_CONST( HARMONIC_SHAPING, 16 ),
+                SILK_FIX_CONST( 1.0, 16 ) - silk_SMULWB( SILK_FIX_CONST( 1.0, 18 ) - silk_LSHIFT( psEncCtrl->coding_quality_Q14, 4 ),
+                psEncCtrl->input_quality_Q14 ), SILK_FIX_CONST( HIGH_RATE_OR_LOW_QUALITY_HARMONIC_SHAPING, 16 ) );
+
+        /* Less harmonic noise shaping for less periodic signals */
+        HarmShapeGain_Q16 = silk_SMULWB( silk_LSHIFT( HarmShapeGain_Q16, 1 ),
+            silk_SQRT_APPROX( silk_LSHIFT( psEnc->LTPCorr_Q15, 15 ) ) );
+    } else {
+        HarmShapeGain_Q16 = 0;
+    }
+
+    /*************************/
+    /* Smooth over subframes */
+    /*************************/
+    for( k = 0; k < MAX_NB_SUBFR; k++ ) { /* all MAX_NB_SUBFR slots are written, regardless of nb_subfr */
+        psShapeSt->HarmBoost_smth_Q16 =
+            silk_SMLAWB( psShapeSt->HarmBoost_smth_Q16,     HarmBoost_Q16     - psShapeSt->HarmBoost_smth_Q16,     SILK_FIX_CONST( SUBFR_SMTH_COEF, 16 ) );
+        psShapeSt->HarmShapeGain_smth_Q16 =
+            silk_SMLAWB( psShapeSt->HarmShapeGain_smth_Q16, HarmShapeGain_Q16 - psShapeSt->HarmShapeGain_smth_Q16, SILK_FIX_CONST( SUBFR_SMTH_COEF, 16 ) );
+        psShapeSt->Tilt_smth_Q16 =
+            silk_SMLAWB( psShapeSt->Tilt_smth_Q16,          Tilt_Q16          - psShapeSt->Tilt_smth_Q16,          SILK_FIX_CONST( SUBFR_SMTH_COEF, 16 ) );
+
+        psEncCtrl->HarmBoost_Q14[ k ]     = ( opus_int )silk_RSHIFT_ROUND( psShapeSt->HarmBoost_smth_Q16,     2 );
+        psEncCtrl->HarmShapeGain_Q14[ k ] = ( opus_int )silk_RSHIFT_ROUND( psShapeSt->HarmShapeGain_smth_Q16, 2 );
+        psEncCtrl->Tilt_Q14[ k ]          = ( opus_int )silk_RSHIFT_ROUND( psShapeSt->Tilt_smth_Q16,          2 );
+    }
+    RESTORE_STACK;
+}
diff --git a/silk/fixed/mips/prefilter_FIX_mipsr1.h b/silk/fixed/mips/prefilter_FIX_mipsr1.h
new file mode 100644
index 0000000..21b2568
--- /dev/null
+++ b/silk/fixed/mips/prefilter_FIX_mipsr1.h
@@ -0,0 +1,184 @@
+/***********************************************************************
+Copyright (c) 2006-2011, Skype Limited. All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+- Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+- Neither the name of Internet Society, IETF or IETF Trust, nor the
+names of specific contributors, may be used to endorse or promote
+products derived from this software without specific prior written
+permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+***********************************************************************/
+#ifndef __PREFILTER_FIX_MIPSR1_H__
+#define __PREFILTER_FIX_MIPSR1_H__
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "main_FIX.h"
+#include "stack_alloc.h"
+#include "tuning_parameters.h"
+
+#define OVERRIDE_silk_warped_LPC_analysis_filter_FIX
+void silk_warped_LPC_analysis_filter_FIX(
+          opus_int32            state[],                    /* I/O  State [order + 1]                   */
+          opus_int32            res_Q2[],                   /* O    Residual signal [length]            */
+    const opus_int16            coef_Q13[],                 /* I    Coefficients [order]                */
+    const opus_int16            input[],                    /* I    Input signal [length]               */
+    const opus_int16            lambda_Q16,                 /* I    Warping factor                      */
+    const opus_int              length,                     /* I    Length of input signal (even)       */
+    const opus_int              order,                      /* I    Filter order (even)                 */
+               int              arch                        /* I    Run-time architecture (unused)      */
+)
+{
+    opus_int     n, i;
+    opus_int32   acc_Q11, acc_Q22, tmp1, tmp2, tmp3, tmp4;
+    opus_int32   state_cur, state_next;
+    /* Two samples per outer pass: tmp1/tmp2/acc_Q11 serve input[ n ], tmp3/tmp4/acc_Q22 serve input[ n+1 ] */
+    (void)arch;
+
+    /* Order must be even */
+    /* Length must be even: the outer loop consumes input[ n ] and input[ n+1 ] together */
+
+    silk_assert( ( order & 1 ) == 0 );
+    silk_assert( ( length & 1 ) == 0 );
+
+    for( n = 0; n < length; n+=2 ) {
+        /* Sample n: output of lowpass section */
+        tmp2 = silk_SMLAWB( state[ 0 ], state[ 1 ], lambda_Q16 );
+        state_cur = silk_LSHIFT( input[ n ], 14 );
+        /* Sample n: output of allpass section */
+        tmp1 = silk_SMLAWB( state[ 1 ], state[ 2 ] - tmp2, lambda_Q16 );
+        state_next = tmp2;
+        acc_Q11 = silk_RSHIFT( order, 1 );
+        acc_Q11 = silk_SMLAWB( acc_Q11, tmp2, coef_Q13[ 0 ] );
+
+
+        /* Sample n+1: output of lowpass section, fed from the locals sample n just produced */
+        tmp4 = silk_SMLAWB( state_cur, state_next, lambda_Q16 );
+        state[ 0 ] = silk_LSHIFT( input[ n+1 ], 14 );
+        /* Sample n+1: output of allpass section */
+        tmp3 = silk_SMLAWB( state_next, tmp1 - tmp4, lambda_Q16 );
+        state[ 1 ] = tmp4;
+        acc_Q22 = silk_RSHIFT( order, 1 );
+        acc_Q22 = silk_SMLAWB( acc_Q22, tmp4, coef_Q13[ 0 ] );
+
+        /* Loop over allpass sections, two sections per pass, both samples interleaved */
+        for( i = 2; i < order; i += 2 ) {
+            /* Sample n: output of allpass section */
+            tmp2 = silk_SMLAWB( state[ i ], state[ i + 1 ] - tmp1, lambda_Q16 );
+            state_cur = tmp1;
+            acc_Q11 = silk_SMLAWB( acc_Q11, tmp1, coef_Q13[ i - 1 ] );
+            /* Sample n: output of allpass section */
+            tmp1 = silk_SMLAWB( state[ i + 1 ], state[ i + 2 ] - tmp2, lambda_Q16 );
+            state_next = tmp2;
+            acc_Q11 = silk_SMLAWB( acc_Q11, tmp2, coef_Q13[ i ] );
+
+
+            /* Sample n+1: output of allpass section; only this sample's values are stored to state[] */
+            tmp4 = silk_SMLAWB( state_cur, state_next - tmp3, lambda_Q16 );
+            state[ i ] = tmp3;
+            acc_Q22 = silk_SMLAWB( acc_Q22, tmp3, coef_Q13[ i - 1 ] );
+            /* Sample n+1: output of allpass section */
+            tmp3 = silk_SMLAWB( state_next, tmp1 - tmp4, lambda_Q16 );
+            state[ i + 1 ] = tmp4;
+            acc_Q22 = silk_SMLAWB( acc_Q22, tmp4, coef_Q13[ i ] );
+        }
+        acc_Q11 = silk_SMLAWB( acc_Q11, tmp1, coef_Q13[ order - 1 ] );
+        res_Q2[ n ] = silk_LSHIFT( (opus_int32)input[ n ], 2 ) - silk_RSHIFT_ROUND( acc_Q11, 9 );
+
+        state[ order ] = tmp3;
+        acc_Q22 = silk_SMLAWB( acc_Q22, tmp3, coef_Q13[ order - 1 ] );
+        res_Q2[ n+1 ] = silk_LSHIFT( (opus_int32)input[ n+1 ], 2 ) - silk_RSHIFT_ROUND( acc_Q22, 9 );
+    }
+}
+
+
+
+/* Prefilter for finding the quantizer input signal (variant with the lag test hoisted out of the sample loop) */
+#define OVERRIDE_silk_prefilt_FIX
+static OPUS_INLINE void silk_prefilt_FIX(
+    silk_prefilter_state_FIX    *P,                         /* I/O  state                               */
+    opus_int32                  st_res_Q12[],               /* I    short term residual signal          */
+    opus_int32                  xw_Q3[],                    /* O    prefiltered signal                  */
+    opus_int32                  HarmShapeFIRPacked_Q12,     /* I    Harmonic shaping coefficients       */
+    opus_int                    Tilt_Q14,                   /* I    Tilt shaping coefficient            */
+    opus_int32                  LF_shp_Q14,                 /* I    Low-frequency shaping coefficients  */
+    opus_int                    lag,                        /* I    Lag for harmonic shaping            */
+    opus_int                    length                      /* I    Length of signals                   */
+)
+{
+    opus_int   i, idx, LTP_shp_buf_idx;
+    opus_int32 n_LTP_Q12, n_Tilt_Q10, n_LF_Q10;
+    opus_int32 sLF_MA_shp_Q12, sLF_AR_shp_Q12;
+    opus_int16 *LTP_shp_buf;
+
+    /* Work on local copies of the state fields; they are written back to *P once, after the loops */
+    LTP_shp_buf     = P->sLTP_shp;
+    LTP_shp_buf_idx = P->sLTP_shp_buf_idx;
+    sLF_AR_shp_Q12  = P->sLF_AR_shp_Q12;
+    sLF_MA_shp_Q12  = P->sLF_MA_shp_Q12;
+
+    if( lag > 0 ) {
+        /* The harmonic FIR is unrolled into three explicit taps below; check that assumption once, not per sample */
+        silk_assert( HARM_SHAPE_FIR_TAPS == 3 );
+        for( i = 0; i < length; i++ ) {
+            idx = lag + LTP_shp_buf_idx;
+            n_LTP_Q12 = silk_SMULBB(            LTP_shp_buf[ ( idx - HARM_SHAPE_FIR_TAPS / 2 - 1) & LTP_MASK ], HarmShapeFIRPacked_Q12 );
+            n_LTP_Q12 = silk_SMLABT( n_LTP_Q12, LTP_shp_buf[ ( idx - HARM_SHAPE_FIR_TAPS / 2    ) & LTP_MASK ], HarmShapeFIRPacked_Q12 );
+            n_LTP_Q12 = silk_SMLABB( n_LTP_Q12, LTP_shp_buf[ ( idx - HARM_SHAPE_FIR_TAPS / 2 + 1) & LTP_MASK ], HarmShapeFIRPacked_Q12 );
+
+            n_Tilt_Q10 = silk_SMULWB( sLF_AR_shp_Q12, Tilt_Q14 );
+            n_LF_Q10   = silk_SMLAWB( silk_SMULWT( sLF_AR_shp_Q12, LF_shp_Q14 ), sLF_MA_shp_Q12, LF_shp_Q14 );
+
+            sLF_AR_shp_Q12 = silk_SUB32( st_res_Q12[ i ], silk_LSHIFT( n_Tilt_Q10, 2 ) );
+            sLF_MA_shp_Q12 = silk_SUB32( sLF_AR_shp_Q12,  silk_LSHIFT( n_LF_Q10,   2 ) );
+
+            LTP_shp_buf_idx = ( LTP_shp_buf_idx - 1 ) & LTP_MASK;
+            LTP_shp_buf[ LTP_shp_buf_idx ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( sLF_MA_shp_Q12, 12 ) );
+
+            xw_Q3[i] = silk_RSHIFT_ROUND( silk_SUB32( sLF_MA_shp_Q12, n_LTP_Q12 ), 9 );
+        }
+    }
+    else
+    {
+        for( i = 0; i < length; i++ ) {
+            /* lag <= 0: harmonic shaping is off, i.e. the LTP contribution is zero, */
+            /* so it is simply left out of the xw_Q3 computation below.              */
+
+            n_Tilt_Q10 = silk_SMULWB( sLF_AR_shp_Q12, Tilt_Q14 );
+            n_LF_Q10   = silk_SMLAWB( silk_SMULWT( sLF_AR_shp_Q12, LF_shp_Q14 ), sLF_MA_shp_Q12, LF_shp_Q14 );
+
+            sLF_AR_shp_Q12 = silk_SUB32( st_res_Q12[ i ], silk_LSHIFT( n_Tilt_Q10, 2 ) );
+            sLF_MA_shp_Q12 = silk_SUB32( sLF_AR_shp_Q12,  silk_LSHIFT( n_LF_Q10,   2 ) );
+
+            LTP_shp_buf_idx = ( LTP_shp_buf_idx - 1 ) & LTP_MASK;
+            LTP_shp_buf[ LTP_shp_buf_idx ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( sLF_MA_shp_Q12, 12 ) );
+
+            xw_Q3[i] = silk_RSHIFT_ROUND( sLF_MA_shp_Q12, 9 );
+        }
+    }
+
+    /* Copy temp variable back to state */
+    P->sLF_AR_shp_Q12   = sLF_AR_shp_Q12;
+    P->sLF_MA_shp_Q12   = sLF_MA_shp_Q12;
+    P->sLTP_shp_buf_idx = LTP_shp_buf_idx;
+}
+
+#endif /* __PREFILTER_FIX_MIPSR1_H__ */
diff --git a/silk/fixed/mips/warped_autocorrelation_FIX_mipsr1.h b/silk/fixed/mips/warped_autocorrelation_FIX_mipsr1.h
new file mode 100644
index 0000000..e803ef0
--- /dev/null
+++ b/silk/fixed/mips/warped_autocorrelation_FIX_mipsr1.h
@@ -0,0 +1,165 @@
+/***********************************************************************
+Copyright (c) 2006-2011, Skype Limited. All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+- Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+- Neither the name of Internet Society, IETF or IETF Trust, nor the
+names of specific contributors, may be used to endorse or promote
+products derived from this software without specific prior written
+permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+***********************************************************************/
+
+#ifndef __WARPED_AUTOCORRELATION_FIX_MIPSR1_H__
+#define __WARPED_AUTOCORRELATION_FIX_MIPSR1_H__
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "main_FIX.h"
+
+#undef QC
+#define QC  10
+
+#undef QS
+#define QS  14
+
+/* Autocorrelations for a warped frequency axis (four samples per pass, 64-bit sums via __builtin_mips_madd) */
+#define OVERRIDE_silk_warped_autocorrelation_FIX
+void silk_warped_autocorrelation_FIX(
+          opus_int32                *corr,                                  /* O    Result [order + 1]                                                          */
+          opus_int                  *scale,                                 /* O    Scaling of the correlation vector                                           */
+    const opus_int16                *input,                                 /* I    Input data to correlate                                                     */
+    const opus_int                  warping_Q16,                            /* I    Warping coefficient                                                         */
+    const opus_int                  length,                                 /* I    Length of input                                                             */
+    const opus_int                  order                                   /* I    Correlation order (even)                                                    */
+)
+{
+    opus_int   n, i, lsh;
+    opus_int32 tmp1_QS=0, tmp2_QS=0, tmp3_QS=0, tmp4_QS=0, tmp5_QS=0, tmp6_QS=0, tmp7_QS=0, tmp8_QS=0, start_1=0, start_2=0, start_3=0;
+    opus_int32 state_QS[ MAX_SHAPE_LPC_ORDER + 1 ] = { 0 };
+    opus_int64 corr_QC[  MAX_SHAPE_LPC_ORDER + 1 ] = { 0 };
+    opus_int64 temp64;
+    /* val = 2 * QS - QC: shift applied to the raw 64-bit sums before they are normalised */
+    opus_int32 val;
+    val = 2 * QS - QC;
+
+    /* Order must be even */
+    silk_assert( ( order & 1 ) == 0 );
+    silk_assert( 2 * QS - QC >= 0 );
+
+    /* Loop over samples, four per pass; a remainder of length % 4 samples is left to the scalar loop below */
+    for( n = 0; n < length - 3; n += 4 ) {
+        /* start_1..start_3 keep the undelayed inputs of samples n..n+2; sample n+3 uses state_QS[ 0 ] instead */
+        tmp1_QS = silk_LSHIFT32( (opus_int32)input[ n ], QS );
+        start_1 = tmp1_QS;
+        tmp3_QS = silk_LSHIFT32( (opus_int32)input[ n+1], QS );
+        start_2 = tmp3_QS;
+        tmp5_QS = silk_LSHIFT32( (opus_int32)input[ n+2], QS );
+        start_3 = tmp5_QS;
+        tmp7_QS = silk_LSHIFT32( (opus_int32)input[ n+3], QS );
+
+        /* Loop over allpass sections */
+        for( i = 0; i < order; i += 2 ) {
+            /* Output of allpass section */
+            tmp2_QS = silk_SMLAWB( state_QS[ i ], state_QS[ i + 1 ] - tmp1_QS, warping_Q16 );
+            corr_QC[  i ] = __builtin_mips_madd( corr_QC[  i ], tmp1_QS,  start_1);
+
+            tmp4_QS = silk_SMLAWB( tmp1_QS, tmp2_QS - tmp3_QS, warping_Q16 );
+            corr_QC[  i ] = __builtin_mips_madd( corr_QC[  i ], tmp3_QS,  start_2);
+
+            tmp6_QS = silk_SMLAWB( tmp3_QS, tmp4_QS - tmp5_QS, warping_Q16 );
+            corr_QC[  i ] = __builtin_mips_madd( corr_QC[  i ], tmp5_QS,  start_3);
+
+            tmp8_QS = silk_SMLAWB( tmp5_QS, tmp6_QS - tmp7_QS, warping_Q16 );
+            state_QS[ i ]  = tmp7_QS;
+            corr_QC[  i ] = __builtin_mips_madd( corr_QC[  i ], tmp7_QS, state_QS[0]);
+
+            /* Output of allpass section */
+            tmp1_QS = silk_SMLAWB( state_QS[ i + 1 ], state_QS[ i + 2 ] - tmp2_QS, warping_Q16 );
+            corr_QC[  i+1 ] = __builtin_mips_madd( corr_QC[  i+1 ], tmp2_QS,  start_1);
+
+            tmp3_QS = silk_SMLAWB( tmp2_QS, tmp1_QS - tmp4_QS, warping_Q16 );
+            corr_QC[  i+1 ] = __builtin_mips_madd( corr_QC[  i+1 ], tmp4_QS,  start_2);
+
+            tmp5_QS = silk_SMLAWB( tmp4_QS, tmp3_QS - tmp6_QS, warping_Q16 );
+            corr_QC[  i+1 ] = __builtin_mips_madd( corr_QC[  i+1 ], tmp6_QS,  start_3);
+
+            tmp7_QS = silk_SMLAWB( tmp6_QS, tmp5_QS - tmp8_QS, warping_Q16 );
+            state_QS[ i + 1 ]  = tmp8_QS;
+            corr_QC[  i+1 ] = __builtin_mips_madd( corr_QC[  i+1 ], tmp8_QS,  state_QS[ 0 ]);
+
+        }
+        state_QS[ order ] = tmp7_QS;
+
+        corr_QC[  order ] = __builtin_mips_madd( corr_QC[  order ], tmp1_QS,  start_1);
+        corr_QC[  order ] = __builtin_mips_madd( corr_QC[  order ], tmp3_QS,  start_2);
+        corr_QC[  order ] = __builtin_mips_madd( corr_QC[  order ], tmp5_QS,  start_3);
+        corr_QC[  order ] = __builtin_mips_madd( corr_QC[  order ], tmp7_QS,  state_QS[ 0 ]);
+    }
+
+    for( ; n < length; n++ ) {
+        /* Scalar tail: one sample per pass for whatever the four-sample loop above left over */
+        tmp1_QS = silk_LSHIFT32( (opus_int32)input[ n ], QS );
+
+        /* Loop over allpass sections */
+        for( i = 0; i < order; i += 2 ) {
+
+            /* Output of allpass section */
+            tmp2_QS = silk_SMLAWB( state_QS[ i ], state_QS[ i + 1 ] - tmp1_QS, warping_Q16 );
+            state_QS[ i ] = tmp1_QS;
+            corr_QC[  i ] = __builtin_mips_madd( corr_QC[  i ], tmp1_QS,   state_QS[ 0 ]);
+
+            /* Output of allpass section */
+            tmp1_QS = silk_SMLAWB( state_QS[ i + 1 ], state_QS[ i + 2 ] - tmp2_QS, warping_Q16 );
+            state_QS[ i + 1 ]  = tmp2_QS;
+            corr_QC[  i+1 ] = __builtin_mips_madd( corr_QC[  i+1 ], tmp2_QS,   state_QS[ 0 ]);
+        }
+        state_QS[ order ] = tmp1_QS;
+        corr_QC[  order ] = __builtin_mips_madd( corr_QC[  order ], tmp1_QS,   state_QS[ 0 ]);
+    }
+
+    temp64 =  corr_QC[ 0 ];
+    temp64 = __builtin_mips_shilo(temp64, val);
+
+    lsh = silk_CLZ64( temp64 ) - 35;
+    lsh = silk_LIMIT( lsh, -12 - QC, 30 - QC );
+    *scale = -( QC + lsh );
+    silk_assert( *scale >= -30 && *scale <= 12 );
+    /*
+     * Bring every lag to the common output scale. __builtin_mips_shilo is
+     * handed the signed amount -lsh, so this one loop covers both the
+     * lsh >= 0 and the lsh < 0 case without a separate branch per sign.
+     */
+    for( i = 0; i < order + 1; i++ ) {
+        /* Shift right by val = 2 * QS - QC (left if val were negative) */
+        temp64 = corr_QC[ i ];
+        temp64 = (val >= 0) ? (temp64 >> val) : (temp64 << -val);
+
+        /* Apply the normalisation shift and narrow to 32 bits */
+        corr[ i ] = (opus_int32)silk_CHECK_FIT32( __builtin_mips_shilo( temp64, -lsh ) );
+    }
+
+    /* Scale lag 0 in place as well; from this point on it is only read by the */
+    /* sanity check below.                                                      */
+    corr_QC[ 0 ] = __builtin_mips_shilo(corr_QC[ 0 ], val);
+
+    silk_assert( corr_QC[ 0 ] >= 0 ); /* If breaking, decrease QC*/
+}
+#endif /* __WARPED_AUTOCORRELATION_FIX_MIPSR1_H__ */
diff --git a/silk/fixed/noise_shape_analysis_FIX.c b/silk/fixed/noise_shape_analysis_FIX.c
index e24d2e9..22a89f7 100644
--- a/silk/fixed/noise_shape_analysis_FIX.c
+++ b/silk/fixed/noise_shape_analysis_FIX.c
@@ -138,9 +138,14 @@
     silk_assert( 0 );
 }
 
+#if defined(MIPSr1_ASM)
+#include "mips/noise_shape_analysis_FIX_mipsr1.h"
+#endif
+
 /**************************************************************/
 /* Compute noise shaping coefficients and initial gain values */
 /**************************************************************/
+#ifndef OVERRIDE_silk_noise_shape_analysis_FIX
 void silk_noise_shape_analysis_FIX(
     silk_encoder_state_FIX          *psEnc,                                 /* I/O  Encoder state FIX                                                           */
     silk_encoder_control_FIX        *psEncCtrl,                             /* I/O  Encoder control FIX                                                         */
@@ -443,3 +448,4 @@
     }
     RESTORE_STACK;
 }
+#endif /* OVERRIDE_silk_noise_shape_analysis_FIX */
diff --git a/silk/fixed/pitch_analysis_core_FIX.c b/silk/fixed/pitch_analysis_core_FIX.c
index 1641a0f..01bb9fc 100644
--- a/silk/fixed/pitch_analysis_core_FIX.c
+++ b/silk/fixed/pitch_analysis_core_FIX.c
@@ -72,7 +72,8 @@
     opus_int          start_lag,                       /* I lag offset to search around */
     opus_int          sf_length,                       /* I length of one 5 ms subframe */
     opus_int          nb_subfr,                        /* I number of subframes         */
-    opus_int          complexity                       /* I Complexity setting          */
+    opus_int          complexity,                      /* I Complexity setting          */
+    int               arch                             /* I Run-time architecture       */
 );
 
 /*************************************************************/
@@ -195,8 +196,8 @@
 
         /* Calculate first vector products before loop */
         cross_corr = xcorr32[ MAX_LAG_4KHZ - MIN_LAG_4KHZ ];
-        normalizer = silk_inner_prod_aligned( target_ptr, target_ptr, SF_LENGTH_8KHZ );
-        normalizer = silk_ADD32( normalizer, silk_inner_prod_aligned( basis_ptr,  basis_ptr, SF_LENGTH_8KHZ ) );
+        normalizer = silk_inner_prod_aligned( target_ptr, target_ptr, SF_LENGTH_8KHZ, arch );
+        normalizer = silk_ADD32( normalizer, silk_inner_prod_aligned( basis_ptr,  basis_ptr, SF_LENGTH_8KHZ, arch ) );
         normalizer = silk_ADD32( normalizer, silk_SMULBB( SF_LENGTH_8KHZ, 4000 ) );
 
         matrix_ptr( C, k, 0, CSTRIDE_4KHZ ) =
@@ -334,7 +335,7 @@
         silk_assert( target_ptr >= frame_8kHz );
         silk_assert( target_ptr + SF_LENGTH_8KHZ <= frame_8kHz + frame_length_8kHz );
 
-        energy_target = silk_ADD32( silk_inner_prod_aligned( target_ptr, target_ptr, SF_LENGTH_8KHZ ), 1 );
+        energy_target = silk_ADD32( silk_inner_prod_aligned( target_ptr, target_ptr, SF_LENGTH_8KHZ, arch ), 1 );
         for( j = 0; j < length_d_comp; j++ ) {
             d = d_comp[ j ];
             basis_ptr = target_ptr - d;
@@ -343,9 +344,9 @@
             silk_assert( basis_ptr >= frame_8kHz );
             silk_assert( basis_ptr + SF_LENGTH_8KHZ <= frame_8kHz + frame_length_8kHz );
 
-            cross_corr = silk_inner_prod_aligned( target_ptr, basis_ptr, SF_LENGTH_8KHZ );
+            cross_corr = silk_inner_prod_aligned( target_ptr, basis_ptr, SF_LENGTH_8KHZ, arch );
             if( cross_corr > 0 ) {
-                energy_basis = silk_inner_prod_aligned( basis_ptr, basis_ptr, SF_LENGTH_8KHZ );
+                energy_basis = silk_inner_prod_aligned( basis_ptr, basis_ptr, SF_LENGTH_8KHZ, arch );
                 matrix_ptr( C, k, d - ( MIN_LAG_8KHZ - 2 ), CSTRIDE_8KHZ ) =
                     (opus_int16)silk_DIV32_varQ( cross_corr,
                                                  silk_ADD32( energy_target,
@@ -519,14 +520,14 @@
         ALLOC( energies_st3, nb_subfr * nb_cbk_search, silk_pe_stage3_vals );
         ALLOC( cross_corr_st3, nb_subfr * nb_cbk_search, silk_pe_stage3_vals );
         silk_P_Ana_calc_corr_st3(  cross_corr_st3, input_frame_ptr, start_lag, sf_length, nb_subfr, complexity, arch );
-        silk_P_Ana_calc_energy_st3( energies_st3, input_frame_ptr, start_lag, sf_length, nb_subfr, complexity );
+        silk_P_Ana_calc_energy_st3( energies_st3, input_frame_ptr, start_lag, sf_length, nb_subfr, complexity, arch );
 
         lag_counter = 0;
         silk_assert( lag == silk_SAT16( lag ) );
         contour_bias_Q15 = silk_DIV32_16( SILK_FIX_CONST( PE_FLATCONTOUR_BIAS, 15 ), lag );
 
         target_ptr = &input_frame_ptr[ PE_LTP_MEM_LENGTH_MS * Fs_kHz ];
-        energy_target = silk_ADD32( silk_inner_prod_aligned( target_ptr, target_ptr, nb_subfr * sf_length ), 1 );
+        energy_target = silk_ADD32( silk_inner_prod_aligned( target_ptr, target_ptr, nb_subfr * sf_length, arch ), 1 );
         for( d = start_lag; d <= end_lag; d++ ) {
             for( j = 0; j < nb_cbk_search; j++ ) {
                 cross_corr = 0;
@@ -671,7 +672,8 @@
     opus_int          start_lag,                        /* I lag offset to search around */
     opus_int          sf_length,                        /* I length of one 5 ms subframe */
     opus_int          nb_subfr,                         /* I number of subframes         */
-    opus_int          complexity                        /* I Complexity setting          */
+    opus_int          complexity,                       /* I Complexity setting          */
+    int               arch                              /* I Run-time architecture       */
 )
 {
     const opus_int16 *target_ptr, *basis_ptr;
@@ -705,7 +707,7 @@
 
         /* Calculate the energy for first lag */
         basis_ptr = target_ptr - ( start_lag + matrix_ptr( Lag_range_ptr, k, 0, 2 ) );
-        energy = silk_inner_prod_aligned( basis_ptr, basis_ptr, sf_length );
+        energy = silk_inner_prod_aligned( basis_ptr, basis_ptr, sf_length, arch );
         silk_assert( energy >= 0 );
         scratch_mem[ lag_counter ] = energy;
         lag_counter++;
diff --git a/silk/fixed/prefilter_FIX.c b/silk/fixed/prefilter_FIX.c
index d381730..6a8e351 100644
--- a/silk/fixed/prefilter_FIX.c
+++ b/silk/fixed/prefilter_FIX.c
@@ -33,6 +33,16 @@
 #include "stack_alloc.h"
 #include "tuning_parameters.h"
 
+#if defined(MIPSr1_ASM)
+#include "mips/prefilter_FIX_mipsr1.h"
+#endif
+/* When no override was pulled in above, map the 8-argument call onto the generic */
+/* C routine: arch is evaluated, cast to void and not forwarded to the _c variant. */
+#if !defined(OVERRIDE_silk_warped_LPC_analysis_filter_FIX)
+#define silk_warped_LPC_analysis_filter_FIX(state, res_Q2, coef_Q13, input, lambda_Q16, length, order, arch) \
+    ((void)(arch),silk_warped_LPC_analysis_filter_FIX_c(state, res_Q2, coef_Q13, input, lambda_Q16, length, order))
+#endif /* !OVERRIDE_silk_warped_LPC_analysis_filter_FIX */
+
 /* Prefilter for finding Quantizer input signal */
 static OPUS_INLINE void silk_prefilt_FIX(
     silk_prefilter_state_FIX    *P,                         /* I/O  state                               */
@@ -45,7 +55,7 @@
     opus_int                    length                      /* I    Length of signals                   */
 );
 
-void silk_warped_LPC_analysis_filter_FIX(
+void silk_warped_LPC_analysis_filter_FIX_c(
           opus_int32            state[],                    /* I/O  State [order + 1]                   */
           opus_int32            res_Q2[],                   /* O    Residual signal [length]            */
     const opus_int16            coef_Q13[],                 /* I    Coefficients [order]                */
@@ -130,7 +140,7 @@
 
         /* Short term FIR filtering*/
         silk_warped_LPC_analysis_filter_FIX( P->sAR_shp, st_res_Q2, AR1_shp_Q13, px,
-            psEnc->sCmn.warping_Q16, psEnc->sCmn.subfr_length, psEnc->sCmn.shapingLPCOrder );
+            psEnc->sCmn.warping_Q16, psEnc->sCmn.subfr_length, psEnc->sCmn.shapingLPCOrder, psEnc->sCmn.arch );
 
         /* Reduce (mainly) low frequencies during harmonic emphasis */
         B_Q10[ 0 ] = silk_RSHIFT_ROUND( psEncCtrl->GainsPre_Q14[ k ], 4 );
@@ -155,6 +165,7 @@
     RESTORE_STACK;
 }
 
+#ifndef OVERRIDE_silk_prefilt_FIX
 /* Prefilter for finding Quantizer input signal */
 static OPUS_INLINE void silk_prefilt_FIX(
     silk_prefilter_state_FIX    *P,                         /* I/O  state                               */
@@ -207,3 +218,4 @@
     P->sLF_MA_shp_Q12   = sLF_MA_shp_Q12;
     P->sLTP_shp_buf_idx = LTP_shp_buf_idx;
 }
+#endif /* OVERRIDE_silk_prefilt_FIX */
diff --git a/silk/fixed/residual_energy_FIX.c b/silk/fixed/residual_energy_FIX.c
index 105ae31..41f7477 100644
--- a/silk/fixed/residual_energy_FIX.c
+++ b/silk/fixed/residual_energy_FIX.c
@@ -42,7 +42,8 @@
     const opus_int32                gains[ MAX_NB_SUBFR ],                  /* I    Quantization gains                                                          */
     const opus_int                  subfr_length,                           /* I    Subframe length                                                             */
     const opus_int                  nb_subfr,                               /* I    Number of subframes                                                         */
-    const opus_int                  LPC_order                               /* I    LPC order                                                                   */
+    const opus_int                  LPC_order,                              /* I    LPC order                                                                   */
+          int                       arch                                    /* I    Run-time architecture                                                       */
 )
 {
     opus_int         offset, i, j, rshift, lz1, lz2;
@@ -60,7 +61,7 @@
     silk_assert( ( nb_subfr >> 1 ) * ( MAX_NB_SUBFR >> 1 ) == nb_subfr );
     for( i = 0; i < nb_subfr >> 1; i++ ) {
         /* Calculate half frame LPC residual signal including preceding samples */
-        silk_LPC_analysis_filter( LPC_res, x_ptr, a_Q12[ i ], ( MAX_NB_SUBFR >> 1 ) * offset, LPC_order );
+        silk_LPC_analysis_filter( LPC_res, x_ptr, a_Q12[ i ], ( MAX_NB_SUBFR >> 1 ) * offset, LPC_order, arch );
 
         /* Point to first subframe of the just calculated LPC residual signal */
         LPC_res_ptr = LPC_res + LPC_order;
diff --git a/silk/fixed/structs_FIX.h b/silk/fixed/structs_FIX.h
index 244b479..3294b25 100644
--- a/silk/fixed/structs_FIX.h
+++ b/silk/fixed/structs_FIX.h
@@ -116,6 +116,7 @@
 typedef struct {
     silk_encoder_state_FIX      state_Fxx[ ENCODER_NUM_CHANNELS ];
     stereo_enc_state            sStereo;
+    opus_int32                  nBitsUsedLBRR;
     opus_int32                  nBitsExceeded;
     opus_int                    nChannelsAPI;
     opus_int                    nChannelsInternal;
diff --git a/silk/fixed/vector_ops_FIX.c b/silk/fixed/vector_ops_FIX.c
index 509c8b3..d949800 100644
--- a/silk/fixed/vector_ops_FIX.c
+++ b/silk/fixed/vector_ops_FIX.c
@@ -30,6 +30,7 @@
 #endif
 
 #include "SigProc_FIX.h"
+#include "pitch.h"
 
 /* Copy and multiply a vector by a constant */
 void silk_scale_copy_vector16(
@@ -70,18 +71,23 @@
 opus_int32 silk_inner_prod_aligned(
     const opus_int16 *const     inVec1,             /*    I input vector 1                                              */
     const opus_int16 *const     inVec2,             /*    I input vector 2                                              */
-    const opus_int              len                 /*    I vector lengths                                              */
+    const opus_int              len,                /*    I vector lengths                                              */
+    int                         arch                /*    I Run-time architecture                                       */
 )
 {
+#ifdef FIXED_POINT
+   return celt_inner_prod(inVec1, inVec2, len, arch); /* hand the whole product to CELT's routine, passing arch through */
+#else /* !FIXED_POINT: plain scalar accumulation, arch is not used */
     opus_int   i;
     opus_int32 sum = 0;
     for( i = 0; i < len; i++ ) {
         sum = silk_SMLABB( sum, inVec1[ i ], inVec2[ i ] );
     }
     return sum;
+#endif /* FIXED_POINT */
 }
 
-opus_int64 silk_inner_prod16_aligned_64(
+opus_int64 silk_inner_prod16_aligned_64_c(
     const opus_int16            *inVec1,            /*    I input vector 1                                              */
     const opus_int16            *inVec2,            /*    I input vector 2                                              */
     const opus_int              len                 /*    I vector lengths                                              */
diff --git a/silk/fixed/warped_autocorrelation_FIX.c b/silk/fixed/warped_autocorrelation_FIX.c
index a4a579b..6ca6c11 100644
--- a/silk/fixed/warped_autocorrelation_FIX.c
+++ b/silk/fixed/warped_autocorrelation_FIX.c
@@ -34,6 +34,12 @@
 #define QC  10
 #define QS  14
 
+#if defined(MIPSr1_ASM)
+#include "mips/warped_autocorrelation_FIX_mipsr1.h"
+#endif
+
+
+#ifndef OVERRIDE_silk_warped_autocorrelation_FIX
 /* Autocorrelations for a warped frequency axis */
 void silk_warped_autocorrelation_FIX(
           opus_int32                *corr,                                  /* O    Result [order + 1]                                                          */
@@ -86,3 +92,4 @@
     }
     silk_assert( corr_QC[ 0 ] >= 0 ); /* If breaking, decrease QC*/
 }
+#endif /* OVERRIDE_silk_warped_autocorrelation_FIX */
diff --git a/silk/fixed/x86/burg_modified_FIX_sse.c b/silk/fixed/x86/burg_modified_FIX_sse.c
new file mode 100644
index 0000000..3756095
--- /dev/null
+++ b/silk/fixed/x86/burg_modified_FIX_sse.c
@@ -0,0 +1,375 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+   Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#include <smmintrin.h>
+
+#include "SigProc_FIX.h"
+#include "define.h"
+#include "tuning_parameters.h"
+#include "pitch.h"
+#include "celt/x86/x86cpu.h"
+
+#define MAX_FRAME_SIZE              384             /* subfr_length * nb_subfr = ( 0.005 * 16000 + 16 ) * 4 = 384 */
+
+#define QA                          25
+#define N_BITS_HEAD_ROOM            2
+#define MIN_RSHIFTS                 -16
+#define MAX_RSHIFTS                 (32 - QA)
+
+/* Compute prediction coefficients and residual energy from the input signal, one reflection coefficient per order; SSE4.1 intrinsics are used when rshifts <= -2 */
+void silk_burg_modified_sse4_1(
+    opus_int32                  *res_nrg,           /* O    Residual energy                                             */
+    opus_int                    *res_nrg_Q,         /* O    Residual energy Q value                                     */
+    opus_int32                  A_Q16[],            /* O    Prediction coefficients (length order)                      */
+    const opus_int16            x[],                /* I    Input signal, length: nb_subfr * subfr_length               */
+    const opus_int32            minInvGain_Q30,     /* I    Inverse of max prediction gain                              */
+    const opus_int              subfr_length,       /* I    Input signal subframe length (incl. D preceding samples)    */
+    const opus_int              nb_subfr,           /* I    Number of subframes stacked in x                            */
+    const opus_int              D,                  /* I    Order                                                       */
+    int                         arch                /* I    Run-time architecture                                       */
+)
+{
+    opus_int         k, n, s, lz, rshifts, rshifts_extra, reached_max_gain;
+    opus_int32       C0, num, nrg, rc_Q31, invGain_Q30, Atmp_QA, Atmp1, tmp1, tmp2, x1, x2;
+    const opus_int16 *x_ptr;
+    opus_int32       C_first_row[ SILK_MAX_ORDER_LPC ];
+    opus_int32       C_last_row[  SILK_MAX_ORDER_LPC ];
+    opus_int32       Af_QA[       SILK_MAX_ORDER_LPC ];
+    opus_int32       CAf[ SILK_MAX_ORDER_LPC + 1 ];
+    opus_int32       CAb[ SILK_MAX_ORDER_LPC + 1 ];
+    opus_int32       xcorr[ SILK_MAX_ORDER_LPC ];
+
+    __m128i FIRST_3210, LAST_3210, ATMP_3210, TMP1_3210, TMP2_3210, T1_3210, T2_3210, PTR_3210, SUBFR_3210, X1_3210, X2_3210;
+    __m128i CONST1 = _mm_set1_epi32(1);
+
+    silk_assert( subfr_length * nb_subfr <= MAX_FRAME_SIZE );
+
+    /* Compute autocorrelations, added over subframes */
+    silk_sum_sqr_shift( &C0, &rshifts, x, nb_subfr * subfr_length );
+    if( rshifts > MAX_RSHIFTS ) {
+        C0 = silk_LSHIFT32( C0, rshifts - MAX_RSHIFTS );
+        silk_assert( C0 > 0 );
+        rshifts = MAX_RSHIFTS;
+    } else {
+        lz = silk_CLZ32( C0 ) - 1;
+        rshifts_extra = N_BITS_HEAD_ROOM - lz;
+        if( rshifts_extra > 0 ) {
+            rshifts_extra = silk_min( rshifts_extra, MAX_RSHIFTS - rshifts );
+            C0 = silk_RSHIFT32( C0, rshifts_extra );
+        } else {
+            rshifts_extra = silk_max( rshifts_extra, MIN_RSHIFTS - rshifts );
+            C0 = silk_LSHIFT32( C0, -rshifts_extra );
+        }
+        rshifts += rshifts_extra;
+    }
+    CAb[ 0 ] = CAf[ 0 ] = C0 + silk_SMMUL( SILK_FIX_CONST( FIND_LPC_COND_FAC, 32 ), C0 ) + 1;                                /* Q(-rshifts) */
+    silk_memset( C_first_row, 0, SILK_MAX_ORDER_LPC * sizeof( opus_int32 ) );
+    if( rshifts > 0 ) {
+        for( s = 0; s < nb_subfr; s++ ) {
+            x_ptr = x + s * subfr_length;
+            for( n = 1; n < D + 1; n++ ) {
+                C_first_row[ n - 1 ] += (opus_int32)silk_RSHIFT64(
+                    silk_inner_prod16_aligned_64( x_ptr, x_ptr + n, subfr_length - n, arch ), rshifts );
+            }
+        }
+    } else {
+        for( s = 0; s < nb_subfr; s++ ) {
+            int i;
+            opus_int32 d;
+            x_ptr = x + s * subfr_length;
+            celt_pitch_xcorr(x_ptr, x_ptr + 1, xcorr, subfr_length - D, D, arch );
+            for( n = 1; n < D + 1; n++ ) {
+               for ( i = n + subfr_length - D, d = 0; i < subfr_length; i++ )
+                  d = MAC16_16( d, x_ptr[ i ], x_ptr[ i - n ] );
+               xcorr[ n - 1 ] += d;
+            }
+            for( n = 1; n < D + 1; n++ ) {
+                C_first_row[ n - 1 ] += silk_LSHIFT32( xcorr[ n - 1 ], -rshifts );
+            }
+        }
+    }
+    silk_memcpy( C_last_row, C_first_row, SILK_MAX_ORDER_LPC * sizeof( opus_int32 ) );
+
+    /* Initialize (CAb[ 0 ] and CAf[ 0 ] were already set to this same value above; C0 has not changed since) */
+    CAb[ 0 ] = CAf[ 0 ] = C0 + silk_SMMUL( SILK_FIX_CONST( FIND_LPC_COND_FAC, 32 ), C0 ) + 1;                                /* Q(-rshifts) */
+
+    invGain_Q30 = (opus_int32)1 << 30;
+    reached_max_gain = 0;
+    for( n = 0; n < D; n++ ) {
+        /* Update first row of correlation matrix (without first element) */
+        /* Update last row of correlation matrix (without last element, stored in reversed order) */
+        /* Update C * Af */
+        /* Update C * flipud(Af) (stored in reversed order) */
+        if( rshifts > -2 ) {
+            for( s = 0; s < nb_subfr; s++ ) {
+                x_ptr = x + s * subfr_length;
+                x1  = -silk_LSHIFT32( (opus_int32)x_ptr[ n ],                    16 - rshifts );        /* Q(16-rshifts) */
+                x2  = -silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n - 1 ], 16 - rshifts );        /* Q(16-rshifts) */
+                tmp1 = silk_LSHIFT32( (opus_int32)x_ptr[ n ],                    QA - 16 );             /* Q(QA-16) */
+                tmp2 = silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n - 1 ], QA - 16 );             /* Q(QA-16) */
+                for( k = 0; k < n; k++ ) {
+                    C_first_row[ k ] = silk_SMLAWB( C_first_row[ k ], x1, x_ptr[ n - k - 1 ]            ); /* Q( -rshifts ) */
+                    C_last_row[ k ]  = silk_SMLAWB( C_last_row[ k ],  x2, x_ptr[ subfr_length - n + k ] ); /* Q( -rshifts ) */
+                    Atmp_QA = Af_QA[ k ];
+                    tmp1 = silk_SMLAWB( tmp1, Atmp_QA, x_ptr[ n - k - 1 ]            );                 /* Q(QA-16) */
+                    tmp2 = silk_SMLAWB( tmp2, Atmp_QA, x_ptr[ subfr_length - n + k ] );                 /* Q(QA-16) */
+                }
+                tmp1 = silk_LSHIFT32( -tmp1, 32 - QA - rshifts );                                       /* Q(16-rshifts) */
+                tmp2 = silk_LSHIFT32( -tmp2, 32 - QA - rshifts );                                       /* Q(16-rshifts) */
+                for( k = 0; k <= n; k++ ) {
+                    CAf[ k ] = silk_SMLAWB( CAf[ k ], tmp1, x_ptr[ n - k ]                    );        /* Q( -rshifts ) */
+                    CAb[ k ] = silk_SMLAWB( CAb[ k ], tmp2, x_ptr[ subfr_length - n + k - 1 ] );        /* Q( -rshifts ) */
+                }
+            }
+        } else {
+            for( s = 0; s < nb_subfr; s++ ) {
+                x_ptr = x + s * subfr_length;
+                x1  = -silk_LSHIFT32( (opus_int32)x_ptr[ n ],                    -rshifts );            /* Q( -rshifts ) */
+                x2  = -silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n - 1 ], -rshifts );            /* Q( -rshifts ) */
+                tmp1 = silk_LSHIFT32( (opus_int32)x_ptr[ n ],                    17 );                  /* Q17 */
+                tmp2 = silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n - 1 ], 17 );                  /* Q17 */
+                /* SSE4.1 part: four lags k per iteration; the lags left over are handled by the scalar loop further down */
+                X1_3210 = _mm_set1_epi32( x1 );
+                X2_3210 = _mm_set1_epi32( x2 );
+                TMP1_3210 = _mm_setzero_si128();
+                TMP2_3210 = _mm_setzero_si128();
+                for( k = 0; k < n - 3; k += 4 ) {
+                    PTR_3210   = OP_CVTEPI16_EPI32_M64( &x_ptr[ n - k - 1 - 3 ] );
+                    SUBFR_3210 = OP_CVTEPI16_EPI32_M64( &x_ptr[ subfr_length - n + k ] );
+                    FIRST_3210 = _mm_loadu_si128( (__m128i *)&C_first_row[ k ] );
+                    PTR_3210   = _mm_shuffle_epi32( PTR_3210,  _MM_SHUFFLE( 0, 1, 2, 3 ) );
+                    LAST_3210  = _mm_loadu_si128( (__m128i *)&C_last_row[ k ] );
+                    ATMP_3210  = _mm_loadu_si128( (__m128i *)&Af_QA[ k ] );
+
+                    T1_3210 = _mm_mullo_epi32( PTR_3210, X1_3210 );
+                    T2_3210 = _mm_mullo_epi32( SUBFR_3210, X2_3210 );
+                    /* Round Af_QA to Q17: ( ( a >> 7 ) + 1 ) >> 1, matching silk_RSHIFT_ROUND( a, QA - 17 ) in the scalar loop */
+                    ATMP_3210 = _mm_srai_epi32( ATMP_3210, 7 );
+                    ATMP_3210 = _mm_add_epi32( ATMP_3210, CONST1 );
+                    ATMP_3210 = _mm_srai_epi32( ATMP_3210, 1 );
+
+                    FIRST_3210 = _mm_add_epi32( FIRST_3210, T1_3210 );
+                    LAST_3210 = _mm_add_epi32( LAST_3210, T2_3210 );
+
+                    PTR_3210   = _mm_mullo_epi32( ATMP_3210, PTR_3210 );
+                    SUBFR_3210   = _mm_mullo_epi32( ATMP_3210, SUBFR_3210 );
+
+                    _mm_storeu_si128( (__m128i *)&C_first_row[ k ], FIRST_3210 );
+                    _mm_storeu_si128( (__m128i *)&C_last_row[ k ], LAST_3210 );
+
+                    TMP1_3210 = _mm_add_epi32( TMP1_3210, PTR_3210 );
+                    TMP2_3210 = _mm_add_epi32( TMP2_3210, SUBFR_3210 );
+                }
+                /* Horizontal add: fold the four 32-bit partial sums of each accumulator into lane 0 */
+                TMP1_3210 = _mm_add_epi32( TMP1_3210, _mm_unpackhi_epi64(TMP1_3210, TMP1_3210 ) );
+                TMP2_3210 = _mm_add_epi32( TMP2_3210, _mm_unpackhi_epi64(TMP2_3210, TMP2_3210 ) );
+                TMP1_3210 = _mm_add_epi32( TMP1_3210, _mm_shufflelo_epi16(TMP1_3210, 0x0E ) );
+                TMP2_3210 = _mm_add_epi32( TMP2_3210, _mm_shufflelo_epi16(TMP2_3210, 0x0E ) );
+
+                tmp1 += _mm_cvtsi128_si32( TMP1_3210 );
+                tmp2 += _mm_cvtsi128_si32( TMP2_3210 );
+                /* Scalar loop for the lags the vector loop did not cover */
+                for( ; k < n; k++ ) {
+                    C_first_row[ k ] = silk_MLA( C_first_row[ k ], x1, x_ptr[ n - k - 1 ]            ); /* Q( -rshifts ) */
+                    C_last_row[ k ]  = silk_MLA( C_last_row[ k ],  x2, x_ptr[ subfr_length - n + k ] ); /* Q( -rshifts ) */
+                    Atmp1 = silk_RSHIFT_ROUND( Af_QA[ k ], QA - 17 );                                   /* Q17 */
+                    tmp1 = silk_MLA( tmp1, x_ptr[ n - k - 1 ],            Atmp1 );                      /* Q17 */
+                    tmp2 = silk_MLA( tmp2, x_ptr[ subfr_length - n + k ], Atmp1 );                      /* Q17 */
+                }
+
+                tmp1 = -tmp1;                /* Q17 */
+                tmp2 = -tmp2;                /* Q17 */
+                /* Update CAf[ 0..n ] and CAb[ 0..n ]: four entries per vector iteration, then a scalar loop for the rest */
+                {
+                    __m128i xmm_tmp1, xmm_tmp2;
+                    __m128i xmm_x_ptr_n_k_x2x0, xmm_x_ptr_n_k_x3x1;
+                    __m128i xmm_x_ptr_sub_x2x0, xmm_x_ptr_sub_x3x1;
+
+                    xmm_tmp1 = _mm_set1_epi32( tmp1 );
+                    xmm_tmp2 = _mm_set1_epi32( tmp2 );
+
+                    for( k = 0; k <= n - 3; k += 4 ) {
+                        xmm_x_ptr_n_k_x2x0 = OP_CVTEPI16_EPI32_M64( &x_ptr[ n - k - 3 ] );
+                        xmm_x_ptr_sub_x2x0 = OP_CVTEPI16_EPI32_M64( &x_ptr[ subfr_length - n + k - 1 ] );
+                        /* reverse the lane order so that lane j corresponds to x_ptr[ n - k - j ] */
+                        xmm_x_ptr_n_k_x2x0 = _mm_shuffle_epi32( xmm_x_ptr_n_k_x2x0, _MM_SHUFFLE( 0, 1, 2, 3 ) );
+
+                        xmm_x_ptr_n_k_x2x0 = _mm_slli_epi32( xmm_x_ptr_n_k_x2x0, -rshifts - 1 );
+                        xmm_x_ptr_sub_x2x0 = _mm_slli_epi32( xmm_x_ptr_sub_x2x0, -rshifts - 1 );
+
+                        /* rotate the lanes down by one (for lanes 0..2 the same as _mm_srli_si128( x, 4 )) so the odd elements sit in lanes 0 and 2 */
+                        xmm_x_ptr_n_k_x3x1 = _mm_shuffle_epi32( xmm_x_ptr_n_k_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
+                        xmm_x_ptr_sub_x3x1 = _mm_shuffle_epi32( xmm_x_ptr_sub_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
+                        /* _mm_mul_epi32 multiplies lanes 0 and 2 only, giving 64-bit products: x2x0 covers the even elements, x3x1 the odd ones */
+                        xmm_x_ptr_n_k_x2x0 = _mm_mul_epi32( xmm_x_ptr_n_k_x2x0, xmm_tmp1 );
+                        xmm_x_ptr_n_k_x3x1 = _mm_mul_epi32( xmm_x_ptr_n_k_x3x1, xmm_tmp1 );
+                        xmm_x_ptr_sub_x2x0 = _mm_mul_epi32( xmm_x_ptr_sub_x2x0, xmm_tmp2 );
+                        xmm_x_ptr_sub_x3x1 = _mm_mul_epi32( xmm_x_ptr_sub_x3x1, xmm_tmp2 );
+                        /* place bits 16..47 of each product (product >> 16) in the low half for even results and in the high half for odd results */
+                        xmm_x_ptr_n_k_x2x0 = _mm_srli_epi64( xmm_x_ptr_n_k_x2x0, 16 );
+                        xmm_x_ptr_n_k_x3x1 = _mm_slli_epi64( xmm_x_ptr_n_k_x3x1, 16 );
+                        xmm_x_ptr_sub_x2x0 = _mm_srli_epi64( xmm_x_ptr_sub_x2x0, 16 );
+                        xmm_x_ptr_sub_x3x1 = _mm_slli_epi64( xmm_x_ptr_sub_x3x1, 16 );
+                        /* interleave: 32-bit lanes 0 and 2 from the even results, lanes 1 and 3 from the odd results */
+                        xmm_x_ptr_n_k_x2x0 = _mm_blend_epi16( xmm_x_ptr_n_k_x2x0, xmm_x_ptr_n_k_x3x1, 0xCC );
+                        xmm_x_ptr_sub_x2x0 = _mm_blend_epi16( xmm_x_ptr_sub_x2x0, xmm_x_ptr_sub_x3x1, 0xCC );
+
+                        X1_3210  = _mm_loadu_si128( (__m128i *)&CAf[ k ] );
+                        PTR_3210 = _mm_loadu_si128( (__m128i *)&CAb[ k ] );
+
+                        X1_3210  = _mm_add_epi32( X1_3210, xmm_x_ptr_n_k_x2x0 );
+                        PTR_3210 = _mm_add_epi32( PTR_3210, xmm_x_ptr_sub_x2x0 );
+
+                        _mm_storeu_si128( (__m128i *)&CAf[ k ], X1_3210 );
+                        _mm_storeu_si128( (__m128i *)&CAb[ k ], PTR_3210 );
+                    }
+
+                    for( ; k <= n; k++ ) {
+                        CAf[ k ] = silk_SMLAWW( CAf[ k ], tmp1,
+                            silk_LSHIFT32( (opus_int32)x_ptr[ n - k ], -rshifts - 1 ) );                    /* Q( -rshifts ) */
+                        CAb[ k ] = silk_SMLAWW( CAb[ k ], tmp2,
+                            silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n + k - 1 ], -rshifts - 1 ) ); /* Q( -rshifts ) */
+                    }
+                }
+            }
+        }
+
+        /* Calculate numerator and denominator for the next order reflection (parcor) coefficient */
+        tmp1 = C_first_row[ n ];                                                                        /* Q( -rshifts ) */
+        tmp2 = C_last_row[ n ];                                                                         /* Q( -rshifts ) */
+        num  = 0;                                                                                       /* Q( -rshifts ) */
+        nrg  = silk_ADD32( CAb[ 0 ], CAf[ 0 ] );                                                        /* Q( 1-rshifts ) */
+        for( k = 0; k < n; k++ ) {
+            Atmp_QA = Af_QA[ k ];
+            lz = silk_CLZ32( silk_abs( Atmp_QA ) ) - 1;
+            lz = silk_min( 32 - QA, lz );
+            Atmp1 = silk_LSHIFT32( Atmp_QA, lz );                                                       /* Q( QA + lz ) */
+
+            tmp1 = silk_ADD_LSHIFT32( tmp1, silk_SMMUL( C_last_row[  n - k - 1 ], Atmp1 ), 32 - QA - lz );  /* Q( -rshifts ) */
+            tmp2 = silk_ADD_LSHIFT32( tmp2, silk_SMMUL( C_first_row[ n - k - 1 ], Atmp1 ), 32 - QA - lz );  /* Q( -rshifts ) */
+            num  = silk_ADD_LSHIFT32( num,  silk_SMMUL( CAb[ n - k ],             Atmp1 ), 32 - QA - lz );  /* Q( -rshifts ) */
+            nrg  = silk_ADD_LSHIFT32( nrg,  silk_SMMUL( silk_ADD32( CAb[ k + 1 ], CAf[ k + 1 ] ),
+                                                                                Atmp1 ), 32 - QA - lz );    /* Q( 1-rshifts ) */
+        }
+        CAf[ n + 1 ] = tmp1;                                                                            /* Q( -rshifts ) */
+        CAb[ n + 1 ] = tmp2;                                                                            /* Q( -rshifts ) */
+        num = silk_ADD32( num, tmp2 );                                                                  /* Q( -rshifts ) */
+        num = silk_LSHIFT32( -num, 1 );                                                                 /* Q( 1-rshifts ) */
+
+        /* Calculate the next order reflection (parcor) coefficient */
+        if( silk_abs( num ) < nrg ) {
+            rc_Q31 = silk_DIV32_varQ( num, nrg, 31 );
+        } else {
+            rc_Q31 = ( num > 0 ) ? silk_int32_MAX : silk_int32_MIN;
+        }
+
+        /* Update inverse prediction gain */
+        tmp1 = ( (opus_int32)1 << 30 ) - silk_SMMUL( rc_Q31, rc_Q31 );
+        tmp1 = silk_LSHIFT( silk_SMMUL( invGain_Q30, tmp1 ), 2 );
+        if( tmp1 <= minInvGain_Q30 ) {
+            /* Max prediction gain exceeded; set reflection coefficient such that max prediction gain is exactly hit */
+            tmp2 = ( (opus_int32)1 << 30 ) - silk_DIV32_varQ( minInvGain_Q30, invGain_Q30, 30 );            /* Q30 */
+            rc_Q31 = silk_SQRT_APPROX( tmp2 );                                                  /* Q15 */
+            /* Newton-Raphson iteration */
+            rc_Q31 = silk_RSHIFT32( rc_Q31 + silk_DIV32( tmp2, rc_Q31 ), 1 );                   /* Q15 */
+            rc_Q31 = silk_LSHIFT32( rc_Q31, 16 );                                               /* Q31 */
+            if( num < 0 ) {
+                /* Ensure the adjusted reflection coefficient keeps the original sign */
+                rc_Q31 = -rc_Q31;
+            }
+            invGain_Q30 = minInvGain_Q30;
+            reached_max_gain = 1;
+        } else {
+            invGain_Q30 = tmp1;
+        }
+
+        /* Update the AR coefficients */
+        for( k = 0; k < (n + 1) >> 1; k++ ) {
+            tmp1 = Af_QA[ k ];                                                                  /* QA */
+            tmp2 = Af_QA[ n - k - 1 ];                                                          /* QA */
+            Af_QA[ k ]         = silk_ADD_LSHIFT32( tmp1, silk_SMMUL( tmp2, rc_Q31 ), 1 );      /* QA */
+            Af_QA[ n - k - 1 ] = silk_ADD_LSHIFT32( tmp2, silk_SMMUL( tmp1, rc_Q31 ), 1 );      /* QA */
+        }
+        Af_QA[ n ] = silk_RSHIFT32( rc_Q31, 31 - QA );                                          /* QA */
+
+        if( reached_max_gain ) {
+            /* Reached max prediction gain; set remaining coefficients to zero and exit loop */
+            for( k = n + 1; k < D; k++ ) {
+                Af_QA[ k ] = 0;
+            }
+            break;
+        }
+
+        /* Update C * Af and C * Ab */
+        for( k = 0; k <= n + 1; k++ ) {
+            tmp1 = CAf[ k ];                                                                    /* Q( -rshifts ) */
+            tmp2 = CAb[ n - k + 1 ];                                                            /* Q( -rshifts ) */
+            CAf[ k ]         = silk_ADD_LSHIFT32( tmp1, silk_SMMUL( tmp2, rc_Q31 ), 1 );        /* Q( -rshifts ) */
+            CAb[ n - k + 1 ] = silk_ADD_LSHIFT32( tmp2, silk_SMMUL( tmp1, rc_Q31 ), 1 );        /* Q( -rshifts ) */
+        }
+    }
+
+    if( reached_max_gain ) {
+        for( k = 0; k < D; k++ ) {
+            /* Scale coefficients */
+            A_Q16[ k ] = -silk_RSHIFT_ROUND( Af_QA[ k ], QA - 16 );
+        }
+        /* Subtract energy of preceding samples from C0 */
+        if( rshifts > 0 ) {
+            for( s = 0; s < nb_subfr; s++ ) {
+                x_ptr = x + s * subfr_length;
+                C0 -= (opus_int32)silk_RSHIFT64( silk_inner_prod16_aligned_64( x_ptr, x_ptr, D, arch ), rshifts );
+            }
+        } else {
+            for( s = 0; s < nb_subfr; s++ ) {
+                x_ptr = x + s * subfr_length;
+                C0 -= silk_LSHIFT32( silk_inner_prod_aligned( x_ptr, x_ptr, D, arch ), -rshifts );
+            }
+        }
+        /* Approximate residual energy */
+        *res_nrg = silk_LSHIFT( silk_SMMUL( invGain_Q30, C0 ), 2 );
+        *res_nrg_Q = -rshifts;
+    } else {
+        /* Return residual energy */
+        nrg  = CAf[ 0 ];                                                                            /* Q( -rshifts ) */
+        tmp1 = (opus_int32)1 << 16;                                                                             /* Q16 */
+        for( k = 0; k < D; k++ ) {
+            Atmp1 = silk_RSHIFT_ROUND( Af_QA[ k ], QA - 16 );                                       /* Q16 */
+            nrg  = silk_SMLAWW( nrg, CAf[ k + 1 ], Atmp1 );                                         /* Q( -rshifts ) */
+            tmp1 = silk_SMLAWW( tmp1, Atmp1, Atmp1 );                                               /* Q16 */
+            A_Q16[ k ] = -Atmp1;
+        }
+        *res_nrg = silk_SMLAWW( nrg, silk_SMMUL( SILK_FIX_CONST( FIND_LPC_COND_FAC, 32 ), C0 ), -tmp1 );/* Q( -rshifts ) */
+        *res_nrg_Q = -rshifts;
+    }
+}
diff --git a/silk/fixed/x86/prefilter_FIX_sse.c b/silk/fixed/x86/prefilter_FIX_sse.c
new file mode 100644
index 0000000..488a603
--- /dev/null
+++ b/silk/fixed/x86/prefilter_FIX_sse.c
@@ -0,0 +1,160 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+   Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#include <smmintrin.h>
+#include "main.h"
+#include "celt/x86/x86cpu.h"
+
+void silk_warped_LPC_analysis_filter_FIX_sse4_1(            /* Warped LPC analysis filter; SSE4.1 fast path for order 10 with lambda_Q16 == 0 */
+    opus_int32                  state[],                    /* I/O  State [order + 1]                   */
+    opus_int32                  res_Q2[],                   /* O    Residual signal [length]            */
+    const opus_int16            coef_Q13[],                 /* I    Coefficients [order]                */
+    const opus_int16            input[],                    /* I    Input signal [length]               */
+    const opus_int16            lambda_Q16,                 /* I    Warping factor                      */
+    const opus_int              length,                     /* I    Length of input signal              */
+    const opus_int              order                       /* I    Filter order (even)                 */
+)
+{
+    opus_int     n, i;
+    opus_int32   acc_Q11, tmp1, tmp2;
+
+    /* Order must be even */
+    silk_assert( ( order & 1 ) == 0 );
+    /* Fast path: order 10 without warping (lambda_Q16 == 0); every other case uses the scalar loop at the end */
+    if (order == 10)
+    {
+        if (0 == lambda_Q16)
+        {
+            __m128i coef_Q13_3210, coef_Q13_7654;
+            __m128i coef_Q13_0123, coef_Q13_4567;
+            __m128i state_0123, state_4567;
+            __m128i xmm_product1, xmm_product2;
+            __m128i xmm_tempa, xmm_tempb;
+
+            register opus_int32 sum;
+            register opus_int32 state_8, state_9, state_a;
+            register opus_int64 coef_Q13_8, coef_Q13_9;
+
+            silk_assert( length > 0 );
+            /* coef_Q13[ 0..7 ] as 32-bit lanes (presumably sign-extended by OP_CVTEPI16_EPI32_M64, defined outside this file), in load order and reversed */
+            coef_Q13_3210 = OP_CVTEPI16_EPI32_M64( &coef_Q13[ 0 ] );
+            coef_Q13_7654 = OP_CVTEPI16_EPI32_M64( &coef_Q13[ 4 ] );
+
+            coef_Q13_0123 = _mm_shuffle_epi32( coef_Q13_3210, _MM_SHUFFLE( 0, 1, 2, 3 ) );
+            coef_Q13_4567 = _mm_shuffle_epi32( coef_Q13_7654, _MM_SHUFFLE( 0, 1, 2, 3 ) );
+
+            coef_Q13_8 = (opus_int64) coef_Q13[ 8 ];
+            coef_Q13_9 = (opus_int64) coef_Q13[ 9 ];
+            /* state[ 0..7 ] is kept in registers in reversed lane order (lane 0 holds the highest index of each group) */
+            state_0123 = _mm_loadu_si128( (__m128i *)(&state[ 0 ] ) );
+            state_4567 = _mm_loadu_si128( (__m128i *)(&state[ 4 ] ) );
+
+            state_0123 = _mm_shuffle_epi32( state_0123, _MM_SHUFFLE( 0, 1, 2, 3 ) );
+            state_4567 = _mm_shuffle_epi32( state_4567, _MM_SHUFFLE( 0, 1, 2, 3 ) );
+
+            state_8 = state[ 8 ];
+            state_9 = state[ 9 ];
+            state_a = state[ 10 ]; /* seed from the stored value so state[ 10 ] is written back unchanged if the loop never runs */
+
+            for( n = 0; n < length; n++ )
+            {
+                xmm_product1 = _mm_mul_epi32( coef_Q13_0123, state_0123 ); /* 64-bit multiply, only 2 pairs */
+                xmm_product2 = _mm_mul_epi32( coef_Q13_4567, state_4567 );
+                /* restore load order so the remaining two taps of each group pair up with coef_Q13_3210 / coef_Q13_7654 */
+                xmm_tempa = _mm_shuffle_epi32( state_0123, _MM_SHUFFLE( 0, 1, 2, 3 ) );
+                xmm_tempb = _mm_shuffle_epi32( state_4567, _MM_SHUFFLE( 0, 1, 2, 3 ) );
+
+                xmm_product1 = _mm_srli_epi64( xmm_product1, 16 ); /* >> 16, zero extending works */
+                xmm_product2 = _mm_srli_epi64( xmm_product2, 16 );
+
+                xmm_tempa = _mm_mul_epi32( coef_Q13_3210, xmm_tempa );
+                xmm_tempb = _mm_mul_epi32( coef_Q13_7654, xmm_tempb );
+
+                xmm_tempa = _mm_srli_epi64( xmm_tempa, 16 );
+                xmm_tempb = _mm_srli_epi64( xmm_tempb, 16 );
+
+                xmm_tempa = _mm_add_epi32( xmm_tempa, xmm_product1 );
+                xmm_tempb = _mm_add_epi32( xmm_tempb, xmm_product2 );
+                xmm_tempa = _mm_add_epi32( xmm_tempa, xmm_tempb );
+                /* taps 8 and 9 are computed in scalar code */
+                sum  = (coef_Q13_8 * state_8) >> 16;
+                sum += (coef_Q13_9 * state_9) >> 16;
+                /* fold lane 2 onto lane 0; the constant 5 below equals order >> 1, the value acc_Q11 starts from in the scalar loop */
+                xmm_tempa = _mm_add_epi32( xmm_tempa, _mm_shuffle_epi32( xmm_tempa, _MM_SHUFFLE( 0, 0, 0, 2 ) ) );
+                sum += _mm_cvtsi128_si32( xmm_tempa);
+                res_Q2[ n ] = silk_LSHIFT( (opus_int32)input[ n ], 2 ) - silk_RSHIFT_ROUND( ( 5 + sum ), 9);
+
+                /* shift the delay line by one: each state moves to the next higher index and the new input enters at index 0 */
+                state_a = state_9;
+                state_9 = state_8;
+                state_8 = _mm_cvtsi128_si32( state_4567 );
+                state_4567 = _mm_alignr_epi8( state_0123, state_4567, 4 );
+
+                state_0123 = _mm_alignr_epi8( _mm_cvtsi32_si128( silk_LSHIFT( input[ n ], 14 ) ), state_0123, 4 );
+            }
+            /* write the register copies back to state[] in load order */
+            _mm_storeu_si128( (__m128i *)( &state[ 0 ] ), _mm_shuffle_epi32( state_0123, _MM_SHUFFLE( 0, 1, 2, 3 ) ) );
+            _mm_storeu_si128( (__m128i *)( &state[ 4 ] ), _mm_shuffle_epi32( state_4567, _MM_SHUFFLE( 0, 1, 2, 3 ) ) );
+            state[ 8 ] = state_8;
+            state[ 9 ] = state_9;
+            state[ 10 ] = state_a;
+
+            return;
+        }
+    }
+    /* General path: scalar loop over the allpass sections */
+    for( n = 0; n < length; n++ ) {
+        /* Output of lowpass section */
+        tmp2 = silk_SMLAWB( state[ 0 ], state[ 1 ], lambda_Q16 );
+        state[ 0 ] = silk_LSHIFT( input[ n ], 14 );
+        /* Output of allpass section */
+        tmp1 = silk_SMLAWB( state[ 1 ], state[ 2 ] - tmp2, lambda_Q16 );
+        state[ 1 ] = tmp2;
+        acc_Q11 = silk_RSHIFT( order, 1 );
+        acc_Q11 = silk_SMLAWB( acc_Q11, tmp2, coef_Q13[ 0 ] );
+        /* Loop over allpass sections */
+        for( i = 2; i < order; i += 2 ) {
+            /* Output of allpass section */
+            tmp2 = silk_SMLAWB( state[ i ], state[ i + 1 ] - tmp1, lambda_Q16 );
+            state[ i ] = tmp1;
+            acc_Q11 = silk_SMLAWB( acc_Q11, tmp1, coef_Q13[ i - 1 ] );
+            /* Output of allpass section */
+            tmp1 = silk_SMLAWB( state[ i + 1 ], state[ i + 2 ] - tmp2, lambda_Q16 );
+            state[ i + 1 ] = tmp2;
+            acc_Q11 = silk_SMLAWB( acc_Q11, tmp2, coef_Q13[ i ] );
+        }
+        state[ order ] = tmp1;
+        acc_Q11 = silk_SMLAWB( acc_Q11, tmp1, coef_Q13[ order - 1 ] );
+        res_Q2[ n ] = silk_LSHIFT( (opus_int32)input[ n ], 2 ) - silk_RSHIFT_ROUND( acc_Q11, 9 );
+    }
+}
diff --git a/silk/fixed/x86/vector_ops_FIX_sse.c b/silk/fixed/x86/vector_ops_FIX_sse.c
new file mode 100644
index 0000000..c1e9056
--- /dev/null
+++ b/silk/fixed/x86/vector_ops_FIX_sse.c
@@ -0,0 +1,88 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+   Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#include <smmintrin.h>
+#include "main.h"
+
+#include "SigProc_FIX.h"
+#include "pitch.h"
+
+opus_int64 silk_inner_prod16_aligned_64_sse4_1(     /*    O 64-bit inner product of inVec1 and inVec2                     */
+    const opus_int16            *inVec1,            /*    I input vector 1                                              */
+    const opus_int16            *inVec2,            /*    I input vector 2                                              */
+    const opus_int              len                 /*    I number of elements in each vector                           */
+)
+{
+    opus_int  i, dataSize8;
+    opus_int64 sum;
+
+    __m128i xmm_tempa;
+    __m128i inVec1_76543210, acc1;
+    __m128i inVec2_76543210, acc2;
+
+    sum = 0;
+    dataSize8 = len & ~7;   /* len rounded down to a multiple of 8 */
+
+    acc1 = _mm_setzero_si128();
+    acc2 = _mm_setzero_si128();
+    /* Vector part: 8 elements per iteration, summed into two pairs of 64-bit lanes */
+    for( i = 0; i < dataSize8; i += 8 ) {
+        inVec1_76543210 = _mm_loadu_si128( (__m128i *)(&inVec1[i + 0] ) );
+        inVec2_76543210 = _mm_loadu_si128( (__m128i *)(&inVec2[i + 0] ) );
+
+        /* multiply 16-bit pairs and add neighbours into 32-bit lanes; wraps only when all 4 operands of a lane are -32768 (0x8000) */
+        inVec1_76543210 = _mm_madd_epi16( inVec1_76543210, inVec2_76543210 );
+        /* sign-extend the low two 32-bit sums to 64 bits */
+        xmm_tempa       = _mm_cvtepi32_epi64( inVec1_76543210 );
+        /* move the high two 32-bit sums to the low lanes (acts as an 8-byte right shift), then sign-extend them as well */
+        inVec1_76543210 = _mm_shuffle_epi32( inVec1_76543210, _MM_SHUFFLE( 0, 0, 3, 2 ) );
+        inVec1_76543210 = _mm_cvtepi32_epi64( inVec1_76543210 );
+
+        acc1 = _mm_add_epi64( acc1, xmm_tempa );
+        acc2 = _mm_add_epi64( acc2, inVec1_76543210 );
+    }
+
+    acc1 = _mm_add_epi64( acc1, acc2 );
+
+    /* fold the upper 64-bit lane onto the lower one (the shuffle acts as an 8-byte right shift) */
+    acc2 = _mm_shuffle_epi32( acc1, _MM_SHUFFLE( 0, 0, 3, 2 ) );
+    acc1 = _mm_add_epi64( acc1, acc2 );
+
+    _mm_storel_epi64( (__m128i *)&sum, acc1 );
+    /* Scalar tail for the remaining len % 8 elements; sum is 64 bits wide, so accumulate with the 64-bit macro silk_SMLALBB */
+    for( ; i < len; i++ ) {
+        sum = silk_SMLALBB( sum, inVec1[ i ], inVec2[ i ] );
+    }
+
+    return sum;
+}
diff --git a/silk/float/encode_frame_FLP.c b/silk/float/encode_frame_FLP.c
index d54e268..2092a4d 100644
--- a/silk/float/encode_frame_FLP.c
+++ b/silk/float/encode_frame_FLP.c
@@ -47,7 +47,7 @@
     /****************************/
     /* Voice Activity Detection */
     /****************************/
-    silk_VAD_GetSA_Q8( &psEnc->sCmn, psEnc->sCmn.inputBuf + 1 );
+    silk_VAD_GetSA_Q8( &psEnc->sCmn, psEnc->sCmn.inputBuf + 1, psEnc->sCmn.arch );
 
     /**************************************************/
     /* Convert speech activity into VAD and DTX flags */
diff --git a/silk/float/find_LPC_FLP.c b/silk/float/find_LPC_FLP.c
index 61c1ad9..fcfe1c3 100644
--- a/silk/float/find_LPC_FLP.c
+++ b/silk/float/find_LPC_FLP.c
@@ -99,6 +99,6 @@
         silk_A2NLSF_FLP( NLSF_Q15, a, psEncC->predictLPCOrder );
     }
 
-    silk_assert( psEncC->indices.NLSFInterpCoef_Q2 == 4 || 
+    silk_assert( psEncC->indices.NLSFInterpCoef_Q2 == 4 ||
         ( psEncC->useInterpolatedNLSFs && !psEncC->first_frame_after_reset && psEncC->nb_subfr == MAX_NB_SUBFR ) );
 }
diff --git a/silk/float/find_pred_coefs_FLP.c b/silk/float/find_pred_coefs_FLP.c
index ea2c6c4..1af4fe5 100644
--- a/silk/float/find_pred_coefs_FLP.c
+++ b/silk/float/find_pred_coefs_FLP.c
@@ -67,7 +67,8 @@
 
         /* Quantize LTP gain parameters */
         silk_quant_LTP_gains_FLP( psEncCtrl->LTPCoef, psEnc->sCmn.indices.LTPIndex, &psEnc->sCmn.indices.PERIndex,
-            &psEnc->sCmn.sum_log_gain_Q7, WLTP, psEnc->sCmn.mu_LTP_Q9, psEnc->sCmn.LTPQuantLowComplexity, psEnc->sCmn.nb_subfr );
+            &psEnc->sCmn.sum_log_gain_Q7, WLTP, psEnc->sCmn.mu_LTP_Q9, psEnc->sCmn.LTPQuantLowComplexity, psEnc->sCmn.nb_subfr,
+            psEnc->sCmn.arch );
 
         /* Control LTP scaling */
         silk_LTP_scale_ctrl_FLP( psEnc, psEncCtrl, condCoding );
@@ -90,13 +91,13 @@
         }
         silk_memset( psEncCtrl->LTPCoef, 0, psEnc->sCmn.nb_subfr * LTP_ORDER * sizeof( silk_float ) );
         psEncCtrl->LTPredCodGain = 0.0f;
-		psEnc->sCmn.sum_log_gain_Q7 = 0;
+        psEnc->sCmn.sum_log_gain_Q7 = 0;
     }
 
     /* Limit on total predictive coding gain */
     if( psEnc->sCmn.first_frame_after_reset ) {
         minInvGain = 1.0f / MAX_PREDICTION_POWER_GAIN_AFTER_RESET;
-    } else {        
+    } else {
         minInvGain = (silk_float)pow( 2, psEncCtrl->LTPredCodGain / 3 ) /  MAX_PREDICTION_POWER_GAIN;
         minInvGain /= 0.25f + 0.75f * psEncCtrl->coding_quality;
     }
diff --git a/silk/float/main_FLP.h b/silk/float/main_FLP.h
index fb553b6..e5a7597 100644
--- a/silk/float/main_FLP.h
+++ b/silk/float/main_FLP.h
@@ -205,7 +205,8 @@
     const silk_float                W[ MAX_NB_SUBFR * LTP_ORDER * LTP_ORDER ], /* I    Error weights                        */
     const opus_int                  mu_Q10,                             /* I    Mu value (R/D tradeoff)                     */
     const opus_int                  lowComplexity,                      /* I    Flag for low complexity                     */
-    const opus_int                  nb_subfr                            /* I    number of subframes                         */
+    const opus_int                  nb_subfr,                           /* I    number of subframes                         */
+    int                             arch                                /* I    Run-time architecture                       */
 );
 
 /* Residual energy: nrg = wxx - 2 * wXx * c + c' * wXX * c */
diff --git a/silk/float/pitch_analysis_core_FLP.c b/silk/float/pitch_analysis_core_FLP.c
index e58f041..d0e637a 100644
--- a/silk/float/pitch_analysis_core_FLP.c
+++ b/silk/float/pitch_analysis_core_FLP.c
@@ -182,8 +182,8 @@
 
         /* Calculate first vector products before loop */
         cross_corr = xcorr[ max_lag_4kHz - min_lag_4kHz ];
-        normalizer = silk_energy_FLP( target_ptr, sf_length_8kHz ) + 
-                     silk_energy_FLP( basis_ptr,  sf_length_8kHz ) + 
+        normalizer = silk_energy_FLP( target_ptr, sf_length_8kHz ) +
+                     silk_energy_FLP( basis_ptr,  sf_length_8kHz ) +
                      sf_length_8kHz * 4000.0f;
 
         C[ 0 ][ min_lag_4kHz ] += (silk_float)( 2 * cross_corr / normalizer );
diff --git a/silk/float/structs_FLP.h b/silk/float/structs_FLP.h
index bb529e7..14d647c 100644
--- a/silk/float/structs_FLP.h
+++ b/silk/float/structs_FLP.h
@@ -115,6 +115,7 @@
 typedef struct {
     silk_encoder_state_FLP      state_Fxx[ ENCODER_NUM_CHANNELS ];
     stereo_enc_state            sStereo;
+    opus_int32                  nBitsUsedLBRR;
     opus_int32                  nBitsExceeded;
     opus_int                    nChannelsAPI;
     opus_int                    nChannelsInternal;
diff --git a/silk/float/wrappers_FLP.c b/silk/float/wrappers_FLP.c
index 350599b..6666b8e 100644
--- a/silk/float/wrappers_FLP.c
+++ b/silk/float/wrappers_FLP.c
@@ -161,10 +161,10 @@
     /* Call NSQ */
     if( psEnc->sCmn.nStatesDelayedDecision > 1 || psEnc->sCmn.warping_Q16 > 0 ) {
         silk_NSQ_del_dec( &psEnc->sCmn, psNSQ, psIndices, x_Q3, pulses, PredCoef_Q12[ 0 ], LTPCoef_Q14,
-            AR2_Q13, HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, psEncCtrl->pitchL, Lambda_Q10, LTP_scale_Q14 );
+            AR2_Q13, HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, psEncCtrl->pitchL, Lambda_Q10, LTP_scale_Q14, psEnc->sCmn.arch );
     } else {
         silk_NSQ( &psEnc->sCmn, psNSQ, psIndices, x_Q3, pulses, PredCoef_Q12[ 0 ], LTPCoef_Q14,
-            AR2_Q13, HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, psEncCtrl->pitchL, Lambda_Q10, LTP_scale_Q14 );
+            AR2_Q13, HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, psEncCtrl->pitchL, Lambda_Q10, LTP_scale_Q14, psEnc->sCmn.arch );
     }
 }
 
@@ -179,7 +179,8 @@
     const silk_float                W[ MAX_NB_SUBFR * LTP_ORDER * LTP_ORDER ], /* I    Error weights                        */
     const opus_int                  mu_Q10,                             /* I    Mu value (R/D tradeoff)                     */
     const opus_int                  lowComplexity,                      /* I    Flag for low complexity                     */
-    const opus_int                  nb_subfr                            /* I    number of subframes                         */
+    const opus_int                  nb_subfr,                           /* I    number of subframes                         */
+    int                             arch                                /* I    Run-time architecture                       */
 )
 {
     opus_int   i;
@@ -193,7 +194,7 @@
         W_Q18[ i ] = (opus_int32)silk_float2int( W[ i ] * 262144.0f );
     }
 
-    silk_quant_LTP_gains( B_Q14, cbk_index, periodicity_index, sum_log_gain_Q7, W_Q18, mu_Q10, lowComplexity, nb_subfr );
+    silk_quant_LTP_gains( B_Q14, cbk_index, periodicity_index, sum_log_gain_Q7, W_Q18, mu_Q10, lowComplexity, nb_subfr, arch );
 
     for( i = 0; i < nb_subfr * LTP_ORDER; i++ ) {
         B[ i ] = (silk_float)B_Q14[ i ] * ( 1.0f / 16384.0f );
diff --git a/silk/log2lin.c b/silk/log2lin.c
index a692e00..b7c48e4 100644
--- a/silk/log2lin.c
+++ b/silk/log2lin.c
@@ -33,7 +33,7 @@
 
 /* Approximation of 2^() (very close inverse of silk_lin2log()) */
 /* Convert input to a linear scale    */
-opus_int32 silk_log2lin( 
+opus_int32 silk_log2lin(
     const opus_int32            inLog_Q7            /* I  input on log scale                                            */
 )
 {
@@ -42,8 +42,8 @@
     if( inLog_Q7 < 0 ) {
         return 0;
     } else if ( inLog_Q7 >= 3967 ) {
-		return silk_int32_MAX;
-	}
+        return silk_int32_MAX;
+    }
 
     out = silk_LSHIFT( 1, silk_RSHIFT( inLog_Q7, 7 ) );
     frac_Q7 = inLog_Q7 & 0x7F;
diff --git a/silk/macros.h b/silk/macros.h
index a84e5a5..bc30303 100644
--- a/silk/macros.h
+++ b/silk/macros.h
@@ -35,19 +35,50 @@
 #include "opus_types.h"
 #include "opus_defines.h"
 
+#if OPUS_GNUC_PREREQ(3, 0)
+#define opus_likely(x)       (__builtin_expect(!!(x), 1))
+#define opus_unlikely(x)     (__builtin_expect(!!(x), 0))
+#else
+#define opus_likely(x)       (!!(x))
+#define opus_unlikely(x)     (!!(x))
+#endif
+
+/* Set this if opus_int64 is a native type of the CPU. */
+/* The test is spelled out in #if itself: a macro whose expansion contains
+   "defined" has undefined behavior when it is evaluated by #if. */
+#if defined(__x86_64__) || defined(__LP64__) || defined(_WIN64)
+#define OPUS_FAST_INT64 1
+#else
+#define OPUS_FAST_INT64 0
+#endif
+/* The OPUS_FAST_INT64 variants of the macros below compute in opus_int64 and
+   cast the result back to opus_int32, so they do not evaluate to a 64-bit type. */
+
 /* This is an OPUS_INLINE header file for general platform. */
 
 /* (a32 * (opus_int32)((opus_int16)(b32))) >> 16 output have to be 32bit int */
+#if OPUS_FAST_INT64
+#define silk_SMULWB(a32, b32)            ((opus_int32)(((a32) * (opus_int64)((opus_int16)(b32))) >> 16))
+#else
 #define silk_SMULWB(a32, b32)            ((((a32) >> 16) * (opus_int32)((opus_int16)(b32))) + ((((a32) & 0x0000FFFF) * (opus_int32)((opus_int16)(b32))) >> 16))
+#endif
 
 /* a32 + (b32 * (opus_int32)((opus_int16)(c32))) >> 16 output have to be 32bit int */
+#if OPUS_FAST_INT64
+#define silk_SMLAWB(a32, b32, c32)       ((opus_int32)((a32) + (((b32) * (opus_int64)((opus_int16)(c32))) >> 16)))
+#else
 #define silk_SMLAWB(a32, b32, c32)       ((a32) + ((((b32) >> 16) * (opus_int32)((opus_int16)(c32))) + ((((b32) & 0x0000FFFF) * (opus_int32)((opus_int16)(c32))) >> 16)))
+#endif
 
 /* (a32 * (b32 >> 16)) >> 16 */
 #define silk_SMULWT(a32, b32)            (((a32) >> 16) * ((b32) >> 16) + ((((a32) & 0x0000FFFF) * ((b32) >> 16)) >> 16))
 
 /* a32 + (b32 * (c32 >> 16)) >> 16 */
+#if OPUS_FAST_INT64
+#define silk_SMLAWT(a32, b32, c32)       ((opus_int32)((a32) + (((b32) * ((opus_int64)(c32) >> 16)) >> 16)))
+#else
 #define silk_SMLAWT(a32, b32, c32)       ((a32) + (((b32) >> 16) * ((c32) >> 16)) + ((((b32) & 0x0000FFFF) * ((c32) >> 16)) >> 16))
+#endif
 
 /* (opus_int32)((opus_int16)(a3))) * (opus_int32)((opus_int16)(b32)) output have to be 32bit int */
 #define silk_SMULBB(a32, b32)            ((opus_int32)((opus_int16)(a32)) * (opus_int32)((opus_int16)(b32)))
@@ -65,10 +96,18 @@
 #define silk_SMLAL(a64, b32, c32)        (silk_ADD64((a64), ((opus_int64)(b32) * (opus_int64)(c32))))
 
 /* (a32 * b32) >> 16 */
+#if OPUS_FAST_INT64
+#define silk_SMULWW(a32, b32)            ((opus_int32)(((opus_int64)(a32) * (b32)) >> 16))
+#else
 #define silk_SMULWW(a32, b32)            silk_MLA(silk_SMULWB((a32), (b32)), (a32), silk_RSHIFT_ROUND((b32), 16))
+#endif
 
 /* a32 + ((b32 * c32) >> 16) */
+#if OPUS_FAST_INT64
+#define silk_SMLAWW(a32, b32, c32)       ((opus_int32)((a32) + (((opus_int64)(b32) * (c32)) >> 16)))
+#else
 #define silk_SMLAWW(a32, b32, c32)       silk_MLA(silk_SMLAWB((a32), (b32), (c32)), (b32), silk_RSHIFT_ROUND((c32), 16))
+#endif
 
 /* add/subtract with output saturated */
 #define silk_ADD_SAT32(a, b)             ((((opus_uint32)(a) + (opus_uint32)(b)) & 0x80000000) == 0 ?                              \
@@ -79,17 +118,24 @@
                                         (( (a) & ((b)^0x80000000) & 0x80000000) ? silk_int32_MIN : (a)-(b)) :    \
                                         ((((a)^0x80000000) & (b)  & 0x80000000) ? silk_int32_MAX : (a)-(b)) )
 
-#include "ecintrin.h"
+#if defined(MIPSr1_ASM)
+#include "mips/macros_mipsr1.h"
+#endif
 
+#include "ecintrin.h"
+#ifndef OVERRIDE_silk_CLZ16
 static OPUS_INLINE opus_int32 silk_CLZ16(opus_int16 in16)
 {
     return 32 - EC_ILOG(in16<<16|0x8000);
 }
+#endif
 
+#ifndef OVERRIDE_silk_CLZ32
 static OPUS_INLINE opus_int32 silk_CLZ32(opus_int32 in32)
 {
     return in32 ? 32 - EC_ILOG(in32) : 32;
 }
+#endif
 
 /* Row based */
 #define matrix_ptr(Matrix_base_adr, row, column, N) \
diff --git a/silk/main.h b/silk/main.h
index 2bdf897..2f90d68 100644
--- a/silk/main.h
+++ b/silk/main.h
@@ -38,6 +38,10 @@
 #include "entenc.h"
 #include "entdec.h"
 
+#if defined(OPUS_X86_MAY_HAVE_SSE4_1)
+#include "x86/main_sse.h"
+#endif
+
 /* Convert Left/Right stereo signal to adaptive Mid/Side representation */
 void silk_stereo_LR_to_MS(
     stereo_enc_state            *state,                         /* I/O  State                                       */
@@ -116,7 +120,7 @@
 /* Decodes signs of excitation */
 void silk_decode_signs(
     ec_dec                      *psRangeDec,                        /* I/O  Compressor data structure                   */
-    opus_int                    pulses[],                           /* I/O  pulse signal                                */
+    opus_int16                  pulses[],                           /* I/O  pulse signal                                */
     opus_int                    length,                             /* I    length of input                             */
     const opus_int              signalType,                         /* I    Signal type                                 */
     const opus_int              quantOffsetType,                    /* I    Quantization offset type                    */
@@ -161,7 +165,7 @@
 
 /* Shell decoder, operates on one shell code frame of 16 pulses */
 void silk_shell_decoder(
-    opus_int                    *pulses0,                       /* O    data: nonnegative pulse amplitudes          */
+    opus_int16                  *pulses0,                       /* O    data: nonnegative pulse amplitudes          */
     ec_dec                      *psRangeDec,                    /* I/O  Compressor data structure                   */
     const opus_int              pulses4                         /* I    number of pulses per pulse-subframe         */
 );
@@ -204,15 +208,16 @@
     opus_int16                  B_Q14[ MAX_NB_SUBFR * LTP_ORDER ],          /* I/O  (un)quantized LTP gains         */
     opus_int8                   cbk_index[ MAX_NB_SUBFR ],                  /* O    Codebook Index                  */
     opus_int8                   *periodicity_index,                         /* O    Periodicity Index               */
-	opus_int32					*sum_gain_dB_Q7,							/* I/O  Cumulative max prediction gain  */
+    opus_int32                  *sum_gain_dB_Q7,                            /* I/O  Cumulative max prediction gain  */
     const opus_int32            W_Q18[ MAX_NB_SUBFR*LTP_ORDER*LTP_ORDER ],  /* I    Error Weights in Q18            */
     opus_int                    mu_Q9,                                      /* I    Mu value (R/D tradeoff)         */
     opus_int                    lowComplexity,                              /* I    Flag for low complexity         */
-    const opus_int              nb_subfr                                    /* I    number of subframes             */
+    const opus_int              nb_subfr,                                   /* I    number of subframes             */
+    int                         arch                                        /* I    Run-time architecture           */
 );
 
 /* Entropy constrained matrix-weighted VQ, for a single input data vector */
-void silk_VQ_WMat_EC(
+void silk_VQ_WMat_EC_c(
     opus_int8                   *ind,                           /* O    index of best codebook vector               */
     opus_int32                  *rate_dist_Q14,                 /* O    best weighted quant error + mu * rate       */
     opus_int                    *gain_Q7,                       /* O    sum of absolute LTP coefficients            */
@@ -226,10 +231,18 @@
     opus_int                    L                               /* I    number of vectors in codebook               */
 );
 
+#if !defined(OVERRIDE_silk_VQ_WMat_EC)
+#define silk_VQ_WMat_EC(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \
+                          mu_Q9, max_gain_Q7, L, arch) \
+    ((void)(arch),silk_VQ_WMat_EC_c(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \
+                          mu_Q9, max_gain_Q7, L))
+#endif
+
 /************************************/
 /* Noise shaping quantization (NSQ) */
 /************************************/
-void silk_NSQ(
+
+void silk_NSQ_c(
     const silk_encoder_state    *psEncC,                                    /* I/O  Encoder State                   */
     silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */
     SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */
@@ -247,8 +260,15 @@
     const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */
 );
 
+#if !defined(OVERRIDE_silk_NSQ)
+#define silk_NSQ(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
+                   HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
+    ((void)(arch),silk_NSQ_c(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
+                   HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14))
+#endif
+
 /* Noise shaping using delayed decision */
-void silk_NSQ_del_dec(
+void silk_NSQ_del_dec_c(
     const silk_encoder_state    *psEncC,                                    /* I/O  Encoder State                   */
     silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */
     SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */
@@ -266,6 +286,13 @@
     const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */
 );
 
+#if !defined(OVERRIDE_silk_NSQ_del_dec)
+#define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
+                           HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
+    ((void)(arch),silk_NSQ_del_dec_c(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
+                           HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14))
+#endif
+
 /************/
 /* Silk VAD */
 /************/
@@ -275,11 +302,15 @@
 );
 
 /* Get speech activity level in Q8 */
-opus_int silk_VAD_GetSA_Q8(                                     /* O    Return value, 0 if success                  */
+opus_int silk_VAD_GetSA_Q8_c(                                   /* O    Return value, 0 if success                  */
     silk_encoder_state          *psEncC,                        /* I/O  Encoder state                               */
     const opus_int16            pIn[]                           /* I    PCM input                                   */
 );
 
+#if !defined(OVERRIDE_silk_VAD_GetSA_Q8)
+#define silk_VAD_GetSA_Q8(psEnC, pIn, arch) ((void)(arch),silk_VAD_GetSA_Q8_c(psEnC, pIn))
+#endif
+
 /* Low-pass filter with variable cutoff frequency based on  */
 /* piece-wise linear interpolation between elliptic filters */
 /* Start by setting transition_frame_no = 1;                */
@@ -373,7 +404,8 @@
     opus_int16                  pOut[],                         /* O    Pointer to output speech frame              */
     opus_int32                  *pN,                            /* O    Pointer to size of output frame             */
     opus_int                    lostFlag,                       /* I    0: no loss, 1 loss, 2 decode fec            */
-    opus_int                    condCoding                      /* I    The type of conditional coding to use       */
+    opus_int                    condCoding,                     /* I    The type of conditional coding to use       */
+    int                         arch                            /* I    Run-time architecture                       */
 );
 
 /* Decode indices from bitstream */
@@ -397,13 +429,14 @@
     silk_decoder_state          *psDec,                         /* I/O  Decoder state                               */
     silk_decoder_control        *psDecCtrl,                     /* I    Decoder control                             */
     opus_int16                  xq[],                           /* O    Decoded speech                              */
-    const opus_int              pulses[ MAX_FRAME_LENGTH ]      /* I    Pulse signal                                */
+    const opus_int16            pulses[ MAX_FRAME_LENGTH ],     /* I    Pulse signal                                */
+    int                         arch                            /* I    Run-time architecture                       */
 );
 
 /* Decode quantization indices of excitation (Shell coding) */
 void silk_decode_pulses(
     ec_dec                      *psRangeDec,                    /* I/O  Compressor data structure                   */
-    opus_int                    pulses[],                       /* O    Excitation signal                           */
+    opus_int16                  pulses[],                       /* O    Excitation signal                           */
     const opus_int              signalType,                     /* I    Sigtype                                     */
     const opus_int              quantOffsetType,                /* I    quantOffsetType                             */
     const opus_int              frame_length                    /* I    Frame length                                */
diff --git a/silk/mips/NSQ_del_dec_mipsr1.h b/silk/mips/NSQ_del_dec_mipsr1.h
new file mode 100644
index 0000000..f6afd92
--- /dev/null
+++ b/silk/mips/NSQ_del_dec_mipsr1.h
@@ -0,0 +1,405 @@
+/***********************************************************************
+Copyright (c) 2006-2011, Skype Limited. All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+- Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+- Neither the name of Internet Society, IETF or IETF Trust, nor the
+names of specific contributors, may be used to endorse or promote
+products derived from this software without specific prior written
+permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+***********************************************************************/
+
+#ifndef __NSQ_DEL_DEC_MIPSR1_H__
+#define __NSQ_DEL_DEC_MIPSR1_H__
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "main.h"
+#include "stack_alloc.h"
+
+#define OVERRIDE_silk_noise_shape_quantizer_del_dec
+static inline void silk_noise_shape_quantizer_del_dec(
+    silk_nsq_state      *NSQ,                   /* I/O  NSQ state                           */
+    NSQ_del_dec_struct  psDelDec[],             /* I/O  Delayed decision states             */
+    opus_int            signalType,             /* I    Signal type                         */
+    const opus_int32    x_Q10[],                /* I                                        */
+    opus_int8           pulses[],               /* O                                        */
+    opus_int16          xq[],                   /* O                                        */
+    opus_int32          sLTP_Q15[],             /* I/O  LTP filter state                    */
+    opus_int32          delayedGain_Q10[],      /* I/O  Gain delay buffer                   */
+    const opus_int16    a_Q12[],                /* I    Short term prediction coefs         */
+    const opus_int16    b_Q14[],                /* I    Long term prediction coefs          */
+    const opus_int16    AR_shp_Q13[],           /* I    Noise shaping coefs                 */
+    opus_int            lag,                    /* I    Pitch lag                           */
+    opus_int32          HarmShapeFIRPacked_Q14, /* I                                        */
+    opus_int            Tilt_Q14,               /* I    Spectral tilt                       */
+    opus_int32          LF_shp_Q14,             /* I                                        */
+    opus_int32          Gain_Q16,               /* I                                        */
+    opus_int            Lambda_Q10,             /* I                                        */
+    opus_int            offset_Q10,             /* I                                        */
+    opus_int            length,                 /* I    Input length                        */
+    opus_int            subfr,                  /* I    Subframe number                     */
+    opus_int            shapingLPCOrder,        /* I    Shaping LPC filter order            */
+    opus_int            predictLPCOrder,        /* I    Prediction filter order             */
+    opus_int            warping_Q16,            /* I                                        */
+    opus_int            nStatesDelayedDecision, /* I    Number of states in decision tree   */
+    opus_int            *smpl_buf_idx,          /* I    Index to newest samples in buffers  */
+    opus_int            decisionDelay           /* I                                        */
+)
+{
+    opus_int     i, j, k, Winner_ind, RDmin_ind, RDmax_ind, last_smple_idx;
+    opus_int32   Winner_rand_state;
+    opus_int32   LTP_pred_Q14, LPC_pred_Q14, n_AR_Q14, n_LTP_Q14;
+    opus_int32   n_LF_Q14, r_Q10, rr_Q10, rd1_Q10, rd2_Q10, RDmin_Q10, RDmax_Q10;
+    opus_int32   q1_Q0, q1_Q10, q2_Q10, exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10;
+    opus_int32   tmp1, tmp2, sLF_AR_shp_Q14;
+    opus_int32   *pred_lag_ptr, *shp_lag_ptr, *psLPC_Q14;
+    NSQ_sample_struct  psSampleState[ MAX_DEL_DEC_STATES ][ 2 ];
+    NSQ_del_dec_struct *psDD;
+    NSQ_sample_struct  *psSS;
+    opus_int16 b_Q14_0, b_Q14_1, b_Q14_2, b_Q14_3, b_Q14_4;
+    opus_int16 a_Q12_0, a_Q12_1, a_Q12_2, a_Q12_3, a_Q12_4, a_Q12_5, a_Q12_6;
+    opus_int16 a_Q12_7, a_Q12_8, a_Q12_9, a_Q12_10, a_Q12_11, a_Q12_12, a_Q12_13;
+    opus_int16 a_Q12_14, a_Q12_15;
+
+    opus_int32 cur, prev, next;
+
+    //Initialize b_Q14 variables
+    b_Q14_0 = b_Q14[ 0 ];
+    b_Q14_1 = b_Q14[ 1 ];
+    b_Q14_2 = b_Q14[ 2 ];
+    b_Q14_3 = b_Q14[ 3 ];
+    b_Q14_4 = b_Q14[ 4 ];
+
+    //Initialize a_Q12 variables
+    a_Q12_0 = a_Q12[0];
+    a_Q12_1 = a_Q12[1];
+    a_Q12_2 = a_Q12[2];
+    a_Q12_3 = a_Q12[3];
+    a_Q12_4 = a_Q12[4];
+    a_Q12_5 = a_Q12[5];
+    a_Q12_6 = a_Q12[6];
+    a_Q12_7 = a_Q12[7];
+    a_Q12_8 = a_Q12[8];
+    a_Q12_9 = a_Q12[9];
+    a_Q12_10 = a_Q12[10];
+    a_Q12_11 = a_Q12[11];
+    a_Q12_12 = a_Q12[12];
+    a_Q12_13 = a_Q12[13];
+    a_Q12_14 = a_Q12[14];
+    a_Q12_15 = a_Q12[15];
+
+    long long temp64;
+
+    silk_assert( nStatesDelayedDecision > 0 );
+
+    shp_lag_ptr  = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ];
+    pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ];
+    Gain_Q10     = silk_RSHIFT( Gain_Q16, 6 );
+
+    for( i = 0; i < length; i++ ) {
+        /* Perform common calculations used in all states */
+
+        /* Long-term prediction */
+        if( signalType == TYPE_VOICED ) {
+            /* Unrolled loop */
+            /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
+            temp64 = __builtin_mips_mult(pred_lag_ptr[ 0 ], b_Q14_0 );
+            temp64 = __builtin_mips_madd( temp64, pred_lag_ptr[ -1 ], b_Q14_1 );
+            temp64 = __builtin_mips_madd( temp64, pred_lag_ptr[ -2 ], b_Q14_2 );
+            temp64 = __builtin_mips_madd( temp64, pred_lag_ptr[ -3 ], b_Q14_3 );
+            temp64 = __builtin_mips_madd( temp64, pred_lag_ptr[ -4 ], b_Q14_4 );
+            temp64 += 32768;
+            LTP_pred_Q14 = __builtin_mips_extr_w(temp64, 16);
+            LTP_pred_Q14 = silk_LSHIFT( LTP_pred_Q14, 1 );                          /* Q13 -> Q14 */
+            pred_lag_ptr++;
+        } else {
+            LTP_pred_Q14 = 0;
+        }
+
+        /* Long-term shaping */
+        if( lag > 0 ) {
+            /* Symmetric, packed FIR coefficients */
+            n_LTP_Q14 = silk_SMULWB( silk_ADD32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
+            n_LTP_Q14 = silk_SMLAWT( n_LTP_Q14, shp_lag_ptr[ -1 ],                      HarmShapeFIRPacked_Q14 );
+            n_LTP_Q14 = silk_SUB_LSHIFT32( LTP_pred_Q14, n_LTP_Q14, 2 );            /* Q12 -> Q14 */
+            shp_lag_ptr++;
+        } else {
+            n_LTP_Q14 = 0;
+        }
+
+        for( k = 0; k < nStatesDelayedDecision; k++ ) {
+            /* Delayed decision state */
+            psDD = &psDelDec[ k ];
+
+            /* Sample state */
+            psSS = psSampleState[ k ];
+
+            /* Generate dither */
+            psDD->Seed = silk_RAND( psDD->Seed );
+
+            /* Pointer used in short term prediction and shaping */
+            psLPC_Q14 = &psDD->sLPC_Q14[ NSQ_LPC_BUF_LENGTH - 1 + i ];
+            /* Short-term prediction */
+            silk_assert( predictLPCOrder == 10 || predictLPCOrder == 16 );
+            temp64 = __builtin_mips_mult(psLPC_Q14[  0 ], a_Q12_0 );
+            temp64 = __builtin_mips_madd( temp64, psLPC_Q14[ -1 ], a_Q12_1 );
+            temp64 = __builtin_mips_madd( temp64, psLPC_Q14[ -2 ], a_Q12_2 );
+            temp64 = __builtin_mips_madd( temp64, psLPC_Q14[ -3 ], a_Q12_3 );
+            temp64 = __builtin_mips_madd( temp64, psLPC_Q14[ -4 ], a_Q12_4 );
+            temp64 = __builtin_mips_madd( temp64, psLPC_Q14[ -5 ], a_Q12_5 );
+            temp64 = __builtin_mips_madd( temp64, psLPC_Q14[ -6 ], a_Q12_6 );
+            temp64 = __builtin_mips_madd( temp64, psLPC_Q14[ -7 ], a_Q12_7 );
+            temp64 = __builtin_mips_madd( temp64, psLPC_Q14[ -8 ], a_Q12_8 );
+            temp64 = __builtin_mips_madd( temp64, psLPC_Q14[ -9 ], a_Q12_9 );
+            if( predictLPCOrder == 16 ) {
+                temp64 = __builtin_mips_madd( temp64, psLPC_Q14[ -10 ], a_Q12_10 );
+                temp64 = __builtin_mips_madd( temp64, psLPC_Q14[ -11 ], a_Q12_11 );
+                temp64 = __builtin_mips_madd( temp64, psLPC_Q14[ -12 ], a_Q12_12 );
+                temp64 = __builtin_mips_madd( temp64, psLPC_Q14[ -13 ], a_Q12_13 );
+                temp64 = __builtin_mips_madd( temp64, psLPC_Q14[ -14 ], a_Q12_14 );
+                temp64 = __builtin_mips_madd( temp64, psLPC_Q14[ -15 ], a_Q12_15 );
+            }
+            temp64 += 32768;
+            LPC_pred_Q14 = __builtin_mips_extr_w(temp64, 16);
+
+            LPC_pred_Q14 = silk_LSHIFT( LPC_pred_Q14, 4 );                              /* Q10 -> Q14 */
+
+            /* Noise shape feedback */
+            silk_assert( ( shapingLPCOrder & 1 ) == 0 );   /* check that order is even */
+            /* Output of lowpass section */
+            tmp2 = silk_SMLAWB( psLPC_Q14[ 0 ], psDD->sAR2_Q14[ 0 ], warping_Q16 );
+            /* Output of allpass section */
+            tmp1 = silk_SMLAWB( psDD->sAR2_Q14[ 0 ], psDD->sAR2_Q14[ 1 ] - tmp2, warping_Q16 );
+            psDD->sAR2_Q14[ 0 ] = tmp2;
+
+            temp64 = __builtin_mips_mult(tmp2, AR_shp_Q13[ 0 ] );
+
+            prev = psDD->sAR2_Q14[ 1 ];
+
+            /* Loop over allpass sections */
+            for( j = 2; j < shapingLPCOrder; j += 2 ) {
+                cur = psDD->sAR2_Q14[ j ];
+                next = psDD->sAR2_Q14[ j+1 ];
+                /* Output of allpass section */
+                tmp2 = silk_SMLAWB( prev, cur - tmp1, warping_Q16 );
+                psDD->sAR2_Q14[ j - 1 ] = tmp1;
+                temp64 = __builtin_mips_madd( temp64, tmp1, AR_shp_Q13[ j - 1 ] );
+                temp64 = __builtin_mips_madd( temp64, tmp2, AR_shp_Q13[ j ] );
+                /* Output of allpass section */
+                tmp1 = silk_SMLAWB( cur, next - tmp2, warping_Q16 );
+                psDD->sAR2_Q14[ j + 0 ] = tmp2;
+                prev = next;
+            }
+            psDD->sAR2_Q14[ shapingLPCOrder - 1 ] = tmp1;
+            temp64 = __builtin_mips_madd( temp64, tmp1, AR_shp_Q13[ shapingLPCOrder - 1 ] );
+            temp64 += 32768;
+            n_AR_Q14 = __builtin_mips_extr_w(temp64, 16);
+            n_AR_Q14 = silk_LSHIFT( n_AR_Q14, 1 );                                      /* Q11 -> Q12 */
+            n_AR_Q14 = silk_SMLAWB( n_AR_Q14, psDD->LF_AR_Q14, Tilt_Q14 );              /* Q12 */
+            n_AR_Q14 = silk_LSHIFT( n_AR_Q14, 2 );                                      /* Q12 -> Q14 */
+
+            n_LF_Q14 = silk_SMULWB( psDD->Shape_Q14[ *smpl_buf_idx ], LF_shp_Q14 );     /* Q12 */
+            n_LF_Q14 = silk_SMLAWT( n_LF_Q14, psDD->LF_AR_Q14, LF_shp_Q14 );            /* Q12 */
+            n_LF_Q14 = silk_LSHIFT( n_LF_Q14, 2 );                                      /* Q12 -> Q14 */
+
+            /* Input minus prediction plus noise feedback                       */
+            /* r = x[ i ] - LTP_pred - LPC_pred + n_AR + n_Tilt + n_LF + n_LTP  */
+            tmp1 = silk_ADD32( n_AR_Q14, n_LF_Q14 );                                    /* Q14 */
+            tmp2 = silk_ADD32( n_LTP_Q14, LPC_pred_Q14 );                               /* Q13 */
+            tmp1 = silk_SUB32( tmp2, tmp1 );                                            /* Q13 */
+            tmp1 = silk_RSHIFT_ROUND( tmp1, 4 );                                        /* Q10 */
+
+            r_Q10 = silk_SUB32( x_Q10[ i ], tmp1 );                                     /* residual error Q10 */
+
+            /* Flip sign depending on dither */
+            if ( psDD->Seed < 0 ) {
+                r_Q10 = -r_Q10;
+            }
+            r_Q10 = silk_LIMIT_32( r_Q10, -(31 << 10), 30 << 10 );
+
+            /* Find two quantization level candidates and measure their rate-distortion */
+            q1_Q10 = silk_SUB32( r_Q10, offset_Q10 );
+            q1_Q0 = silk_RSHIFT( q1_Q10, 10 );
+            if( q1_Q0 > 0 ) {
+                q1_Q10  = silk_SUB32( silk_LSHIFT( q1_Q0, 10 ), QUANT_LEVEL_ADJUST_Q10 );
+                q1_Q10  = silk_ADD32( q1_Q10, offset_Q10 );
+                q2_Q10  = silk_ADD32( q1_Q10, 1024 );
+                rd1_Q10 = silk_SMULBB( q1_Q10, Lambda_Q10 );
+                rd2_Q10 = silk_SMULBB( q2_Q10, Lambda_Q10 );
+            } else if( q1_Q0 == 0 ) {
+                q1_Q10  = offset_Q10;
+                q2_Q10  = silk_ADD32( q1_Q10, 1024 - QUANT_LEVEL_ADJUST_Q10 );
+                rd1_Q10 = silk_SMULBB( q1_Q10, Lambda_Q10 );
+                rd2_Q10 = silk_SMULBB( q2_Q10, Lambda_Q10 );
+            } else if( q1_Q0 == -1 ) {
+                q2_Q10  = offset_Q10;
+                q1_Q10  = silk_SUB32( q2_Q10, 1024 - QUANT_LEVEL_ADJUST_Q10 );
+                rd1_Q10 = silk_SMULBB( -q1_Q10, Lambda_Q10 );
+                rd2_Q10 = silk_SMULBB(  q2_Q10, Lambda_Q10 );
+            } else {            /* q1_Q0 < -1 */
+                q1_Q10  = silk_ADD32( silk_LSHIFT( q1_Q0, 10 ), QUANT_LEVEL_ADJUST_Q10 );
+                q1_Q10  = silk_ADD32( q1_Q10, offset_Q10 );
+                q2_Q10  = silk_ADD32( q1_Q10, 1024 );
+                rd1_Q10 = silk_SMULBB( -q1_Q10, Lambda_Q10 );
+                rd2_Q10 = silk_SMULBB( -q2_Q10, Lambda_Q10 );
+            }
+            rr_Q10  = silk_SUB32( r_Q10, q1_Q10 );
+            rd1_Q10 = silk_RSHIFT( silk_SMLABB( rd1_Q10, rr_Q10, rr_Q10 ), 10 );
+            rr_Q10  = silk_SUB32( r_Q10, q2_Q10 );
+            rd2_Q10 = silk_RSHIFT( silk_SMLABB( rd2_Q10, rr_Q10, rr_Q10 ), 10 );
+
+            if( rd1_Q10 < rd2_Q10 ) {
+                psSS[ 0 ].RD_Q10 = silk_ADD32( psDD->RD_Q10, rd1_Q10 );
+                psSS[ 1 ].RD_Q10 = silk_ADD32( psDD->RD_Q10, rd2_Q10 );
+                psSS[ 0 ].Q_Q10  = q1_Q10;
+                psSS[ 1 ].Q_Q10  = q2_Q10;
+            } else {
+                psSS[ 0 ].RD_Q10 = silk_ADD32( psDD->RD_Q10, rd2_Q10 );
+                psSS[ 1 ].RD_Q10 = silk_ADD32( psDD->RD_Q10, rd1_Q10 );
+                psSS[ 0 ].Q_Q10  = q2_Q10;
+                psSS[ 1 ].Q_Q10  = q1_Q10;
+            }
+
+            /* Update states for best quantization */
+
+            /* Quantized excitation */
+            exc_Q14 = silk_LSHIFT32( psSS[ 0 ].Q_Q10, 4 );
+            if ( psDD->Seed < 0 ) {
+                exc_Q14 = -exc_Q14;
+            }
+
+            /* Add predictions */
+            LPC_exc_Q14 = silk_ADD32( exc_Q14, LTP_pred_Q14 );
+            xq_Q14      = silk_ADD32( LPC_exc_Q14, LPC_pred_Q14 );
+
+            /* Update states */
+            sLF_AR_shp_Q14         = silk_SUB32( xq_Q14, n_AR_Q14 );
+            psSS[ 0 ].sLTP_shp_Q14 = silk_SUB32( sLF_AR_shp_Q14, n_LF_Q14 );
+            psSS[ 0 ].LF_AR_Q14    = sLF_AR_shp_Q14;
+            psSS[ 0 ].LPC_exc_Q14  = LPC_exc_Q14;
+            psSS[ 0 ].xq_Q14       = xq_Q14;
+
+            /* Update states for second best quantization */
+
+            /* Quantized excitation */
+            exc_Q14 = silk_LSHIFT32( psSS[ 1 ].Q_Q10, 4 );
+            if ( psDD->Seed < 0 ) {
+                exc_Q14 = -exc_Q14;
+            }
+
+
+            /* Add predictions */
+            LPC_exc_Q14 = silk_ADD32( exc_Q14, LTP_pred_Q14 );
+            xq_Q14      = silk_ADD32( LPC_exc_Q14, LPC_pred_Q14 );
+
+            /* Update states */
+            sLF_AR_shp_Q14         = silk_SUB32( xq_Q14, n_AR_Q14 );
+            psSS[ 1 ].sLTP_shp_Q14 = silk_SUB32( sLF_AR_shp_Q14, n_LF_Q14 );
+            psSS[ 1 ].LF_AR_Q14    = sLF_AR_shp_Q14;
+            psSS[ 1 ].LPC_exc_Q14  = LPC_exc_Q14;
+            psSS[ 1 ].xq_Q14       = xq_Q14;
+        }
+
+        *smpl_buf_idx  = ( *smpl_buf_idx - 1 ) & DECISION_DELAY_MASK;                   /* Index to newest samples              */
+        last_smple_idx = ( *smpl_buf_idx + decisionDelay ) & DECISION_DELAY_MASK;       /* Index to decisionDelay old samples   */
+
+        /* Find winner */
+        RDmin_Q10 = psSampleState[ 0 ][ 0 ].RD_Q10;
+        Winner_ind = 0;
+        for( k = 1; k < nStatesDelayedDecision; k++ ) {
+            if( psSampleState[ k ][ 0 ].RD_Q10 < RDmin_Q10 ) {
+                RDmin_Q10  = psSampleState[ k ][ 0 ].RD_Q10;
+                Winner_ind = k;
+            }
+        }
+
+        /* Increase RD values of expired states */
+        Winner_rand_state = psDelDec[ Winner_ind ].RandState[ last_smple_idx ];
+        for( k = 0; k < nStatesDelayedDecision; k++ ) {
+            if( psDelDec[ k ].RandState[ last_smple_idx ] != Winner_rand_state ) {
+                psSampleState[ k ][ 0 ].RD_Q10 = silk_ADD32( psSampleState[ k ][ 0 ].RD_Q10, silk_int32_MAX >> 4 );
+                psSampleState[ k ][ 1 ].RD_Q10 = silk_ADD32( psSampleState[ k ][ 1 ].RD_Q10, silk_int32_MAX >> 4 );
+                silk_assert( psSampleState[ k ][ 0 ].RD_Q10 >= 0 );
+            }
+        }
+
+        /* Find worst in first set and best in second set */
+        RDmax_Q10  = psSampleState[ 0 ][ 0 ].RD_Q10;
+        RDmin_Q10  = psSampleState[ 0 ][ 1 ].RD_Q10;
+        RDmax_ind = 0;
+        RDmin_ind = 0;
+        for( k = 1; k < nStatesDelayedDecision; k++ ) {
+            /* find worst in first set */
+            if( psSampleState[ k ][ 0 ].RD_Q10 > RDmax_Q10 ) {
+                RDmax_Q10  = psSampleState[ k ][ 0 ].RD_Q10;
+                RDmax_ind = k;
+            }
+            /* find best in second set */
+            if( psSampleState[ k ][ 1 ].RD_Q10 < RDmin_Q10 ) {
+                RDmin_Q10  = psSampleState[ k ][ 1 ].RD_Q10;
+                RDmin_ind = k;
+            }
+        }
+
+        /* Replace a state if best from second set outperforms worst in first set */
+        if( RDmin_Q10 < RDmax_Q10 ) {
+            silk_memcpy( ( (opus_int32 *)&psDelDec[ RDmax_ind ] ) + i,
+                         ( (opus_int32 *)&psDelDec[ RDmin_ind ] ) + i, sizeof( NSQ_del_dec_struct ) - i * sizeof( opus_int32) );
+            silk_memcpy( &psSampleState[ RDmax_ind ][ 0 ], &psSampleState[ RDmin_ind ][ 1 ], sizeof( NSQ_sample_struct ) );
+        }
+
+        /* Write samples from winner to output and long-term filter states */
+        psDD = &psDelDec[ Winner_ind ];
+        if( subfr > 0 || i >= decisionDelay ) {
+            pulses[  i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDD->Q_Q10[ last_smple_idx ], 10 );
+            xq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND(
+                silk_SMULWW( psDD->Xq_Q14[ last_smple_idx ], delayedGain_Q10[ last_smple_idx ] ), 8 ) );
+            NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay ] = psDD->Shape_Q14[ last_smple_idx ];
+            sLTP_Q15[          NSQ->sLTP_buf_idx     - decisionDelay ] = psDD->Pred_Q15[  last_smple_idx ];
+        }
+        NSQ->sLTP_shp_buf_idx++;
+        NSQ->sLTP_buf_idx++;
+
+        /* Update states */
+        for( k = 0; k < nStatesDelayedDecision; k++ ) {
+            psDD                                     = &psDelDec[ k ];
+            psSS                                     = &psSampleState[ k ][ 0 ];
+            psDD->LF_AR_Q14                          = psSS->LF_AR_Q14;
+            psDD->sLPC_Q14[ NSQ_LPC_BUF_LENGTH + i ] = psSS->xq_Q14;
+            psDD->Xq_Q14[    *smpl_buf_idx ]         = psSS->xq_Q14;
+            psDD->Q_Q10[     *smpl_buf_idx ]         = psSS->Q_Q10;
+            psDD->Pred_Q15[  *smpl_buf_idx ]         = silk_LSHIFT32( psSS->LPC_exc_Q14, 1 );
+            psDD->Shape_Q14[ *smpl_buf_idx ]         = psSS->sLTP_shp_Q14;
+            psDD->Seed                               = silk_ADD32_ovflw( psDD->Seed, silk_RSHIFT_ROUND( psSS->Q_Q10, 10 ) );
+            psDD->RandState[ *smpl_buf_idx ]         = psDD->Seed;
+            psDD->RD_Q10                             = psSS->RD_Q10;
+        }
+        delayedGain_Q10[     *smpl_buf_idx ]         = Gain_Q10;
+    }
+    /* Update LPC states */
+    for( k = 0; k < nStatesDelayedDecision; k++ ) {
+        psDD = &psDelDec[ k ];
+        silk_memcpy( psDD->sLPC_Q14, &psDD->sLPC_Q14[ length ], NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) );
+    }
+}
+
+#endif /* __NSQ_DEL_DEC_MIPSR1_H__ */
diff --git a/silk/mips/macros_mipsr1.h b/silk/mips/macros_mipsr1.h
new file mode 100644
index 0000000..12ed981
--- /dev/null
+++ b/silk/mips/macros_mipsr1.h
@@ -0,0 +1,92 @@
+/***********************************************************************
+Copyright (c) 2006-2011, Skype Limited. All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+- Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+- Neither the name of Internet Society, IETF or IETF Trust, nor the
+names of specific contributors, may be used to endorse or promote
+products derived from this software without specific prior written
+permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+***********************************************************************/
+
+
+#ifndef __SILK_MACROS_MIPSR1_H__
+#define __SILK_MACROS_MIPSR1_H__
+
+#define mips_clz(x) __builtin_clz(x) /* count leading zeros via the GCC builtin; GCC leaves x == 0 undefined */
+
+#undef silk_SMULWB
+/* a * (opus_int16)b >> 16.  The product is held as a 64-bit accumulator
+   value (long long) and the result word is read out with extr_w at shift 16. */
+static inline int silk_SMULWB(int a, int b)
+{
+    /* Only the low 16 bits of b take part, sign-extended back to 32 bits. */
+    const opus_int32 b_low = (opus_int32)(opus_int16)b;
+    const long long acc = __builtin_mips_mult(a, b_low);
+
+    return __builtin_mips_extr_w(acc, 16);
+}
+
+#undef silk_SMLAWB /* drop any earlier definition so the macro below takes over */
+#define silk_SMLAWB(a32, b32, c32)       ((a32) + silk_SMULWB(b32, c32)) /* a32 plus silk_SMULWB(b32, c32) */
+
+#undef silk_SMULWW
+/* (a * b) >> 16 for two 32-bit operands.  The multiply result is kept as a
+   64-bit accumulator value (long long) and the returned word is pulled out
+   of it with a shift of 16. */
+static inline int silk_SMULWW(int a, int b)
+{
+    const long long acc = __builtin_mips_mult(a, b);
+    const int word = __builtin_mips_extr_w(acc, 16);
+
+    return word;
+}
+
+#undef silk_SMLAWW
+/* a + ((b * c) >> 16).  The b * c product goes through a mult followed by
+   extr_w at shift 16; the addend a is applied afterwards as an ordinary
+   int addition, outside the accumulator. */
+static inline int silk_SMLAWW(int a, int b, int c)
+{
+    const long long acc = __builtin_mips_mult(b, c);
+    const int scaled = __builtin_mips_extr_w(acc, 16);
+
+    /* The accumulate step is a plain C addition. */
+    return scaled + a;
+}
+
+#define OVERRIDE_silk_CLZ16
+/* Leading zeros of a 16-bit value, in 0..16.  The input is masked to its low
+   16 bits so a negative in16 is not sign-extended (that would yield -16), and
+   0 is special-cased because GCC leaves __builtin_clz(0) undefined. */
+static inline opus_int32 silk_CLZ16(opus_int16 in16)
+{
+    opus_int32 in32 = (opus_int32)in16 & 0xFFFF;
+    return in32 ? mips_clz(in32) - 16 : 16;
+}
+
+#define OVERRIDE_silk_CLZ32
+/* Leading zeros of a 32-bit value, in 0..32.  Zero is answered explicitly
+   because GCC leaves __builtin_clz(0) undefined. */
+static inline opus_int32 silk_CLZ32(opus_int32 in32)
+{
+    return in32 ? mips_clz(in32) : 32;
+}
+
+#endif /* __SILK_MACROS_MIPSR1_H__ */
diff --git a/silk/mips/sigproc_fix_mipsr1.h b/silk/mips/sigproc_fix_mipsr1.h
new file mode 100644
index 0000000..3b0a695
--- /dev/null
+++ b/silk/mips/sigproc_fix_mipsr1.h
@@ -0,0 +1,65 @@
+/***********************************************************************
+Copyright (c) 2006-2011, Skype Limited. All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+- Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+- Neither the name of Internet Society, IETF or IETF Trust, nor the
+names of specific contributors, may be used to endorse or promote
+products derived from this software without specific prior written
+permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+***********************************************************************/
+
+#ifndef SILK_SIGPROC_FIX_MIPSR1_H
+#define SILK_SIGPROC_FIX_MIPSR1_H
+
+/* No extern "C" block is opened in this header: it only defines static inline
+   functions, which need no linkage specification, and an opening brace here
+   would have to be matched by a closing one before the final #endif to keep
+   C++ translation units that include this file balanced. */
+
+#undef silk_SAT16
+/* Clamp a to the 16-bit range: a saturating left shift by 16 pins
+   out-of-range values in the upper half-word, and shifting back down by 16
+   yields the clamped value. */
+static inline short int silk_SAT16(int a)
+{
+    const int pinned = __builtin_mips_shll_s_w(a, 16);
+    return pinned >> 16;
+}
+
+#undef silk_LSHIFT_SAT32
+/* Left-shift a by shift bits with 32-bit saturation, delegated entirely to
+   the DSP shll_s_w builtin.
+   NOTE(review): the builtin's shift range is presumably 0..31; confirm that
+   callers never pass a value outside it. */
+static inline int silk_LSHIFT_SAT32(int a, int shift)
+{
+    return __builtin_mips_shll_s_w(a, shift);
+}
+
+#undef silk_RSHIFT_ROUND
+/* Right-shift a by shift bits with rounding, done in one step by the DSP
+   shra_r_w builtin (arithmetic shift right with rounding). */
+static inline int silk_RSHIFT_ROUND(int a, int shift)
+{
+    const int rounded = __builtin_mips_shra_r_w(a, shift);
+    return rounded;
+}
+
+#endif /* SILK_SIGPROC_FIX_MIPSR1_H */
diff --git a/silk/quant_LTP_gains.c b/silk/quant_LTP_gains.c
index fd0870d..513a8c4 100644
--- a/silk/quant_LTP_gains.c
+++ b/silk/quant_LTP_gains.c
@@ -36,11 +36,12 @@
     opus_int16                  B_Q14[ MAX_NB_SUBFR * LTP_ORDER ],          /* I/O  (un)quantized LTP gains         */
     opus_int8                   cbk_index[ MAX_NB_SUBFR ],                  /* O    Codebook Index                  */
     opus_int8                   *periodicity_index,                         /* O    Periodicity Index               */
-	opus_int32					*sum_log_gain_Q7,							/* I/O  Cumulative max prediction gain  */
+    opus_int32                  *sum_log_gain_Q7,                           /* I/O  Cumulative max prediction gain  */
     const opus_int32            W_Q18[ MAX_NB_SUBFR*LTP_ORDER*LTP_ORDER ],  /* I    Error Weights in Q18            */
     opus_int                    mu_Q9,                                      /* I    Mu value (R/D tradeoff)         */
     opus_int                    lowComplexity,                              /* I    Flag for low complexity         */
-    const opus_int              nb_subfr                                    /* I    number of subframes             */
+    const opus_int              nb_subfr,                                   /* I    number of subframes             */
+    int                         arch                                        /* I    Run-time architecture           */
 )
 {
     opus_int             j, k, cbk_size;
@@ -51,7 +52,7 @@
     const opus_int16     *b_Q14_ptr;
     const opus_int32     *W_Q18_ptr;
     opus_int32           rate_dist_Q14_subfr, rate_dist_Q14, min_rate_dist_Q14;
-	opus_int32           sum_log_gain_tmp_Q7, best_sum_log_gain_Q7, max_gain_Q7, gain_Q7;
+    opus_int32           sum_log_gain_tmp_Q7, best_sum_log_gain_Q7, max_gain_Q7, gain_Q7;
 
     /***************************************************/
     /* iterate over different codebooks with different */
@@ -74,23 +75,24 @@
         b_Q14_ptr = B_Q14;
 
         rate_dist_Q14 = 0;
-		sum_log_gain_tmp_Q7 = *sum_log_gain_Q7;
+        sum_log_gain_tmp_Q7 = *sum_log_gain_Q7;
         for( j = 0; j < nb_subfr; j++ ) {
-			max_gain_Q7 = silk_log2lin( ( SILK_FIX_CONST( MAX_SUM_LOG_GAIN_DB / 6.0, 7 ) - sum_log_gain_tmp_Q7 ) 
-										+ SILK_FIX_CONST( 7, 7 ) ) - gain_safety;
+            max_gain_Q7 = silk_log2lin( ( SILK_FIX_CONST( MAX_SUM_LOG_GAIN_DB / 6.0, 7 ) - sum_log_gain_tmp_Q7 )
+                                        + SILK_FIX_CONST( 7, 7 ) ) - gain_safety;
 
             silk_VQ_WMat_EC(
                 &temp_idx[ j ],         /* O    index of best codebook vector                           */
                 &rate_dist_Q14_subfr,   /* O    best weighted quantization error + mu * rate            */
-				&gain_Q7,               /* O    sum of absolute LTP coefficients                        */
+                &gain_Q7,               /* O    sum of absolute LTP coefficients                        */
                 b_Q14_ptr,              /* I    input vector to be quantized                            */
                 W_Q18_ptr,              /* I    weighting matrix                                        */
                 cbk_ptr_Q7,             /* I    codebook                                                */
                 cbk_gain_ptr_Q7,        /* I    codebook effective gains                                */
                 cl_ptr_Q5,              /* I    code length for each codebook vector                    */
                 mu_Q9,                  /* I    tradeoff between weighted error and rate                */
-				max_gain_Q7,            /* I    maximum sum of absolute LTP coefficients                */
-                cbk_size                /* I    number of vectors in codebook                           */
+                max_gain_Q7,            /* I    maximum sum of absolute LTP coefficients                */
+                cbk_size,               /* I    number of vectors in codebook                           */
+                arch                    /* I    Run-time architecture                                   */
             );
 
             rate_dist_Q14 = silk_ADD_POS_SAT32( rate_dist_Q14, rate_dist_Q14_subfr );
@@ -108,7 +110,7 @@
             min_rate_dist_Q14 = rate_dist_Q14;
             *periodicity_index = (opus_int8)k;
             silk_memcpy( cbk_index, temp_idx, nb_subfr * sizeof( opus_int8 ) );
-			best_sum_log_gain_Q7 = sum_log_gain_tmp_Q7;
+            best_sum_log_gain_Q7 = sum_log_gain_tmp_Q7;
         }
 
         /* Break early in low-complexity mode if rate distortion is below threshold */
@@ -123,6 +125,5 @@
             B_Q14[ j * LTP_ORDER + k ] = silk_LSHIFT( cbk_ptr_Q7[ cbk_index[ j ] * LTP_ORDER + k ], 7 );
         }
     }
-	*sum_log_gain_Q7 = best_sum_log_gain_Q7;
+    *sum_log_gain_Q7 = best_sum_log_gain_Q7;
 }
-
diff --git a/silk/resampler_rom.c b/silk/resampler_rom.c
index 2d50270..5e6b044 100644
--- a/silk/resampler_rom.c
+++ b/silk/resampler_rom.c
@@ -41,36 +41,36 @@
 
 /* Tables with IIR and FIR coefficients for fractional downsamplers (123 Words) */
 silk_DWORD_ALIGN const opus_int16 silk_Resampler_3_4_COEFS[ 2 + 3 * RESAMPLER_DOWN_ORDER_FIR0 / 2 ] = {
-	-20694, -13867,
-	   -49,     64,     17,   -157,    353,   -496,    163,  11047,  22205,
-	   -39,      6,     91,   -170,    186,     23,   -896,   6336,  19928,
-	   -19,    -36,    102,    -89,    -24,    328,   -951,   2568,  15909,
+    -20694, -13867,
+       -49,     64,     17,   -157,    353,   -496,    163,  11047,  22205,
+       -39,      6,     91,   -170,    186,     23,   -896,   6336,  19928,
+       -19,    -36,    102,    -89,    -24,    328,   -951,   2568,  15909,
 };
 
 silk_DWORD_ALIGN const opus_int16 silk_Resampler_2_3_COEFS[ 2 + 2 * RESAMPLER_DOWN_ORDER_FIR0 / 2 ] = {
-	-14457, -14019,
-	    64,    128,   -122,     36,    310,   -768,    584,   9267,  17733,
-	    12,    128,     18,   -142,    288,   -117,   -865,   4123,  14459,
+    -14457, -14019,
+        64,    128,   -122,     36,    310,   -768,    584,   9267,  17733,
+        12,    128,     18,   -142,    288,   -117,   -865,   4123,  14459,
 };
 
 silk_DWORD_ALIGN const opus_int16 silk_Resampler_1_2_COEFS[ 2 + RESAMPLER_DOWN_ORDER_FIR1 / 2 ] = {
-	   616, -14323,
-	   -10,     39,     58,    -46,    -84,    120,    184,   -315,   -541,   1284,   5380,   9024,
+       616, -14323,
+       -10,     39,     58,    -46,    -84,    120,    184,   -315,   -541,   1284,   5380,   9024,
 };
 
 silk_DWORD_ALIGN const opus_int16 silk_Resampler_1_3_COEFS[ 2 + RESAMPLER_DOWN_ORDER_FIR2 / 2 ] = {
-	 16102, -15162,
-	   -13,      0,     20,     26,      5,    -31,    -43,     -4,     65,     90,      7,   -157,   -248,    -44,    593,   1583,   2612,   3271,
+     16102, -15162,
+       -13,      0,     20,     26,      5,    -31,    -43,     -4,     65,     90,      7,   -157,   -248,    -44,    593,   1583,   2612,   3271,
 };
 
 silk_DWORD_ALIGN const opus_int16 silk_Resampler_1_4_COEFS[ 2 + RESAMPLER_DOWN_ORDER_FIR2 / 2 ] = {
-	 22500, -15099,
-	     3,    -14,    -20,    -15,      2,     25,     37,     25,    -16,    -71,   -107,    -79,     50,    292,    623,    982,   1288,   1464,
+     22500, -15099,
+         3,    -14,    -20,    -15,      2,     25,     37,     25,    -16,    -71,   -107,    -79,     50,    292,    623,    982,   1288,   1464,
 };
 
 silk_DWORD_ALIGN const opus_int16 silk_Resampler_1_6_COEFS[ 2 + RESAMPLER_DOWN_ORDER_FIR2 / 2 ] = {
-	 27540, -15257,
-	    17,     12,      8,      1,    -10,    -22,    -30,    -32,    -22,      3,     44,    100,    168,    243,    317,    381,    429,    455,
+     27540, -15257,
+        17,     12,      8,      1,    -10,    -22,    -30,    -32,    -22,      3,     44,    100,    168,    243,    317,    381,    429,    455,
 };
 
 silk_DWORD_ALIGN const opus_int16 silk_Resampler_2_3_COEFS_LQ[ 2 + 2 * 2 ] = {
@@ -81,16 +81,16 @@
 
 /* Table with interplation fractions of 1/24, 3/24, 5/24, ... , 23/24 : 23/24 (46 Words) */
 silk_DWORD_ALIGN const opus_int16 silk_resampler_frac_FIR_12[ 12 ][ RESAMPLER_ORDER_FIR_12 / 2 ] = {
-	{  189,  -600,   617, 30567 },
-	{  117,  -159, -1070, 29704 },
-	{   52,   221, -2392, 28276 },
-	{   -4,   529, -3350, 26341 },
-	{  -48,   758, -3956, 23973 },
-	{  -80,   905, -4235, 21254 },
-	{  -99,   972, -4222, 18278 },
-	{ -107,   967, -3957, 15143 },
-	{ -103,   896, -3487, 11950 },
-	{  -91,   773, -2865,  8798 },
-	{  -71,   611, -2143,  5784 },
-	{  -46,   425, -1375,  2996 },
+    {  189,  -600,   617, 30567 },
+    {  117,  -159, -1070, 29704 },
+    {   52,   221, -2392, 28276 },
+    {   -4,   529, -3350, 26341 },
+    {  -48,   758, -3956, 23973 },
+    {  -80,   905, -4235, 21254 },
+    {  -99,   972, -4222, 18278 },
+    { -107,   967, -3957, 15143 },
+    { -103,   896, -3487, 11950 },
+    {  -91,   773, -2865,  8798 },
+    {  -71,   611, -2143,  5784 },
+    {  -46,   425, -1375,  2996 },
 };
diff --git a/silk/shell_coder.c b/silk/shell_coder.c
index 796f57d..4af3414 100644
--- a/silk/shell_coder.c
+++ b/silk/shell_coder.c
@@ -58,8 +58,8 @@
 }
 
 static OPUS_INLINE void decode_split(
-    opus_int                    *p_child1,      /* O    pulse amplitude of first child subframe     */
-    opus_int                    *p_child2,      /* O    pulse amplitude of second child subframe    */
+    opus_int16                  *p_child1,      /* O    pulse amplitude of first child subframe     */
+    opus_int16                  *p_child2,      /* O    pulse amplitude of second child subframe    */
     ec_dec                      *psRangeDec,    /* I/O  Compressor data structure                   */
     const opus_int              p,              /* I    pulse amplitude of current subframe         */
     const opus_uint8            *shell_table    /* I    table of shell cdfs                         */
@@ -117,12 +117,12 @@
 
 /* Shell decoder, operates on one shell code frame of 16 pulses */
 void silk_shell_decoder(
-    opus_int                    *pulses0,                       /* O    data: nonnegative pulse amplitudes          */
+    opus_int16                  *pulses0,                       /* O    data: nonnegative pulse amplitudes          */
     ec_dec                      *psRangeDec,                    /* I/O  Compressor data structure                   */
     const opus_int              pulses4                         /* I    number of pulses per pulse-subframe         */
 )
 {
-    opus_int pulses3[ 2 ], pulses2[ 4 ], pulses1[ 8 ];
+    opus_int16 pulses3[ 2 ], pulses2[ 4 ], pulses1[ 8 ];
 
     /* this function operates on one shell code frame of 16 pulses */
     silk_assert( SHELL_CODEC_FRAME_LENGTH == 16 );
diff --git a/silk/structs.h b/silk/structs.h
index 1826b36..827829d 100644
--- a/silk/structs.h
+++ b/silk/structs.h
@@ -171,7 +171,7 @@
     opus_int32                   pitchEstimationThreshold_Q16;      /* Threshold for pitch estimator                                    */
     opus_int                     LTPQuantLowComplexity;             /* Flag for low complexity LTP quantization                         */
     opus_int                     mu_LTP_Q9;                         /* Rate-distortion tradeoff in LTP quantization                     */
-    opus_int32                   sum_log_gain_Q7;					/* Cumulative max prediction gain									*/
+    opus_int32                   sum_log_gain_Q7;                   /* Cumulative max prediction gain                                   */
     opus_int                     NLSF_MSVQ_Survivors;               /* Number of survivors in NLSF MSVQ                                 */
     opus_int                     first_frame_after_reset;           /* Flag for deactivating NLSF interpolation, pitch prediction       */
     opus_int                     controlled_since_last_payload;     /* Flag for ensuring codec_control only runs once per packet        */
diff --git a/silk/sum_sqr_shift.c b/silk/sum_sqr_shift.c
index 12514c9..129df19 100644
--- a/silk/sum_sqr_shift.c
+++ b/silk/sum_sqr_shift.c
@@ -53,6 +53,7 @@
             /* Scale down */
             nrg = (opus_int32)silk_RSHIFT_uint( (opus_uint32)nrg, 2 );
             shft = 2;
+            i+=2;
             break;
         }
     }
diff --git a/silk/tables.h b/silk/tables.h
index a91431e..7fea6fd 100644
--- a/silk/tables.h
+++ b/silk/tables.h
@@ -47,8 +47,8 @@
 extern const opus_uint8  silk_pitch_contour_10_ms_iCDF[ 12 ];                                       /*  12 */
 extern const opus_uint8  silk_pitch_contour_10_ms_NB_iCDF[ 3 ];                                     /*   3 */
 
-extern const opus_uint8  silk_pulses_per_block_iCDF[ N_RATE_LEVELS ][ MAX_PULSES + 2 ];             /* 180 */
-extern const opus_uint8  silk_pulses_per_block_BITS_Q5[ N_RATE_LEVELS - 1 ][ MAX_PULSES + 2 ];      /* 162 */
+extern const opus_uint8  silk_pulses_per_block_iCDF[ N_RATE_LEVELS ][ SILK_MAX_PULSES + 2 ];        /* 180 */
+extern const opus_uint8  silk_pulses_per_block_BITS_Q5[ N_RATE_LEVELS - 1 ][ SILK_MAX_PULSES + 2 ]; /* 162 */
 
 extern const opus_uint8  silk_rate_levels_iCDF[ 2 ][ N_RATE_LEVELS - 1 ];                           /*  18 */
 extern const opus_uint8  silk_rate_levels_BITS_Q5[ 2 ][ N_RATE_LEVELS - 1 ];                        /*  18 */
@@ -59,7 +59,7 @@
 extern const opus_uint8  silk_shell_code_table1[ 152 ];                                             /* 152 */
 extern const opus_uint8  silk_shell_code_table2[ 152 ];                                             /* 152 */
 extern const opus_uint8  silk_shell_code_table3[ 152 ];                                             /* 152 */
-extern const opus_uint8  silk_shell_code_table_offsets[ MAX_PULSES + 1 ];                           /*  17 */
+extern const opus_uint8  silk_shell_code_table_offsets[ SILK_MAX_PULSES + 1 ];                      /*  17 */
 
 extern const opus_uint8  silk_lsb_iCDF[ 2 ];                                                        /*   2 */
 
diff --git a/silk/tuning_parameters.h b/silk/tuning_parameters.h
index e1057bb..5b8f404 100644
--- a/silk/tuning_parameters.h
+++ b/silk/tuning_parameters.h
@@ -64,7 +64,7 @@
 #define MU_LTP_QUANT_WB                                 0.02f
 
 /* Max cumulative LTP gain */
-#define MAX_SUM_LOG_GAIN_DB								250.0f
+#define MAX_SUM_LOG_GAIN_DB                             250.0f
 
 /***********************/
 /* High pass filtering */
diff --git a/silk/x86/NSQ_del_dec_sse.c b/silk/x86/NSQ_del_dec_sse.c
new file mode 100644
index 0000000..21d4a8b
--- /dev/null
+++ b/silk/x86/NSQ_del_dec_sse.c
@@ -0,0 +1,857 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+   Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#include <smmintrin.h>
+#include "main.h"
+#include "celt/x86/x86cpu.h"
+
+#include "stack_alloc.h"
+
+typedef struct { /* One candidate path ("state") of the delayed-decision quantizer; arrays of DECISION_DELAY are indexed modulo DECISION_DELAY_MASK */
+    opus_int32 sLPC_Q14[ MAX_SUB_FRAME_LENGTH + NSQ_LPC_BUF_LENGTH ]; /* Short-term prediction state; first NSQ_LPC_BUF_LENGTH entries are seeded from NSQ->sLPC_Q14 */
+    opus_int32 RandState[ DECISION_DELAY ]; /* NOTE(review): presumably the dither state per delayed sample - confirm */
+    opus_int32 Q_Q10[     DECISION_DELAY ]; /* Delayed quantized values; rounded down by 10 bits into pulses[] */
+    opus_int32 Xq_Q14[    DECISION_DELAY ]; /* Delayed output samples; scaled by the gain and saturated into xq[] */
+    opus_int32 Pred_Q15[  DECISION_DELAY ]; /* NOTE(review): presumably delayed prediction values - confirm */
+    opus_int32 Shape_Q14[ DECISION_DELAY ]; /* Delayed shaping samples; copied into NSQ->sLTP_shp_Q14 */
+    opus_int32 sAR2_Q14[ MAX_SHAPE_LPC_ORDER ]; /* Noise shaping filter state; copied from and back to NSQ->sAR2_Q14 */
+    opus_int32 LF_AR_Q14; /* Copied from and back to NSQ->sLF_AR_shp_Q14 */
+    opus_int32 Seed; /* Running dither seed, advanced with silk_RAND() for every sample */
+    opus_int32 SeedInit; /* Copy of Seed taken at initialization; the winner's value is stored in psIndices->Seed */
+    opus_int32 RD_Q10; /* Rate-distortion measure; the state with the lowest value is the winner */
+} NSQ_del_dec_struct;
+
+typedef struct { /* Candidate sample record used by the per-subframe quantizer. NOTE(review): field notes below are inferred from names - confirm against the quantizer body */
+    opus_int32 Q_Q10;        /* presumably the candidate quantization value, Q10 */
+    opus_int32 RD_Q10;       /* presumably the rate-distortion measure of this candidate, Q10 */
+    opus_int32 xq_Q14;       /* presumably the resulting output sample, Q14 */
+    opus_int32 LF_AR_Q14;    /* presumably the value that becomes NSQ_del_dec_struct.LF_AR_Q14 */
+    opus_int32 sLTP_shp_Q14; /* presumably the value that becomes a Shape_Q14 entry */
+    opus_int32 LPC_exc_Q14;  /* presumably the LPC excitation for this candidate, Q14 */
+} NSQ_sample_struct;
+
+typedef NSQ_sample_struct  NSQ_sample_pair[ 2 ]; /* Two sample records per delayed-decision state; presumably one per quantization level candidate - confirm */
+
+static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(
+    const silk_encoder_state *psEncC,               /* I    Encoder State                       */
+    silk_nsq_state      *NSQ,                       /* I/O  NSQ state                           */
+    NSQ_del_dec_struct  psDelDec[],                 /* I/O  Delayed decision states             */
+    const opus_int32    x_Q3[],                     /* I    Input in Q3                         */
+    opus_int32          x_sc_Q10[],                 /* O    Input scaled with 1/Gain in Q10     */
+    const opus_int16    sLTP[],                     /* I    Re-whitened LTP state in Q0         */
+    opus_int32          sLTP_Q15[],                 /* O    LTP state matching scaled input     */
+    opus_int            subfr,                      /* I    Subframe number                     */
+    opus_int            nStatesDelayedDecision,     /* I    Number of del dec states            */
+    const opus_int      LTP_scale_Q14,              /* I    LTP state scaling                   */
+    const opus_int32    Gains_Q16[ MAX_NB_SUBFR ],  /* I    Quantization step sizes             */
+    const opus_int      pitchL[ MAX_NB_SUBFR ],     /* I    Pitch lag                           */
+    const opus_int      signal_type,                /* I    Signal type                         */
+    const opus_int      decisionDelay               /* I    Decision delay                      */
+);
+
+/******************************************/
+/* Noise shape quantizer for one subframe */
+/******************************************/
+static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
+    silk_nsq_state      *NSQ,                   /* I/O  NSQ state                           */
+    NSQ_del_dec_struct  psDelDec[],             /* I/O  Delayed decision states             */
+    opus_int            signalType,             /* I    Signal type                         */
+    const opus_int32    x_Q10[],                /* I    Input scaled with 1/Gain in Q10     */
+    opus_int8           pulses[],               /* O    Quantized pulse signal              */
+    opus_int16          xq[],                   /* O    Quantized speech signal             */
+    opus_int32          sLTP_Q15[],             /* I/O  LTP filter state                    */
+    opus_int32          delayedGain_Q10[],      /* I/O  Gain delay buffer                   */
+    const opus_int16    a_Q12[],                /* I    Short term prediction coefs         */
+    const opus_int16    b_Q14[],                /* I    Long term prediction coefs          */
+    const opus_int16    AR_shp_Q13[],           /* I    Noise shaping coefs                 */
+    opus_int            lag,                    /* I    Pitch lag                           */
+    opus_int32          HarmShapeFIRPacked_Q14, /* I    Packed long term shaping FIR coefs  */
+    opus_int            Tilt_Q14,               /* I    Spectral tilt                       */
+    opus_int32          LF_shp_Q14,             /* I    Low frequency shaping coefs         */
+    opus_int32          Gain_Q16,               /* I    Quantization step size              */
+    opus_int            Lambda_Q10,             /* I    Rate/distortion tradeoff            */
+    opus_int            offset_Q10,             /* I    Quantization offset                 */
+    opus_int            length,                 /* I    Input length                        */
+    opus_int            subfr,                  /* I    Subframe number                     */
+    opus_int            shapingLPCOrder,        /* I    Shaping LPC filter order            */
+    opus_int            predictLPCOrder,        /* I    Prediction filter order             */
+    opus_int            warping_Q16,            /* I    Allpass warping coefficient         */
+    opus_int            nStatesDelayedDecision, /* I    Number of states in decision tree   */
+    opus_int            *smpl_buf_idx,          /* I    Index to newest samples in buffers  */
+    opus_int            decisionDelay           /* I    Decision delay                      */
+);
+
+void silk_NSQ_del_dec_sse4_1( /* Delayed-decision noise shaping quantizer for one frame (all nb_subfr subframes), SSE4.1 variant */
+    const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */
+    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */
+    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */
+    const opus_int32            x_Q3[],                                     /* I    Prefiltered input signal        */
+    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */
+    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */
+    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */
+    const opus_int16            AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs             */
+    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */
+    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */
+    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */
+    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */
+    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */
+    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */
+    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */
+)
+{
+    opus_int            i, k, lag, start_idx, LSF_interpolation_flag, Winner_ind, subfr;
+    opus_int            last_smple_idx, smpl_buf_idx, decisionDelay;
+    const opus_int16    *A_Q12, *B_Q14, *AR_shp_Q13;
+    opus_int16          *pxq;
+    VARDECL( opus_int32, sLTP_Q15 );
+    VARDECL( opus_int16, sLTP );
+    opus_int32          HarmShapeFIRPacked_Q14;
+    opus_int            offset_Q10;
+    opus_int32          RDmin_Q10, Gain_Q10;
+    VARDECL( opus_int32, x_sc_Q10 );
+    VARDECL( opus_int32, delayedGain_Q10 );
+    VARDECL( NSQ_del_dec_struct, psDelDec );
+    NSQ_del_dec_struct  *psDD;
+    SAVE_STACK;
+
+    /* Set unvoiced lag to the previous one, overwrite later for voiced */
+    lag = NSQ->lagPrev;
+
+    silk_assert( NSQ->prev_gain_Q16 != 0 );
+
+    /* Initialize delayed decision states: each one starts from the same NSQ filter state but with its own seed */
+    ALLOC( psDelDec, psEncC->nStatesDelayedDecision, NSQ_del_dec_struct );
+    silk_memset( psDelDec, 0, psEncC->nStatesDelayedDecision * sizeof( NSQ_del_dec_struct ) );
+    for( k = 0; k < psEncC->nStatesDelayedDecision; k++ ) {
+        psDD                 = &psDelDec[ k ];
+        psDD->Seed           = ( k + psIndices->Seed ) & 3;
+        psDD->SeedInit       = psDD->Seed;
+        psDD->RD_Q10         = 0;
+        psDD->LF_AR_Q14      = NSQ->sLF_AR_shp_Q14;
+        psDD->Shape_Q14[ 0 ] = NSQ->sLTP_shp_Q14[ psEncC->ltp_mem_length - 1 ];
+        silk_memcpy( psDD->sLPC_Q14, NSQ->sLPC_Q14, NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) );
+        silk_memcpy( psDD->sAR2_Q14, NSQ->sAR2_Q14, sizeof( NSQ->sAR2_Q14 ) );
+    }
+
+    offset_Q10   = silk_Quantization_Offsets_Q10[ psIndices->signalType >> 1 ][ psIndices->quantOffsetType ];
+    smpl_buf_idx = 0; /* index of oldest samples */
+
+    decisionDelay = silk_min_int( DECISION_DELAY, psEncC->subfr_length );
+
+    /* For voiced frames limit the decision delay to lower than the pitch lag */
+    if( psIndices->signalType == TYPE_VOICED ) {
+        for( k = 0; k < psEncC->nb_subfr; k++ ) {
+            decisionDelay = silk_min_int( decisionDelay, pitchL[ k ] - LTP_ORDER / 2 - 1 );
+        }
+    } else {
+        if( lag > 0 ) {
+            decisionDelay = silk_min_int( decisionDelay, lag - LTP_ORDER / 2 - 1 );
+        }
+    }
+
+    if( psIndices->NLSFInterpCoef_Q2 == 4 ) {
+        LSF_interpolation_flag = 0; /* every subframe then uses the second coefficient set of PredCoef_Q12 */
+    } else {
+        LSF_interpolation_flag = 1; /* subframe k then uses coefficient set ( k >> 1 ) of PredCoef_Q12 */
+    }
+
+    ALLOC( sLTP_Q15,
+           psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 );
+    ALLOC( sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16 );
+    ALLOC( x_sc_Q10, psEncC->subfr_length, opus_int32 );
+    ALLOC( delayedGain_Q10, DECISION_DELAY, opus_int32 );
+    /* Set up pointers to start of sub frame */
+    pxq                   = &NSQ->xq[ psEncC->ltp_mem_length ];
+    NSQ->sLTP_shp_buf_idx = psEncC->ltp_mem_length;
+    NSQ->sLTP_buf_idx     = psEncC->ltp_mem_length;
+    subfr = 0;
+    for( k = 0; k < psEncC->nb_subfr; k++ ) {
+        A_Q12      = &PredCoef_Q12[ ( ( k >> 1 ) | ( 1 - LSF_interpolation_flag ) ) * MAX_LPC_ORDER ];
+        B_Q14      = &LTPCoef_Q14[ k * LTP_ORDER           ];
+        AR_shp_Q13 = &AR2_Q13[     k * MAX_SHAPE_LPC_ORDER ];
+
+        /* Noise shape parameters: gain >> 2 goes in the low half, gain >> 1 is shifted up by 16 bits into the high half */
+        silk_assert( HarmShapeGain_Q14[ k ] >= 0 );
+        HarmShapeFIRPacked_Q14  =                          silk_RSHIFT( HarmShapeGain_Q14[ k ], 2 );
+        HarmShapeFIRPacked_Q14 |= silk_LSHIFT( (opus_int32)silk_RSHIFT( HarmShapeGain_Q14[ k ], 1 ), 16 );
+
+        NSQ->rewhite_flag = 0;
+        if( psIndices->signalType == TYPE_VOICED ) {
+            /* Voiced */
+            lag = pitchL[ k ];
+
+            /* Re-whitening: mask is 1 when interpolating (every even k) and 3 otherwise (k multiple of 4) */
+            if( ( k & ( 3 - silk_LSHIFT( LSF_interpolation_flag, 1 ) ) ) == 0 ) {
+                if( k == 2 ) {
+                    /* RESET DELAYED DECISIONS */
+                    /* Find winner (state with the lowest RD_Q10) */
+                    RDmin_Q10 = psDelDec[ 0 ].RD_Q10;
+                    Winner_ind = 0;
+                    for( i = 1; i < psEncC->nStatesDelayedDecision; i++ ) {
+                        if( psDelDec[ i ].RD_Q10 < RDmin_Q10 ) {
+                            RDmin_Q10 = psDelDec[ i ].RD_Q10;
+                            Winner_ind = i;
+                        }
+                    }
+                    for( i = 0; i < psEncC->nStatesDelayedDecision; i++ ) {
+                        if( i != Winner_ind ) {
+                            psDelDec[ i ].RD_Q10 += ( silk_int32_MAX >> 4 ); /* penalize every losing state */
+                            silk_assert( psDelDec[ i ].RD_Q10 >= 0 );
+                        }
+                    }
+
+                    /* Copy final part of signals from winner state to output and long-term filter states */
+                    psDD = &psDelDec[ Winner_ind ];
+                    last_smple_idx = smpl_buf_idx + decisionDelay;
+                    for( i = 0; i < decisionDelay; i++ ) {
+                        last_smple_idx = ( last_smple_idx - 1 ) & DECISION_DELAY_MASK;
+                        pulses[   i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDD->Q_Q10[ last_smple_idx ], 10 );
+                        pxq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND(
+                            silk_SMULWW( psDD->Xq_Q14[ last_smple_idx ], Gains_Q16[ 1 ] ), 14 ) ); /* k == 2 here, so this is the gain of the preceding subframe */
+                        NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay + i ] = psDD->Shape_Q14[ last_smple_idx ];
+                    }
+
+                    subfr = 0; /* restart the subframe counter handed to the quantizer */
+                }
+
+                /* Rewhiten with new A coefs */
+                start_idx = psEncC->ltp_mem_length - lag - psEncC->predictLPCOrder - LTP_ORDER / 2;
+                silk_assert( start_idx > 0 );
+
+                silk_LPC_analysis_filter( &sLTP[ start_idx ], &NSQ->xq[ start_idx + k * psEncC->subfr_length ],
+                    A_Q12, psEncC->ltp_mem_length - start_idx, psEncC->predictLPCOrder, psEncC->arch );
+
+                NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;
+                NSQ->rewhite_flag = 1;
+            }
+        }
+
+        silk_nsq_del_dec_scale_states_sse4_1( psEncC, NSQ, psDelDec, x_Q3, x_sc_Q10, sLTP, sLTP_Q15, k,
+            psEncC->nStatesDelayedDecision, LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType, decisionDelay );
+
+        silk_noise_shape_quantizer_del_dec_sse4_1( NSQ, psDelDec, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15,
+            delayedGain_Q10, A_Q12, B_Q14, AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ],
+            Gains_Q16[ k ], Lambda_Q10, offset_Q10, psEncC->subfr_length, subfr++, psEncC->shapingLPCOrder,
+            psEncC->predictLPCOrder, psEncC->warping_Q16, psEncC->nStatesDelayedDecision, &smpl_buf_idx, decisionDelay );
+
+        x_Q3   += psEncC->subfr_length;
+        pulses += psEncC->subfr_length;
+        pxq    += psEncC->subfr_length;
+    }
+
+    /* Find winner (state with the lowest RD_Q10) */
+    RDmin_Q10 = psDelDec[ 0 ].RD_Q10;
+    Winner_ind = 0;
+    for( k = 1; k < psEncC->nStatesDelayedDecision; k++ ) {
+        if( psDelDec[ k ].RD_Q10 < RDmin_Q10 ) {
+            RDmin_Q10 = psDelDec[ k ].RD_Q10;
+            Winner_ind = k;
+        }
+    }
+
+    /* Copy final part of signals from winner state to output and long-term filter states */
+    psDD = &psDelDec[ Winner_ind ];
+    psIndices->Seed = psDD->SeedInit;
+    last_smple_idx = smpl_buf_idx + decisionDelay;
+    Gain_Q10 = silk_RSHIFT32( Gains_Q16[ psEncC->nb_subfr - 1 ], 6 );
+    for( i = 0; i < decisionDelay; i++ ) {
+        last_smple_idx = ( last_smple_idx - 1 ) & DECISION_DELAY_MASK;
+        pulses[   i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDD->Q_Q10[ last_smple_idx ], 10 );
+        pxq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND(
+            silk_SMULWW( psDD->Xq_Q14[ last_smple_idx ], Gain_Q10 ), 8 ) );
+        NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay + i ] = psDD->Shape_Q14[ last_smple_idx ];
+    }
+    silk_memcpy( NSQ->sLPC_Q14, &psDD->sLPC_Q14[ psEncC->subfr_length ], NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) );
+    silk_memcpy( NSQ->sAR2_Q14, psDD->sAR2_Q14, sizeof( psDD->sAR2_Q14 ) );
+
+    /* Update states */
+    NSQ->sLF_AR_shp_Q14 = psDD->LF_AR_Q14;
+    NSQ->lagPrev        = pitchL[ psEncC->nb_subfr - 1 ];
+
+    /* Save quantized speech signal: move ltp_mem_length samples starting at offset frame_length to the front of each buffer */
+    /* DEBUG_STORE_DATA( enc.pcm, &NSQ->xq[psEncC->ltp_mem_length], psEncC->frame_length * sizeof( opus_int16 ) ) */
+    silk_memmove( NSQ->xq,           &NSQ->xq[           psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int16 ) );
+    silk_memmove( NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int32 ) );
+    RESTORE_STACK;
+}
+
+/******************************************/
+/* Noise shape quantizer for one subframe */
+/******************************************/
+static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
+    silk_nsq_state      *NSQ,                   /* I/O  NSQ state                           */
+    NSQ_del_dec_struct  psDelDec[],             /* I/O  Delayed decision states             */
+    opus_int            signalType,             /* I    Signal type                         */
+    const opus_int32    x_Q10[],                /* I                                        */
+    opus_int8           pulses[],               /* O                                        */
+    opus_int16          xq[],                   /* O                                        */
+    opus_int32          sLTP_Q15[],             /* I/O  LTP filter state                    */
+    opus_int32          delayedGain_Q10[],      /* I/O  Gain delay buffer                   */
+    const opus_int16    a_Q12[],                /* I    Short term prediction coefs         */
+    const opus_int16    b_Q14[],                /* I    Long term prediction coefs          */
+    const opus_int16    AR_shp_Q13[],           /* I    Noise shaping coefs                 */
+    opus_int            lag,                    /* I    Pitch lag                           */
+    opus_int32          HarmShapeFIRPacked_Q14, /* I                                        */
+    opus_int            Tilt_Q14,               /* I    Spectral tilt                       */
+    opus_int32          LF_shp_Q14,             /* I                                        */
+    opus_int32          Gain_Q16,               /* I                                        */
+    opus_int            Lambda_Q10,             /* I                                        */
+    opus_int            offset_Q10,             /* I                                        */
+    opus_int            length,                 /* I    Input length                        */
+    opus_int            subfr,                  /* I    Subframe number                     */
+    opus_int            shapingLPCOrder,        /* I    Shaping LPC filter order            */
+    opus_int            predictLPCOrder,        /* I    Prediction filter order             */
+    opus_int            warping_Q16,            /* I                                        */
+    opus_int            nStatesDelayedDecision, /* I    Number of states in decision tree   */
+    opus_int            *smpl_buf_idx,          /* I    Index to newest samples in buffers  */
+    opus_int            decisionDelay           /* I                                        */
+)
+{
+    opus_int     i, j, k, Winner_ind, RDmin_ind, RDmax_ind, last_smple_idx;
+    opus_int32   Winner_rand_state;
+    opus_int32   LTP_pred_Q14, LPC_pred_Q14, n_AR_Q14, n_LTP_Q14;
+    opus_int32   n_LF_Q14, r_Q10, rr_Q10, rd1_Q10, rd2_Q10, RDmin_Q10, RDmax_Q10;
+    opus_int32   q1_Q0, q1_Q10, q2_Q10, exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10;
+    opus_int32   tmp1, tmp2, sLF_AR_shp_Q14;
+    opus_int32   *pred_lag_ptr, *shp_lag_ptr, *psLPC_Q14;
+    VARDECL( NSQ_sample_pair, psSampleState );
+    NSQ_del_dec_struct *psDD;
+    NSQ_sample_struct  *psSS;
+
+    __m128i a_Q12_0123, a_Q12_4567, a_Q12_89AB, a_Q12_CDEF;
+    __m128i b_Q12_0123, b_sr_Q12_0123;
+    SAVE_STACK;
+
+    silk_assert( nStatesDelayedDecision > 0 );
+    ALLOC( psSampleState, nStatesDelayedDecision, NSQ_sample_pair );
+
+    shp_lag_ptr  = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ];
+    pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ];
+    Gain_Q10     = silk_RSHIFT( Gain_Q16, 6 );
+
+    a_Q12_0123 = OP_CVTEPI16_EPI32_M64( a_Q12 );
+    a_Q12_4567 = OP_CVTEPI16_EPI32_M64( a_Q12 + 4 );
+
+    if( opus_likely( predictLPCOrder == 16 ) ) {
+        a_Q12_89AB = OP_CVTEPI16_EPI32_M64( a_Q12 + 8 );
+        a_Q12_CDEF = OP_CVTEPI16_EPI32_M64( a_Q12 + 12 );
+    }
+
+    if( signalType == TYPE_VOICED ){
+        b_Q12_0123 = OP_CVTEPI16_EPI32_M64( b_Q14 );
+        b_sr_Q12_0123 = _mm_shuffle_epi32( b_Q12_0123, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
+    }
+    for( i = 0; i < length; i++ ) {
+        /* Perform common calculations used in all states */
+
+        /* Long-term prediction */
+        if( signalType == TYPE_VOICED ) {
+            /* Unrolled loop */
+            /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
+            LTP_pred_Q14 = 2;
+            {
+                __m128i tmpa, tmpb, pred_lag_ptr_tmp;
+                pred_lag_ptr_tmp    = _mm_loadu_si128( (__m128i *)(&pred_lag_ptr[ -3 ] ) );
+                pred_lag_ptr_tmp    = _mm_shuffle_epi32( pred_lag_ptr_tmp, 0x1B );
+                tmpa                = _mm_mul_epi32( pred_lag_ptr_tmp, b_Q12_0123 );
+                tmpa                = _mm_srli_si128( tmpa, 2 );
+
+                pred_lag_ptr_tmp = _mm_shuffle_epi32( pred_lag_ptr_tmp, _MM_SHUFFLE( 0, 3, 2, 1 ) );/* equal shift right 4 bytes */
+                pred_lag_ptr_tmp    = _mm_mul_epi32( pred_lag_ptr_tmp, b_sr_Q12_0123 );
+                pred_lag_ptr_tmp    = _mm_srli_si128( pred_lag_ptr_tmp, 2 );
+                pred_lag_ptr_tmp    = _mm_add_epi32( pred_lag_ptr_tmp, tmpa );
+
+                tmpb = _mm_shuffle_epi32( pred_lag_ptr_tmp, _MM_SHUFFLE( 0, 0, 3, 2 ) );/* equal shift right 8 bytes */
+                pred_lag_ptr_tmp    = _mm_add_epi32( pred_lag_ptr_tmp, tmpb );
+                LTP_pred_Q14        += _mm_cvtsi128_si32( pred_lag_ptr_tmp );
+
+                LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ -4 ], b_Q14[ 4 ] );
+                LTP_pred_Q14 = silk_LSHIFT( LTP_pred_Q14, 1 );                          /* Q13 -> Q14 */
+                pred_lag_ptr++;
+            }
+        } else {
+            LTP_pred_Q14 = 0;
+        }
+
+        /* Long-term shaping */
+        if( lag > 0 ) {
+            /* Symmetric, packed FIR coefficients */
+            n_LTP_Q14 = silk_SMULWB( silk_ADD32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
+            n_LTP_Q14 = silk_SMLAWT( n_LTP_Q14, shp_lag_ptr[ -1 ],                      HarmShapeFIRPacked_Q14 );
+            n_LTP_Q14 = silk_SUB_LSHIFT32( LTP_pred_Q14, n_LTP_Q14, 2 );            /* Q12 -> Q14 */
+            shp_lag_ptr++;
+        } else {
+            n_LTP_Q14 = 0;
+        }
+        {
+            __m128i tmpa, tmpb, psLPC_Q14_tmp, a_Q12_tmp;
+
+            for( k = 0; k < nStatesDelayedDecision; k++ ) {
+                /* Delayed decision state */
+                psDD = &psDelDec[ k ];
+
+                /* Sample state */
+                psSS = psSampleState[ k ];
+
+                /* Generate dither */
+                psDD->Seed = silk_RAND( psDD->Seed );
+
+                /* Pointer used in short term prediction and shaping */
+                psLPC_Q14 = &psDD->sLPC_Q14[ NSQ_LPC_BUF_LENGTH - 1 + i ];
+                /* Short-term prediction */
+                silk_assert( predictLPCOrder == 10 || predictLPCOrder == 16 );
+                /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
+                LPC_pred_Q14 = silk_RSHIFT( predictLPCOrder, 1 );
+
+                tmpb = _mm_setzero_si128();
+
+                /* step 1 */
+                psLPC_Q14_tmp   = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -3 ] ) ); /* -3, -2 , -1, 0 */
+                psLPC_Q14_tmp   = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B );      /* 0, -1, -2, -3 */
+                tmpa            = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_0123 );    /* 0, -1, -2, -3 * 0123 -> 0*0, 2*-2 */
+
+                tmpa            = _mm_srli_epi64( tmpa, 16 );
+                tmpb            = _mm_add_epi32( tmpb, tmpa );
+
+                psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
+                a_Q12_tmp = _mm_shuffle_epi32( a_Q12_0123, _MM_SHUFFLE(0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
+                psLPC_Q14_tmp   = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_tmp ); /* 1*-1, 3*-3 */
+                psLPC_Q14_tmp   = _mm_srli_epi64( psLPC_Q14_tmp, 16 );
+                tmpb            = _mm_add_epi32( tmpb, psLPC_Q14_tmp );
+
+                /* step 2 */
+                psLPC_Q14_tmp   = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -7 ] ) );
+                psLPC_Q14_tmp   = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B );
+                tmpa            = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_4567 );
+                tmpa            = _mm_srli_epi64( tmpa, 16 );
+                tmpb            = _mm_add_epi32( tmpb, tmpa );
+
+                psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
+                a_Q12_tmp = _mm_shuffle_epi32( a_Q12_4567, _MM_SHUFFLE(0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
+                psLPC_Q14_tmp   = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_tmp );
+                psLPC_Q14_tmp   = _mm_srli_epi64( psLPC_Q14_tmp, 16 );
+                tmpb            = _mm_add_epi32( tmpb, psLPC_Q14_tmp );
+
+                if ( opus_likely( predictLPCOrder == 16 ) )
+                {
+                    /* step 3 */
+                    psLPC_Q14_tmp   = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -11 ] ) );
+                    psLPC_Q14_tmp   = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B );
+                    tmpa            = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_89AB );
+                    tmpa            = _mm_srli_epi64( tmpa, 16 );
+                    tmpb            = _mm_add_epi32( tmpb, tmpa );
+
+                    psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
+                    a_Q12_tmp = _mm_shuffle_epi32( a_Q12_89AB, _MM_SHUFFLE(0, 3, 2, 1 ) );/* equal shift right 4 bytes */
+                    psLPC_Q14_tmp   = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_tmp );
+                    psLPC_Q14_tmp   = _mm_srli_epi64( psLPC_Q14_tmp, 16 );
+                    tmpb            = _mm_add_epi32( tmpb, psLPC_Q14_tmp );
+
+                    /* step 4 */
+                    psLPC_Q14_tmp   = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -15 ] ) );
+                    psLPC_Q14_tmp   = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B );
+                    tmpa            = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_CDEF );
+                    tmpa            = _mm_srli_epi64( tmpa, 16 );
+                    tmpb            = _mm_add_epi32( tmpb, tmpa );
+
+                    psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
+                    a_Q12_tmp = _mm_shuffle_epi32( a_Q12_CDEF, _MM_SHUFFLE(0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
+                    psLPC_Q14_tmp   = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_tmp );
+                    psLPC_Q14_tmp   = _mm_srli_epi64( psLPC_Q14_tmp, 16 );
+                    tmpb            = _mm_add_epi32( tmpb, psLPC_Q14_tmp );
+
+                    /* add at last */
+                    /* equal shift right 8 bytes*/
+                    tmpa            = _mm_shuffle_epi32( tmpb, _MM_SHUFFLE( 0, 0, 3, 2 ) );
+                    tmpb            = _mm_add_epi32( tmpb, tmpa );
+                    LPC_pred_Q14    += _mm_cvtsi128_si32( tmpb );
+                }
+                else
+                {
+                    /* add at last */
+                    tmpa            = _mm_shuffle_epi32( tmpb, _MM_SHUFFLE( 0, 0, 3, 2 ) ); /* equal shift right 8 bytes*/
+                    tmpb            = _mm_add_epi32( tmpb, tmpa );
+                    LPC_pred_Q14    += _mm_cvtsi128_si32( tmpb );
+
+                    LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -8 ], a_Q12[ 8 ] );
+                    LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -9 ], a_Q12[ 9 ] );
+                }
+
+                LPC_pred_Q14 = silk_LSHIFT( LPC_pred_Q14, 4 ); /* Q10 -> Q14 */
+
+                /* Noise shape feedback */
+                silk_assert( ( shapingLPCOrder & 1 ) == 0 );   /* check that order is even */
+                /* Output of lowpass section */
+                tmp2 = silk_SMLAWB( psLPC_Q14[ 0 ], psDD->sAR2_Q14[ 0 ], warping_Q16 );
+                /* Output of allpass section */
+                tmp1 = silk_SMLAWB( psDD->sAR2_Q14[ 0 ], psDD->sAR2_Q14[ 1 ] - tmp2, warping_Q16 );
+                psDD->sAR2_Q14[ 0 ] = tmp2;
+                n_AR_Q14 = silk_RSHIFT( shapingLPCOrder, 1 );
+                n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp2, AR_shp_Q13[ 0 ] );
+                /* Loop over allpass sections */
+                for( j = 2; j < shapingLPCOrder; j += 2 ) {
+                    /* Output of allpass section */
+                    tmp2 = silk_SMLAWB( psDD->sAR2_Q14[ j - 1 ], psDD->sAR2_Q14[ j + 0 ] - tmp1, warping_Q16 );
+                    psDD->sAR2_Q14[ j - 1 ] = tmp1;
+                    n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp1, AR_shp_Q13[ j - 1 ] );
+                    /* Output of allpass section */
+                    tmp1 = silk_SMLAWB( psDD->sAR2_Q14[ j + 0 ], psDD->sAR2_Q14[ j + 1 ] - tmp2, warping_Q16 );
+                    psDD->sAR2_Q14[ j + 0 ] = tmp2;
+                    n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp2, AR_shp_Q13[ j ] );
+                }
+                psDD->sAR2_Q14[ shapingLPCOrder - 1 ] = tmp1;
+                n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp1, AR_shp_Q13[ shapingLPCOrder - 1 ] );
+
+                n_AR_Q14 = silk_LSHIFT( n_AR_Q14, 1 );                                      /* Q11 -> Q12 */
+                n_AR_Q14 = silk_SMLAWB( n_AR_Q14, psDD->LF_AR_Q14, Tilt_Q14 );              /* Q12 */
+                n_AR_Q14 = silk_LSHIFT( n_AR_Q14, 2 );                                      /* Q12 -> Q14 */
+
+                n_LF_Q14 = silk_SMULWB( psDD->Shape_Q14[ *smpl_buf_idx ], LF_shp_Q14 );     /* Q12 */
+                n_LF_Q14 = silk_SMLAWT( n_LF_Q14, psDD->LF_AR_Q14, LF_shp_Q14 );            /* Q12 */
+                n_LF_Q14 = silk_LSHIFT( n_LF_Q14, 2 );                                      /* Q12 -> Q14 */
+
+                /* Input minus prediction plus noise feedback                       */
+                /* r = x[ i ] - LTP_pred - LPC_pred + n_AR + n_Tilt + n_LF + n_LTP  */
+                tmp1 = silk_ADD32( n_AR_Q14, n_LF_Q14 );                                    /* Q14 */
+                tmp2 = silk_ADD32( n_LTP_Q14, LPC_pred_Q14 );                               /* Q13 */
+                tmp1 = silk_SUB32( tmp2, tmp1 );                                            /* Q13 */
+                tmp1 = silk_RSHIFT_ROUND( tmp1, 4 );                                        /* Q10 */
+
+                r_Q10 = silk_SUB32( x_Q10[ i ], tmp1 );                                     /* residual error Q10 */
+
+                /* Flip sign depending on dither */
+                if ( psDD->Seed < 0 ) {
+                    r_Q10 = -r_Q10;
+                }
+                r_Q10 = silk_LIMIT_32( r_Q10, -(31 << 10), 30 << 10 );
+
+                /* Find two quantization level candidates and measure their rate-distortion */
+                q1_Q10 = silk_SUB32( r_Q10, offset_Q10 );
+                q1_Q0 = silk_RSHIFT( q1_Q10, 10 );
+                if( q1_Q0 > 0 ) {
+                    q1_Q10  = silk_SUB32( silk_LSHIFT( q1_Q0, 10 ), QUANT_LEVEL_ADJUST_Q10 );
+                    q1_Q10  = silk_ADD32( q1_Q10, offset_Q10 );
+                    q2_Q10  = silk_ADD32( q1_Q10, 1024 );
+                    rd1_Q10 = silk_SMULBB( q1_Q10, Lambda_Q10 );
+                    rd2_Q10 = silk_SMULBB( q2_Q10, Lambda_Q10 );
+                } else if( q1_Q0 == 0 ) {
+                    q1_Q10  = offset_Q10;
+                    q2_Q10  = silk_ADD32( q1_Q10, 1024 - QUANT_LEVEL_ADJUST_Q10 );
+                    rd1_Q10 = silk_SMULBB( q1_Q10, Lambda_Q10 );
+                    rd2_Q10 = silk_SMULBB( q2_Q10, Lambda_Q10 );
+                } else if( q1_Q0 == -1 ) {
+                    q2_Q10  = offset_Q10;
+                    q1_Q10  = silk_SUB32( q2_Q10, 1024 - QUANT_LEVEL_ADJUST_Q10 );
+                    rd1_Q10 = silk_SMULBB( -q1_Q10, Lambda_Q10 );
+                    rd2_Q10 = silk_SMULBB(  q2_Q10, Lambda_Q10 );
+                } else {            /* q1_Q0 < -1 */
+                    q1_Q10  = silk_ADD32( silk_LSHIFT( q1_Q0, 10 ), QUANT_LEVEL_ADJUST_Q10 );
+                    q1_Q10  = silk_ADD32( q1_Q10, offset_Q10 );
+                    q2_Q10  = silk_ADD32( q1_Q10, 1024 );
+                    rd1_Q10 = silk_SMULBB( -q1_Q10, Lambda_Q10 );
+                    rd2_Q10 = silk_SMULBB( -q2_Q10, Lambda_Q10 );
+                }
+                rr_Q10  = silk_SUB32( r_Q10, q1_Q10 );
+                rd1_Q10 = silk_RSHIFT( silk_SMLABB( rd1_Q10, rr_Q10, rr_Q10 ), 10 );
+                rr_Q10  = silk_SUB32( r_Q10, q2_Q10 );
+                rd2_Q10 = silk_RSHIFT( silk_SMLABB( rd2_Q10, rr_Q10, rr_Q10 ), 10 );
+
+                if( rd1_Q10 < rd2_Q10 ) {
+                    psSS[ 0 ].RD_Q10 = silk_ADD32( psDD->RD_Q10, rd1_Q10 );
+                    psSS[ 1 ].RD_Q10 = silk_ADD32( psDD->RD_Q10, rd2_Q10 );
+                    psSS[ 0 ].Q_Q10  = q1_Q10;
+                    psSS[ 1 ].Q_Q10  = q2_Q10;
+                } else {
+                    psSS[ 0 ].RD_Q10 = silk_ADD32( psDD->RD_Q10, rd2_Q10 );
+                    psSS[ 1 ].RD_Q10 = silk_ADD32( psDD->RD_Q10, rd1_Q10 );
+                    psSS[ 0 ].Q_Q10  = q2_Q10;
+                    psSS[ 1 ].Q_Q10  = q1_Q10;
+                }
+
+                /* Update states for best quantization */
+
+                /* Quantized excitation */
+                exc_Q14 = silk_LSHIFT32( psSS[ 0 ].Q_Q10, 4 );
+                if ( psDD->Seed < 0 ) {
+                    exc_Q14 = -exc_Q14;
+                }
+
+                /* Add predictions */
+                LPC_exc_Q14 = silk_ADD32( exc_Q14, LTP_pred_Q14 );
+                xq_Q14      = silk_ADD32( LPC_exc_Q14, LPC_pred_Q14 );
+
+                /* Update states */
+                sLF_AR_shp_Q14         = silk_SUB32( xq_Q14, n_AR_Q14 );
+                psSS[ 0 ].sLTP_shp_Q14 = silk_SUB32( sLF_AR_shp_Q14, n_LF_Q14 );
+                psSS[ 0 ].LF_AR_Q14    = sLF_AR_shp_Q14;
+                psSS[ 0 ].LPC_exc_Q14  = LPC_exc_Q14;
+                psSS[ 0 ].xq_Q14       = xq_Q14;
+
+                /* Update states for second best quantization */
+
+                /* Quantized excitation */
+                exc_Q14 = silk_LSHIFT32( psSS[ 1 ].Q_Q10, 4 );
+                if ( psDD->Seed < 0 ) {
+                    exc_Q14 = -exc_Q14;
+                }
+
+
+                /* Add predictions */
+                LPC_exc_Q14 = silk_ADD32( exc_Q14, LTP_pred_Q14 );
+                xq_Q14      = silk_ADD32( LPC_exc_Q14, LPC_pred_Q14 );
+
+                /* Update states */
+                sLF_AR_shp_Q14         = silk_SUB32( xq_Q14, n_AR_Q14 );
+                psSS[ 1 ].sLTP_shp_Q14 = silk_SUB32( sLF_AR_shp_Q14, n_LF_Q14 );
+                psSS[ 1 ].LF_AR_Q14    = sLF_AR_shp_Q14;
+                psSS[ 1 ].LPC_exc_Q14  = LPC_exc_Q14;
+                psSS[ 1 ].xq_Q14       = xq_Q14;
+            }
+        }
+        *smpl_buf_idx  = ( *smpl_buf_idx - 1 ) & DECISION_DELAY_MASK;                   /* Index to newest samples              */
+        last_smple_idx = ( *smpl_buf_idx + decisionDelay ) & DECISION_DELAY_MASK;       /* Index to decisionDelay old samples   */
+
+        /* Find winner */
+        RDmin_Q10 = psSampleState[ 0 ][ 0 ].RD_Q10;
+        Winner_ind = 0;
+        for( k = 1; k < nStatesDelayedDecision; k++ ) {
+            if( psSampleState[ k ][ 0 ].RD_Q10 < RDmin_Q10 ) {
+                RDmin_Q10  = psSampleState[ k ][ 0 ].RD_Q10;
+                Winner_ind = k;
+            }
+        }
+
+        /* Increase RD values of expired states */
+        Winner_rand_state = psDelDec[ Winner_ind ].RandState[ last_smple_idx ];
+        for( k = 0; k < nStatesDelayedDecision; k++ ) {
+            if( psDelDec[ k ].RandState[ last_smple_idx ] != Winner_rand_state ) {
+                psSampleState[ k ][ 0 ].RD_Q10 = silk_ADD32( psSampleState[ k ][ 0 ].RD_Q10, silk_int32_MAX >> 4 );
+                psSampleState[ k ][ 1 ].RD_Q10 = silk_ADD32( psSampleState[ k ][ 1 ].RD_Q10, silk_int32_MAX >> 4 );
+                silk_assert( psSampleState[ k ][ 0 ].RD_Q10 >= 0 );
+            }
+        }
+
+        /* Find worst in first set and best in second set */
+        RDmax_Q10  = psSampleState[ 0 ][ 0 ].RD_Q10;
+        RDmin_Q10  = psSampleState[ 0 ][ 1 ].RD_Q10;
+        RDmax_ind = 0;
+        RDmin_ind = 0;
+        for( k = 1; k < nStatesDelayedDecision; k++ ) {
+            /* find worst in first set */
+            if( psSampleState[ k ][ 0 ].RD_Q10 > RDmax_Q10 ) {
+                RDmax_Q10  = psSampleState[ k ][ 0 ].RD_Q10;
+                RDmax_ind = k;
+            }
+            /* find best in second set */
+            if( psSampleState[ k ][ 1 ].RD_Q10 < RDmin_Q10 ) {
+                RDmin_Q10  = psSampleState[ k ][ 1 ].RD_Q10;
+                RDmin_ind = k;
+            }
+        }
+
+        /* Replace a state if best from second set outperforms worst in first set */
+        if( RDmin_Q10 < RDmax_Q10 ) {
+            silk_memcpy( ( (opus_int32 *)&psDelDec[ RDmax_ind ] ) + i,
+                         ( (opus_int32 *)&psDelDec[ RDmin_ind ] ) + i, sizeof( NSQ_del_dec_struct ) - i * sizeof( opus_int32) );
+            silk_memcpy( &psSampleState[ RDmax_ind ][ 0 ], &psSampleState[ RDmin_ind ][ 1 ], sizeof( NSQ_sample_struct ) );
+        }
+
+        /* Write samples from winner to output and long-term filter states */
+        psDD = &psDelDec[ Winner_ind ];
+        if( subfr > 0 || i >= decisionDelay ) {
+            pulses[  i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDD->Q_Q10[ last_smple_idx ], 10 );
+            xq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND(
+                silk_SMULWW( psDD->Xq_Q14[ last_smple_idx ], delayedGain_Q10[ last_smple_idx ] ), 8 ) );
+            NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay ] = psDD->Shape_Q14[ last_smple_idx ];
+            sLTP_Q15[          NSQ->sLTP_buf_idx     - decisionDelay ] = psDD->Pred_Q15[  last_smple_idx ];
+        }
+        NSQ->sLTP_shp_buf_idx++;
+        NSQ->sLTP_buf_idx++;
+
+        /* Update states */
+        for( k = 0; k < nStatesDelayedDecision; k++ ) {
+            psDD                                     = &psDelDec[ k ];
+            psSS                                     = &psSampleState[ k ][ 0 ];
+            psDD->LF_AR_Q14                          = psSS->LF_AR_Q14;
+            psDD->sLPC_Q14[ NSQ_LPC_BUF_LENGTH + i ] = psSS->xq_Q14;
+            psDD->Xq_Q14[    *smpl_buf_idx ]         = psSS->xq_Q14;
+            psDD->Q_Q10[     *smpl_buf_idx ]         = psSS->Q_Q10;
+            psDD->Pred_Q15[  *smpl_buf_idx ]         = silk_LSHIFT32( psSS->LPC_exc_Q14, 1 );
+            psDD->Shape_Q14[ *smpl_buf_idx ]         = psSS->sLTP_shp_Q14;
+            psDD->Seed                               = silk_ADD32_ovflw( psDD->Seed, silk_RSHIFT_ROUND( psSS->Q_Q10, 10 ) );
+            psDD->RandState[ *smpl_buf_idx ]         = psDD->Seed;
+            psDD->RD_Q10                             = psSS->RD_Q10;
+        }
+        delayedGain_Q10[     *smpl_buf_idx ]         = Gain_Q10;
+    }
+    /* Update LPC states */
+    for( k = 0; k < nStatesDelayedDecision; k++ ) {
+        psDD = &psDelDec[ k ];
+        silk_memcpy( psDD->sLPC_Q14, &psDD->sLPC_Q14[ length ], NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) );
+    }
+    RESTORE_STACK;
+}
+
+static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(
+    const silk_encoder_state *psEncC,               /* I    Encoder State                       */
+    silk_nsq_state      *NSQ,                       /* I/O  NSQ state                           */
+    NSQ_del_dec_struct  psDelDec[],                 /* I/O  Delayed decision states             */
+    const opus_int32    x_Q3[],                     /* I    Input in Q3                         */
+    opus_int32          x_sc_Q10[],                 /* O    Input scaled with 1/Gain in Q10     */
+    const opus_int16    sLTP[],                     /* I    Re-whitened LTP state in Q0         */
+    opus_int32          sLTP_Q15[],                 /* I/O  LTP state matching scaled input     */
+    opus_int            subfr,                      /* I    Subframe number                     */
+    opus_int            nStatesDelayedDecision,     /* I    Number of del dec states            */
+    const opus_int      LTP_scale_Q14,              /* I    LTP state scaling                   */
+    const opus_int32    Gains_Q16[ MAX_NB_SUBFR ],  /* I    Gains per subframe, Q16             */
+    const opus_int      pitchL[ MAX_NB_SUBFR ],     /* I    Pitch lag                           */
+    const opus_int      signal_type,                /* I    Signal type                         */
+    const opus_int      decisionDelay               /* I    Decision delay                      */
+)
+{
+    opus_int            i, k, lag;
+    opus_int32          gain_adj_Q16, inv_gain_Q31, inv_gain_Q23;
+    NSQ_del_dec_struct  *psDD;
+    __m128i xmm_inv_gain_Q23, xmm_x_Q3_x2x0, xmm_x_Q3_x3x1;
+    /* Scale the subframe input by 1/gain and rescale the NSQ and per-state filter memories when the gain changes */
+    lag          = pitchL[ subfr ];
+    inv_gain_Q31 = silk_INVERSE32_varQ( silk_max( Gains_Q16[ subfr ], 1 ), 47 );
+
+    silk_assert( inv_gain_Q31 != 0 );
+
+    /* Calculate gain adjustment factor: previous gain / current gain in Q16, exactly 1.0 when the gain is unchanged */
+    if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) {
+        gain_adj_Q16 =  silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 );
+    } else {
+        gain_adj_Q16 = (opus_int32)1 << 16;
+    }
+
+    /* Scale input */
+    inv_gain_Q23 = silk_RSHIFT_ROUND( inv_gain_Q31, 8 );
+
+    /* prepare inv_gain_Q23 in packed 4 32-bits */
+    xmm_inv_gain_Q23 = _mm_set1_epi32(inv_gain_Q23);
+
+    for( i = 0; i < psEncC->subfr_length - 3; i += 4 ) {
+        xmm_x_Q3_x2x0 = _mm_loadu_si128( (__m128i *)(&(x_Q3[ i ] ) ) );
+        /* rotate right by one 32-bit lane so that elements 1 and 3 sit in the even lanes */
+        xmm_x_Q3_x3x1 = _mm_shuffle_epi32( xmm_x_Q3_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
+        /* signed 32x32 -> 64-bit products of the even lanes only */
+        xmm_x_Q3_x2x0 = _mm_mul_epi32( xmm_x_Q3_x2x0, xmm_inv_gain_Q23 );
+        xmm_x_Q3_x3x1 = _mm_mul_epi32( xmm_x_Q3_x3x1, xmm_inv_gain_Q23 );
+        /* bits 16..47 of each product: moved to the low dword for elements 0/2, to the high dword for elements 1/3 */
+        xmm_x_Q3_x2x0 = _mm_srli_epi64( xmm_x_Q3_x2x0, 16 );
+        xmm_x_Q3_x3x1 = _mm_slli_epi64( xmm_x_Q3_x3x1, 16 );
+        /* mask 0xCC takes dwords 1 and 3 from x3x1 and dwords 0 and 2 from x2x0 */
+        xmm_x_Q3_x2x0 = _mm_blend_epi16( xmm_x_Q3_x2x0, xmm_x_Q3_x3x1, 0xCC );
+
+        _mm_storeu_si128( (__m128i *)(&(x_sc_Q10[ i ])), xmm_x_Q3_x2x0 );
+    }
+    /* scalar tail for the samples that do not fill a whole vector */
+    for( ; i < psEncC->subfr_length; i++ ) {
+        x_sc_Q10[ i ] = silk_SMULWW( x_Q3[ i ], inv_gain_Q23 );
+    }
+
+    /* Save the current gain; it is compared against the next subframe's gain above */
+    NSQ->prev_gain_Q16 = Gains_Q16[ subfr ];
+
+    /* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q31 */
+    if( NSQ->rewhite_flag ) {
+        if( subfr == 0 ) {
+            /* Do LTP downscaling: fold LTP_scale_Q14 into the inverse gain */
+            inv_gain_Q31 = silk_LSHIFT( silk_SMULWB( inv_gain_Q31, LTP_scale_Q14 ), 2 );
+        }
+        for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx; i++ ) {
+            silk_assert( i < MAX_FRAME_LENGTH );
+            sLTP_Q15[ i ] = silk_SMULWB( inv_gain_Q31, sLTP[ i ] );
+        }
+    }
+
+    /* Adjust for changing gain */
+    if( gain_adj_Q16 != (opus_int32)1 << 16 ) {
+        /* Scale long-term shaping state */
+        {
+            __m128i xmm_gain_adj_Q16, xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1;
+            /* Uses the same even/odd-lane multiply, shift and blend sequence as the input scaling loop above */
+            /* prepare gain_adj_Q16 in packed 4 32-bits */
+            xmm_gain_adj_Q16 = _mm_set1_epi32( gain_adj_Q16 );
+
+            for( i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx - 3; i += 4 )
+            {
+                xmm_sLTP_shp_Q14_x2x0 = _mm_loadu_si128( (__m128i *)(&(NSQ->sLTP_shp_Q14[ i ] ) ) );
+                /* rotate right by one 32-bit lane so that elements 1 and 3 sit in the even lanes */
+                xmm_sLTP_shp_Q14_x3x1 = _mm_shuffle_epi32( xmm_sLTP_shp_Q14_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
+
+                xmm_sLTP_shp_Q14_x2x0 = _mm_mul_epi32( xmm_sLTP_shp_Q14_x2x0, xmm_gain_adj_Q16 );
+                xmm_sLTP_shp_Q14_x3x1 = _mm_mul_epi32( xmm_sLTP_shp_Q14_x3x1, xmm_gain_adj_Q16 );
+
+                xmm_sLTP_shp_Q14_x2x0 = _mm_srli_epi64( xmm_sLTP_shp_Q14_x2x0, 16 );
+                xmm_sLTP_shp_Q14_x3x1 = _mm_slli_epi64( xmm_sLTP_shp_Q14_x3x1, 16 );
+
+                xmm_sLTP_shp_Q14_x2x0 = _mm_blend_epi16( xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1, 0xCC );
+
+                _mm_storeu_si128( (__m128i *)(&(NSQ->sLTP_shp_Q14[ i ] ) ), xmm_sLTP_shp_Q14_x2x0 );
+            }
+            /* scalar tail for the samples that do not fill a whole vector */
+            for( ; i < NSQ->sLTP_shp_buf_idx; i++ ) {
+                NSQ->sLTP_shp_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sLTP_shp_Q14[ i ] );
+            }
+
+            /* Scale long-term prediction state, excluding the newest decisionDelay samples */
+            if( signal_type == TYPE_VOICED && NSQ->rewhite_flag == 0 ) {
+                for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx - decisionDelay; i++ ) {
+                    sLTP_Q15[ i ] = silk_SMULWW( gain_adj_Q16, sLTP_Q15[ i ] );
+                }
+            }
+
+            for( k = 0; k < nStatesDelayedDecision; k++ ) {
+                psDD = &psDelDec[ k ];
+
+                /* Scale scalar states */
+                psDD->LF_AR_Q14 = silk_SMULWW( gain_adj_Q16, psDD->LF_AR_Q14 );
+
+                /* Scale short-term prediction and shaping states */
+                for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) {
+                    psDD->sLPC_Q14[ i ] = silk_SMULWW( gain_adj_Q16, psDD->sLPC_Q14[ i ] );
+                }
+                for( i = 0; i < MAX_SHAPE_LPC_ORDER; i++ ) {
+                    psDD->sAR2_Q14[ i ] = silk_SMULWW( gain_adj_Q16, psDD->sAR2_Q14[ i ] );
+                }
+                for( i = 0; i < DECISION_DELAY; i++ ) {
+                    psDD->Pred_Q15[  i ] = silk_SMULWW( gain_adj_Q16, psDD->Pred_Q15[  i ] );
+                    psDD->Shape_Q14[ i ] = silk_SMULWW( gain_adj_Q16, psDD->Shape_Q14[ i ] );
+                }
+            }
+        }
+    }
+}
diff --git a/silk/x86/NSQ_sse.c b/silk/x86/NSQ_sse.c
new file mode 100644
index 0000000..72f34fd
--- /dev/null
+++ b/silk/x86/NSQ_sse.c
@@ -0,0 +1,720 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+   Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#include <smmintrin.h>
+#include "main.h"
+#include "celt/x86/x86cpu.h"
+#include "stack_alloc.h"
+
+static OPUS_INLINE void silk_nsq_scale_states_sse4_1(
+    const silk_encoder_state *psEncC,           /* I    Encoder State                   */
+    silk_nsq_state      *NSQ,                   /* I/O  NSQ state                       */
+    const opus_int32    x_Q3[],                 /* I    input in Q3                     */
+    opus_int32          x_sc_Q10[],             /* O    input scaled with 1/Gain        */
+    const opus_int16    sLTP[],                 /* I    re-whitened LTP state in Q0     */
+    opus_int32          sLTP_Q15[],             /* O    LTP state matching scaled input */
+    opus_int            subfr,                  /* I    subframe number                 */
+    const opus_int      LTP_scale_Q14,          /* I    LTP state scaling               */
+    const opus_int32    Gains_Q16[ MAX_NB_SUBFR ], /* I    Quantization step sizes      */
+    const opus_int      pitchL[ MAX_NB_SUBFR ], /* I    Pitch lag                       */
+    const opus_int      signal_type             /* I    Signal type                     */
+);
+
+static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
+    silk_nsq_state      *NSQ,                   /* I/O  NSQ state                       */
+    opus_int            signalType,             /* I    Signal type                     */
+    const opus_int32    x_sc_Q10[],             /* I    Input scaled with 1/Gain        */
+    opus_int8           pulses[],               /* O    Quantized pulse signal          */
+    opus_int16          xq[],                   /* O    Quantized output (caller passes a pointer into NSQ->xq) */
+    opus_int32          sLTP_Q15[],             /* I/O  LTP state                       */
+    const opus_int16    a_Q12[],                /* I    Short term prediction coefs     */
+    const opus_int16    b_Q14[],                /* I    Long term prediction coefs      */
+    const opus_int16    AR_shp_Q13[],           /* I    Noise shaping AR coefs          */
+    opus_int            lag,                    /* I    Pitch lag                       */
+    opus_int32          HarmShapeFIRPacked_Q14, /* I    Harmonic shaping gain: >>2 in low 16 bits, >>1 in high 16 bits */
+    opus_int            Tilt_Q14,               /* I    Spectral tilt                   */
+    opus_int32          LF_shp_Q14,             /* I    Low frequency shaping coef      */
+    opus_int32          Gain_Q16,               /* I    Quantization step size          */
+    opus_int            offset_Q10,             /* I    Quantization offset             */
+    opus_int            length,                 /* I    Input length                    */
+    opus_int32          table[][4]              /* I    Candidate table; caller passes row 32, so rows -32..31 are valid */
+);
+
+void silk_NSQ_sse4_1(
+    const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */
+    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */
+    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */
+    const opus_int32            x_Q3[],                                     /* I    Prefiltered input signal        */
+    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */
+    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */
+    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */
+    const opus_int16            AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs             */
+    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */
+    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */
+    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */
+    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */
+    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */
+    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */
+    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */
+)
+{
+    opus_int            k, lag, start_idx, LSF_interpolation_flag;
+    const opus_int16    *A_Q12, *B_Q14, *AR_shp_Q13;
+    opus_int16          *pxq;
+    VARDECL( opus_int32, sLTP_Q15 );
+    VARDECL( opus_int16, sLTP );
+    opus_int32          HarmShapeFIRPacked_Q14;
+    opus_int            offset_Q10;
+    VARDECL( opus_int32, x_sc_Q10 );
+    /* Candidate table, row = quantization level + 32: { q1, q2, 2*(q1 - q2), (rd1_Q20 - rd2_Q20) + (q1^2 - q2^2) } */
+    opus_int32   table[ 64 ][ 4 ];
+    opus_int32   tmp1;
+    opus_int32   q1_Q10, q2_Q10, rd1_Q20, rd2_Q20;
+
+    SAVE_STACK;
+    /* Entry point: builds the candidate table once per call, then scales states and quantizes each subframe */
+    NSQ->rand_seed = psIndices->Seed;
+
+    /* Set unvoiced lag to the previous one, overwrite later for voiced */
+    lag = NSQ->lagPrev;
+
+    silk_assert( NSQ->prev_gain_Q16 != 0 );
+
+    offset_Q10 = silk_Quantization_Offsets_Q10[ psIndices->signalType >> 1 ][ psIndices->quantOffsetType ];
+
+    /* Level 0 (row 32) */
+    q1_Q10  = offset_Q10;
+    q2_Q10  = offset_Q10 + ( 1024 - QUANT_LEVEL_ADJUST_Q10 );
+    rd1_Q20 = q1_Q10 * Lambda_Q10;
+    rd2_Q20 = q2_Q10 * Lambda_Q10;
+
+    table[ 32 ][ 0 ] = q1_Q10;
+    table[ 32 ][ 1 ] = q2_Q10;
+    table[ 32 ][ 2 ] = 2 * (q1_Q10 - q2_Q10);
+    table[ 32 ][ 3 ] = (rd1_Q20 - rd2_Q20) + (q1_Q10 * q1_Q10 - q2_Q10 * q2_Q10);
+
+    /* Level -1 (row 31) */
+    q1_Q10  = offset_Q10 - ( 1024 - QUANT_LEVEL_ADJUST_Q10 );
+    q2_Q10  = offset_Q10;
+    rd1_Q20 = - q1_Q10 * Lambda_Q10;
+    rd2_Q20 = q2_Q10 * Lambda_Q10;
+
+    table[ 31 ][ 0 ] = q1_Q10;
+    table[ 31 ][ 1 ] = q2_Q10;
+    table[ 31 ][ 2 ] = 2 * (q1_Q10 - q2_Q10);
+    table[ 31 ][ 3 ] = (rd1_Q20 - rd2_Q20) + (q1_Q10 * q1_Q10 - q2_Q10 * q2_Q10);
+
+    /* Levels 1..31 (rows 33..63) */
+    for (k = 1; k <= 31; k++)
+    {
+        tmp1 = offset_Q10 + silk_LSHIFT( k, 10 );
+
+        q1_Q10  = tmp1 - QUANT_LEVEL_ADJUST_Q10;
+        q2_Q10  = tmp1 - QUANT_LEVEL_ADJUST_Q10 + 1024;
+        rd1_Q20 = q1_Q10 * Lambda_Q10;
+        rd2_Q20 = q2_Q10 * Lambda_Q10;
+
+        table[ 32 + k ][ 0 ] = q1_Q10;
+        table[ 32 + k ][ 1 ] = q2_Q10;
+        table[ 32 + k ][ 2 ] = 2 * (q1_Q10 - q2_Q10);
+        table[ 32 + k ][ 3 ] = (rd1_Q20 - rd2_Q20) + (q1_Q10 * q1_Q10 - q2_Q10 * q2_Q10);
+    }
+
+    /* Levels -32..-2 (rows 0..30) */
+    for (k = -32; k <= -2; k++)
+    {
+        tmp1 = offset_Q10 + silk_LSHIFT( k, 10 );
+
+        q1_Q10  = tmp1 + QUANT_LEVEL_ADJUST_Q10;
+        q2_Q10  = tmp1 + QUANT_LEVEL_ADJUST_Q10 + 1024;
+        rd1_Q20 = - q1_Q10 * Lambda_Q10;
+        rd2_Q20 = - q2_Q10 * Lambda_Q10;
+
+        table[ 32 + k ][ 0 ] = q1_Q10;
+        table[ 32 + k ][ 1 ] = q2_Q10;
+        table[ 32 + k ][ 2 ] = 2 * (q1_Q10 - q2_Q10);
+        table[ 32 + k ][ 3 ] = (rd1_Q20 - rd2_Q20) + (q1_Q10 * q1_Q10 - q2_Q10 * q2_Q10);
+    }
+
+    if( psIndices->NLSFInterpCoef_Q2 == 4 ) {
+        LSF_interpolation_flag = 0;
+    } else {
+        LSF_interpolation_flag = 1;
+    }
+
+    ALLOC( sLTP_Q15,
+           psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 );
+    ALLOC( sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16 );
+    ALLOC( x_sc_Q10, psEncC->subfr_length, opus_int32 );
+    /* Set up pointers to start of sub frame */
+    NSQ->sLTP_shp_buf_idx = psEncC->ltp_mem_length;
+    NSQ->sLTP_buf_idx     = psEncC->ltp_mem_length;
+    pxq                   = &NSQ->xq[ psEncC->ltp_mem_length ];
+    for( k = 0; k < psEncC->nb_subfr; k++ ) {
+        A_Q12      = &PredCoef_Q12[ (( k >> 1 ) | ( 1 - LSF_interpolation_flag )) * MAX_LPC_ORDER ]; /* set ( k >> 1 ) if the flag is 1, else set 1 (for k < 4) */
+        B_Q14      = &LTPCoef_Q14[ k * LTP_ORDER ];
+        AR_shp_Q13 = &AR2_Q13[     k * MAX_SHAPE_LPC_ORDER ];
+
+        /* Noise shape parameters: pack gain >> 2 into the low 16 bits and gain >> 1 into the high 16 bits */
+        silk_assert( HarmShapeGain_Q14[ k ] >= 0 );
+        HarmShapeFIRPacked_Q14  =                          silk_RSHIFT( HarmShapeGain_Q14[ k ], 2 );
+        HarmShapeFIRPacked_Q14 |= silk_LSHIFT( (opus_int32)silk_RSHIFT( HarmShapeGain_Q14[ k ], 1 ), 16 );
+
+        NSQ->rewhite_flag = 0;
+        if( psIndices->signalType == TYPE_VOICED ) {
+            /* Voiced */
+            lag = pitchL[ k ];
+
+            /* Re-whitening: on even k when LSF_interpolation_flag is 1, otherwise only when k is a multiple of 4 */
+            if( ( k & ( 3 - silk_LSHIFT( LSF_interpolation_flag, 1 ) ) ) == 0 ) {
+                /* Rewhiten with new A coefs */
+                start_idx = psEncC->ltp_mem_length - lag - psEncC->predictLPCOrder - LTP_ORDER / 2;
+                silk_assert( start_idx > 0 );
+
+                silk_LPC_analysis_filter( &sLTP[ start_idx ], &NSQ->xq[ start_idx + k * psEncC->subfr_length ],
+                    A_Q12, psEncC->ltp_mem_length - start_idx, psEncC->predictLPCOrder, psEncC->arch );
+
+                NSQ->rewhite_flag = 1;
+                NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;
+            }
+        }
+
+        silk_nsq_scale_states_sse4_1( psEncC, NSQ, x_Q3, x_sc_Q10, sLTP, sLTP_Q15, k, LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType );
+        /* The SSE4.1 quantizer covers only shaping order 10 with prediction order 16 and takes the table from row 32; other orders use silk_noise_shape_quantizer with Lambda_Q10 */
+        if ( opus_likely( ( 10 == psEncC->shapingLPCOrder ) && ( 16 == psEncC->predictLPCOrder) ) )
+        {
+            silk_noise_shape_quantizer_10_16_sse4_1( NSQ, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15, A_Q12, B_Q14,
+                AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ], Gains_Q16[ k ],
+                offset_Q10, psEncC->subfr_length, &(table[32]) );
+        }
+        else
+        {
+            silk_noise_shape_quantizer( NSQ, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15, A_Q12, B_Q14,
+                AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ], Gains_Q16[ k ], Lambda_Q10,
+                offset_Q10, psEncC->subfr_length, psEncC->shapingLPCOrder, psEncC->predictLPCOrder );
+        }
+
+        x_Q3   += psEncC->subfr_length;
+        pulses += psEncC->subfr_length;
+        pxq    += psEncC->subfr_length;
+    }
+
+    /* Update lagPrev for next frame */
+    NSQ->lagPrev = pitchL[ psEncC->nb_subfr - 1 ];
+
+    /* Save quantized speech and noise shaping signals */
+    /* DEBUG_STORE_DATA( enc.pcm, &NSQ->xq[ psEncC->ltp_mem_length ], psEncC->frame_length * sizeof( opus_int16 ) ) */
+    silk_memmove( NSQ->xq,           &NSQ->xq[           psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int16 ) );
+    silk_memmove( NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int32 ) );
+    RESTORE_STACK;
+}
+
+/*******************************************/
+/* silk_noise_shape_quantizer_10_16_sse4_1 */
+/*******************************************/
+static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
+    silk_nsq_state      *NSQ,                   /* I/O  NSQ state                       */
+    opus_int            signalType,             /* I    Signal type                     */
+    const opus_int32    x_sc_Q10[],             /* I                                    */
+    opus_int8           pulses[],               /* O                                    */
+    opus_int16          xq[],                   /* O                                    */
+    opus_int32          sLTP_Q15[],             /* I/O  LTP state                       */
+    const opus_int16    a_Q12[],                /* I    Short term prediction coefs     */
+    const opus_int16    b_Q14[],                /* I    Long term prediction coefs      */
+    const opus_int16    AR_shp_Q13[],           /* I    Noise shaping AR coefs          */
+    opus_int            lag,                    /* I    Pitch lag                       */
+    opus_int32          HarmShapeFIRPacked_Q14, /* I                                    */
+    opus_int            Tilt_Q14,               /* I    Spectral tilt                   */
+    opus_int32          LF_shp_Q14,             /* I                                    */
+    opus_int32          Gain_Q16,               /* I                                    */
+    opus_int            offset_Q10,             /* I                                    */
+    opus_int            length,                 /* I    Input length                    */
+    opus_int32          table[][4]              /* I                                    */
+)
+{
+    opus_int     i;
+    opus_int32   LTP_pred_Q13, LPC_pred_Q10, n_AR_Q12, n_LTP_Q13;
+    opus_int32   n_LF_Q12, r_Q10, q1_Q0, q1_Q10, q2_Q10;
+    opus_int32   exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10;
+    opus_int32   tmp1, tmp2, sLF_AR_shp_Q14;
+    opus_int32   *psLPC_Q14, *shp_lag_ptr, *pred_lag_ptr;
+
+    __m128i xmm_tempa, xmm_tempb;
+
+    __m128i xmm_one;
+
+    __m128i psLPC_Q14_hi_01234567, psLPC_Q14_hi_89ABCDEF;
+    __m128i psLPC_Q14_lo_01234567, psLPC_Q14_lo_89ABCDEF;
+    __m128i a_Q12_01234567,        a_Q12_89ABCDEF;
+
+    __m128i sAR2_Q14_hi_76543210, sAR2_Q14_lo_76543210;
+    __m128i AR_shp_Q13_76543210;
+
+    shp_lag_ptr  = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ];
+    pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ];
+    Gain_Q10     = silk_RSHIFT( Gain_Q16, 6 );
+
+    /* Set up short term AR state */
+    psLPC_Q14 = &NSQ->sLPC_Q14[ NSQ_LPC_BUF_LENGTH - 1 ];
+
+    sLF_AR_shp_Q14 = NSQ->sLF_AR_shp_Q14;
+    xq_Q14         = psLPC_Q14[ 0 ];
+    LTP_pred_Q13   = 0;
+
+    /* load a_Q12 */
+    xmm_one = _mm_set_epi8( 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 );
+
+    /* load a_Q12[0] - a_Q12[7] */
+    a_Q12_01234567 = _mm_loadu_si128( (__m128i *)(&a_Q12[ 0 ] ) );
+    /* load a_Q12[ 8 ] - a_Q12[ 15 ] */
+    a_Q12_89ABCDEF = _mm_loadu_si128( (__m128i *)(&a_Q12[ 8 ] ) );
+
+    a_Q12_01234567 = _mm_shuffle_epi8( a_Q12_01234567, xmm_one );
+    a_Q12_89ABCDEF = _mm_shuffle_epi8( a_Q12_89ABCDEF, xmm_one );
+
+    /* load AR_shp_Q13 */
+    AR_shp_Q13_76543210 = _mm_loadu_si128( (__m128i *)(&AR_shp_Q13[0] ) );
+
+    /* load psLPC_Q14 */
+    xmm_one = _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 );
+
+    xmm_tempa = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[-16]) );
+    xmm_tempb = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[-12]) );
+
+    xmm_tempa = _mm_shuffle_epi8( xmm_tempa, xmm_one );
+    xmm_tempb = _mm_shuffle_epi8( xmm_tempb, xmm_one );
+
+    psLPC_Q14_hi_89ABCDEF = _mm_unpackhi_epi64( xmm_tempa, xmm_tempb );
+    psLPC_Q14_lo_89ABCDEF = _mm_unpacklo_epi64( xmm_tempa, xmm_tempb );
+
+    xmm_tempa = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -8 ]) );
+    xmm_tempb = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -4 ]) );
+
+    xmm_tempa = _mm_shuffle_epi8( xmm_tempa, xmm_one );
+    xmm_tempb = _mm_shuffle_epi8( xmm_tempb, xmm_one );
+
+    psLPC_Q14_hi_01234567 = _mm_unpackhi_epi64( xmm_tempa, xmm_tempb );
+    psLPC_Q14_lo_01234567 = _mm_unpacklo_epi64( xmm_tempa, xmm_tempb );
+
+    /* load sAR2_Q14 */
+    xmm_tempa = _mm_loadu_si128( (__m128i *)(&(NSQ->sAR2_Q14[ 0 ]) ) );
+    xmm_tempb = _mm_loadu_si128( (__m128i *)(&(NSQ->sAR2_Q14[ 4 ]) ) );
+
+    xmm_tempa = _mm_shuffle_epi8( xmm_tempa, xmm_one );
+    xmm_tempb = _mm_shuffle_epi8( xmm_tempb, xmm_one );
+
+    sAR2_Q14_hi_76543210 = _mm_unpackhi_epi64( xmm_tempa, xmm_tempb );
+    sAR2_Q14_lo_76543210 = _mm_unpacklo_epi64( xmm_tempa, xmm_tempb );
+
+    /* prepare 1 in 8 * 16bit */
+    xmm_one = _mm_set1_epi16(1);
+
+    for( i = 0; i < length; i++ )
+    {
+        /* Short-term prediction */
+        __m128i xmm_hi_07, xmm_hi_8F, xmm_lo_07, xmm_lo_8F;
+
+        /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
+        LPC_pred_Q10 = 8; /* silk_RSHIFT( predictLPCOrder, 1 ); */
+
+        /* shift psLPC_Q14 */
+        psLPC_Q14_hi_89ABCDEF = _mm_alignr_epi8( psLPC_Q14_hi_01234567, psLPC_Q14_hi_89ABCDEF, 2 );
+        psLPC_Q14_lo_89ABCDEF = _mm_alignr_epi8( psLPC_Q14_lo_01234567, psLPC_Q14_lo_89ABCDEF, 2 );
+
+        psLPC_Q14_hi_01234567 = _mm_srli_si128( psLPC_Q14_hi_01234567, 2 );
+        psLPC_Q14_lo_01234567 = _mm_srli_si128( psLPC_Q14_lo_01234567, 2 );
+
+        psLPC_Q14_hi_01234567 = _mm_insert_epi16( psLPC_Q14_hi_01234567, (xq_Q14 >> 16), 7 );
+        psLPC_Q14_lo_01234567 = _mm_insert_epi16( psLPC_Q14_lo_01234567, (xq_Q14),       7 );
+
+        /* high part, use pmaddwd, results in 4 32-bit */
+        xmm_hi_07 = _mm_madd_epi16( psLPC_Q14_hi_01234567, a_Q12_01234567 );
+        xmm_hi_8F = _mm_madd_epi16( psLPC_Q14_hi_89ABCDEF, a_Q12_89ABCDEF );
+
+        /* low part, use pmulhw, results in 8 16-bit, note we need simulate unsigned * signed, _mm_srai_epi16(psLPC_Q14_lo_01234567, 15) */
+        xmm_tempa = _mm_cmpgt_epi16( _mm_setzero_si128(), psLPC_Q14_lo_01234567 );
+        xmm_tempb = _mm_cmpgt_epi16( _mm_setzero_si128(), psLPC_Q14_lo_89ABCDEF );
+
+        xmm_tempa = _mm_and_si128( xmm_tempa, a_Q12_01234567 );
+        xmm_tempb = _mm_and_si128( xmm_tempb, a_Q12_89ABCDEF );
+
+        xmm_lo_07 = _mm_mulhi_epi16( psLPC_Q14_lo_01234567, a_Q12_01234567 );
+        xmm_lo_8F = _mm_mulhi_epi16( psLPC_Q14_lo_89ABCDEF, a_Q12_89ABCDEF );
+
+        xmm_lo_07 = _mm_add_epi16( xmm_lo_07, xmm_tempa );
+        xmm_lo_8F = _mm_add_epi16( xmm_lo_8F, xmm_tempb );
+
+        xmm_lo_07 = _mm_madd_epi16( xmm_lo_07, xmm_one );
+        xmm_lo_8F = _mm_madd_epi16( xmm_lo_8F, xmm_one );
+
+        /* accumulate */
+        xmm_hi_07 = _mm_add_epi32( xmm_hi_07, xmm_hi_8F );
+        xmm_lo_07 = _mm_add_epi32( xmm_lo_07, xmm_lo_8F );
+
+        xmm_hi_07 = _mm_add_epi32( xmm_hi_07, xmm_lo_07 );
+
+        xmm_hi_07 = _mm_add_epi32( xmm_hi_07, _mm_unpackhi_epi64(xmm_hi_07, xmm_hi_07 ) );
+        xmm_hi_07 = _mm_add_epi32( xmm_hi_07, _mm_shufflelo_epi16(xmm_hi_07, 0x0E ) );
+
+        LPC_pred_Q10 += _mm_cvtsi128_si32( xmm_hi_07 );
+
+        /* Long-term prediction */
+        if ( opus_likely( signalType == TYPE_VOICED ) ) {
+            /* Unrolled loop */
+            /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
+            LTP_pred_Q13 = 2;
+            {
+                __m128i b_Q14_3210, b_Q14_0123, pred_lag_ptr_0123;
+
+                b_Q14_3210 = OP_CVTEPI16_EPI32_M64( b_Q14 );
+                b_Q14_0123 = _mm_shuffle_epi32( b_Q14_3210, 0x1B );
+
+                /* loaded: [0] [-1] [-2] [-3] */
+                pred_lag_ptr_0123 = _mm_loadu_si128( (__m128i *)(&pred_lag_ptr[ -3 ] ) );
+                /* shuffle to [-3] [-2] [-1] [0] and to new xmm */
+                xmm_tempa = _mm_shuffle_epi32( pred_lag_ptr_0123, 0x1B );
+                /*64-bit multiply, a[2] * b[-2], a[0] * b[0] */
+                xmm_tempa = _mm_mul_epi32( xmm_tempa, b_Q14_3210 );
+                /* right shift 2 bytes (16 bits), zero extended */
+                xmm_tempa = _mm_srli_si128( xmm_tempa, 2 );
+
+                /* a[1] * b[-1], a[3] * b[-3] */
+                pred_lag_ptr_0123 = _mm_mul_epi32( pred_lag_ptr_0123, b_Q14_0123 );
+                pred_lag_ptr_0123 = _mm_srli_si128( pred_lag_ptr_0123, 2 );
+
+                pred_lag_ptr_0123 = _mm_add_epi32( pred_lag_ptr_0123, xmm_tempa );
+                /* equal shift right 8 bytes*/
+                xmm_tempa = _mm_shuffle_epi32( pred_lag_ptr_0123, _MM_SHUFFLE( 0, 0, 3, 2 ) );
+                xmm_tempa = _mm_add_epi32( xmm_tempa, pred_lag_ptr_0123 );
+
+                LTP_pred_Q13 += _mm_cvtsi128_si32( xmm_tempa );
+
+                LTP_pred_Q13 = silk_SMLAWB( LTP_pred_Q13, pred_lag_ptr[ -4 ], b_Q14[ 4 ] );
+                pred_lag_ptr++;
+            }
+        }
+
+        /* Noise shape feedback */
+        NSQ->sAR2_Q14[ 9 ] = NSQ->sAR2_Q14[ 8 ];
+        NSQ->sAR2_Q14[ 8 ] = _mm_cvtsi128_si32( _mm_srli_si128(_mm_unpackhi_epi16( sAR2_Q14_lo_76543210, sAR2_Q14_hi_76543210 ), 12 ) );
+
+        sAR2_Q14_hi_76543210 = _mm_slli_si128( sAR2_Q14_hi_76543210, 2 );
+        sAR2_Q14_lo_76543210 = _mm_slli_si128( sAR2_Q14_lo_76543210, 2 );
+
+        sAR2_Q14_hi_76543210 = _mm_insert_epi16( sAR2_Q14_hi_76543210, (xq_Q14 >> 16), 0 );
+        sAR2_Q14_lo_76543210 = _mm_insert_epi16( sAR2_Q14_lo_76543210, (xq_Q14),       0 );
+
+        /* high part, use pmaddwd, results in 4 32-bit */
+        xmm_hi_07 = _mm_madd_epi16( sAR2_Q14_hi_76543210, AR_shp_Q13_76543210 );
+
+        /* low part, use pmulhw, results in 8 16-bit, note we need simulate unsigned * signed,_mm_srai_epi16(sAR2_Q14_lo_76543210, 15) */
+        xmm_tempa = _mm_cmpgt_epi16( _mm_setzero_si128(), sAR2_Q14_lo_76543210 );
+        xmm_tempa = _mm_and_si128( xmm_tempa, AR_shp_Q13_76543210 );
+
+        xmm_lo_07 = _mm_mulhi_epi16( sAR2_Q14_lo_76543210, AR_shp_Q13_76543210 );
+        xmm_lo_07 = _mm_add_epi16( xmm_lo_07, xmm_tempa );
+
+        xmm_lo_07 = _mm_madd_epi16( xmm_lo_07, xmm_one );
+
+        /* accumulate */
+        xmm_hi_07 = _mm_add_epi32( xmm_hi_07, xmm_lo_07 );
+
+        xmm_hi_07 = _mm_add_epi32( xmm_hi_07, _mm_unpackhi_epi64(xmm_hi_07, xmm_hi_07 ) );
+        xmm_hi_07 = _mm_add_epi32( xmm_hi_07, _mm_shufflelo_epi16(xmm_hi_07, 0x0E ) );
+
+        n_AR_Q12 = 5 + _mm_cvtsi128_si32( xmm_hi_07 );
+
+        n_AR_Q12 = silk_SMLAWB( n_AR_Q12, NSQ->sAR2_Q14[ 8 ], AR_shp_Q13[ 8 ] );
+        n_AR_Q12 = silk_SMLAWB( n_AR_Q12, NSQ->sAR2_Q14[ 9 ], AR_shp_Q13[ 9 ] );
+
+        n_AR_Q12 = silk_LSHIFT32( n_AR_Q12, 1 );                                /* Q11 -> Q12 */
+        n_AR_Q12 = silk_SMLAWB( n_AR_Q12, sLF_AR_shp_Q14, Tilt_Q14 );
+
+        n_LF_Q12 = silk_SMULWB( NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - 1 ], LF_shp_Q14 );
+        n_LF_Q12 = silk_SMLAWT( n_LF_Q12, sLF_AR_shp_Q14, LF_shp_Q14 );
+
+        silk_assert( lag > 0 || signalType != TYPE_VOICED );
+
+        /* Combine prediction and noise shaping signals */
+        tmp1 = silk_SUB32( silk_LSHIFT32( LPC_pred_Q10, 2 ), n_AR_Q12 );        /* Q12 */
+        tmp1 = silk_SUB32( tmp1, n_LF_Q12 );                                    /* Q12 */
+        if( lag > 0 ) {
+            /* Symmetric, packed FIR coefficients */
+            n_LTP_Q13 = silk_SMULWB( silk_ADD32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
+            n_LTP_Q13 = silk_SMLAWT( n_LTP_Q13, shp_lag_ptr[ -1 ],                      HarmShapeFIRPacked_Q14 );
+            n_LTP_Q13 = silk_LSHIFT( n_LTP_Q13, 1 );
+            shp_lag_ptr++;
+
+            tmp2 = silk_SUB32( LTP_pred_Q13, n_LTP_Q13 );                       /* Q13 */
+            tmp1 = silk_ADD_LSHIFT32( tmp2, tmp1, 1 );                          /* Q13 */
+            tmp1 = silk_RSHIFT_ROUND( tmp1, 3 );                                /* Q10 */
+        } else {
+            tmp1 = silk_RSHIFT_ROUND( tmp1, 2 );                                /* Q10 */
+        }
+
+        r_Q10 = silk_SUB32( x_sc_Q10[ i ], tmp1 );                              /* residual error Q10 */
+
+        /* Generate dither */
+        NSQ->rand_seed = silk_RAND( NSQ->rand_seed );
+
+        /* Flip sign depending on dither */
+        tmp2 = -r_Q10;
+        if ( NSQ->rand_seed < 0 ) r_Q10 = tmp2;
+
+        r_Q10 = silk_LIMIT_32( r_Q10, -(31 << 10), 30 << 10 );
+
+        /* Find two quantization level candidates and measure their rate-distortion */
+        q1_Q10 = silk_SUB32( r_Q10, offset_Q10 );
+        q1_Q0 = silk_RSHIFT( q1_Q10, 10 );
+
+        q1_Q10 = table[q1_Q0][0];
+        q2_Q10 = table[q1_Q0][1];
+
+        if (r_Q10 * table[q1_Q0][2] - table[q1_Q0][3] < 0)
+        {
+            q1_Q10 = q2_Q10;
+        }
+
+        pulses[ i ] = (opus_int8)silk_RSHIFT_ROUND( q1_Q10, 10 );
+
+        /* Excitation */
+        exc_Q14 = silk_LSHIFT( q1_Q10, 4 );
+
+        tmp2 = -exc_Q14;
+        if ( NSQ->rand_seed < 0 ) exc_Q14 = tmp2;
+
+        /* Add predictions */
+        LPC_exc_Q14 = silk_ADD_LSHIFT32( exc_Q14, LTP_pred_Q13, 1 );
+        xq_Q14      = silk_ADD_LSHIFT32( LPC_exc_Q14, LPC_pred_Q10, 4 );
+
+        /* Update states */
+        psLPC_Q14++;
+        *psLPC_Q14 = xq_Q14;
+        sLF_AR_shp_Q14 = silk_SUB_LSHIFT32( xq_Q14, n_AR_Q12, 2 );
+
+        NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx ] = silk_SUB_LSHIFT32( sLF_AR_shp_Q14, n_LF_Q12, 2 );
+        sLTP_Q15[ NSQ->sLTP_buf_idx ] = silk_LSHIFT( LPC_exc_Q14, 1 );
+        NSQ->sLTP_shp_buf_idx++;
+        NSQ->sLTP_buf_idx++;
+
+        /* Make dither dependent on quantized signal */
+        NSQ->rand_seed = silk_ADD32_ovflw( NSQ->rand_seed, pulses[ i ] );
+    }
+
+    NSQ->sLF_AR_shp_Q14 = sLF_AR_shp_Q14;
+
+    /* Scale XQ back to normal level before saving */
+    psLPC_Q14 = &NSQ->sLPC_Q14[ NSQ_LPC_BUF_LENGTH ];
+
+    /* write back sAR2_Q14 */
+    xmm_tempa = _mm_unpackhi_epi16( sAR2_Q14_lo_76543210, sAR2_Q14_hi_76543210 );
+    xmm_tempb = _mm_unpacklo_epi16( sAR2_Q14_lo_76543210, sAR2_Q14_hi_76543210 );
+    _mm_storeu_si128( (__m128i *)(&NSQ->sAR2_Q14[ 4 ]), xmm_tempa );
+    _mm_storeu_si128( (__m128i *)(&NSQ->sAR2_Q14[ 0 ]), xmm_tempb );
+
+    /* xq[ i ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( silk_SMULWW( psLPC_Q14[ i ], Gain_Q10 ), 8 ) ); */
+    {
+        __m128i xmm_Gain_Q10;
+        __m128i xmm_xq_Q14_3210, xmm_xq_Q14_x3x1, xmm_xq_Q14_7654, xmm_xq_Q14_x7x5;
+
+        /* prepare (1 << 7) in packed 4 32-bits */
+        xmm_tempa = _mm_set1_epi32( (1 << 7) );
+
+        /* prepare Gain_Q10 in packed 4 32-bits */
+        xmm_Gain_Q10 = _mm_set1_epi32( Gain_Q10 );
+
+        /* process xq */
+        for (i = 0; i < length - 7; i += 8)
+        {
+            xmm_xq_Q14_3210 = _mm_loadu_si128( (__m128i *)(&(psLPC_Q14[ i + 0 ] ) ) );
+            xmm_xq_Q14_7654 = _mm_loadu_si128( (__m128i *)(&(psLPC_Q14[ i + 4 ] ) ) );
+
+            /* equal shift right 4 bytes*/
+            xmm_xq_Q14_x3x1 = _mm_shuffle_epi32( xmm_xq_Q14_3210, _MM_SHUFFLE( 0, 3, 2, 1 ) );
+            /* equal shift right 4 bytes*/
+            xmm_xq_Q14_x7x5 = _mm_shuffle_epi32( xmm_xq_Q14_7654, _MM_SHUFFLE( 0, 3, 2, 1 ) );
+
+            xmm_xq_Q14_3210 = _mm_mul_epi32( xmm_xq_Q14_3210, xmm_Gain_Q10 );
+            xmm_xq_Q14_x3x1 = _mm_mul_epi32( xmm_xq_Q14_x3x1, xmm_Gain_Q10 );
+            xmm_xq_Q14_7654 = _mm_mul_epi32( xmm_xq_Q14_7654, xmm_Gain_Q10 );
+            xmm_xq_Q14_x7x5 = _mm_mul_epi32( xmm_xq_Q14_x7x5, xmm_Gain_Q10 );
+
+            xmm_xq_Q14_3210 = _mm_srli_epi64( xmm_xq_Q14_3210, 16 );
+            xmm_xq_Q14_x3x1 = _mm_slli_epi64( xmm_xq_Q14_x3x1, 16 );
+            xmm_xq_Q14_7654 = _mm_srli_epi64( xmm_xq_Q14_7654, 16 );
+            xmm_xq_Q14_x7x5 = _mm_slli_epi64( xmm_xq_Q14_x7x5, 16 );
+
+            xmm_xq_Q14_3210 = _mm_blend_epi16( xmm_xq_Q14_3210, xmm_xq_Q14_x3x1, 0xCC );
+            xmm_xq_Q14_7654 = _mm_blend_epi16( xmm_xq_Q14_7654, xmm_xq_Q14_x7x5, 0xCC );
+
+            /* silk_RSHIFT_ROUND(xq, 8) */
+            xmm_xq_Q14_3210 = _mm_add_epi32( xmm_xq_Q14_3210, xmm_tempa );
+            xmm_xq_Q14_7654 = _mm_add_epi32( xmm_xq_Q14_7654, xmm_tempa );
+
+            xmm_xq_Q14_3210 = _mm_srai_epi32( xmm_xq_Q14_3210, 8 );
+            xmm_xq_Q14_7654 = _mm_srai_epi32( xmm_xq_Q14_7654, 8 );
+
+            /* silk_SAT16 */
+            xmm_xq_Q14_3210 = _mm_packs_epi32( xmm_xq_Q14_3210, xmm_xq_Q14_7654 );
+
+            /* save to xq */
+            _mm_storeu_si128( (__m128i *)(&xq[ i ] ), xmm_xq_Q14_3210 );
+        }
+    }
+    for ( ; i < length; i++)
+    {
+        xq[i] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( silk_SMULWW( psLPC_Q14[ i ], Gain_Q10 ), 8 ) );
+    }
+
+    /* Update LPC synth buffer */
+    silk_memcpy( NSQ->sLPC_Q14, &NSQ->sLPC_Q14[ length ], NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) );
+}
+
+static OPUS_INLINE void silk_nsq_scale_states_sse4_1( /* Scale the input by 1/gain and rescale the NSQ states when the gain changed */
+    const silk_encoder_state *psEncC,           /* I    Encoder State (subfr_length and ltp_mem_length are read) */
+    silk_nsq_state      *NSQ,                   /* I/O  NSQ state                       */
+    const opus_int32    x_Q3[],                 /* I    input in Q3                     */
+    opus_int32          x_sc_Q10[],             /* O    input scaled with 1/Gain        */
+    const opus_int16    sLTP[],                 /* I    re-whitened LTP state in Q0     */
+    opus_int32          sLTP_Q15[],             /* I/O  LTP state matching scaled input (rescaled in place on a gain change) */
+    opus_int            subfr,                  /* I    subframe number                 */
+    const opus_int      LTP_scale_Q14,          /* I    LTP down-scaling factor; used only when rewhite_flag is set and subfr == 0 */
+    const opus_int32    Gains_Q16[ MAX_NB_SUBFR ], /* I  Gains; only Gains_Q16[ subfr ] is read */
+    const opus_int      pitchL[ MAX_NB_SUBFR ], /* I    Pitch lag; only pitchL[ subfr ] is read */
+    const opus_int      signal_type             /* I    Signal type                     */
+)
+{
+    opus_int   i, lag;
+    opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q23;
+    __m128i xmm_inv_gain_Q23, xmm_x_Q3_x2x0, xmm_x_Q3_x3x1;
+
+    lag          = pitchL[ subfr ];
+    inv_gain_Q31 = silk_INVERSE32_varQ( silk_max( Gains_Q16[ subfr ], 1 ), 47 ); /* gain is clamped to >= 1 before inversion */
+    silk_assert( inv_gain_Q31 != 0 );
+
+    /* Gain adjustment factor: silk_DIV32_varQ( previous gain, current gain, 16 ), or exactly 1 << 16 when the gain is unchanged */
+    if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) {
+        gain_adj_Q16 =  silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 );
+    } else {
+        gain_adj_Q16 = (opus_int32)1 << 16;
+    }
+
+    /* Scale input: x_sc_Q10[ i ] = silk_SMULWW( x_Q3[ i ], inv_gain_Q23 ), four samples per iteration with a scalar tail */
+    inv_gain_Q23 = silk_RSHIFT_ROUND( inv_gain_Q31, 8 );
+
+    /* broadcast inv_gain_Q23 to all four 32-bit lanes */
+    xmm_inv_gain_Q23 = _mm_set1_epi32(inv_gain_Q23);
+
+    for( i = 0; i < psEncC->subfr_length - 3; i += 4 ) {
+        xmm_x_Q3_x2x0 = _mm_loadu_si128( (__m128i *)(&(x_Q3[ i ] ) ) );
+
+        /* rotate the 32-bit lanes down by one so samples 1 and 3 sit in the even lanes that _mm_mul_epi32 multiplies */
+        xmm_x_Q3_x3x1 = _mm_shuffle_epi32( xmm_x_Q3_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
+
+        xmm_x_Q3_x2x0 = _mm_mul_epi32( xmm_x_Q3_x2x0, xmm_inv_gain_Q23 ); /* 64-bit products of samples 0 and 2 */
+        xmm_x_Q3_x3x1 = _mm_mul_epi32( xmm_x_Q3_x3x1, xmm_inv_gain_Q23 ); /* 64-bit products of samples 1 and 3 */
+
+        xmm_x_Q3_x2x0 = _mm_srli_epi64( xmm_x_Q3_x2x0, 16 ); /* product bits 16..47 -> low 32 bits of each 64-bit element */
+        xmm_x_Q3_x3x1 = _mm_slli_epi64( xmm_x_Q3_x3x1, 16 ); /* product bits 16..47 -> high 32 bits of each 64-bit element */
+
+        xmm_x_Q3_x2x0 = _mm_blend_epi16( xmm_x_Q3_x2x0, xmm_x_Q3_x3x1, 0xCC ); /* 0xCC: 16-bit words 2,3,6,7 (odd 32-bit lanes) come from x3x1 */
+
+        _mm_storeu_si128( (__m128i *)(&(x_sc_Q10[ i ] ) ), xmm_x_Q3_x2x0 );
+    }
+
+    for( ; i < psEncC->subfr_length; i++ ) { /* scalar tail for the samples the 4-wide loop did not cover */
+        x_sc_Q10[ i ] = silk_SMULWW( x_Q3[ i ], inv_gain_Q23 );
+    }
+
+    /* Save the current gain (not its inverse); it is the prev_gain_Q16 read by the gain-adjustment test above */
+    NSQ->prev_gain_Q16 = Gains_Q16[ subfr ];
+
+    /* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q31 */
+    if( NSQ->rewhite_flag ) {
+        if( subfr == 0 ) {
+            /* Do LTP downscaling: fold LTP_scale_Q14 into inv_gain_Q31 (first subframe only) */
+            inv_gain_Q31 = silk_LSHIFT( silk_SMULWB( inv_gain_Q31, LTP_scale_Q14 ), 2 );
+        }
+        for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx; i++ ) { /* the lag + LTP_ORDER / 2 entries ending just before sLTP_buf_idx */
+            silk_assert( i < MAX_FRAME_LENGTH );
+            sLTP_Q15[ i ] = silk_SMULWB( inv_gain_Q31, sLTP[ i ] );
+        }
+    }
+
+    /* Adjust for changing gain: everything below is skipped when gain_adj_Q16 is exactly 1 << 16 */
+    if( gain_adj_Q16 != (opus_int32)1 << 16 ) {
+        /* Scale long-term shaping state: the ltp_mem_length entries ending just before sLTP_shp_buf_idx */
+        __m128i xmm_gain_adj_Q16, xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1;
+
+        /* broadcast gain_adj_Q16 to all four 32-bit lanes */
+        xmm_gain_adj_Q16 = _mm_set1_epi32(gain_adj_Q16);
+
+        for( i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx - 3; i += 4 )
+        {
+            xmm_sLTP_shp_Q14_x2x0 = _mm_loadu_si128( (__m128i *)(&(NSQ->sLTP_shp_Q14[ i ] ) ) );
+            /* rotate the 32-bit lanes down by one so entries 1 and 3 sit in the even lanes that _mm_mul_epi32 multiplies */
+            xmm_sLTP_shp_Q14_x3x1 = _mm_shuffle_epi32( xmm_sLTP_shp_Q14_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
+
+            xmm_sLTP_shp_Q14_x2x0 = _mm_mul_epi32( xmm_sLTP_shp_Q14_x2x0, xmm_gain_adj_Q16 );
+            xmm_sLTP_shp_Q14_x3x1 = _mm_mul_epi32( xmm_sLTP_shp_Q14_x3x1, xmm_gain_adj_Q16 );
+
+            xmm_sLTP_shp_Q14_x2x0 = _mm_srli_epi64( xmm_sLTP_shp_Q14_x2x0, 16 ); /* same bits-16..47 extraction as the input scaling loop */
+            xmm_sLTP_shp_Q14_x3x1 = _mm_slli_epi64( xmm_sLTP_shp_Q14_x3x1, 16 );
+
+            xmm_sLTP_shp_Q14_x2x0 = _mm_blend_epi16( xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1, 0xCC );
+
+            _mm_storeu_si128( (__m128i *)(&(NSQ->sLTP_shp_Q14[ i ] ) ), xmm_sLTP_shp_Q14_x2x0 );
+        }
+
+        for( ; i < NSQ->sLTP_shp_buf_idx; i++ ) { /* scalar tail for the entries the 4-wide loop did not cover */
+            NSQ->sLTP_shp_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sLTP_shp_Q14[ i ] );
+        }
+
+        /* Scale long-term prediction state (voiced only; skipped when rewhite_flag is set, where sLTP_Q15 was rebuilt above) */
+        if( signal_type == TYPE_VOICED && NSQ->rewhite_flag == 0 ) {
+            for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx; i++ ) {
+                sLTP_Q15[ i ] = silk_SMULWW( gain_adj_Q16, sLTP_Q15[ i ] );
+            }
+        }
+
+        NSQ->sLF_AR_shp_Q14 = silk_SMULWW( gain_adj_Q16, NSQ->sLF_AR_shp_Q14 );
+
+        /* Scale short-term prediction and shaping states */
+        for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) {
+            NSQ->sLPC_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sLPC_Q14[ i ] );
+        }
+        for( i = 0; i < MAX_SHAPE_LPC_ORDER; i++ ) {
+            NSQ->sAR2_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sAR2_Q14[ i ] );
+        }
+    }
+}
diff --git a/silk/x86/SigProc_FIX_sse.h b/silk/x86/SigProc_FIX_sse.h
new file mode 100644
index 0000000..61efa8d
--- /dev/null
+++ b/silk/x86/SigProc_FIX_sse.h
@@ -0,0 +1,94 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+   Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef SIGPROC_FIX_SSE_H
+#define SIGPROC_FIX_SSE_H
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#if defined(OPUS_X86_MAY_HAVE_SSE4_1)
+void silk_burg_modified_sse4_1(
+    opus_int32                  *res_nrg,           /* O    Residual energy                                             */
+    opus_int                    *res_nrg_Q,         /* O    Residual energy Q value                                     */
+    opus_int32                  A_Q16[],            /* O    Prediction coefficients (length order)                      */
+    const opus_int16            x[],                /* I    Input signal, length: nb_subfr * ( D + subfr_length )       */
+    const opus_int32            minInvGain_Q30,     /* I    Inverse of max prediction gain                              */
+    const opus_int              subfr_length,       /* I    Input signal subframe length (incl. D preceding samples)    */
+    const opus_int              nb_subfr,           /* I    Number of subframes stacked in x                            */
+    const opus_int              D,                  /* I    Order                                                       */
+    int                         arch                /* I    Run-time architecture                                       */
+);
+
+#if defined(OPUS_X86_PRESUME_SSE4_1) /* silk_burg_modified() expands to a direct call of the SSE4.1 routine */
+#define silk_burg_modified(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch) \
+    ((void)(arch), silk_burg_modified_sse4_1(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch))
+
+#else /* !OPUS_X86_PRESUME_SSE4_1 */
+/* silk_burg_modified() dispatches through a function-pointer table indexed by ( arch & OPUS_ARCHMASK ); the table is defined elsewhere */
+extern void (*const SILK_BURG_MODIFIED_IMPL[OPUS_ARCHMASK + 1])(
+    opus_int32                  *res_nrg,           /* O    Residual energy                                             */
+    opus_int                    *res_nrg_Q,         /* O    Residual energy Q value                                     */
+    opus_int32                  A_Q16[],            /* O    Prediction coefficients (length order)                      */
+    const opus_int16            x[],                /* I    Input signal, length: nb_subfr * ( D + subfr_length )       */
+    const opus_int32            minInvGain_Q30,     /* I    Inverse of max prediction gain                              */
+    const opus_int              subfr_length,       /* I    Input signal subframe length (incl. D preceding samples)    */
+    const opus_int              nb_subfr,           /* I    Number of subframes stacked in x                            */
+    const opus_int              D,                  /* I    Order                                                       */
+    int                         arch                /* I    Run-time architecture                                       */);
+
+#  define silk_burg_modified(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch) \
+    ((*SILK_BURG_MODIFIED_IMPL[(arch) & OPUS_ARCHMASK])(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch))
+
+#endif /* OPUS_X86_PRESUME_SSE4_1 */
+
+opus_int64 silk_inner_prod16_aligned_64_sse4_1( /* SSE4.1 routine behind silk_inner_prod16_aligned_64(); takes no arch argument */
+    const opus_int16 *inVec1,
+    const opus_int16 *inVec2,
+    const opus_int   len
+);
+
+
+#if defined(OPUS_X86_PRESUME_SSE4_1) /* direct call; arch is evaluated and discarded */
+
+#define silk_inner_prod16_aligned_64(inVec1, inVec2, len, arch) \
+    ((void)(arch),silk_inner_prod16_aligned_64_sse4_1(inVec1, inVec2, len))
+
+#else /* !OPUS_X86_PRESUME_SSE4_1: dispatch through a table indexed by ( arch & OPUS_ARCHMASK ) */
+
+extern opus_int64 (*const SILK_INNER_PROD16_ALIGNED_64_IMPL[OPUS_ARCHMASK + 1])(
+                    const opus_int16 *inVec1,
+                    const opus_int16 *inVec2,
+                    const opus_int   len);
+
+#  define silk_inner_prod16_aligned_64(inVec1, inVec2, len, arch) \
+    ((*SILK_INNER_PROD16_ALIGNED_64_IMPL[(arch) & OPUS_ARCHMASK])(inVec1, inVec2, len))
+
+#endif /* OPUS_X86_PRESUME_SSE4_1 */
+#endif /* OPUS_X86_MAY_HAVE_SSE4_1 */
+#endif /* SIGPROC_FIX_SSE_H */
diff --git a/silk/x86/VAD_sse.c b/silk/x86/VAD_sse.c
new file mode 100644
index 0000000..4e90f44
--- /dev/null
+++ b/silk/x86/VAD_sse.c
@@ -0,0 +1,277 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+   Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#include <smmintrin.h>
+
+#include "main.h"
+#include "stack_alloc.h"
+
+/* Per-band weights for the tilt measure: tiltWeights[ b ] scales SNR_Q7 of band b when input_tilt is accumulated in silk_VAD_GetSA_Q8_sse4_1() */
+static const opus_int32 tiltWeights[ VAD_N_BANDS ] = { 30000, 6000, -12000, -12000 };
+
+/***************************************/
+/* Get the speech activity level in Q8 */
+/***************************************/
+opus_int silk_VAD_GetSA_Q8_sse4_1(                  /* O    Return value; ret is never changed from 0 in this function */
+    silk_encoder_state          *psEncC,            /* I/O  Encoder state                               */
+    const opus_int16            pIn[]               /* I    PCM input                                   */
+)
+{
+    opus_int   SA_Q15, pSNR_dB_Q7, input_tilt;
+    opus_int   decimated_framelength1, decimated_framelength2;
+    opus_int   decimated_framelength;
+    opus_int   dec_subframe_length, dec_subframe_offset, SNR_Q7, i, b, s;
+    opus_int32 sumSquared, smooth_coef_Q16;
+    opus_int16 HPstateTmp;
+    VARDECL( opus_int16, X );                       /* scratch buffer for the decimated bands, laid out by X_offset[] */
+    opus_int32 Xnrg[ VAD_N_BANDS ];
+    opus_int32 NrgToNoiseRatio_Q8[ VAD_N_BANDS ];
+    opus_int32 speech_nrg, x_tmp;
+    opus_int   X_offset[ VAD_N_BANDS ];
+    opus_int   ret = 0;
+    silk_VAD_state *psSilk_VAD = &psEncC->sVAD;
+
+    SAVE_STACK;
+
+    /* Safety checks */
+    silk_assert( VAD_N_BANDS == 4 );
+    silk_assert( MAX_FRAME_LENGTH >= psEncC->frame_length );
+    silk_assert( psEncC->frame_length <= 512 );
+    silk_assert( psEncC->frame_length == 8 * silk_RSHIFT( psEncC->frame_length, 3 ) );
+
+    /***********************/
+    /* Filter and Decimate */
+    /***********************/
+    decimated_framelength1 = silk_RSHIFT( psEncC->frame_length, 1 );
+    decimated_framelength2 = silk_RSHIFT( psEncC->frame_length, 2 );
+    decimated_framelength = silk_RSHIFT( psEncC->frame_length, 3 );
+    /* Decimate into 4 bands:
+       0       L      3L       L              3L                             5L
+               -      --       -              --                             --
+               8       8       2               4                              4
+
+       [0-1 kHz| temp. |1-2 kHz|    2-4 kHz    |            4-8 kHz           |
+
+       They're arranged to allow the minimal ( frame_length / 4 ) extra
+       scratch space during the downsampling process */
+    X_offset[ 0 ] = 0;
+    X_offset[ 1 ] = decimated_framelength + decimated_framelength2;
+    X_offset[ 2 ] = X_offset[ 1 ] + decimated_framelength;
+    X_offset[ 3 ] = X_offset[ 2 ] + decimated_framelength2;
+    ALLOC( X, X_offset[ 3 ] + decimated_framelength1, opus_int16 );
+
+    /* 0-8 kHz to 0-4 kHz and 4-8 kHz */
+    silk_ana_filt_bank_1( pIn, &psSilk_VAD->AnaState[  0 ],
+        X, &X[ X_offset[ 3 ] ], psEncC->frame_length );
+
+    /* 0-4 kHz to 0-2 kHz and 2-4 kHz */
+    silk_ana_filt_bank_1( X, &psSilk_VAD->AnaState1[ 0 ],
+        X, &X[ X_offset[ 2 ] ], decimated_framelength1 );
+
+    /* 0-2 kHz to 0-1 kHz and 1-2 kHz */
+    silk_ana_filt_bank_1( X, &psSilk_VAD->AnaState2[ 0 ],
+        X, &X[ X_offset[ 1 ] ], decimated_framelength2 );
+
+    /*********************************************/
+    /* HP filter on lowest band (differentiator) */
+    /*********************************************/
+    X[ decimated_framelength - 1 ] = silk_RSHIFT( X[ decimated_framelength - 1 ], 1 );
+    HPstateTmp = X[ decimated_framelength - 1 ];    /* halved last sample, saved before it is differenced */
+    for( i = decimated_framelength - 1; i > 0; i-- ) { /* runs backwards so X[ i - 1 ] is halved but not yet differenced when subtracted */
+        X[ i - 1 ]  = silk_RSHIFT( X[ i - 1 ], 1 );
+        X[ i ]     -= X[ i - 1 ];
+    }
+    X[ 0 ] -= psSilk_VAD->HPstate;                  /* first sample is differenced against the state stored in psSilk_VAD */
+    psSilk_VAD->HPstate = HPstateTmp;
+
+    /*************************************/
+    /* Calculate the energy in each band */
+    /*************************************/
+    for( b = 0; b < VAD_N_BANDS; b++ ) {
+        /* Find the decimated framelength in the non-uniformly divided bands */
+        decimated_framelength = silk_RSHIFT( psEncC->frame_length, silk_min_int( VAD_N_BANDS - b, VAD_N_BANDS - 1 ) );
+
+        /* Split length into subframe lengths */
+        dec_subframe_length = silk_RSHIFT( decimated_framelength, VAD_INTERNAL_SUBFRAMES_LOG2 );
+        dec_subframe_offset = 0;
+
+        /* Compute energy per sub-frame */
+        /* initialize with the last-subframe energy saved in XnrgSubfr[ b ] (written at the end of this loop body) */
+        Xnrg[ b ] = psSilk_VAD->XnrgSubfr[ b ];
+        for( s = 0; s < VAD_INTERNAL_SUBFRAMES; s++ ) {
+            __m128i xmm_X, xmm_acc;
+            sumSquared = 0;
+
+            xmm_acc = _mm_setzero_si128();
+
+            for( i = 0; i < dec_subframe_length - 7; i += 8 ) /* eight 16-bit samples per iteration; the scalar loop below handles the rest */
+            {
+                xmm_X   = _mm_loadu_si128( (__m128i *)&(X[ X_offset[ b ] + i + dec_subframe_offset ] ) );
+                xmm_X   = _mm_srai_epi16( xmm_X, 3 );      /* arithmetic >> 3 per sample, same pre-scaling as the scalar tail */
+                xmm_X   = _mm_madd_epi16( xmm_X, xmm_X );  /* square and add adjacent pairs -> four 32-bit partial sums */
+                xmm_acc = _mm_add_epi32( xmm_acc, xmm_X );
+            }
+
+            xmm_acc = _mm_add_epi32( xmm_acc, _mm_unpackhi_epi64( xmm_acc, xmm_acc ) );  /* horizontal sum, step 1: fold upper 64 bits onto lower */
+            xmm_acc = _mm_add_epi32( xmm_acc, _mm_shufflelo_epi16( xmm_acc, 0x0E ) );    /* step 2: fold 32-bit lane 1 onto lane 0 */
+
+            sumSquared += _mm_cvtsi128_si32( xmm_acc );    /* lane 0 now holds the total of the four partial sums */
+
+            for( ; i < dec_subframe_length; i++ ) {
+                /* The energy will be less than dec_subframe_length * ( silk_int16_MIN / 8 ) ^ 2.            */
+                /* Therefore we can accumulate with no risk of overflow (unless dec_subframe_length > 128)  */
+                x_tmp = silk_RSHIFT(
+                    X[ X_offset[ b ] + i + dec_subframe_offset ], 3 );
+                sumSquared = silk_SMLABB( sumSquared, x_tmp, x_tmp );
+
+                /* Safety check */
+                silk_assert( sumSquared >= 0 );
+            }
+
+            /* Add/saturate summed energy of current subframe */
+            if( s < VAD_INTERNAL_SUBFRAMES - 1 ) {
+                Xnrg[ b ] = silk_ADD_POS_SAT32( Xnrg[ b ], sumSquared );
+            } else {
+                /* Look-ahead subframe: only half of its energy is added */
+                Xnrg[ b ] = silk_ADD_POS_SAT32( Xnrg[ b ], silk_RSHIFT( sumSquared, 1 ) );
+            }
+
+            dec_subframe_offset += dec_subframe_length;
+        }
+        psSilk_VAD->XnrgSubfr[ b ] = sumSquared;    /* full energy of the final subframe, read back as the initial Xnrg[ b ] above */
+    }
+
+    /********************/
+    /* Noise estimation */
+    /********************/
+    silk_VAD_GetNoiseLevels( &Xnrg[ 0 ], psSilk_VAD );
+
+    /***********************************************/
+    /* Signal-plus-noise to noise ratio estimation */
+    /***********************************************/
+    sumSquared = 0;
+    input_tilt = 0;
+    for( b = 0; b < VAD_N_BANDS; b++ ) {
+        speech_nrg = Xnrg[ b ] - psSilk_VAD->NL[ b ];
+        if( speech_nrg > 0 ) {
+            /* Divide, with sufficient resolution: shift the numerator up when its top 9 bits are clear, else shift the denominator down */
+            if( ( Xnrg[ b ] & 0xFF800000 ) == 0 ) {
+                NrgToNoiseRatio_Q8[ b ] = silk_DIV32( silk_LSHIFT( Xnrg[ b ], 8 ), psSilk_VAD->NL[ b ] + 1 );
+            } else {
+                NrgToNoiseRatio_Q8[ b ] = silk_DIV32( Xnrg[ b ], silk_RSHIFT( psSilk_VAD->NL[ b ], 8 ) + 1 );
+            }
+
+            /* Convert to log domain */
+            SNR_Q7 = silk_lin2log( NrgToNoiseRatio_Q8[ b ] ) - 8 * 128;
+
+            /* Sum-of-squares */
+            sumSquared = silk_SMLABB( sumSquared, SNR_Q7, SNR_Q7 );          /* Q14 */
+
+            /* Tilt measure */
+            if( speech_nrg < ( (opus_int32)1 << 20 ) ) {
+                /* Scale down SNR value for small subband speech energies */
+                SNR_Q7 = silk_SMULWB( silk_LSHIFT( silk_SQRT_APPROX( speech_nrg ), 6 ), SNR_Q7 );
+            }
+            input_tilt = silk_SMLAWB( input_tilt, tiltWeights[ b ], SNR_Q7 );
+        } else {
+            NrgToNoiseRatio_Q8[ b ] = 256;          /* band energy not above the noise level: ratio fixed at 256 */
+        }
+    }
+
+    /* Mean-of-squares */
+    sumSquared = silk_DIV32_16( sumSquared, VAD_N_BANDS ); /* Q14 */
+
+    /* Root-mean-square approximation, scaled to dB and kept in the local pSNR_dB_Q7 (this function has no output pointer) */
+    pSNR_dB_Q7 = (opus_int16)( 3 * silk_SQRT_APPROX( sumSquared ) ); /* Q7 */
+
+    /*********************************/
+    /* Speech Probability Estimation */
+    /*********************************/
+    SA_Q15 = silk_sigm_Q15( silk_SMULWB( VAD_SNR_FACTOR_Q16, pSNR_dB_Q7 ) - VAD_NEGATIVE_OFFSET_Q5 );
+
+    /**************************/
+    /* Frequency Tilt Measure */
+    /**************************/
+    psEncC->input_tilt_Q15 = silk_LSHIFT( silk_sigm_Q15( input_tilt ) - 16384, 1 );
+
+    /**************************************************/
+    /* Scale the sigmoid output based on power levels */
+    /**************************************************/
+    speech_nrg = 0;
+    for( b = 0; b < VAD_N_BANDS; b++ ) {
+        /* Accumulate signal-without-noise energies, higher frequency bands have more weight */
+        speech_nrg += ( b + 1 ) * silk_RSHIFT( Xnrg[ b ] - psSilk_VAD->NL[ b ], 4 );
+    }
+
+    /* Power scaling: halve SA_Q15 when there is no net speech energy, boost it via a square root for small energies, else leave it */
+    if( speech_nrg <= 0 ) {
+        SA_Q15 = silk_RSHIFT( SA_Q15, 1 );
+    } else if( speech_nrg < 32768 ) {
+        if( psEncC->frame_length == 10 * psEncC->fs_kHz ) { /* frame_length equals 10 * fs_kHz: one extra bit of up-shift */
+            speech_nrg = silk_LSHIFT_SAT32( speech_nrg, 16 );
+        } else {
+            speech_nrg = silk_LSHIFT_SAT32( speech_nrg, 15 );
+        }
+
+        /* square-root */
+        speech_nrg = silk_SQRT_APPROX( speech_nrg );
+        SA_Q15 = silk_SMULWB( 32768 + speech_nrg, SA_Q15 );
+    }
+
+    /* Store the resulting speech activity in Q8 ( SA_Q15 >> 7 ), capped at silk_uint8_MAX */
+    psEncC->speech_activity_Q8 = silk_min_int( silk_RSHIFT( SA_Q15, 7 ), silk_uint8_MAX );
+
+    /***********************************/
+    /* Energy Level and SNR estimation */
+    /***********************************/
+    /* Smoothing coefficient */
+    smooth_coef_Q16 = silk_SMULWB( VAD_SNR_SMOOTH_COEF_Q18, silk_SMULWB( (opus_int32)SA_Q15, SA_Q15 ) );
+
+    if( psEncC->frame_length == 10 * psEncC->fs_kHz ) {
+        smooth_coef_Q16 >>= 1;                      /* coefficient is halved when frame_length equals 10 * fs_kHz */
+    }
+
+    for( b = 0; b < VAD_N_BANDS; b++ ) {
+        /* compute smoothed energy-to-noise ratio per band */
+        psSilk_VAD->NrgRatioSmth_Q8[ b ] = silk_SMLAWB( psSilk_VAD->NrgRatioSmth_Q8[ b ],
+            NrgToNoiseRatio_Q8[ b ] - psSilk_VAD->NrgRatioSmth_Q8[ b ], smooth_coef_Q16 );
+
+        /* signal to noise ratio in dB per band */
+        SNR_Q7 = 3 * ( silk_lin2log( psSilk_VAD->NrgRatioSmth_Q8[b] ) - 8 * 128 );
+        /* quality = sigmoid( 0.25 * ( SNR_dB - 16 ) ); */
+        psEncC->input_quality_bands_Q15[ b ] = silk_sigm_Q15( silk_RSHIFT( SNR_Q7 - 16 * 128, 4 ) );
+    }
+
+    RESTORE_STACK;
+    return( ret );
+}
diff --git a/silk/x86/VQ_WMat_EC_sse.c b/silk/x86/VQ_WMat_EC_sse.c
new file mode 100644
index 0000000..74d6c6d
--- /dev/null
+++ b/silk/x86/VQ_WMat_EC_sse.c
@@ -0,0 +1,142 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+   Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#include <smmintrin.h>
+#include "main.h"
+#include "celt/x86/x86cpu.h"
+
+/* Entropy constrained matrix-weighted VQ, hard-coded to 5-element vectors, for a single input data vector (SSE4.1 variant) */
+void silk_VQ_WMat_EC_sse4_1(
+    opus_int8                   *ind,                           /* O    index of best codebook vector               */
+    opus_int32                  *rate_dist_Q14,                 /* O    best weighted quant error + mu * rate       */
+    opus_int                    *gain_Q7,                       /* O    sum of absolute LTP coefficients            */
+    const opus_int16            *in_Q14,                        /* I    input vector to be quantized                */
+    const opus_int32            *W_Q18,                         /* I    weighting matrix                            */
+    const opus_int8             *cb_Q7,                         /* I    codebook                                    */
+    const opus_uint8            *cb_gain_Q7,                    /* I    codebook effective gain                     */
+    const opus_uint8            *cl_Q5,                         /* I    code length for each codebook vector        */
+    const opus_int              mu_Q9,                          /* I    tradeoff betw. weighted error and rate      */
+    const opus_int32            max_gain_Q7,                    /* I    maximum sum of absolute LTP coefficients    */
+    opus_int                    L                               /* I    number of vectors in codebook               */
+)
+{
+    opus_int   k, gain_tmp_Q7;
+    const opus_int8 *cb_row_Q7;
+    opus_int16 diff_Q14[ 5 ];
+    opus_int32 sum1_Q14, sum2_Q16;
+    /* SIMD temporaries, used below as four 32-bit lanes (or two 64-bit products after _mm_mul_epi32) */
+    __m128i C_tmp1, C_tmp2, C_tmp3, C_tmp4, C_tmp5;
+    /* Loop over codebook; the running minimum starts at silk_int32_MAX */
+    *rate_dist_Q14 = silk_int32_MAX;
+    cb_row_Q7 = cb_Q7;
+    for( k = 0; k < L; k++ ) {
+        gain_tmp_Q7 = cb_gain_Q7[k];
+        /* Element 0 of the error vector is computed in scalar code: input minus the codebook entry shifted left by 7 */
+        diff_Q14[ 0 ] = in_Q14[ 0 ] - silk_LSHIFT( cb_row_Q7[ 0 ], 7 );
+        /* Elements 1..4 in SIMD; NOTE(review): the OP_CVT* helpers presumably widen four int16 / int8 values to 32-bit lanes - confirm in celt/x86/x86cpu.h */
+        C_tmp1 = OP_CVTEPI16_EPI32_M64( &in_Q14[ 1 ] );
+        C_tmp2 = OP_CVTEPI8_EPI32_M32( &cb_row_Q7[ 1 ] );
+        C_tmp2 = _mm_slli_epi32( C_tmp2, 7 );
+        C_tmp1 = _mm_sub_epi32( C_tmp1, C_tmp2 );
+        /* 16-bit words 0, 2, 4, 6 are the low halves of the four 32-bit lanes; C_tmp1 itself keeps the 32-bit differences */
+        diff_Q14[ 1 ] = _mm_extract_epi16( C_tmp1, 0 );
+        diff_Q14[ 2 ] = _mm_extract_epi16( C_tmp1, 2 );
+        diff_Q14[ 3 ] = _mm_extract_epi16( C_tmp1, 4 );
+        diff_Q14[ 4 ] = _mm_extract_epi16( C_tmp1, 6 );
+
+        /* Weighted rate */
+        sum1_Q14 = silk_SMULBB( mu_Q9, cl_Q5[ k ] );
+
+        /* Penalty for too large gain */
+        sum1_Q14 = silk_ADD_LSHIFT32( sum1_Q14, silk_max( silk_SUB32( gain_tmp_Q7, max_gain_Q7 ), 0 ), 10 );
+
+        silk_assert( sum1_Q14 >= 0 );
+
+        /* first row of W_Q18: _mm_mul_epi32 multiplies lanes 0 and 2 (W_Q18[1]*diff[1], W_Q18[3]*diff[3]) into 64-bit products; the 2-byte shift leaves product >> 16 in 32-bit lanes 0 and 2 */
+        C_tmp3 = _mm_loadu_si128( (__m128i *)(&W_Q18[ 1 ] ) );
+        C_tmp4 = _mm_mul_epi32( C_tmp3, C_tmp1 );
+        C_tmp4 = _mm_srli_si128( C_tmp4, 2 );
+        /* Rotate both operands so that the former lanes 1 and 3 (W_Q18[2]*diff[2], W_Q18[4]*diff[4]) land in lanes 0 and 2 */
+        C_tmp1 = _mm_shuffle_epi32( C_tmp1, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* rotate lanes right by one (4 bytes); lane 0 wraps to lane 3 */
+        C_tmp3 = _mm_shuffle_epi32( C_tmp3, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* rotate lanes right by one (4 bytes); lane 0 wraps to lane 3 */
+
+        C_tmp5 = _mm_mul_epi32( C_tmp3, C_tmp1 );
+        C_tmp5 = _mm_srli_si128( C_tmp5, 2 );
+        /* Pairwise sums in lanes 0 and 2, then doubled, mirroring the silk_LSHIFT( sum2_Q16, 1 ) of the scalar rows below */
+        C_tmp5 = _mm_add_epi32( C_tmp4, C_tmp5 );
+        C_tmp5 = _mm_slli_epi32( C_tmp5, 1 );
+        /* Fold lane 2 into lane 0 and read lane 0 back as the scalar sum of the four off-diagonal terms */
+        C_tmp5 = _mm_add_epi32( C_tmp5, _mm_shuffle_epi32( C_tmp5, _MM_SHUFFLE( 0, 0, 0, 2 ) ) );
+        sum2_Q16 = _mm_cvtsi128_si32( C_tmp5 );
+        /* Add the (undoubled) diagonal term W_Q18[0]*diff[0], then accumulate row 0 into the total */
+        sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[  0 ], diff_Q14[ 0 ] );
+        sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16,    diff_Q14[ 0 ] );
+
+        /* second row of W_Q18 (scalar from here on): off-diagonal terms first, doubled, then the diagonal term W_Q18[6] */
+        sum2_Q16 = silk_SMULWB(           W_Q18[  7 ], diff_Q14[ 2 ] );
+        sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[  8 ], diff_Q14[ 3 ] );
+        sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[  9 ], diff_Q14[ 4 ] );
+        sum2_Q16 = silk_LSHIFT( sum2_Q16, 1 );
+        sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[  6 ], diff_Q14[ 1 ] );
+        sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16,    diff_Q14[ 1 ] );
+
+        /* third row of W_Q18 */
+        sum2_Q16 = silk_SMULWB(           W_Q18[ 13 ], diff_Q14[ 3 ] );
+        sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 14 ], diff_Q14[ 4 ] );
+        sum2_Q16 = silk_LSHIFT( sum2_Q16, 1 );
+        sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 12 ], diff_Q14[ 2 ] );
+        sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16,    diff_Q14[ 2 ] );
+
+        /* fourth row of W_Q18 */
+        sum2_Q16 = silk_SMULWB(           W_Q18[ 19 ], diff_Q14[ 4 ] );
+        sum2_Q16 = silk_LSHIFT( sum2_Q16, 1 );
+        sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 18 ], diff_Q14[ 3 ] );
+        sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16,    diff_Q14[ 3 ] );
+
+        /* last row of W_Q18: only the diagonal term W_Q18[24] remains */
+        sum2_Q16 = silk_SMULWB(           W_Q18[ 24 ], diff_Q14[ 4 ] );
+        sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16,    diff_Q14[ 4 ] );
+
+        silk_assert( sum1_Q14 >= 0 );
+
+        /* find best; *ind and *gain_Q7 are written only when a candidate is strictly below the current minimum */
+        if( sum1_Q14 < *rate_dist_Q14 ) {
+            *rate_dist_Q14 = sum1_Q14;
+            *ind = (opus_int8)k;
+            *gain_Q7 = gain_tmp_Q7;
+        }
+
+        /* Go to next cbk vector */
+        cb_row_Q7 += LTP_ORDER;
+    }
+}
diff --git a/silk/x86/main_sse.h b/silk/x86/main_sse.h
new file mode 100644
index 0000000..afd5ec2
--- /dev/null
+++ b/silk/x86/main_sse.h
@@ -0,0 +1,276 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+   Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef MAIN_SSE_H
+#define MAIN_SSE_H
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+# if defined(OPUS_X86_MAY_HAVE_SSE4_1)
+
+#  define OVERRIDE_silk_VQ_WMat_EC
+/* SSE4.1 routine behind silk_VQ_WMat_EC; NOTE(review): the OVERRIDE_ flag above presumably suppresses a generic definition of the macro elsewhere - confirm */
+void silk_VQ_WMat_EC_sse4_1(
+    opus_int8                   *ind,                           /* O    index of best codebook vector               */
+    opus_int32                  *rate_dist_Q14,                 /* O    best weighted quant error + mu * rate       */
+    opus_int                    *gain_Q7,                       /* O    sum of absolute LTP coefficients            */
+    const opus_int16            *in_Q14,                        /* I    input vector to be quantized                */
+    const opus_int32            *W_Q18,                         /* I    weighting matrix                            */
+    const opus_int8             *cb_Q7,                         /* I    codebook                                    */
+    const opus_uint8            *cb_gain_Q7,                    /* I    codebook effective gain                     */
+    const opus_uint8            *cl_Q5,                         /* I    code length for each codebook vector        */
+    const opus_int              mu_Q9,                          /* I    tradeoff betw. weighted error and rate      */
+    const opus_int32            max_gain_Q7,                    /* I    maximum sum of absolute LTP coefficients    */
+    opus_int                    L                               /* I    number of vectors in codebook               */
+);
+
+#if defined OPUS_X86_PRESUME_SSE4_1
+/* SSE4.1 presumed at build time: call the SSE4.1 routine directly; arch is evaluated and discarded */
+#define silk_VQ_WMat_EC(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \
+                          mu_Q9, max_gain_Q7, L, arch) \
+    ((void)(arch),silk_VQ_WMat_EC_sse4_1(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \
+                          mu_Q9, max_gain_Q7, L))
+
+#else
+/* Otherwise dispatch through a function-pointer table indexed by (arch) & OPUS_ARCHMASK */
+extern void (*const SILK_VQ_WMAT_EC_IMPL[OPUS_ARCHMASK + 1])(
+    opus_int8                   *ind,                           /* O    index of best codebook vector               */
+    opus_int32                  *rate_dist_Q14,                 /* O    best weighted quant error + mu * rate       */
+    opus_int                    *gain_Q7,                       /* O    sum of absolute LTP coefficients            */
+    const opus_int16            *in_Q14,                        /* I    input vector to be quantized                */
+    const opus_int32            *W_Q18,                         /* I    weighting matrix                            */
+    const opus_int8             *cb_Q7,                         /* I    codebook                                    */
+    const opus_uint8            *cb_gain_Q7,                    /* I    codebook effective gain                     */
+    const opus_uint8            *cl_Q5,                         /* I    code length for each codebook vector        */
+    const opus_int              mu_Q9,                          /* I    tradeoff betw. weighted error and rate      */
+    const opus_int32            max_gain_Q7,                    /* I    maximum sum of absolute LTP coefficients    */
+    opus_int                    L                               /* I    number of vectors in codebook               */
+);
+
+#  define silk_VQ_WMat_EC(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \
+                          mu_Q9, max_gain_Q7, L, arch) \
+    ((*SILK_VQ_WMAT_EC_IMPL[(arch) & OPUS_ARCHMASK])(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \
+                          mu_Q9, max_gain_Q7, L))
+
+#endif
+
+#  define OVERRIDE_silk_NSQ
+/* SSE4.1 routine behind silk_NSQ; NOTE(review): the OVERRIDE_ flag above presumably suppresses a generic definition of the macro elsewhere - confirm */
+void silk_NSQ_sse4_1(
+    const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */
+    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */
+    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */
+    const opus_int32            x_Q3[],                                     /* I    Prefiltered input signal        */
+    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */
+    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */
+    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */
+    const opus_int16            AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs             */
+    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */
+    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */
+    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */
+    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */
+    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */
+    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */
+    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */
+);
+
+#if defined OPUS_X86_PRESUME_SSE4_1
+/* SSE4.1 presumed at build time: call the SSE4.1 routine directly; arch is evaluated and discarded */
+#define silk_NSQ(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
+                   HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
+    ((void)(arch),silk_NSQ_sse4_1(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
+                   HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14))
+
+#else
+/* Otherwise dispatch through a function-pointer table indexed by (arch) & OPUS_ARCHMASK */
+extern void (*const SILK_NSQ_IMPL[OPUS_ARCHMASK + 1])(
+    const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */
+    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */
+    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */
+    const opus_int32            x_Q3[],                                     /* I    Prefiltered input signal        */
+    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */
+    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */
+    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */
+    const opus_int16            AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs             */
+    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */
+    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */
+    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */
+    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */
+    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */
+    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */
+    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */
+);
+
+#  define silk_NSQ(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
+                   HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
+    ((*SILK_NSQ_IMPL[(arch) & OPUS_ARCHMASK])(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
+                   HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14))
+
+#endif
+
+#  define OVERRIDE_silk_NSQ_del_dec
+/* SSE4.1 routine behind silk_NSQ_del_dec; NOTE(review): the OVERRIDE_ flag above presumably suppresses a generic definition of the macro elsewhere - confirm */
+void silk_NSQ_del_dec_sse4_1(
+    const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */
+    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */
+    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */
+    const opus_int32            x_Q3[],                                     /* I    Prefiltered input signal        */
+    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */
+    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */
+    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */
+    const opus_int16            AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs             */
+    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */
+    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */
+    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */
+    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */
+    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */
+    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */
+    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */
+);
+
+#if defined OPUS_X86_PRESUME_SSE4_1
+/* SSE4.1 presumed at build time: call the SSE4.1 routine directly; arch is evaluated and discarded */
+#define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
+                           HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
+    ((void)(arch),silk_NSQ_del_dec_sse4_1(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
+                           HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14))
+
+#else
+/* Otherwise dispatch through a function-pointer table indexed by (arch) & OPUS_ARCHMASK */
+extern void (*const SILK_NSQ_DEL_DEC_IMPL[OPUS_ARCHMASK + 1])(
+    const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */
+    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */
+    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */
+    const opus_int32            x_Q3[],                                     /* I    Prefiltered input signal        */
+    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */
+    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */
+    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */
+    const opus_int16            AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs             */
+    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */
+    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */
+    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */
+    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */
+    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */
+    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */
+    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */
+);
+
+#  define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
+                           HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
+    ((*SILK_NSQ_DEL_DEC_IMPL[(arch) & OPUS_ARCHMASK])(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
+                           HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14))
+
+#endif
+/* Prototype only (no body in this header). NOTE(review): parameters whose description is empty are undocumented here - see the definition for their meaning */
+void silk_noise_shape_quantizer(
+    silk_nsq_state      *NSQ,                   /* I/O  NSQ state                       */
+    opus_int            signalType,             /* I    Signal type                     */
+    const opus_int32    x_sc_Q10[],             /* I                                    */
+    opus_int8           pulses[],               /* O                                    */
+    opus_int16          xq[],                   /* O                                    */
+    opus_int32          sLTP_Q15[],             /* I/O  LTP state                       */
+    const opus_int16    a_Q12[],                /* I    Short term prediction coefs     */
+    const opus_int16    b_Q14[],                /* I    Long term prediction coefs      */
+    const opus_int16    AR_shp_Q13[],           /* I    Noise shaping AR coefs          */
+    opus_int            lag,                    /* I    Pitch lag                       */
+    opus_int32          HarmShapeFIRPacked_Q14, /* I                                    */
+    opus_int            Tilt_Q14,               /* I    Spectral tilt                   */
+    opus_int32          LF_shp_Q14,             /* I                                    */
+    opus_int32          Gain_Q16,               /* I                                    */
+    opus_int            Lambda_Q10,             /* I                                    */
+    opus_int            offset_Q10,             /* I                                    */
+    opus_int            length,                 /* I    Input length                    */
+    opus_int            shapingLPCOrder,        /* I    Noise shaping AR filter order   */
+    opus_int            predictLPCOrder         /* I    Prediction filter order         */
+);
+
+/**************************************************************/
+/* Noise level estimation (prototype only, defined elsewhere) */
+/**************************************************************/
+void silk_VAD_GetNoiseLevels(
+    const opus_int32            pX[ VAD_N_BANDS ],  /* I    subband energies                            */
+    silk_VAD_state              *psSilk_VAD         /* I/O  Pointer to Silk VAD state                   */
+);
+
+#  define OVERRIDE_silk_VAD_GetSA_Q8
+/* SSE4.1 routine behind silk_VAD_GetSA_Q8: called directly when SSE4.1 is presumed, via the arch-indexed table otherwise */
+opus_int silk_VAD_GetSA_Q8_sse4_1(
+    silk_encoder_state *psEnC,
+    const opus_int16   pIn[]
+);
+
+#if defined(OPUS_X86_PRESUME_SSE4_1)
+#define silk_VAD_GetSA_Q8(psEnC, pIn, arch) ((void)(arch),silk_VAD_GetSA_Q8_sse4_1(psEnC, pIn))
+
+#else
+
+#  define silk_VAD_GetSA_Q8(psEnC, pIn, arch) \
+     ((*SILK_VAD_GETSA_Q8_IMPL[(arch) & OPUS_ARCHMASK])(psEnC, pIn))
+
+extern opus_int (*const SILK_VAD_GETSA_Q8_IMPL[OPUS_ARCHMASK + 1])(
+     silk_encoder_state *psEnC,
+     const opus_int16   pIn[]);
+
+#endif
+
+/* Set unconditionally: both branches below define silk_warped_LPC_analysis_filter_FIX (the flag used to sit in the #else of the VAD block above) */
+#  define OVERRIDE_silk_warped_LPC_analysis_filter_FIX
+void silk_warped_LPC_analysis_filter_FIX_sse4_1(
+          opus_int32            state[],                    /* I/O  State [order + 1]                   */
+          opus_int32            res_Q2[],                   /* O    Residual signal [length]            */
+    const opus_int16            coef_Q13[],                 /* I    Coefficients [order]                */
+    const opus_int16            input[],                    /* I    Input signal [length]               */
+    const opus_int16            lambda_Q16,                 /* I    Warping factor                      */
+    const opus_int              length,                     /* I    Length of input signal              */
+    const opus_int              order                       /* I    Filter order (even)                 */
+);
+/* NOTE(review): the presumed-SSE4.1 macro below dispatches to the _c routine, not the _sse4_1 one declared above - confirm this is intentional */
+#if defined(OPUS_X86_PRESUME_SSE4_1)
+#define silk_warped_LPC_analysis_filter_FIX(state, res_Q2, coef_Q13, input, lambda_Q16, length, order, arch) \
+    ((void)(arch),silk_warped_LPC_analysis_filter_FIX_c(state, res_Q2, coef_Q13, input, lambda_Q16, length, order))
+
+#else
+
+extern void (*const SILK_WARPED_LPC_ANALYSIS_FILTER_FIX_IMPL[OPUS_ARCHMASK + 1])(
+          opus_int32            state[],                    /* I/O  State [order + 1]                   */
+          opus_int32            res_Q2[],                   /* O    Residual signal [length]            */
+    const opus_int16            coef_Q13[],                 /* I    Coefficients [order]                */
+    const opus_int16            input[],                    /* I    Input signal [length]               */
+    const opus_int16            lambda_Q16,                 /* I    Warping factor                      */
+    const opus_int              length,                     /* I    Length of input signal              */
+    const opus_int              order                       /* I    Filter order (even)                 */
+);
+
+#  define silk_warped_LPC_analysis_filter_FIX(state, res_Q2, coef_Q13, input, lambda_Q16, length, order, arch) \
+    ((*SILK_WARPED_LPC_ANALYSIS_FILTER_FIX_IMPL[(arch) & OPUS_ARCHMASK])(state, res_Q2, coef_Q13, input, lambda_Q16, length, order))
+
+#endif
+
+# endif
+#endif
diff --git a/silk/x86/x86_silk_map.c b/silk/x86/x86_silk_map.c
new file mode 100644
index 0000000..818841f
--- /dev/null
+++ b/silk/x86/x86_silk_map.c
@@ -0,0 +1,174 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+   Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#if defined(HAVE_CONFIG_H)
+#include "config.h"
+#endif
+
+#include "celt/x86/x86cpu.h"
+#include "structs.h"
+#include "SigProc_FIX.h"
+#include "pitch.h"
+#include "main.h"
+
+#if !defined(OPUS_X86_PRESUME_SSE4_1)
+
+#if defined(FIXED_POINT)
+
+#include "fixed/main_FIX.h"
+/* Implementations of silk_inner_prod16_aligned_64, one slot per arch value. NOTE(review): presumably indexed with (arch) & OPUS_ARCHMASK like the tables below - confirm at the call site */
+opus_int64 (*const SILK_INNER_PROD16_ALIGNED_64_IMPL[ OPUS_ARCHMASK + 1 ] )(
+    const opus_int16 *inVec1,
+    const opus_int16 *inVec2,
+    const opus_int   len
+) = {
+  silk_inner_prod16_aligned_64_c,                  /* non-sse */
+  silk_inner_prod16_aligned_64_c,                  /* C fallback for this slot as well */
+  silk_inner_prod16_aligned_64_c,                  /* C fallback for this slot as well */
+  MAY_HAVE_SSE4_1( silk_inner_prod16_aligned_64 ), /* sse4.1 */
+  MAY_HAVE_SSE4_1( silk_inner_prod16_aligned_64 )  /* avx */
+};
+
+#endif
+/* Implementations of silk_VAD_GetSA_Q8, one slot per arch value; the silk_VAD_GetSA_Q8 macro in main_sse.h indexes this table with (arch) & OPUS_ARCHMASK */
+opus_int (*const SILK_VAD_GETSA_Q8_IMPL[ OPUS_ARCHMASK + 1 ] )(
+    silk_encoder_state *psEncC,
+    const opus_int16   pIn[]
+) = {
+  silk_VAD_GetSA_Q8_c,                  /* non-sse */
+  silk_VAD_GetSA_Q8_c,                  /* C fallback for this slot as well */
+  silk_VAD_GetSA_Q8_c,                  /* C fallback for this slot as well */
+  MAY_HAVE_SSE4_1( silk_VAD_GetSA_Q8 ), /* sse4.1 */
+  MAY_HAVE_SSE4_1( silk_VAD_GetSA_Q8 )  /* avx */
+};
+/* Implementations of silk_NSQ, one slot per arch value; the silk_NSQ macro in main_sse.h indexes this table with (arch) & OPUS_ARCHMASK */
+void (*const SILK_NSQ_IMPL[ OPUS_ARCHMASK + 1 ] )(
+    const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */
+    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */
+    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */
+    const opus_int32            x_Q3[],                                     /* I    Prefiltered input signal        */
+    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */
+    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */
+    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */
+    const opus_int16            AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs             */
+    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */
+    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */
+    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */
+    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */
+    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */
+    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */
+    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */
+) = {
+  silk_NSQ_c,                  /* non-sse */
+  silk_NSQ_c,                  /* C fallback for this slot as well */
+  silk_NSQ_c,                  /* C fallback for this slot as well */
+  MAY_HAVE_SSE4_1( silk_NSQ ), /* sse4.1 */
+  MAY_HAVE_SSE4_1( silk_NSQ )  /* avx */
+};
+/* Implementations of silk_VQ_WMat_EC, one slot per arch value; the silk_VQ_WMat_EC macro in main_sse.h indexes this table with (arch) & OPUS_ARCHMASK */
+void (*const SILK_VQ_WMAT_EC_IMPL[ OPUS_ARCHMASK + 1 ] )(
+    opus_int8                   *ind,                           /* O    index of best codebook vector               */
+    opus_int32                  *rate_dist_Q14,                 /* O    best weighted quant error + mu * rate       */
+    opus_int                    *gain_Q7,                       /* O    sum of absolute LTP coefficients            */
+    const opus_int16            *in_Q14,                        /* I    input vector to be quantized                */
+    const opus_int32            *W_Q18,                         /* I    weighting matrix                            */
+    const opus_int8             *cb_Q7,                         /* I    codebook                                    */
+    const opus_uint8            *cb_gain_Q7,                    /* I    codebook effective gain                     */
+    const opus_uint8            *cl_Q5,                         /* I    code length for each codebook vector        */
+    const opus_int              mu_Q9,                          /* I    tradeoff betw. weighted error and rate      */
+    const opus_int32            max_gain_Q7,                    /* I    maximum sum of absolute LTP coefficients    */
+    opus_int                    L                               /* I    number of vectors in codebook               */
+) = {
+  silk_VQ_WMat_EC_c,                  /* non-sse */
+  silk_VQ_WMat_EC_c,                  /* C fallback for this slot as well */
+  silk_VQ_WMat_EC_c,                  /* C fallback for this slot as well */
+  MAY_HAVE_SSE4_1( silk_VQ_WMat_EC ), /* sse4.1 */
+  MAY_HAVE_SSE4_1( silk_VQ_WMat_EC )  /* avx */
+};
+
+void (*const SILK_NSQ_DEL_DEC_IMPL[ OPUS_ARCHMASK + 1 ] )( /* Function-pointer table for silk_NSQ_del_dec; slots are labelled by x86 feature level below */
+    const silk_encoder_state    *psEncC,                                    /* I/O  Encoder State                   */
+    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */
+    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */
+    const opus_int32            x_Q3[],                                     /* I    Prefiltered input signal        */
+    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */
+    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */
+    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */
+    const opus_int16            AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs             */
+    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */
+    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */
+    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */
+    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */
+    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */
+    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */
+    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */
+) = {
+  silk_NSQ_del_dec_c,                  /* non-sse */
+  silk_NSQ_del_dec_c,                  /* slot 1 (unlabelled upstream): same function as slot 0 */
+  silk_NSQ_del_dec_c,                  /* slot 2 (unlabelled upstream): same function as slot 0 */
+  MAY_HAVE_SSE4_1( silk_NSQ_del_dec ), /* sse4.1 */
+  MAY_HAVE_SSE4_1( silk_NSQ_del_dec )  /* avx: same entry as the sse4.1 slot */
+};
+
+#if defined(FIXED_POINT)
+
+void (*const SILK_WARPED_LPC_ANALYSIS_FILTER_FIX_IMPL[ OPUS_ARCHMASK + 1 ] )( /* Function-pointer table for silk_warped_LPC_analysis_filter_FIX; only built when FIXED_POINT is defined */
+    opus_int32                  state[],                    /* I/O  State [order + 1]                   */
+    opus_int32                  res_Q2[],                   /* O    Residual signal [length]            */
+    const opus_int16            coef_Q13[],                 /* I    Coefficients [order]                */
+    const opus_int16            input[],                    /* I    Input signal [length]               */
+    const opus_int16            lambda_Q16,                 /* I    Warping factor                      */
+    const opus_int              length,                     /* I    Length of input signal              */
+    const opus_int              order                       /* I    Filter order (even)                 */
+) = {
+  silk_warped_LPC_analysis_filter_FIX_c,                  /* non-sse */
+  silk_warped_LPC_analysis_filter_FIX_c,                  /* slot 1 (unlabelled upstream): same function as slot 0 */
+  silk_warped_LPC_analysis_filter_FIX_c,                  /* slot 2 (unlabelled upstream): same function as slot 0 */
+  MAY_HAVE_SSE4_1( silk_warped_LPC_analysis_filter_FIX ), /* sse4.1 */
+  MAY_HAVE_SSE4_1( silk_warped_LPC_analysis_filter_FIX )  /* avx: same entry as the sse4.1 slot */
+};
+
+void (*const SILK_BURG_MODIFIED_IMPL[ OPUS_ARCHMASK + 1 ] )( /* Function-pointer table for silk_burg_modified; only built when FIXED_POINT is defined */
+    opus_int32                  *res_nrg,           /* O    Residual energy                                             */
+    opus_int                    *res_nrg_Q,         /* O    Residual energy Q value                                     */
+    opus_int32                  A_Q16[],            /* O    Prediction coefficients (length order)                      */
+    const opus_int16            x[],                /* I    Input signal, length: nb_subfr * ( D + subfr_length )       */
+    const opus_int32            minInvGain_Q30,     /* I    Inverse of max prediction gain                              */
+    const opus_int              subfr_length,       /* I    Input signal subframe length (incl. D preceding samples)    */
+    const opus_int              nb_subfr,           /* I    Number of subframes stacked in x                            */
+    const opus_int              D,                  /* I    Order                                                       */
+    int                         arch                /* I    Run-time architecture                                       */
+) = {
+  silk_burg_modified_c,                  /* non-sse */
+  silk_burg_modified_c,                  /* slot 1 (unlabelled upstream): same function as slot 0 */
+  silk_burg_modified_c,                  /* slot 2 (unlabelled upstream): same function as slot 0 */
+  MAY_HAVE_SSE4_1( silk_burg_modified ), /* sse4.1 */
+  MAY_HAVE_SSE4_1( silk_burg_modified )  /* avx: same entry as the sse4.1 slot */
+};
+
+#endif
+#endif
diff --git a/silk_headers.mk b/silk_headers.mk
index 316cd4b..679ff8f 100644
--- a/silk_headers.mk
+++ b/silk_headers.mk
@@ -6,6 +6,7 @@
 silk/typedef.h \
 silk/define.h \
 silk/main.h \
+silk/x86/main_sse.h \
 silk/PLC.h \
 silk/structs.h \
 silk/tables.h \
@@ -19,12 +20,19 @@
 silk/resampler_rom.h \
 silk/resampler_structs.h \
 silk/SigProc_FIX.h \
+silk/x86/SigProc_FIX_sse.h \
 silk/arm/macros_armv4.h \
 silk/arm/macros_armv5e.h \
 silk/arm/SigProc_FIX_armv4.h \
 silk/arm/SigProc_FIX_armv5e.h \
 silk/fixed/main_FIX.h \
 silk/fixed/structs_FIX.h \
+silk/fixed/mips/noise_shape_analysis_FIX_mipsr1.h \
+silk/fixed/mips/prefilter_FIX_mipsr1.h \
+silk/fixed/mips/warped_autocorrelation_FIX_mipsr1.h \
 silk/float/main_FLP.h \
 silk/float/structs_FLP.h \
-silk/float/SigProc_FLP.h
+silk/float/SigProc_FLP.h \
+silk/mips/macros_mipsr1.h \
+silk/mips/NSQ_del_dec_mipsr1.h \
+silk/mips/sigproc_fix_mipsr1.h
diff --git a/silk_sources.mk b/silk_sources.mk
index 0de367b..7cfb7d3 100644
--- a/silk_sources.mk
+++ b/silk_sources.mk
@@ -76,6 +76,11 @@
 silk/stereo_find_predictor.c \
 silk/stereo_quant_pred.c
 
+SILK_SOURCES_SSE4_1 = silk/x86/NSQ_sse.c \
+silk/x86/NSQ_del_dec_sse.c \
+silk/x86/x86_silk_map.c \
+silk/x86/VAD_sse.c \
+silk/x86/VQ_WMat_EC_sse.c
 
 SILK_SOURCES_FIXED = \
 silk/fixed/LTP_analysis_filter_FIX.c \
@@ -104,6 +109,10 @@
 silk/fixed/schur64_FIX.c \
 silk/fixed/schur_FIX.c
 
+SILK_SOURCES_FIXED_SSE4_1 = silk/fixed/x86/vector_ops_FIX_sse.c \
+silk/fixed/x86/burg_modified_FIX_sse.c \
+silk/fixed/x86/prefilter_FIX_sse.c
+
 SILK_SOURCES_FLOAT = \
 silk/float/apply_sine_window_FLP.c \
 silk/float/corrMatrix_FLP.c \
diff --git a/src/analysis.c b/src/analysis.c
index 778a62a..360ebcc 100644
--- a/src/analysis.c
+++ b/src/analysis.c
@@ -39,8 +39,6 @@
 #include "mlp.h"
 #include "stack_alloc.h"
 
-extern const MLP net;
-
 #ifndef M_PI
 #define M_PI 3.141592653
 #endif
@@ -140,6 +138,21 @@
    }
 }
 
+void tonality_analysis_init(TonalityAnalysisState *tonal) /* One-time setup of an analysis state: set arch, then clear the rest. */
+{
+  /* Initialize reusable fields: arch is declared before TONALITY_ANALYSIS_RESET_START, so a later reset leaves it alone. */
+  tonal->arch = opus_select_arch();
+  /* Clear remaining fields by delegating to the reset routine. */
+  tonality_analysis_reset(tonal);
+}
+
+void tonality_analysis_reset(TonalityAnalysisState *tonal) /* Clears the tail of the state; members declared before the reset marker are kept. */
+{
+  /* Clear non-reusable fields: everything from the TONALITY_ANALYSIS_RESET_START member to the end of the struct. */
+  char *start = (char*)&tonal->TONALITY_ANALYSIS_RESET_START; /* the macro names the first member to clear (analysis.h maps it to angle) */
+  OPUS_CLEAR(start, sizeof(TonalityAnalysisState) - (start - (char*)tonal)); /* count = struct size minus the byte offset of start */
+}
+
 void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int len)
 {
    int pos;
@@ -189,7 +202,7 @@
    info_out->music_prob = psum;
 }
 
-void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info_out, const CELTMode *celt_mode, const void *x, int len, int offset, int c1, int c2, int C, int lsb_depth, downmix_func downmix)
+static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt_mode, const void *x, int len, int offset, int c1, int c2, int C, int lsb_depth, downmix_func downmix)
 {
     int i, b;
     const kiss_fft_state *kfft;
@@ -262,7 +275,16 @@
     remaining = len - (ANALYSIS_BUF_SIZE-tonal->mem_fill);
     downmix(x, &tonal->inmem[240], remaining, offset+ANALYSIS_BUF_SIZE-tonal->mem_fill, c1, c2, C);
     tonal->mem_fill = 240 + remaining;
-    opus_fft(kfft, in, out);
+    opus_fft(kfft, in, out, tonal->arch);
+#ifndef FIXED_POINT
+    /* If there's any NaN on the input, the entire output will be NaN, so we only need to check one value. */
+    if (celt_isnan(out[0].r))
+    {
+       info->valid = 0;
+       RESTORE_STACK;
+       return;
+    }
+#endif
 
     for (i=1;i<N2;i++)
     {
@@ -334,6 +356,16 @@
           tE += binE*tonality[i];
           nE += binE*2.f*(.5f-noisiness[i]);
        }
+#ifndef FIXED_POINT
+       /* Check for extreme band energies that could cause NaNs later. */
+       if (!(E<1e9f) || celt_isnan(E))
+       {
+          info->valid = 0;
+          RESTORE_STACK;
+          return;
+       }
+#endif
+
        tonal->E[tonal->E_count][b] = E;
        frame_noisiness += nE/(1e-15f+E);
 
@@ -611,8 +643,6 @@
     /*printf("%d %d\n", info->bandwidth, info->opus_bandwidth);*/
     info->noisiness = frame_noisiness;
     info->valid = 1;
-    if (info_out!=NULL)
-       OPUS_COPY(info_out, info, 1);
     RESTORE_STACK;
 }
 
@@ -631,7 +661,7 @@
       pcm_len = analysis_frame_size - analysis->analysis_offset;
       offset = analysis->analysis_offset;
       do {
-         tonality_analysis(analysis, NULL, celt_mode, analysis_pcm, IMIN(480, pcm_len), offset, c1, c2, C, lsb_depth, downmix);
+         tonality_analysis(analysis, celt_mode, analysis_pcm, IMIN(480, pcm_len), offset, c1, c2, C, lsb_depth, downmix);
          offset += 480;
          pcm_len -= 480;
       } while (pcm_len>0);
diff --git a/src/analysis.h b/src/analysis.h
index be0388f..9eae56a 100644
--- a/src/analysis.h
+++ b/src/analysis.h
@@ -39,6 +39,8 @@
 #define DETECT_SIZE 200
 
 typedef struct {
+   int arch;
+#define TONALITY_ANALYSIS_RESET_START angle
    float angle[240];
    float d_angle[240];
    float d2_angle[240];
@@ -78,8 +80,19 @@
    AnalysisInfo info[DETECT_SIZE];
 } TonalityAnalysisState;
 
-void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info,
-     const CELTMode *celt_mode, const void *x, int len, int offset, int c1, int c2, int C, int lsb_depth, downmix_func downmix);
+/** Initialize a TonalityAnalysisState struct.
+ *
+ * This performs some possibly slow initialization steps which should
+ * not be repeated every analysis step. No allocated memory is retained
+ * by the state struct, so no cleanup call is required.
+ */
+void tonality_analysis_init(TonalityAnalysisState *analysis);
+
+/** Reset a TonalityAnalysisState struct.
+ *
+ * Call this when there's a discontinuity in the data.
+ */
+void tonality_analysis_reset(TonalityAnalysisState *analysis);
 
 void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int len);
 
diff --git a/src/mlp.c b/src/mlp.c
index 4638602..ff9e50d 100644
--- a/src/mlp.c
+++ b/src/mlp.c
@@ -41,77 +41,82 @@
 #if 0
 static OPUS_INLINE opus_val16 tansig_approx(opus_val32 _x) /* Q19 */
 {
-	int i;
-	opus_val16 xx; /* Q11 */
-	/*double x, y;*/
-	opus_val16 dy, yy; /* Q14 */
-	/*x = 1.9073e-06*_x;*/
-	if (_x>=QCONST32(8,19))
-		return QCONST32(1.,14);
-	if (_x<=-QCONST32(8,19))
-		return -QCONST32(1.,14);
-	xx = EXTRACT16(SHR32(_x, 8));
-	/*i = lrint(25*x);*/
-	i = SHR32(ADD32(1024,MULT16_16(25, xx)),11);
-	/*x -= .04*i;*/
-	xx -= EXTRACT16(SHR32(MULT16_16(20972,i),8));
-	/*x = xx*(1./2048);*/
-	/*y = tansig_table[250+i];*/
-	yy = tansig_table[250+i];
-	/*y = yy*(1./16384);*/
-	dy = 16384-MULT16_16_Q14(yy,yy);
-	yy = yy + MULT16_16_Q14(MULT16_16_Q11(xx,dy),(16384 - MULT16_16_Q11(yy,xx)));
-	return yy;
+    int i;
+    opus_val16 xx; /* Q11 */
+    /*double x, y;*/
+    opus_val16 dy, yy; /* Q14 */
+    /*x = 1.9073e-06*_x;*/
+    if (_x>=QCONST32(8,19))
+        return QCONST32(1.,14);
+    if (_x<=-QCONST32(8,19))
+        return -QCONST32(1.,14);
+    xx = EXTRACT16(SHR32(_x, 8));
+    /*i = lrint(25*x);*/
+    i = SHR32(ADD32(1024,MULT16_16(25, xx)),11);
+    /*x -= .04*i;*/
+    xx -= EXTRACT16(SHR32(MULT16_16(20972,i),8));
+    /*x = xx*(1./2048);*/
+    /*y = tansig_table[250+i];*/
+    yy = tansig_table[250+i];
+    /*y = yy*(1./16384);*/
+    dy = 16384-MULT16_16_Q14(yy,yy);
+    yy = yy + MULT16_16_Q14(MULT16_16_Q11(xx,dy),(16384 - MULT16_16_Q11(yy,xx)));
+    return yy;
 }
 #else
 /*extern const float tansig_table[501];*/
 static OPUS_INLINE float tansig_approx(float x)
 {
-	int i;
-	float y, dy;
-	float sign=1;
-	/* Tests are reversed to catch NaNs */
+    int i;
+    float y, dy;
+    float sign=1;
+    /* Tests are reversed to catch NaNs */
     if (!(x<8))
         return 1;
     if (!(x>-8))
         return -1;
-	if (x<0)
-	{
-	   x=-x;
-	   sign=-1;
-	}
-	i = (int)floor(.5f+25*x);
-	x -= .04f*i;
-	y = tansig_table[i];
-	dy = 1-y*y;
-	y = y + x*dy*(1 - y*x);
-	return sign*y;
+#ifndef FIXED_POINT
+    /* Another check in case of -ffast-math */
+    if (celt_isnan(x))
+       return 0;
+#endif
+    if (x<0)
+    {
+       x=-x;
+       sign=-1;
+    }
+    i = (int)floor(.5f+25*x);
+    x -= .04f*i;
+    y = tansig_table[i];
+    dy = 1-y*y;
+    y = y + x*dy*(1 - y*x);
+    return sign*y;
 }
 #endif
 
 #if 0
 void mlp_process(const MLP *m, const opus_val16 *in, opus_val16 *out)
 {
-	int j;
-	opus_val16 hidden[MAX_NEURONS];
-	const opus_val16 *W = m->weights;
-	/* Copy to tmp_in */
-	for (j=0;j<m->topo[1];j++)
-	{
-		int k;
-		opus_val32 sum = SHL32(EXTEND32(*W++),8);
-		for (k=0;k<m->topo[0];k++)
-			sum = MAC16_16(sum, in[k],*W++);
-		hidden[j] = tansig_approx(sum);
-	}
-	for (j=0;j<m->topo[2];j++)
-	{
-		int k;
-		opus_val32 sum = SHL32(EXTEND32(*W++),14);
-		for (k=0;k<m->topo[1];k++)
-			sum = MAC16_16(sum, hidden[k], *W++);
-		out[j] = tansig_approx(EXTRACT16(PSHR32(sum,17)));
-	}
+    int j;
+    opus_val16 hidden[MAX_NEURONS];
+    const opus_val16 *W = m->weights;
+    /* Copy to tmp_in */
+    for (j=0;j<m->topo[1];j++)
+    {
+        int k;
+        opus_val32 sum = SHL32(EXTEND32(*W++),8);
+        for (k=0;k<m->topo[0];k++)
+            sum = MAC16_16(sum, in[k],*W++);
+        hidden[j] = tansig_approx(sum);
+    }
+    for (j=0;j<m->topo[2];j++)
+    {
+        int k;
+        opus_val32 sum = SHL32(EXTEND32(*W++),14);
+        for (k=0;k<m->topo[1];k++)
+            sum = MAC16_16(sum, hidden[k], *W++);
+        out[j] = tansig_approx(EXTRACT16(PSHR32(sum,17)));
+    }
 }
 #else
 void mlp_process(const MLP *m, const float *in, float *out)
diff --git a/src/mlp.h b/src/mlp.h
index 86c8e06..618e246 100644
--- a/src/mlp.h
+++ b/src/mlp.h
@@ -31,11 +31,13 @@
 #include "arch.h"
 
 typedef struct {
-	int layers;
-	const int *topo;
-	const float *weights;
+    int layers;
+    const int *topo;
+    const float *weights;
 } MLP;
 
+extern const MLP net;
+
 void mlp_process(const MLP *m, const float *in, float *out);
 
 #endif /* _MLP_H_ */
diff --git a/src/mlp_data.c b/src/mlp_data.c
index 401c4c0..c2fda4e 100644
--- a/src/mlp_data.c
+++ b/src/mlp_data.c
@@ -1,6 +1,10 @@
 /* The contents of this file was automatically generated by mlp_train.c
    It contains multi-layer perceptron (MLP) weights. */
 
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
 #include "mlp.h"
 
 /* RMS error was 0.138320, seed was 1361535663 */
diff --git a/src/opus.c b/src/opus.c
index 30890b9..e9ce93b 100644
--- a/src/opus.c
+++ b/src/opus.c
@@ -166,6 +166,27 @@
    }
 }
 
+int opus_packet_get_samples_per_frame(const unsigned char *data, /* Returns the samples per frame at rate Fs, decoded from data[0] only. */
+      opus_int32 Fs)
+{
+   int audiosize; /* NOTE(review): data[0] is read without a NULL/length check; callers must supply at least one byte. */
+   if (data[0]&0x80) /* top bit set: bits 3-4 hold a shift k and the frame is (Fs<<k)/400 samples, i.e. 2.5 ms << k */
+   {
+      audiosize = ((data[0]>>3)&0x3);
+      audiosize = (Fs<<audiosize)/400;
+   } else if ((data[0]&0x60) == 0x60) /* top bit clear, bits 5-6 both set: bit 3 selects the duration */
+   {
+      audiosize = (data[0]&0x08) ? Fs/50 : Fs/100; /* 20 ms when bit 3 is set, otherwise 10 ms */
+   } else { /* remaining configurations: bits 3-4 hold a size code */
+      audiosize = ((data[0]>>3)&0x3);
+      if (audiosize == 3) /* code 3 is the 60 ms special case, not a power-of-two multiple of 10 ms */
+         audiosize = Fs*60/1000;
+      else
+         audiosize = (Fs<<audiosize)/100; /* codes 0-2: 10 ms << code */
+   }
+   return audiosize;
+}
+
 int opus_packet_parse_impl(const unsigned char *data, opus_int32 len,
       int self_delimited, unsigned char *out_toc,
       const unsigned char *frames[48], opus_int16 size[48],
diff --git a/src/opus_decoder.c b/src/opus_decoder.c
index 919ba52..080bec5 100644
--- a/src/opus_decoder.c
+++ b/src/opus_decoder.c
@@ -33,7 +33,7 @@
 # error "OPUS_BUILD _MUST_ be defined to build Opus. This probably means you need other defines as well, as in a config.h. See the included build files for details."
 #endif
 
-#if defined(__GNUC__) && (__GNUC__ >= 2) && !defined(__OPTIMIZE__)
+#if defined(__GNUC__) && (__GNUC__ >= 2) && !defined(__OPTIMIZE__) && !defined(OPUS_WILL_BE_SLOW)
 # pragma message "You appear to be compiling without optimization, if so opus will be very slow."
 #endif
 
@@ -59,6 +59,7 @@
    opus_int32   Fs;          /** Sampling rate (at the API level) */
    silk_DecControlStruct DecControl;
    int          decode_gain;
+   int          arch;
 
    /* Everything beyond this point gets cleared on a reset */
 #define OPUS_DECODER_RESET_START stream_channels
@@ -77,12 +78,6 @@
    opus_uint32  rangeFinal;
 };
 
-#ifdef FIXED_POINT
-static OPUS_INLINE opus_int16 SAT16(opus_int32 x) {
-   return x > 32767 ? 32767 : x < -32768 ? -32768 : (opus_int16)x;
-}
-#endif
-
 
 int opus_decoder_get_size(int channels)
 {
@@ -137,6 +132,7 @@
 
    st->prev_mode = 0;
    st->frame_size = Fs/400;
+   st->arch = opus_select_arch();
    return OPUS_OK;
 }
 
@@ -215,7 +211,7 @@
    VARDECL(opus_val16, pcm_transition_silk);
    int pcm_transition_celt_size;
    VARDECL(opus_val16, pcm_transition_celt);
-   opus_val16 *pcm_transition;
+   opus_val16 *pcm_transition=NULL;
    int redundant_audio_size;
    VARDECL(opus_val16, redundant_audio);
 
@@ -230,6 +226,7 @@
    int F2_5, F5, F10, F20;
    const opus_val16 *window;
    opus_uint32 redundant_rng = 0;
+   int celt_accum;
    ALLOC_STACK;
 
    silk_dec = (char*)st+st->silk_dec_offset;
@@ -295,6 +292,14 @@
       }
    }
 
+   /* In fixed-point, we can tell CELT to do the accumulation on top of the
+      SILK PCM buffer. This saves some stack space. */
+#ifdef FIXED_POINT
+   celt_accum = (mode != MODE_CELT_ONLY) && (frame_size >= F10);
+#else
+   celt_accum = 0;
+#endif
+
    pcm_transition_silk_size = ALLOC_NONE;
    pcm_transition_celt_size = ALLOC_NONE;
    if (data!=NULL && st->prev_mode > 0 && (
@@ -325,14 +330,20 @@
    }
 
    /* Don't allocate any memory when in CELT-only mode */
-   pcm_silk_size = (mode != MODE_CELT_ONLY) ? IMAX(F10, frame_size)*st->channels : ALLOC_NONE;
+   pcm_silk_size = (mode != MODE_CELT_ONLY && !celt_accum) ? IMAX(F10, frame_size)*st->channels : ALLOC_NONE;
    ALLOC(pcm_silk, pcm_silk_size, opus_int16);
 
    /* SILK processing */
    if (mode != MODE_CELT_ONLY)
    {
       int lost_flag, decoded_samples;
-      opus_int16 *pcm_ptr = pcm_silk;
+      opus_int16 *pcm_ptr;
+#ifdef FIXED_POINT
+      if (celt_accum)
+         pcm_ptr = pcm;
+      else
+#endif
+         pcm_ptr = pcm_silk;
 
       if (st->prev_mode==MODE_CELT_ONLY)
          silk_InitDecoder( silk_dec );
@@ -366,7 +377,7 @@
         /* Call SILK decoder */
         int first_frame = decoded_samples == 0;
         silk_ret = silk_Decode( silk_dec, &st->DecControl,
-                                lost_flag, first_frame, &dec, pcm_ptr, &silk_frame_size );
+                                lost_flag, first_frame, &dec, pcm_ptr, &silk_frame_size, st->arch );
         if( silk_ret ) {
            if (lost_flag) {
               /* PLC failure should not be fatal */
@@ -462,7 +473,7 @@
    {
       celt_decoder_ctl(celt_dec, CELT_SET_START_BAND(0));
       celt_decode_with_ec(celt_dec, data+len, redundancy_bytes,
-                          redundant_audio, F5, NULL);
+                          redundant_audio, F5, NULL, 0);
       celt_decoder_ctl(celt_dec, OPUS_GET_FINAL_RANGE(&redundant_rng));
    }
 
@@ -477,25 +488,28 @@
          celt_decoder_ctl(celt_dec, OPUS_RESET_STATE);
       /* Decode CELT */
       celt_ret = celt_decode_with_ec(celt_dec, decode_fec ? NULL : data,
-                                     len, pcm, celt_frame_size, &dec);
+                                     len, pcm, celt_frame_size, &dec, celt_accum);
    } else {
       unsigned char silence[2] = {0xFF, 0xFF};
-      for (i=0;i<frame_size*st->channels;i++)
-         pcm[i] = 0;
+      if (!celt_accum)
+      {
+         for (i=0;i<frame_size*st->channels;i++)
+            pcm[i] = 0;
+      }
       /* For hybrid -> SILK transitions, we let the CELT MDCT
          do a fade-out by decoding a silence frame */
       if (st->prev_mode == MODE_HYBRID && !(redundancy && celt_to_silk && st->prev_redundancy) )
       {
          celt_decoder_ctl(celt_dec, CELT_SET_START_BAND(0));
-         celt_decode_with_ec(celt_dec, silence, 2, pcm, F2_5, NULL);
+         celt_decode_with_ec(celt_dec, silence, 2, pcm, F2_5, NULL, celt_accum);
       }
    }
 
-   if (mode != MODE_CELT_ONLY)
+   if (mode != MODE_CELT_ONLY && !celt_accum)
    {
 #ifdef FIXED_POINT
       for (i=0;i<frame_size*st->channels;i++)
-         pcm[i] = SAT16(pcm[i] + pcm_silk[i]);
+         pcm[i] = SAT16(ADD32(pcm[i], pcm_silk[i]));
 #else
       for (i=0;i<frame_size*st->channels;i++)
          pcm[i] = pcm[i] + (opus_val16)((1.f/32768.f)*pcm_silk[i]);
@@ -514,7 +528,7 @@
       celt_decoder_ctl(celt_dec, OPUS_RESET_STATE);
       celt_decoder_ctl(celt_dec, CELT_SET_START_BAND(0));
 
-      celt_decode_with_ec(celt_dec, data+len, redundancy_bytes, redundant_audio, F5, NULL);
+      celt_decode_with_ec(celt_dec, data+len, redundancy_bytes, redundant_audio, F5, NULL, 0);
       celt_decoder_ctl(celt_dec, OPUS_GET_FINAL_RANGE(&redundant_rng));
       smooth_fade(pcm+st->channels*(frame_size-F2_5), redundant_audio+st->channels*F2_5,
                   pcm+st->channels*(frame_size-F2_5), F2_5, st->channels, window, st->Fs);
@@ -710,6 +724,7 @@
 {
    VARDECL(opus_int16, out);
    int ret, i;
+   int nb_samples;
    ALLOC_STACK;
 
    if(frame_size<=0)
@@ -717,6 +732,14 @@
       RESTORE_STACK;
       return OPUS_BAD_ARG;
    }
+   if (data != NULL && len > 0 && !decode_fec)
+   {
+      nb_samples = opus_decoder_get_nb_samples(st, data, len);
+      if (nb_samples>0)
+         frame_size = IMIN(frame_size, nb_samples);
+      else
+         return OPUS_INVALID_PACKET;
+   }
    ALLOC(out, frame_size*st->channels, opus_int16);
 
    ret = opus_decode_native(st, data, len, out, frame_size, decode_fec, 0, NULL, 0);
@@ -737,6 +760,7 @@
 {
    VARDECL(float, out);
    int ret, i;
+   int nb_samples;
    ALLOC_STACK;
 
    if(frame_size<=0)
@@ -745,6 +769,14 @@
       return OPUS_BAD_ARG;
    }
 
+   if (data != NULL && len > 0 && !decode_fec)
+   {
+      nb_samples = opus_decoder_get_nb_samples(st, data, len);
+      if (nb_samples>0)
+         frame_size = IMIN(frame_size, nb_samples);
+      else
+         return OPUS_INVALID_PACKET;
+   }
    ALLOC(out, frame_size*st->channels, float);
 
    ret = opus_decode_native(st, data, len, out, frame_size, decode_fec, 0, NULL, 1);
@@ -904,27 +936,6 @@
    return bandwidth;
 }
 
-int opus_packet_get_samples_per_frame(const unsigned char *data,
-      opus_int32 Fs)
-{
-   int audiosize;
-   if (data[0]&0x80)
-   {
-      audiosize = ((data[0]>>3)&0x3);
-      audiosize = (Fs<<audiosize)/400;
-   } else if ((data[0]&0x60) == 0x60)
-   {
-      audiosize = (data[0]&0x08) ? Fs/50 : Fs/100;
-   } else {
-      audiosize = ((data[0]>>3)&0x3);
-      if (audiosize == 3)
-         audiosize = Fs*60/1000;
-      else
-         audiosize = (Fs<<audiosize)/100;
-   }
-   return audiosize;
-}
-
 int opus_packet_get_nb_channels(const unsigned char *data)
 {
    return (data[0]&0x4) ? 2 : 1;
diff --git a/src/opus_demo.c b/src/opus_demo.c
index f8cdf03..9e99a3b 100644
--- a/src/opus_demo.c
+++ b/src/opus_demo.c
@@ -48,7 +48,7 @@
         "<bits per second>  [options] <input> <output>\n", argv[0]);
     fprintf(stderr, "       %s -d <sampling rate (Hz)> <channels (1/2)> "
         "[options] <input> <output>\n\n", argv[0]);
-    fprintf(stderr, "mode: voip | audio | restricted-lowdelay\n" );
+    fprintf(stderr, "application: voip | audio | restricted-lowdelay\n" );
     fprintf(stderr, "options:\n" );
     fprintf(stderr, "-e                   : only runs the encoder (output the bit-stream)\n" );
     fprintf(stderr, "-d                   : only runs the decoder (reads the bit-stream as input)\n" );
@@ -245,14 +245,14 @@
     double bits=0.0, bits_max=0.0, bits_act=0.0, bits2=0.0, nrg;
     double tot_samples=0;
     opus_uint64 tot_in, tot_out;
-    int bandwidth=-1;
+    int bandwidth=OPUS_AUTO;
     const char *bandwidth_string;
     int lost = 0, lost_prev = 1;
     int toggle = 0;
     opus_uint32 enc_final_range[2];
     opus_uint32 dec_final_range;
     int encode_only=0, decode_only=0;
-    int max_frame_size = 960*6;
+    int max_frame_size = 48000*2;
     int curr_read=0;
     int sweep_bps = 0;
     int random_framesize=0, newsize=0, delayed_celt=0;
@@ -336,15 +336,12 @@
 
     /* defaults: */
     use_vbr = 1;
-    bandwidth = OPUS_AUTO;
     max_payload_bytes = MAX_PACKET;
     complexity = 10;
     use_inbandfec = 0;
     forcechannels = OPUS_AUTO;
     use_dtx = 0;
     packet_loss_perc = 0;
-    max_frame_size = 2*48000;
-    curr_read=0;
 
     while( args < argc - 2 ) {
         /* process command line options */
@@ -576,7 +573,7 @@
          bandwidth_string = "fullband";
          break;
     case OPUS_AUTO:
-         bandwidth_string = "auto";
+         bandwidth_string = "auto bandwidth";
          break;
     default:
          bandwidth_string = "unknown";
@@ -588,7 +585,7 @@
                        (long)sampling_rate, channels);
     else
        fprintf(stderr, "Encoding %ld Hz input at %.3f kb/s "
-                       "in %s mode with %d-sample frames.\n",
+                       "in %s with %d-sample frames.\n",
                        (long)sampling_rate, bitrate_bps*0.001,
                        bandwidth_string, frame_size);
 
@@ -637,7 +634,7 @@
             case 4: newsize=sampling_rate/25; break;
             case 5: newsize=3*sampling_rate/50; break;
             }
-            while (newsize < sampling_rate/25 && bitrate_bps-fabs(sweep_bps) <= 3*12*sampling_rate/newsize)
+            while (newsize < sampling_rate/25 && bitrate_bps-abs(sweep_bps) <= 3*12*sampling_rate/newsize)
                newsize*=2;
             if (newsize < sampling_rate/100 && frame_size >= sampling_rate/100)
             {
@@ -837,35 +834,39 @@
         }
 
         lost_prev = lost;
-
-        /* count bits */
-        bits += len[toggle]*8;
-        bits_max = ( len[toggle]*8 > bits_max ) ? len[toggle]*8 : bits_max;
         if( count >= use_inbandfec ) {
-            nrg = 0.0;
+            /* count bits */
+            bits += len[toggle]*8;
+            bits_max = ( len[toggle]*8 > bits_max ) ? len[toggle]*8 : bits_max;
+            bits2 += len[toggle]*len[toggle]*64;
             if (!decode_only)
             {
+                nrg = 0.0;
                 for ( k = 0; k < frame_size * channels; k++ ) {
                     nrg += in[ k ] * (double)in[ k ];
                 }
+                nrg /= frame_size * channels;
+                if( nrg > 1e5 ) {
+                    bits_act += len[toggle]*8;
+                    count_act++;
+                }
             }
-            if ( ( nrg / ( frame_size * channels ) ) > 1e5 ) {
-                bits_act += len[toggle]*8;
-                count_act++;
-            }
-            /* Variance */
-            bits2 += len[toggle]*len[toggle]*64;
         }
         count++;
         toggle = (toggle + use_inbandfec) & 1;
     }
+
+    /* Print out bitrate statistics */
+    if(decode_only)
+        frame_size = (int)(tot_samples / count);
+    count -= use_inbandfec;
     fprintf (stderr, "average bitrate:             %7.3f kb/s\n",
                      1e-3*bits*sampling_rate/tot_samples);
     fprintf (stderr, "maximum bitrate:             %7.3f kb/s\n",
                      1e-3*bits_max*sampling_rate/frame_size);
     if (!decode_only)
        fprintf (stderr, "active bitrate:              %7.3f kb/s\n",
-               1e-3*bits_act*sampling_rate/(frame_size*(double)count_act));
+               1e-3*bits_act*sampling_rate/(1e-15+frame_size*(double)count_act));
     fprintf (stderr, "bitrate standard deviation:  %7.3f kb/s\n",
             1e-3*sqrt(bits2/count - bits*bits/(count*(double)count))*sampling_rate/frame_size);
     /* Close any files to which intermediate results were stored */
diff --git a/src/opus_encoder.c b/src/opus_encoder.c
index fbd3de6..a7e1912 100644
--- a/src/opus_encoder.c
+++ b/src/opus_encoder.c
@@ -38,6 +38,7 @@
 #include "float_cast.h"
 #include "opus.h"
 #include "arch.h"
+#include "pitch.h"
 #include "opus_private.h"
 #include "os_support.h"
 #include "cpu_support.h"
@@ -80,6 +81,10 @@
     int          lsb_depth;
     int          encoder_buffer;
     int          lfe;
+    int          arch;
+#ifndef DISABLE_FLOAT_API
+    TonalityAnalysisState analysis;
+#endif
 
 #define OPUS_ENCODER_RESET_START stream_channels
     int          stream_channels;
@@ -99,12 +104,9 @@
     StereoWidthState width_mem;
     opus_val16   delay_buffer[MAX_ENCODER_BUFFER*2];
 #ifndef DISABLE_FLOAT_API
-    TonalityAnalysisState analysis;
     int          detected_bandwidth;
-    int          analysis_offset;
 #endif
     opus_uint32  rangeFinal;
-    int          arch;
 };
 
 /* Transition tables for the voice and music. First column is the
@@ -231,7 +233,7 @@
     st->lsb_depth = 24;
     st->variable_duration = OPUS_FRAMESIZE_ARG;
 
-    /* Delay compensation of 4 ms (2.5 ms for SILK's extra look-ahead 
+    /* Delay compensation of 4 ms (2.5 ms for SILK's extra look-ahead
        + 1.5 ms for SILK resamplers and stereo prediction) */
     st->delay_compensation = st->Fs/250;
 
@@ -242,6 +244,10 @@
     st->mode = MODE_HYBRID;
     st->bandwidth = OPUS_BANDWIDTH_FULLBAND;
 
+#ifndef DISABLE_FLOAT_API
+    tonality_analysis_init(&st->analysis);
+#endif
+
     return OPUS_OK;
 }
 
@@ -648,7 +654,7 @@
    return best_state;
 }
 
-int optimize_framesize(const opus_val16 *x, int len, int C, opus_int32 Fs,
+static int optimize_framesize(const void *x, int len, int C, opus_int32 Fs,
                 int bitrate, opus_val16 tonality, float *mem, int buffering,
                 downmix_func downmix)
 {
@@ -660,6 +666,7 @@
    int bestLM=0;
    int subframe;
    int pos;
+   int offset;
    VARDECL(opus_val32, sub);
 
    subframe = Fs/400;
@@ -670,9 +677,8 @@
    {
       /* Consider the CELT delay when not in restricted-lowdelay */
       /* We assume the buffering is between 2.5 and 5 ms */
-      int offset = 2*subframe - buffering;
+      offset = 2*subframe - buffering;
       celt_assert(offset>=0 && offset <= subframe);
-      x += C*offset;
       len -= offset;
       e[1]=mem[1];
       e_1[1]=1.f/(EPSILON+mem[1]);
@@ -681,6 +687,7 @@
       pos = 3;
    } else {
       pos=1;
+      offset=0;
    }
    N=IMIN(len/subframe, MAX_DYNAMIC_FRAMESIZE);
    /* Just silencing a warning, it's really initialized later */
@@ -692,7 +699,7 @@
       int j;
       tmp=EPSILON;
 
-      downmix(x, sub, subframe, i*subframe, 0, -2, C);
+      downmix(x, sub, subframe, i*subframe+offset, 0, -2, C);
       if (i==0)
          memx = sub[0];
       for (j=0;j<subframe;j++)
@@ -836,6 +843,12 @@
          LM--;
       frame_size = (Fs/400<<LM);
    } else
+#else
+   (void)analysis_pcm;
+   (void)C;
+   (void)bitrate_bps;
+   (void)delay_compensation;
+   (void)downmix;
 #endif
    {
       frame_size = frame_size_select(frame_size, variable_duration, Fs);
@@ -924,7 +937,8 @@
 
 opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_size,
                 unsigned char *data, opus_int32 out_data_bytes, int lsb_depth,
-                const void *analysis_pcm, opus_int32 analysis_size, int c1, int c2, int analysis_channels, downmix_func downmix)
+                const void *analysis_pcm, opus_int32 analysis_size, int c1, int c2,
+                int analysis_channels, downmix_func downmix, int float_api)
 {
     void *silk_enc;
     CELTEncoder *celt_enc;
@@ -954,9 +968,11 @@
     int total_buffer;
     opus_val16 stereo_width;
     const CELTMode *celt_mode;
+#ifndef DISABLE_FLOAT_API
     AnalysisInfo analysis_info;
     int analysis_read_pos_bak=-1;
     int analysis_read_subframe_bak=-1;
+#endif
     VARDECL(opus_val16, tmp_prefill);
 
     ALLOC_STACK;
@@ -982,9 +998,9 @@
 
     lsb_depth = IMIN(lsb_depth, st->lsb_depth);
 
-    analysis_info.valid = 0;
     celt_encoder_ctl(celt_enc, CELT_GET_MODE(&celt_mode));
 #ifndef DISABLE_FLOAT_API
+    analysis_info.valid = 0;
 #ifdef FIXED_POINT
     if (st->silk_mode.complexity >= 10 && st->Fs==48000)
 #else
@@ -997,6 +1013,9 @@
              c1, c2, analysis_channels, st->Fs,
              lsb_depth, downmix, &analysis_info);
     }
+#else
+    (void)analysis_pcm;
+    (void)analysis_size;
 #endif
 
     st->voice_ratio = -1;
@@ -1377,7 +1396,7 @@
              st->user_forced_mode = MODE_CELT_ONLY;
           tmp_len = opus_encode_native(st, pcm+i*(st->channels*st->Fs/50), st->Fs/50,
                 tmp_data+i*bytes_per_frame, bytes_per_frame, lsb_depth,
-                NULL, 0, c1, c2, analysis_channels, downmix);
+                NULL, 0, c1, c2, analysis_channels, downmix, float_api);
           if (tmp_len<0)
           {
              RESTORE_STACK;
@@ -1424,8 +1443,7 @@
     ec_enc_init(&enc, data, max_data_bytes-1);
 
     ALLOC(pcm_buf, (total_buffer+frame_size)*st->channels, opus_val16);
-    for (i=0;i<total_buffer*st->channels;i++)
-       pcm_buf[i] = st->delay_buffer[(st->encoder_buffer-total_buffer)*st->channels+i];
+    OPUS_COPY(pcm_buf, &st->delay_buffer[(st->encoder_buffer-total_buffer)*st->channels], total_buffer*st->channels);
 
     if (st->mode == MODE_CELT_ONLY)
        hp_freq_smth1 = silk_LSHIFT( silk_lin2log( VARIABLE_HP_MIN_CUTOFF_HZ ), 8 );
@@ -1444,7 +1462,20 @@
     } else {
        dc_reject(pcm, 3, &pcm_buf[total_buffer*st->channels], st->hp_mem, frame_size, st->channels, st->Fs);
     }
-
+#ifndef FIXED_POINT
+    if (float_api)
+    {
+       opus_val32 sum;
+       sum = celt_inner_prod(&pcm_buf[total_buffer*st->channels], &pcm_buf[total_buffer*st->channels], frame_size*st->channels, st->arch);
+       /* This should filter out both NaNs and ridiculous signals that could
+          cause NaNs further down. */
+       if (!(sum < 1e9f) || celt_isnan(sum))
+       {
+          OPUS_CLEAR(&pcm_buf[total_buffer*st->channels], frame_size*st->channels);
+          st->hp_mem[0] = st->hp_mem[1] = st->hp_mem[2] = st->hp_mem[3] = 0;
+       }
+    }
+#endif
 
 
     /* SILK processing */
@@ -1599,8 +1630,7 @@
             prefill_offset = st->channels*(st->encoder_buffer-st->delay_compensation-st->Fs/400);
             gain_fade(st->delay_buffer+prefill_offset, st->delay_buffer+prefill_offset,
                   0, Q15ONE, celt_mode->overlap, st->Fs/400, st->channels, celt_mode->window, st->Fs);
-            for(i=0;i<prefill_offset;i++)
-               st->delay_buffer[i]=0;
+            OPUS_CLEAR(st->delay_buffer, prefill_offset);
 #ifdef FIXED_POINT
             pcm_silk = st->delay_buffer;
 #else
@@ -1727,15 +1757,18 @@
     ALLOC(tmp_prefill, st->channels*st->Fs/400, opus_val16);
     if (st->mode != MODE_SILK_ONLY && st->mode != st->prev_mode && st->prev_mode > 0)
     {
-       for (i=0;i<st->channels*st->Fs/400;i++)
-          tmp_prefill[i] = st->delay_buffer[(st->encoder_buffer-total_buffer-st->Fs/400)*st->channels + i];
+       OPUS_COPY(tmp_prefill, &st->delay_buffer[(st->encoder_buffer-total_buffer-st->Fs/400)*st->channels], st->channels*st->Fs/400);
     }
 
-    for (i=0;i<st->channels*(st->encoder_buffer-(frame_size+total_buffer));i++)
-        st->delay_buffer[i] = st->delay_buffer[i+st->channels*frame_size];
-    for (;i<st->encoder_buffer*st->channels;i++)
-        st->delay_buffer[i] = pcm_buf[(frame_size+total_buffer-st->encoder_buffer)*st->channels+i];
-
+    if (st->channels*(st->encoder_buffer-(frame_size+total_buffer)) > 0)
+    {
+       OPUS_MOVE(st->delay_buffer, &st->delay_buffer[st->channels*frame_size], st->channels*(st->encoder_buffer-frame_size-total_buffer));
+       OPUS_COPY(&st->delay_buffer[st->channels*(st->encoder_buffer-frame_size-total_buffer)],
+             &pcm_buf[0],
+             (frame_size+total_buffer)*st->channels);
+    } else {
+       OPUS_COPY(st->delay_buffer, &pcm_buf[(frame_size+total_buffer-st->encoder_buffer)*st->channels], st->encoder_buffer*st->channels);
+    }
     /* gain_fade() and stereo_fade() need to be after the buffer copying
        because we don't want any of this to affect the SILK part */
     if( st->prev_HB_gain < Q15ONE || HB_gain < Q15ONE ) {
@@ -1955,7 +1988,8 @@
 
    for (i=0;i<frame_size*st->channels;i++)
       in[i] = FLOAT2INT16(pcm[i]);
-   ret = opus_encode_native(st, in, frame_size, data, max_data_bytes, 16, pcm, analysis_frame_size, 0, -2, st->channels, downmix_float);
+   ret = opus_encode_native(st, in, frame_size, data, max_data_bytes, 16,
+                            pcm, analysis_frame_size, 0, -2, st->channels, downmix_float, 1);
    RESTORE_STACK;
    return ret;
 }
@@ -1977,7 +2011,8 @@
          , st->analysis.subframe_mem
 #endif
          );
-   return opus_encode_native(st, pcm, frame_size, data, out_data_bytes, 16, pcm, analysis_frame_size, 0, -2, st->channels, downmix_int);
+   return opus_encode_native(st, pcm, frame_size, data, out_data_bytes, 16,
+                             pcm, analysis_frame_size, 0, -2, st->channels, downmix_int, 0);
 }
 
 #else
@@ -2002,7 +2037,8 @@
 
    for (i=0;i<frame_size*st->channels;i++)
       in[i] = (1.0f/32768)*pcm[i];
-   ret = opus_encode_native(st, in, frame_size, data, max_data_bytes, 16, pcm, analysis_frame_size, 0, -2, st->channels, downmix_int);
+   ret = opus_encode_native(st, in, frame_size, data, max_data_bytes, 16,
+                            pcm, analysis_frame_size, 0, -2, st->channels, downmix_int, 0);
    RESTORE_STACK;
    return ret;
 }
@@ -2019,7 +2055,7 @@
          st->variable_duration, st->channels, st->Fs, st->bitrate_bps,
          delay_compensation, downmix_float, st->analysis.subframe_mem);
    return opus_encode_native(st, pcm, frame_size, data, out_data_bytes, 24,
-                             pcm, analysis_frame_size, 0, -2, st->channels, downmix_float);
+                             pcm, analysis_frame_size, 0, -2, st->channels, downmix_float, 1);
 }
 #endif
 
@@ -2108,7 +2144,7 @@
         case OPUS_SET_MAX_BANDWIDTH_REQUEST:
         {
             opus_int32 value = va_arg(ap, opus_int32);
-            if (value < OPUS_BANDWIDTH_NARROWBAND || value > OPUS_BANDWIDTH_FULLBAND) 
+            if (value < OPUS_BANDWIDTH_NARROWBAND || value > OPUS_BANDWIDTH_FULLBAND)
             {
                goto bad_arg;
             }
@@ -2418,11 +2454,14 @@
         {
            void *silk_enc;
            silk_EncControlStruct dummy;
+           char *start;
            silk_enc = (char*)st+st->silk_enc_offset;
+#ifndef DISABLE_FLOAT_API
+           tonality_analysis_reset(&st->analysis);
+#endif
 
-           OPUS_CLEAR((char*)&st->OPUS_ENCODER_RESET_START,
-                 sizeof(OpusEncoder)-
-                 ((char*)&st->OPUS_ENCODER_RESET_START - (char*)st));
+           start = (char*)&st->OPUS_ENCODER_RESET_START;
+           OPUS_CLEAR(start, sizeof(OpusEncoder) - (start - (char*)st));
 
            celt_encoder_ctl(celt_enc, OPUS_RESET_STATE);
            silk_InitEncoder( silk_enc, st->arch, &dummy );
diff --git a/src/opus_multistream_decoder.c b/src/opus_multistream_decoder.c
index a05fa1e..b95eaa6 100644
--- a/src/opus_multistream_decoder.c
+++ b/src/opus_multistream_decoder.c
@@ -75,7 +75,7 @@
    char *ptr;
 
    if ((channels>255) || (channels<1) || (coupled_streams>streams) ||
-       (coupled_streams+streams>255) || (streams<1) || (coupled_streams<0))
+       (streams<1) || (coupled_streams<0) || (streams>255-coupled_streams))
       return OPUS_BAD_ARG;
 
    st->layout.nb_channels = channels;
@@ -119,7 +119,7 @@
    int ret;
    OpusMSDecoder *st;
    if ((channels>255) || (channels<1) || (coupled_streams>streams) ||
-       (coupled_streams+streams>255) || (streams<1) || (coupled_streams<0))
+       (streams<1) || (coupled_streams<0) || (streams>255-coupled_streams))
    {
       if (error)
          *error = OPUS_BAD_ARG;
diff --git a/src/opus_multistream_encoder.c b/src/opus_multistream_encoder.c
index 49e2791..9e85773 100644
--- a/src/opus_multistream_encoder.c
+++ b/src/opus_multistream_encoder.c
@@ -41,6 +41,7 @@
 #include "modes.h"
 #include "bands.h"
 #include "quant_bands.h"
+#include "pitch.h"
 
 typedef struct {
    int nb_streams;
@@ -71,6 +72,7 @@
 
 struct OpusMSEncoder {
    ChannelLayout layout;
+   int arch;
    int lfe_stream;
    int application;
    int variable_duration;
@@ -98,7 +100,8 @@
       else
          ptr += align(mono_size);
    }
-   return (opus_val32*)(ptr+st->layout.nb_channels*120*sizeof(opus_val32));
+   /* void* cast avoids clang -Wcast-align warning */
+   return (opus_val32*)(void*)(ptr+st->layout.nb_channels*120*sizeof(opus_val32));
 }
 
 static opus_val32 *ms_get_window_mem(OpusMSEncoder *st)
@@ -117,7 +120,8 @@
       else
          ptr += align(mono_size);
    }
-   return (opus_val32*)ptr;
+   /* void* cast avoids clang -Wcast-align warning */
+   return (opus_val32*)(void*)ptr;
 }
 
 static int validate_encoder_layout(const ChannelLayout *layout)
@@ -199,7 +203,7 @@
       max = b;
       diff = SUB32(EXTEND32(b),EXTEND32(a));
    }
-   if (diff >= QCONST16(8.f, DB_SHIFT))
+   if (!(diff < QCONST16(8.f, DB_SHIFT)))  /* inverted to catch NaNs */
       return max;
 #ifdef FIXED_POINT
    low = SHR32(diff, DB_SHIFT-1);
@@ -218,7 +222,7 @@
 #endif
 
 void surround_analysis(const CELTMode *celt_mode, const void *pcm, opus_val16 *bandLogE, opus_val32 *mem, opus_val32 *preemph_mem,
-      int len, int overlap, int channels, int rate, opus_copy_channel_in_func copy_channel_in
+      int len, int overlap, int channels, int rate, opus_copy_channel_in_func copy_channel_in, int arch
 )
 {
    int c;
@@ -257,7 +261,21 @@
       OPUS_COPY(in, mem+c*overlap, overlap);
       (*copy_channel_in)(x, 1, pcm, channels, c, len);
       celt_preemphasis(x, in+overlap, frame_size, 1, upsample, celt_mode->preemph, preemph_mem+c, 0);
-      clt_mdct_forward(&celt_mode->mdct, in, freq, celt_mode->window, overlap, celt_mode->maxLM-LM, 1);
+#ifndef FIXED_POINT
+      {
+         opus_val32 sum;
+         sum = celt_inner_prod(in, in, frame_size+overlap, 0);
+         /* This should filter out both NaNs and ridiculous signals that could
+            cause NaNs further down. */
+         if (!(sum < 1e9f) || celt_isnan(sum))
+         {
+            OPUS_CLEAR(in, frame_size+overlap);
+            preemph_mem[c] = 0;
+         }
+      }
+#endif
+      clt_mdct_forward(&celt_mode->mdct, in, freq, celt_mode->window,
+            overlap, celt_mode->maxLM-LM, 1, arch);
       if (upsample != 1)
       {
          int bound = len;
@@ -267,7 +285,7 @@
             freq[i] = 0;
       }
 
-      compute_band_energies(celt_mode, freq, bandE, 21, 1, 1<<LM);
+      compute_band_energies(celt_mode, freq, bandE, 21, 1, LM);
       amp2Log2(celt_mode, 21, 21, bandE, bandLogE+21*c, 1);
       /* Apply spreading function with -6 dB/band going up and -12 dB/band going down. */
       for (i=1;i<21;i++)
@@ -408,9 +426,10 @@
    char *ptr;
 
    if ((channels>255) || (channels<1) || (coupled_streams>streams) ||
-       (coupled_streams+streams>255) || (streams<1) || (coupled_streams<0))
+       (streams<1) || (coupled_streams<0) || (streams>255-coupled_streams))
       return OPUS_BAD_ARG;
 
+   st->arch = opus_select_arch();
    st->layout.nb_channels = channels;
    st->layout.nb_streams = streams;
    st->layout.nb_coupled_streams = coupled_streams;
@@ -530,7 +549,7 @@
    int ret;
    OpusMSEncoder *st;
    if ((channels>255) || (channels<1) || (coupled_streams>streams) ||
-       (coupled_streams+streams>255) || (streams<1) || (coupled_streams<0))
+       (streams<1) || (coupled_streams<0) || (streams>255-coupled_streams))
    {
       if (error)
          *error = OPUS_BAD_ARG;
@@ -566,6 +585,7 @@
 )
 {
    int ret;
+   opus_int32 size;
    OpusMSEncoder *st;
    if ((channels>255) || (channels<1))
    {
@@ -573,7 +593,14 @@
          *error = OPUS_BAD_ARG;
       return NULL;
    }
-   st = (OpusMSEncoder *)opus_alloc(opus_multistream_surround_encoder_get_size(channels, mapping_family));
+   size = opus_multistream_surround_encoder_get_size(channels, mapping_family);
+   if (!size)
+   {
+      if (error)
+         *error = OPUS_UNIMPLEMENTED;
+      return NULL;
+   }
+   st = (OpusMSEncoder *)opus_alloc(size);
    if (st==NULL)
    {
       if (error)
@@ -591,7 +618,7 @@
    return st;
 }
 
-static void surround_rate_allocation(
+static opus_int32 surround_rate_allocation(
       OpusMSEncoder *st,
       opus_int32 *rate,
       int frame_size
@@ -605,6 +632,7 @@
    int lfe_offset;
    int coupled_ratio; /* Q8 */
    int lfe_ratio;     /* Q8 */
+   opus_int32 rate_sum=0;
 
    ptr = (char*)st + align(sizeof(OpusMSEncoder));
    opus_encoder_ctl((OpusEncoder*)ptr, OPUS_GET_SAMPLE_RATE(&Fs));
@@ -660,7 +688,10 @@
          rate[i] = stream_offset+channel_rate;
       else
          rate[i] = lfe_offset+(channel_rate*lfe_ratio>>8);
+      rate[i] = IMAX(rate[i], 500);
+      rate_sum += rate[i];
    }
+   return rate_sum;
 }
 
 /* Max size in case the encoder decides to return three frames */
@@ -674,7 +705,8 @@
     unsigned char *data,
     opus_int32 max_data_bytes,
     int lsb_depth,
-    downmix_func downmix
+    downmix_func downmix,
+    int float_api
 )
 {
    opus_int32 Fs;
@@ -694,6 +726,8 @@
    opus_val32 *mem = NULL;
    opus_val32 *preemph_mem=NULL;
    int frame_size;
+   opus_int32 rate_sum;
+   opus_int32 smallest_packet;
    ALLOC_STACK;
 
    if (st->surround)
@@ -737,6 +771,14 @@
       RESTORE_STACK;
       return OPUS_BAD_ARG;
    }
+
+   /* Smallest packet the encoder can produce. */
+   smallest_packet = st->layout.nb_streams*2-1;
+   if (max_data_bytes < smallest_packet)
+   {
+      RESTORE_STACK;
+      return OPUS_BUFFER_TOO_SMALL;
+   }
    ALLOC(buf, 2*frame_size, opus_val16);
    coupled_size = opus_encoder_get_size(2);
    mono_size = opus_encoder_get_size(1);
@@ -744,21 +786,23 @@
    ALLOC(bandSMR, 21*st->layout.nb_channels, opus_val16);
    if (st->surround)
    {
-      surround_analysis(celt_mode, pcm, bandSMR, mem, preemph_mem, frame_size, 120, st->layout.nb_channels, Fs, copy_channel_in);
-   }
-
-   if (max_data_bytes < 4*st->layout.nb_streams-1)
-   {
-      RESTORE_STACK;
-      return OPUS_BUFFER_TOO_SMALL;
+      surround_analysis(celt_mode, pcm, bandSMR, mem, preemph_mem, frame_size, 120, st->layout.nb_channels, Fs, copy_channel_in, st->arch);
    }
 
    /* Compute bitrate allocation between streams (this could be a lot better) */
-   surround_rate_allocation(st, bitrates, frame_size);
+   rate_sum = surround_rate_allocation(st, bitrates, frame_size);
 
    if (!vbr)
-      max_data_bytes = IMIN(max_data_bytes, 3*st->bitrate_bps/(3*8*Fs/frame_size));
-
+   {
+      if (st->bitrate_bps == OPUS_AUTO)
+      {
+         max_data_bytes = IMIN(max_data_bytes, 3*rate_sum/(3*8*Fs/frame_size));
+      } else if (st->bitrate_bps != OPUS_BITRATE_MAX)
+      {
+         max_data_bytes = IMIN(max_data_bytes, IMAX(smallest_packet,
+                          3*st->bitrate_bps/(3*8*Fs/frame_size)));
+      }
+   }
    ptr = (char*)st + align(sizeof(OpusMSEncoder));
    for (s=0;s<st->layout.nb_streams;s++)
    {
@@ -843,13 +887,15 @@
          opus_encoder_ctl(enc, OPUS_SET_ENERGY_MASK(bandLogE));
       /* number of bytes left (+Toc) */
       curr_max = max_data_bytes - tot_size;
-      /* Reserve three bytes for the last stream and four for the others */
-      curr_max -= IMAX(0,4*(st->layout.nb_streams-s-1)-1);
+      /* Reserve one byte for the last stream and two for the others */
+      curr_max -= IMAX(0,2*(st->layout.nb_streams-s-1)-1);
       curr_max = IMIN(curr_max,MS_FRAME_TMP);
+      /* Repacketizer will add one or two bytes for self-delimited frames */
+      if (s != st->layout.nb_streams-1) curr_max -=  curr_max>253 ? 2 : 1;
       if (!vbr && s == st->layout.nb_streams-1)
          opus_encoder_ctl(enc, OPUS_SET_BITRATE(curr_max*(8*Fs/frame_size)));
       len = opus_encode_native(enc, buf, frame_size, tmp_data, curr_max, lsb_depth,
-            pcm, analysis_frame_size, c1, c2, st->layout.nb_channels, downmix);
+            pcm, analysis_frame_size, c1, c2, st->layout.nb_channels, downmix, float_api);
       if (len<0)
       {
          RESTORE_STACK;
@@ -922,7 +968,7 @@
 )
 {
    return opus_multistream_encode_native(st, opus_copy_channel_in_short,
-      pcm, frame_size, data, max_data_bytes, 16, downmix_int);
+      pcm, frame_size, data, max_data_bytes, 16, downmix_int, 0);
 }
 
 #ifndef DISABLE_FLOAT_API
@@ -935,7 +981,7 @@
 )
 {
    return opus_multistream_encode_native(st, opus_copy_channel_in_float,
-      pcm, frame_size, data, max_data_bytes, 16, downmix_float);
+      pcm, frame_size, data, max_data_bytes, 16, downmix_float, 1);
 }
 #endif
 
@@ -951,7 +997,7 @@
 )
 {
    return opus_multistream_encode_native(st, opus_copy_channel_in_float,
-      pcm, frame_size, data, max_data_bytes, 24, downmix_float);
+      pcm, frame_size, data, max_data_bytes, 24, downmix_float, 1);
 }
 
 int opus_multistream_encode(
@@ -963,7 +1009,7 @@
 )
 {
    return opus_multistream_encode_native(st, opus_copy_channel_in_short,
-      pcm, frame_size, data, max_data_bytes, 16, downmix_int);
+      pcm, frame_size, data, max_data_bytes, 16, downmix_int, 0);
 }
 #endif
 
diff --git a/src/opus_private.h b/src/opus_private.h
index 83225f2..3b62eed 100644
--- a/src/opus_private.h
+++ b/src/opus_private.h
@@ -33,6 +33,8 @@
 #include "opus.h"
 #include "celt.h"
 
+#include <stddef.h> /* offsetof */
+
 struct OpusRepacketizer {
    unsigned char toc;
    int nb_frames;
@@ -86,10 +88,6 @@
 void downmix_float(const void *_x, opus_val32 *sub, int subframe, int offset, int c1, int c2, int C);
 void downmix_int(const void *_x, opus_val32 *sub, int subframe, int offset, int c1, int c2, int C);
 
-int optimize_framesize(const opus_val16 *x, int len, int C, opus_int32 Fs,
-                int bitrate, opus_val16 tonality, float *mem, int buffering,
-                downmix_func downmix);
-
 int encode_size(int size, unsigned char *data);
 
 opus_int32 frame_size_select(opus_int32 frame_size, int variable_duration, opus_int32 Fs);
@@ -104,16 +102,23 @@
 
 opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_size,
       unsigned char *data, opus_int32 out_data_bytes, int lsb_depth,
-      const void *analysis_pcm, opus_int32 analysis_size, int c1, int c2, int analysis_channels, downmix_func downmix);
+      const void *analysis_pcm, opus_int32 analysis_size, int c1, int c2,
+      int analysis_channels, downmix_func downmix, int float_api);
 
 int opus_decode_native(OpusDecoder *st, const unsigned char *data, opus_int32 len,
       opus_val16 *pcm, int frame_size, int decode_fec, int self_delimited,
       opus_int32 *packet_offset, int soft_clip);
 
-/* Make sure everything's aligned to sizeof(void *) bytes */
+/* Make sure everything is properly aligned. */
 static OPUS_INLINE int align(int i)
 {
-    return (i+(int)sizeof(void *)-1)&-(int)sizeof(void *);
+    struct foo {char c; union { void* p; opus_int32 i; opus_val32 v; } u;};
+
+    unsigned int alignment = offsetof(struct foo, u);
+
+    /* Optimizing compilers should reduce the divide and multiply to a
+       bitwise AND for all sensible (power-of-two) alignment values. */
+    return ((i + alignment - 1) / alignment) * alignment;
 }
 
 int opus_packet_parse_impl(const unsigned char *data, opus_int32 len,
diff --git a/src/repacketizer.c b/src/repacketizer.c
index a62675c..f27e9ab 100644
--- a/src/repacketizer.c
+++ b/src/repacketizer.c
@@ -219,8 +219,9 @@
    }
    if (pad)
    {
-      for (i=ptr-data;i<maxlen;i++)
-         data[i] = 0;
+      /* Fill padding with zeros. */
+      while (ptr<data+maxlen)
+         *ptr++=0;
    }
    return tot_size;
 }
diff --git a/test-driver b/test-driver
new file mode 100755
index 0000000..8e575b0
--- /dev/null
+++ b/test-driver
@@ -0,0 +1,148 @@
+#! /bin/sh
+# test-driver - basic testsuite driver script.
+
+scriptversion=2013-07-13.22; # UTC
+
+# Copyright (C) 2011-2014 Free Software Foundation, Inc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+# As a special exception to the GNU General Public License, if you
+# distribute this file as part of a program that contains a
+# configuration script generated by Autoconf, you may include it under
+# the same distribution terms that you use for the rest of that program.
+
+# This file is maintained in Automake, please report
+# bugs to <bug-automake@gnu.org> or send patches to
+# <automake-patches@gnu.org>.
+
+# Make unconditional expansion of undefined variables an error.  This
+# helps a lot in preventing typo-related bugs.
+set -u
+
+usage_error ()
+{
+  echo "$0: $*" >&2
+  print_usage >&2
+  exit 2
+}
+
+print_usage ()
+{
+  cat <<END
+Usage:
+  test-driver --test-name=NAME --log-file=PATH --trs-file=PATH
+              [--expect-failure={yes|no}] [--color-tests={yes|no}]
+              [--enable-hard-errors={yes|no}] [--]
+              TEST-SCRIPT [TEST-SCRIPT-ARGUMENTS]
+The '--test-name', '--log-file' and '--trs-file' options are mandatory.
+END
+}
+
+test_name= # Used for reporting.
+log_file=  # Where to save the output of the test script.
+trs_file=  # Where to save the metadata of the test run.
+expect_failure=no
+color_tests=no
+enable_hard_errors=yes
+while test $# -gt 0; do
+  case $1 in
+  --help) print_usage; exit $?;;
+  --version) echo "test-driver $scriptversion"; exit $?;;
+  --test-name) test_name=$2; shift;;
+  --log-file) log_file=$2; shift;;
+  --trs-file) trs_file=$2; shift;;
+  --color-tests) color_tests=$2; shift;;
+  --expect-failure) expect_failure=$2; shift;;
+  --enable-hard-errors) enable_hard_errors=$2; shift;;
+  --) shift; break;;
+  -*) usage_error "invalid option: '$1'";;
+   *) break;;
+  esac
+  shift
+done
+
+missing_opts=
+test x"$test_name" = x && missing_opts="$missing_opts --test-name"
+test x"$log_file"  = x && missing_opts="$missing_opts --log-file"
+test x"$trs_file"  = x && missing_opts="$missing_opts --trs-file"
+if test x"$missing_opts" != x; then
+  usage_error "the following mandatory options are missing:$missing_opts"
+fi
+
+if test $# -eq 0; then
+  usage_error "missing argument"
+fi
+
+if test $color_tests = yes; then
+  # Keep this in sync with 'lib/am/check.am:$(am__tty_colors)'.
+  red='' # Red.
+  grn='' # Green.
+  lgn='' # Light green.
+  blu='' # Blue.
+  mgn='' # Magenta.
+  std=''     # No color.
+else
+  red= grn= lgn= blu= mgn= std=
+fi
+
+do_exit='rm -f $log_file $trs_file; (exit $st); exit $st'
+trap "st=129; $do_exit" 1
+trap "st=130; $do_exit" 2
+trap "st=141; $do_exit" 13
+trap "st=143; $do_exit" 15
+
+# Test script is run here.
+"$@" >$log_file 2>&1
+estatus=$?
+
+if test $enable_hard_errors = no && test $estatus -eq 99; then
+  tweaked_estatus=1
+else
+  tweaked_estatus=$estatus
+fi
+
+case $tweaked_estatus:$expect_failure in
+  0:yes) col=$red res=XPASS recheck=yes gcopy=yes;;
+  0:*)   col=$grn res=PASS  recheck=no  gcopy=no;;
+  77:*)  col=$blu res=SKIP  recheck=no  gcopy=yes;;
+  99:*)  col=$mgn res=ERROR recheck=yes gcopy=yes;;
+  *:yes) col=$lgn res=XFAIL recheck=no  gcopy=yes;;
+  *:*)   col=$red res=FAIL  recheck=yes gcopy=yes;;
+esac
+
+# Report the test outcome and exit status in the logs, so that one can
+# know whether the test passed or failed simply by looking at the '.log'
+# file, without the need of also peeking into the corresponding '.trs'
+# file (automake bug#11814).
+echo "$res $test_name (exit status: $estatus)" >>$log_file
+
+# Report outcome to console.
+echo "${col}${res}${std}: $test_name"
+
+# Register the test result, and other relevant metadata.
+echo ":test-result: $res" > $trs_file
+echo ":global-test-result: $res" >> $trs_file
+echo ":recheck: $recheck" >> $trs_file
+echo ":copy-in-global-log: $gcopy" >> $trs_file
+
+# Local Variables:
+# mode: shell-script
+# sh-indentation: 2
+# eval: (add-hook 'write-file-hooks 'time-stamp)
+# time-stamp-start: "scriptversion="
+# time-stamp-format: "%:y-%02m-%02d.%02H"
+# time-stamp-time-zone: "UTC"
+# time-stamp-end: "; # UTC"
+# End:
diff --git a/tests/run_vectors.sh b/tests/run_vectors.sh
index 116a743..1d447c4 100755
--- a/tests/run_vectors.sh
+++ b/tests/run_vectors.sh
@@ -45,8 +45,8 @@
 VECTOR_PATH=$2
 RATE=$3
 
-OPUS_DEMO=$CMD_PATH/opus_demo
-OPUS_COMPARE=$CMD_PATH/opus_compare
+: ${OPUS_DEMO:=$CMD_PATH/opus_demo}
+: ${OPUS_COMPARE:=$CMD_PATH/opus_compare}
 
 if [ -d $VECTOR_PATH ]; then
     echo Test vectors found in $VECTOR_PATH
diff --git a/tests/test_opus_api.c b/tests/test_opus_api.c
index bafe4e4..9bfa5cc 100644
--- a/tests/test_opus_api.c
+++ b/tests/test_opus_api.c
@@ -1753,7 +1753,7 @@
 #endif
 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
 
-typedef void *(*mhook)(size_t __size, __const __malloc_ptr_t);
+typedef void *(*mhook)(size_t __size, __const void *);
 #endif
 
 int test_malloc_fail(void)
diff --git a/version.mk b/version.mk
index 519ff14..f073d40 100644
--- a/version.mk
+++ b/version.mk
@@ -1,2 +1,2 @@
 # static version string; update manually every release.
-PACKAGE_VERSION = "1.1-beta"
+PACKAGE_VERSION = "1.1.2"
diff --git a/win32/VS2010/celt.vcxproj b/win32/VS2010/celt.vcxproj
index f107fec..958d6a9 100644
--- a/win32/VS2010/celt.vcxproj
+++ b/win32/VS2010/celt.vcxproj
@@ -37,6 +37,12 @@
     <ClCompile Include="..\..\celt\quant_bands.c" />
     <ClCompile Include="..\..\celt\rate.c" />
     <ClCompile Include="..\..\celt\vq.c" />
+    <ClCompile Include="..\..\celt\x86\celt_lpc_sse.c" />
+    <ClCompile Include="..\..\celt\x86\pitch_sse.c" />
+    <ClCompile Include="..\..\celt\x86\pitch_sse2.c" />
+    <ClCompile Include="..\..\celt\x86\pitch_sse4_1.c" />
+    <ClCompile Include="..\..\celt\x86\x86cpu.c" />
+    <ClCompile Include="..\..\celt\x86\x86_celt_map.c" />
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="..\..\celt\arch.h" />
@@ -67,6 +73,9 @@
     <ClInclude Include="..\..\celt\static_modes_fixed.h" />
     <ClInclude Include="..\..\celt\static_modes_float.h" />
     <ClInclude Include="..\..\celt\vq.h" />
+    <ClInclude Include="..\..\celt\x86\celt_lpc_sse.h" />
+    <ClInclude Include="..\..\celt\x86\pitch_sse.h" />
+    <ClInclude Include="..\..\celt\x86\x86cpu.h" />
     <ClInclude Include="..\..\celt\_kiss_fft_guts.h" />
   </ItemGroup>
   <PropertyGroup Label="Globals">
@@ -141,7 +150,7 @@
       <WarningLevel>Level3</WarningLevel>
       <Optimization>Disabled</Optimization>
       <PreprocessorDefinitions>HAVE_CONFIG_H;WIN32;_DEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>..\;..\..\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>..\;..\..\include;..\..\celt;..\..\silk;..\..\silk\float;..\..\silk\fixed;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
     </ClCompile>
     <Link>
@@ -168,7 +177,7 @@
       <WarningLevel>Level3</WarningLevel>
       <Optimization>Disabled</Optimization>
       <PreprocessorDefinitions>HAVE_CONFIG_H;WIN32;WIN64;_DEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>..\;..\..\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>..\;..\..\include;..\..\celt;..\..\silk;..\..\silk\float;..\..\silk\fixed;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
     </ClCompile>
     <Link>
@@ -196,7 +205,7 @@
       <FunctionLevelLinking>true</FunctionLevelLinking>
       <IntrinsicFunctions>true</IntrinsicFunctions>
       <PreprocessorDefinitions>HAVE_CONFIG_H;WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>..\;..\..\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>..\;..\..\include;..\..\celt;..\..\silk;..\..\silk\float;..\..\silk\fixed;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
     </ClCompile>
     <Link>
@@ -227,7 +236,7 @@
       <FunctionLevelLinking>true</FunctionLevelLinking>
       <IntrinsicFunctions>true</IntrinsicFunctions>
       <PreprocessorDefinitions>HAVE_CONFIG_H;WIN32;WIN64;NDEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>..\;..\..\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>..\;..\..\include;..\..\celt;..\..\silk;..\..\silk\float;..\..\silk\fixed;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
     </ClCompile>
     <Link>
diff --git a/win32/VS2010/celt.vcxproj.filters b/win32/VS2010/celt.vcxproj.filters
index e3a1d97..e9948fa 100644
--- a/win32/VS2010/celt.vcxproj.filters
+++ b/win32/VS2010/celt.vcxproj.filters
@@ -69,6 +69,24 @@
     <ClCompile Include="..\..\celt\celt.c">
       <Filter>Source Files</Filter>
     </ClCompile>
+    <ClCompile Include="..\..\celt\x86\celt_lpc_sse.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\celt\x86\pitch_sse.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\celt\x86\pitch_sse2.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\celt\x86\pitch_sse4_1.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\celt\x86\x86_celt_map.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\celt\x86\x86cpu.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="..\..\celt\cwrs.h">
@@ -158,5 +176,14 @@
     <ClInclude Include="..\..\celt\celt_lpc.h">
       <Filter>Header Files</Filter>
     </ClInclude>
+    <ClInclude Include="..\..\celt\x86\celt_lpc_sse.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\celt\x86\pitch_sse.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\celt\x86\x86cpu.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
   </ItemGroup>
 </Project>
\ No newline at end of file
diff --git a/win32/VS2010/opus_demo.vcxproj b/win32/VS2010/opus_demo.vcxproj
index 9cc081f..d087147 100644
--- a/win32/VS2010/opus_demo.vcxproj
+++ b/win32/VS2010/opus_demo.vcxproj
@@ -18,6 +18,23 @@
       <Platform>x64</Platform>
     </ProjectConfiguration>
   </ItemGroup>
+  <ItemGroup>
+    <ProjectReference Include="celt.vcxproj">
+      <Project>{245603e3-f580-41a5-9632-b25fe3372cbf}</Project>
+    </ProjectReference>
+    <ProjectReference Include="opus.vcxproj">
+      <Project>{219ec965-228a-1824-174d-96449d05f88a}</Project>
+    </ProjectReference>
+    <ProjectReference Include="silk_common.vcxproj">
+      <Project>{c303d2fc-ff97-49b8-9ddd-467b4c9a0b16}</Project>
+    </ProjectReference>
+    <ProjectReference Include="silk_float.vcxproj">
+      <Project>{9c4961d2-5ddb-40c7-9be8-ca918dc4e782}</Project>
+    </ProjectReference>
+  </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="..\..\src\opus_demo.c" />
+  </ItemGroup>
   <PropertyGroup Label="Globals">
     <ProjectGuid>{016C739D-6389-43BF-8D88-24B2BF6F620F}</ProjectGuid>
     <Keyword>Win32Proj</Keyword>
@@ -85,13 +102,12 @@
       <WarningLevel>Level3</WarningLevel>
       <Optimization>Disabled</Optimization>
       <PreprocessorDefinitions>HAVE_CONFIG_H;WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>../silk;../celt;../win32;../include;</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>..\..\silk;..\..\celt;..\;..\..\include;</AdditionalIncludeDirectories>
       <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
     </ClCompile>
     <Link>
       <SubSystem>Console</SubSystem>
       <GenerateDebugInformation>true</GenerateDebugInformation>
-      <AdditionalDependencies>$(SolutionDir)$(Configuration)\opus.lib;$(SolutionDir)$(Configuration)\celt.lib;$(SolutionDir)$(Configuration)\silk_common.lib;$(SolutionDir)$(Configuration)\silk_float.lib;%(AdditionalDependencies)</AdditionalDependencies>
     </Link>
   </ItemDefinitionGroup>
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
@@ -101,13 +117,12 @@
       <WarningLevel>Level3</WarningLevel>
       <Optimization>Disabled</Optimization>
       <PreprocessorDefinitions>HAVE_CONFIG_H;WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>../silk;../celt;../win32;../include;</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>..\..\silk;..\..\celt;..\;..\..\include;</AdditionalIncludeDirectories>
       <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
     </ClCompile>
     <Link>
       <SubSystem>Console</SubSystem>
       <GenerateDebugInformation>true</GenerateDebugInformation>
-      <AdditionalDependencies>$(SolutionDir)$(Configuration)\opus.lib;$(SolutionDir)$(Configuration)\celt.lib;$(SolutionDir)$(Configuration)\silk_common.lib;$(SolutionDir)$(Configuration)\silk_float.lib;%(AdditionalDependencies)</AdditionalDependencies>
     </Link>
   </ItemDefinitionGroup>
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
@@ -120,14 +135,13 @@
       <IntrinsicFunctions>true</IntrinsicFunctions>
       <PreprocessorDefinitions>HAVE_CONFIG_H;WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
-      <AdditionalIncludeDirectories>../silk;../celt;../win32;../include;</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>..\..\silk;..\..\celt;..\;..\..\include;</AdditionalIncludeDirectories>
       <MultiProcessorCompilation>true</MultiProcessorCompilation>
     </ClCompile>
     <Link>
       <SubSystem>Console</SubSystem>
       <EnableCOMDATFolding>true</EnableCOMDATFolding>
       <OptimizeReferences>true</OptimizeReferences>
-      <AdditionalDependencies>$(SolutionDir)$(Configuration)\opus.lib;$(SolutionDir)$(Configuration)\celt.lib;$(SolutionDir)$(Configuration)\silk_common.lib;$(SolutionDir)$(Configuration)\silk_float.lib;%(AdditionalDependencies)</AdditionalDependencies>
     </Link>
   </ItemDefinitionGroup>
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
@@ -140,14 +154,13 @@
       <IntrinsicFunctions>true</IntrinsicFunctions>
       <PreprocessorDefinitions>HAVE_CONFIG_H;WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
-      <AdditionalIncludeDirectories>../silk;../celt;../win32;../include;</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>..\..\silk;..\..\celt;..\;..\..\include;</AdditionalIncludeDirectories>
       <MultiProcessorCompilation>true</MultiProcessorCompilation>
     </ClCompile>
     <Link>
       <SubSystem>Console</SubSystem>
       <EnableCOMDATFolding>true</EnableCOMDATFolding>
       <OptimizeReferences>true</OptimizeReferences>
-      <AdditionalDependencies>$(SolutionDir)$(Configuration)\opus.lib;$(SolutionDir)$(Configuration)\celt.lib;$(SolutionDir)$(Configuration)\silk_common.lib;$(SolutionDir)$(Configuration)\silk_float.lib;%(AdditionalDependencies)</AdditionalDependencies>
     </Link>
   </ItemDefinitionGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
diff --git a/win32/VS2010/opus_demo.vcxproj.filters b/win32/VS2010/opus_demo.vcxproj.filters
index d7ef6a1..2eb113a 100644
--- a/win32/VS2010/opus_demo.vcxproj.filters
+++ b/win32/VS2010/opus_demo.vcxproj.filters
@@ -14,4 +14,9 @@
       <Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
     </Filter>
   </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="..\..\src\opus_demo.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+  </ItemGroup>
 </Project>
\ No newline at end of file
diff --git a/win32/VS2010/silk_common.vcxproj b/win32/VS2010/silk_common.vcxproj
index 9cf5f48..1bf2b20 100644
--- a/win32/VS2010/silk_common.vcxproj
+++ b/win32/VS2010/silk_common.vcxproj
@@ -88,7 +88,7 @@
       <WarningLevel>Level3</WarningLevel>
       <Optimization>Disabled</Optimization>
       <PreprocessorDefinitions>HAVE_CONFIG_H;WIN32;_DEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>../../silk/fixed;../../silk/float;../../win32;../../celt;../../include</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>../..;../../silk/fixed;../../silk/float;../../silk;../../win32;../../celt;../../include</AdditionalIncludeDirectories>
       <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
     </ClCompile>
     <Link>
@@ -118,7 +118,7 @@
       <WarningLevel>Level3</WarningLevel>
       <Optimization>Disabled</Optimization>
       <PreprocessorDefinitions>HAVE_CONFIG_H;WIN32;WIN64;_DEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>../../silk/fixed;../../silk/float;../../win32;../../celt;../../include</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>../..;../../silk/fixed;../../silk/float;../../silk;../../win32;../../celt;../../include</AdditionalIncludeDirectories>
       <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
     </ClCompile>
     <Link>
@@ -149,7 +149,7 @@
       <FunctionLevelLinking>true</FunctionLevelLinking>
       <IntrinsicFunctions>true</IntrinsicFunctions>
       <PreprocessorDefinitions>HAVE_CONFIG_H;WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>../../silk/fixed;../../silk/float;../../win32;../../celt;../../include</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>../..;../../silk/fixed;../../silk/float;../../silk;../../win32;../../celt;../../include</AdditionalIncludeDirectories>
       <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
       <FloatingPointModel>Fast</FloatingPointModel>
     </ClCompile>
@@ -184,7 +184,7 @@
       <FunctionLevelLinking>true</FunctionLevelLinking>
       <IntrinsicFunctions>true</IntrinsicFunctions>
       <PreprocessorDefinitions>HAVE_CONFIG_H;WIN32;WIN64;NDEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>../../silk/fixed;../../silk/float;../../win32;../../celt;../../include</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>../..;../../silk/fixed;../../silk/float;../../silk;../../win32;../../celt;../../include</AdditionalIncludeDirectories>
       <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
       <FloatingPointModel>Fast</FloatingPointModel>
     </ClCompile>
@@ -212,6 +212,8 @@
   </ItemDefinitionGroup>
   <ItemGroup>
     <ClInclude Include="..\..\include\opus_types.h" />
+    <ClInclude Include="..\..\silk\x86\main_sse.h" />
+    <ClInclude Include="..\..\silk\x86\SigProc_FIX_sse.h" />
     <ClInclude Include="..\..\win32\config.h" />
     <ClInclude Include="..\..\silk\control.h" />
     <ClInclude Include="..\..\silk\debug.h" />
@@ -311,6 +313,11 @@
     <ClCompile Include="..\..\silk\table_LSF_cos.c" />
     <ClCompile Include="..\..\silk\VAD.c" />
     <ClCompile Include="..\..\silk\VQ_WMat_EC.c" />
+    <ClCompile Include="..\..\silk\x86\NSQ_del_dec_sse.c" />
+    <ClCompile Include="..\..\silk\x86\NSQ_sse.c" />
+    <ClCompile Include="..\..\silk\x86\VAD_sse.c" />
+    <ClCompile Include="..\..\silk\x86\VQ_WMat_EC_sse.c" />
+    <ClCompile Include="..\..\silk\x86\x86_silk_map.c" />
   </ItemGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets">
diff --git a/win32/VS2010/silk_common.vcxproj.filters b/win32/VS2010/silk_common.vcxproj.filters
index 30db48e..c41064e 100644
--- a/win32/VS2010/silk_common.vcxproj.filters
+++ b/win32/VS2010/silk_common.vcxproj.filters
@@ -81,6 +81,12 @@
     <ClInclude Include="..\..\silk\typedef.h">
       <Filter>Header Files</Filter>
     </ClInclude>
+    <ClInclude Include="..\..\silk\x86\main_sse.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\silk\x86\SigProc_FIX_sse.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
   </ItemGroup>
   <ItemGroup>
     <ClCompile Include="..\..\silk\VQ_WMat_EC.c">
@@ -311,5 +317,20 @@
     <ClCompile Include="..\..\silk\VAD.c">
       <Filter>Source Files</Filter>
     </ClCompile>
+    <ClCompile Include="..\..\silk\x86\NSQ_del_dec_sse.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\silk\x86\NSQ_sse.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\silk\x86\VAD_sse.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\silk\x86\VQ_WMat_EC_sse.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\silk\x86\x86_silk_map.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
   </ItemGroup>
 </Project>
diff --git a/win32/VS2010/silk_fixed.vcxproj b/win32/VS2010/silk_fixed.vcxproj
index 5ea1a91..1d01a33 100644
--- a/win32/VS2010/silk_fixed.vcxproj
+++ b/win32/VS2010/silk_fixed.vcxproj
@@ -86,7 +86,7 @@
       <WarningLevel>Level3</WarningLevel>
       <Optimization>Disabled</Optimization>
       <PreprocessorDefinitions>HAVE_CONFIG_H;WIN32;_DEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>../../silk/fixed;../../silk;../../win32;../../celt;../../include;../win32</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>../..;../../silk/fixed;../../silk;../../win32;../../celt;../../include;../win32</AdditionalIncludeDirectories>
       <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
     </ClCompile>
     <Link>
@@ -104,7 +104,7 @@
       <WarningLevel>Level3</WarningLevel>
       <Optimization>Disabled</Optimization>
       <PreprocessorDefinitions>HAVE_CONFIG_H;WIN32;_DEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>../../silk/fixed;../../silk;../../win32;../../celt;../../include;../win32</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>../..;../../silk/fixed;../../silk;../../win32;../../celt;../../include;../win32</AdditionalIncludeDirectories>
       <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
     </ClCompile>
     <Link>
@@ -123,7 +123,7 @@
       <FunctionLevelLinking>true</FunctionLevelLinking>
       <IntrinsicFunctions>true</IntrinsicFunctions>
       <PreprocessorDefinitions>HAVE_CONFIG_H;WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>../../silk/fixed;../../silk;../../win32;../../celt;../../include;../win32</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>../..;../../silk/fixed;../../silk;../../win32;../../celt;../../include;../win32</AdditionalIncludeDirectories>
       <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
     </ClCompile>
     <Link>
@@ -145,7 +145,7 @@
       <FunctionLevelLinking>true</FunctionLevelLinking>
       <IntrinsicFunctions>true</IntrinsicFunctions>
       <PreprocessorDefinitions>HAVE_CONFIG_H;WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>../../silk/fixed;../../silk;../../win32;../../celt;../../include;../win32</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>../..;../../silk/fixed;../../silk;../../win32;../../celt;../../include;../win32</AdditionalIncludeDirectories>
       <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
     </ClCompile>
     <Link>
@@ -191,6 +191,9 @@
     <ClCompile Include="..\..\silk\fixed\solve_LS_FIX.c" />
     <ClCompile Include="..\..\silk\fixed\vector_ops_FIX.c" />
     <ClCompile Include="..\..\silk\fixed\warped_autocorrelation_FIX.c" />
+    <ClCompile Include="..\..\silk\fixed\x86\burg_modified_FIX_sse.c" />
+    <ClCompile Include="..\..\silk\fixed\x86\prefilter_FIX_sse.c" />
+    <ClCompile Include="..\..\silk\fixed\x86\vector_ops_FIX_sse.c" />
   </ItemGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets">
diff --git a/win32/VS2010/silk_fixed.vcxproj.filters b/win32/VS2010/silk_fixed.vcxproj.filters
index 6897930..c2327eb 100644
--- a/win32/VS2010/silk_fixed.vcxproj.filters
+++ b/win32/VS2010/silk_fixed.vcxproj.filters
@@ -18,18 +18,18 @@
     <ClInclude Include="..\..\win32\config.h">
       <Filter>Header Files</Filter>
     </ClInclude>
-    <ClInclude Include="main_FIX.h">
-      <Filter>Header Files</Filter>
-    </ClInclude>
-    <ClInclude Include="..\SigProc_FIX.h">
-      <Filter>Header Files</Filter>
-    </ClInclude>
-    <ClInclude Include="structs_FIX.h">
-      <Filter>Header Files</Filter>
-    </ClInclude>
     <ClInclude Include="..\..\include\opus_types.h">
       <Filter>Header Files</Filter>
     </ClInclude>
+    <ClInclude Include="..\..\silk\SigProc_FIX.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\silk\fixed\main_FIX.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\silk\fixed\structs_FIX.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
   </ItemGroup>
   <ItemGroup>
     <ClCompile Include="..\..\silk\fixed\LTP_scale_ctrl_FIX.c">
@@ -107,5 +107,14 @@
     <ClCompile Include="..\..\silk\fixed\LTP_analysis_filter_FIX.c">
       <Filter>Source Files</Filter>
     </ClCompile>
+    <ClCompile Include="..\..\silk\fixed\x86\burg_modified_FIX_sse.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\silk\fixed\x86\prefilter_FIX_sse.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\silk\fixed\x86\vector_ops_FIX_sse.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
   </ItemGroup>
 </Project>
\ No newline at end of file
diff --git a/win32/VS2010/test_opus_api.vcxproj b/win32/VS2010/test_opus_api.vcxproj
index bf42a8f..0389b95 100644
--- a/win32/VS2010/test_opus_api.vcxproj
+++ b/win32/VS2010/test_opus_api.vcxproj
@@ -21,6 +21,20 @@
   <ItemGroup>
     <ClCompile Include="..\..\tests\test_opus_api.c" />
   </ItemGroup>
+  <ItemGroup>
+    <ProjectReference Include="celt.vcxproj">
+      <Project>{245603e3-f580-41a5-9632-b25fe3372cbf}</Project>
+    </ProjectReference>
+    <ProjectReference Include="opus.vcxproj">
+      <Project>{219ec965-228a-1824-174d-96449d05f88a}</Project>
+    </ProjectReference>
+    <ProjectReference Include="silk_common.vcxproj">
+      <Project>{c303d2fc-ff97-49b8-9ddd-467b4c9a0b16}</Project>
+    </ProjectReference>
+    <ProjectReference Include="silk_float.vcxproj">
+      <Project>{9c4961d2-5ddb-40c7-9be8-ca918dc4e782}</Project>
+    </ProjectReference>
+  </ItemGroup>
   <PropertyGroup Label="Globals">
     <ProjectGuid>{1D257A17-D254-42E5-82D6-1C87A6EC775A}</ProjectGuid>
     <Keyword>Win32Proj</Keyword>
@@ -94,7 +108,6 @@
     <Link>
       <SubSystem>Console</SubSystem>
       <GenerateDebugInformation>true</GenerateDebugInformation>
-      <AdditionalDependencies>$(Platform)\$(Configuration)\opus.lib;$(Platform)\$(Configuration)\celt.lib;$(Platform)\$(Configuration)\silk_common.lib;$(Platform)\$(Configuration)\silk_float.lib</AdditionalDependencies>
     </Link>
   </ItemDefinitionGroup>
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
@@ -110,7 +123,6 @@
     <Link>
       <SubSystem>Console</SubSystem>
       <GenerateDebugInformation>true</GenerateDebugInformation>
-      <AdditionalDependencies>$(Platform)\$(Configuration)\opus.lib;$(Platform)\$(Configuration)\celt.lib;$(Platform)\$(Configuration)\silk_common.lib;$(Platform)\$(Configuration)\silk_float.lib</AdditionalDependencies>
     </Link>
   </ItemDefinitionGroup>
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
@@ -129,7 +141,6 @@
       <SubSystem>Console</SubSystem>
       <EnableCOMDATFolding>true</EnableCOMDATFolding>
       <OptimizeReferences>true</OptimizeReferences>
-      <AdditionalDependencies>$(Platform)\$(Configuration)\opus.lib;$(Platform)\$(Configuration)\celt.lib;$(Platform)\$(Configuration)\silk_common.lib;$(Platform)\$(Configuration)\silk_float.lib</AdditionalDependencies>
     </Link>
   </ItemDefinitionGroup>
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
@@ -148,7 +159,6 @@
       <SubSystem>Console</SubSystem>
       <EnableCOMDATFolding>true</EnableCOMDATFolding>
       <OptimizeReferences>true</OptimizeReferences>
-      <AdditionalDependencies>$(Platform)\$(Configuration)\opus.lib;$(Platform)\$(Configuration)\celt.lib;$(Platform)\$(Configuration)\silk_common.lib;$(Platform)\$(Configuration)\silk_float.lib</AdditionalDependencies>
     </Link>
   </ItemDefinitionGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
diff --git a/win32/VS2010/test_opus_decode.vcxproj b/win32/VS2010/test_opus_decode.vcxproj
index 3452331..67e552d 100644
--- a/win32/VS2010/test_opus_decode.vcxproj
+++ b/win32/VS2010/test_opus_decode.vcxproj
@@ -21,6 +21,20 @@
   <ItemGroup>
     <ClCompile Include="..\..\tests\test_opus_decode.c" />
   </ItemGroup>
+  <ItemGroup>
+    <ProjectReference Include="celt.vcxproj">
+      <Project>{245603e3-f580-41a5-9632-b25fe3372cbf}</Project>
+    </ProjectReference>
+    <ProjectReference Include="opus.vcxproj">
+      <Project>{219ec965-228a-1824-174d-96449d05f88a}</Project>
+    </ProjectReference>
+    <ProjectReference Include="silk_common.vcxproj">
+      <Project>{c303d2fc-ff97-49b8-9ddd-467b4c9a0b16}</Project>
+    </ProjectReference>
+    <ProjectReference Include="silk_float.vcxproj">
+      <Project>{9c4961d2-5ddb-40c7-9be8-ca918dc4e782}</Project>
+    </ProjectReference>
+  </ItemGroup>
   <PropertyGroup Label="Globals">
     <ProjectGuid>{8578322A-1883-486B-B6FA-E0094B65C9F2}</ProjectGuid>
     <Keyword>Win32Proj</Keyword>
@@ -95,7 +109,6 @@
     <Link>
       <SubSystem>Console</SubSystem>
       <GenerateDebugInformation>true</GenerateDebugInformation>
-      <AdditionalDependencies>$(Platform)\$(Configuration)\opus.lib;$(Platform)\$(Configuration)\celt.lib;$(Platform)\$(Configuration)\silk_common.lib;$(Platform)\$(Configuration)\silk_float.lib</AdditionalDependencies>
     </Link>
   </ItemDefinitionGroup>
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
@@ -112,7 +125,6 @@
     <Link>
       <SubSystem>Console</SubSystem>
       <GenerateDebugInformation>true</GenerateDebugInformation>
-      <AdditionalDependencies>$(Platform)\$(Configuration)\opus.lib;$(Platform)\$(Configuration)\celt.lib;$(Platform)\$(Configuration)\silk_common.lib;$(Platform)\$(Configuration)\silk_float.lib</AdditionalDependencies>
     </Link>
   </ItemDefinitionGroup>
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
@@ -132,7 +144,6 @@
       <SubSystem>Console</SubSystem>
       <EnableCOMDATFolding>true</EnableCOMDATFolding>
       <OptimizeReferences>true</OptimizeReferences>
-      <AdditionalDependencies>$(Platform)\$(Configuration)\opus.lib;$(Platform)\$(Configuration)\celt.lib;$(Platform)\$(Configuration)\silk_common.lib;$(Platform)\$(Configuration)\silk_float.lib</AdditionalDependencies>
     </Link>
   </ItemDefinitionGroup>
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
@@ -152,7 +163,6 @@
       <SubSystem>Console</SubSystem>
       <EnableCOMDATFolding>true</EnableCOMDATFolding>
       <OptimizeReferences>true</OptimizeReferences>
-      <AdditionalDependencies>$(Platform)\$(Configuration)\opus.lib;$(Platform)\$(Configuration)\celt.lib;$(Platform)\$(Configuration)\silk_common.lib;$(Platform)\$(Configuration)\silk_float.lib</AdditionalDependencies>
     </Link>
   </ItemDefinitionGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
diff --git a/win32/VS2010/test_opus_encode.vcxproj b/win32/VS2010/test_opus_encode.vcxproj
index d2ede27..50354d4 100644
--- a/win32/VS2010/test_opus_encode.vcxproj
+++ b/win32/VS2010/test_opus_encode.vcxproj
@@ -21,6 +21,20 @@
   <ItemGroup>
     <ClCompile Include="..\..\tests\test_opus_encode.c" />
   </ItemGroup>
+  <ItemGroup>
+    <ProjectReference Include="celt.vcxproj">
+      <Project>{245603e3-f580-41a5-9632-b25fe3372cbf}</Project>
+    </ProjectReference>
+    <ProjectReference Include="opus.vcxproj">
+      <Project>{219ec965-228a-1824-174d-96449d05f88a}</Project>
+    </ProjectReference>
+    <ProjectReference Include="silk_common.vcxproj">
+      <Project>{c303d2fc-ff97-49b8-9ddd-467b4c9a0b16}</Project>
+    </ProjectReference>
+    <ProjectReference Include="silk_float.vcxproj">
+      <Project>{9c4961d2-5ddb-40c7-9be8-ca918dc4e782}</Project>
+    </ProjectReference>
+  </ItemGroup>
   <PropertyGroup Label="Globals">
     <ProjectGuid>{84DAA768-1A38-4312-BB61-4C78BB59E5B8}</ProjectGuid>
     <Keyword>Win32Proj</Keyword>
@@ -95,7 +109,6 @@
     <Link>
       <SubSystem>Console</SubSystem>
       <GenerateDebugInformation>true</GenerateDebugInformation>
-      <AdditionalDependencies>$(Platform)\$(Configuration)\opus.lib;$(Platform)\$(Configuration)\celt.lib;$(Platform)\$(Configuration)\silk_common.lib;$(Platform)\$(Configuration)\silk_float.lib</AdditionalDependencies>
     </Link>
   </ItemDefinitionGroup>
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
@@ -112,7 +125,6 @@
     <Link>
       <SubSystem>Console</SubSystem>
       <GenerateDebugInformation>true</GenerateDebugInformation>
-      <AdditionalDependencies>$(Platform)\$(Configuration)\opus.lib;$(Platform)\$(Configuration)\celt.lib;$(Platform)\$(Configuration)\silk_common.lib;$(Platform)\$(Configuration)\silk_float.lib</AdditionalDependencies>
     </Link>
   </ItemDefinitionGroup>
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
@@ -132,7 +144,6 @@
       <SubSystem>Console</SubSystem>
       <EnableCOMDATFolding>true</EnableCOMDATFolding>
       <OptimizeReferences>true</OptimizeReferences>
-      <AdditionalDependencies>$(Platform)\$(Configuration)\opus.lib;$(Platform)\$(Configuration)\celt.lib;$(Platform)\$(Configuration)\silk_common.lib;$(Platform)\$(Configuration)\silk_float.lib</AdditionalDependencies>
     </Link>
   </ItemDefinitionGroup>
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
@@ -152,7 +163,6 @@
       <SubSystem>Console</SubSystem>
       <EnableCOMDATFolding>true</EnableCOMDATFolding>
       <OptimizeReferences>true</OptimizeReferences>
-      <AdditionalDependencies>$(Platform)\$(Configuration)\opus.lib;$(Platform)\$(Configuration)\celt.lib;$(Platform)\$(Configuration)\silk_common.lib;$(Platform)\$(Configuration)\silk_float.lib</AdditionalDependencies>
     </Link>
   </ItemDefinitionGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
diff --git a/win32/config.h b/win32/config.h
index 46ff699..3e54bcb 100644
--- a/win32/config.h
+++ b/win32/config.h
@@ -35,9 +35,28 @@
 
 #define OPUS_BUILD            1
 
-/* Enable SSE functions, if compiled with SSE/SSE2 (note that AMD64 implies SSE2) */
-#if defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 1))
-#define __SSE__               1
+#if defined(_M_IX86) || defined(_M_X64)
+/* Can always compile SSE intrinsics (no special compiler flags necessary) */
+#define OPUS_X86_MAY_HAVE_SSE
+#define OPUS_X86_MAY_HAVE_SSE2
+#define OPUS_X86_MAY_HAVE_SSE4_1
+
+/* Presume SSE functions, if compiled to use SSE/SSE2/AVX (note that AMD64 implies SSE2, and AVX
+   implies SSE4.1) */
+#if defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 1)) || defined(__AVX__)
+#define OPUS_X86_PRESUME_SSE 1
+#endif
+#if defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)) || defined(__AVX__)
+#define OPUS_X86_PRESUME_SSE2 1
+#endif
+#if defined(__AVX__)
+#define OPUS_X86_PRESUME_SSE4_1 1
+#endif
+
+#if !defined(OPUS_X86_PRESUME_SSE4_1) || !defined(OPUS_X86_PRESUME_SSE2) || !defined(OPUS_X86_PRESUME_SSE)
+#define OPUS_HAVE_RTCD 1
+#endif
+
 #endif
 
 #include "version.h"
diff --git a/win32/genversion.bat b/win32/genversion.bat
index a9b9353..cd1d4dc 100644
--- a/win32/genversion.bat
+++ b/win32/genversion.bat
@@ -23,10 +23,10 @@
 
 :gotversion
 
-set version_out=#define %2 "%version%"
-set version_mk=%2 = "%version%"
+set version_out=#define %~2 "%version%"
+set version_mk=%~2 = "%version%"
 
-echo %version_out%> "%1_temp"
+echo %version_out%> "%~1_temp"
 
 if %version%==unknown goto :skipgenerate
 
@@ -35,12 +35,12 @@
 
 :skipgenerate
 
-echo n | comp "%1_temp" "%1" > NUL 2> NUL
+echo n | comp "%~1_temp" "%~1" > NUL 2> NUL
 
 if not errorlevel 1 goto exit
 
-copy /y "%1_temp" "%1"
+copy /y "%~1_temp" "%~1"
 
 :exit
 
-del "%1_temp"
+del "%~1_temp"