Upgrade libopus to 794392ecd77e6fc6aafa62c3f6002780abcc2c7c

Test: make
Change-Id: I3e5232d6c6d250c69abfa71482ac91e30c1933b6
diff --git a/Android.bp b/Android.bp
new file mode 100644
index 0000000..b9ad434
--- /dev/null
+++ b/Android.bp
@@ -0,0 +1,351 @@
+cc_library {
+    name: "libopus",
+    vendor_available: true,
+    host_supported: true,
+
+    export_include_dirs: ["include"],
+
+    local_include_dirs: [
+        "src",
+        "silk",
+        "celt",
+        "silk/fixed",
+    ],
+
+    sanitize: {
+        integer_overflow: true,
+        misc_undefined: ["bounds"],
+        blocklist: "libopus_blocklist.txt",
+    },
+
+    srcs: [
+        // CELT_SOURCES
+        "celt/bands.c",
+        "celt/celt.c",
+        "celt/celt_encoder.c",
+        "celt/celt_decoder.c",
+        "celt/cwrs.c",
+        "celt/entcode.c",
+        "celt/entdec.c",
+        "celt/entenc.c",
+        "celt/kiss_fft.c",
+        "celt/laplace.c",
+        "celt/mathops.c",
+        "celt/mdct.c",
+        "celt/modes.c",
+        "celt/pitch.c",
+        "celt/celt_lpc.c",
+        "celt/quant_bands.c",
+        "celt/rate.c",
+        "celt/vq.c",
+
+        // SILK_SOURCES
+        "silk/CNG.c",
+        "silk/code_signs.c",
+        "silk/init_decoder.c",
+        "silk/decode_core.c",
+        "silk/decode_frame.c",
+        "silk/decode_parameters.c",
+        "silk/decode_indices.c",
+        "silk/decode_pulses.c",
+        "silk/decoder_set_fs.c",
+        "silk/dec_API.c",
+        "silk/enc_API.c",
+        "silk/encode_indices.c",
+        "silk/encode_pulses.c",
+        "silk/gain_quant.c",
+        "silk/interpolate.c",
+        "silk/LP_variable_cutoff.c",
+        "silk/NLSF_decode.c",
+        "silk/NSQ.c",
+        "silk/NSQ_del_dec.c",
+        "silk/PLC.c",
+        "silk/shell_coder.c",
+        "silk/tables_gain.c",
+        "silk/tables_LTP.c",
+        "silk/tables_NLSF_CB_NB_MB.c",
+        "silk/tables_NLSF_CB_WB.c",
+        "silk/tables_other.c",
+        "silk/tables_pitch_lag.c",
+        "silk/tables_pulses_per_block.c",
+        "silk/VAD.c",
+        "silk/control_audio_bandwidth.c",
+        "silk/quant_LTP_gains.c",
+        "silk/VQ_WMat_EC.c",
+        "silk/HP_variable_cutoff.c",
+        "silk/NLSF_encode.c",
+        "silk/NLSF_VQ.c",
+        "silk/NLSF_unpack.c",
+        "silk/NLSF_del_dec_quant.c",
+        "silk/process_NLSFs.c",
+        "silk/stereo_LR_to_MS.c",
+        "silk/stereo_MS_to_LR.c",
+        "silk/check_control_input.c",
+        "silk/control_SNR.c",
+        "silk/init_encoder.c",
+        "silk/control_codec.c",
+        "silk/A2NLSF.c",
+        "silk/ana_filt_bank_1.c",
+        "silk/biquad_alt.c",
+        "silk/bwexpander_32.c",
+        "silk/bwexpander.c",
+        "silk/debug.c",
+        "silk/decode_pitch.c",
+        "silk/inner_prod_aligned.c",
+        "silk/lin2log.c",
+        "silk/log2lin.c",
+        "silk/LPC_analysis_filter.c",
+        "silk/LPC_fit.c",
+        "silk/LPC_inv_pred_gain.c",
+        "silk/table_LSF_cos.c",
+        "silk/NLSF2A.c",
+        "silk/NLSF_stabilize.c",
+        "silk/NLSF_VQ_weights_laroia.c",
+        "silk/pitch_est_tables.c",
+        "silk/resampler.c",
+        "silk/resampler_down2_3.c",
+        "silk/resampler_down2.c",
+        "silk/resampler_private_AR2.c",
+        "silk/resampler_private_down_FIR.c",
+        "silk/resampler_private_IIR_FIR.c",
+        "silk/resampler_private_up2_HQ.c",
+        "silk/resampler_rom.c",
+        "silk/sigm_Q15.c",
+        "silk/sort.c",
+        "silk/sum_sqr_shift.c",
+        "silk/stereo_decode_pred.c",
+        "silk/stereo_encode_pred.c",
+        "silk/stereo_find_predictor.c",
+        "silk/stereo_quant_pred.c",
+
+        // SILK_SOURCES_FIXED
+        "silk/fixed/LTP_analysis_filter_FIX.c",
+        "silk/fixed/LTP_scale_ctrl_FIX.c",
+        "silk/fixed/corrMatrix_FIX.c",
+        "silk/fixed/encode_frame_FIX.c",
+        "silk/fixed/find_LPC_FIX.c",
+        "silk/fixed/find_LTP_FIX.c",
+        "silk/fixed/find_pitch_lags_FIX.c",
+        "silk/fixed/find_pred_coefs_FIX.c",
+        "silk/fixed/noise_shape_analysis_FIX.c",
+        "silk/fixed/process_gains_FIX.c",
+        "silk/fixed/regularize_correlations_FIX.c",
+        "silk/fixed/residual_energy16_FIX.c",
+        "silk/fixed/residual_energy_FIX.c",
+        "silk/fixed/warped_autocorrelation_FIX.c",
+        "silk/fixed/apply_sine_window_FIX.c",
+        "silk/fixed/autocorr_FIX.c",
+        "silk/fixed/burg_modified_FIX.c",
+        "silk/fixed/k2a_FIX.c",
+        "silk/fixed/k2a_Q16_FIX.c",
+        "silk/fixed/pitch_analysis_core_FIX.c",
+        "silk/fixed/vector_ops_FIX.c",
+        "silk/fixed/schur64_FIX.c",
+        "silk/fixed/schur_FIX.c",
+
+        // OPUS_SOURCES
+        "src/mapping_matrix.c",
+        "src/opus.c",
+        "src/opus_decoder.c",
+        "src/opus_encoder.c",
+        "src/opus_multistream.c",
+        "src/opus_multistream_encoder.c",
+        "src/opus_multistream_decoder.c",
+        "src/opus_projection_encoder.c",
+        "src/opus_projection_decoder.c",
+        "src/repacketizer.c",
+
+        // OPUS_SOURCES_FLOAT
+        "src/analysis.c",
+        "src/mlp.c",
+        "src/mlp_data.c",
+    ],
+
+    cflags: [
+        "-DNULL=0",
+        "-DSOCKLEN_T=socklen_t",
+        "-DLOCALE_NOT_USED",
+        "-D_LARGEFILE_SOURCE=1",
+        "-D_FILE_OFFSET_BITS=64",
+        "-Drestrict=",
+        "-D__EMX__",
+        "-DOPUS_BUILD",
+        "-DFIXED_POINT",
+        "-DUSE_ALLOCA",
+        "-DHAVE_LRINT",
+        "-DHAVE_LRINTF",
+        "-DENABLE_HARDENING",
+        "-O2",
+        "-fno-math-errno",
+        "-Wall",
+        "-Werror",
+    ],
+    cppflags: [
+        "-DBSD=1",
+        "-ffast-math",
+        "-O2",
+        "-funroll-loops",
+    ],
+
+    arch: {
+        arm: {
+            srcs: [
+                // CELT_SOURCES_ARM
+                "celt/arm/armcpu.c",
+                "celt/arm/arm_celt_map.c",
+
+                // DSP, MEDIA and NEON instructions are in the same assembler
+                // file - thus we need to include it even if NEON is not
+                // supported on target platform.
+                // CELT_SOURCES_ARM_ASM
+                "celt/arm/celt_pitch_xcorr_arm_gnu.s",
+
+                // CELT_AM_SOURCES_ARM_ASM
+                "celt/arm/armopts_gnu.s",
+            ],
+
+            cflags: [
+                "-DOPUS_ARM_ASM",
+                "-DOPUS_ARM_INLINE_ASM",
+                "-DOPUS_ARM_MAY_HAVE_EDSP",
+                "-DOPUS_ARM_INLINE_EDSP",
+                "-DOPUS_ARM_MAY_HAVE_MEDIA",
+                "-DOPUS_ARM_INLINE_MEDIA",
+                "-DOPUS_ARM_MAY_HAVE_NEON",
+                "-DOPUS_HAVE_RTCD",
+            ],
+
+            // Note: OPUS enhanced DSP/NEON implementation is not yet
+            // compatible with arm64.  Only add the appropriate defines for
+            // 32-bit arm architecture.
+            neon: {
+                srcs: [
+                    // CELT_SOURCES_ARM_NEON_INTR
+                    "celt/arm/celt_neon_intr.c",
+                    "celt/arm/pitch_neon_intr.c",
+
+                    // SILK_SOURCES_ARM_NEON_INTR,
+                    "silk/arm/arm_silk_map.c",
+                    "silk/arm/biquad_alt_neon_intr.c",
+                    "silk/arm/LPC_inv_pred_gain_neon_intr.c",
+                    "silk/arm/NSQ_del_dec_neon_intr.c",
+                    "silk/arm/NSQ_neon.c",
+
+                    // SILK_SOURCES_FIXED_ARM_NEON_INTR,
+                    "silk/fixed/arm/warped_autocorrelation_FIX_neon_intr.c",
+                ],
+
+                cflags: [
+                    "-DOPUS_ARM_MAY_HAVE_NEON",
+                    "-DOPUS_ARM_MAY_HAVE_NEON_INTR",
+                    "-DOPUS_ARM_PRESUME_NEON",
+                    "-DOPUS_ARM_INLINE_NEON",
+                ],
+            },
+
+        },
+
+        x86: {
+            ssse3: {
+                srcs: [
+                    // CELT_SOURCES_SSE
+                    "celt/x86/x86cpu.c",
+                    "celt/x86/x86_celt_map.c",
+                    "celt/x86/pitch_sse.c",
+
+                    // CELT_SOURCES_SSE2
+                    "celt/x86/pitch_sse2.c",
+                    "celt/x86/vq_sse2.c",
+                ],
+
+                cflags: [
+                    "-DOPUS_X86_MAY_HAVE_SSE",
+                    "-DOPUS_X86_PRESUME_SSE",
+                    "-DOPUS_X86_MAY_HAVE_SSE2",
+                    "-DOPUS_X86_PRESUME_SSE2",
+                ],
+            },
+
+            sse4_1: {
+                srcs: [
+                    // CELT_SOURCES_SSE4_1
+                    "celt/x86/celt_lpc_sse4_1.c",
+                    "celt/x86/pitch_sse4_1.c",
+
+                    // SILK_SOURCES_SSE4_1
+                    "silk/x86/NSQ_sse4_1.c",
+                    "silk/x86/NSQ_del_dec_sse4_1.c",
+                    "silk/x86/x86_silk_map.c",
+                    "silk/x86/VAD_sse4_1.c",
+                    "silk/x86/VQ_WMat_EC_sse4_1.c",
+
+                    // SILK_SOURCES_FIXED_SSE4_1
+                    "silk/fixed/x86/vector_ops_FIX_sse4_1.c",
+                    "silk/fixed/x86/burg_modified_FIX_sse4_1.c",
+                ],
+
+                cflags: [
+                    "-DOPUS_X86_MAY_HAVE_SSE4_1",
+                    "-DOPUS_X86_PRESUME_SSE4_1",
+                ],
+            },
+        },
+
+        x86_64: {
+            ssse3: {
+                srcs: [
+                    // CELT_SOURCES_SSE
+                    "celt/x86/x86cpu.c",
+                    "celt/x86/x86_celt_map.c",
+                    "celt/x86/pitch_sse.c",
+
+                    // CELT_SOURCES_SSE2
+                    "celt/x86/pitch_sse2.c",
+                    "celt/x86/vq_sse2.c",
+                ],
+
+                cflags: [
+                    "-DOPUS_X86_MAY_HAVE_SSE",
+                    "-DOPUS_X86_PRESUME_SSE",
+                    "-DOPUS_X86_MAY_HAVE_SSE2",
+                    "-DOPUS_X86_PRESUME_SSE2",
+                ],
+            },
+
+            sse4_1: {
+                srcs: [
+                    // CELT_SOURCES_SSE4_1
+                    "celt/x86/celt_lpc_sse4_1.c",
+                    "celt/x86/pitch_sse4_1.c",
+
+                    // SILK_SOURCES_SSE4_1
+                    "silk/x86/NSQ_sse4_1.c",
+                    "silk/x86/NSQ_del_dec_sse4_1.c",
+                    "silk/x86/x86_silk_map.c",
+                    "silk/x86/VAD_sse4_1.c",
+                    "silk/x86/VQ_WMat_EC_sse4_1.c",
+
+                    // SILK_SOURCES_FIXED_SSE4_1
+                    "silk/fixed/x86/vector_ops_FIX_sse4_1.c",
+                    "silk/fixed/x86/burg_modified_FIX_sse4_1.c",
+                ],
+
+                cflags: [
+                    "-DOPUS_X86_MAY_HAVE_SSE4_1",
+                    "-DOPUS_X86_PRESUME_SSE4_1",
+                ],
+            },
+        },
+    },
+
+    target: {
+        darwin: {
+            enabled: false,
+        },
+    },
+    apex_available: [
+        "//apex_available:platform", // used by libstagefright_soft_opusdec
+        "com.android.media.swcodec",
+    ],
+    min_sdk_version: "29",
+}
diff --git a/CleanSpec.mk b/CleanSpec.mk
new file mode 100644
index 0000000..d531442
--- /dev/null
+++ b/CleanSpec.mk
@@ -0,0 +1,49 @@
+# Copyright (C) 2014 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# If you don't need to do a full clean build but would like to touch
+# a file or delete some intermediate files, add a clean step to the end
+# of the list.  These steps will only be run once, if they haven't been
+# run before.
+#
+# E.g.:
+#     $(call add-clean-step, touch -c external/sqlite/sqlite3.h)
+#     $(call add-clean-step, rm -rf $(PRODUCT_OUT)/obj/STATIC_LIBRARIES/libz_intermediates)
+#
+# Always use "touch -c" and "rm -f" or "rm -rf" to gracefully deal with
+# files that are missing or have been moved.
+#
+# Use $(PRODUCT_OUT) to get to the "out/target/product/blah/" directory.
+# Use $(OUT_DIR) to refer to the "out" directory.
+#
+# If you need to re-do something that's already mentioned, just copy
+# the command and add it to the bottom of the list.  E.g., if a change
+# that you made last week required touching a file and a change you
+# made today requires touching the same file, just copy the old
+# touch step and add it to the end of the list.
+#
+# ************************************************
+# NEWER CLEAN STEPS MUST BE AT THE END OF THE LIST
+# ************************************************
+
+# For example:
+#$(call add-clean-step, rm -rf $(OUT_DIR)/target/common/obj/APPS/AndroidTests_intermediates)
+#$(call add-clean-step, rm -rf $(OUT_DIR)/target/common/obj/JAVA_LIBRARIES/core_intermediates)
+#$(call add-clean-step, find $(OUT_DIR) -type f -name "IGTalkSession*" -print0 | xargs -0 rm -f)
+#$(call add-clean-step, rm -rf $(PRODUCT_OUT)/data/*)
+
+# ************************************************
+# NEWER CLEAN STEPS MUST BE AT THE END OF THE LIST
+# ************************************************
diff --git a/METADATA b/METADATA
new file mode 100644
index 0000000..eb54eee
--- /dev/null
+++ b/METADATA
@@ -0,0 +1,15 @@
+name: "libopus"
+description: "Android fork of the opus library."
+third_party {
+  url {
+    type: GIT
+    value: "https://gitlab.xiph.org/xiph/opus.git"
+  }
+  version: "794392ecd77e6fc6aafa62c3f6002780abcc2c7c"
+  license_type: NOTICE
+  last_upgrade_date {
+    year: 2021
+    month: 1
+    day: 5
+  }
+}
diff --git a/MODULE_LICENSE_BSD b/MODULE_LICENSE_BSD
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/MODULE_LICENSE_BSD
diff --git a/NOTICE b/NOTICE
new file mode 100644
index 0000000..108afb2
--- /dev/null
+++ b/NOTICE
@@ -0,0 +1,27 @@
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+- Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+- Neither the name of Internet Society, IETF or IETF Trust, nor the 
+names of specific contributors, may be used to endorse or promote
+products derived from this software without specific prior written
+permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/OWNERS b/OWNERS
new file mode 100644
index 0000000..229a641
--- /dev/null
+++ b/OWNERS
@@ -0,0 +1,4 @@
+# Default code reviewers picked from top 3 or more developers.
+# Please update this list if you find better candidates.
+flim@google.com
+essick@google.com
diff --git a/PREUPLOAD.cfg b/PREUPLOAD.cfg
new file mode 100644
index 0000000..ecf8b8e
--- /dev/null
+++ b/PREUPLOAD.cfg
@@ -0,0 +1,2 @@
+[Hook Scripts]
+mainline_hook = ${REPO_ROOT}/frameworks/av/tools/mainline_hook_project.sh
diff --git a/README.android b/README.android
new file mode 100644
index 0000000..06fdc0e
--- /dev/null
+++ b/README.android
@@ -0,0 +1,7 @@
+* current source is based on libopus 1.3 (https://git.xiph.org/?p=opus.git;a=snapshot;h=83d5155f151ca47c9d6274ded1a7481f746b9a43;sf=tgz)
+* libopus is BSD-licensed - http://www.opus-codec.org/license/
+
+Updating:
+* Run "convert_android_asm.sh" from the root of the library (external/libopus).
+  This uses 'arm2gnu.pl' included in libopus to convert ARM ASM files to GNU ASM
+  files for building under the Android NDK.
diff --git a/README.version b/README.version
new file mode 100644
index 0000000..515fea8
--- /dev/null
+++ b/README.version
@@ -0,0 +1,7 @@
+This commit is up to date with the following commit in upstream-master
+Branch: upstream-master
+commit: b83dd52868326a401c8578041e3dbea439d53f11
+
+upstream-master is in sync with upstream project at
+URL: https://gitlab.xiph.org/xiph/opus.git
+
diff --git a/celt/arm/armopts_gnu.s b/celt/arm/armopts_gnu.s
new file mode 100644
index 0000000..c7082fc
--- /dev/null
+++ b/celt/arm/armopts_gnu.s
@@ -0,0 +1,38 @@
+    .syntax unified
+/* Copyright (C) 2013 Mozilla Corporation */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+@ Set the following to 1 if we have EDSP instructions
+@  (LDRD/STRD, etc., ARMv5E and later).
+ .set OPUS_ARM_MAY_HAVE_EDSP, 1
+
+@ Set the following to 1 if we have ARMv6 media instructions.
+ .set OPUS_ARM_MAY_HAVE_MEDIA, 1
+
+@ Set the following to 1 if we have NEON (some ARMv7)
+ .set OPUS_ARM_MAY_HAVE_NEON, 1
+
+@ END:
diff --git a/celt/arm/celt_pitch_xcorr_arm_gnu.s b/celt/arm/celt_pitch_xcorr_arm_gnu.s
new file mode 100644
index 0000000..31b0c65
--- /dev/null
+++ b/celt/arm/celt_pitch_xcorr_arm_gnu.s
@@ -0,0 +1,555 @@
+    .syntax unified
+@ Copyright (c) 2007-2008 CSIRO
+@ Copyright (c) 2007-2009 Xiph.Org Foundation
+@ Copyright (c) 2013      Parrot
+@ Written by Aurélien Zanelli
+@
+@ Redistribution and use in source and binary forms, with or without
+@ modification, are permitted provided that the following conditions
+@ are met:
+@
+@ - Redistributions of source code must retain the above copyright
+@ notice, this list of conditions and the following disclaimer.
+@
+@ - Redistributions in binary form must reproduce the above copyright
+@ notice, this list of conditions and the following disclaimer in the
+@ documentation and/or other materials provided with the distribution.
+@
+@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+@ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+@ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+@ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+@ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+@ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+@ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+@ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+@ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+@ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    .text;   .p2align 2;   .arch armv7-a
+   .fpu neon
+   .object_arch armv4t
+
+  .include "celt/arm/armopts_gnu.s"
+
+ .if OPUS_ARM_MAY_HAVE_EDSP
+  .global celt_pitch_xcorr_edsp
+ .endif
+
+ .if OPUS_ARM_MAY_HAVE_NEON
+  .global celt_pitch_xcorr_neon
+ .endif
+
+ .if OPUS_ARM_MAY_HAVE_NEON
+
+@ Compute sum[k]=sum(x[j]*y[j+k],j=0...len-1), k=0...3
+	.type	xcorr_kernel_neon, %function; xcorr_kernel_neon: @ PROC
+xcorr_kernel_neon_start:
+  @ input:
+  @   r3     = int         len
+  @   r4     = opus_val16 *x
+  @   r5     = opus_val16 *y
+  @   q0     = opus_val32  sum[4]
+  @ output:
+  @   q0     = opus_val32  sum[4]
+  @ preserved: r0-r3, r6-r11, d2, q4-q7, q9-q15
+  @ internal usage:
+  @   r12 = int j
+  @   d3  = y_3|y_2|y_1|y_0
+  @   q2  = y_B|y_A|y_9|y_8|y_7|y_6|y_5|y_4
+  @   q3  = x_7|x_6|x_5|x_4|x_3|x_2|x_1|x_0
+  @   q8  = scratch
+  @
+  @ Load y[0...3]
+  @ This requires len>0 to always be valid (which we assert in the C code).
+  VLD1.16      {d5}, [r5]!
+  SUBS         r12, r3, #8        @ j = len-8
+  BLE xcorr_kernel_neon_process4  @ 8 or fewer samples: skip the 8-wide loop
+@ Process 8 samples at a time.
+@ This loop loads one y value more than we actually need. Therefore we have to
+@ stop as soon as there are 8 or fewer samples left (instead of 7), to avoid
+@ reading past the end of the array.
+xcorr_kernel_neon_process8:
+  @ This loop has 19 total instructions (10 cycles to issue, minimum), with
+  @ - 2 cycles of ARM instructions,
+  @ - 10 cycles of load/store/byte permute instructions, and
+  @ - 9 cycles of data processing instructions.
+  @ On a Cortex A8, we dual-issue the maximum amount (9 cycles) between the
+  @ latter two categories, meaning the whole loop should run in 10 cycles per
+  @ iteration, barring cache misses.
+  @
+  @ Load x[0...7]
+  VLD1.16      {d6, d7}, [r4]!
+  @ Unlike VMOV, VAND is a data processing instruction (and doesn't get
+  @ assembled to VMOV, like VORR would), so it dual-issues with the prior VLD1.
+  VAND         d3, d5, d5
+  SUBS         r12, r12, #8       @ j -= 8 (sets the flags tested by BGT below)
+  @ Load y[4...11]
+  VLD1.16      {d4, d5}, [r5]!
+  VMLAL.S16    q0, d3, d6[0]
+  VEXT.16      d16, d3, d4, #1
+  VMLAL.S16    q0, d4, d7[0]
+  VEXT.16      d17, d4, d5, #1
+  VMLAL.S16    q0, d16, d6[1]
+  VEXT.16      d16, d3, d4, #2
+  VMLAL.S16    q0, d17, d7[1]
+  VEXT.16      d17, d4, d5, #2
+  VMLAL.S16    q0, d16, d6[2]
+  VEXT.16      d16, d3, d4, #3
+  VMLAL.S16    q0, d17, d7[2]
+  VEXT.16      d17, d4, d5, #3
+  VMLAL.S16    q0, d16, d6[3]
+  VMLAL.S16    q0, d17, d7[3]
+  BGT xcorr_kernel_neon_process8
+@ Process 4 samples here if we have > 4 left (still reading one extra y value).
+xcorr_kernel_neon_process4:
+  ADDS         r12, r12, #4       @ r12 = (samples left) - 4
+  BLE xcorr_kernel_neon_process2
+  @ Load x[0...3]
+  VLD1.16      d6, [r4]!
+  @ Use VAND since it's a data processing instruction again.
+  VAND         d4, d5, d5
+  SUB          r12, r12, #4
+  @ Load y[4...7]
+  VLD1.16      d5, [r5]!
+  VMLAL.S16    q0, d4, d6[0]
+  VEXT.16      d16, d4, d5, #1
+  VMLAL.S16    q0, d16, d6[1]
+  VEXT.16      d16, d4, d5, #2
+  VMLAL.S16    q0, d16, d6[2]
+  VEXT.16      d16, d4, d5, #3
+  VMLAL.S16    q0, d16, d6[3]
+@ Process 2 samples here if we have > 2 left (still reading one extra y value).
+xcorr_kernel_neon_process2:
+  ADDS         r12, r12, #2       @ r12 = (samples left) - 2
+  BLE xcorr_kernel_neon_process1
+  @ Load x[0...1]
+  VLD2.16      {d6[],d7[]}, [r4]!
+  @ Use VAND since it's a data processing instruction again.
+  VAND         d4, d5, d5
+  SUB          r12, r12, #2
+  @ Load y[4...5]
+  VLD1.32      {d5[]}, [r5]!
+  VMLAL.S16    q0, d4, d6
+  VEXT.16      d16, d4, d5, #1
+  @ Replace bottom copy of {y5,y4} in d5 with {y3,y2} from d4, using VSRI
+  @ instead of VEXT, since it's a data-processing instruction.
+  VSRI.64      d5, d4, #32
+  VMLAL.S16    q0, d16, d7
+@ Process 1 sample using the extra y value we loaded above.
+xcorr_kernel_neon_process1:
+  @ Load next *x
+  VLD1.16      {d6[]}, [r4]!
+  ADDS         r12, r12, #1       @ r12 = (samples left) - 1
+  @ y[0...3] are left in d5 from prior iteration(s) (if any)
+  VMLAL.S16    q0, d5, d6
+  MOVLE        pc, lr             @ return if that was the last sample
+@ Now process 1 last sample, not reading ahead.
+  @ Load last *y
+  VLD1.16      {d4[]}, [r5]!
+  VSRI.64      d4, d5, #16
+  @ Load last *x
+  VLD1.16      {d6[]}, [r4]!
+  VMLAL.S16    q0, d4, d6
+  MOV          pc, lr             @ return
+	.size xcorr_kernel_neon, .-xcorr_kernel_neon  @ ENDP
+
+@ opus_val32 celt_pitch_xcorr_neon(opus_val16 *_x, opus_val16 *_y,
+@  opus_val32 *xcorr, int len, int max_pitch, int arch)
+	.type	celt_pitch_xcorr_neon, %function; celt_pitch_xcorr_neon: @ PROC
+  @ input:
+  @   r0  = opus_val16 *_x
+  @   r1  = opus_val16 *_y
+  @   r2  = opus_val32 *xcorr
+  @   r3  = int         len
+  @ output:
+  @   r0  = int         maxcorr
+  @ internal usage:
+  @   r4  = opus_val16 *x (for xcorr_kernel_neon())
+  @   r5  = opus_val16 *y (for xcorr_kernel_neon())
+  @   r6  = int         max_pitch
+  @   r12 = int         j
+  @   q15 = int         maxcorr[4] (q15 is not used by xcorr_kernel_neon())
+  @ ignored:
+  @         int         arch
+  STMFD        sp!, {r4-r6, lr}
+  LDR          r6, [sp, #16]      @ r6 = max_pitch (stack arg above the 4 saved regs)
+  VMOV.S32     q15, #1            @ maxcorr[0...3] = 1
+  @ if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done
+  SUBS         r6, r6, #4
+  BLT celt_pitch_xcorr_neon_process4_done
+celt_pitch_xcorr_neon_process4:
+  @ xcorr_kernel_neon parameters:
+  @ r3 = len, r4 = _x, r5 = _y, q0 = {0, 0, 0, 0}
+  MOV          r4, r0
+  MOV          r5, r1
+  VEOR         q0, q0, q0
+  @ xcorr_kernel_neon only modifies r4, r5, r12, q0...q3 (except d2), and q8.
+  @ So we don't save/restore any other registers.
+  BL xcorr_kernel_neon_start
+  SUBS         r6, r6, #4
+  VST1.32      {q0}, [r2]!
+  @ _y += 4
+  ADD          r1, r1, #8
+  VMAX.S32     q15, q15, q0
+  @ if (max_pitch >= 4) goto celt_pitch_xcorr_neon_process4
+  BGE celt_pitch_xcorr_neon_process4
+@ We have less than 4 sums left to compute.
+celt_pitch_xcorr_neon_process4_done:
+  ADDS         r6, r6, #4
+  @ Reduce maxcorr to a single value
+  VMAX.S32     d30, d30, d31
+  VPMAX.S32    d30, d30, d30
+  @ if (max_pitch <= 0) goto celt_pitch_xcorr_neon_done
+  BLE celt_pitch_xcorr_neon_done
+@ Now compute each remaining sum one at a time.
+celt_pitch_xcorr_neon_process_remaining:
+  MOV          r4, r0
+  MOV          r5, r1
+  VMOV.I32     q0, #0
+  SUBS         r12, r3, #8        @ j = len-8
+  BLT celt_pitch_xcorr_neon_process_remaining4
+@ Sum terms 8 at a time.
+celt_pitch_xcorr_neon_process_remaining_loop8:
+  @ Load x[0...7]
+  VLD1.16      {q1}, [r4]!
+  @ Load y[0...7]
+  VLD1.16      {q2}, [r5]!
+  SUBS         r12, r12, #8
+  VMLAL.S16    q0, d4, d2
+  VMLAL.S16    q0, d5, d3
+  BGE celt_pitch_xcorr_neon_process_remaining_loop8
+@ Sum terms 4 at a time.
+celt_pitch_xcorr_neon_process_remaining4:
+  ADDS         r12, r12, #4
+  BLT celt_pitch_xcorr_neon_process_remaining4_done
+  @ Load x[0...3]
+  VLD1.16      {d2}, [r4]!
+  @ Load y[0...3]
+  VLD1.16      {d3}, [r5]!
+  SUB          r12, r12, #4
+  VMLAL.S16    q0, d3, d2
+celt_pitch_xcorr_neon_process_remaining4_done:
+  @ Reduce the sum to a single value.
+  VADD.S32     d0, d0, d1
+  VPADDL.S32   d0, d0
+  ADDS         r12, r12, #4       @ r12 = number of terms still to add
+  BLE celt_pitch_xcorr_neon_process_remaining_loop_done
+@ Sum terms 1 at a time.
+celt_pitch_xcorr_neon_process_remaining_loop1:
+  VLD1.16      {d2[]}, [r4]!
+  VLD1.16      {d3[]}, [r5]!
+  SUBS         r12, r12, #1
+  VMLAL.S16    q0, d2, d3
+  BGT celt_pitch_xcorr_neon_process_remaining_loop1
+celt_pitch_xcorr_neon_process_remaining_loop_done:
+  VST1.32      {d0[0]}, [r2]!
+  VMAX.S32     d30, d30, d0
+  SUBS         r6, r6, #1
+  @ _y++
+  ADD          r1, r1, #2
+  @ if (--max_pitch > 0) goto celt_pitch_xcorr_neon_process_remaining
+  BGT celt_pitch_xcorr_neon_process_remaining
+celt_pitch_xcorr_neon_done:
+  VMOV.32      r0, d30[0]         @ return value: maxcorr
+  LDMFD        sp!, {r4-r6, pc}
+	.size celt_pitch_xcorr_neon, .-celt_pitch_xcorr_neon  @ ENDP
+
+ .endif
+
+ .if OPUS_ARM_MAY_HAVE_EDSP
+
+@ This will get used on ARMv7 devices without NEON, so it has been optimized
+@ to take advantage of dual-issuing where possible.
+	.type	xcorr_kernel_edsp, %function; xcorr_kernel_edsp: @ PROC
+xcorr_kernel_edsp_start:
+  @ input:
+  @   r3      = int         len
+  @   r4      = opus_val16 *_x (must be 32-bit aligned)
+  @   r5      = opus_val16 *_y (must be 32-bit aligned)
+  @   r6...r9 = opus_val32  sum[4]
+  @ output:
+  @   r6...r9 = opus_val32  sum[4]
+  @ preserved: r0-r5
+  @ internal usage
+  @   r2      = int         j
+  @   r12,r14 = opus_val16  x[4]
+  @   r10,r11 = opus_val16  y[4]
+  STMFD        sp!, {r2,r4,r5,lr} @ r2, r4 and r5 are restored on exit
+  LDR          r10, [r5], #4      @ Load y[0...1]
+  SUBS         r2, r3, #4         @ j = len-4
+  LDR          r11, [r5], #4      @ Load y[2...3]
+  BLE xcorr_kernel_edsp_process4_done
+  LDR          r12, [r4], #4      @ Load x[0...1]
+  @ Stall
+xcorr_kernel_edsp_process4:
+  @ The multiplies must issue from pipeline 0, and can't dual-issue with each
+  @ other. Every other instruction here dual-issues with a multiply, and is
+  @ thus "free". There should be no stalls in the body of the loop.
+  SMLABB       r6, r12, r10, r6   @ sum[0] = MAC16_16(sum[0],x_0,y_0)
+  LDR          r14, [r4], #4      @ Load x[2...3]
+  SMLABT       r7, r12, r10, r7   @ sum[1] = MAC16_16(sum[1],x_0,y_1)
+  SUBS         r2, r2, #4         @ j-=4
+  SMLABB       r8, r12, r11, r8   @ sum[2] = MAC16_16(sum[2],x_0,y_2)
+  SMLABT       r9, r12, r11, r9   @ sum[3] = MAC16_16(sum[3],x_0,y_3)
+  SMLATT       r6, r12, r10, r6   @ sum[0] = MAC16_16(sum[0],x_1,y_1)
+  LDR          r10, [r5], #4      @ Load y[4...5]
+  SMLATB       r7, r12, r11, r7   @ sum[1] = MAC16_16(sum[1],x_1,y_2)
+  SMLATT       r8, r12, r11, r8   @ sum[2] = MAC16_16(sum[2],x_1,y_3)
+  SMLATB       r9, r12, r10, r9   @ sum[3] = MAC16_16(sum[3],x_1,y_4)
+  LDRGT        r12, [r4], #4      @ Load x[0...1]
+  SMLABB       r6, r14, r11, r6   @ sum[0] = MAC16_16(sum[0],x_2,y_2)
+  SMLABT       r7, r14, r11, r7   @ sum[1] = MAC16_16(sum[1],x_2,y_3)
+  SMLABB       r8, r14, r10, r8   @ sum[2] = MAC16_16(sum[2],x_2,y_4)
+  SMLABT       r9, r14, r10, r9   @ sum[3] = MAC16_16(sum[3],x_2,y_5)
+  SMLATT       r6, r14, r11, r6   @ sum[0] = MAC16_16(sum[0],x_3,y_3)
+  LDR          r11, [r5], #4      @ Load y[6...7]
+  SMLATB       r7, r14, r10, r7   @ sum[1] = MAC16_16(sum[1],x_3,y_4)
+  SMLATT       r8, r14, r10, r8   @ sum[2] = MAC16_16(sum[2],x_3,y_5)
+  SMLATB       r9, r14, r11, r9   @ sum[3] = MAC16_16(sum[3],x_3,y_6)
+  BGT xcorr_kernel_edsp_process4
+xcorr_kernel_edsp_process4_done:
+  ADDS         r2, r2, #4
+  BLE xcorr_kernel_edsp_done
+  LDRH         r12, [r4], #2      @ r12 = *x++
+  SUBS         r2, r2, #1         @ j--
+  @ Stall
+  SMLABB       r6, r12, r10, r6   @ sum[0] = MAC16_16(sum[0],x,y_0)
+  LDRHGT       r14, [r4], #2      @ r14 = *x++
+  SMLABT       r7, r12, r10, r7   @ sum[1] = MAC16_16(sum[1],x,y_1)
+  SMLABB       r8, r12, r11, r8   @ sum[2] = MAC16_16(sum[2],x,y_2)
+  SMLABT       r9, r12, r11, r9   @ sum[3] = MAC16_16(sum[3],x,y_3)
+  BLE xcorr_kernel_edsp_done
+  SMLABT       r6, r14, r10, r6   @ sum[0] = MAC16_16(sum[0],x,y_1)
+  SUBS         r2, r2, #1         @ j--
+  SMLABB       r7, r14, r11, r7   @ sum[1] = MAC16_16(sum[1],x,y_2)
+  LDRH         r10, [r5], #2      @ r10 = y_4 = *y++
+  SMLABT       r8, r14, r11, r8   @ sum[2] = MAC16_16(sum[2],x,y_3)
+  LDRHGT       r12, [r4], #2      @ r12 = *x++
+  SMLABB       r9, r14, r10, r9   @ sum[3] = MAC16_16(sum[3],x,y_4)
+  BLE xcorr_kernel_edsp_done
+  SMLABB       r6, r12, r11, r6   @ sum[0] = MAC16_16(sum[0],tmp,y_2)
+  CMP          r2, #1             @ flags for j-1 only; r2 is reused for y_5 below
+  SMLABT       r7, r12, r11, r7   @ sum[1] = MAC16_16(sum[1],tmp,y_3)
+  LDRH         r2, [r5], #2       @ r2 = y_5 = *y++
+  SMLABB       r8, r12, r10, r8   @ sum[2] = MAC16_16(sum[2],tmp,y_4)
+  LDRHGT       r14, [r4]          @ r14 = *x
+  SMLABB       r9, r12, r2, r9    @ sum[3] = MAC16_16(sum[3],tmp,y_5)
+  BLE xcorr_kernel_edsp_done
+  SMLABT       r6, r14, r11, r6   @ sum[0] = MAC16_16(sum[0],tmp,y_3)
+  LDRH         r11, [r5]          @ r11 = y_6 = *y
+  SMLABB       r7, r14, r10, r7   @ sum[1] = MAC16_16(sum[1],tmp,y_4)
+  SMLABB       r8, r14, r2, r8    @ sum[2] = MAC16_16(sum[2],tmp,y_5)
+  SMLABB       r9, r14, r11, r9   @ sum[3] = MAC16_16(sum[3],tmp,y_6)
+xcorr_kernel_edsp_done:
+  LDMFD        sp!, {r2,r4,r5,pc} @ restore r2, r4, r5 and return
+	.size xcorr_kernel_edsp, .-xcorr_kernel_edsp  @ ENDP
+
+	.type	celt_pitch_xcorr_edsp, %function; celt_pitch_xcorr_edsp: @ PROC
+  @ For each of max_pitch lags i, accumulates
+  @   sum = MAC16_16(sum, x[j], y[i+j]) over len samples,
+  @ stores the sum in xcorr[i] and returns the largest sum seen (at least 1,
+  @ the initial value of maxcorr).
+  @ input:
+  @   r0  = opus_val16 *_x (must be 32-bit aligned)
+  @   r1  = opus_val16 *_y (only needs to be 16-bit aligned)
+  @   r2  = opus_val32 *xcorr
+  @   r3  = int         len
+  @   [sp] = int        max_pitch (stack argument, loaded into r1 below)
+  @ output:
+  @   r0  = maxcorr
+  @ internal usage
+  @   r4  = opus_val16 *x
+  @   r5  = opus_val16 *y
+  @   r6  = opus_val32  sum0
+  @   r7  = opus_val32  sum1
+  @   r8  = opus_val32  sum2
+  @   r9  = opus_val32  sum3
+  @   r1  = int         max_pitch
+  @   r12 = int         j
+  @ ignored:
+  @         int         arch
+  STMFD        sp!, {r4-r11, lr}  @ Push 9 registers (36 bytes)
+  MOV          r5, r1             @ r5 = y
+  LDR          r1, [sp, #36]      @ r1 = max_pitch, just above the 36 bytes pushed
+  MOV          r4, r0             @ r4 = x
+  TST          r5, #3             @ Is y 32-bit aligned?
+  @ maxcorr = 1
+  MOV          r0, #1
+  BEQ          celt_pitch_xcorr_edsp_process1u_done
+@ Compute one sum at the start to make y 32-bit aligned.
+  SUBS         r12, r3, #4
+  @ r14 = sum = 0
+  MOV          r14, #0
+  LDRH         r8, [r5], #2
+  BLE celt_pitch_xcorr_edsp_process1u_loop4_done
+  LDR          r6, [r4], #4
+  MOV          r8, r8, LSL #16      @ Put y_0 in the top half, where SMLABT reads it
+celt_pitch_xcorr_edsp_process1u_loop4:
+  LDR          r9, [r5], #4
+  SMLABT       r14, r6, r8, r14     @ sum = MAC16_16(sum, x_0, y_0)
+  LDR          r7, [r4], #4
+  SMLATB       r14, r6, r9, r14     @ sum = MAC16_16(sum, x_1, y_1)
+  LDR          r8, [r5], #4
+  SMLABT       r14, r7, r9, r14     @ sum = MAC16_16(sum, x_2, y_2)
+  SUBS         r12, r12, #4         @ j-=4
+  SMLATB       r14, r7, r8, r14     @ sum = MAC16_16(sum, x_3, y_3)
+  LDRGT        r6, [r4], #4
+  BGT celt_pitch_xcorr_edsp_process1u_loop4
+  MOV          r8, r8, LSR #16      @ Move the pending y sample to the bottom half
+celt_pitch_xcorr_edsp_process1u_loop4_done:
+  ADDS         r12, r12, #4
+celt_pitch_xcorr_edsp_process1u_loop1:
+  LDRHGE       r6, [r4], #2
+  @ Stall
+  SMLABBGE     r14, r6, r8, r14    @ sum = MAC16_16(sum, *x, *y)
+  SUBSGE       r12, r12, #1
+  LDRHGT       r8, [r5], #2
+  BGT celt_pitch_xcorr_edsp_process1u_loop1
+  @ Restore _x
+  SUB          r4, r4, r3, LSL #1
+  @ Restore and advance _y
+  SUB          r5, r5, r3, LSL #1
+  @ maxcorr = max(maxcorr, sum)
+  CMP          r0, r14
+  ADD          r5, r5, #2
+  MOVLT        r0, r14
+  SUBS         r1, r1, #1
+  @ xcorr[i] = sum
+  STR          r14, [r2], #4
+  BLE celt_pitch_xcorr_edsp_done
+celt_pitch_xcorr_edsp_process1u_done:
+  @ if (max_pitch < 4) goto celt_pitch_xcorr_edsp_process2
+  SUBS         r1, r1, #4
+  BLT celt_pitch_xcorr_edsp_process2
+celt_pitch_xcorr_edsp_process4:
+  @ xcorr_kernel_edsp parameters:
+  @ r3 = len, r4 = _x, r5 = _y, r6...r9 = sum[4] = {0, 0, 0, 0}
+  MOV          r6, #0
+  MOV          r7, #0
+  MOV          r8, #0
+  MOV          r9, #0
+  BL xcorr_kernel_edsp_start  @ xcorr_kernel_edsp(_x, _y+i, xcorr+i, len)
+  @ maxcorr = max(maxcorr, sum0, sum1, sum2, sum3)
+  CMP          r0, r6
+  @ _y+=4
+  ADD          r5, r5, #8
+  MOVLT        r0, r6
+  CMP          r0, r7
+  MOVLT        r0, r7
+  CMP          r0, r8
+  MOVLT        r0, r8
+  CMP          r0, r9
+  MOVLT        r0, r9
+  STMIA        r2!, {r6-r9}
+  SUBS         r1, r1, #4
+  BGE celt_pitch_xcorr_edsp_process4
+celt_pitch_xcorr_edsp_process2:
+  ADDS         r1, r1, #2           @ r1 = (lags left) - 2
+  BLT celt_pitch_xcorr_edsp_process1a
+  SUBS         r12, r3, #4
+  @ {r10, r11} = {sum0, sum1} = {0, 0}
+  MOV          r10, #0
+  MOV          r11, #0
+  LDR          r8, [r5], #4
+  BLE celt_pitch_xcorr_edsp_process2_loop_done
+  LDR          r6, [r4], #4
+  LDR          r9, [r5], #4
+celt_pitch_xcorr_edsp_process2_loop4:
+  SMLABB       r10, r6, r8, r10     @ sum0 = MAC16_16(sum0, x_0, y_0)
+  LDR          r7, [r4], #4
+  SMLABT       r11, r6, r8, r11     @ sum1 = MAC16_16(sum1, x_0, y_1)
+  SUBS         r12, r12, #4         @ j-=4
+  SMLATT       r10, r6, r8, r10     @ sum0 = MAC16_16(sum0, x_1, y_1)
+  LDR          r8, [r5], #4
+  SMLATB       r11, r6, r9, r11     @ sum1 = MAC16_16(sum1, x_1, y_2)
+  LDRGT        r6, [r4], #4
+  SMLABB       r10, r7, r9, r10     @ sum0 = MAC16_16(sum0, x_2, y_2)
+  SMLABT       r11, r7, r9, r11     @ sum1 = MAC16_16(sum1, x_2, y_3)
+  SMLATT       r10, r7, r9, r10     @ sum0 = MAC16_16(sum0, x_3, y_3)
+  LDRGT        r9, [r5], #4
+  SMLATB       r11, r7, r8, r11     @ sum1 = MAC16_16(sum1, x_3, y_4)
+  BGT celt_pitch_xcorr_edsp_process2_loop4
+celt_pitch_xcorr_edsp_process2_loop_done:
+  ADDS         r12, r12, #2
+  BLE  celt_pitch_xcorr_edsp_process2_1
+  LDR          r6, [r4], #4
+  @ Stall
+  SMLABB       r10, r6, r8, r10     @ sum0 = MAC16_16(sum0, x_0, y_0)
+  LDR          r9, [r5], #4
+  SMLABT       r11, r6, r8, r11     @ sum1 = MAC16_16(sum1, x_0, y_1)
+  SUB          r12, r12, #2
+  SMLATT       r10, r6, r8, r10     @ sum0 = MAC16_16(sum0, x_1, y_1)
+  MOV          r8, r9
+  SMLATB       r11, r6, r9, r11     @ sum1 = MAC16_16(sum1, x_1, y_2)
+celt_pitch_xcorr_edsp_process2_1:
+  LDRH         r6, [r4], #2
+  ADDS         r12, r12, #1
+  @ Stall
+  SMLABB       r10, r6, r8, r10     @ sum0 = MAC16_16(sum0, x_0, y_0)
+  LDRHGT       r7, [r4], #2
+  SMLABT       r11, r6, r8, r11     @ sum1 = MAC16_16(sum1, x_0, y_1)
+  BLE celt_pitch_xcorr_edsp_process2_done
+  LDRH         r9, [r5], #2
+  SMLABT       r10, r7, r8, r10     @ sum0 = MAC16_16(sum0, x_1, y_1)
+  SMLABB       r11, r7, r9, r11     @ sum1 = MAC16_16(sum1, x_1, y_2)
+celt_pitch_xcorr_edsp_process2_done:
+  @ Restore _x
+  SUB          r4, r4, r3, LSL #1
+  @ Restore and advance _y
+  SUB          r5, r5, r3, LSL #1
+  @ maxcorr = max(maxcorr, sum0)
+  CMP          r0, r10
+  ADD          r5, r5, #2
+  MOVLT        r0, r10
+  SUB          r1, r1, #2
+  @ maxcorr = max(maxcorr, sum1)
+  CMP          r0, r11
+  @ xcorr[i] = sum
+  STR          r10, [r2], #4
+  MOVLT        r0, r11
+  STR          r11, [r2], #4
+celt_pitch_xcorr_edsp_process1a:
+  ADDS         r1, r1, #1           @ r1 = (lags left) - 1
+  BLT celt_pitch_xcorr_edsp_done
+  SUBS         r12, r3, #4
+  @ r14 = sum = 0
+  MOV          r14, #0
+  BLT celt_pitch_xcorr_edsp_process1a_loop_done
+  LDR          r6, [r4], #4
+  LDR          r8, [r5], #4
+  LDR          r7, [r4], #4
+  LDR          r9, [r5], #4
+celt_pitch_xcorr_edsp_process1a_loop4:
+  SMLABB       r14, r6, r8, r14     @ sum = MAC16_16(sum, x_0, y_0)
+  SUBS         r12, r12, #4         @ j-=4
+  SMLATT       r14, r6, r8, r14     @ sum = MAC16_16(sum, x_1, y_1)
+  LDRGE        r6, [r4], #4
+  SMLABB       r14, r7, r9, r14     @ sum = MAC16_16(sum, x_2, y_2)
+  LDRGE        r8, [r5], #4
+  SMLATT       r14, r7, r9, r14     @ sum = MAC16_16(sum, x_3, y_3)
+  LDRGE        r7, [r4], #4
+  LDRGE        r9, [r5], #4
+  BGE celt_pitch_xcorr_edsp_process1a_loop4
+celt_pitch_xcorr_edsp_process1a_loop_done:
+  ADDS         r12, r12, #2
+  LDRGE        r6, [r4], #4
+  LDRGE        r8, [r5], #4
+  @ Stall
+  SMLABBGE     r14, r6, r8, r14     @ sum = MAC16_16(sum, x_0, y_0)
+  SUBGE        r12, r12, #2
+  SMLATTGE     r14, r6, r8, r14     @ sum = MAC16_16(sum, x_1, y_1)
+  ADDS         r12, r12, #1
+  LDRHGE       r6, [r4], #2
+  LDRHGE       r8, [r5], #2
+  @ Stall
+  SMLABBGE     r14, r6, r8, r14     @ sum = MAC16_16(sum, *x, *y)
+  @ maxcorr = max(maxcorr, sum)
+  CMP          r0, r14
+  @ xcorr[i] = sum
+  STR          r14, [r2], #4
+  MOVLT        r0, r14
+celt_pitch_xcorr_edsp_done:
+  LDMFD        sp!, {r4-r11, pc}
+	.size celt_pitch_xcorr_edsp, .-celt_pitch_xcorr_edsp  @ ENDP
+
+ .endif
+
+@ END:
+    .section	.note.GNU-stack,"",%progbits
diff --git a/convert_android_asm.sh b/convert_android_asm.sh
new file mode 100755
index 0000000..ea3d198
--- /dev/null
+++ b/convert_android_asm.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+#
+# Runs celt/arm/arm2gnu.pl over every *.s file below the current directory,
+# writing the result to <name>_gnu.s and deleting the input, then generates
+# celt/arm/armopts_gnu.s from celt/arm/armopts.s.in with every
+# OPUS_ARM_MAY_HAVE_* option set to 1.
+#
+# Must be run from external/libopus; exits with status 1 otherwise.
+
+set -e
+ASM_CONVERTER="./celt/arm/arm2gnu.pl"
+
+if [[ ! -x "${ASM_CONVERTER}" ]]; then
+  # Report on stderr and fail, so callers and CI notice the wrong directory.
+  echo "This script should be run from external/libopus." >&2
+  exit 1
+fi
+
+# IFS= and -r keep whitespace and backslashes in file names intact.
+while IFS= read -r file; do
+  # This check is required because the ASM conversion script doesn't seem to be
+  # idempotent.
+  if [[ ! "${file}" =~ .*_gnu\.s$ ]]; then
+    gnu_file="${file%.s}_gnu.s"
+    ${ASM_CONVERTER} "${file}" > "${gnu_file}"
+    # The ASM conversion script replaces includes with *-gnu.S. So, replace
+    # occurrences of "*-gnu.S" with "*_gnu.s".
+    sed -i "s/-gnu\.S/_gnu\.s/g" "${gnu_file}"
+    rm -f "${file}"
+  fi
+done < <(find . -iname '*.s')
+
+# Generate armopts.s from armopts.s.in
+sed \
+  -e "s/@OPUS_ARM_MAY_HAVE_EDSP@/1/g" \
+  -e "s/@OPUS_ARM_MAY_HAVE_MEDIA@/1/g" \
+  -e "s/@OPUS_ARM_MAY_HAVE_NEON@/1/g" \
+  -e "s/@OPUS_ARM_MAY_HAVE_NEON_INTR@/1/g" \
+	celt/arm/armopts.s.in > celt/arm/armopts.s.temp
+${ASM_CONVERTER} "celt/arm/armopts.s.temp" > "celt/arm/armopts_gnu.s"
+rm "celt/arm/armopts.s.temp"
+echo "Converted all ASM files and generated armopts.s successfully."
diff --git a/fuzzer/Android.bp b/fuzzer/Android.bp
new file mode 100644
index 0000000..35dad5c
--- /dev/null
+++ b/fuzzer/Android.bp
@@ -0,0 +1,61 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2020 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+ */
+
+// Fuzzer for the single-stream decoder; built from opus_dec_fuzzer.cpp without
+// the MULTISTREAM define.
+cc_fuzz {
+    name: "opus_dec_fuzzer",
+    host_supported: true,
+    srcs: ["opus_dec_fuzzer.cpp"],
+    static_libs: ["libopus"],
+    fuzz_config: {
+        cc: ["android-media-fuzzing-reports@google.com"],
+        componentid: 155276,
+    },
+}
+
+// Fuzzer for the multistream decoder; same source as opus_dec_fuzzer, compiled
+// with -DMULTISTREAM.
+cc_fuzz {
+    name: "opus_multistream_dec_fuzzer",
+    host_supported: true,
+    srcs: ["opus_dec_fuzzer.cpp"],
+    cflags: ["-DMULTISTREAM"],
+    static_libs: ["libopus"],
+    fuzz_config: {
+        cc: ["android-media-fuzzing-reports@google.com"],
+        componentid: 155276,
+    },
+}
diff --git a/fuzzer/README.md b/fuzzer/README.md
new file mode 100644
index 0000000..cd6a680
--- /dev/null
+++ b/fuzzer/README.md
@@ -0,0 +1,62 @@
+# Fuzzer for libopus decoder
+
+## Plugin Design Considerations
+The fuzzer plugin for opus decoder is designed based on the understanding of the
+codec and tries to achieve the following:
+
+##### Maximize code coverage
+
+This fuzzer supports both single-stream and multistream inputs, which enables
+fuzzing of the APIs provided for single-stream as well as multistream
+decoding.
+
+Following arguments are passed to OPUS_DEC_CREATE_API:
+
+1. Sampling frequency (parameter name: `Fs`)
+2. Number of channels (parameter name: `channels`)
+
+| Parameter| Valid Values| Configured Value|
+|------------- |-------------| ----- |
+| `Fs` | `8000 ` `12000 ` `16000 ` `24000 ` `48000 ` | Derived from Byte-9 of input stream|
+| `channels`   | `1 ` `2 ` | Derived from Byte-9 of input stream |
+
+##### Maximize utilization of input data
+The plugin feeds the entire input data to the codec. Frame sizes are determined only
+after a call to the extractor, so in the absence of such a call
+we feed the entire data to the decoder.
+This ensures that the plugin tolerates any kind of input (empty, huge,
+malformed, etc.) and doesn't `exit()` on any input, thereby increasing the
+chance of identifying vulnerabilities.
+
+## Build
+
+This describes the steps to build the opus_dec_fuzzer and opus_multistream_dec_fuzzer binaries.
+
+## Android
+
+### Steps to build
+Build the fuzzer
+```
+  $ mm -j$(nproc) opus_dec_fuzzer
+  $ mm -j$(nproc) opus_multistream_dec_fuzzer
+```
+
+### Steps to run
+Create a directory CORPUS_DIR and copy some opus files to that folder.
+Push this directory to device.
+
+To run on device
+```
+  $ adb sync data
+  $ adb shell /data/fuzz/arm64/opus_dec_fuzzer/opus_dec_fuzzer CORPUS_DIR
+  $ adb shell /data/fuzz/arm64/opus_multistream_dec_fuzzer/opus_multistream_dec_fuzzer CORPUS_DIR
+```
+To run on host
+```
+  $ $ANDROID_HOST_OUT/fuzz/x86_64/opus_dec_fuzzer/opus_dec_fuzzer CORPUS_DIR
+  $ $ANDROID_HOST_OUT/fuzz/x86_64/opus_multistream_dec_fuzzer/opus_multistream_dec_fuzzer CORPUS_DIR
+```
+
+## References:
+ * http://llvm.org/docs/LibFuzzer.html
+ * https://github.com/google/oss-fuzz
diff --git a/fuzzer/opus_dec_fuzzer.cpp b/fuzzer/opus_dec_fuzzer.cpp
new file mode 100644
index 0000000..23bf69e
--- /dev/null
+++ b/fuzzer/opus_dec_fuzzer.cpp
@@ -0,0 +1,124 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2020 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+ */
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <opus.h>
+
+/* Offset of the input byte that initDecoder() reads as the packet TOC byte.
+ * 4 bytes: packet length, 4 bytes: encoder final range */
+constexpr int kSetupByteOffset = 8;
+/* Capacity of the PCM output buffer, in samples per channel; also passed to the
+ * decode call as the frame size. */
+constexpr int kMaxFrameSample = 5760;
+/* Decoder sampling rates, indexed by (packet bandwidth - OPUS_BANDWIDTH_NARROWBAND). */
+const int kSamplingRates[] = {8000, 12000, 16000, 24000, 48000};
+constexpr int kNumberSamplingRates = sizeof(kSamplingRates) / sizeof(kSamplingRates[0]);
+
+#ifdef MULTISTREAM
+#include "opus_multistream.h"
+#define OPUS_DEC_DATA_TYPE OpusMSDecoder
+#define OPUS_DEC_DECODE_API opus_multistream_decode
+#define OPUS_DEC_CREATE_API ms_opus_decoder_create
+#define OPUS_DEC_DESTROY_API opus_multistream_decoder_destroy
+static OpusMSDecoder *ms_opus_decoder_create(opus_int32 Fs, int channels, int *error) {
+  int streams = 1;
+  int coupledStreams = channels == 2;
+  unsigned char mapping[256] = {0, 1};
+  return opus_multistream_decoder_create(Fs, channels, streams, coupledStreams, mapping, error);
+}
+#else
+#define OPUS_DEC_DATA_TYPE OpusDecoder
+#define OPUS_DEC_DECODE_API opus_decode
+#define OPUS_DEC_CREATE_API opus_decoder_create
+#define OPUS_DEC_DESTROY_API opus_decoder_destroy
+#endif
+
+/*
+ * Owns one decoder instance and the PCM buffer it decodes into. Both are
+ * released by deInitDecoder(), which the destructor calls.
+ */
+class Codec {
+ public:
+  Codec() = default;
+  ~Codec() { deInitDecoder(); }
+
+  // The object owns raw pointers that the destructor frees, so a copy would
+  // release them twice.
+  Codec(const Codec &) = delete;
+  Codec &operator=(const Codec &) = delete;
+
+  // Creates the decoder from the TOC byte at data[kSetupByteOffset];
+  // returns false on failure.
+  bool initDecoder(const uint8_t *data);
+  // Decodes `size` bytes starting at `data` into the internal PCM buffer.
+  void decodeFrames(const uint8_t *data, size_t size);
+  // Destroys the decoder and frees the PCM buffer.
+  void deInitDecoder();
+
+ private:
+  int mSamplingRate = 0;  // set by initDecoder()
+  int mNoOfChannels = 0;  // set by initDecoder()
+  OPUS_DEC_DATA_TYPE *mDec = nullptr;
+  opus_int16 *mPcm = nullptr;
+};
+
+/*
+ * Chooses the sampling rate and channel count from the TOC byte at
+ * data[kSetupByteOffset], creates the decoder and allocates a zero-filled PCM
+ * buffer of kMaxFrameSample samples per channel.
+ *
+ * Returns false if the decoder cannot be created or the buffer cannot be
+ * allocated. The caller must pass more than kSetupByteOffset bytes.
+ */
+bool Codec::initDecoder(const uint8_t *data) {
+  const uint8_t *toc = data + kSetupByteOffset;
+
+  // Map the packet bandwidth onto the rate table; out-of-table values get 8000.
+  const int rateIdx = opus_packet_get_bandwidth(toc) - OPUS_BANDWIDTH_NARROWBAND;
+  const bool rateIdxValid = (rateIdx >= 0) && (rateIdx < kNumberSamplingRates);
+  mSamplingRate = rateIdxValid ? kSamplingRates[rateIdx] : 8000;
+
+  // Anything other than mono or stereo falls back to mono.
+  const int channels = opus_packet_get_nb_channels(toc);
+  mNoOfChannels = (channels == 1 || channels == 2) ? channels : 1;
+
+  int status;
+  mDec = OPUS_DEC_CREATE_API(mSamplingRate, mNoOfChannels, &status);
+  if (mDec == nullptr || status != OPUS_OK) {
+    return false;
+  }
+
+  const size_t pcmBytes = sizeof(*mPcm) * kMaxFrameSample * mNoOfChannels;
+  mPcm = static_cast<opus_int16 *>(malloc(pcmBytes));
+  if (mPcm == nullptr) {
+    return false;
+  }
+  memset(mPcm, 0x0, pcmBytes);
+  return true;
+}
+
+/* Destroys the decoder and frees the PCM buffer, leaving both pointers null. */
+void Codec::deInitDecoder() {
+  OPUS_DEC_DESTROY_API(mDec);
+  free(mPcm);  // free(nullptr) is a no-op
+  mDec = nullptr;
+  mPcm = nullptr;
+}
+
+/* Passes the whole buffer to the decoder in a single call, writing into mPcm
+ * with FEC disabled; the decoder's return code is discarded. */
+void Codec::decodeFrames(const uint8_t *data, size_t size) {
+  const int decodeFec = 0;
+  (void)OPUS_DEC_DECODE_API(mDec, data, size, mPcm, kMaxFrameSample, decodeFec);
+}
+
+/*
+ * libFuzzer entry point: builds a decoder from the setup byte of the input and
+ * feeds it the entire input once. Inputs too short to contain the setup byte
+ * are ignored. Always returns 0.
+ */
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
+  // initDecoder() reads data[kSetupByteOffset], so that byte must exist.
+  if (size <= static_cast<size_t>(kSetupByteOffset)) {
+    return 0;
+  }
+  // Automatic storage: the destructor releases the decoder and PCM buffer on
+  // every exit path, and no allocation-failure check is needed.
+  Codec codec;
+  if (codec.initDecoder(data)) {
+    codec.decodeFrames(data, size);
+  }
+  return 0;
+}
diff --git a/libopus_blocklist.txt b/libopus_blocklist.txt
new file mode 100644
index 0000000..fd3f398
--- /dev/null
+++ b/libopus_blocklist.txt
@@ -0,0 +1,49 @@
+[integer]
+# celt/celt_decoder.c:1055:61: 0 - 1 cannot be represented in type 'unsigned int'
+fun:celt_decode_with_ec
+# celt/celt_encoder.c:2171:75: 0 - 1 cannot be represented in type 'unsigned int'
+fun:celt_encode_with_ec
+fun:celt_lcg_rand
+# celt/entcode.h:131: negation of 100 cannot be represented in type 'opus_uint32'
+fun:celt_udiv
+# celt/mdct.c:273
+# celt/mdct.c:274
+# celt/mdct.c:304
+# celt/mdct.c:305
+# celt/mdct.c:315
+# celt/mdct.c:316
+# celt/mdct.c:336
+# celt/mdct.c:337
+fun:clt_mdct_backward_c
+fun:ec_dec_init
+# celt/entdec.c:143
+fun:ec_decode
+# celt/entdec.c:150
+fun:ec_decode_bin
+
+src:*/celt/kiss_fft.c
+
+# assembly optimizations that know what they are doing
+fun:silk_SMULWB_armv4
+fun:silk_SMULWT_armv4
+fun:silk_SMULWW_armv4
+fun:silk_SMLAWW_armv4
+#
+fun:silk_SMULWB_armv5e
+fun:silk_SMLAWB_armv5e
+fun:silk_SMULWT_armv5e
+fun:silk_SMLAWT_armv5e
+fun:silk_SMULBB_armv5e
+fun:silk_SMLABB_armv5e
+fun:silk_SMULBT_armv5e
+fun:silk_SMLABT_armv5e
+fun:silk_ADD_SAT32_armv5e
+fun:silk_SUB_SAT32_armv5e
+fun:silk_CLZ16_armv5
+fun:silk_CLZ32_armv5
+
+
+# Performance related
+fun:exp_rotation1
+fun:haar1
+fun:celt_preemphasis
diff --git a/silk/SigProc_FIX.h b/silk/SigProc_FIX.h
index f9ae326..a906890 100644
--- a/silk/SigProc_FIX.h
+++ b/silk/SigProc_FIX.h
@@ -448,13 +448,29 @@
 
 /* Adds two signed 32-bit values in a way that can overflow, while not relying on undefined behaviour
    (just standard two's complement implementation-specific behaviour) */
-#define silk_ADD32_ovflw(a, b)              ((opus_int32)((opus_uint32)(a) + (opus_uint32)(b)))
+static OPUS_INLINE opus_int32 silk_ADD32_ovflw(opus_int32 a, opus_int32 b)
+{
+    opus_int32 wrapped_sum;
+
+    /* Only the wrapped result is wanted; the overflow flag is ignored on purpose. */
+    (void)__builtin_add_overflow(a, b, &wrapped_sum);
+    return wrapped_sum;
+}
+
 /* Subtractss two signed 32-bit values in a way that can overflow, while not relying on undefined behaviour
    (just standard two's complement implementation-specific behaviour) */
-#define silk_SUB32_ovflw(a, b)              ((opus_int32)((opus_uint32)(a) - (opus_uint32)(b)))
+static OPUS_INLINE opus_int32 silk_SUB32_ovflw(opus_int32 a, opus_int32 b)
+{
+    opus_int32 wrapped_diff;
+
+    /* Only the wrapped result is wanted; the overflow flag is ignored on purpose. */
+    (void)__builtin_sub_overflow(a, b, &wrapped_diff);
+    return wrapped_diff;
+}
 
 /* Multiply-accumulate macros that allow overflow in the addition (ie, no asserts in debug mode) */
-#define silk_MLA_ovflw(a32, b32, c32)       silk_ADD32_ovflw((a32), (opus_uint32)(b32) * (opus_uint32)(c32))
+/* .. also ignoring multiply overflows; caller has comment about this happening occasionally */
+static OPUS_INLINE opus_int32 silk_MLA_ovflw(opus_int32 a, opus_int32 b, opus_int32 c)
+{
+    opus_int32 product;
+    opus_int32 acc;
+
+    /* Returns a + b*c; both steps keep the wrapped result and ignore the overflow flag. */
+    (void)__builtin_mul_overflow(b, c, &product);
+    (void)__builtin_add_overflow(a, product, &acc);
+    return acc;
+}
+
 #define silk_SMLABB_ovflw(a32, b32, c32)    (silk_ADD32_ovflw((a32) , ((opus_int32)((opus_int16)(b32))) * (opus_int32)((opus_int16)(c32))))
 
 #define silk_DIV32_16(a32, b16)             ((opus_int32)((a32) / (b16)))
@@ -496,7 +512,12 @@
 /* Add with saturation for positive input values */
 #define silk_ADD_POS_SAT8(a, b)             ((((a)+(b)) & 0x80)                 ? silk_int8_MAX  : ((a)+(b)))
 #define silk_ADD_POS_SAT16(a, b)            ((((a)+(b)) & 0x8000)               ? silk_int16_MAX : ((a)+(b)))
-#define silk_ADD_POS_SAT32(a, b)            ((((opus_uint32)(a)+(opus_uint32)(b)) & 0x80000000) ? silk_int32_MAX : ((a)+(b)))
+static OPUS_INLINE opus_int32 silk_ADD_POS_SAT32(opus_int32 a, opus_int32 b)
+{
+    opus_int32 sum;
+
+    /* Any overflow clamps to silk_int32_MAX; otherwise the exact sum is returned. */
+    return __builtin_add_overflow(a, b, &sum) ? silk_int32_MAX : sum;
+}
 
 #define silk_LSHIFT8(a, shift)              ((opus_int8)((opus_uint8)(a)<<(shift)))         /* shift >= 0, shift < 8  */
 #define silk_LSHIFT16(a, shift)             ((opus_int16)((opus_uint16)(a)<<(shift)))       /* shift >= 0, shift < 16 */
diff --git a/silk/macros.h b/silk/macros.h
index 3c67b6e..00ccca3 100644
--- a/silk/macros.h
+++ b/silk/macros.h
@@ -33,6 +33,7 @@
 #endif
 
 #include "opus_types.h"
+#include "typedef.h"
 #include "opus_defines.h"
 #include "arch.h"
 
@@ -96,13 +97,31 @@
 #endif
 
 /* add/subtract with output saturated */
-#define silk_ADD_SAT32(a, b)             ((((opus_uint32)(a) + (opus_uint32)(b)) & 0x80000000) == 0 ?                              \
-                                        ((((a) & (b)) & 0x80000000) != 0 ? silk_int32_MIN : (a)+(b)) :   \
-                                        ((((a) | (b)) & 0x80000000) == 0 ? silk_int32_MAX : (a)+(b)) )
+/* use clang builtin overflow detectors */
+static OPUS_INLINE opus_int32 silk_ADD_SAT32(opus_int32 a, opus_int32 b)
+{
+    opus_int32 sum;
+
+    if (!__builtin_add_overflow(a, b, &sum)) {
+        return sum;
+    }
+    /* a + b overflows only when both operands share a sign, so the sign of a
+       gives the direction: negative saturates at MIN, otherwise at MAX. */
+    return (a < 0) ? silk_int32_MIN : silk_int32_MAX;
+}
 
-#define silk_SUB_SAT32(a, b)             ((((opus_uint32)(a)-(opus_uint32)(b)) & 0x80000000) == 0 ?                                        \
-                                        (( (a) & ((b)^0x80000000) & 0x80000000) ? silk_int32_MIN : (a)-(b)) :    \
-                                        ((((a)^0x80000000) & (b)  & 0x80000000) ? silk_int32_MAX : (a)-(b)) )
+/* use clang builtin overflow detectors */
+static OPUS_INLINE opus_int32 silk_SUB_SAT32(opus_int32 a, opus_int32 b)
+{
+    opus_int32 diff;
+
+    if (!__builtin_sub_overflow(a, b, &diff)) {
+        return diff;
+    }
+    /* a - b overflows only when the operands differ in sign: a negative a
+       saturates at MIN, a non-negative a saturates at MAX. */
+    return (a < 0) ? silk_int32_MIN : silk_int32_MAX;
+}
 
 #if defined(MIPSr1_ASM)
 #include "mips/macros_mipsr1.h"