 @ webrtc/modules/audio_coding/codecs/isac/fix/source/filters_neon.S - platform/external/webrtc - Git at Google

 @
 @ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
 @
 @ Use of this source code is governed by a BSD-style license
 @ that can be found in the LICENSE file in the root of the source
 @ tree. An additional intellectual property rights grant can be found
 @ in the file PATENTS.  All contributing project authors may
 @ be found in the AUTHORS file in the root of the source tree.
 @
 @ Reference code in filters.c. Output is bit-exact.

 #include "webrtc/system_wrappers/interface/asm_defines.h"

 GLOBAL_FUNCTION WebRtcIsacfix_AutocorrNeon
 .align  2

 @ int WebRtcIsacfix_AutocorrNeon(
 @     int32_t* __restrict r,
 @     const int16_t* __restrict x,
 @     int16_t N,
 @     int16_t order,
 @     int16_t* __restrict scale);
 @
 @ Writes r[0] .. r[order], where r[i] is the sum of x[j] * x[j + i] over
 @ j = 0 .. N - i - 1, shifted right by a common amount. The amount is picked
 @ from r[0] alone so that the shifted r[0] has bit 31 clear. It is stored to
 @ *scale as a 16-bit value and is 0 when no shift is needed.
 @ Returns order + 1.
 @
 @ Arguments as used below: r0 = r, r1 = x, r2 = N, r3 = order, and scale is
 @ the word at [sp] on entry.
 @ r3 - r12 are saved on entry and restored on exit. NEON registers written:
 @ d0 - d3, d5, d16 - d27 and d29 - d31.
 @
 @ Limits that follow from the loop shapes (nothing is checked at run time):
 @   - LOOP_R0 takes 4 samples per pass and runs at least once, so it reads
 @     whole groups of 4 samples even if N is not a positive multiple of 4.
 @   - LOOP_8X_SAMPLES runs at least once per lag, so 8 products are taken
 @     even if N - i is below 8.
 @   - LOOP_R runs at least once, so r[1] is written even if order is 0.
 @
 @ NOTE(review): this file contains a second, identical definition of this
 @ function and its labels further down - confirm which copy is intended.

 DEFINE_FUNCTION WebRtcIsacfix_AutocorrNeon
   @ Ten registers are pushed (40 bytes), so the fifth argument is found at
   @ [sp, #40] until the matching pop at the end.
   push       {r3 - r12}

   @ Constant initializations
   mov        r4, #33
   vmov.i32   d0, #0
   @ q8 (d16, d17) accumulates the sum of squares for r[0].
   vmov.i32   q8, #0
   vmov.i32   d29, #0               @ Initialize (-scale).
   vmov.u8    d30, #255             @ Initialize d30 as -1.
   vmov.i32   d0[0], r4             @ d0 = 33 (decimal) low word, 0 high word
   @ Both words of d25 hold 32, the limit used by the "< 32 ?" test below.
   vmov.i32   d25, #32

   mov        r5, r1                @ x
   mov        r6, r2                @ N

 @ Generate the first coefficient r[0] (the zero-lag term).
 @ Each pass loads four samples, squares them and adds the squares pairwise
 @ into the two 64-bit lanes of q8. The count in r6 is tested after the body,
 @ and the NEON instructions between subs and bgt leave the flags untouched.
 LOOP_R0:
   vld1.16    {d18}, [r5]!          @ x[]
   subs       r6, r6, #4
   vmull.s16  q9, d18, d18
   vpadal.s32 q8, q9
   bgt        LOOP_R0

   @ Fold the two lanes: d16 = 64-bit sum of squares.
   vadd.i64   d16, d16, d17

   @ Calculate scaling (the value of shifting).
   @ d17 starts as the unscaled sum and d29 as 0. Both are replaced below only
   @ when the sum has to be shifted down.
   vmov       d17, d16

   @ Check overflow and determine the value for 'scale'.
   @ vclz has no 64-bit form, so it is applied to the upper and lower 32-bit
   @ words separately. Note that we don't care about the value of the upper
   @ word in d17, because only d17[0] is stored to r[0].

   @ Check the case of 1 bit overflow. If it occurs store the results for
   @ scale and r[0] in d17 and d29.
   @ The test is whether the low word of the sum reads as negative, i.e. has
   @ bit 31 set. d1 is the per-word select mask consumed by vbit.

   vshr.u64   d3, d16, #1
   vclt.s32   d1, d16, #0           @ < 0 ?
   vbit       d17, d3, d1           @ For r[0]
   vbit       d29, d30, d1          @ -scale = -1

   @ For the case of more than 1 bit overflow. If it occurs overwrite the
   @ results for scale and r[0] in d17 and d29.
   @ This applies when the upper word of the sum has fewer than 32 leading
   @ zeros, i.e. is non-zero. The shift is then 33 - (leading zeros of the
   @ upper word), held negated in d31.
   vclz.s32   d5, d16               @ Leading zeros of the two 32 bit words.
   vshr.s64   d26, d5, #32          @ d26 = leading zeros of the upper word.
   vsub.i64   d31, d26, d0          @ zeros - 33
   vshl.i64   d27, d26, #32
   vorr       d27, d26              @ Both words of d27 = that zero count.
   vshl.u64   d2, d16, d31          @ Shift by (-scale), i.e. to the right.
   vclt.s32   d1, d27, d25          @ < 32 ?
   vbit       d17, d2, d1           @ For r[0]
   vbit       d29, d31, d1          @ -scale

   vst1.32    d17[0], [r0]!         @ r[0]
   mov        r5, #1                @ outer loop counter (lag i)

 @ Generate rest of the coefficients
 @ One pass of LOOP_R per lag i = r5, from 1 up to and including order (r3).
 @ For each lag, N - i products x[j] * x[j + i] are summed in three stages:
 @ blocks of 8 on NEON, one optional block of 4 on NEON, then the remaining
 @ single products on the core registers.
 @ q12 and q13 (d24 - d27) are product scratch from here on. That overwrites
 @ d25 - d27 from the r[0] scaling above, which are not read again.
 LOOP_R:
   vmov.i32   q8, #0                @ Initialize the accumulation result.
   vmov.i32   q9, #0                @ Initialize the accumulation result.
   mov        r7, r1                @ &x[0]
   add        r6, r7, r5, lsl #1    @ x[i]
   sub        r12, r2, r5           @ N - i
   lsr        r8, r12, #3           @ inner loop counter: (N - i) / 8
   sub        r12, r8, lsl #3       @ Leftover samples to be processed

 @ r8 is tested after the body, so one block of 8 is processed even when the
 @ counter starts at 0.
 LOOP_8X_SAMPLES:                   @ Multiple of 8 samples
   vld1.16    {d20, d21}, [r7]!     @ x[0, ...]
   vld1.16    {d22, d23}, [r6]!     @ x[i, ...]
   vmull.s16  q12, d20, d22
   vmull.s16  q13, d21, d23
   subs       r8, #1
   vpadal.s32 q8, q12
   vpadal.s32 q9, q13
   bgt        LOOP_8X_SAMPLES

   @ Fewer than 4 products left: skip the 4-sample NEON step.
   cmp r12, #4
   blt REST_SAMPLES

 Four_SAMPLES:
   vld1.16    d20, [r7]!
   vld1.16    d22, [r6]!
   vmull.s16  q12, d20, d22
   vpadal.s32 q8, q12
   sub r12, #4

 @ r8 (low word) and r4 (high word) now form a 64-bit scalar accumulator for
 @ the products that did not fill a NEON block.
 REST_SAMPLES:
   mov        r8, #0                @ Initialize lower word of the accumulation.
   mov        r4, #0                @ Initialize upper word of the accumulation.
   cmp r12, #0
   ble SUMUP

 @ ldrh zero-extends, but smulbb multiplies the bottom halfwords as signed
 @ 16-bit values. The adds/adc pair adds the product sign-extended to 64 bits.
 LOOP_REST_SAMPLES:
   ldrh       r9, [r7], #2          @ x[0, ...]
   ldrh       r10, [r6], #2         @ x[i, ...]
   smulbb     r11, r9, r10
   adds       r8, r8, r11           @ lower word of the accumulation.
   adc        r4, r4, r11, asr #31  @ upper word of the accumulation.
   subs       r12, #1
   bgt        LOOP_REST_SAMPLES

 @ Add the partial sums together and apply the shift.
 @ d18 = (d16 + d17) + (d18 + d19) + r4:r8, then shifted by d29.
 SUMUP:
   vadd.i64   d16, d17
   vadd.i64   d18, d19
   vadd.i64   d18, d16
   @ d17 = scalar accumulator: r8 into the low word, r4 into the high word.
   vmov       d17, r8, r4
   vadd.i64   d18, d17
   vshl.s64   d18, d29              @ Shift left by (-scale).
   vst1.32    d18[0], [r0]!         @ r[i]

   add        r5, #1
   cmp        r5, r3
   ble        LOOP_R

   vneg.s32   d29, d29              @ Get value for 'scale'.
   ldr        r2, [sp, #40]         @ &scale
   add        r0, r3, #1            @ return (order + 1)
   vst1.s16   d29[0], [r2]          @ Store 'scale'

   pop        {r3 - r12}
   bx         lr
	@
	@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
	@
	@ Use of this source code is governed by a BSD-style license
	@ that can be found in the LICENSE file in the root of the source
	@ tree. An additional intellectual property rights grant can be found
	@ in the file PATENTS. All contributing project authors may
	@ be found in the AUTHORS file in the root of the source tree.
	@
	@ Reference code in filters.c. Output is bit-exact.

	#include "webrtc/system_wrappers/interface/asm_defines.h"

	GLOBAL_FUNCTION WebRtcIsacfix_AutocorrNeon
	.align 2

	@ int WebRtcIsacfix_AutocorrNeon(
	@ int32_t* __restrict r,
	@ const int16_t* __restrict x,
	@ int16_t N,
	@ int16_t order,
	@ int16_t* __restrict scale);
	@
	@ Writes r[0] .. r[order], where r[i] is the sum of x[j] * x[j + i] over
	@ j = 0 .. N - i - 1, shifted right by a common amount. The amount is picked
	@ from r[0] alone so that the shifted r[0] has bit 31 clear. It is stored to
	@ *scale as a 16-bit value and is 0 when no shift is needed.
	@ Returns order + 1.
	@
	@ Arguments as used below: r0 = r, r1 = x, r2 = N, r3 = order, and scale is
	@ the word at [sp] on entry.
	@ r3 - r12 are saved on entry and restored on exit. NEON registers written:
	@ d0 - d3, d5, d16 - d27 and d29 - d31.
	@
	@ Limits that follow from the loop shapes (nothing is checked at run time):
	@ - LOOP_R0 takes 4 samples per pass and runs at least once, so it reads
	@   whole groups of 4 samples even if N is not a positive multiple of 4.
	@ - LOOP_8X_SAMPLES runs at least once per lag, so 8 products are taken
	@   even if N - i is below 8.
	@ - LOOP_R runs at least once, so r[1] is written even if order is 0.
	@
	@ NOTE(review): this function and its labels are also defined earlier in
	@ this file - confirm which copy is intended.

	DEFINE_FUNCTION WebRtcIsacfix_AutocorrNeon
	@ Ten registers are pushed (40 bytes), so the fifth argument is found at
	@ [sp, #40] until the matching pop at the end.
	push {r3 - r12}

	@ Constant initializations
	mov r4, #33
	vmov.i32 d0, #0
	@ q8 (d16, d17) accumulates the sum of squares for r[0].
	vmov.i32 q8, #0
	vmov.i32 d29, #0 @ Initialize (-scale).
	vmov.u8 d30, #255 @ Initialize d30 as -1.
	vmov.i32 d0[0], r4 @ d0 = 33 (decimal) low word, 0 high word
	@ Both words of d25 hold 32, the limit used by the "< 32 ?" test below.
	vmov.i32 d25, #32

	mov r5, r1 @ x
	mov r6, r2 @ N

	@ Generate the first coefficient r[0] (the zero-lag term).
	@ Each pass loads four samples, squares them and adds the squares pairwise
	@ into the two 64-bit lanes of q8. The count in r6 is tested after the body,
	@ and the NEON instructions between subs and bgt leave the flags untouched.
	LOOP_R0:
	vld1.16 {d18}, [r5]! @ x[]
	subs r6, r6, #4
	vmull.s16 q9, d18, d18
	vpadal.s32 q8, q9
	bgt LOOP_R0

	@ Fold the two lanes: d16 = 64-bit sum of squares.
	vadd.i64 d16, d16, d17

	@ Calculate scaling (the value of shifting).
	@ d17 starts as the unscaled sum and d29 as 0. Both are replaced below only
	@ when the sum has to be shifted down.
	vmov d17, d16

	@ Check overflow and determine the value for 'scale'.
	@ vclz has no 64-bit form, so it is applied to the upper and lower 32-bit
	@ words separately. Note that we don't care about the value of the upper
	@ word in d17, because only d17[0] is stored to r[0].

	@ Check the case of 1 bit overflow. If it occurs store the results for
	@ scale and r[0] in d17 and d29.
	@ The test is whether the low word of the sum reads as negative, i.e. has
	@ bit 31 set. d1 is the per-word select mask consumed by vbit.

	vshr.u64 d3, d16, #1
	vclt.s32 d1, d16, #0 @ < 0 ?
	vbit d17, d3, d1 @ For r[0]
	vbit d29, d30, d1 @ -scale = -1

	@ For the case of more than 1 bit overflow. If it occurs overwrite the
	@ results for scale and r[0] in d17 and d29.
	@ This applies when the upper word of the sum has fewer than 32 leading
	@ zeros, i.e. is non-zero. The shift is then 33 - (leading zeros of the
	@ upper word), held negated in d31.
	vclz.s32 d5, d16 @ Leading zeros of the two 32 bit words.
	vshr.s64 d26, d5, #32 @ d26 = leading zeros of the upper word.
	vsub.i64 d31, d26, d0 @ zeros - 33
	vshl.i64 d27, d26, #32
	vorr d27, d26 @ Both words of d27 = that zero count.
	vshl.u64 d2, d16, d31 @ Shift by (-scale), i.e. to the right.
	vclt.s32 d1, d27, d25 @ < 32 ?
	vbit d17, d2, d1 @ For r[0]
	vbit d29, d31, d1 @ -scale

	vst1.32 d17[0], [r0]! @ r[0]
	mov r5, #1 @ outer loop counter (lag i)

	@ Generate rest of the coefficients
	@ One pass of LOOP_R per lag i = r5, from 1 up to and including order (r3).
	@ For each lag, N - i products x[j] * x[j + i] are summed in three stages:
	@ blocks of 8 on NEON, one optional block of 4 on NEON, then the remaining
	@ single products on the core registers.
	@ q12 and q13 (d24 - d27) are product scratch from here on. That overwrites
	@ d25 - d27 from the r[0] scaling above, which are not read again.
	LOOP_R:
	vmov.i32 q8, #0 @ Initialize the accumulation result.
	vmov.i32 q9, #0 @ Initialize the accumulation result.
	mov r7, r1 @ &x[0]
	add r6, r7, r5, lsl #1 @ x[i]
	sub r12, r2, r5 @ N - i
	lsr r8, r12, #3 @ inner loop counter: (N - i) / 8
	sub r12, r8, lsl #3 @ Leftover samples to be processed

	@ r8 is tested after the body, so one block of 8 is processed even when the
	@ counter starts at 0.
	LOOP_8X_SAMPLES: @ Multiple of 8 samples
	vld1.16 {d20, d21}, [r7]! @ x[0, ...]
	vld1.16 {d22, d23}, [r6]! @ x[i, ...]
	vmull.s16 q12, d20, d22
	vmull.s16 q13, d21, d23
	subs r8, #1
	vpadal.s32 q8, q12
	vpadal.s32 q9, q13
	bgt LOOP_8X_SAMPLES

	@ Fewer than 4 products left: skip the 4-sample NEON step.
	cmp r12, #4
	blt REST_SAMPLES

	Four_SAMPLES:
	vld1.16 d20, [r7]!
	vld1.16 d22, [r6]!
	vmull.s16 q12, d20, d22
	vpadal.s32 q8, q12
	sub r12, #4

	@ r8 (low word) and r4 (high word) now form a 64-bit scalar accumulator for
	@ the products that did not fill a NEON block.
	REST_SAMPLES:
	mov r8, #0 @ Initialize lower word of the accumulation.
	mov r4, #0 @ Initialize upper word of the accumulation.
	cmp r12, #0
	ble SUMUP

	@ ldrh zero-extends, but smulbb multiplies the bottom halfwords as signed
	@ 16-bit values. The adds/adc pair adds the product sign-extended to 64 bits.
	LOOP_REST_SAMPLES:
	ldrh r9, [r7], #2 @ x[0, ...]
	ldrh r10, [r6], #2 @ x[i, ...]
	smulbb r11, r9, r10
	adds r8, r8, r11 @ lower word of the accumulation.
	adc r4, r4, r11, asr #31 @ upper word of the accumulation.
	subs r12, #1
	bgt LOOP_REST_SAMPLES

	@ Add the partial sums together and apply the shift.
	@ d18 = (d16 + d17) + (d18 + d19) + r4:r8, then shifted by d29.
	SUMUP:
	vadd.i64 d16, d17
	vadd.i64 d18, d19
	vadd.i64 d18, d16
	@ d17 = scalar accumulator: r8 into the low word, r4 into the high word.
	vmov d17, r8, r4
	vadd.i64 d18, d17
	vshl.s64 d18, d29 @ Shift left by (-scale).
	vst1.32 d18[0], [r0]! @ r[i]

	add r5, #1
	cmp r5, r3
	ble LOOP_R

	vneg.s32 d29, d29 @ Get value for 'scale'.
	ldr r2, [sp, #40] @ &scale
	add r0, r3, #1 @ return (order + 1)
	vst1.s16 d29[0], [r2] @ Store 'scale'

	pop {r3 - r12}
	bx lr