libFLAC/ia32/stream_encoder_asm.nasm - platform/external/flac - Git at Google

 ;  vim:filetype=nasm ts=8

 ;  libFLAC - Free Lossless Audio Codec library
 ;  Copyright (C) 2001,2002,2003,2004,2005,2006,2007  Josh Coalson
 ;
 ;  Redistribution and use in source and binary forms, with or without
 ;  modification, are permitted provided that the following conditions
 ;  are met:
 ;
 ;  - Redistributions of source code must retain the above copyright
 ;  notice, this list of conditions and the following disclaimer.
 ;
 ;  - Redistributions in binary form must reproduce the above copyright
 ;  notice, this list of conditions and the following disclaimer in the
 ;  documentation and/or other materials provided with the distribution.
 ;
 ;  - Neither the name of the Xiph.org Foundation nor the names of its
 ;  contributors may be used to endorse or promote products derived from
 ;  this software without specific prior written permission.
 ;
 ;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 ;  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 ;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 ;  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
 ;  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 ;  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 ;  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 ;  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 ;  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 ;  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 ;  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 %include "nasm.h"

 	data_section

 cglobal precompute_partition_info_sums_32bit_asm_ia32_

 	code_section


 ; **********************************************************************
 ;
 ; void FLAC__bool FLAC__bitreader_read_rice_signed_block(FLAC__BitReader *br, int vals[], unsigned nvals, unsigned parameter)
 ; void precompute_partition_info_sums_32bit_(
 ; 	const FLAC__int32 residual[],
 ; 	FLAC__uint64 abs_residual_partition_sums[],
 ; 	unsigned blocksize,
 ; 	unsigned predictor_order,
 ; 	unsigned min_partition_order,
 ; 	unsigned max_partition_order
 ; )
 ;
 	ALIGN 16
 cident precompute_partition_info_sums_32bit_asm_ia32_

 	;; peppered throughout the code at major checkpoints are keys like this as to where things are at that point in time
 	;; [esp + 4]	const FLAC__int32 residual[]
 	;; [esp + 8]	FLAC__uint64 abs_residual_partition_sums[]
 	;; [esp + 12]	unsigned blocksize
 	;; [esp + 16]	unsigned predictor_order
 	;; [esp + 20]	unsigned min_partition_order
 	;; [esp + 24]	unsigned max_partition_order
 	push	ebp
 	push	ebx
 	push	esi
 	push	edi
 	sub	esp, 8
 	;; [esp + 28]	const FLAC__int32 residual[]
 	;; [esp + 32]	FLAC__uint64 abs_residual_partition_sums[]
 	;; [esp + 36]	unsigned blocksize
 	;; [esp + 40]	unsigned predictor_order
 	;; [esp + 44]	unsigned min_partition_order
 	;; [esp + 48]	unsigned max_partition_order
 	;; [esp]	partitions
 	;; [esp + 4]	default_partition_samples

 	mov	ecx, [esp + 48]
 	mov	eax, 1
 	shl	eax, cl
 	mov	[esp], eax		; [esp] <- partitions = 1u << max_partition_order;
 	mov	eax, [esp + 36]
 	shr	eax, cl
 	mov	[esp + 4], eax		; [esp + 4] <- default_partition_samples = blocksize >> max_partition_order;

 	;
 	; first do max_partition_order
 	;
 	mov	edi, [esp + 4]
 	sub	edi, [esp + 40]		; edi <- end = (unsigned)(-(int)predictor_order) + default_partition_samples
 	xor	esi, esi		; esi <- residual_sample = 0
 	xor	ecx, ecx		; ecx <- partition = 0
 	mov	ebp, [esp + 28]		; ebp <- residual[]
 	xor	ebx, ebx		; ebx <- abs_residual_partition_sum = 0;
 	; note we put the updates to 'end' and 'abs_residual_partition_sum' at the end of loop0 and in the initialization above so we could align loop0 and loop1
 	ALIGN	16
 .loop0:					; for(partition = residual_sample = 0; partition < partitions; partition++) {
 .loop1:					;   for( ; residual_sample < end; residual_sample++)
 	mov	eax, [ebp + esi * 4]
 	cdq
 	xor	eax, edx
 	sub	eax, edx
 	add	ebx, eax		;     abs_residual_partition_sum += abs(residual[residual_sample]);
 	;@@@@@@ check overflow flag and abort here?
 	add	esi, byte 1
 	cmp	esi, edi		;   /* since the loop will always run at least once, we can put the loop check down here */
 	jb	.loop1
 .next1:
 	add	edi, [esp + 4]		;   end += default_partition_samples;
 	mov	eax, [esp + 32]
 	mov	[eax + ecx * 8], ebx	;   abs_residual_partition_sums[partition] = abs_residual_partition_sum;
 	mov	[eax + ecx * 8 + 4], dword 0
 	xor	ebx, ebx		;   abs_residual_partition_sum = 0;
 	add	ecx, byte 1
 	cmp	ecx, [esp]		; /* since the loop will always run at least once, we can put the loop check down here */
 	jb	.loop0
 .next0:					; }
 	;
 	; now merge partitions for lower orders
 	;
 	mov	esi, [esp + 32]		; esi <- abs_residual_partition_sums[from_partition==0];
 	mov	eax, [esp]
 	lea	edi, [esi + eax * 8]	; edi <- abs_residual_partition_sums[to_partition==partitions];
 	mov	ecx, [esp + 48]
 	sub	ecx, byte 1		; ecx <- partition_order = (int)max_partition_order - 1;
 	ALIGN 16
 .loop2:					; for(; partition_order >= (int)min_partition_order; partition_order--) {
 	cmp	ecx, [esp + 44]
 	jl	.next2
 	mov	edx, 1
 	shl	edx, cl			;   const unsigned partitions = 1u << partition_order;
 	ALIGN 16
 .loop3:					;   for(i = 0; i < partitions; i++) {
 	mov	eax, [esi]
 	mov	ebx, [esi + 4]
 	add	eax, [esi + 8]
 	adc	ebx, [esi + 12]
 	mov	[edi], eax
 	mov	[edi + 4], ebx		;     a_r_p_s[to_partition] = a_r_p_s[from_partition] + a_r_p_s[from_partition+1];
 	add	esi, byte 16
 	add	edi, byte 8
 	sub	edx, byte 1
 	jnz	.loop3			;   }
 	sub	ecx, byte 1
 	jmp	.loop2			; }
 .next2:

 	add	esp, 8
 	pop	edi
 	pop	esi
 	pop	ebx
 	pop	ebp
 	ret

 end

 %ifdef OBJ_FORMAT_elf
 	section .note.GNU-stack noalloc
 %endif
	; vim:filetype=nasm ts=8

	; libFLAC - Free Lossless Audio Codec library
	; Copyright (C) 2001,2002,2003,2004,2005,2006,2007 Josh Coalson
	;
	; Redistribution and use in source and binary forms, with or without
	; modification, are permitted provided that the following conditions
	; are met:
	;
	; - Redistributions of source code must retain the above copyright
	; notice, this list of conditions and the following disclaimer.
	;
	; - Redistributions in binary form must reproduce the above copyright
	; notice, this list of conditions and the following disclaimer in the
	; documentation and/or other materials provided with the distribution.
	;
	; - Neither the name of the Xiph.org Foundation nor the names of its
	; contributors may be used to endorse or promote products derived from
	; this software without specific prior written permission.
	;
	; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
	; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
	; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
	; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
	; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
	; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
	; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
	; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
	; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

	%include "nasm.h"

	data_section

	cglobal precompute_partition_info_sums_32bit_asm_ia32_

	code_section


	; **********************************************************************
	;
	; void FLAC__bool FLAC__bitreader_read_rice_signed_block(FLAC__BitReader *br, int vals[], unsigned nvals, unsigned parameter)
	; void precompute_partition_info_sums_32bit_(
	; const FLAC__int32 residual[],
	; FLAC__uint64 abs_residual_partition_sums[],
	; unsigned blocksize,
	; unsigned predictor_order,
	; unsigned min_partition_order,
	; unsigned max_partition_order
	; )
	;
	ALIGN 16
	cident precompute_partition_info_sums_32bit_asm_ia32_

	;; peppered throughout the code at major checkpoints are keys like this as to where things are at that point in time
	;; [esp + 4] const FLAC__int32 residual[]
	;; [esp + 8] FLAC__uint64 abs_residual_partition_sums[]
	;; [esp + 12] unsigned blocksize
	;; [esp + 16] unsigned predictor_order
	;; [esp + 20] unsigned min_partition_order
	;; [esp + 24] unsigned max_partition_order
	push ebp
	push ebx
	push esi
	push edi
	sub esp, 8
	;; [esp + 28] const FLAC__int32 residual[]
	;; [esp + 32] FLAC__uint64 abs_residual_partition_sums[]
	;; [esp + 36] unsigned blocksize
	;; [esp + 40] unsigned predictor_order
	;; [esp + 44] unsigned min_partition_order
	;; [esp + 48] unsigned max_partition_order
	;; [esp] partitions
	;; [esp + 4] default_partition_samples

	mov ecx, [esp + 48]
	mov eax, 1
	shl eax, cl
	mov [esp], eax ; [esp] <- partitions = 1u << max_partition_order;
	mov eax, [esp + 36]
	shr eax, cl
	mov [esp + 4], eax ; [esp + 4] <- default_partition_samples = blocksize >> max_partition_order;

	;
	; first do max_partition_order
	;
	mov edi, [esp + 4]
	sub edi, [esp + 40] ; edi <- end = (unsigned)(-(int)predictor_order) + default_partition_samples
	xor esi, esi ; esi <- residual_sample = 0
	xor ecx, ecx ; ecx <- partition = 0
	mov ebp, [esp + 28] ; ebp <- residual[]
	xor ebx, ebx ; ebx <- abs_residual_partition_sum = 0;
	; note we put the updates to 'end' and 'abs_residual_partition_sum' at the end of loop0 and in the initialization above so we could align loop0 and loop1
	ALIGN 16
	.loop0: ; for(partition = residual_sample = 0; partition < partitions; partition++) {
	.loop1: ; for( ; residual_sample < end; residual_sample++)
	mov eax, [ebp + esi * 4]
	cdq
	xor eax, edx
	sub eax, edx
	add ebx, eax ; abs_residual_partition_sum += abs(residual[residual_sample]);
	;@@@@@@ check overflow flag and abort here?
	add esi, byte 1
	cmp esi, edi ; /* since the loop will always run at least once, we can put the loop check down here */
	jb .loop1
	.next1:
	add edi, [esp + 4] ; end += default_partition_samples;
	mov eax, [esp + 32]
	mov [eax + ecx * 8], ebx ; abs_residual_partition_sums[partition] = abs_residual_partition_sum;
	mov [eax + ecx * 8 + 4], dword 0
	xor ebx, ebx ; abs_residual_partition_sum = 0;
	add ecx, byte 1
	cmp ecx, [esp] ; /* since the loop will always run at least once, we can put the loop check down here */
	jb .loop0
	.next0: ; }
	;
	; now merge partitions for lower orders
	;
	mov esi, [esp + 32] ; esi <- abs_residual_partition_sums[from_partition==0];
	mov eax, [esp]
	lea edi, [esi + eax * 8] ; edi <- abs_residual_partition_sums[to_partition==partitions];
	mov ecx, [esp + 48]
	sub ecx, byte 1 ; ecx <- partition_order = (int)max_partition_order - 1;
	ALIGN 16
	.loop2: ; for(; partition_order >= (int)min_partition_order; partition_order--) {
	cmp ecx, [esp + 44]
	jl .next2
	mov edx, 1
	shl edx, cl ; const unsigned partitions = 1u << partition_order;
	ALIGN 16
	.loop3: ; for(i = 0; i < partitions; i++) {
	mov eax, [esi]
	mov ebx, [esi + 4]
	add eax, [esi + 8]
	adc ebx, [esi + 12]
	mov [edi], eax
	mov [edi + 4], ebx ; a_r_p_s[to_partition] = a_r_p_s[from_partition] + a_r_p_s[from_partition+1];
	add esi, byte 16
	add edi, byte 8
	sub edx, byte 1
	jnz .loop3 ; }
	sub ecx, byte 1
	jmp .loop2 ; }
	.next2:

	add esp, 8
	pop edi
	pop esi
	pop ebx
	pop ebp
	ret

	end

	%ifdef OBJ_FORMAT_elf
	section .note.GNU-stack noalloc
	%endif