distrib/sdl-1.2.12/src/hermes/mmxp2_32.asm - platform/external/qemu - Git at Google

 ;
 ; pII-optimised MMX format converters for HERMES
 ; Copyright (c) 1998 Christian Nentwich (c.nentwich@cs.ucl.ac.uk)
 ;   and (c) 1999 Jonathan Matthew (jmatthew@uq.net.au)
 ; This source code is licensed under the GNU LGPL
 ;
 ; Please refer to the file COPYING.LIB contained in the distribution for
 ; licensing conditions
 ;
 ; COPYRIGHT NOTICE
 ;
 ; This file partly contains code that is (c) Intel Corporation, specifically
 ; the mode detection routine, and the converter to 15 bit (8 pixel
 ; conversion routine from the mmx programming tutorial pages).
 ;
 ;
 ; These routines aren't exactly pII optimised - it's just that as they
 ; are, they're terrible on p5 MMXs, but less so on pIIs.  Someone needs to
 ; optimise them for p5 MMXs..

 BITS 32

 %include "common.inc"

 SDL_FUNC _ConvertMMXpII32_24RGB888
 SDL_FUNC _ConvertMMXpII32_16RGB565
 SDL_FUNC _ConvertMMXpII32_16BGR565
 SDL_FUNC _ConvertMMXpII32_16RGB555
 SDL_FUNC _ConvertMMXpII32_16BGR555

 ;; Macros for conversion routines

 %macro _push_immq_mask 1
 	push dword %1
 	push dword %1
 %endmacro

 %macro load_immq 2
 	_push_immq_mask %2
 	movq %1, [esp]
 %endmacro

 %macro pand_immq 2
 	_push_immq_mask %2
 	pand %1, [esp]
 %endmacro

 %define CLEANUP_IMMQ_LOADS(num) \
 	add esp, byte 8 * num

 %define mmx32_rgb888_mask 00ffffffh
 %define mmx32_rgb565_b 000000f8h
 %define mmx32_rgb565_g 0000fc00h
 %define mmx32_rgb565_r 00f80000h

 %define mmx32_rgb555_rb 00f800f8h
 %define mmx32_rgb555_g 0000f800h
 %define mmx32_rgb555_mul 20000008h
 %define mmx32_bgr555_mul 00082000h

 SECTION .text

 _ConvertMMXpII32_24RGB888:

         ; set up mm6 as the mask, mm7 as zero
         load_immq mm6, mmx32_rgb888_mask
         CLEANUP_IMMQ_LOADS(1)
         pxor mm7, mm7

         mov edx, ecx                    ; save ecx
         and ecx, 0fffffffch             ; clear lower two bits
         jnz .L1
         jmp .L2

 .L1:

         movq mm0, [esi]                 ; A R G B a r g b
         pand mm0, mm6                   ; 0 R G B 0 r g b
         movq mm1, [esi+8]               ; A R G B a r g b
         pand mm1, mm6                   ; 0 R G B 0 r g b

         movq mm2, mm0                   ; 0 R G B 0 r g b
         punpckhdq mm2, mm7              ; 0 0 0 0 0 R G B
         punpckldq mm0, mm7              ; 0 0 0 0 0 r g b
         psllq mm2, 24                   ; 0 0 R G B 0 0 0
         por mm0, mm2                    ; 0 0 R G B r g b

         movq mm3, mm1                   ; 0 R G B 0 r g b
         psllq mm3, 48                   ; g b 0 0 0 0 0 0
         por mm0, mm3                    ; g b R G B r g b

         movq mm4, mm1                   ; 0 R G B 0 r g b
         punpckhdq mm4, mm7              ; 0 0 0 0 0 R G B
         punpckldq mm1, mm7              ; 0 0 0 0 0 r g b
         psrlq mm1, 16                   ; 0 0 0 R G B 0 r
         psllq mm4, 8                    ; 0 0 0 0 R G B 0
         por mm1, mm4                    ; 0 0 0 0 R G B r

         movq [edi], mm0
         add esi, BYTE 16
         movd [edi+8], mm1
         add edi, BYTE 12
         sub ecx, BYTE 4
         jnz .L1

 .L2:
         mov ecx, edx
         and ecx, BYTE 3
         jz .L4
 .L3:
         mov al, [esi]
         mov bl, [esi+1]
         mov dl, [esi+2]
         mov [edi], al
         mov [edi+1], bl
         mov [edi+2], dl
         add esi, BYTE 4
         add edi, BYTE 3
         dec ecx
         jnz .L3
 .L4:
         return


 _ConvertMMXpII32_16RGB565:

         ; set up masks
         load_immq mm5, mmx32_rgb565_b
         load_immq mm6, mmx32_rgb565_g
         load_immq mm7, mmx32_rgb565_r
         CLEANUP_IMMQ_LOADS(3)

         mov edx, ecx
         shr ecx, 2
         jnz .L1
         jmp .L2         ; not necessary at the moment, but doesn't hurt (much)

 .L1:
         movq mm0, [esi]         ; argb
         movq mm1, mm0           ; argb
         pand mm0, mm6           ; 00g0
         movq mm3, mm1           ; argb
         pand mm1, mm5           ; 000b
         pand mm3, mm7           ; 0r00
         pslld mm1, 2            ; 0 0 000000bb bbb00000
         por mm0, mm1            ; 0 0 ggggggbb bbb00000
         psrld mm0, 5            ; 0 0 00000ggg gggbbbbb

         movq mm4, [esi+8]       ; argb
         movq mm2, mm4           ; argb
         pand mm4, mm6           ; 00g0
         movq mm1, mm2           ; argb
         pand mm2, mm5           ; 000b
         pand mm1, mm7           ; 0r00
         pslld mm2, 2            ; 0 0 000000bb bbb00000
         por mm4, mm2            ; 0 0 ggggggbb bbb00000
         psrld mm4, 5            ; 0 0 00000ggg gggbbbbb

         packuswb mm3, mm1       ; R 0 r 0
         packssdw mm0, mm4       ; as above.. ish
         por mm0, mm3            ; done.
         movq [edi], mm0

         add esi, 16
         add edi, 8
         dec ecx
         jnz .L1

 .L2:
         mov ecx, edx
         and ecx, BYTE 3
         jz .L4
 .L3:
         mov al, [esi]
         mov bh, [esi+1]
         mov ah, [esi+2]
         shr al, 3
         and eax, 0F81Fh            ; BYTE?
         shr ebx, 5
         and ebx, 07E0h             ; BYTE?
         add eax, ebx
         mov [edi], al
         mov [edi+1], ah
         add esi, BYTE 4
         add edi, BYTE 2
         dec ecx
         jnz .L3

 .L4:
 	retn


 _ConvertMMXpII32_16BGR565:

         load_immq mm5, mmx32_rgb565_r
         load_immq mm6, mmx32_rgb565_g
         load_immq mm7, mmx32_rgb565_b
         CLEANUP_IMMQ_LOADS(3)

         mov edx, ecx
         shr ecx, 2
         jnz .L1
         jmp .L2

 .L1:
         movq mm0, [esi]                 ; a r g b
         movq mm1, mm0                   ; a r g b
         pand mm0, mm6                   ; 0 0 g 0
         movq mm3, mm1                   ; a r g b
         pand mm1, mm5                   ; 0 r 0 0
         pand mm3, mm7                   ; 0 0 0 b

         psllq mm3, 16                   ; 0 b 0 0
         psrld mm1, 14                   ; 0 0 000000rr rrr00000
         por mm0, mm1                    ; 0 0 ggggggrr rrr00000
         psrld mm0, 5                    ; 0 0 00000ggg gggrrrrr

         movq mm4, [esi+8]               ; a r g b
         movq mm2, mm4                   ; a r g b
         pand mm4, mm6                   ; 0 0 g 0
         movq mm1, mm2                   ; a r g b
         pand mm2, mm5                   ; 0 r 0 0
         pand mm1, mm7                   ; 0 0 0 b

         psllq mm1, 16                   ; 0 b 0 0
         psrld mm2, 14                   ; 0 0 000000rr rrr00000
         por mm4, mm2                    ; 0 0 ggggggrr rrr00000
         psrld mm4, 5                    ; 0 0 00000ggg gggrrrrr

         packuswb mm3, mm1               ; BBBBB000 00000000 bbbbb000 00000000
         packssdw mm0, mm4               ; 00000GGG GGGRRRRR 00000GGG GGGRRRRR
         por mm0, mm3                    ; BBBBBGGG GGGRRRRR bbbbbggg gggrrrrr
         movq [edi], mm0

         add esi, BYTE 16
         add edi, BYTE 8
         dec ecx
         jnz .L1

 .L2:
         and edx, BYTE 3
         jz .L4
 .L3:
         mov al, [esi+2]
         mov bh, [esi+1]
         mov ah, [esi]
         shr al, 3
         and eax, 0F81Fh                    ; BYTE ?
         shr ebx, 5
         and ebx, 07E0h                     ; BYTE ?
         add eax, ebx
         mov [edi], al
         mov [edi+1], ah
         add esi, BYTE 4
         add edi, BYTE 2
         dec edx
         jnz .L3

 .L4:
         retn

 _ConvertMMXpII32_16BGR555:

         ; the 16BGR555 converter is identical to the RGB555 one,
         ; except it uses a different multiplier for the pmaddwd
         ; instruction.  cool huh.

         load_immq mm7, mmx32_bgr555_mul
         jmp _convert_bgr555_cheat

 ; This is the same as the Intel version.. they obviously went to
 ; much more trouble to expand/coil the loop than I did, so theirs
 ; would almost certainly be faster, even if only a little.
 ; I did rename 'mmx32_rgb555_add' to 'mmx32_rgb555_mul', which is
 ; (I think) a more accurate name..
 _ConvertMMXpII32_16RGB555:

 	load_immq mm7, mmx32_rgb555_mul
 _convert_bgr555_cheat:
 	load_immq mm6, mmx32_rgb555_g
 	CLEANUP_IMMQ_LOADS(2)

 	mov edx,ecx		           ; Save ecx

         and ecx,DWORD 0fffffff8h            ; clear lower three bits
 	jnz .L_OK
         jmp near .L2

 .L_OK:

 	movq mm2,[esi+8]

 	movq mm0,[esi]
 	movq mm3,mm2

 	pand_immq mm3, mmx32_rgb555_rb
 	movq mm1,mm0

 	pand_immq mm1, mmx32_rgb555_rb
 	pmaddwd mm3,mm7

 	CLEANUP_IMMQ_LOADS(2)

 	pmaddwd mm1,mm7
 	pand mm2,mm6

 .L1:
 	movq mm4,[esi+24]
 	pand mm0,mm6

 	movq mm5,[esi+16]
 	por mm3,mm2

 	psrld mm3,6
 	por mm1,mm0

 	movq mm0,mm4
 	psrld mm1,6

 	pand_immq mm0, mmx32_rgb555_rb
 	packssdw mm1,mm3

 	movq mm3,mm5
 	pmaddwd mm0,mm7

 	pand_immq mm3, mmx32_rgb555_rb
 	pand mm4,mm6

 	movq [edi],mm1
 	pmaddwd mm3,mm7

         add esi,BYTE 32
 	por mm4,mm0

 	pand mm5,mm6
 	psrld mm4,6

 	movq mm2,[esi+8]
 	por mm5,mm3

 	movq mm0,[esi]
 	psrld mm5,6

 	movq mm3,mm2
 	movq mm1,mm0

 	pand_immq mm3, mmx32_rgb555_rb
 	packssdw mm5,mm4

 	pand_immq mm1, mmx32_rgb555_rb
 	pand mm2,mm6

 	CLEANUP_IMMQ_LOADS(4)

 	movq [edi+8],mm5
 	pmaddwd mm3,mm7

 	pmaddwd mm1,mm7
         add edi,BYTE 16

         sub ecx,BYTE 8
 	jz .L2
         jmp .L1


 .L2:
 	mov ecx,edx

         and ecx,BYTE 7
 	jz .L4

 .L3:
 	mov ebx,[esi]
         add esi,BYTE 4

         mov eax,ebx
         mov edx,ebx

         shr eax,3
         shr edx,6

         and eax,BYTE 0000000000011111b
         and edx,     0000001111100000b

         shr ebx,9

         or eax,edx

         and ebx,     0111110000000000b

         or eax,ebx

         mov [edi],ax
         add edi,BYTE 2

 	dec ecx
 	jnz .L3

 .L4:
 	retn

 %ifidn __OUTPUT_FORMAT__,elf
 section .note.GNU-stack noalloc noexec nowrite progbits
 %endif
	;
	; pII-optimised MMX format converters for HERMES
	; Copyright (c) 1998 Christian Nentwich (c.nentwich@cs.ucl.ac.uk)
	; and (c) 1999 Jonathan Matthew (jmatthew@uq.net.au)
	; This source code is licensed under the GNU LGPL
	;
	; Please refer to the file COPYING.LIB contained in the distribution for
	; licensing conditions
	;
	; COPYRIGHT NOTICE
	;
	; This file partly contains code that is (c) Intel Corporation, specifically
	; the mode detection routine, and the converter to 15 bit (8 pixel
	; conversion routine from the mmx programming tutorial pages).
	;
	;
	; These routines aren't exactly pII optimised - it's just that as they
	; are, they're terrible on p5 MMXs, but less so on pIIs. Someone needs to
	; optimise them for p5 MMXs..

	BITS 32

	%include "common.inc"

	SDL_FUNC _ConvertMMXpII32_24RGB888
	SDL_FUNC _ConvertMMXpII32_16RGB565
	SDL_FUNC _ConvertMMXpII32_16BGR565
	SDL_FUNC _ConvertMMXpII32_16RGB555
	SDL_FUNC _ConvertMMXpII32_16BGR555

	;; Macros for conversion routines

	%macro _push_immq_mask 1
	push dword %1
	push dword %1
	%endmacro

	%macro load_immq 2
	_push_immq_mask %2
	movq %1, [esp]
	%endmacro

	%macro pand_immq 2
	_push_immq_mask %2
	pand %1, [esp]
	%endmacro

	%define CLEANUP_IMMQ_LOADS(num) \
	add esp, byte 8 * num

	%define mmx32_rgb888_mask 00ffffffh
	%define mmx32_rgb565_b 000000f8h
	%define mmx32_rgb565_g 0000fc00h
	%define mmx32_rgb565_r 00f80000h

	%define mmx32_rgb555_rb 00f800f8h
	%define mmx32_rgb555_g 0000f800h
	%define mmx32_rgb555_mul 20000008h
	%define mmx32_bgr555_mul 00082000h

	SECTION .text

	_ConvertMMXpII32_24RGB888:

	; set up mm6 as the mask, mm7 as zero
	load_immq mm6, mmx32_rgb888_mask
	CLEANUP_IMMQ_LOADS(1)
	pxor mm7, mm7

	mov edx, ecx ; save ecx
	and ecx, 0fffffffch ; clear lower two bits
	jnz .L1
	jmp .L2

	.L1:

	movq mm0, [esi] ; A R G B a r g b
	pand mm0, mm6 ; 0 R G B 0 r g b
	movq mm1, [esi+8] ; A R G B a r g b
	pand mm1, mm6 ; 0 R G B 0 r g b

	movq mm2, mm0 ; 0 R G B 0 r g b
	punpckhdq mm2, mm7 ; 0 0 0 0 0 R G B
	punpckldq mm0, mm7 ; 0 0 0 0 0 r g b
	psllq mm2, 24 ; 0 0 R G B 0 0 0
	por mm0, mm2 ; 0 0 R G B r g b

	movq mm3, mm1 ; 0 R G B 0 r g b
	psllq mm3, 48 ; g b 0 0 0 0 0 0
	por mm0, mm3 ; g b R G B r g b

	movq mm4, mm1 ; 0 R G B 0 r g b
	punpckhdq mm4, mm7 ; 0 0 0 0 0 R G B
	punpckldq mm1, mm7 ; 0 0 0 0 0 r g b
	psrlq mm1, 16 ; 0 0 0 R G B 0 r
	psllq mm4, 8 ; 0 0 0 0 R G B 0
	por mm1, mm4 ; 0 0 0 0 R G B r

	movq [edi], mm0
	add esi, BYTE 16
	movd [edi+8], mm1
	add edi, BYTE 12
	sub ecx, BYTE 4
	jnz .L1

	.L2:
	mov ecx, edx
	and ecx, BYTE 3
	jz .L4
	.L3:
	mov al, [esi]
	mov bl, [esi+1]
	mov dl, [esi+2]
	mov [edi], al
	mov [edi+1], bl
	mov [edi+2], dl
	add esi, BYTE 4
	add edi, BYTE 3
	dec ecx
	jnz .L3
	.L4:
	return



	_ConvertMMXpII32_16RGB565:

	; set up masks
	load_immq mm5, mmx32_rgb565_b
	load_immq mm6, mmx32_rgb565_g
	load_immq mm7, mmx32_rgb565_r
	CLEANUP_IMMQ_LOADS(3)

	mov edx, ecx
	shr ecx, 2
	jnz .L1
	jmp .L2 ; not necessary at the moment, but doesn't hurt (much)

	.L1:
	movq mm0, [esi] ; argb
	movq mm1, mm0 ; argb
	pand mm0, mm6 ; 00g0
	movq mm3, mm1 ; argb
	pand mm1, mm5 ; 000b
	pand mm3, mm7 ; 0r00
	pslld mm1, 2 ; 0 0 000000bb bbb00000
	por mm0, mm1 ; 0 0 ggggggbb bbb00000
	psrld mm0, 5 ; 0 0 00000ggg gggbbbbb

	movq mm4, [esi+8] ; argb
	movq mm2, mm4 ; argb
	pand mm4, mm6 ; 00g0
	movq mm1, mm2 ; argb
	pand mm2, mm5 ; 000b
	pand mm1, mm7 ; 0r00
	pslld mm2, 2 ; 0 0 000000bb bbb00000
	por mm4, mm2 ; 0 0 ggggggbb bbb00000
	psrld mm4, 5 ; 0 0 00000ggg gggbbbbb

	packuswb mm3, mm1 ; R 0 r 0
	packssdw mm0, mm4 ; as above.. ish
	por mm0, mm3 ; done.
	movq [edi], mm0

	add esi, 16
	add edi, 8
	dec ecx
	jnz .L1

	.L2:
	mov ecx, edx
	and ecx, BYTE 3
	jz .L4
	.L3:
	mov al, [esi]
	mov bh, [esi+1]
	mov ah, [esi+2]
	shr al, 3
	and eax, 0F81Fh ; BYTE?
	shr ebx, 5
	and ebx, 07E0h ; BYTE?
	add eax, ebx
	mov [edi], al
	mov [edi+1], ah
	add esi, BYTE 4
	add edi, BYTE 2
	dec ecx
	jnz .L3

	.L4:
	retn


	_ConvertMMXpII32_16BGR565:

	load_immq mm5, mmx32_rgb565_r
	load_immq mm6, mmx32_rgb565_g
	load_immq mm7, mmx32_rgb565_b
	CLEANUP_IMMQ_LOADS(3)

	mov edx, ecx
	shr ecx, 2
	jnz .L1
	jmp .L2

	.L1:
	movq mm0, [esi] ; a r g b
	movq mm1, mm0 ; a r g b
	pand mm0, mm6 ; 0 0 g 0
	movq mm3, mm1 ; a r g b
	pand mm1, mm5 ; 0 r 0 0
	pand mm3, mm7 ; 0 0 0 b

	psllq mm3, 16 ; 0 b 0 0
	psrld mm1, 14 ; 0 0 000000rr rrr00000
	por mm0, mm1 ; 0 0 ggggggrr rrr00000
	psrld mm0, 5 ; 0 0 00000ggg gggrrrrr

	movq mm4, [esi+8] ; a r g b
	movq mm2, mm4 ; a r g b
	pand mm4, mm6 ; 0 0 g 0
	movq mm1, mm2 ; a r g b
	pand mm2, mm5 ; 0 r 0 0
	pand mm1, mm7 ; 0 0 0 b

	psllq mm1, 16 ; 0 b 0 0
	psrld mm2, 14 ; 0 0 000000rr rrr00000
	por mm4, mm2 ; 0 0 ggggggrr rrr00000
	psrld mm4, 5 ; 0 0 00000ggg gggrrrrr

	packuswb mm3, mm1 ; BBBBB000 00000000 bbbbb000 00000000
	packssdw mm0, mm4 ; 00000GGG GGGRRRRR 00000GGG GGGRRRRR
	por mm0, mm3 ; BBBBBGGG GGGRRRRR bbbbbggg gggrrrrr
	movq [edi], mm0

	add esi, BYTE 16
	add edi, BYTE 8
	dec ecx
	jnz .L1

	.L2:
	and edx, BYTE 3
	jz .L4
	.L3:
	mov al, [esi+2]
	mov bh, [esi+1]
	mov ah, [esi]
	shr al, 3
	and eax, 0F81Fh ; BYTE ?
	shr ebx, 5
	and ebx, 07E0h ; BYTE ?
	add eax, ebx
	mov [edi], al
	mov [edi+1], ah
	add esi, BYTE 4
	add edi, BYTE 2
	dec edx
	jnz .L3

	.L4:
	retn

	_ConvertMMXpII32_16BGR555:

	; the 16BGR555 converter is identical to the RGB555 one,
	; except it uses a different multiplier for the pmaddwd
	; instruction. cool huh.

	load_immq mm7, mmx32_bgr555_mul
	jmp _convert_bgr555_cheat

	; This is the same as the Intel version.. they obviously went to
	; much more trouble to expand/coil the loop than I did, so theirs
	; would almost certainly be faster, even if only a little.
	; I did rename 'mmx32_rgb555_add' to 'mmx32_rgb555_mul', which is
	; (I think) a more accurate name..
	_ConvertMMXpII32_16RGB555:

	load_immq mm7, mmx32_rgb555_mul
	_convert_bgr555_cheat:
	load_immq mm6, mmx32_rgb555_g
	CLEANUP_IMMQ_LOADS(2)

	mov edx,ecx ; Save ecx

	and ecx,DWORD 0fffffff8h ; clear lower three bits
	jnz .L_OK
	jmp near .L2

	.L_OK:

	movq mm2,[esi+8]

	movq mm0,[esi]
	movq mm3,mm2

	pand_immq mm3, mmx32_rgb555_rb
	movq mm1,mm0

	pand_immq mm1, mmx32_rgb555_rb
	pmaddwd mm3,mm7

	CLEANUP_IMMQ_LOADS(2)

	pmaddwd mm1,mm7
	pand mm2,mm6

	.L1:
	movq mm4,[esi+24]
	pand mm0,mm6

	movq mm5,[esi+16]
	por mm3,mm2

	psrld mm3,6
	por mm1,mm0

	movq mm0,mm4
	psrld mm1,6

	pand_immq mm0, mmx32_rgb555_rb
	packssdw mm1,mm3

	movq mm3,mm5
	pmaddwd mm0,mm7

	pand_immq mm3, mmx32_rgb555_rb
	pand mm4,mm6

	movq [edi],mm1
	pmaddwd mm3,mm7

	add esi,BYTE 32
	por mm4,mm0

	pand mm5,mm6
	psrld mm4,6

	movq mm2,[esi+8]
	por mm5,mm3

	movq mm0,[esi]
	psrld mm5,6

	movq mm3,mm2
	movq mm1,mm0

	pand_immq mm3, mmx32_rgb555_rb
	packssdw mm5,mm4

	pand_immq mm1, mmx32_rgb555_rb
	pand mm2,mm6

	CLEANUP_IMMQ_LOADS(4)

	movq [edi+8],mm5
	pmaddwd mm3,mm7

	pmaddwd mm1,mm7
	add edi,BYTE 16

	sub ecx,BYTE 8
	jz .L2
	jmp .L1


	.L2:
	mov ecx,edx

	and ecx,BYTE 7
	jz .L4

	.L3:
	mov ebx,[esi]
	add esi,BYTE 4

	mov eax,ebx
	mov edx,ebx

	shr eax,3
	shr edx,6

	and eax,BYTE 0000000000011111b
	and edx, 0000001111100000b

	shr ebx,9

	or eax,edx

	and ebx, 0111110000000000b

	or eax,ebx

	mov [edi],ax
	add edi,BYTE 2

	dec ecx
	jnz .L3

	.L4:
	retn

	%ifidn __OUTPUT_FORMAT__,elf
	section .note.GNU-stack noalloc noexec nowrite progbits
	%endif