blob: d2d31ecdce81f817c1724c427b66282601502395 [file] [log] [blame]
;
; pII-optimised MMX format converters for HERMES
; Copyright (c) 1998 Christian Nentwich (c.nentwich@cs.ucl.ac.uk)
; and (c) 1999 Jonathan Matthew (jmatthew@uq.net.au)
; This source code is licensed under the GNU LGPL
;
; Please refer to the file COPYING.LIB contained in the distribution for
; licensing conditions
;
; COPYRIGHT NOTICE
;
; This file partly contains code that is (c) Intel Corporation, specifically
; the mode detection routine, and the converter to 15 bit (8 pixel
; conversion routine from the mmx programming tutorial pages).
;
;
; These routines aren't exactly pII optimised - it's just that as they
; are, they're terrible on p5 MMXs, but less so on pIIs. Someone needs to
; optimise them for p5 MMXs..
BITS 32
%include "common.inc"
SDL_FUNC _ConvertMMXpII32_24RGB888
SDL_FUNC _ConvertMMXpII32_16RGB565
SDL_FUNC _ConvertMMXpII32_16BGR565
SDL_FUNC _ConvertMMXpII32_16RGB555
SDL_FUNC _ConvertMMXpII32_16BGR555
;; Macros for conversion routines
%macro _push_immq_mask 1
push dword %1
push dword %1
%endmacro
%macro load_immq 2
_push_immq_mask %2
movq %1, [esp]
%endmacro
%macro pand_immq 2
_push_immq_mask %2
pand %1, [esp]
%endmacro
%define CLEANUP_IMMQ_LOADS(num) \
add esp, byte 8 * num
%define mmx32_rgb888_mask 00ffffffh
%define mmx32_rgb565_b 000000f8h
%define mmx32_rgb565_g 0000fc00h
%define mmx32_rgb565_r 00f80000h
%define mmx32_rgb555_rb 00f800f8h
%define mmx32_rgb555_g 0000f800h
%define mmx32_rgb555_mul 20000008h
%define mmx32_bgr555_mul 00082000h
SECTION .text
_ConvertMMXpII32_24RGB888:
; set up mm6 as the mask, mm7 as zero
load_immq mm6, mmx32_rgb888_mask
CLEANUP_IMMQ_LOADS(1)
pxor mm7, mm7
mov edx, ecx ; save ecx
and ecx, 0fffffffch ; clear lower two bits
jnz .L1
jmp .L2
.L1:
movq mm0, [esi] ; A R G B a r g b
pand mm0, mm6 ; 0 R G B 0 r g b
movq mm1, [esi+8] ; A R G B a r g b
pand mm1, mm6 ; 0 R G B 0 r g b
movq mm2, mm0 ; 0 R G B 0 r g b
punpckhdq mm2, mm7 ; 0 0 0 0 0 R G B
punpckldq mm0, mm7 ; 0 0 0 0 0 r g b
psllq mm2, 24 ; 0 0 R G B 0 0 0
por mm0, mm2 ; 0 0 R G B r g b
movq mm3, mm1 ; 0 R G B 0 r g b
psllq mm3, 48 ; g b 0 0 0 0 0 0
por mm0, mm3 ; g b R G B r g b
movq mm4, mm1 ; 0 R G B 0 r g b
punpckhdq mm4, mm7 ; 0 0 0 0 0 R G B
punpckldq mm1, mm7 ; 0 0 0 0 0 r g b
psrlq mm1, 16 ; 0 0 0 R G B 0 r
psllq mm4, 8 ; 0 0 0 0 R G B 0
por mm1, mm4 ; 0 0 0 0 R G B r
movq [edi], mm0
add esi, BYTE 16
movd [edi+8], mm1
add edi, BYTE 12
sub ecx, BYTE 4
jnz .L1
.L2:
mov ecx, edx
and ecx, BYTE 3
jz .L4
.L3:
mov al, [esi]
mov bl, [esi+1]
mov dl, [esi+2]
mov [edi], al
mov [edi+1], bl
mov [edi+2], dl
add esi, BYTE 4
add edi, BYTE 3
dec ecx
jnz .L3
.L4:
return
_ConvertMMXpII32_16RGB565:
; set up masks
load_immq mm5, mmx32_rgb565_b
load_immq mm6, mmx32_rgb565_g
load_immq mm7, mmx32_rgb565_r
CLEANUP_IMMQ_LOADS(3)
mov edx, ecx
shr ecx, 2
jnz .L1
jmp .L2 ; not necessary at the moment, but doesn't hurt (much)
.L1:
movq mm0, [esi] ; argb
movq mm1, mm0 ; argb
pand mm0, mm6 ; 00g0
movq mm3, mm1 ; argb
pand mm1, mm5 ; 000b
pand mm3, mm7 ; 0r00
pslld mm1, 2 ; 0 0 000000bb bbb00000
por mm0, mm1 ; 0 0 ggggggbb bbb00000
psrld mm0, 5 ; 0 0 00000ggg gggbbbbb
movq mm4, [esi+8] ; argb
movq mm2, mm4 ; argb
pand mm4, mm6 ; 00g0
movq mm1, mm2 ; argb
pand mm2, mm5 ; 000b
pand mm1, mm7 ; 0r00
pslld mm2, 2 ; 0 0 000000bb bbb00000
por mm4, mm2 ; 0 0 ggggggbb bbb00000
psrld mm4, 5 ; 0 0 00000ggg gggbbbbb
packuswb mm3, mm1 ; R 0 r 0
packssdw mm0, mm4 ; as above.. ish
por mm0, mm3 ; done.
movq [edi], mm0
add esi, 16
add edi, 8
dec ecx
jnz .L1
.L2:
mov ecx, edx
and ecx, BYTE 3
jz .L4
.L3:
mov al, [esi]
mov bh, [esi+1]
mov ah, [esi+2]
shr al, 3
and eax, 0F81Fh ; BYTE?
shr ebx, 5
and ebx, 07E0h ; BYTE?
add eax, ebx
mov [edi], al
mov [edi+1], ah
add esi, BYTE 4
add edi, BYTE 2
dec ecx
jnz .L3
.L4:
retn
_ConvertMMXpII32_16BGR565:
load_immq mm5, mmx32_rgb565_r
load_immq mm6, mmx32_rgb565_g
load_immq mm7, mmx32_rgb565_b
CLEANUP_IMMQ_LOADS(3)
mov edx, ecx
shr ecx, 2
jnz .L1
jmp .L2
.L1:
movq mm0, [esi] ; a r g b
movq mm1, mm0 ; a r g b
pand mm0, mm6 ; 0 0 g 0
movq mm3, mm1 ; a r g b
pand mm1, mm5 ; 0 r 0 0
pand mm3, mm7 ; 0 0 0 b
psllq mm3, 16 ; 0 b 0 0
psrld mm1, 14 ; 0 0 000000rr rrr00000
por mm0, mm1 ; 0 0 ggggggrr rrr00000
psrld mm0, 5 ; 0 0 00000ggg gggrrrrr
movq mm4, [esi+8] ; a r g b
movq mm2, mm4 ; a r g b
pand mm4, mm6 ; 0 0 g 0
movq mm1, mm2 ; a r g b
pand mm2, mm5 ; 0 r 0 0
pand mm1, mm7 ; 0 0 0 b
psllq mm1, 16 ; 0 b 0 0
psrld mm2, 14 ; 0 0 000000rr rrr00000
por mm4, mm2 ; 0 0 ggggggrr rrr00000
psrld mm4, 5 ; 0 0 00000ggg gggrrrrr
packuswb mm3, mm1 ; BBBBB000 00000000 bbbbb000 00000000
packssdw mm0, mm4 ; 00000GGG GGGRRRRR 00000GGG GGGRRRRR
por mm0, mm3 ; BBBBBGGG GGGRRRRR bbbbbggg gggrrrrr
movq [edi], mm0
add esi, BYTE 16
add edi, BYTE 8
dec ecx
jnz .L1
.L2:
and edx, BYTE 3
jz .L4
.L3:
mov al, [esi+2]
mov bh, [esi+1]
mov ah, [esi]
shr al, 3
and eax, 0F81Fh ; BYTE ?
shr ebx, 5
and ebx, 07E0h ; BYTE ?
add eax, ebx
mov [edi], al
mov [edi+1], ah
add esi, BYTE 4
add edi, BYTE 2
dec edx
jnz .L3
.L4:
retn
_ConvertMMXpII32_16BGR555:
; the 16BGR555 converter is identical to the RGB555 one,
; except it uses a different multiplier for the pmaddwd
; instruction. cool huh.
load_immq mm7, mmx32_bgr555_mul
jmp _convert_bgr555_cheat
; This is the same as the Intel version.. they obviously went to
; much more trouble to expand/coil the loop than I did, so theirs
; would almost certainly be faster, even if only a little.
; I did rename 'mmx32_rgb555_add' to 'mmx32_rgb555_mul', which is
; (I think) a more accurate name..
_ConvertMMXpII32_16RGB555:
load_immq mm7, mmx32_rgb555_mul
_convert_bgr555_cheat:
load_immq mm6, mmx32_rgb555_g
CLEANUP_IMMQ_LOADS(2)
mov edx,ecx ; Save ecx
and ecx,DWORD 0fffffff8h ; clear lower three bits
jnz .L_OK
jmp near .L2
.L_OK:
movq mm2,[esi+8]
movq mm0,[esi]
movq mm3,mm2
pand_immq mm3, mmx32_rgb555_rb
movq mm1,mm0
pand_immq mm1, mmx32_rgb555_rb
pmaddwd mm3,mm7
CLEANUP_IMMQ_LOADS(2)
pmaddwd mm1,mm7
pand mm2,mm6
.L1:
movq mm4,[esi+24]
pand mm0,mm6
movq mm5,[esi+16]
por mm3,mm2
psrld mm3,6
por mm1,mm0
movq mm0,mm4
psrld mm1,6
pand_immq mm0, mmx32_rgb555_rb
packssdw mm1,mm3
movq mm3,mm5
pmaddwd mm0,mm7
pand_immq mm3, mmx32_rgb555_rb
pand mm4,mm6
movq [edi],mm1
pmaddwd mm3,mm7
add esi,BYTE 32
por mm4,mm0
pand mm5,mm6
psrld mm4,6
movq mm2,[esi+8]
por mm5,mm3
movq mm0,[esi]
psrld mm5,6
movq mm3,mm2
movq mm1,mm0
pand_immq mm3, mmx32_rgb555_rb
packssdw mm5,mm4
pand_immq mm1, mmx32_rgb555_rb
pand mm2,mm6
CLEANUP_IMMQ_LOADS(4)
movq [edi+8],mm5
pmaddwd mm3,mm7
pmaddwd mm1,mm7
add edi,BYTE 16
sub ecx,BYTE 8
jz .L2
jmp .L1
.L2:
mov ecx,edx
and ecx,BYTE 7
jz .L4
.L3:
mov ebx,[esi]
add esi,BYTE 4
mov eax,ebx
mov edx,ebx
shr eax,3
shr edx,6
and eax,BYTE 0000000000011111b
and edx, 0000001111100000b
shr ebx,9
or eax,edx
and ebx, 0111110000000000b
or eax,ebx
mov [edi],ax
add edi,BYTE 2
dec ecx
jnz .L3
.L4:
retn
%ifidn __OUTPUT_FORMAT__,elf
section .note.GNU-stack noalloc noexec nowrite progbits
%endif