| /* |
| * (C) Copyright IBM Corporation 2004 |
| * All Rights Reserved. |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a |
| * copy of this software and associated documentation files (the "Software"), |
| * to deal in the Software without restriction, including without limitation |
 * the rights to use, copy, modify, merge, publish, distribute, sub
| * license, and/or sell copies of the Software, and to permit persons to whom |
| * the Software is furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice (including the next |
| * paragraph) shall be included in all copies or substantial portions of the |
| * Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL |
| * IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, |
| * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR |
| * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE |
| * USE OR OTHER DEALINGS IN THE SOFTWARE. |
| */ |
| |
| /** |
| * \file read_rgba_span_x86.S |
| * Optimized routines to transfer pixel data from the framebuffer to a |
| * buffer in main memory. |
| * |
| * \author Ian Romanick <idr@us.ibm.com> |
| */ |
| |
| .file "read_rgba_span_x86.S" |
| #if !defined(__DJGPP__) && !defined(__MINGW32__) && !defined(__APPLE__) /* this one cries for assyntax.h */ |
| /* Kevin F. Quinn 2nd July 2006 |
| * Replaced data segment constants with text-segment instructions. |
| */ |
| #define LOAD_MASK(mvins,m1,m2) \ |
| pushl $0xff00ff00 ;\ |
| pushl $0xff00ff00 ;\ |
| pushl $0xff00ff00 ;\ |
| pushl $0xff00ff00 ;\ |
| mvins (%esp), m1 ;\ |
| pushl $0x00ff0000 ;\ |
| pushl $0x00ff0000 ;\ |
| pushl $0x00ff0000 ;\ |
| pushl $0x00ff0000 ;\ |
| mvins (%esp), m2 ;\ |
| addl $32, %esp |
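
/* After LOAD_MASK, every 32-bit lane of m1 holds 0xff00ff00 (selecting
 * the green and alpha bytes, which stay in place) and every lane of m2
 * holds 0x00ff0000 (isolating byte 2, where red starts out and where
 * blue ends up).  Building the masks on the stack avoids the need for
 * a data segment.
 */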
| |
| /* I implemented these as macros because they appear in several places, |
| * and I've tweaked them a number of times. I got tired of changing every |
| * place they appear. :) |
| */ |
| |
| #define DO_ONE_PIXEL() \ |
| movl (%ebx), %eax ; \ |
| addl $4, %ebx ; \ |
| bswap %eax /* ARGB -> BGRA */ ; \ |
| rorl $8, %eax /* BGRA -> ABGR */ ; \ |
| movl %eax, (%ecx) /* ABGR -> R, G, B, A */ ; \ |
| addl $4, %ecx |
| |
| #define DO_ONE_LAST_PIXEL() \ |
| movl (%ebx), %eax ; \ |
| bswap %eax /* ARGB -> BGRA */ ; \ |
| rorl $8, %eax /* BGRA -> ABGR */ ; \ |
	movl	%eax, (%ecx)	/* ABGR -> R, G, B, A */
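
/* For reference, the bswap/ror pair in these macros amounts to the
 * following C fragment on a little-endian host (a sketch only;
 * __builtin_bswap32 is the GCC/Clang spelling of bswap):
 *
 *	uint32_t x = *src++;		// 0xAARRGGBB in a register
 *	x = __builtin_bswap32(x);	// ARGB -> BGRA: 0xBBGGRRAA
 *	x = (x >> 8) | (x << 24);	// rotate right 8: 0xAABBGGRR
 *	*dst++ = x;			// stored to memory as R, G, B, A
 */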
| |
| |
| /** |
| * MMX optimized version of the BGRA8888_REV to RGBA copy routine. |
| * |
| * \warning |
| * This function assumes that the caller will issue the EMMS instruction |
| * at the correct places. |
| */ |
| |
| .globl _generic_read_RGBA_span_BGRA8888_REV_MMX |
| #ifndef USE_DRICORE |
| .hidden _generic_read_RGBA_span_BGRA8888_REV_MMX |
| #endif |
| .type _generic_read_RGBA_span_BGRA8888_REV_MMX, @function |
| _generic_read_RGBA_span_BGRA8888_REV_MMX: |
| pushl %ebx |
| |
| #ifdef USE_INNER_EMMS |
| emms |
| #endif |
| LOAD_MASK(movq,%mm1,%mm2) |
| |
| movl 8(%esp), %ebx /* source pointer */ |
| movl 16(%esp), %edx /* number of pixels to copy */ |
| movl 12(%esp), %ecx /* destination pointer */ |
| |
| testl %edx, %edx |
| jle .L20 /* Bail if there's nothing to do. */ |
| |
| movl %ebx, %eax |
| |
| negl %eax |
| sarl $2, %eax |
| andl $1, %eax |
| je .L17 |
| |
| subl %eax, %edx |
| DO_ONE_PIXEL() |
| .L17: |
| |
| /* Would it be faster to unroll this loop once and process 4 pixels |
| * per pass, instead of just two? |
| */ |
| |
| movl %edx, %eax |
| shrl %eax |
| jmp .L18 |
| .L19: |
| movq (%ebx), %mm0 |
| addl $8, %ebx |
| |
	/* These 9 instructions do what PSHUFB (not available until SSSE3,
	 * well after this was written) could do in 1. :(
	 */
| |
| movq %mm0, %mm3 |
| movq %mm0, %mm4 |
| |
| pand %mm2, %mm3 |
| psllq $16, %mm4 |
| psrlq $16, %mm3 |
| pand %mm2, %mm4 |
| |
| pand %mm1, %mm0 |
| por %mm4, %mm3 |
| por %mm3, %mm0 |
| |
| movq %mm0, (%ecx) |
| addl $8, %ecx |
| subl $1, %eax |
| .L18: |
| jne .L19 |
| |
| #ifdef USE_INNER_EMMS |
| emms |
| #endif |
| |
	/* At this point there is at most 1 pixel remaining to be
	 * converted.  Convert the last pixel, if needed.
	 */
| |
| testl $1, %edx |
| je .L20 |
| |
| DO_ONE_LAST_PIXEL() |
| |
| .L20: |
| popl %ebx |
| ret |
| .size _generic_read_RGBA_span_BGRA8888_REV_MMX, .-_generic_read_RGBA_span_BGRA8888_REV_MMX |
| |
| |
| /** |
| * SSE optimized version of the BGRA8888_REV to RGBA copy routine. SSE |
| * instructions are only actually used to read data from the framebuffer. |
| * In practice, the speed-up is pretty small. |
| * |
| * \todo |
| * Do some more testing and determine if there's any reason to have this |
| * function in addition to the MMX version. |
| * |
| * \warning |
| * This function assumes that the caller will issue the EMMS instruction |
| * at the correct places. |
| */ |
| |
| .globl _generic_read_RGBA_span_BGRA8888_REV_SSE |
| #ifndef USE_DRICORE |
| .hidden _generic_read_RGBA_span_BGRA8888_REV_SSE |
| #endif |
| .type _generic_read_RGBA_span_BGRA8888_REV_SSE, @function |
| _generic_read_RGBA_span_BGRA8888_REV_SSE: |
| pushl %esi |
| pushl %ebx |
| pushl %ebp |
| |
| #ifdef USE_INNER_EMMS |
| emms |
| #endif |
| |
| LOAD_MASK(movq,%mm1,%mm2) |
| |
| movl 16(%esp), %ebx /* source pointer */ |
| movl 24(%esp), %edx /* number of pixels to copy */ |
| movl 20(%esp), %ecx /* destination pointer */ |
| |
| testl %edx, %edx |
| jle .L35 /* Bail if there's nothing to do. */ |
| |
	movl	%esp, %ebp		/* save the stack pointer */
	subl	$16, %esp		/* reserve 16 bytes of scratch space... */
	andl	$0xfffffff0, %esp	/* ...16-byte aligned for the movaps below */
| |
| movl %ebx, %eax |
| movl %edx, %esi |
| |
	negl	%eax
	andl	$15, %eax		/* bytes until the source is 16-byte aligned */
	sarl	$2, %eax		/* ...converted to pixels */
	cmpl	%edx, %eax
	cmovle	%eax, %esi		/* %esi = min(alignment pixels, total pixels) */
| |
| subl %esi, %edx |
| |
| testl $1, %esi |
| je .L32 |
| |
| DO_ONE_PIXEL() |
| .L32: |
| |
| testl $2, %esi |
| je .L31 |
| |
| movq (%ebx), %mm0 |
| addl $8, %ebx |
| |
| movq %mm0, %mm3 |
| movq %mm0, %mm4 |
| |
| pand %mm2, %mm3 |
| psllq $16, %mm4 |
| psrlq $16, %mm3 |
| pand %mm2, %mm4 |
| |
| pand %mm1, %mm0 |
| por %mm4, %mm3 |
| por %mm3, %mm0 |
| |
| movq %mm0, (%ecx) |
| addl $8, %ecx |
| .L31: |
| |
| movl %edx, %eax |
| shrl $2, %eax |
| jmp .L33 |
| .L34: |
| movaps (%ebx), %xmm0 |
| addl $16, %ebx |
| |
| /* This would be so much better if we could just move directly from |
| * an SSE register to an MMX register. Unfortunately, that |
| * functionality wasn't introduced until SSE2 with the MOVDQ2Q |
| * instruction. |
| */ |
| |
| movaps %xmm0, (%esp) |
| movq (%esp), %mm0 |
| movq 8(%esp), %mm5 |
| |
| movq %mm0, %mm3 |
| movq %mm0, %mm4 |
| movq %mm5, %mm6 |
| movq %mm5, %mm7 |
| |
| pand %mm2, %mm3 |
| pand %mm2, %mm6 |
| |
| psllq $16, %mm4 |
| psllq $16, %mm7 |
| |
| psrlq $16, %mm3 |
| psrlq $16, %mm6 |
| |
| pand %mm2, %mm4 |
| pand %mm2, %mm7 |
| |
| pand %mm1, %mm0 |
| pand %mm1, %mm5 |
| |
| por %mm4, %mm3 |
| por %mm7, %mm6 |
| |
| por %mm3, %mm0 |
| por %mm6, %mm5 |
| |
| movq %mm0, (%ecx) |
| movq %mm5, 8(%ecx) |
| addl $16, %ecx |
| |
| subl $1, %eax |
| .L33: |
| jne .L34 |
| |
| #ifdef USE_INNER_EMMS |
| emms |
| #endif |
| movl %ebp, %esp |
| |
	/* At this point there are between 0 and 3 pixels remaining to be
	 * converted.
	 */
| |
| testl $2, %edx |
| je .L36 |
| |
| movq (%ebx), %mm0 |
| addl $8, %ebx |
| |
| movq %mm0, %mm3 |
| movq %mm0, %mm4 |
| |
| pand %mm2, %mm3 |
| psllq $16, %mm4 |
| psrlq $16, %mm3 |
| pand %mm2, %mm4 |
| |
| pand %mm1, %mm0 |
| por %mm4, %mm3 |
| por %mm3, %mm0 |
| |
| movq %mm0, (%ecx) |
| addl $8, %ecx |
| .L36: |
| |
| testl $1, %edx |
| je .L35 |
| |
| DO_ONE_LAST_PIXEL() |
| .L35: |
| popl %ebp |
| popl %ebx |
| popl %esi |
| ret |
| .size _generic_read_RGBA_span_BGRA8888_REV_SSE, .-_generic_read_RGBA_span_BGRA8888_REV_SSE |
| |
| |
| /** |
| * SSE2 optimized version of the BGRA8888_REV to RGBA copy routine. |
| */ |
| |
| .text |
| .globl _generic_read_RGBA_span_BGRA8888_REV_SSE2 |
| #ifndef USE_DRICORE |
| .hidden _generic_read_RGBA_span_BGRA8888_REV_SSE2 |
| #endif |
| .type _generic_read_RGBA_span_BGRA8888_REV_SSE2, @function |
| _generic_read_RGBA_span_BGRA8888_REV_SSE2: |
| pushl %esi |
| pushl %ebx |
| |
| LOAD_MASK(movdqu,%xmm1,%xmm2) |
| |
| movl 12(%esp), %ebx /* source pointer */ |
| movl 20(%esp), %edx /* number of pixels to copy */ |
| movl 16(%esp), %ecx /* destination pointer */ |
| |
| movl %ebx, %eax |
| movl %edx, %esi |
| |
| testl %edx, %edx |
| jle .L46 /* Bail if there's nothing to do. */ |
| |
	/* If the source pointer isn't 16-byte aligned we have to process
	 * a few pixels the "slow" way to get the address aligned for
	 * the SSE fetch instructions.
	 */
| |
| negl %eax |
| andl $15, %eax |
| sarl $2, %eax |
| |
| cmpl %edx, %eax |
| cmovbe %eax, %esi |
| subl %esi, %edx |
| |
| testl $1, %esi |
| je .L41 |
| |
| DO_ONE_PIXEL() |
| .L41: |
| testl $2, %esi |
| je .L40 |
| |
| movq (%ebx), %xmm0 |
| addl $8, %ebx |
| |
| movdqa %xmm0, %xmm3 |
| movdqa %xmm0, %xmm4 |
| andps %xmm1, %xmm0 |
| |
| andps %xmm2, %xmm3 |
| pslldq $2, %xmm4 |
| psrldq $2, %xmm3 |
| andps %xmm2, %xmm4 |
| |
| orps %xmm4, %xmm3 |
| orps %xmm3, %xmm0 |
| |
| movq %xmm0, (%ecx) |
| addl $8, %ecx |
| .L40: |
| |
	/* Would it be worth having a specialized version of this loop for
	 * the case where the destination is 16-byte aligned?  That version
	 * would be identical except that it could use movdqa instead of
	 * movdqu for the store.
	 */
| |
| movl %edx, %eax |
| shrl $2, %eax |
| jmp .L42 |
| .L43: |
| movdqa (%ebx), %xmm0 |
| addl $16, %ebx |
| |
| movdqa %xmm0, %xmm3 |
| movdqa %xmm0, %xmm4 |
| andps %xmm1, %xmm0 |
| |
| andps %xmm2, %xmm3 |
| pslldq $2, %xmm4 |
| psrldq $2, %xmm3 |
| andps %xmm2, %xmm4 |
| |
| orps %xmm4, %xmm3 |
| orps %xmm3, %xmm0 |
| |
| movdqu %xmm0, (%ecx) |
| addl $16, %ecx |
| subl $1, %eax |
| .L42: |
| jne .L43 |
| |
| |
	/* There may be up to 3 pixels remaining to be copied.  Take care
	 * of them now.  We do the 2 pixel case first because the data
	 * will be aligned.
	 */
| |
| testl $2, %edx |
| je .L47 |
| |
| movq (%ebx), %xmm0 |
| addl $8, %ebx |
| |
| movdqa %xmm0, %xmm3 |
| movdqa %xmm0, %xmm4 |
| andps %xmm1, %xmm0 |
| |
| andps %xmm2, %xmm3 |
| pslldq $2, %xmm4 |
| psrldq $2, %xmm3 |
| andps %xmm2, %xmm4 |
| |
| orps %xmm4, %xmm3 |
| orps %xmm3, %xmm0 |
| |
| movq %xmm0, (%ecx) |
| addl $8, %ecx |
| .L47: |
| |
| testl $1, %edx |
| je .L46 |
| |
| DO_ONE_LAST_PIXEL() |
| .L46: |
| |
| popl %ebx |
| popl %esi |
| ret |
| .size _generic_read_RGBA_span_BGRA8888_REV_SSE2, .-_generic_read_RGBA_span_BGRA8888_REV_SSE2 |
| |
| |
| |
| #define MASK_565_L 0x07e0f800 |
| #define MASK_565_H 0x0000001f |
| /* Setting SCALE_ADJUST to 5 gives a perfect match with the |
| * classic C implementation in Mesa. Setting SCALE_ADJUST |
| * to 0 is slightly faster but at a small cost to accuracy. |
| */ |
| #define SCALE_ADJUST 5 |
| #if SCALE_ADJUST == 5 |
| #define PRESCALE_L 0x00100001 |
| #define PRESCALE_H 0x00000200 |
| #define SCALE_L 0x40C620E8 |
| #define SCALE_H 0x0000839d |
| #elif SCALE_ADJUST == 0 |
| #define PRESCALE_L 0x00200001 |
| #define PRESCALE_H 0x00000800 |
| #define SCALE_L 0x01040108 |
| #define SCALE_H 0x00000108 |
| #else |
| #error SCALE_ADJUST must either be 5 or 0. |
| #endif |
| #define ALPHA_L 0x00000000 |
| #define ALPHA_H 0x00ff0000 |
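
/* For reference, each 16-bit lane of the conversion below computes, in
 * C terms (a sketch of the pmullw/psrlw/pmulhuw sequence; "c" is the
 * masked component still in its RGB565 bit position, "prescale" and
 * "scale" are the matching lanes of the constants above):
 *
 *	uint16_t t = (uint16_t)(c * prescale) >> SCALE_ADJUST;
 *	uint16_t out = (uint32_t)t * scale >> 16;
 *
 * which works out to approximately c * 255 / 31 for the 5-bit
 * components and c * 255 / 63 for 6-bit green.
 */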
| |
| /** |
| * MMX optimized version of the RGB565 to RGBA copy routine. |
| */ |
| |
| .text |
| .globl _generic_read_RGBA_span_RGB565_MMX |
| #ifndef USE_DRICORE |
| .hidden _generic_read_RGBA_span_RGB565_MMX |
| #endif |
| .type _generic_read_RGBA_span_RGB565_MMX, @function |
| |
| _generic_read_RGBA_span_RGB565_MMX: |
| |
| #ifdef USE_INNER_EMMS |
| emms |
| #endif |
| |
| movl 4(%esp), %eax /* source pointer */ |
| movl 8(%esp), %edx /* destination pointer */ |
| movl 12(%esp), %ecx /* number of pixels to copy */ |
| |
| pushl $MASK_565_H |
| pushl $MASK_565_L |
| movq (%esp), %mm5 |
| pushl $PRESCALE_H |
| pushl $PRESCALE_L |
| movq (%esp), %mm6 |
| pushl $SCALE_H |
| pushl $SCALE_L |
| movq (%esp), %mm7 |
| pushl $ALPHA_H |
| pushl $ALPHA_L |
| movq (%esp), %mm3 |
| addl $32,%esp |
| |
	sarl	$2, %ecx	/* convert the count to 4-pixel iterations */
	jl	.L01		/* Bail early if the count is negative. */
	jmp	.L02
| |
| .L03: |
| /* Fetch 4 RGB565 pixels into %mm4. Distribute the first and |
| * second pixels into the four words of %mm0 and %mm2. |
| */ |
| |
| movq (%eax), %mm4 |
| addl $8, %eax |
| |
| pshufw $0x00, %mm4, %mm0 |
| pshufw $0x55, %mm4, %mm2 |
| |
| |
| /* Mask the pixels so that each word of each register contains only |
| * one color component. |
| */ |
| |
| pand %mm5, %mm0 |
| pand %mm5, %mm2 |
| |
| |
| /* Adjust the component values so that they are as small as possible, |
| * but large enough so that we can multiply them by an unsigned 16-bit |
| * number and get a value as large as 0x00ff0000. |
| */ |
| |
| pmullw %mm6, %mm0 |
| pmullw %mm6, %mm2 |
| #if SCALE_ADJUST > 0 |
| psrlw $SCALE_ADJUST, %mm0 |
| psrlw $SCALE_ADJUST, %mm2 |
| #endif |
| |
	/* Scale the input component values to be in the range
	 * [0, 0x00ff0000].  This is the real magic of the whole routine.
	 */
| |
| pmulhuw %mm7, %mm0 |
| pmulhuw %mm7, %mm2 |
| |
| |
| /* Always set the alpha value to 0xff. |
| */ |
| |
| por %mm3, %mm0 |
| por %mm3, %mm2 |
| |
| |
| /* Pack the 16-bit values to 8-bit values and store the converted |
| * pixel data. |
| */ |
| |
| packuswb %mm2, %mm0 |
| movq %mm0, (%edx) |
| addl $8, %edx |
| |
| pshufw $0xaa, %mm4, %mm0 |
| pshufw $0xff, %mm4, %mm2 |
| |
| pand %mm5, %mm0 |
| pand %mm5, %mm2 |
| pmullw %mm6, %mm0 |
| pmullw %mm6, %mm2 |
| #if SCALE_ADJUST > 0 |
| psrlw $SCALE_ADJUST, %mm0 |
| psrlw $SCALE_ADJUST, %mm2 |
| #endif |
| pmulhuw %mm7, %mm0 |
| pmulhuw %mm7, %mm2 |
| |
| por %mm3, %mm0 |
| por %mm3, %mm2 |
| |
| packuswb %mm2, %mm0 |
| |
| movq %mm0, (%edx) |
| addl $8, %edx |
| |
| subl $1, %ecx |
| .L02: |
| jne .L03 |
| |
| |
	/* At this point there can be at most 3 pixels left to process.  If
	 * either 2 or 3 are left, process 2 of them now.
	 */
| |
| movl 12(%esp), %ecx |
| testl $0x02, %ecx |
| je .L04 |
| |
| movd (%eax), %mm4 |
| addl $4, %eax |
| |
| pshufw $0x00, %mm4, %mm0 |
| pshufw $0x55, %mm4, %mm2 |
| |
| pand %mm5, %mm0 |
| pand %mm5, %mm2 |
| pmullw %mm6, %mm0 |
| pmullw %mm6, %mm2 |
| #if SCALE_ADJUST > 0 |
| psrlw $SCALE_ADJUST, %mm0 |
| psrlw $SCALE_ADJUST, %mm2 |
| #endif |
| pmulhuw %mm7, %mm0 |
| pmulhuw %mm7, %mm2 |
| |
| por %mm3, %mm0 |
| por %mm3, %mm2 |
| |
| packuswb %mm2, %mm0 |
| |
| movq %mm0, (%edx) |
| addl $8, %edx |
| |
| .L04: |
| /* At this point there can be at most 1 pixel left to process. |
| * Process it if needed. |
| */ |
| |
| testl $0x01, %ecx |
| je .L01 |
| |
| movzwl (%eax), %ecx |
| movd %ecx, %mm4 |
| |
| pshufw $0x00, %mm4, %mm0 |
| |
| pand %mm5, %mm0 |
| pmullw %mm6, %mm0 |
| #if SCALE_ADJUST > 0 |
| psrlw $SCALE_ADJUST, %mm0 |
| #endif |
| pmulhuw %mm7, %mm0 |
| |
| por %mm3, %mm0 |
| |
| packuswb %mm0, %mm0 |
| |
| movd %mm0, (%edx) |
| |
| .L01: |
| #ifdef USE_INNER_EMMS |
| emms |
| #endif |
| ret |
| #endif /* !defined(__DJGPP__) && !defined(__MINGW32__) && !defined(__APPLE__) */ |
| |
| #if defined (__ELF__) && defined (__linux__) |
| .section .note.GNU-stack,"",%progbits |
| #endif |