| /* |
| * (C) Copyright IBM Corporation 2004 |
| * All Rights Reserved. |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a |
| * copy of this software and associated documentation files (the "Software"), |
| * to deal in the Software without restriction, including without limitation |
 * the rights to use, copy, modify, merge, publish, distribute, sub
| * license, and/or sell copies of the Software, and to permit persons to whom |
| * the Software is furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice (including the next |
| * paragraph) shall be included in all copies or substantial portions of the |
| * Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL |
| * IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, |
| * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR |
| * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE |
| * USE OR OTHER DEALINGS IN THE SOFTWARE. |
| */ |
| |
| /** |
| * \file read_rgba_span_x86.S |
| * Optimized routines to transfer pixel data from the framebuffer to a |
| * buffer in main memory. |
| * |
| * \author Ian Romanick <idr@us.ibm.com> |
| */ |
| |
| .file "read_rgba_span_x86.S" |
| #if !defined(__DJGPP__) && !defined(__MINGW32__) && !defined(__APPLE__) /* this one cries for assyntax.h */ |
| /* Kevin F. Quinn 2nd July 2006 |
| * Replaced data segment constants with text-segment instructions. |
| */ |
| #define LOAD_MASK(mvins,m1,m2) \ |
| pushl $0xff00ff00 ;\ |
| pushl $0xff00ff00 ;\ |
| pushl $0xff00ff00 ;\ |
| pushl $0xff00ff00 ;\ |
| mvins (%esp), m1 ;\ |
| pushl $0x00ff0000 ;\ |
| pushl $0x00ff0000 ;\ |
| pushl $0x00ff0000 ;\ |
| pushl $0x00ff0000 ;\ |
| mvins (%esp), m2 ;\ |
| addl $32, %esp |
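
/* After LOAD_MASK, every 32-bit lane of m1 holds 0xff00ff00 (selecting
 * the green and alpha bytes, which stay in place) and every lane of m2
 * holds 0x00ff0000 (isolating byte 2, where red starts out and where
 * blue ends up).  Building the masks on the stack avoids the need for
 * a data segment.
 */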
| |
| /* I implemented these as macros because they appear in several places, |
| * and I've tweaked them a number of times. I got tired of changing every |
| * place they appear. :) |
| */ |
| |
| #define DO_ONE_PIXEL() \ |
| movl (%ebx), %eax ; \ |
| addl $4, %ebx ; \ |
| bswap %eax /* ARGB -> BGRA */ ; \ |
| rorl $8, %eax /* BGRA -> ABGR */ ; \ |
| movl %eax, (%ecx) /* ABGR -> R, G, B, A */ ; \ |
| addl $4, %ecx |
| |
| #define DO_ONE_LAST_PIXEL() \ |
| movl (%ebx), %eax ; \ |
| bswap %eax /* ARGB -> BGRA */ ; \ |
| rorl $8, %eax /* BGRA -> ABGR */ ; \ |
	movl	%eax, (%ecx)	/* ABGR -> R, G, B, A */
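
/* For reference, the bswap/ror pair in these macros amounts to the
 * following C fragment on a little-endian host (a sketch only;
 * __builtin_bswap32 is the GCC/Clang spelling of bswap):
 *
 *	uint32_t x = *src++;		// 0xAARRGGBB in a register
 *	x = __builtin_bswap32(x);	// ARGB -> BGRA: 0xBBGGRRAA
 *	x = (x >> 8) | (x << 24);	// rotate right 8: 0xAABBGGRR
 *	*dst++ = x;			// stored to memory as R, G, B, A
 */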
| |
| |
| /** |
| * MMX optimized version of the BGRA8888_REV to RGBA copy routine. |
| * |
| * \warning |
| * This function assumes that the caller will issue the EMMS instruction |
| * at the correct places. |
| */ |
| |
| .globl _generic_read_RGBA_span_BGRA8888_REV_MMX |
| #ifndef USE_DRICORE |
| .hidden _generic_read_RGBA_span_BGRA8888_REV_MMX |
| #endif |
| .type _generic_read_RGBA_span_BGRA8888_REV_MMX, @function |
| _generic_read_RGBA_span_BGRA8888_REV_MMX: |
| pushl %ebx |
| |
| #ifdef USE_INNER_EMMS |
| emms |
| #endif |
| LOAD_MASK(movq,%mm1,%mm2) |
| |
| movl 8(%esp), %ebx /* source pointer */ |
| movl 16(%esp), %edx /* number of pixels to copy */ |
| movl 12(%esp), %ecx /* destination pointer */ |
| |
| testl %edx, %edx |
| jle .L20 /* Bail if there's nothing to do. */ |
| |
| movl %ebx, %eax |
| |
| negl %eax |
| sarl $2, %eax |
| andl $1, %eax |
| je .L17 |
| |
| subl %eax, %edx |
| DO_ONE_PIXEL() |
| .L17: |
| |
| /* Would it be faster to unroll this loop once and process 4 pixels |
| * per pass, instead of just two? |
| */ |
| |
| movl %edx, %eax |
| shrl %eax |
| jmp .L18 |
| .L19: |
| movq (%ebx), %mm0 |
| addl $8, %ebx |
| |
	/* These 9 instructions do what PSHUFB (not available until SSSE3,
	 * well after this was written) could do in 1. :(
	 */
| |
| movq %mm0, %mm3 |
| movq %mm0, %mm4 |
| |
| pand %mm2, %mm3 |
| psllq $16, %mm4 |
| psrlq $16, %mm3 |
| pand %mm2, %mm4 |
| |
| pand %mm1, %mm0 |
| por %mm4, %mm3 |
| por %mm3, %mm0 |
| |
| movq %mm0, (%ecx) |
| addl $8, %ecx |
| subl $1, %eax |
| .L18: |
| jne .L19 |
| |
| #ifdef USE_INNER_EMMS |
| emms |
| #endif |
| |
	/* At this point there is at most 1 pixel remaining to be
	 * converted.  Convert the last pixel, if needed.
	 */
| |
| testl $1, %edx |
| je .L20 |
| |
| DO_ONE_LAST_PIXEL() |
| |
| .L20: |
| popl %ebx |
| ret |
| .size _generic_read_RGBA_span_BGRA8888_REV_MMX, .-_generic_read_RGBA_span_BGRA8888_REV_MMX |
| |
| |
| /** |
| * SSE optimized version of the BGRA8888_REV to RGBA copy routine. SSE |
| * instructions are only actually used to read data from the framebuffer. |
| * In practice, the speed-up is pretty small. |
| * |
| * \todo |
| * Do some more testing and determine if there's any reason to have this |
| * function in addition to the MMX version. |
| * |
| * \warning |
| * This function assumes that the caller will issue the EMMS instruction |
| * at the correct places. |
| */ |
| |
| .globl _generic_read_RGBA_span_BGRA8888_REV_SSE |
| #ifndef USE_DRICORE |
| .hidden _generic_read_RGBA_span_BGRA8888_REV_SSE |
| #endif |
| .type _generic_read_RGBA_span_BGRA8888_REV_SSE, @function |
| _generic_read_RGBA_span_BGRA8888_REV_SSE: |
| pushl %esi |
| pushl %ebx |
| pushl %ebp |
| |
| #ifdef USE_INNER_EMMS |
| emms |
| #endif |
| |
| LOAD_MASK(movq,%mm1,%mm2) |
| |
| movl 16(%esp), %ebx /* source pointer */ |
| movl 24(%esp), %edx /* number of pixels to copy */ |
| movl 20(%esp), %ecx /* destination pointer */ |
| |
| testl %edx, %edx |
| jle .L35 /* Bail if there's nothing to do. */ |
| |
	movl	%esp, %ebp		/* save the stack pointer */
	subl	$16, %esp		/* reserve 16 bytes of scratch space... */
	andl	$0xfffffff0, %esp	/* ...16-byte aligned for the movaps below */
| |
| movl %ebx, %eax |
| movl %edx, %esi |
| |
	negl	%eax
	andl	$15, %eax		/* bytes until the source is 16-byte aligned */
	sarl	$2, %eax		/* ...converted to pixels */
	cmpl	%edx, %eax
	cmovle	%eax, %esi		/* %esi = min(alignment pixels, total pixels) */
| |
| subl %esi, %edx |
| |
| testl $1, %esi |
| je .L32 |
| |
| DO_ONE_PIXEL() |
| .L32: |
| |
| testl $2, %esi |
| je .L31 |
| |
| movq (%ebx), %mm0 |
| addl $8, %ebx |
| |
| movq %mm0, %mm3 |
| movq %mm0, %mm4 |
| |
| pand %mm2, %mm3 |
| psllq $16, %mm4 |
| psrlq $16, %mm3 |
| pand %mm2, %mm4 |
| |
| pand %mm1, %mm0 |
| por %mm4, %mm3 |
| por %mm3, %mm0 |
| |
| movq %mm0, (%ecx) |
| addl $8, %ecx |
| .L31: |
| |
| movl %edx, %eax |
| shrl $2, %eax |
| jmp .L33 |
| .L34: |
| movaps (%ebx), %xmm0 |
| addl $16, %ebx |
| |
| /* This would be so much better if we could just move directly from |
| * an SSE register to an MMX register. Unfortunately, that |
| * functionality wasn't introduced until SSE2 with the MOVDQ2Q |
| * instruction. |
| */ |
| |
| movaps %xmm0, (%esp) |
| movq (%esp), %mm0 |
| movq 8(%esp), %mm5 |
| |
| movq %mm0, %mm3 |
| movq %mm0, %mm4 |
| movq %mm5, %mm6 |
| movq %mm5, %mm7 |
| |
| pand %mm2, %mm3 |
| pand %mm2, %mm6 |
| |
| psllq $16, %mm4 |
| psllq $16, %mm7 |
| |
| psrlq $16, %mm3 |
| psrlq $16, %mm6 |
| |
| pand %mm2, %mm4 |
| pand %mm2, %mm7 |
| |
| pand %mm1, %mm0 |
| pand %mm1, %mm5 |
| |
| por %mm4, %mm3 |
| por %mm7, %mm6 |
| |
| por %mm3, %mm0 |
| por %mm6, %mm5 |
| |
| movq %mm0, (%ecx) |
| movq %mm5, 8(%ecx) |
| addl $16, %ecx |
| |
| subl $1, %eax |
| .L33: |
| jne .L34 |
| |
| #ifdef USE_INNER_EMMS |
| emms |
| #endif |
| movl %ebp, %esp |
| |
	/* At this point there are between 0 and 3 pixels remaining to be
	 * converted.
	 */
| |
| testl $2, %edx |
| je .L36 |
| |
| movq (%ebx), %mm0 |
| addl $8, %ebx |
| |
| movq %mm0, %mm3 |
| movq %mm0, %mm4 |
| |
| pand %mm2, %mm3 |
| psllq $16, %mm4 |
| psrlq $16, %mm3 |
| pand %mm2, %mm4 |
| |
| pand %mm1, %mm0 |
| por %mm4, %mm3 |
| por %mm3, %mm0 |
| |
| movq %mm0, (%ecx) |
| addl $8, %ecx |
| .L36: |
| |
| testl $1, %edx |
| je .L35 |
| |
| DO_ONE_LAST_PIXEL() |
| .L35: |
| popl %ebp |
| popl %ebx |
| popl %esi |
| ret |
| .size _generic_read_RGBA_span_BGRA8888_REV_SSE, .-_generic_read_RGBA_span_BGRA8888_REV_SSE |
| |
| |
| /** |
| * SSE2 optimized version of the BGRA8888_REV to RGBA copy routine. |
| */ |
| |
| .text |
| .globl _generic_read_RGBA_span_BGRA8888_REV_SSE2 |
| #ifndef USE_DRICORE |
| .hidden _generic_read_RGBA_span_BGRA8888_REV_SSE2 |
| #endif |
| .type _generic_read_RGBA_span_BGRA8888_REV_SSE2, @function |
| _generic_read_RGBA_span_BGRA8888_REV_SSE2: |
| pushl %esi |
| pushl %ebx |
| |
| LOAD_MASK(movdqu,%xmm1,%xmm2) |
| |
| movl 12(%esp), %ebx /* source pointer */ |
| movl 20(%esp), %edx /* number of pixels to copy */ |
| movl 16(%esp), %ecx /* destination pointer */ |
| |
| movl %ebx, %eax |
| movl %edx, %esi |
| |
| testl %edx, %edx |
| jle .L46 /* Bail if there's nothing to do. */ |
| |
	/* If the source pointer isn't 16-byte aligned we have to process
	 * a few pixels the "slow" way to get the address aligned for
	 * the SSE fetch instructions.
	 */
| |
| negl %eax |
| andl $15, %eax |
| sarl $2, %eax |
| |
| cmpl %edx, %eax |
| cmovbe %eax, %esi |
| subl %esi, %edx |
| |
| testl $1, %esi |
| je .L41 |
| |
| DO_ONE_PIXEL() |
| .L41: |
| testl $2, %esi |
| je .L40 |
| |
| movq (%ebx), %xmm0 |
| addl $8, %ebx |
| |
| movdqa %xmm0, %xmm3 |
| movdqa %xmm0, %xmm4 |
| andps %xmm1, %xmm0 |
| |
| andps %xmm2, %xmm3 |
| pslldq $2, %xmm4 |
| psrldq $2, %xmm3 |
| andps %xmm2, %xmm4 |
| |
| orps %xmm4, %xmm3 |
| orps %xmm3, %xmm0 |
| |
| movq %xmm0, (%ecx) |
| addl $8, %ecx |
| .L40: |
| |
	/* Would it be worth having a specialized version of this loop for
	 * the case where the destination is 16-byte aligned?  That version
	 * would be identical except that it could use movdqa instead of
	 * movdqu for the store.
	 */
| |
| movl %edx, %eax |
| shrl $2, %eax |
| jmp .L42 |
| .L43: |
| movdqa (%ebx), %xmm0 |
| addl $16, %ebx |
| |
| movdqa %xmm0, %xmm3 |
| movdqa %xmm0, %xmm4 |
| andps %xmm1, %xmm0 |
| |
| andps %xmm2, %xmm3 |
| pslldq $2, %xmm4 |
| psrldq $2, %xmm3 |
| andps %xmm2, %xmm4 |
| |
| orps %xmm4, %xmm3 |
| orps %xmm3, %xmm0 |
| |
| movdqu %xmm0, (%ecx) |
| addl $16, %ecx |
| subl $1, %eax |
| .L42: |
| jne .L43 |
| |
| |
	/* There may be up to 3 pixels remaining to be copied.  Take care
	 * of them now.  We do the 2 pixel case first because the data
	 * will be aligned.
	 */
| |
| testl $2, %edx |
| je .L47 |
| |
| movq (%ebx), %xmm0 |
| addl $8, %ebx |
| |
| movdqa %xmm0, %xmm3 |
| movdqa %xmm0, %xmm4 |
| andps %xmm1, %xmm0 |
| |
| andps %xmm2, %xmm3 |
| pslldq $2, %xmm4 |
| psrldq $2, %xmm3 |
| andps %xmm2, %xmm4 |
| |
| orps %xmm4, %xmm3 |
| orps %xmm3, %xmm0 |
| |
| movq %xmm0, (%ecx) |
| addl $8, %ecx |
| .L47: |
| |
| testl $1, %edx |
| je .L46 |
| |
| DO_ONE_LAST_PIXEL() |
| .L46: |
| |
| popl %ebx |
| popl %esi |
| ret |
| .size _generic_read_RGBA_span_BGRA8888_REV_SSE2, .-_generic_read_RGBA_span_BGRA8888_REV_SSE2 |
| |
| |
| |
| #define MASK_565_L 0x07e0f800 |
| #define MASK_565_H 0x0000001f |
| /* Setting SCALE_ADJUST to 5 gives a perfect match with the |
| * classic C implementation in Mesa. Setting SCALE_ADJUST |
| * to 0 is slightly faster but at a small cost to accuracy. |
| */ |
| #define SCALE_ADJUST 5 |
| #if SCALE_ADJUST == 5 |
| #define PRESCALE_L 0x00100001 |
| #define PRESCALE_H 0x00000200 |
| #define SCALE_L 0x40C620E8 |
| #define SCALE_H 0x0000839d |
| #elif SCALE_ADJUST == 0 |
| #define PRESCALE_L 0x00200001 |
| #define PRESCALE_H 0x00000800 |
| #define SCALE_L 0x01040108 |
| #define SCALE_H 0x00000108 |
| #else |
| #error SCALE_ADJUST must either be 5 or 0. |
| #endif |
| #define ALPHA_L 0x00000000 |
| #define ALPHA_H 0x00ff0000 |
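
/* For reference, each 16-bit lane of the conversion below computes, in
 * C terms (a sketch of the pmullw/psrlw/pmulhuw sequence; "c" is the
 * masked component still in its RGB565 bit position, "prescale" and
 * "scale" are the matching lanes of the constants above):
 *
 *	uint16_t t = (uint16_t)(c * prescale) >> SCALE_ADJUST;
 *	uint16_t out = (uint32_t)t * scale >> 16;
 *
 * which works out to approximately c * 255 / 31 for the 5-bit
 * components and c * 255 / 63 for 6-bit green.
 */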
| |
| /** |
| * MMX optimized version of the RGB565 to RGBA copy routine. |
| */ |
| |
| .text |
| .globl _generic_read_RGBA_span_RGB565_MMX |
| #ifndef USE_DRICORE |
| .hidden _generic_read_RGBA_span_RGB565_MMX |
| #endif |
| .type _generic_read_RGBA_span_RGB565_MMX, @function |
| |
| _generic_read_RGBA_span_RGB565_MMX: |
| |
| #ifdef USE_INNER_EMMS |
| emms |
| #endif |
| |
| movl 4(%esp), %eax /* source pointer */ |
| movl 8(%esp), %edx /* destination pointer */ |
| movl 12(%esp), %ecx /* number of pixels to copy */ |
| |
| pushl $MASK_565_H |
| pushl $MASK_565_L |
| movq (%esp), %mm5 |
| pushl $PRESCALE_H |
| pushl $PRESCALE_L |
| movq (%esp), %mm6 |
| pushl $SCALE_H |
| pushl $SCALE_L |
| movq (%esp), %mm7 |
| pushl $ALPHA_H |
| pushl $ALPHA_L |
| movq (%esp), %mm3 |
| addl $32,%esp |
| |
	sarl	$2, %ecx	/* convert the count to 4-pixel iterations */
	jl	.L01		/* Bail early if the count is negative. */
	jmp	.L02
| |
| .L03: |
| /* Fetch 4 RGB565 pixels into %mm4. Distribute the first and |
| * second pixels into the four words of %mm0 and %mm2. |
| */ |
| |
| movq (%eax), %mm4 |
| addl $8, %eax |
| |
| pshufw $0x00, %mm4, %mm0 |
| pshufw $0x55, %mm4, %mm2 |
| |
| |
| /* Mask the pixels so that each word of each register contains only |
| * one color component. |
| */ |
| |
| pand %mm5, %mm0 |
| pand %mm5, %mm2 |
| |
| |
| /* Adjust the component values so that they are as small as possible, |
| * but large enough so that we can multiply them by an unsigned 16-bit |
| * number and get a value as large as 0x00ff0000. |
| */ |
| |
| pmullw %mm6, %mm0 |
| pmullw %mm6, %mm2 |
| #if SCALE_ADJUST > 0 |
| psrlw $SCALE_ADJUST, %mm0 |
| psrlw $SCALE_ADJUST, %mm2 |
| #endif |
| |
	/* Scale the input component values to be in the range
	 * [0, 0x00ff0000].  This is the real magic of the whole routine.
	 */
| |
| pmulhuw %mm7, %mm0 |
| pmulhuw %mm7, %mm2 |
| |
| |
| /* Always set the alpha value to 0xff. |
| */ |
| |
| por %mm3, %mm0 |
| por %mm3, %mm2 |
| |
| |
| /* Pack the 16-bit values to 8-bit values and store the converted |
| * pixel data. |
| */ |
| |
| packuswb %mm2, %mm0 |
| movq %mm0, (%edx) |
| addl $8, %edx |
| |
| pshufw $0xaa, %mm4, %mm0 |
| pshufw $0xff, %mm4, %mm2 |
| |
| pand %mm5, %mm0 |
| pand %mm5, %mm2 |
| pmullw %mm6, %mm0 |
| pmullw %mm6, %mm2 |
| #if SCALE_ADJUST > 0 |
| psrlw $SCALE_ADJUST, %mm0 |
| psrlw $SCALE_ADJUST, %mm2 |
| #endif |
| pmulhuw %mm7, %mm0 |
| pmulhuw %mm7, %mm2 |
| |
| por %mm3, %mm0 |
| por %mm3, %mm2 |
| |
| packuswb %mm2, %mm0 |
| |
| movq %mm0, (%edx) |
| addl $8, %edx |
| |
| subl $1, %ecx |
| .L02: |
| jne .L03 |
| |
| |
	/* At this point there can be at most 3 pixels left to process.  If
	 * either 2 or 3 are left, process 2 of them now.
	 */
| |
| movl 12(%esp), %ecx |
| testl $0x02, %ecx |
| je .L04 |
| |
| movd (%eax), %mm4 |
| addl $4, %eax |
| |
| pshufw $0x00, %mm4, %mm0 |
| pshufw $0x55, %mm4, %mm2 |
| |
| pand %mm5, %mm0 |
| pand %mm5, %mm2 |
| pmullw %mm6, %mm0 |
| pmullw %mm6, %mm2 |
| #if SCALE_ADJUST > 0 |
| psrlw $SCALE_ADJUST, %mm0 |
| psrlw $SCALE_ADJUST, %mm2 |
| #endif |
| pmulhuw %mm7, %mm0 |
| pmulhuw %mm7, %mm2 |
| |
| por %mm3, %mm0 |
| por %mm3, %mm2 |
| |
| packuswb %mm2, %mm0 |
| |
| movq %mm0, (%edx) |
| addl $8, %edx |
| |
| .L04: |
| /* At this point there can be at most 1 pixel left to process. |
| * Process it if needed. |
| */ |
| |
| testl $0x01, %ecx |
| je .L01 |
| |
| movzwl (%eax), %ecx |
| movd %ecx, %mm4 |
| |
| pshufw $0x00, %mm4, %mm0 |
| |
| pand %mm5, %mm0 |
| pmullw %mm6, %mm0 |
| #if SCALE_ADJUST > 0 |
| psrlw $SCALE_ADJUST, %mm0 |
| #endif |
| pmulhuw %mm7, %mm0 |
| |
| por %mm3, %mm0 |
| |
| packuswb %mm0, %mm0 |
| |
| movd %mm0, (%edx) |
| |
| .L01: |
| #ifdef USE_INNER_EMMS |
| emms |
| #endif |
| ret |
| #endif /* !defined(__DJGPP__) && !defined(__MINGW32__) && !defined(__APPLE__) */ |
| |
| #if defined (__ELF__) && defined (__linux__) |
| .section .note.GNU-stack,"",%progbits |
| #endif |