/*
 * (C) Copyright IBM Corporation 2004
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
 * IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

/**
 * \file read_rgba_span_x86.S
 * Optimized routines to transfer pixel data from the framebuffer to a
 * buffer in main memory.
 *
 * \author Ian Romanick
 */

/* Control flow enforcement support */
#ifdef HAVE_CET_H
#include <cet.h>
#else
#define _CET_ENDBR
#endif

        .file "read_rgba_span_x86.S"

#if !defined(__MINGW32__) && !defined(__APPLE__) /* this one cries for assyntax.h */

/* Kevin F. Quinn 2nd July 2006
 * Replaced data segment constants with text-segment instructions.
 */
#define LOAD_MASK(mvins,m1,m2) \
        pushl   $0xff00ff00 ;\
        pushl   $0xff00ff00 ;\
        pushl   $0xff00ff00 ;\
        pushl   $0xff00ff00 ;\
        mvins   (%esp), m1 ;\
        pushl   $0x00ff0000 ;\
        pushl   $0x00ff0000 ;\
        pushl   $0x00ff0000 ;\
        pushl   $0x00ff0000 ;\
        mvins   (%esp), m2 ;\
        addl    $32, %esp

/* I implemented these as macros because they appear in several places,
 * and I've tweaked them a number of times.  I got tired of changing every
 * place they appear. :)
 */

#define DO_ONE_PIXEL() \
        movl    (%ebx), %eax ; \
        addl    $4, %ebx ; \
        bswap   %eax          /* ARGB -> BGRA */ ; \
        rorl    $8, %eax      /* BGRA -> ABGR */ ; \
        movl    %eax, (%ecx)  /* ABGR -> R, G, B, A */ ; \
        addl    $4, %ecx

#define DO_ONE_LAST_PIXEL() \
        movl    (%ebx), %eax ; \
        bswap   %eax          /* ARGB -> BGRA */ ; \
        rorl    $8, %eax      /* BGRA -> ABGR */ ; \
        movl    %eax, (%ecx)  /* ABGR -> R, G, B, A */ ;
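
/* For reference, DO_ONE_PIXEL performs roughly the following C (a sketch;
 * src and dst are hypothetical byte pointers standing in for %ebx and %ecx):
 *
 *     uint32_t p = *(const uint32_t *) src;    register holds 0xAARRGGBB (A,R,G,B)
 *     p = __builtin_bswap32(p);                now 0xBBGGRRAA (B,G,R,A)
 *     p = (p >> 8) | (p << 24);                rotate right 8 -> 0xAABBGGRR (A,B,G,R)
 *     *(uint32_t *) dst = p;                   stored little-endian as bytes R, G, B, A
 */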

/**
 * MMX optimized version of the BGRA8888_REV to RGBA copy routine.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
 */

.globl _generic_read_RGBA_span_BGRA8888_REV_MMX
.hidden _generic_read_RGBA_span_BGRA8888_REV_MMX
        .type _generic_read_RGBA_span_BGRA8888_REV_MMX, @function
_generic_read_RGBA_span_BGRA8888_REV_MMX:
        _CET_ENDBR
        pushl   %ebx

#ifdef USE_INNER_EMMS
        emms
#endif
        LOAD_MASK(movq,%mm1,%mm2)

        movl    8(%esp), %ebx   /* source pointer */
        movl    16(%esp), %edx  /* number of pixels to copy */
        movl    12(%esp), %ecx  /* destination pointer */

        testl   %edx, %edx
        jle     .L20            /* Bail if there's nothing to do. */

        movl    %ebx, %eax

        negl    %eax
        sarl    $2, %eax
        andl    $1, %eax
        je      .L17

        subl    %eax, %edx
        DO_ONE_PIXEL()
.L17:

        /* Would it be faster to unroll this loop once and process 4 pixels
         * per pass, instead of just two?
         */

        movl    %edx, %eax
        shrl    %eax
        jmp     .L18
.L19:
        movq    (%ebx), %mm0
        addl    $8, %ebx

        /* These 9 instructions do what PSHUFB (if there were such an
         * instruction) could do in 1. :(
         */

        movq    %mm0, %mm3
        movq    %mm0, %mm4

        pand    %mm2, %mm3
        psllq   $16, %mm4
        psrlq   $16, %mm3
        pand    %mm2, %mm4

        pand    %mm1, %mm0
        por     %mm4, %mm3
        por     %mm3, %mm0

        movq    %mm0, (%ecx)
        addl    $8, %ecx
        subl    $1, %eax
.L18:
        jne     .L19

#ifdef USE_INNER_EMMS
        emms
#endif

        /* At this point there are either 1 or 0 pixels remaining to be
         * converted.  Convert the last pixel, if needed.
         */

        testl   $1, %edx
        je      .L20

        DO_ONE_LAST_PIXEL()

.L20:
        popl    %ebx
        ret
        .size   _generic_read_RGBA_span_BGRA8888_REV_MMX, .-_generic_read_RGBA_span_BGRA8888_REV_MMX
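
/* The mask/shift sequence in the loop above (and in the SSE and SSE2
 * variants below) applies the same swizzle as DO_ONE_PIXEL to two or four
 * pixels at a time.  In C terms (a sketch, one 32-bit pixel shown; the
 * 64-bit/128-bit shifts also move bits across pixel boundaries, but the
 * following AND with the 0x00ff0000 mask discards them again):
 *
 *     uint32_t argb = src[i];                        0xAARRGGBB
 *     uint32_t ag   =  argb & 0xff00ff00;            A and G stay in place
 *     uint32_t r    = (argb & 0x00ff0000) >> 16;     R moves to byte 0
 *     uint32_t b    = (argb << 16) & 0x00ff0000;     B moves to byte 2
 *     dst[i] = ag | r | b;                           0xAABBGGRR -> bytes R, G, B, A
 */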

/**
 * SSE optimized version of the BGRA8888_REV to RGBA copy routine.  SSE
 * instructions are only actually used to read data from the framebuffer.
 * In practice, the speed-up is pretty small.
 *
 * \todo
 * Do some more testing and determine if there's any reason to have this
 * function in addition to the MMX version.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
 */

.globl _generic_read_RGBA_span_BGRA8888_REV_SSE
.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE
        .type _generic_read_RGBA_span_BGRA8888_REV_SSE, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE:
        _CET_ENDBR
        pushl   %esi
        pushl   %ebx
        pushl   %ebp

#ifdef USE_INNER_EMMS
        emms
#endif
        LOAD_MASK(movq,%mm1,%mm2)

        movl    16(%esp), %ebx  /* source pointer */
        movl    24(%esp), %edx  /* number of pixels to copy */
        movl    20(%esp), %ecx  /* destination pointer */

        testl   %edx, %edx
        jle     .L35            /* Bail if there's nothing to do. */

        movl    %esp, %ebp
        subl    $16, %esp
        andl    $0xfffffff0, %esp

        movl    %ebx, %eax
        movl    %edx, %esi

        negl    %eax
        andl    $15, %eax
        sarl    $2, %eax
        cmpl    %edx, %eax
        cmovle  %eax, %esi

        subl    %esi, %edx

        testl   $1, %esi
        je      .L32

        DO_ONE_PIXEL()
.L32:
        testl   $2, %esi
        je      .L31

        movq    (%ebx), %mm0
        addl    $8, %ebx

        movq    %mm0, %mm3
        movq    %mm0, %mm4

        pand    %mm2, %mm3
        psllq   $16, %mm4
        psrlq   $16, %mm3
        pand    %mm2, %mm4

        pand    %mm1, %mm0
        por     %mm4, %mm3
        por     %mm3, %mm0

        movq    %mm0, (%ecx)
        addl    $8, %ecx
.L31:

        movl    %edx, %eax
        shrl    $2, %eax
        jmp     .L33
.L34:
        movaps  (%ebx), %xmm0
        addl    $16, %ebx

        /* This would be so much better if we could just move directly from
         * an SSE register to an MMX register.  Unfortunately, that
         * functionality wasn't introduced until SSE2 with the MOVDQ2Q
         * instruction.
         */

        movaps  %xmm0, (%esp)
        movq    (%esp), %mm0
        movq    8(%esp), %mm5

        movq    %mm0, %mm3
        movq    %mm0, %mm4
        movq    %mm5, %mm6
        movq    %mm5, %mm7

        pand    %mm2, %mm3
        pand    %mm2, %mm6

        psllq   $16, %mm4
        psllq   $16, %mm7

        psrlq   $16, %mm3
        psrlq   $16, %mm6

        pand    %mm2, %mm4
        pand    %mm2, %mm7

        pand    %mm1, %mm0
        pand    %mm1, %mm5

        por     %mm4, %mm3
        por     %mm7, %mm6

        por     %mm3, %mm0
        por     %mm6, %mm5

        movq    %mm0, (%ecx)
        movq    %mm5, 8(%ecx)
        addl    $16, %ecx

        subl    $1, %eax
.L33:
        jne     .L34

#ifdef USE_INNER_EMMS
        emms
#endif
        movl    %ebp, %esp

        /* At this point there are either [0, 3] pixels remaining to be
         * converted.
         */

        testl   $2, %edx
        je      .L36

        movq    (%ebx), %mm0
        addl    $8, %ebx

        movq    %mm0, %mm3
        movq    %mm0, %mm4

        pand    %mm2, %mm3
        psllq   $16, %mm4
        psrlq   $16, %mm3
        pand    %mm2, %mm4

        pand    %mm1, %mm0
        por     %mm4, %mm3
        por     %mm3, %mm0

        movq    %mm0, (%ecx)
        addl    $8, %ecx
.L36:

        testl   $1, %edx
        je      .L35

        DO_ONE_LAST_PIXEL()
.L35:
        popl    %ebp
        popl    %ebx
        popl    %esi
        ret
        .size   _generic_read_RGBA_span_BGRA8888_REV_SSE, .-_generic_read_RGBA_span_BGRA8888_REV_SSE

/**
 * SSE2 optimized version of the BGRA8888_REV to RGBA copy routine.
 */

        .text
.globl _generic_read_RGBA_span_BGRA8888_REV_SSE2
.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE2
        .type _generic_read_RGBA_span_BGRA8888_REV_SSE2, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE2:
        _CET_ENDBR
        pushl   %esi
        pushl   %ebx

        LOAD_MASK(movdqu,%xmm1,%xmm2)

        movl    12(%esp), %ebx  /* source pointer */
        movl    20(%esp), %edx  /* number of pixels to copy */
        movl    16(%esp), %ecx  /* destination pointer */

        movl    %ebx, %eax
        movl    %edx, %esi

        testl   %edx, %edx
        jle     .L46            /* Bail if there's nothing to do. */

        /* If the source pointer isn't a multiple of 16 we have to process
         * a few pixels the "slow" way to get the address aligned for
         * the SSE fetch instructions.
         */

        negl    %eax
        andl    $15, %eax
        sarl    $2, %eax

        cmpl    %edx, %eax
        cmovbe  %eax, %esi
        subl    %esi, %edx

        testl   $1, %esi
        je      .L41

        DO_ONE_PIXEL()
.L41:
        testl   $2, %esi
        je      .L40

        movq    (%ebx), %xmm0
        addl    $8, %ebx

        movdqa  %xmm0, %xmm3
        movdqa  %xmm0, %xmm4

        andps   %xmm1, %xmm0
        andps   %xmm2, %xmm3
        pslldq  $2, %xmm4
        psrldq  $2, %xmm3
        andps   %xmm2, %xmm4

        orps    %xmm4, %xmm3
        orps    %xmm3, %xmm0

        movq    %xmm0, (%ecx)
        addl    $8, %ecx
.L40:

        /* Would it be worth having a specialized version of this loop for
         * the case where the destination is 16-byte aligned?  That version
         * would be identical except that it could use movdqa instead of
         * movdqu.
         */

        movl    %edx, %eax
        shrl    $2, %eax
        jmp     .L42
.L43:
        movdqa  (%ebx), %xmm0
        addl    $16, %ebx

        movdqa  %xmm0, %xmm3
        movdqa  %xmm0, %xmm4

        andps   %xmm1, %xmm0
        andps   %xmm2, %xmm3
        pslldq  $2, %xmm4
        psrldq  $2, %xmm3
        andps   %xmm2, %xmm4

        orps    %xmm4, %xmm3
        orps    %xmm3, %xmm0

        movdqu  %xmm0, (%ecx)
        addl    $16, %ecx
        subl    $1, %eax
.L42:
        jne     .L43

        /* There may be up to 3 pixels remaining to be copied.  Take care
         * of them now.  We do the 2 pixel case first because the data
         * will be aligned.
         */

        testl   $2, %edx
        je      .L47

        movq    (%ebx), %xmm0
        addl    $8, %ebx

        movdqa  %xmm0, %xmm3
        movdqa  %xmm0, %xmm4

        andps   %xmm1, %xmm0
        andps   %xmm2, %xmm3
        pslldq  $2, %xmm4
        psrldq  $2, %xmm3
        andps   %xmm2, %xmm4

        orps    %xmm4, %xmm3
        orps    %xmm3, %xmm0

        movq    %xmm0, (%ecx)
        addl    $8, %ecx
.L47:

        testl   $1, %edx
        je      .L46

        DO_ONE_LAST_PIXEL()
.L46:
        popl    %ebx
        popl    %esi
        ret
        .size   _generic_read_RGBA_span_BGRA8888_REV_SSE2, .-_generic_read_RGBA_span_BGRA8888_REV_SSE2


#define MASK_565_L      0x07e0f800
#define MASK_565_H      0x0000001f

/* Setting SCALE_ADJUST to 5 gives a perfect match with the
 * classic C implementation in Mesa.  Setting SCALE_ADJUST
 * to 0 is slightly faster but at a small cost to accuracy.
 */
#define SCALE_ADJUST    5
#if SCALE_ADJUST == 5
#define PRESCALE_L      0x00100001
#define PRESCALE_H      0x00000200
#define SCALE_L         0x40C620E8
#define SCALE_H         0x0000839d
#elif SCALE_ADJUST == 0
#define PRESCALE_L      0x00200001
#define PRESCALE_H      0x00000800
#define SCALE_L         0x01040108
#define SCALE_H         0x00000108
#else
#error SCALE_ADJUST must either be 5 or 0.
#endif

#define ALPHA_L         0x00000000
#define ALPHA_H         0x00ff0000
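
/* How the constants above are used (a sketch for SCALE_ADJUST == 5): after
 * the PSHUFW/PAND in the routine below, each 16-bit word of an MMX register
 * holds one masked component of one RGB565 pixel (R in word 0, G in word 1,
 * B in word 2, 0 in word 3).  PMULLW by PRESCALE and PSRLW by SCALE_ADJUST
 * reposition each component, and PMULHUW by SCALE keeps the high 16 bits of
 * an unsigned multiply, leaving the 8-bit expanded value in each word.  For
 * the red component this works out to roughly:
 *
 *     r5 = (pixel >> 11) & 0x1f;               5-bit red field
 *     r8 = ((r5 << 6) * 0x20E8) >> 16;         about r5 * 255 / 31
 *
 * The ALPHA constant then ORs 0xff into word 3, so packuswb packs the four
 * words into R, G, B, 0xff bytes.
 */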

/**
 * MMX optimized version of the RGB565 to RGBA copy routine.
 */

        .text
        .globl _generic_read_RGBA_span_RGB565_MMX
        .hidden _generic_read_RGBA_span_RGB565_MMX
        .type _generic_read_RGBA_span_RGB565_MMX, @function

_generic_read_RGBA_span_RGB565_MMX:
        _CET_ENDBR

#ifdef USE_INNER_EMMS
        emms
#endif

        movl    4(%esp), %eax   /* source pointer */
        movl    8(%esp), %edx   /* destination pointer */
        movl    12(%esp), %ecx  /* number of pixels to copy */

        pushl   $MASK_565_H
        pushl   $MASK_565_L
        movq    (%esp), %mm5
        pushl   $PRESCALE_H
        pushl   $PRESCALE_L
        movq    (%esp), %mm6
        pushl   $SCALE_H
        pushl   $SCALE_L
        movq    (%esp), %mm7
        pushl   $ALPHA_H
        pushl   $ALPHA_L
        movq    (%esp), %mm3
        addl    $32,%esp

        sarl    $2, %ecx
        jl      .L01            /* Bail early if the count is negative. */
        jmp     .L02

.L03:
        /* Fetch 4 RGB565 pixels into %mm4.  Distribute the first and
         * second pixels into the four words of %mm0 and %mm2.
         */

        movq    (%eax), %mm4
        addl    $8, %eax

        pshufw  $0x00, %mm4, %mm0
        pshufw  $0x55, %mm4, %mm2

        /* Mask the pixels so that each word of each register contains only
         * one color component.
         */

        pand    %mm5, %mm0
        pand    %mm5, %mm2

        /* Adjust the component values so that they are as small as possible,
         * but large enough so that we can multiply them by an unsigned 16-bit
         * number and get a value as large as 0x00ff0000.
         */

        pmullw  %mm6, %mm0
        pmullw  %mm6, %mm2
#if SCALE_ADJUST > 0
        psrlw   $SCALE_ADJUST, %mm0
        psrlw   $SCALE_ADJUST, %mm2
#endif

        /* Scale the input component values to be on the range
         * [0, 0x00ff0000].  This is the real magic of the whole routine.
         */

        pmulhuw %mm7, %mm0
        pmulhuw %mm7, %mm2

        /* Always set the alpha value to 0xff.
         */

        por     %mm3, %mm0
        por     %mm3, %mm2

        /* Pack the 16-bit values to 8-bit values and store the converted
         * pixel data.
         */

        packuswb        %mm2, %mm0
        movq    %mm0, (%edx)
        addl    $8, %edx

        pshufw  $0xaa, %mm4, %mm0
        pshufw  $0xff, %mm4, %mm2

        pand    %mm5, %mm0
        pand    %mm5, %mm2
        pmullw  %mm6, %mm0
        pmullw  %mm6, %mm2
#if SCALE_ADJUST > 0
        psrlw   $SCALE_ADJUST, %mm0
        psrlw   $SCALE_ADJUST, %mm2
#endif
        pmulhuw %mm7, %mm0
        pmulhuw %mm7, %mm2

        por     %mm3, %mm0
        por     %mm3, %mm2

        packuswb        %mm2, %mm0

        movq    %mm0, (%edx)
        addl    $8, %edx

        subl    $1, %ecx
.L02:
        jne     .L03

        /* At this point there can be at most 3 pixels left to process.  If
         * there is either 2 or 3 left, process 2.
         */

        movl    12(%esp), %ecx
        testl   $0x02, %ecx
        je      .L04

        movd    (%eax), %mm4
        addl    $4, %eax

        pshufw  $0x00, %mm4, %mm0
        pshufw  $0x55, %mm4, %mm2

        pand    %mm5, %mm0
        pand    %mm5, %mm2
        pmullw  %mm6, %mm0
        pmullw  %mm6, %mm2
#if SCALE_ADJUST > 0
        psrlw   $SCALE_ADJUST, %mm0
        psrlw   $SCALE_ADJUST, %mm2
#endif
        pmulhuw %mm7, %mm0
        pmulhuw %mm7, %mm2

        por     %mm3, %mm0
        por     %mm3, %mm2

        packuswb        %mm2, %mm0

        movq    %mm0, (%edx)
        addl    $8, %edx

.L04:
        /* At this point there can be at most 1 pixel left to process.
         * Process it if needed.
         */

        testl   $0x01, %ecx
        je      .L01

        movzwl  (%eax), %ecx
        movd    %ecx, %mm4

        pshufw  $0x00, %mm4, %mm0

        pand    %mm5, %mm0
        pmullw  %mm6, %mm0
#if SCALE_ADJUST > 0
        psrlw   $SCALE_ADJUST, %mm0
#endif
        pmulhuw %mm7, %mm0

        por     %mm3, %mm0

        packuswb        %mm0, %mm0

        movd    %mm0, (%edx)

.L01:
#ifdef USE_INNER_EMMS
        emms
#endif
        ret
#endif /* !defined(__MINGW32__) && !defined(__APPLE__) */

#if defined (__ELF__) && defined (__linux__)
        .section .note.GNU-stack,"",%progbits
#endif