2 * (C) Copyright IBM Corporation 2004
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * on the rights to use, copy, modify, merge, publish, distribute, sub
9 * license, and/or sell copies of the Software, and to permit persons to whom
10 * the Software is furnished to do so, subject to the following conditions:
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19 * IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22 * USE OR OTHER DEALINGS IN THE SOFTWARE.
26 * \file read_rgba_span_x86.S
27 * Optimized routines to transfer pixel data from the framebuffer to a
28 * buffer in main memory.
30 * \author Ian Romanick <idr@us.ibm.com>
33 .file "read_rgba_span_x86.S"
34 #if !defined(__DJGPP__) && !defined(__MINGW32__) /* this one cries for assyntax.h */
35 /* Kevin F. Quinn 2nd July 2006
36 * Replace data segment constants with text-segment instructions
/* NOTE(review): this chunk appears to be a partial extraction of the file --
 * the body of LOAD_MASK and the closing of several comment blocks are not
 * visible here.  LOAD_MASK presumably materializes the byte-shuffle mask
 * into registers m1/m2 using the move instruction `mvins` (movq for MMX,
 * movdqu for SSE2, per the call sites below) -- TODO confirm against the
 * full file.
 */
51 #define LOAD_MASK(mvins,m1,m2) \
65 /* I implemented these as macros because they appear in quite a few places,
66 * and I've tweaked them a number of times. I got tired of changing every
67 * place they appear. :)
/* Convert one 32-bit BGRA pixel in %eax to RGBA byte order and store it at
 * (%ecx).  Only these three instructions of the macro are visible in this
 * extract; the pointer-advance / count-decrement lines are presumably part
 * of the full macro -- TODO confirm.
 */
70 #define DO_ONE_PIXEL() \
73 bswap %eax /* ARGB -> BGRA */ ; \
74 rorl $8, %eax /* BGRA -> ABGR */ ; \
75 movl %eax, (%ecx) /* ABGR -> R, G, B, A */ ; \
/* Same conversion as DO_ONE_PIXEL for the final pixel of a span; the
 * visible instructions are identical, so the difference (likely omitting
 * the pointer advance) must live in the lines not shown here.
 */
78 #define DO_ONE_LAST_PIXEL() \
80 bswap %eax /* ARGB -> BGRA */ ; \
81 rorl $8, %eax /* BGRA -> ABGR */ ; \
82 movl %eax, (%ecx) /* ABGR -> R, G, B, A */ ; \
86 * MMX optimized version of the BGRA8888_REV to RGBA copy routine.
89 * This function assumes that the caller will issue the EMMS instruction
90 * at the correct places.
/* C-equivalent (IA-32 cdecl, args on the stack):
 *   void _generic_read_RGBA_span_BGRA8888_REV_MMX(const uint32_t *src,
 *                                                 uint8_t *dest,
 *                                                 unsigned count);
 * The argument offsets below (8/12/16(%esp)) imply one extra 4-byte item
 * (presumably a saved callee-saved %ebx) was pushed before these loads;
 * the push itself is not visible in this extract -- TODO confirm.
 */
93 .globl _generic_read_RGBA_span_BGRA8888_REV_MMX
94 .hidden _generic_read_RGBA_span_BGRA8888_REV_MMX
95 .type _generic_read_RGBA_span_BGRA8888_REV_MMX, @function
96 _generic_read_RGBA_span_BGRA8888_REV_MMX:
102 /* Kevin F. Quinn 2nd July 2006
103 * Replace data segment constants with text-segment instructions
/* Load the byte-swizzle mask into %mm1/%mm2 via 64-bit movq. */
107 LOAD_MASK(movq,%mm1,%mm2)
109 movl 8(%esp), %ebx /* source pointer */
110 movl 16(%esp), %edx /* number of pixels to copy */
111 movl 12(%esp), %ecx /* destination pointer */
/* NOTE(review): the flag-setting test of %edx that this jle depends on is
 * not visible in this extract.
 */
114 jle .L20 /* Bail if there's nothing to do. */
127 /* Would it be faster to unroll this loop once and process 4 pixels
128 * per pass, instead of just two?
138 /* These 9 instructions do what PSHUFB (if there were such an
139 * instruction) could do in 1. :(
160 #ifdef USE_INNER_EMMS
164 /* At this point there are either 1 or 0 pixels remaining to be
165 * converted. Convert the last pixel, if needed.
176 .size _generic_read_RGBA_span_BGRA8888_REV_MMX, .-_generic_read_RGBA_span_BGRA8888_REV_MMX
180 * SSE optimized version of the BGRA8888_REV to RGBA copy routine. SSE
181 * instructions are only actually used to read data from the framebuffer.
182 * In practice, the speed-up is pretty small.
185 * Do some more testing and determine if there's any reason to have this
186 * function in addition to the MMX version.
189 * This function assumes that the caller will issue the EMMS instruction
190 * at the correct places.
/* C-equivalent (IA-32 cdecl):
 *   void _generic_read_RGBA_span_BGRA8888_REV_SSE(const uint32_t *src,
 *                                                 uint8_t *dest,
 *                                                 unsigned count);
 * Argument offsets 16/20/24(%esp) imply three extra 4-byte items (likely
 * saved registers / saved %esp) were pushed before these loads; the pushes
 * are not visible in this extract -- TODO confirm against the full file.
 */
193 .globl _generic_read_RGBA_span_BGRA8888_REV_SSE
194 .hidden _generic_read_RGBA_span_BGRA8888_REV_SSE
195 .type _generic_read_RGBA_span_BGRA8888_REV_SSE, @function
196 _generic_read_RGBA_span_BGRA8888_REV_SSE:
201 #ifdef USE_INNER_EMMS
204 /* Kevin F. Quinn 2nd July 2006
205 * Replace data segment constants with text-segment instructions
/* The swizzle constants live in MMX registers here too; SSE is used only
 * for the framebuffer reads, per the header comment above.
 */
209 LOAD_MASK(movq,%mm1,%mm2)
211 movl 16(%esp), %ebx /* source pointer */
212 movl 24(%esp), %edx /* number of pixels to copy */
213 movl 20(%esp), %ecx /* destination pointer */
216 jle .L35 /* Bail if there's nothing to do. */
/* Round %esp down to a 16-byte boundary (clears the low 4 bits) --
 * presumably so aligned SSE stores/loads can use the stack; the original
 * %esp must have been saved beforehand (not visible here) -- TODO confirm.
 */
220 andl $0xfffffff0, %esp
268 /* This would be so much better if we could just move directly from
269 * an SSE register to an MMX register. Unfortunately, that
270 * functionality wasn't introduced until SSE2 with the MOVDQ2Q
312 #ifdef USE_INNER_EMMS
317 /* At this point there are either [0, 3] pixels remaining to be
352 .size _generic_read_RGBA_span_BGRA8888_REV_SSE, .-_generic_read_RGBA_span_BGRA8888_REV_SSE
356 * SSE2 optimized version of the BGRA8888_REV to RGBA copy routine.
/* C-equivalent (IA-32 cdecl):
 *   void _generic_read_RGBA_span_BGRA8888_REV_SSE2(const uint32_t *src,
 *                                                  uint8_t *dest,
 *                                                  unsigned count);
 * Argument offsets 12/16/20(%esp) imply two extra 4-byte items were pushed
 * before these loads -- the pushes are not visible in this extract.
 */
360 .globl _generic_read_RGBA_span_BGRA8888_REV_SSE2
361 .hidden _generic_read_RGBA_span_BGRA8888_REV_SSE2
362 .type _generic_read_RGBA_span_BGRA8888_REV_SSE2, @function
363 _generic_read_RGBA_span_BGRA8888_REV_SSE2:
367 /* Kevin F. Quinn 2nd July 2006
368 * Replace data segment constants with text-segment instructions
/* NOTE(review): this `movdqa mask+16` still loads from a data-segment
 * symbol, contradicting the text-segment LOAD_MASK replacement on the next
 * line.  In the full file this line may be commented out or conditionally
 * compiled -- confirm before treating it as live code.
 */
370 movdqa mask+16, %xmm2
372 LOAD_MASK(movdqu,%xmm1,%xmm2)
374 movl 12(%esp), %ebx /* source pointer */
375 movl 20(%esp), %edx /* number of pixels to copy */
376 movl 16(%esp), %ecx /* destination pointer */
382 jle .L46 /* Bail if there's nothing to do. */
384 /* If the source pointer isn't a multiple of 16 we have to process
385 * a few pixels the "slow" way to get the address aligned for
386 * the SSE fetch instructions.
424 /* Would it be worth having a specialized version of this loop for
425 * the case where the destination is 16-byte aligned? That version
426 * would be identical except that it could use movdqa instead of
456 /* There may be up to 3 pixels remaining to be copied. Take care
457 * of them now. We do the 2 pixel case first because the data
490 .size _generic_read_RGBA_span_BGRA8888_REV_SSE2, .-_generic_read_RGBA_span_BGRA8888_REV_SSE2
494 /* Kevin F. Quinn 2nd July 2006
495 * Replace data segment constants with text-segment instructions
507 /* Setting SCALE_ADJUST to 5 gives a perfect match with the classic C
508 * implementation in Mesa. Setting SCALE_ADJUST to 0 is slightly faster but
509 * at a small cost to accuracy.
512 #define SCALE_ADJUST 5
513 #if SCALE_ADJUST == 5
/* Per-component 16-bit multipliers that scale the (pre-shifted) 5/6/5-bit
 * R/G/B fields up toward 0x00ff0000, as the inline comments show.
 */
521 .word 0x20e8 /* (0x00ff0000 / 0x000007c0) + 1 */
522 .word 0x40c5 /* (0x00ff0000 / 0x000003f0) + 1 */
523 .word 0x839d /* (0x00ff0000 / 0x000001f0) + 1 */
525 #elif SCALE_ADJUST == 0
533 .word 0x0108 /* (0x00ff0000 / 0x0000f800) + 1 */
534 .word 0x0104 /* (0x00ff0000 / 0x0000fc00) + 1 */
535 .word 0x0108 /* (0x00ff0000 / 0x0000f800) + 1 */
538 #error SCALE_ADJUST must either be 5 or 0.
/* Alpha constant stored as data (old data-segment form). */
542 alpha: .long 0x00000000
/* Text-segment replacements for the data above: each *_L/*_H pair holds the
 * low/high 32 bits of a 64-bit MMX constant built with immediates.
 */
546 #define MASK_565_L 0x07e0f800
547 #define MASK_565_H 0x0000001f
/* NOTE(review): SCALE_ADJUST was already #defined at (extracted) line 512
 * above with no visible #undef between them -- in the full file the first
 * definition is probably inside a removed/conditional data-segment block;
 * confirm, since a duplicate #define of the same value is benign but a
 * mismatch would not be.
 */
548 #define SCALE_ADJUST 5
549 #if SCALE_ADJUST == 5
550 #define PRESCALE_L 0x00100001
551 #define PRESCALE_H 0x00000200
552 #define SCALE_L 0x40C620E8
553 #define SCALE_H 0x0000839d
554 #elif SCALE_ADJUST == 0
555 #define PRESCALE_L 0x00200001
556 #define PRESCALE_H 0x00000800
557 #define SCALE_L 0x01040108
558 #define SCALE_H 0x00000108
560 #error SCALE_ADJUST must either be 5 or 0.
/* Alpha forced to 0xff in the third 16-bit lane (0x00ff0000 in the high
 * dword), matching the `alpha` data word above.
 */
562 #define ALPHA_L 0x00000000
563 #define ALPHA_H 0x00ff0000
566 * MMX optimized version of the RGB565 to RGBA copy routine.
/* C-equivalent (IA-32 cdecl):
 *   void _generic_read_RGBA_span_RGB565_MMX(const uint16_t *src,
 *                                           uint8_t *dest, unsigned count);
 * Unlike the BGRA routines above, the argument offsets start at 4(%esp),
 * so nothing is pushed first: only caller-saved %eax/%edx/%ecx are used
 * for the pointers and count.
 */
570 .globl _generic_read_RGBA_span_RGB565_MMX
571 .hidden _generic_read_RGBA_span_RGB565_MMX
572 .type _generic_read_RGBA_span_RGB565_MMX, @function
574 _generic_read_RGBA_span_RGB565_MMX:
576 #ifdef USE_INNER_EMMS
580 movl 4(%esp), %eax /* source pointer */
581 movl 8(%esp), %edx /* destination pointer */
582 movl 12(%esp), %ecx /* number of pixels to copy */
584 /* Kevin F. Quinn 2nd July 2006
585 * Replace data segment constants with text-segment instructions
605 jle .L01 /* Bail early if the count is negative. */
609 /* Fetch 4 RGB565 pixels into %mm4. Distribute the first and
610 * second pixels into the four words of %mm0 and %mm2.
/* pshufw selector 0x00 broadcasts word 0 of %mm4 (pixel 1) to all four
 * words of %mm0; 0x55 broadcasts word 1 (pixel 2) into %mm2.
 */
616 pshufw $0x00, %mm4, %mm0
617 pshufw $0x55, %mm4, %mm2
620 /* Mask the pixels so that each word of each register contains only
621 * one color component.
628 /* Adjust the component values so that they are as small as possible,
629 * but large enough so that we can multiply them by an unsigned 16-bit
630 * number and get a value as large as 0x00ff0000.
636 psrlw $SCALE_ADJUST, %mm0
637 psrlw $SCALE_ADJUST, %mm2
640 /* Scale the input component values to be on the range
641 * [0, 0x00ff0000]. This is the real magic of the whole routine.
648 /* Always set the alpha value to 0xff.
651 /* Kevin F. Quinn 2nd July 2006
652 * Replace data segment constants with text-segment instructions
660 /* Pack the 16-bit values to 8-bit values and store the converted
/* Selectors 0xaa / 0xff broadcast words 2 and 3 of %mm4 -- pixels 3 and 4
 * of the quad fetched above.
 */
670 pshufw $0xaa, %mm4, %mm0
671 pshufw $0xff, %mm4, %mm2
678 psrlw $SCALE_ADJUST, %mm0
679 psrlw $SCALE_ADJUST, %mm2
684 /* Kevin F. Quinn 2nd July 2006
685 * Replace data segment constants with text-segment instructions
702 /* At this point there can be at most 3 pixels left to process. If
703 * there is either 2 or 3 left, process 2.
713 pshufw $0x00, %mm4, %mm0
714 pshufw $0x55, %mm4, %mm2
721 psrlw $SCALE_ADJUST, %mm0
722 psrlw $SCALE_ADJUST, %mm2
727 /* Kevin F. Quinn 2nd July 2006
728 * Replace data segment constants with text-segment instructions
741 /* At this point there can be at most 1 pixel left to process.
742 * Process it if needed.
751 pshufw $0x00, %mm4, %mm0
756 psrlw $SCALE_ADJUST, %mm0
760 /* Kevin F. Quinn 2nd July 2006
761 * Replace data segment constants with text-segment instructions
771 #ifdef USE_INNER_EMMS
775 #endif /* !defined(__DJGPP__) && !defined(__MINGW32__) */
/* Mark the object as not requiring an executable stack on ELF/Linux so the
 * linker does not force one for the whole program.  The matching #endif is
 * not visible in this extract -- presumably it follows on the next line.
 */
777 #if defined (__ELF__) && defined (__linux__)
778 .section .note.GNU-stack,"",%progbits