*/
.file "read_rgba_span_x86.S"
- .section .rodata
- .align 16
- .type mask, @object
- .size mask, 32
-mask:
- .long 0xff00ff00
- .long 0xff00ff00
- .long 0xff00ff00
- .long 0xff00ff00
- .long 0x00ff0000
- .long 0x00ff0000
- .long 0x00ff0000
- .long 0x00ff0000
-
-
-/* I implemented these as macros because the appear in quite a few places,
+#if !defined(__DJGPP__) && !defined(__MINGW32__) && !defined(__APPLE__) /* this one cries for assyntax.h */
+/* Kevin F. Quinn 2nd July 2006
+ * Replaced data segment constants with text-segment instructions.
+ */
+#define LOAD_MASK(mvins,m1,m2) \
+ pushl $0xff00ff00 ;\
+ pushl $0xff00ff00 ;\
+ pushl $0xff00ff00 ;\
+ pushl $0xff00ff00 ;\
+ mvins (%esp), m1 ;\
+ pushl $0x00ff0000 ;\
+ pushl $0x00ff0000 ;\
+ pushl $0x00ff0000 ;\
+ pushl $0x00ff0000 ;\
+ mvins (%esp), m2 ;\
+ addl $32, %esp
+
+/* I implemented these as macros because they appear in several places,
* and I've tweaked them a number of times. I got tired of changing every
* place they appear. :)
*/
*/
.globl _generic_read_RGBA_span_BGRA8888_REV_MMX
+#ifndef USE_DRICORE
+.hidden _generic_read_RGBA_span_BGRA8888_REV_MMX
+#endif
.type _generic_read_RGBA_span_BGRA8888_REV_MMX, @function
_generic_read_RGBA_span_BGRA8888_REV_MMX:
pushl %ebx
#ifdef USE_INNER_EMMS
emms
#endif
- movq mask, %mm1
- movq mask+16, %mm2
+ LOAD_MASK(movq,%mm1,%mm2)
movl 8(%esp), %ebx /* source pointer */
movl 16(%esp), %edx /* number of pixels to copy */
movl 12(%esp), %ecx /* destination pointer */
testl %edx, %edx
- je .L20 /* Bail if there's nothing to do. */
+ jle .L20 /* Bail if there's nothing to do. */
movl %ebx, %eax
*/
.globl _generic_read_RGBA_span_BGRA8888_REV_SSE
+#ifndef USE_DRICORE
+.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE
+#endif
.type _generic_read_RGBA_span_BGRA8888_REV_SSE, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE:
pushl %esi
#ifdef USE_INNER_EMMS
emms
#endif
- movq mask, %mm1
- movq mask+16, %mm2
+
+ LOAD_MASK(movq,%mm1,%mm2)
movl 16(%esp), %ebx /* source pointer */
movl 24(%esp), %edx /* number of pixels to copy */
movl 20(%esp), %ecx /* destination pointer */
+ testl %edx, %edx
+ jle .L35 /* Bail if there's nothing to do. */
+
movl %esp, %ebp
subl $16, %esp
andl $0xfffffff0, %esp
.text
.globl _generic_read_RGBA_span_BGRA8888_REV_SSE2
+#ifndef USE_DRICORE
+.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE2
+#endif
.type _generic_read_RGBA_span_BGRA8888_REV_SSE2, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE2:
pushl %esi
pushl %ebx
- movdqa mask, %xmm1
- movdqa mask+16, %xmm2
+ LOAD_MASK(movdqu,%xmm1,%xmm2)
movl 12(%esp), %ebx /* source pointer */
movl 20(%esp), %edx /* number of pixels to copy */
movl %ebx, %eax
movl %edx, %esi
+ testl %edx, %edx
+ jle .L46 /* Bail if there's nothing to do. */
+
/* If the source pointer isn't a multiple of 16 we have to process
* a few pixels the "slow" way to get the address aligned for
* the SSE fetch intsructions.
je .L47
movq (%ebx), %xmm0
-
+ addl $8, %ebx
+
movdqa %xmm0, %xmm3
movdqa %xmm0, %xmm4
andps %xmm1, %xmm0
orps %xmm3, %xmm0
movq %xmm0, (%ecx)
+ addl $8, %ecx
.L47:
testl $1, %edx
popl %esi
ret
.size _generic_read_RGBA_span_BGRA8888_REV_SSE2, .-_generic_read_RGBA_span_BGRA8888_REV_SSE2
+
+
+
+#define MASK_565_L 0x07e0f800
+#define MASK_565_H 0x0000001f
+/* Setting SCALE_ADJUST to 5 gives a perfect match with the
+ * classic C implementation in Mesa. Setting SCALE_ADJUST
+ * to 0 is slightly faster but at a small cost to accuracy.
+ */
+#define SCALE_ADJUST 5
+#if SCALE_ADJUST == 5
+#define PRESCALE_L 0x00100001
+#define PRESCALE_H 0x00000200
+#define SCALE_L 0x40C620E8
+#define SCALE_H 0x0000839d
+#elif SCALE_ADJUST == 0
+#define PRESCALE_L 0x00200001
+#define PRESCALE_H 0x00000800
+#define SCALE_L 0x01040108
+#define SCALE_H 0x00000108
+#else
+#error SCALE_ADJUST must either be 5 or 0.
+#endif
+#define ALPHA_L 0x00000000
+#define ALPHA_H 0x00ff0000
+
+/**
+ * MMX optimized version of the RGB565 to RGBA copy routine.
+ */
+
+ .text
+ .globl _generic_read_RGBA_span_RGB565_MMX
+#ifndef USE_DRICORE
+ .hidden _generic_read_RGBA_span_RGB565_MMX
+#endif
+ .type _generic_read_RGBA_span_RGB565_MMX, @function
+
+_generic_read_RGBA_span_RGB565_MMX:
+
+#ifdef USE_INNER_EMMS
+ emms
+#endif
+
+ movl 4(%esp), %eax /* source pointer */
+ movl 8(%esp), %edx /* destination pointer */
+ movl 12(%esp), %ecx /* number of pixels to copy */
+
+ pushl $MASK_565_H
+ pushl $MASK_565_L
+ movq (%esp), %mm5
+ pushl $PRESCALE_H
+ pushl $PRESCALE_L
+ movq (%esp), %mm6
+ pushl $SCALE_H
+ pushl $SCALE_L
+ movq (%esp), %mm7
+ pushl $ALPHA_H
+ pushl $ALPHA_L
+ movq (%esp), %mm3
+ addl $32,%esp
+
+ sarl $2, %ecx
+ jl .L01 /* Bail early if the count is negative. */
+ jmp .L02
+
+.L03:
+ /* Fetch 4 RGB565 pixels into %mm4. Distribute the first and
+ * second pixels into the four words of %mm0 and %mm2.
+ */
+
+ movq (%eax), %mm4
+ addl $8, %eax
+
+ pshufw $0x00, %mm4, %mm0
+ pshufw $0x55, %mm4, %mm2
+
+
+ /* Mask the pixels so that each word of each register contains only
+ * one color component.
+ */
+
+ pand %mm5, %mm0
+ pand %mm5, %mm2
+
+
+ /* Adjust the component values so that they are as small as possible,
+ * but large enough so that we can multiply them by an unsigned 16-bit
+ * number and get a value as large as 0x00ff0000.
+ */
+
+ pmullw %mm6, %mm0
+ pmullw %mm6, %mm2
+#if SCALE_ADJUST > 0
+ psrlw $SCALE_ADJUST, %mm0
+ psrlw $SCALE_ADJUST, %mm2
+#endif
+
+ /* Scale the input component values to be on the range
+ * [0, 0x00ff0000]. This it the real magic of the whole routine.
+ */
+
+ pmulhuw %mm7, %mm0
+ pmulhuw %mm7, %mm2
+
+
+ /* Always set the alpha value to 0xff.
+ */
+
+ por %mm3, %mm0
+ por %mm3, %mm2
+
+
+ /* Pack the 16-bit values to 8-bit values and store the converted
+ * pixel data.
+ */
+
+ packuswb %mm2, %mm0
+ movq %mm0, (%edx)
+ addl $8, %edx
+
+ pshufw $0xaa, %mm4, %mm0
+ pshufw $0xff, %mm4, %mm2
+
+ pand %mm5, %mm0
+ pand %mm5, %mm2
+ pmullw %mm6, %mm0
+ pmullw %mm6, %mm2
+#if SCALE_ADJUST > 0
+ psrlw $SCALE_ADJUST, %mm0
+ psrlw $SCALE_ADJUST, %mm2
+#endif
+ pmulhuw %mm7, %mm0
+ pmulhuw %mm7, %mm2
+
+ por %mm3, %mm0
+ por %mm3, %mm2
+
+ packuswb %mm2, %mm0
+
+ movq %mm0, (%edx)
+ addl $8, %edx
+
+ subl $1, %ecx
+.L02:
+ jne .L03
+
+
+ /* At this point there can be at most 3 pixels left to process. If
+ * there is either 2 or 3 left, process 2.
+ */
+
+ movl 12(%esp), %ecx
+ testl $0x02, %ecx
+ je .L04
+
+ movd (%eax), %mm4
+ addl $4, %eax
+
+ pshufw $0x00, %mm4, %mm0
+ pshufw $0x55, %mm4, %mm2
+
+ pand %mm5, %mm0
+ pand %mm5, %mm2
+ pmullw %mm6, %mm0
+ pmullw %mm6, %mm2
+#if SCALE_ADJUST > 0
+ psrlw $SCALE_ADJUST, %mm0
+ psrlw $SCALE_ADJUST, %mm2
+#endif
+ pmulhuw %mm7, %mm0
+ pmulhuw %mm7, %mm2
+
+ por %mm3, %mm0
+ por %mm3, %mm2
+
+ packuswb %mm2, %mm0
+
+ movq %mm0, (%edx)
+ addl $8, %edx
+
+.L04:
+ /* At this point there can be at most 1 pixel left to process.
+ * Process it if needed.
+ */
+
+ testl $0x01, %ecx
+ je .L01
+
+ movzwl (%eax), %ecx
+ movd %ecx, %mm4
+
+ pshufw $0x00, %mm4, %mm0
+
+ pand %mm5, %mm0
+ pmullw %mm6, %mm0
+#if SCALE_ADJUST > 0
+ psrlw $SCALE_ADJUST, %mm0
+#endif
+ pmulhuw %mm7, %mm0
+
+ por %mm3, %mm0
+
+ packuswb %mm0, %mm0
+
+ movd %mm0, (%edx)
+
+.L01:
+#ifdef USE_INNER_EMMS
+ emms
+#endif
+ ret
+#endif /* !defined(__DJGPP__) && !defined(__MINGW32__) && !defined(__APPLE__) */
+
+#if defined (__ELF__) && defined (__linux__)
+ .section .note.GNU-stack,"",%progbits
+#endif