*
* \author Ian Romanick <idr@us.ibm.com>
*/
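+/* Control-flow Enforcement Technology (CET) support: use _CET_ENDBR from
+ * <cet.h> when that header is available; otherwise define it as empty. */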
+#ifdef HAVE_CET_H
+#include <cet.h>
+#else
+#define _CET_ENDBR
+#endif
.file "read_rgba_span_x86.S"
-#if !defined(__DJGPP__) && !defined(__MINGW32__) /* this one cries for assyntax.h */
- .section .rodata
- .align 16
- .type mask, @object
- .size mask, 32
-mask:
- .long 0xff00ff00
- .long 0xff00ff00
- .long 0xff00ff00
- .long 0xff00ff00
- .long 0x00ff0000
- .long 0x00ff0000
- .long 0x00ff0000
- .long 0x00ff0000
-
-
-/* I implemented these as macros because the appear in quite a few places,
+#if !defined(__MINGW32__) && !defined(__APPLE__) /* this one cries for assyntax.h */
+/* Kevin F. Quinn 2nd July 2006
+ * Replaced data segment constants with text-segment instructions.
+ *
+ * LOAD_MASK(mvins, m1, m2) builds both pixel masks on the stack from
+ * immediate operands and loads them into registers, so the masks are no
+ * longer read from a .rodata table:
+ *   mvins - move instruction used for the two loads (the callers in this
+ *           file pass movq for MMX registers and movdqu for XMM registers)
+ *   m1    - destination register for the 0xff00ff00 mask
+ *   m2    - destination register for the 0x00ff0000 mask
+ * Each mask is pushed as four dwords (16 bytes), which is enough for a
+ * full XMM load. %esp is lowered by 32 bytes while the macro
+ * runs and is restored by the final addl, which also modifies EFLAGS.
+ */
+#define LOAD_MASK(mvins,m1,m2) \
+ pushl $0xff00ff00 ;\
+ pushl $0xff00ff00 ;\
+ pushl $0xff00ff00 ;\
+ pushl $0xff00ff00 ;\
+ mvins (%esp), m1 /* m1 = 0xff00ff00 in every dword loaded */ ;\
+ pushl $0x00ff0000 ;\
+ pushl $0x00ff0000 ;\
+ pushl $0x00ff0000 ;\
+ pushl $0x00ff0000 ;\
+ mvins (%esp), m2 /* m2 = 0x00ff0000 in every dword loaded */ ;\
+ addl $32, %esp /* drop the eight dwords pushed above */
+
+/* I implemented these as macros because they appear in several places,
* and I've tweaked them a number of times. I got tired of changing every
* place they appear. :)
*/
movl (%ebx), %eax ; \
bswap %eax /* ARGB -> BGRA */ ; \
rorl $8, %eax /* BGRA -> ABGR */ ; \
- movl %eax, (%ecx) /* ABGR -> R, G, B, A */ ; \
+ movl %eax, (%ecx) /* ABGR -> R, G, B, A */ ;
/**
.hidden _generic_read_RGBA_span_BGRA8888_REV_MMX
.type _generic_read_RGBA_span_BGRA8888_REV_MMX, @function
_generic_read_RGBA_span_BGRA8888_REV_MMX:
+ _CET_ENDBR
pushl %ebx
#ifdef USE_INNER_EMMS
emms
#endif
- movq mask, %mm1
- movq mask+16, %mm2
+ LOAD_MASK(movq,%mm1,%mm2)
movl 8(%esp), %ebx /* source pointer */
movl 16(%esp), %edx /* number of pixels to copy */
.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE
.type _generic_read_RGBA_span_BGRA8888_REV_SSE, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE:
+ _CET_ENDBR
pushl %esi
pushl %ebx
pushl %ebp
#ifdef USE_INNER_EMMS
emms
#endif
- movq mask, %mm1
- movq mask+16, %mm2
+
+ LOAD_MASK(movq,%mm1,%mm2)
movl 16(%esp), %ebx /* source pointer */
movl 24(%esp), %edx /* number of pixels to copy */
.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE2
.type _generic_read_RGBA_span_BGRA8888_REV_SSE2, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE2:
+ _CET_ENDBR
pushl %esi
pushl %ebx
- movdqa mask, %xmm1
- movdqa mask+16, %xmm2
+ LOAD_MASK(movdqu,%xmm1,%xmm2)
movl 12(%esp), %ebx /* source pointer */
movl 20(%esp), %edx /* number of pixels to copy */
je .L47
movq (%ebx), %xmm0
-
+ addl $8, %ebx
+
movdqa %xmm0, %xmm3
movdqa %xmm0, %xmm4
andps %xmm1, %xmm0
orps %xmm3, %xmm0
movq %xmm0, (%ecx)
+ addl $8, %ecx
.L47:
testl $1, %edx
- .section .rodata
-
- .align 16
-mask_565:
- .word 0xf800
- .word 0x07e0
- .word 0x001f
- .word 0x0000
-
-/* Setting SCALE_ADJUST to 5 gives a perfect match with the classic C
- * implementation in Mesa. Setting SCALE_ADJUST to 0 is slightly faster but
- * at a small cost to accuracy.
+#define MASK_565_L 0x07e0f800 /* low dword of the 64-bit mask: 16-bit words 0xf800, 0x07e0 */
+#define MASK_565_H 0x0000001f /* high dword of the 64-bit mask: 16-bit words 0x001f, 0x0000 */
+/* Setting SCALE_ADJUST to 5 gives a perfect match with the
+ * classic C implementation in Mesa. Setting SCALE_ADJUST
+ * to 0 is slightly faster but at a small cost to accuracy.
*/
-
#define SCALE_ADJUST 5
#if SCALE_ADJUST == 5
-prescale:
- .word 0x0001
- .word 0x0010
- .word 0x0200
- .word 0x0000
-
-scale:
- .word 0x20e8 /* (0x00ff0000 / 0x000007c0) + 1 */
- .word 0x40c5 /* (0x00ff0000 / 0x000003f0) + 1 */
- .word 0x839d /* (0x00ff0000 / 0x000001f0) + 1 */
- .word 0x0000
+#define PRESCALE_L 0x00100001 /* low dword: 16-bit words 0x0001, 0x0010 */
+#define PRESCALE_H 0x00000200 /* high dword: 16-bit words 0x0200, 0x0000 */
+#define SCALE_L 0x40C620E8 /* low dword: 16-bit words 0x20e8, 0x40c6. NOTE(review): the word table this replaces used 0x40c5 as its second word -- confirm 0x40c6 is intended */
+#define SCALE_H 0x0000839d /* high dword: 16-bit words 0x839d, 0x0000 */
#elif SCALE_ADJUST == 0
-prescale:
- .word 0x0001
- .word 0x0020
- .word 0x0800
- .word 0x0000
-
-scale:
- .word 0x0108 /* (0x00ff0000 / 0x0000f800) + 1 */
- .word 0x0104 /* (0x00ff0000 / 0x0000fc00) + 1 */
- .word 0x0108 /* (0x00ff0000 / 0x0000f800) + 1 */
- .word 0x0000
+#define PRESCALE_L 0x00200001 /* low dword: 16-bit words 0x0001, 0x0020 */
+#define PRESCALE_H 0x00000800 /* high dword: 16-bit words 0x0800, 0x0000 */
+#define SCALE_L 0x01040108 /* low dword: 16-bit words 0x0108, 0x0104 */
+#define SCALE_H 0x00000108 /* high dword: 16-bit words 0x0108, 0x0000 */
#else
#error SCALE_ADJUST must either be 5 or 0.
#endif
-
-
-alpha: .long 0x00000000
- .long 0x00ff0000
+#define ALPHA_L 0x00000000 /* low dword of the 64-bit alpha constant */
+#define ALPHA_H 0x00ff0000 /* high dword: sets 0x00ff in the top 16-bit word; the routine ORs this constant into each pixel so alpha is always 0xff */
/**
* MMX optimized version of the RGB565 to RGBA copy routine.
.type _generic_read_RGBA_span_RGB565_MMX, @function
_generic_read_RGBA_span_RGB565_MMX:
-
+ _CET_ENDBR
#ifdef USE_INNER_EMMS
emms
#endif
movl 8(%esp), %edx /* destination pointer */
movl 12(%esp), %ecx /* number of pixels to copy */
- movq mask_565, %mm5
- movq prescale, %mm6
- movq scale, %mm7
+ pushl $MASK_565_H
+ pushl $MASK_565_L
+ movq (%esp), %mm5
+ pushl $PRESCALE_H
+ pushl $PRESCALE_L
+ movq (%esp), %mm6
+ pushl $SCALE_H
+ pushl $SCALE_L
+ movq (%esp), %mm7
+ pushl $ALPHA_H
+ pushl $ALPHA_L
+ movq (%esp), %mm3
+ addl $32,%esp
sarl $2, %ecx
- jle .L01 /* Bail early if the count is negative. */
+ jl .L01 /* Bail early if the count is negative. */
jmp .L02
.L03:
/* Always set the alpha value to 0xff.
*/
- por alpha, %mm0
- por alpha, %mm2
+ por %mm3, %mm0
+ por %mm3, %mm2
/* Pack the 16-bit values to 8-bit values and store the converted
movq %mm0, (%edx)
addl $8, %edx
-
-
pshufw $0xaa, %mm4, %mm0
pshufw $0xff, %mm4, %mm2
pmulhuw %mm7, %mm0
pmulhuw %mm7, %mm2
- por alpha, %mm0
- por alpha, %mm2
+ por %mm3, %mm0
+ por %mm3, %mm2
packuswb %mm2, %mm0
pmulhuw %mm7, %mm0
pmulhuw %mm7, %mm2
- por alpha, %mm0
- por alpha, %mm2
+ por %mm3, %mm0
+ por %mm3, %mm2
packuswb %mm2, %mm0
testl $0x01, %ecx
je .L01
- movzxw (%eax), %ecx
+ movzwl (%eax), %ecx
movd %ecx, %mm4
pshufw $0x00, %mm4, %mm0
#endif
pmulhuw %mm7, %mm0
- por alpha, %mm0
+ por %mm3, %mm0
packuswb %mm0, %mm0
emms
#endif
ret
-#endif /* !defined(__DJGPP__) && !defined(__MINGW32__) */
+#endif /* !defined(__MINGW32__) && !defined(__APPLE__) */
#if defined (__ELF__) && defined (__linux__)
.section .note.GNU-stack,"",%progbits