/*
 * (C) Copyright IBM Corporation 2004
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
 * IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

/**
 * \file read_rgba_span_x86.S
 * Optimized routines to transfer pixel data from the framebuffer to a
 * buffer in main memory.
 *
 * \author Ian Romanick <idr@us.ibm.com>
 */

.file "read_rgba_span_x86.S"
#if !defined(__DJGPP__) && !defined(__MINGW32__) /* this one cries for assyntax.h */
/* Kevin F. Quinn 2nd July 2006
 * Replace data segment constants with text-segment instructions
        .section .rodata
        .align 16
        .type mask, @object
        .size mask, 32
mask:
        .long 0xff00ff00
        .long 0xff00ff00
        .long 0xff00ff00
        .long 0xff00ff00
        .long 0x00ff0000
        .long 0x00ff0000
        .long 0x00ff0000
        .long 0x00ff0000
 */
#define LOAD_MASK(mvins,m1,m2) \
        pushl $0xff00ff00 ;\
        pushl $0xff00ff00 ;\
        pushl $0xff00ff00 ;\
        pushl $0xff00ff00 ;\
        mvins (%esp), m1 ;\
        pushl $0x00ff0000 ;\
        pushl $0x00ff0000 ;\
        pushl $0x00ff0000 ;\
        pushl $0x00ff0000 ;\
        mvins (%esp), m2 ;\
        addl $32, %esp
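
/* The first mask (0xff00ff00) keeps the alpha and green bytes, which do not
 * move; the second (0x00ff0000) isolates the red and blue bytes, which trade
 * places.  In C terms, the swizzle performed by the routines below is
 * roughly the following (an illustrative sketch only, not part of the
 * build):
 *
 *     uint32_t x = *(const uint32_t *) src;          // BGRA bytes, 0xAARRGGBB
 *     *(uint32_t *) dst = (x & 0xff00ff00)           // A and G stay put
 *                       | ((x & 0x00ff0000) >> 16)   // R moves down
 *                       | ((x << 16) & 0x00ff0000);  // B moves up
 */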

/* I implemented these as macros because they appear in quite a few places,
 * and I've tweaked them a number of times.  I got tired of changing every
 * place they appear. :)
 */

#define DO_ONE_PIXEL() \
        movl (%ebx), %eax ; \
        addl $4, %ebx ; \
        bswap %eax /* ARGB -> BGRA */ ; \
        rorl $8, %eax /* BGRA -> ABGR */ ; \
        movl %eax, (%ecx) /* ABGR -> R, G, B, A */ ; \
        addl $4, %ecx

#define DO_ONE_LAST_PIXEL() \
        movl (%ebx), %eax ; \
        bswap %eax /* ARGB -> BGRA */ ; \
        rorl $8, %eax /* BGRA -> ABGR */ ; \
        movl %eax, (%ecx) /* ABGR -> R, G, B, A */ ; \

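/* The scalar macros above do the same R/B swap with two byte rotations.
 * In C terms (again just an illustrative sketch):
 *
 *     uint32_t x = *(const uint32_t *) src;      // 0xAARRGGBB
 *     x = __builtin_bswap32(x);                  // 0xBBGGRRAA
 *     *(uint32_t *) dst = (x >> 8) | (x << 24);  // 0xAABBGGRR
 */
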
/**
 * MMX optimized version of the BGRA8888_REV to RGBA copy routine.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
 */

.globl _generic_read_RGBA_span_BGRA8888_REV_MMX
.hidden _generic_read_RGBA_span_BGRA8888_REV_MMX
.type _generic_read_RGBA_span_BGRA8888_REV_MMX, @function
_generic_read_RGBA_span_BGRA8888_REV_MMX:
        pushl %ebx

#ifdef USE_INNER_EMMS
        emms
#endif
/* Kevin F. Quinn 2nd July 2006
 * Replace data segment constants with text-segment instructions
        movq mask, %mm1
        movq mask+16, %mm2
 */
        LOAD_MASK(movq,%mm1,%mm2)

        movl 8(%esp), %ebx   /* source pointer */
        movl 16(%esp), %edx  /* number of pixels to copy */
        movl 12(%esp), %ecx  /* destination pointer */

        testl %edx, %edx
        jle .L20             /* Bail if there's nothing to do. */

        movl %ebx, %eax

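        /* Convert one pixel with the scalar code first if that is what it
         * takes to bring the (4-byte aligned) source pointer up to an
         * 8-byte boundary, so that the movq loads in the loop below are
         * aligned.
         */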
        negl %eax
        sarl $2, %eax
        andl $1, %eax
        je .L17

        subl %eax, %edx
        DO_ONE_PIXEL()
.L17:

        /* Would it be faster to unroll this loop once and process 4 pixels
         * per pass, instead of just two?
         */

        movl %edx, %eax
        shrl %eax
        jmp .L18
.L19:
        movq (%ebx), %mm0
        addl $8, %ebx

        /* These 9 instructions do what PSHUFB (if there were such an
         * instruction) could do in 1. :(
         */

        movq %mm0, %mm3
        movq %mm0, %mm4

        pand %mm2, %mm3
        psllq $16, %mm4
        psrlq $16, %mm3
        pand %mm2, %mm4

        pand %mm1, %mm0
        por %mm4, %mm3
        por %mm3, %mm0

        movq %mm0, (%ecx)
        addl $8, %ecx
        subl $1, %eax
.L18:
        jne .L19

#ifdef USE_INNER_EMMS
        emms
#endif

        /* At this point there are either 1 or 0 pixels remaining to be
         * converted.  Convert the last pixel, if needed.
         */

        testl $1, %edx
        je .L20

        DO_ONE_LAST_PIXEL()

.L20:
        popl %ebx
        ret
.size _generic_read_RGBA_span_BGRA8888_REV_MMX, .-_generic_read_RGBA_span_BGRA8888_REV_MMX


/**
 * SSE optimized version of the BGRA8888_REV to RGBA copy routine.  SSE
 * instructions are only actually used to read data from the framebuffer.
 * In practice, the speed-up is pretty small.
 *
 * \todo
 * Do some more testing and determine if there's any reason to have this
 * function in addition to the MMX version.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
 */

.globl _generic_read_RGBA_span_BGRA8888_REV_SSE
.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE
.type _generic_read_RGBA_span_BGRA8888_REV_SSE, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE:
        pushl %esi
        pushl %ebx
        pushl %ebp

#ifdef USE_INNER_EMMS
        emms
#endif
/* Kevin F. Quinn 2nd July 2006
 * Replace data segment constants with text-segment instructions
        movq mask, %mm1
        movq mask+16, %mm2
 */
        LOAD_MASK(movq,%mm1,%mm2)

        movl 16(%esp), %ebx  /* source pointer */
        movl 24(%esp), %edx  /* number of pixels to copy */
        movl 20(%esp), %ecx  /* destination pointer */

        testl %edx, %edx
        jle .L35             /* Bail if there's nothing to do. */

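        /* Carve out a 16-byte aligned scratch area on the stack; the main
         * loop below spills each movaps fetch there so the data can be
         * reloaded into MMX registers.  %ebp keeps the original stack
         * pointer so it can be restored afterwards.
         */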
        movl %esp, %ebp
        subl $16, %esp
        andl $0xfffffff0, %esp

        movl %ebx, %eax
        movl %edx, %esi

        negl %eax
        andl $15, %eax
        sarl $2, %eax
        cmpl %edx, %eax
        cmovle %eax, %esi

        subl %esi, %edx

        testl $1, %esi
        je .L32

        DO_ONE_PIXEL()
.L32:

        testl $2, %esi
        je .L31

        movq (%ebx), %mm0
        addl $8, %ebx

        movq %mm0, %mm3
        movq %mm0, %mm4

        pand %mm2, %mm3
        psllq $16, %mm4
        psrlq $16, %mm3
        pand %mm2, %mm4

        pand %mm1, %mm0
        por %mm4, %mm3
        por %mm3, %mm0

        movq %mm0, (%ecx)
        addl $8, %ecx
.L31:

        movl %edx, %eax
        shrl $2, %eax
        jmp .L33
.L34:
        movaps (%ebx), %xmm0
        addl $16, %ebx

        /* This would be so much better if we could just move directly from
         * an SSE register to an MMX register.  Unfortunately, that
         * functionality wasn't introduced until SSE2 with the MOVDQ2Q
         * instruction.
         */

        movaps %xmm0, (%esp)
        movq (%esp), %mm0
        movq 8(%esp), %mm5

        movq %mm0, %mm3
        movq %mm0, %mm4
        movq %mm5, %mm6
        movq %mm5, %mm7

        pand %mm2, %mm3
        pand %mm2, %mm6

        psllq $16, %mm4
        psllq $16, %mm7

        psrlq $16, %mm3
        psrlq $16, %mm6

        pand %mm2, %mm4
        pand %mm2, %mm7

        pand %mm1, %mm0
        pand %mm1, %mm5

        por %mm4, %mm3
        por %mm7, %mm6

        por %mm3, %mm0
        por %mm6, %mm5

        movq %mm0, (%ecx)
        movq %mm5, 8(%ecx)
        addl $16, %ecx

        subl $1, %eax
.L33:
        jne .L34

#ifdef USE_INNER_EMMS
        emms
#endif
        movl %ebp, %esp

        /* At this point there are between 0 and 3 pixels remaining to be
         * converted.
         */

        testl $2, %edx
        je .L36

        movq (%ebx), %mm0
        addl $8, %ebx

        movq %mm0, %mm3
        movq %mm0, %mm4

        pand %mm2, %mm3
        psllq $16, %mm4
        psrlq $16, %mm3
        pand %mm2, %mm4

        pand %mm1, %mm0
        por %mm4, %mm3
        por %mm3, %mm0

        movq %mm0, (%ecx)
        addl $8, %ecx
.L36:

        testl $1, %edx
        je .L35

        DO_ONE_LAST_PIXEL()
.L35:
        popl %ebp
        popl %ebx
        popl %esi
        ret
.size _generic_read_RGBA_span_BGRA8888_REV_SSE, .-_generic_read_RGBA_span_BGRA8888_REV_SSE


/**
 * SSE2 optimized version of the BGRA8888_REV to RGBA copy routine.
 */

.text
.globl _generic_read_RGBA_span_BGRA8888_REV_SSE2
.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE2
.type _generic_read_RGBA_span_BGRA8888_REV_SSE2, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE2:
        pushl %esi
        pushl %ebx

/* Kevin F. Quinn 2nd July 2006
 * Replace data segment constants with text-segment instructions
        movdqa mask, %xmm1
        movdqa mask+16, %xmm2
 */
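        /* The masks are loaded with movdqu rather than movdqa because %esp
         * is not guaranteed to be 16-byte aligned at this point.
         */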
        LOAD_MASK(movdqu,%xmm1,%xmm2)

        movl 12(%esp), %ebx  /* source pointer */
        movl 20(%esp), %edx  /* number of pixels to copy */
        movl 16(%esp), %ecx  /* destination pointer */

        movl %ebx, %eax
        movl %edx, %esi

        testl %edx, %edx
        jle .L46             /* Bail if there's nothing to do. */

        /* If the source pointer isn't a multiple of 16 we have to process
         * a few pixels the "slow" way to get the address aligned for
         * the SSE fetch instructions.
         */

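        /* %eax = ((-src) & 15) / 4: the number of leading pixels (0 to 3)
         * that must be handled separately before the source pointer is
         * 16-byte aligned.  %esi is then clamped to the actual pixel count.
         */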
        negl %eax
        andl $15, %eax
        sarl $2, %eax

        cmpl %edx, %eax
        cmovbe %eax, %esi
        subl %esi, %edx

        testl $1, %esi
        je .L41

        DO_ONE_PIXEL()
.L41:
        testl $2, %esi
        je .L40

        movq (%ebx), %xmm0
        addl $8, %ebx

        movdqa %xmm0, %xmm3
        movdqa %xmm0, %xmm4
        andps %xmm1, %xmm0

        andps %xmm2, %xmm3
        pslldq $2, %xmm4
        psrldq $2, %xmm3
        andps %xmm2, %xmm4

        orps %xmm4, %xmm3
        orps %xmm3, %xmm0

        movq %xmm0, (%ecx)
        addl $8, %ecx
.L40:

        /* Would it be worth having a specialized version of this loop for
         * the case where the destination is 16-byte aligned?  That version
         * would be identical except that it could use movdqa instead of
         * movdqu.
         */

        movl %edx, %eax
        shrl $2, %eax
        jmp .L42
.L43:
        movdqa (%ebx), %xmm0
        addl $16, %ebx

        movdqa %xmm0, %xmm3
        movdqa %xmm0, %xmm4
        andps %xmm1, %xmm0

        andps %xmm2, %xmm3
        pslldq $2, %xmm4
        psrldq $2, %xmm3
        andps %xmm2, %xmm4

        orps %xmm4, %xmm3
        orps %xmm3, %xmm0

        movdqu %xmm0, (%ecx)
        addl $16, %ecx
        subl $1, %eax
.L42:
        jne .L43


        /* There may be up to 3 pixels remaining to be copied.  Take care
         * of them now.  We do the 2 pixel case first because the data
         * will be aligned.
         */

        testl $2, %edx
        je .L47

        movq (%ebx), %xmm0

        movdqa %xmm0, %xmm3
        movdqa %xmm0, %xmm4
        andps %xmm1, %xmm0

        andps %xmm2, %xmm3
        pslldq $2, %xmm4
        psrldq $2, %xmm3
        andps %xmm2, %xmm4

        orps %xmm4, %xmm3
        orps %xmm3, %xmm0

        movq %xmm0, (%ecx)
.L47:

        testl $1, %edx
        je .L46

        DO_ONE_LAST_PIXEL()
.L46:

        popl %ebx
        popl %esi
        ret
.size _generic_read_RGBA_span_BGRA8888_REV_SSE2, .-_generic_read_RGBA_span_BGRA8888_REV_SSE2


/* Kevin F. Quinn 2nd July 2006
 * Replace data segment constants with text-segment instructions
 */
#if 0
        .section .rodata

        .align 16
mask_565:
        .word 0xf800
        .word 0x07e0
        .word 0x001f
        .word 0x0000

/* Setting SCALE_ADJUST to 5 gives a perfect match with the classic C
 * implementation in Mesa.  Setting SCALE_ADJUST to 0 is slightly faster but
 * at a small cost to accuracy.
 */

#define SCALE_ADJUST 5
#if SCALE_ADJUST == 5
prescale:
        .word 0x0001
        .word 0x0010
        .word 0x0200
        .word 0x0000

scale:
        .word 0x20e8 /* (0x00ff0000 / 0x000007c0) + 1 */
        .word 0x40c5 /* (0x00ff0000 / 0x000003f0) + 1 */
        .word 0x839d /* (0x00ff0000 / 0x000001f0) + 1 */
        .word 0x0000
#elif SCALE_ADJUST == 0
prescale:
        .word 0x0001
        .word 0x0020
        .word 0x0800
        .word 0x0000

scale:
        .word 0x0108 /* (0x00ff0000 / 0x0000f800) + 1 */
        .word 0x0104 /* (0x00ff0000 / 0x0000fc00) + 1 */
        .word 0x0108 /* (0x00ff0000 / 0x0000f800) + 1 */
        .word 0x0000
#else
#error SCALE_ADJUST must either be 5 or 0.
#endif


alpha:  .long 0x00000000
        .long 0x00ff0000
#endif

#define MASK_565_L      0x07e0f800
#define MASK_565_H      0x0000001f
#define SCALE_ADJUST    5
#if SCALE_ADJUST == 5
#define PRESCALE_L      0x00100001
#define PRESCALE_H      0x00000200
#define SCALE_L         0x40C620E8
#define SCALE_H         0x0000839d
#elif SCALE_ADJUST == 0
#define PRESCALE_L      0x00200001
#define PRESCALE_H      0x00000800
#define SCALE_L         0x01040108
#define SCALE_H         0x00000108
#else
#error SCALE_ADJUST must either be 5 or 0.
#endif
#define ALPHA_L         0x00000000
#define ALPHA_H         0x00ff0000

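/* In C terms, the conversion below does roughly the following for each
 * RGB565 pixel (an illustrative sketch only, not part of the build; the
 * SCALE_ADJUST == 5 variant is shown, with src a const uint16_t pointer
 * and dst a uint8_t RGBA pointer):
 *
 *     uint16_t p = *src;                          // rrrrrggggggbbbbb
 *     uint32_t r = ((p & 0xf800) * 0x0001) >> 5;  // red   -> [0, 0x07c0]
 *     uint32_t g = ((p & 0x07e0) * 0x0010) >> 5;  // green -> [0, 0x03f0]
 *     uint32_t b = ((p & 0x001f) * 0x0200) >> 5;  // blue  -> [0, 0x01f0]
 *     dst[0] = (r * 0x20e8) >> 16;                // ~ red   * 255 / 31
 *     dst[1] = (g * 0x40c5) >> 16;                // ~ green * 255 / 63
 *     dst[2] = (b * 0x839d) >> 16;                // ~ blue  * 255 / 31
 *     dst[3] = 0xff;
 *
 * The MMX code below does this four words at a time: pmullw/psrlw perform
 * the prescale step and pmulhuw performs the final scale.
 */
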
/**
 * MMX optimized version of the RGB565 to RGBA copy routine.
 */

.text
.globl _generic_read_RGBA_span_RGB565_MMX
.hidden _generic_read_RGBA_span_RGB565_MMX
.type _generic_read_RGBA_span_RGB565_MMX, @function

_generic_read_RGBA_span_RGB565_MMX:

#ifdef USE_INNER_EMMS
        emms
#endif

        movl 4(%esp), %eax   /* source pointer */
        movl 8(%esp), %edx   /* destination pointer */
        movl 12(%esp), %ecx  /* number of pixels to copy */

/* Kevin F. Quinn 2nd July 2006
 * Replace data segment constants with text-segment instructions
        movq mask_565, %mm5
        movq prescale, %mm6
        movq scale, %mm7
 */
        pushl $MASK_565_H
        pushl $MASK_565_L
        movq (%esp), %mm5
        pushl $PRESCALE_H
        pushl $PRESCALE_L
        movq (%esp), %mm6
        pushl $SCALE_H
        pushl $SCALE_L
        movq (%esp), %mm7
        pushl $ALPHA_H
        pushl $ALPHA_L
        movq (%esp), %mm3
        addl $32, %esp

        sarl $2, %ecx
        jle .L01             /* Bail early if there are fewer than four pixels. */
        jmp .L02

.L03:
        /* Fetch 4 RGB565 pixels into %mm4.  Distribute the first and
         * second pixels into the four words of %mm0 and %mm2.
         */

        movq (%eax), %mm4
        addl $8, %eax

        pshufw $0x00, %mm4, %mm0
        pshufw $0x55, %mm4, %mm2


        /* Mask the pixels so that each word of each register contains only
         * one color component.
         */

        pand %mm5, %mm0
        pand %mm5, %mm2


        /* Adjust the component values so that they are as small as possible,
         * but large enough so that we can multiply them by an unsigned 16-bit
         * number and get a value as large as 0x00ff0000.
         */

        pmullw %mm6, %mm0
        pmullw %mm6, %mm2
#if SCALE_ADJUST > 0
        psrlw $SCALE_ADJUST, %mm0
        psrlw $SCALE_ADJUST, %mm2
#endif

        /* Scale the input component values to be in the range
         * [0, 0x00ff0000].  This is the real magic of the whole routine.
         */
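        /* For example, a maximal red component reaches this point as
         * 31 << 6 = 0x07c0, and (0x07c0 * 0x20e8) >> 16 = 0xff.
         */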

        pmulhuw %mm7, %mm0
        pmulhuw %mm7, %mm2


        /* Always set the alpha value to 0xff.
         */

/* Kevin F. Quinn 2nd July 2006
 * Replace data segment constants with text-segment instructions
        por alpha, %mm0
        por alpha, %mm2
 */
        por %mm3, %mm0
        por %mm3, %mm2


        /* Pack the 16-bit values to 8-bit values and store the converted
         * pixel data.
         */

        packuswb %mm2, %mm0
        movq %mm0, (%edx)
        addl $8, %edx



        pshufw $0xaa, %mm4, %mm0
        pshufw $0xff, %mm4, %mm2

        pand %mm5, %mm0
        pand %mm5, %mm2
        pmullw %mm6, %mm0
        pmullw %mm6, %mm2
#if SCALE_ADJUST > 0
        psrlw $SCALE_ADJUST, %mm0
        psrlw $SCALE_ADJUST, %mm2
#endif
        pmulhuw %mm7, %mm0
        pmulhuw %mm7, %mm2

/* Kevin F. Quinn 2nd July 2006
 * Replace data segment constants with text-segment instructions
        por alpha, %mm0
        por alpha, %mm2
 */
        por %mm3, %mm0
        por %mm3, %mm2

        packuswb %mm2, %mm0

        movq %mm0, (%edx)
        addl $8, %edx

        subl $1, %ecx
.L02:
        jne .L03


        /* At this point there can be at most 3 pixels left to process.  If
         * there are 2 or 3 left, process 2.
         */

        movl 12(%esp), %ecx
        testl $0x02, %ecx
        je .L04

        movd (%eax), %mm4
        addl $4, %eax

        pshufw $0x00, %mm4, %mm0
        pshufw $0x55, %mm4, %mm2

        pand %mm5, %mm0
        pand %mm5, %mm2
        pmullw %mm6, %mm0
        pmullw %mm6, %mm2
#if SCALE_ADJUST > 0
        psrlw $SCALE_ADJUST, %mm0
        psrlw $SCALE_ADJUST, %mm2
#endif
        pmulhuw %mm7, %mm0
        pmulhuw %mm7, %mm2

/* Kevin F. Quinn 2nd July 2006
 * Replace data segment constants with text-segment instructions
        por alpha, %mm0
        por alpha, %mm2
 */
        por %mm3, %mm0
        por %mm3, %mm2

        packuswb %mm2, %mm0

        movq %mm0, (%edx)
        addl $8, %edx

.L04:
        /* At this point there can be at most 1 pixel left to process.
         * Process it if needed.
         */

        testl $0x01, %ecx
        je .L01

        movzwl (%eax), %ecx
        movd %ecx, %mm4

        pshufw $0x00, %mm4, %mm0

        pand %mm5, %mm0
        pmullw %mm6, %mm0
#if SCALE_ADJUST > 0
        psrlw $SCALE_ADJUST, %mm0
#endif
        pmulhuw %mm7, %mm0

/* Kevin F. Quinn 2nd July 2006
 * Replace data segment constants with text-segment instructions
        por alpha, %mm0
 */
        por %mm3, %mm0

        packuswb %mm0, %mm0

        movd %mm0, (%edx)

.L01:
#ifdef USE_INNER_EMMS
        emms
#endif
        ret
#endif /* !defined(__DJGPP__) && !defined(__MINGW32__) */

#if defined (__ELF__) && defined (__linux__)
.section .note.GNU-stack,"",%progbits
#endif