/*
 * (C) Copyright IBM Corporation 2004
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
 * IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

/**
 * \file read_rgba_span_x86.S
 * Optimized routines to transfer pixel data from the framebuffer to a
 * buffer in main memory.
 *
 * \author Ian Romanick <idr@us.ibm.com>
 */
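
/* All of the routines below use the cdecl calling convention and take the
 * same three stack arguments: a source pointer, a destination pointer, and
 * a pixel count (see the "source pointer" / "destination pointer" /
 * "number of pixels to copy" argument loads in each routine).  A matching C
 * prototype would look roughly like:
 *
 *     void _generic_read_RGBA_span_BGRA8888_REV_MMX(const unsigned char *src,
 *                                                    unsigned char *dst,
 *                                                    unsigned count);
 */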

        .file   "read_rgba_span_x86.S"
#if !defined(__DJGPP__) && !defined(__MINGW32__) /* this one cries for assyntax.h */
        .section .rodata
        .align  16
        .type   mask, @object
        .size   mask, 32
mask:
        .long   0xff00ff00
        .long   0xff00ff00
        .long   0xff00ff00
        .long   0xff00ff00
        .long   0x00ff0000
        .long   0x00ff0000
        .long   0x00ff0000
        .long   0x00ff0000
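
/* The first four dwords (0xff00ff00) keep the alpha and green bytes of a
 * BGRA8888_REV pixel in place; the second four (0x00ff0000) select the byte
 * lane that the red and blue components are exchanged through in the
 * MMX, SSE, and SSE2 loops below.
 */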


/* I implemented these as macros because they appear in quite a few places,
 * and I've tweaked them a number of times.  I got tired of changing every
 * place they appear. :)
 */

#define DO_ONE_PIXEL() \
        movl    (%ebx), %eax ; \
        addl    $4, %ebx ; \
        bswap   %eax          /* ARGB -> BGRA */ ; \
        rorl    $8, %eax      /* BGRA -> ABGR */ ; \
        movl    %eax, (%ecx)  /* ABGR -> R, G, B, A */ ; \
        addl    $4, %ecx

#define DO_ONE_LAST_PIXEL() \
        movl    (%ebx), %eax ; \
        bswap   %eax          /* ARGB -> BGRA */ ; \
        rorl    $8, %eax      /* BGRA -> ABGR */ ; \
        movl    %eax, (%ecx)  /* ABGR -> R, G, B, A */ ; \

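
/* In C terms, each of these macros converts one pixel roughly as:
 *
 *     out = (in & 0xff00ff00) | ((in >> 16) & 0xff) | ((in & 0xff) << 16);
 *
 * i.e. the red and blue bytes trade places while green and alpha stay put.
 */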

/**
 * MMX optimized version of the BGRA8888_REV to RGBA copy routine.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
 */

        .globl  _generic_read_RGBA_span_BGRA8888_REV_MMX
        .type   _generic_read_RGBA_span_BGRA8888_REV_MMX, @function
_generic_read_RGBA_span_BGRA8888_REV_MMX:
        pushl   %ebx

#ifdef USE_INNER_EMMS
        emms
#endif
        movq    mask, %mm1
        movq    mask+16, %mm2

        movl    8(%esp), %ebx   /* source pointer */
        movl    16(%esp), %edx  /* number of pixels to copy */
        movl    12(%esp), %ecx  /* destination pointer */

        testl   %edx, %edx
        je      .L20            /* Bail if there's nothing to do. */

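        /* If the source pointer is 4-byte aligned but not 8-byte aligned,
         * convert a single pixel with the scalar macro first so that the
         * movq loads in the main loop start on an 8-byte boundary.
         */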
        movl    %ebx, %eax

        negl    %eax
        sarl    $2, %eax
        andl    $1, %eax
        je      .L17

        subl    %eax, %edx
        DO_ONE_PIXEL()
.L17:

        /* Would it be faster to unroll this loop once and process 4 pixels
         * per pass, instead of just two?
         */

        movl    %edx, %eax
        shrl    %eax
        jmp     .L18
.L19:
        movq    (%ebx), %mm0
        addl    $8, %ebx

        /* These 9 instructions do what PSHUFB (if there were such an
         * instruction) could do in 1. :(
         */
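
        /* For each 32-bit pixel the sequence computes, in C terms, roughly:
         *
         *     out = (p & 0xff00ff00)
         *         | ((p & 0x00ff0000) >> 16)
         *         | ((p << 16) & 0x00ff0000);
         */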

        movq    %mm0, %mm3
        movq    %mm0, %mm4

        pand    %mm2, %mm3
        psllq   $16, %mm4
        psrlq   $16, %mm3
        pand    %mm2, %mm4

        pand    %mm1, %mm0
        por     %mm4, %mm3
        por     %mm3, %mm0

        movq    %mm0, (%ecx)
        addl    $8, %ecx
        subl    $1, %eax
.L18:
        jne     .L19

#ifdef USE_INNER_EMMS
        emms
#endif

        /* At this point there are either 1 or 0 pixels remaining to be
         * converted.  Convert the last pixel, if needed.
         */

        testl   $1, %edx
        je      .L20

        DO_ONE_LAST_PIXEL()

.L20:
        popl    %ebx
        ret
        .size   _generic_read_RGBA_span_BGRA8888_REV_MMX, .-_generic_read_RGBA_span_BGRA8888_REV_MMX


/**
 * SSE optimized version of the BGRA8888_REV to RGBA copy routine.  SSE
 * instructions are only actually used to read data from the framebuffer.
 * In practice, the speed-up is pretty small.
 *
 * \todo
 * Do some more testing and determine if there's any reason to have this
 * function in addition to the MMX version.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
 */

        .globl  _generic_read_RGBA_span_BGRA8888_REV_SSE
        .type   _generic_read_RGBA_span_BGRA8888_REV_SSE, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE:
        pushl   %esi
        pushl   %ebx
        pushl   %ebp

#ifdef USE_INNER_EMMS
        emms
#endif
        movq    mask, %mm1
        movq    mask+16, %mm2

        movl    16(%esp), %ebx  /* source pointer */
        movl    24(%esp), %edx  /* number of pixels to copy */
        movl    20(%esp), %ecx  /* destination pointer */

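        /* Save the incoming stack pointer in %ebp and carve out a 16-byte
         * aligned scratch slot on the stack.  The inner loop spills %xmm0
         * there with movaps so the data can be reloaded into two MMX
         * registers.
         */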
        movl    %esp, %ebp
        subl    $16, %esp
        andl    $0xfffffff0, %esp

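        /* Work out how many leading pixels (0 to 3) must be converted before
         * the source pointer is 16-byte aligned for movaps, clamp that count
         * to the total pixel count, and subtract it from the number left for
         * the main loop.
         */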
        movl    %ebx, %eax
        movl    %edx, %esi

        negl    %eax
        andl    $15, %eax
        sarl    $2, %eax
        cmpl    %edx, %eax
        cmovle  %eax, %esi

        subl    %esi, %edx

        testl   $1, %esi
        je      .L32

        DO_ONE_PIXEL()
.L32:

        testl   $2, %esi
        je      .L31

        movq    (%ebx), %mm0
        addl    $8, %ebx

        movq    %mm0, %mm3
        movq    %mm0, %mm4

        pand    %mm2, %mm3
        psllq   $16, %mm4
        psrlq   $16, %mm3
        pand    %mm2, %mm4

        pand    %mm1, %mm0
        por     %mm4, %mm3
        por     %mm3, %mm0

        movq    %mm0, (%ecx)
        addl    $8, %ecx
.L31:

        movl    %edx, %eax
        shrl    $2, %eax
        jmp     .L33
.L34:
        movaps  (%ebx), %xmm0
        addl    $16, %ebx

        /* This would be so much better if we could just move directly from
         * an SSE register to an MMX register.  Unfortunately, that
         * functionality wasn't introduced until SSE2 with the MOVDQ2Q
         * instruction.
         */

        movaps  %xmm0, (%esp)
        movq    (%esp), %mm0
        movq    8(%esp), %mm5

        movq    %mm0, %mm3
        movq    %mm0, %mm4
        movq    %mm5, %mm6
        movq    %mm5, %mm7

        pand    %mm2, %mm3
        pand    %mm2, %mm6

        psllq   $16, %mm4
        psllq   $16, %mm7

        psrlq   $16, %mm3
        psrlq   $16, %mm6

        pand    %mm2, %mm4
        pand    %mm2, %mm7

        pand    %mm1, %mm0
        pand    %mm1, %mm5

        por     %mm4, %mm3
        por     %mm7, %mm6

        por     %mm3, %mm0
        por     %mm6, %mm5

        movq    %mm0, (%ecx)
        movq    %mm5, 8(%ecx)
        addl    $16, %ecx

        subl    $1, %eax
.L33:
        jne     .L34

#ifdef USE_INNER_EMMS
        emms
#endif
        movl    %ebp, %esp

        /* At this point there are either [0, 3] pixels remaining to be
         * converted.
         */

        testl   $2, %edx
        je      .L36

        movq    (%ebx), %mm0
        addl    $8, %ebx

        movq    %mm0, %mm3
        movq    %mm0, %mm4

        pand    %mm2, %mm3
        psllq   $16, %mm4
        psrlq   $16, %mm3
        pand    %mm2, %mm4

        pand    %mm1, %mm0
        por     %mm4, %mm3
        por     %mm3, %mm0

        movq    %mm0, (%ecx)
        addl    $8, %ecx
.L36:

        testl   $1, %edx
        je      .L35

        DO_ONE_LAST_PIXEL()
.L35:
        popl    %ebp
        popl    %ebx
        popl    %esi
        ret
        .size   _generic_read_RGBA_span_BGRA8888_REV_SSE, .-_generic_read_RGBA_span_BGRA8888_REV_SSE


/**
 * SSE2 optimized version of the BGRA8888_REV to RGBA copy routine.
 */

        .text
        .globl  _generic_read_RGBA_span_BGRA8888_REV_SSE2
        .type   _generic_read_RGBA_span_BGRA8888_REV_SSE2, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE2:
        pushl   %esi
        pushl   %ebx

        movdqa  mask, %xmm1
        movdqa  mask+16, %xmm2

        movl    12(%esp), %ebx  /* source pointer */
        movl    20(%esp), %edx  /* number of pixels to copy */
        movl    16(%esp), %ecx  /* destination pointer */

        movl    %ebx, %eax
        movl    %edx, %esi

        /* If the source pointer isn't a multiple of 16 we have to process
         * a few pixels the "slow" way to get the address aligned for
         * the SSE fetch instructions.
         */

        negl    %eax
        andl    $15, %eax
        sarl    $2, %eax

        cmpl    %edx, %eax
        cmovbe  %eax, %esi
        subl    %esi, %edx

        testl   $1, %esi
        je      .L41

        DO_ONE_PIXEL()
.L41:
        testl   $2, %esi
        je      .L40

        movq    (%ebx), %xmm0
        addl    $8, %ebx

        movdqa  %xmm0, %xmm3
        movdqa  %xmm0, %xmm4
        andps   %xmm1, %xmm0

        andps   %xmm2, %xmm3
        pslldq  $2, %xmm4
        psrldq  $2, %xmm3
        andps   %xmm2, %xmm4

        orps    %xmm4, %xmm3
        orps    %xmm3, %xmm0

        movq    %xmm0, (%ecx)
        addl    $8, %ecx
.L40:

        /* Would it be worth having a specialized version of this loop for
         * the case where the destination is 16-byte aligned?  That version
         * would be identical except that it could use movdqa instead of
         * movdqu.
         */

        movl    %edx, %eax
        shrl    $2, %eax
        jmp     .L42
.L43:
        movdqa  (%ebx), %xmm0
        addl    $16, %ebx

        movdqa  %xmm0, %xmm3
        movdqa  %xmm0, %xmm4
        andps   %xmm1, %xmm0

        andps   %xmm2, %xmm3
        pslldq  $2, %xmm4
        psrldq  $2, %xmm3
        andps   %xmm2, %xmm4

        orps    %xmm4, %xmm3
        orps    %xmm3, %xmm0

        movdqu  %xmm0, (%ecx)
        addl    $16, %ecx
        subl    $1, %eax
.L42:
        jne     .L43


        /* There may be up to 3 pixels remaining to be copied.  Take care
         * of them now.  We do the 2-pixel case first because the data
         * will be aligned.
         */

        testl   $2, %edx
        je      .L47

        movq    (%ebx), %xmm0
        addl    $8, %ebx

        movdqa  %xmm0, %xmm3
        movdqa  %xmm0, %xmm4
        andps   %xmm1, %xmm0

        andps   %xmm2, %xmm3
        pslldq  $2, %xmm4
        psrldq  $2, %xmm3
        andps   %xmm2, %xmm4

        orps    %xmm4, %xmm3
        orps    %xmm3, %xmm0

        movq    %xmm0, (%ecx)
        addl    $8, %ecx
.L47:

        testl   $1, %edx
        je      .L46

        DO_ONE_LAST_PIXEL()
.L46:

        popl    %ebx
        popl    %esi
        ret
        .size   _generic_read_RGBA_span_BGRA8888_REV_SSE2, .-_generic_read_RGBA_span_BGRA8888_REV_SSE2



        .section .rodata

        .align  16
mask_565:
        .word   0xf800
        .word   0x07e0
        .word   0x001f
        .word   0x0000

/* Setting SCALE_ADJUST to 5 gives a perfect match with the classic C
 * implementation in Mesa.  Setting SCALE_ADJUST to 0 is slightly faster but
 * at a small cost to accuracy.
 */

#define SCALE_ADJUST 5
#if SCALE_ADJUST == 5
prescale:
        .word   0x0001
        .word   0x0010
        .word   0x0200
        .word   0x0000

scale:
        .word   0x20e8          /* (0x00ff0000 / 0x000007c0) + 1 */
        .word   0x40c5          /* (0x00ff0000 / 0x000003f0) + 1 */
        .word   0x839d          /* (0x00ff0000 / 0x000001f0) + 1 */
        .word   0x0000
#elif SCALE_ADJUST == 0
prescale:
        .word   0x0001
        .word   0x0020
        .word   0x0800
        .word   0x0000

scale:
        .word   0x0108          /* (0x00ff0000 / 0x0000f800) + 1 */
        .word   0x0104          /* (0x00ff0000 / 0x0000fc00) + 1 */
        .word   0x0108          /* (0x00ff0000 / 0x0000f800) + 1 */
        .word   0x0000
#else
#error SCALE_ADJUST must either be 5 or 0.
#endif
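
/* Worked example for the red channel with SCALE_ADJUST == 5: the 5-bit red
 * value r sits in bits 15-11, so the masked word is r << 11.  Multiplying by
 * the prescale value 0x0001 and shifting right by 5 leaves r << 6 (at most
 * 0x7c0).  pmulhuw with 0x20e8 then yields ((r << 6) * 0x20e8) >> 16, which
 * maps 0..31 onto 0..255 (the same result as the integer expression
 * r * 255 / 31).  The green and blue channels work the same way with their
 * own prescale and scale values.
 */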


alpha:  .long   0x00000000
        .long   0x00ff0000
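
/* As a little-endian quadword, alpha has 0x00ff in its top (fourth) 16-bit
 * word and zeros elsewhere.  ORing it into a register whose words hold the
 * scaled R, G, B values and a zero alpha word forces that word to 0x00ff,
 * so packuswb produces an alpha byte of 0xff.
 */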

/**
 * MMX optimized version of the RGB565 to RGBA copy routine.
 */

        .text
        .globl  _generic_read_RGBA_span_RGB565_MMX
        .type   _generic_read_RGBA_span_RGB565_MMX, @function

_generic_read_RGBA_span_RGB565_MMX:

#ifdef USE_INNER_EMMS
        emms
#endif

        movl    4(%esp), %eax   /* source pointer */
        movl    8(%esp), %edx   /* destination pointer */
        movl    12(%esp), %ecx  /* number of pixels to copy */

        movq    mask_565, %mm5
        movq    prescale, %mm6
        movq    scale, %mm7

        shrl    $2, %ecx
        jmp     .L02

.L03:
        /* Fetch 4 RGB565 pixels into %mm4.  Distribute the first and
         * second pixels into the four words of %mm0 and %mm2.
         */

        movq    (%eax), %mm4
        addl    $8, %eax

        pshufw  $0x00, %mm4, %mm0
        pshufw  $0x55, %mm4, %mm2


        /* Mask the pixels so that each word of each register contains only
         * one color component.
         */

        pand    %mm5, %mm0
        pand    %mm5, %mm2


        /* Adjust the component values so that they are as small as possible,
         * but large enough so that we can multiply them by an unsigned 16-bit
         * number and get a value as large as 0x00ff0000.
         */

        pmullw  %mm6, %mm0
        pmullw  %mm6, %mm2
#if SCALE_ADJUST > 0
        psrlw   $SCALE_ADJUST, %mm0
        psrlw   $SCALE_ADJUST, %mm2
#endif

        /* Scale the input component values to be in the range
         * [0, 0x00ff0000].  This is the real magic of the whole routine.
         */

        pmulhuw %mm7, %mm0
        pmulhuw %mm7, %mm2


        /* Always set the alpha value to 0xff.
         */

        por     alpha, %mm0
        por     alpha, %mm2


        /* Pack the 16-bit values to 8-bit values and store the converted
         * pixel data.
         */

        packuswb %mm2, %mm0
        movq    %mm0, (%edx)
        addl    $8, %edx


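        /* Distribute the third and fourth pixels into %mm0 and %mm2 and
         * convert them the same way.
         */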
        pshufw  $0xaa, %mm4, %mm0
        pshufw  $0xff, %mm4, %mm2

        pand    %mm5, %mm0
        pand    %mm5, %mm2
        pmullw  %mm6, %mm0
        pmullw  %mm6, %mm2
#if SCALE_ADJUST > 0
        psrlw   $SCALE_ADJUST, %mm0
        psrlw   $SCALE_ADJUST, %mm2
#endif
        pmulhuw %mm7, %mm0
        pmulhuw %mm7, %mm2

        por     alpha, %mm0
        por     alpha, %mm2

        packuswb %mm2, %mm0

        movq    %mm0, (%edx)
        addl    $8, %edx

        subl    $1, %ecx
.L02:
        jne     .L03


        /* At this point there can be at most 3 pixels left to process.  If
         * there are either 2 or 3 left, process 2.
         */

        movl    12(%esp), %ecx
        testl   $0x02, %ecx
        je      .L04

        movd    (%eax), %mm4
        addl    $4, %eax

        pshufw  $0x00, %mm4, %mm0
        pshufw  $0x55, %mm4, %mm2

        pand    %mm5, %mm0
        pand    %mm5, %mm2
        pmullw  %mm6, %mm0
        pmullw  %mm6, %mm2
#if SCALE_ADJUST > 0
        psrlw   $SCALE_ADJUST, %mm0
        psrlw   $SCALE_ADJUST, %mm2
#endif
        pmulhuw %mm7, %mm0
        pmulhuw %mm7, %mm2

        por     alpha, %mm0
        por     alpha, %mm2

        packuswb %mm2, %mm0

        movq    %mm0, (%edx)
        addl    $8, %edx

.L04:
        /* At this point there can be at most 1 pixel left to process.
         * Process it if needed.
         */

        testl   $0x01, %ecx
        je      .L01

        movzwl  (%eax), %ecx
        movd    %ecx, %mm4

        pshufw  $0x00, %mm4, %mm0

        pand    %mm5, %mm0
        pmullw  %mm6, %mm0
#if SCALE_ADJUST > 0
        psrlw   $SCALE_ADJUST, %mm0
#endif
        pmulhuw %mm7, %mm0

        por     alpha, %mm0

        packuswb %mm0, %mm0

        movd    %mm0, (%edx)

.L01:
#ifdef USE_INNER_EMMS
        emms
#endif
        ret
#endif /* !defined(__DJGPP__) && !defined(__MINGW32__) */