/*
 * (C) Copyright IBM Corporation 2004
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
 * IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

/**
 * \file read_rgba_span_x86.S
 * Optimized routines to transfer pixel data from the framebuffer to a
 * buffer in main memory.
 *
 * \author Ian Romanick <idr@us.ibm.com>
 */

        .file   "read_rgba_span_x86.S"
#if !defined(__DJGPP__) && !defined(__MINGW32__) /* this one cries for assyntax.h */
        .section .rodata
        .align  16
        .type   mask, @object
        .size   mask, 32
mask:
        .long   0xff00ff00
        .long   0xff00ff00
        .long   0xff00ff00
        .long   0xff00ff00
        .long   0x00ff0000
        .long   0x00ff0000
        .long   0x00ff0000
        .long   0x00ff0000
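
/* The first quadword (0xff00ff00 per pixel) keeps the A and G bytes of each
 * BGRA pixel in place; the second (0x00ff0000 per pixel) selects the byte
 * lane that holds R in the source and B in the destination.  The conversion
 * loops below therefore only have to swap the R and B bytes.
 */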


/* I implemented these as macros because they appear in quite a few places,
 * and I've tweaked them a number of times.  I got tired of changing every
 * place they appear. :)
 */

#define DO_ONE_PIXEL() \
        movl    (%ebx), %eax ; \
        addl    $4, %ebx ; \
        bswap   %eax            /* ARGB -> BGRA */ ; \
        rorl    $8, %eax        /* BGRA -> ABGR */ ; \
        movl    %eax, (%ecx)    /* ABGR -> R, G, B, A */ ; \
        addl    $4, %ecx

#define DO_ONE_LAST_PIXEL() \
        movl    (%ebx), %eax ; \
        bswap   %eax            /* ARGB -> BGRA */ ; \
        rorl    $8, %eax        /* BGRA -> ABGR */ ; \
        movl    %eax, (%ecx)    /* ABGR -> R, G, B, A */

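/* In rough C, each iteration of DO_ONE_PIXEL() does the following (a sketch
 * for reference, with src and dst standing in for %ebx and %ecx):
 *
 *     uint32_t pixel = *src++;               // 0xAARRGGBB in a register
 *     pixel = __builtin_bswap32(pixel);      // bswap:   0xBBGGRRAA
 *     pixel = (pixel >> 8) | (pixel << 24);  // rorl $8: 0xAABBGGRR
 *     *dst++ = pixel;                        // bytes R, G, B, A in memory
 */
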

/**
 * MMX optimized version of the BGRA8888_REV to RGBA copy routine.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
 */
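
/* The C-side declaration (in Mesa's read_rgba_span_x86.h) is, roughly:
 *
 *     void _generic_read_RGBA_span_BGRA8888_REV_MMX(const unsigned char *src,
 *                                                   unsigned char *dest,
 *                                                   unsigned num_pixels);
 *
 * so 8(%esp), 16(%esp), and 12(%esp) below are src, num_pixels, and dest
 * once %ebx has been pushed.  The SSE and SSE2 variants take the same
 * arguments.
 */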

        .globl  _generic_read_RGBA_span_BGRA8888_REV_MMX
        .hidden _generic_read_RGBA_span_BGRA8888_REV_MMX
        .type   _generic_read_RGBA_span_BGRA8888_REV_MMX, @function
_generic_read_RGBA_span_BGRA8888_REV_MMX:
        pushl   %ebx

#ifdef USE_INNER_EMMS
        emms
#endif
        movq    mask, %mm1
        movq    mask+16, %mm2

        movl    8(%esp), %ebx   /* source pointer */
        movl    16(%esp), %edx  /* number of pixels to copy */
        movl    12(%esp), %ecx  /* destination pointer */

        testl   %edx, %edx
        jle     .L20            /* Bail if there's nothing to do. */

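        /* %eax = ((-src) >> 2) & 1, i.e. 1 when the (4-byte aligned)
         * source is not 8-byte aligned.  In that case one pixel is
         * converted separately so the movq loads below start on an
         * 8-byte boundary.
         */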
        movl    %ebx, %eax

        negl    %eax
        sarl    $2, %eax
        andl    $1, %eax
        je      .L17

        subl    %eax, %edx
        DO_ONE_PIXEL()
.L17:

        /* Would it be faster to unroll this loop once and process 4 pixels
         * per pass, instead of just two?
         */

        movl    %edx, %eax
        shrl    %eax
        jmp     .L18
.L19:
        movq    (%ebx), %mm0
        addl    $8, %ebx

        /* These 9 instructions do what PSHUFB (if there were such an
         * instruction) could do in 1. :(
         */
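        /* Per 32-bit pixel x, the sequence below computes
         *
         *   (x & 0xff00ff00) | ((x & 0x00ff0000) >> 16) | ((x << 16) & 0x00ff0000)
         *
         * which swaps the R and B bytes while G and A stay put.
         */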

        movq    %mm0, %mm3
        movq    %mm0, %mm4

        pand    %mm2, %mm3
        psllq   $16, %mm4
        psrlq   $16, %mm3
        pand    %mm2, %mm4

        pand    %mm1, %mm0
        por     %mm4, %mm3
        por     %mm3, %mm0

        movq    %mm0, (%ecx)
        addl    $8, %ecx
        subl    $1, %eax
.L18:
        jne     .L19

#ifdef USE_INNER_EMMS
        emms
#endif

        /* At this point there are either 1 or 0 pixels remaining to be
         * converted.  Convert the last pixel, if needed.
         */

        testl   $1, %edx
        je      .L20

        DO_ONE_LAST_PIXEL()

.L20:
        popl    %ebx
        ret
        .size   _generic_read_RGBA_span_BGRA8888_REV_MMX, .-_generic_read_RGBA_span_BGRA8888_REV_MMX


/**
 * SSE optimized version of the BGRA8888_REV to RGBA copy routine.  SSE
 * instructions are only actually used to read data from the framebuffer.
 * In practice, the speed-up is pretty small.
 *
 * \todo
 * Do some more testing and determine if there's any reason to have this
 * function in addition to the MMX version.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
 */

        .globl  _generic_read_RGBA_span_BGRA8888_REV_SSE
        .hidden _generic_read_RGBA_span_BGRA8888_REV_SSE
        .type   _generic_read_RGBA_span_BGRA8888_REV_SSE, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE:
        pushl   %esi
        pushl   %ebx
        pushl   %ebp

#ifdef USE_INNER_EMMS
        emms
#endif
        movq    mask, %mm1
        movq    mask+16, %mm2

        movl    16(%esp), %ebx  /* source pointer */
        movl    24(%esp), %edx  /* number of pixels to copy */
        movl    20(%esp), %ecx  /* destination pointer */

        testl   %edx, %edx
        jle     .L35            /* Bail if there's nothing to do. */

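        /* Save %esp in %ebp and carve out a 16-byte aligned scratch slot
         * on the stack; the loop below spills %xmm0 there with movaps so
         * its halves can be reloaded into MMX registers.
         */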
        movl    %esp, %ebp
        subl    $16, %esp
        andl    $0xfffffff0, %esp

        movl    %ebx, %eax
        movl    %edx, %esi

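        /* %eax = number of leading pixels (0-3) that must be handled one
         * at a time so the movaps reads start on a 16-byte boundary;
         * %esi becomes min(%eax, pixel count), and those pixels are
         * peeled off via the $1 and $2 bit tests below.
         */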
        negl    %eax
        andl    $15, %eax
        sarl    $2, %eax
        cmpl    %edx, %eax
        cmovle  %eax, %esi

        subl    %esi, %edx

        testl   $1, %esi
        je      .L32

        DO_ONE_PIXEL()
.L32:

        testl   $2, %esi
        je      .L31

        movq    (%ebx), %mm0
        addl    $8, %ebx

        movq    %mm0, %mm3
        movq    %mm0, %mm4

        pand    %mm2, %mm3
        psllq   $16, %mm4
        psrlq   $16, %mm3
        pand    %mm2, %mm4

        pand    %mm1, %mm0
        por     %mm4, %mm3
        por     %mm3, %mm0

        movq    %mm0, (%ecx)
        addl    $8, %ecx
.L31:

        movl    %edx, %eax
        shrl    $2, %eax
        jmp     .L33
.L34:
        movaps  (%ebx), %xmm0
        addl    $16, %ebx

        /* This would be so much better if we could just move directly from
         * an SSE register to an MMX register.  Unfortunately, that
         * functionality wasn't introduced until SSE2 with the MOVDQ2Q
         * instruction.
         */

        movaps  %xmm0, (%esp)
        movq    (%esp), %mm0
        movq    8(%esp), %mm5

        movq    %mm0, %mm3
        movq    %mm0, %mm4
        movq    %mm5, %mm6
        movq    %mm5, %mm7

        pand    %mm2, %mm3
        pand    %mm2, %mm6

        psllq   $16, %mm4
        psllq   $16, %mm7

        psrlq   $16, %mm3
        psrlq   $16, %mm6

        pand    %mm2, %mm4
        pand    %mm2, %mm7

        pand    %mm1, %mm0
        pand    %mm1, %mm5

        por     %mm4, %mm3
        por     %mm7, %mm6

        por     %mm3, %mm0
        por     %mm6, %mm5

        movq    %mm0, (%ecx)
        movq    %mm5, 8(%ecx)
        addl    $16, %ecx

        subl    $1, %eax
.L33:
        jne     .L34

#ifdef USE_INNER_EMMS
        emms
#endif
        movl    %ebp, %esp

        /* At this point there are either [0, 3] pixels remaining to be
         * converted.
         */

        testl   $2, %edx
        je      .L36

        movq    (%ebx), %mm0
        addl    $8, %ebx

        movq    %mm0, %mm3
        movq    %mm0, %mm4

        pand    %mm2, %mm3
        psllq   $16, %mm4
        psrlq   $16, %mm3
        pand    %mm2, %mm4

        pand    %mm1, %mm0
        por     %mm4, %mm3
        por     %mm3, %mm0

        movq    %mm0, (%ecx)
        addl    $8, %ecx
.L36:

        testl   $1, %edx
        je      .L35

        DO_ONE_LAST_PIXEL()
.L35:
        popl    %ebp
        popl    %ebx
        popl    %esi
        ret
        .size   _generic_read_RGBA_span_BGRA8888_REV_SSE, .-_generic_read_RGBA_span_BGRA8888_REV_SSE


/**
 * SSE2 optimized version of the BGRA8888_REV to RGBA copy routine.
 */
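
/* Unlike the MMX and SSE versions above, this routine touches only XMM
 * registers, so the caller does not need to bracket it with EMMS.
 */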

        .text
        .globl  _generic_read_RGBA_span_BGRA8888_REV_SSE2
        .hidden _generic_read_RGBA_span_BGRA8888_REV_SSE2
        .type   _generic_read_RGBA_span_BGRA8888_REV_SSE2, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE2:
        pushl   %esi
        pushl   %ebx

        movdqa  mask, %xmm1
        movdqa  mask+16, %xmm2

        movl    12(%esp), %ebx  /* source pointer */
        movl    20(%esp), %edx  /* number of pixels to copy */
        movl    16(%esp), %ecx  /* destination pointer */

        movl    %ebx, %eax
        movl    %edx, %esi

        testl   %edx, %edx
        jle     .L46            /* Bail if there's nothing to do. */

        /* If the source pointer isn't a multiple of 16 we have to process
         * a few pixels the "slow" way to get the address aligned for
         * the SSE fetch instructions.
         */

        negl    %eax
        andl    $15, %eax
        sarl    $2, %eax

        cmpl    %edx, %eax
        cmovbe  %eax, %esi
        subl    %esi, %edx

        testl   $1, %esi
        je      .L41

        DO_ONE_PIXEL()
.L41:
        testl   $2, %esi
        je      .L40

        movq    (%ebx), %xmm0
        addl    $8, %ebx

        movdqa  %xmm0, %xmm3
        movdqa  %xmm0, %xmm4
        andps   %xmm1, %xmm0

        andps   %xmm2, %xmm3
        pslldq  $2, %xmm4
        psrldq  $2, %xmm3
        andps   %xmm2, %xmm4

        orps    %xmm4, %xmm3
        orps    %xmm3, %xmm0

        movq    %xmm0, (%ecx)
        addl    $8, %ecx
.L40:

        /* Would it be worth having a specialized version of this loop for
         * the case where the destination is 16-byte aligned?  That version
         * would be identical except that it could use movdqa instead of
         * movdqu.
         */

        movl    %edx, %eax
        shrl    $2, %eax
        jmp     .L42
.L43:
        movdqa  (%ebx), %xmm0
        addl    $16, %ebx

        movdqa  %xmm0, %xmm3
        movdqa  %xmm0, %xmm4
        andps   %xmm1, %xmm0

        andps   %xmm2, %xmm3
        pslldq  $2, %xmm4
        psrldq  $2, %xmm3
        andps   %xmm2, %xmm4

        orps    %xmm4, %xmm3
        orps    %xmm3, %xmm0

        movdqu  %xmm0, (%ecx)
        addl    $16, %ecx
        subl    $1, %eax
.L42:
        jne     .L43


        /* There may be up to 3 pixels remaining to be copied.  Take care
         * of them now.  We do the 2 pixel case first because the data
         * will be aligned.
         */

        testl   $2, %edx
        je      .L47

        movq    (%ebx), %xmm0
        addl    $8, %ebx        /* step past the two pixels read here ... */

        movdqa  %xmm0, %xmm3
        movdqa  %xmm0, %xmm4
        andps   %xmm1, %xmm0

        andps   %xmm2, %xmm3
        pslldq  $2, %xmm4
        psrldq  $2, %xmm3
        andps   %xmm2, %xmm4

        orps    %xmm4, %xmm3
        orps    %xmm3, %xmm0

        movq    %xmm0, (%ecx)
        addl    $8, %ecx        /* ... so DO_ONE_LAST_PIXEL() uses the right addresses. */
.L47:

        testl   $1, %edx
        je      .L46

        DO_ONE_LAST_PIXEL()
.L46:

        popl    %ebx
        popl    %esi
        ret
        .size   _generic_read_RGBA_span_BGRA8888_REV_SSE2, .-_generic_read_RGBA_span_BGRA8888_REV_SSE2



        .section .rodata

        .align  16
mask_565:
        .word   0xf800
        .word   0x07e0
        .word   0x001f
        .word   0x0000

/* Setting SCALE_ADJUST to 5 gives a perfect match with the classic C
 * implementation in Mesa.  Setting SCALE_ADJUST to 0 is slightly faster but
 * at a small cost to accuracy.
 */

#define SCALE_ADJUST 5
#if SCALE_ADJUST == 5
prescale:
        .word   0x0001
        .word   0x0010
        .word   0x0200
        .word   0x0000

scale:
        .word   0x20e8          /* (0x00ff0000 / 0x000007c0) + 1 */
        .word   0x40c5          /* (0x00ff0000 / 0x000003f0) + 1 */
        .word   0x839d          /* (0x00ff0000 / 0x000001f0) + 1 */
        .word   0x0000
#elif SCALE_ADJUST == 0
prescale:
        .word   0x0001
        .word   0x0020
        .word   0x0800
        .word   0x0000

scale:
        .word   0x0108          /* (0x00ff0000 / 0x0000f800) + 1 */
        .word   0x0104          /* (0x00ff0000 / 0x0000fc00) + 1 */
        .word   0x0108          /* (0x00ff0000 / 0x0000f800) + 1 */
        .word   0x0000
#else
#error SCALE_ADJUST must either be 5 or 0.
#endif


alpha:  .long   0x00000000
        .long   0x00ff0000
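
/* For reference, the whole 565 -> 888 expansion per component, sketched in
 * rough C (assuming SCALE_ADJUST == 5; `pix' is one RGB565 pixel):
 *
 *     unsigned c = pix & 0xf800;          // pand with mask_565
 *     c = (c * 0x0001) >> 5;              // pmullw prescale, psrlw
 *     unsigned red = (c * 0x20e8) >> 16;  // pmulhuw scale -> [0, 255]
 *
 * and likewise for green (0x07e0, 0x0010, 0x40c5) and blue (0x001f, 0x0200,
 * 0x839d).  The prescale step moves each component into a range where a
 * single unsigned 16-bit scale factor can stretch it to at most 0x00ff0000,
 * whose high word pmulhuw then extracts.
 */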

/**
 * MMX optimized version of the RGB565 to RGBA copy routine.
 */

        .text
        .globl  _generic_read_RGBA_span_RGB565_MMX
        .hidden _generic_read_RGBA_span_RGB565_MMX
        .type   _generic_read_RGBA_span_RGB565_MMX, @function

_generic_read_RGBA_span_RGB565_MMX:

#ifdef USE_INNER_EMMS
        emms
#endif

        movl    4(%esp), %eax   /* source pointer */
        movl    8(%esp), %edx   /* destination pointer */
        movl    12(%esp), %ecx  /* number of pixels to copy */

        movq    mask_565, %mm5
        movq    prescale, %mm6
        movq    scale, %mm7

        sarl    $2, %ecx
        jl      .L01            /* Bail early if the count is negative;
                                 * counts of 1-3 fall through to the
                                 * remainder handling below. */
        jmp     .L02

.L03:
        /* Fetch 4 RGB565 pixels into %mm4.  Distribute the first and
         * second pixels into the four words of %mm0 and %mm2.
         */

        movq    (%eax), %mm4
        addl    $8, %eax

        pshufw  $0x00, %mm4, %mm0
        pshufw  $0x55, %mm4, %mm2


        /* Mask the pixels so that each word of each register contains only
         * one color component.
         */

        pand    %mm5, %mm0
        pand    %mm5, %mm2


        /* Adjust the component values so that they are as small as possible,
         * but large enough so that we can multiply them by an unsigned 16-bit
         * number and get a value as large as 0x00ff0000.
         */

        pmullw  %mm6, %mm0
        pmullw  %mm6, %mm2
#if SCALE_ADJUST > 0
        psrlw   $SCALE_ADJUST, %mm0
        psrlw   $SCALE_ADJUST, %mm2
#endif

        /* Scale the input component values to be on the range
         * [0, 0x00ff0000].  This is the real magic of the whole routine.
         */

        pmulhuw %mm7, %mm0
        pmulhuw %mm7, %mm2


        /* Always set the alpha value to 0xff.
         */

        por     alpha, %mm0
        por     alpha, %mm2


        /* Pack the 16-bit values to 8-bit values and store the converted
         * pixel data.
         */

        packuswb %mm2, %mm0
        movq    %mm0, (%edx)
        addl    $8, %edx


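        /* Same recipe for the third and fourth pixels of the group. */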
        pshufw  $0xaa, %mm4, %mm0
        pshufw  $0xff, %mm4, %mm2

        pand    %mm5, %mm0
        pand    %mm5, %mm2
        pmullw  %mm6, %mm0
        pmullw  %mm6, %mm2
#if SCALE_ADJUST > 0
        psrlw   $SCALE_ADJUST, %mm0
        psrlw   $SCALE_ADJUST, %mm2
#endif
        pmulhuw %mm7, %mm0
        pmulhuw %mm7, %mm2

        por     alpha, %mm0
        por     alpha, %mm2

        packuswb %mm2, %mm0

        movq    %mm0, (%edx)
        addl    $8, %edx

        subl    $1, %ecx
.L02:
        jne     .L03


        /* At this point there can be at most 3 pixels left to process.  If
         * there is either 2 or 3 left, process 2.
         */

        movl    12(%esp), %ecx
        testl   $0x02, %ecx
        je      .L04

        movd    (%eax), %mm4
        addl    $4, %eax

        pshufw  $0x00, %mm4, %mm0
        pshufw  $0x55, %mm4, %mm2

        pand    %mm5, %mm0
        pand    %mm5, %mm2
        pmullw  %mm6, %mm0
        pmullw  %mm6, %mm2
#if SCALE_ADJUST > 0
        psrlw   $SCALE_ADJUST, %mm0
        psrlw   $SCALE_ADJUST, %mm2
#endif
        pmulhuw %mm7, %mm0
        pmulhuw %mm7, %mm2

        por     alpha, %mm0
        por     alpha, %mm2

        packuswb %mm2, %mm0

        movq    %mm0, (%edx)
        addl    $8, %edx

.L04:
        /* At this point there can be at most 1 pixel left to process.
         * Process it if needed.
         */

        testl   $0x01, %ecx
        je      .L01

        movzwl  (%eax), %ecx
        movd    %ecx, %mm4

        pshufw  $0x00, %mm4, %mm0

        pand    %mm5, %mm0
        pmullw  %mm6, %mm0
#if SCALE_ADJUST > 0
        psrlw   $SCALE_ADJUST, %mm0
#endif
        pmulhuw %mm7, %mm0

        por     alpha, %mm0

        packuswb %mm0, %mm0

        movd    %mm0, (%edx)

.L01:
#ifdef USE_INNER_EMMS
        emms
#endif
        ret
#endif /* !defined(__DJGPP__) && !defined(__MINGW32__) */

#if defined (__ELF__) && defined (__linux__)
        .section .note.GNU-stack,"",%progbits
#endif