/* use HIDDEN macro to export fewer symbols (bug 2210)
 * [mesa.git] / src / mesa / x86 / read_rgba_span_x86.S
 */
1 /*
2 * (C) Copyright IBM Corporation 2004
3 * All Rights Reserved.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * on the rights to use, copy, modify, merge, publish, distribute, sub
9 * license, and/or sell copies of the Software, and to permit persons to whom
10 * the Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19 * IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22 * USE OR OTHER DEALINGS IN THE SOFTWARE.
23 */
24
25 /**
26 * \file read_rgba_span_x86.S
27 * Optimized routines to transfer pixel data from the framebuffer to a
28 * buffer in main memory.
29 *
30 * \author Ian Romanick <idr@us.ibm.com>
31 */
32
	.file	"read_rgba_span_x86.S"
#if !defined(__DJGPP__) && !defined(__MINGW32__) /* this one cries for assyntax.h */
/* Byte-select masks shared by the BGRA8888_REV -> RGBA copy routines below.
 * With a pixel loaded little-endian into a register (value 0xAARRGGBB):
 *   mask     (0xff00ff00 per pixel): keeps the G and A bytes in place.
 *   mask+16  (0x00ff0000 per pixel): selects the byte that must trade
 *            places with its mate (R <-> B) via the 16-bit shifts in the
 *            conversion loops.
 * 32 bytes total so the SSE2 routine can load each half as one XMM mask.
 */
	.section	.rodata
	.align	16
	.type	mask, @object
	.size	mask, 32
mask:
	.long	0xff00ff00
	.long	0xff00ff00
	.long	0xff00ff00
	.long	0xff00ff00
	.long	0x00ff0000
	.long	0x00ff0000
	.long	0x00ff0000
	.long	0x00ff0000
48
49
/* I implemented these as macros because they appear in quite a few places,
 * and I've tweaked them a number of times.  I got tired of changing every
 * place they appear. :)
 */

/* Convert the BGRA8888_REV pixel at (%ebx) to RGBA, store it at (%ecx),
 * and advance both pointers by 4 bytes.  Clobbers %eax and flags.
 */
#define DO_ONE_PIXEL() \
	movl	(%ebx), %eax ; \
	addl	$4, %ebx ; \
	bswap	%eax /* ARGB -> BGRA */ ; \
	rorl	$8, %eax /* BGRA -> ABGR */ ; \
	movl	%eax, (%ecx) /* ABGR -> R, G, B, A */ ; \
	addl	$4, %ecx

/* Same conversion, but for the final pixel of a span: the source and
 * destination pointers are left unchanged.  Clobbers %eax and flags.
 * Note: the definition must NOT end with a "; \" continuation, or the
 * line following the macro definition gets absorbed into it.
 */
#define DO_ONE_LAST_PIXEL() \
	movl	(%ebx), %eax ; \
	bswap	%eax /* ARGB -> BGRA */ ; \
	rorl	$8, %eax /* BGRA -> ABGR */ ; \
	movl	%eax, (%ecx) /* ABGR -> R, G, B, A */
69
/**
 * MMX optimized version of the BGRA8888_REV to RGBA copy routine.
 *
 * Stack arguments (cdecl): source pointer, destination pointer,
 * number of pixels to copy.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
 */

	.text
.globl _generic_read_RGBA_span_BGRA8888_REV_MMX
	.hidden _generic_read_RGBA_span_BGRA8888_REV_MMX
	.type _generic_read_RGBA_span_BGRA8888_REV_MMX, @function
_generic_read_RGBA_span_BGRA8888_REV_MMX:
	pushl	%ebx

#ifdef USE_INNER_EMMS
	emms
#endif
	movq	mask, %mm1		/* G/A byte-select mask */
	movq	mask+16, %mm2		/* R/B byte-select mask */

	movl	8(%esp), %ebx		/* source pointer */
	movl	16(%esp), %edx		/* number of pixels to copy */
	movl	12(%esp), %ecx		/* destination pointer */

	testl	%edx, %edx
	je	.L20			/* Bail if there's nothing to do. */

	movl	%ebx, %eax

	/* If the (4-byte aligned) source pointer is not 8-byte aligned,
	 * convert one pixel the "slow" way so the movq fetches in the
	 * main loop are aligned.
	 */
	negl	%eax
	sarl	$2, %eax
	andl	$1, %eax
	je	.L17

	subl	%eax, %edx
	DO_ONE_PIXEL()
.L17:

	/* Would it be faster to unroll this loop once and process 4 pixels
	 * per pass, instead of just two?
	 */

	movl	%edx, %eax
	shrl	%eax			/* eax = pixel pairs; ZF feeds the jne at .L18 */
	jmp	.L18
.L19:
	movq	(%ebx), %mm0		/* fetch 2 pixels */
	addl	$8, %ebx

	/* These 9 instructions do what PSHUFB (if there were such an
	 * instruction) could do in 1. :(
	 */

	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3		/* isolate the R bytes */
	psllq	$16, %mm4
	psrlq	$16, %mm3		/* R moved into the B position */
	pand	%mm2, %mm4		/* B moved into the R position */

	pand	%mm1, %mm0		/* G and A stay in place */
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)		/* store 2 converted pixels */
	addl	$8, %ecx
	subl	$1, %eax
.L18:
	jne	.L19

#ifdef USE_INNER_EMMS
	emms
#endif

	/* At this point there are either 1 or 0 pixels remaining to be
	 * converted.  Convert the last pixel, if needed.
	 */

	testl	$1, %edx
	je	.L20

	DO_ONE_LAST_PIXEL()

.L20:
	popl	%ebx
	ret
	.size _generic_read_RGBA_span_BGRA8888_REV_MMX, .-_generic_read_RGBA_span_BGRA8888_REV_MMX
158
159
/**
 * SSE optimized version of the BGRA8888_REV to RGBA copy routine.  SSE
 * instructions are only actually used to read data from the framebuffer.
 * In practice, the speed-up is pretty small.
 *
 * Stack arguments (cdecl): source pointer, destination pointer,
 * number of pixels to copy.
 *
 * \todo
 * Do some more testing and determine if there's any reason to have this
 * function in addition to the MMX version.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
 */

	.text
.globl _generic_read_RGBA_span_BGRA8888_REV_SSE
	.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE
	.type _generic_read_RGBA_span_BGRA8888_REV_SSE, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE:
	pushl	%esi
	pushl	%ebx
	pushl	%ebp

#ifdef USE_INNER_EMMS
	emms
#endif
	movq	mask, %mm1		/* G/A byte-select mask */
	movq	mask+16, %mm2		/* R/B byte-select mask */

	movl	16(%esp), %ebx		/* source pointer */
	movl	24(%esp), %edx		/* number of pixels to copy */
	movl	20(%esp), %ecx		/* destination pointer */

	/* Carve a 16-byte aligned spill slot out of the stack for the
	 * movaps bounce in the main loop; %ebp holds the original %esp.
	 */
	movl	%esp, %ebp
	subl	$16, %esp
	andl	$0xfffffff0, %esp

	movl	%ebx, %eax
	movl	%edx, %esi

	/* %eax = pixels (0..3) to convert individually until the source
	 * pointer is 16-byte aligned; %esi = min(%eax, count).
	 */
	negl	%eax
	andl	$15, %eax
	sarl	$2, %eax
	cmpl	%edx, %eax
	cmovle	%eax, %esi

	subl	%esi, %edx

	/* Handle the 1- and 2-pixel parts of the unaligned head. */
	testl	$1, %esi
	je	.L32

	DO_ONE_PIXEL()
.L32:

	testl	$2, %esi
	je	.L31

	/* Convert one pixel pair with the same MMX shuffle as the main
	 * loop of the MMX routine.
	 */
	movq	(%ebx), %mm0
	addl	$8, %ebx

	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3
	psllq	$16, %mm4
	psrlq	$16, %mm3
	pand	%mm2, %mm4

	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
.L31:

	movl	%edx, %eax
	shrl	$2, %eax		/* eax = groups of 4 pixels; ZF feeds jne at .L33 */
	jmp	.L33
.L34:
	movaps	(%ebx), %xmm0		/* aligned 16-byte fetch: 4 pixels */
	addl	$16, %ebx

	/* This would be so much better if we could just move directly from
	 * an SSE register to an MMX register.  Unfortunately, that
	 * functionality wasn't introduced until SSE2 with the MOVDQ2Q
	 * instruction.
	 */

	movaps	%xmm0, (%esp)		/* bounce through the aligned spill slot */
	movq	(%esp), %mm0
	movq	8(%esp), %mm5

	/* Shuffle both pixel pairs exactly as in the MMX routine. */
	movq	%mm0, %mm3
	movq	%mm0, %mm4
	movq	%mm5, %mm6
	movq	%mm5, %mm7

	pand	%mm2, %mm3
	pand	%mm2, %mm6

	psllq	$16, %mm4
	psllq	$16, %mm7

	psrlq	$16, %mm3
	psrlq	$16, %mm6

	pand	%mm2, %mm4
	pand	%mm2, %mm7

	pand	%mm1, %mm0
	pand	%mm1, %mm5

	por	%mm4, %mm3
	por	%mm7, %mm6

	por	%mm3, %mm0
	por	%mm6, %mm5

	movq	%mm0, (%ecx)
	movq	%mm5, 8(%ecx)
	addl	$16, %ecx

	subl	$1, %eax
.L33:
	jne	.L34

#ifdef USE_INNER_EMMS
	emms
#endif
	movl	%ebp, %esp		/* release the aligned spill slot */

	/* At this point there are either [0, 3] pixels remaining to be
	 * converted.
	 */

	testl	$2, %edx
	je	.L36

	movq	(%ebx), %mm0
	addl	$8, %ebx

	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3
	psllq	$16, %mm4
	psrlq	$16, %mm3
	pand	%mm2, %mm4

	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
.L36:

	testl	$1, %edx
	je	.L35

	DO_ONE_LAST_PIXEL()
.L35:
	popl	%ebp
	popl	%ebx
	popl	%esi
	ret
	.size _generic_read_RGBA_span_BGRA8888_REV_SSE, .-_generic_read_RGBA_span_BGRA8888_REV_SSE
327
328
/**
 * SSE2 optimized version of the BGRA8888_REV to RGBA copy routine.
 *
 * Stack arguments (cdecl): source pointer, destination pointer,
 * number of pixels to copy.  Only XMM registers are used, so no EMMS
 * bracketing is required here.
 */

	.text
.globl _generic_read_RGBA_span_BGRA8888_REV_SSE2
	.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE2
	.type _generic_read_RGBA_span_BGRA8888_REV_SSE2, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE2:
	pushl	%esi
	pushl	%ebx

	movdqa	mask, %xmm1		/* G/A byte-select mask */
	movdqa	mask+16, %xmm2		/* R/B byte-select mask */

	movl	12(%esp), %ebx		/* source pointer */
	movl	20(%esp), %edx		/* number of pixels to copy */
	movl	16(%esp), %ecx		/* destination pointer */

	movl	%ebx, %eax
	movl	%edx, %esi

	/* If the source pointer isn't a multiple of 16 we have to process
	 * a few pixels the "slow" way to get the address aligned for
	 * the SSE fetch instructions.
	 */

	negl	%eax
	andl	$15, %eax
	sarl	$2, %eax		/* eax = pixels (0..3) until 16-byte alignment */

	cmpl	%edx, %eax
	cmovbe	%eax, %esi		/* esi = min(alignment pixels, count) */
	subl	%esi, %edx

	testl	$1, %esi
	je	.L41

	DO_ONE_PIXEL()
.L41:
	testl	$2, %esi
	je	.L40

	/* Convert one pixel pair (low 8 bytes of xmm0). */
	movq	(%ebx), %xmm0
	addl	$8, %ebx

	movdqa	%xmm0, %xmm3
	movdqa	%xmm0, %xmm4
	andps	%xmm1, %xmm0		/* G and A stay in place */

	andps	%xmm2, %xmm3		/* isolate R bytes */
	pslldq	$2, %xmm4
	psrldq	$2, %xmm3		/* R moved into the B position */
	andps	%xmm2, %xmm4		/* B moved into the R position */

	orps	%xmm4, %xmm3
	orps	%xmm3, %xmm0

	movq	%xmm0, (%ecx)
	addl	$8, %ecx
.L40:

	/* Would it be worth having a specialized version of this loop for
	 * the case where the destination is 16-byte aligned?  That version
	 * would be identical except that it could use movedqa instead of
	 * movdqu.
	 */

	movl	%edx, %eax
	shrl	$2, %eax		/* eax = groups of 4 pixels; ZF feeds jne at .L42 */
	jmp	.L42
.L43:
	movdqa	(%ebx), %xmm0		/* aligned fetch: 4 pixels */
	addl	$16, %ebx

	movdqa	%xmm0, %xmm3
	movdqa	%xmm0, %xmm4
	andps	%xmm1, %xmm0

	andps	%xmm2, %xmm3
	pslldq	$2, %xmm4
	psrldq	$2, %xmm3
	andps	%xmm2, %xmm4

	orps	%xmm4, %xmm3
	orps	%xmm3, %xmm0

	movdqu	%xmm0, (%ecx)		/* destination may be unaligned */
	addl	$16, %ecx
	subl	$1, %eax
.L42:
	jne	.L43


	/* There may be up to 3 pixels remaining to be copied.  Take care
	 * of them now.  We do the 2 pixel case first because the data
	 * will be aligned.
	 */

	testl	$2, %edx
	je	.L47

	movq	(%ebx), %xmm0

	movdqa	%xmm0, %xmm3
	movdqa	%xmm0, %xmm4
	andps	%xmm1, %xmm0

	andps	%xmm2, %xmm3
	pslldq	$2, %xmm4
	psrldq	$2, %xmm3
	andps	%xmm2, %xmm4

	orps	%xmm4, %xmm3
	orps	%xmm3, %xmm0

	movq	%xmm0, (%ecx)
.L47:

	testl	$1, %edx
	je	.L46

	DO_ONE_LAST_PIXEL()
.L46:

	popl	%ebx
	popl	%esi
	ret
	.size _generic_read_RGBA_span_BGRA8888_REV_SSE2, .-_generic_read_RGBA_span_BGRA8888_REV_SSE2
458
459
460
	.section	.rodata

	.align	16
/* Per-component word masks for one RGB565 pixel.  The converter first
 * broadcasts a pixel to all four words of an MMX register with PSHUFW,
 * then ANDs with this quad so word 0 holds R, word 1 holds G, word 2
 * holds B (word 3 stays zero and later receives the alpha value).
 */
mask_565:
	.word	0xf800
	.word	0x07e0
	.word	0x001f
	.word	0x0000

/* Setting SCALE_ADJUST to 5 gives a perfect match with the classic C
 * implementation in Mesa.  Setting SCALE_ADJUST to 0 is slightly faster but
 * at a small cost to accuracy.
 */

#define SCALE_ADJUST 5
#if SCALE_ADJUST == 5
/* prescale shifts each masked component up so that, after the
 * SCALE_ADJUST right shift, the unsigned 16-bit multiply by `scale`
 * (PMULHUW) can reach 0x00ff0000 — i.e. 0xff in the high word.
 */
prescale:
	.word	0x0001
	.word	0x0010
	.word	0x0200
	.word	0x0000

scale:
	.word	0x20e8		/* (0x00ff0000 / 0x000007c0) + 1 */
	.word	0x40c5		/* (0x00ff0000 / 0x000003f0) + 1 */
	.word	0x839d		/* (0x00ff0000 / 0x000001f0) + 1 */
	.word	0x0000
#elif SCALE_ADJUST == 0
prescale:
	.word	0x0001
	.word	0x0020
	.word	0x0800
	.word	0x0000

scale:
	.word	0x0108		/* (0x00ff0000 / 0x0000f800) + 1 */
	.word	0x0104		/* (0x00ff0000 / 0x0000fc00) + 1 */
	.word	0x0108		/* (0x00ff0000 / 0x0000f800) + 1 */
	.word	0x0000
#else
#error SCALE_ADJUST must either be 5 or 0.
#endif


/* Quadword OR-mask: word 3 is 0x00ff, so ORing it into a converted
 * pixel forces the alpha component to 0xff before the pack to bytes.
 */
alpha:	.long	0x00000000
	.long	0x00ff0000
507
/**
 * MMX optimized version of the RGB565 to RGBA copy routine.
 *
 * Stack arguments (cdecl): source pointer, destination pointer,
 * number of pixels to copy.
 */

	.text
.globl _generic_read_RGBA_span_RGB565_MMX
	.hidden _generic_read_RGBA_span_RGB565_MMX
	.type _generic_read_RGBA_span_RGB565_MMX, @function

_generic_read_RGBA_span_RGB565_MMX:

#ifdef USE_INNER_EMMS
	emms
#endif

	movl	4(%esp), %eax		/* source pointer */
	movl	8(%esp), %edx		/* destination pointer */
	movl	12(%esp), %ecx		/* number of pixels to copy */

	movq	mask_565, %mm5		/* per-word R/G/B component masks */
	movq	prescale, %mm6		/* per-component pre-multipliers */
	movq	scale, %mm7		/* per-component scale factors */

	shrl	$2, %ecx		/* ecx = groups of 4; ZF feeds jne at .L02 */
	jmp	.L02

.L03:
	/* Fetch 4 RGB565 pixels into %mm4.  Distribute the first and
	 * second pixels into the four words of %mm0 and %mm2.
	 */

	movq	(%eax), %mm4
	addl	$8, %eax

	pshufw	$0x00, %mm4, %mm0
	pshufw	$0x55, %mm4, %mm2


	/* Mask the pixels so that each word of each register contains only
	 * one color component.
	 */

	pand	%mm5, %mm0
	pand	%mm5, %mm2


	/* Adjust the component values so that they are as small as possible,
	 * but large enough so that we can multiply them by an unsigned 16-bit
	 * number and get a value as large as 0x00ff0000.
	 */

	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif

	/* Scale the input component values to be on the range
	 * [0, 0x00ff0000].  This is the real magic of the whole routine.
	 */

	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2


	/* Always set the alpha value to 0xff.
	 */

	por	alpha, %mm0
	por	alpha, %mm2


	/* Pack the 16-bit values to 8-bit values and store the converted
	 * pixel data.
	 */

	packuswb	%mm2, %mm0
	movq	%mm0, (%edx)
	addl	$8, %edx


	/* Same conversion for the third and fourth pixels of the group. */

	pshufw	$0xaa, %mm4, %mm0
	pshufw	$0xff, %mm4, %mm2

	pand	%mm5, %mm0
	pand	%mm5, %mm2
	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif
	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2

	por	alpha, %mm0
	por	alpha, %mm2

	packuswb	%mm2, %mm0

	movq	%mm0, (%edx)
	addl	$8, %edx

	subl	$1, %ecx
.L02:
	jne	.L03


	/* At this point there can be at most 3 pixels left to process.  If
	 * there is either 2 or 3 left, process 2.
	 */

	movl	12(%esp), %ecx		/* reload the original pixel count */
	testl	$0x02, %ecx
	je	.L04

	movd	(%eax), %mm4		/* fetch 2 RGB565 pixels (4 bytes) */
	addl	$4, %eax

	pshufw	$0x00, %mm4, %mm0
	pshufw	$0x55, %mm4, %mm2

	pand	%mm5, %mm0
	pand	%mm5, %mm2
	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif
	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2

	por	alpha, %mm0
	por	alpha, %mm2

	packuswb	%mm2, %mm0

	movq	%mm0, (%edx)
	addl	$8, %edx

.L04:
	/* At this point there can be at most 1 pixel left to process.
	 * Process it if needed.
	 */

	testl	$0x01, %ecx
	je	.L01

	/* movzwl is the standard AT&T mnemonic (the old "movzxw" spelling
	 * is rejected by some assemblers).
	 */
	movzwl	(%eax), %ecx
	movd	%ecx, %mm4

	pshufw	$0x00, %mm4, %mm0

	pand	%mm5, %mm0
	pmullw	%mm6, %mm0
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
#endif
	pmulhuw	%mm7, %mm0

	por	alpha, %mm0

	packuswb	%mm0, %mm0

	movd	%mm0, (%edx)

.L01:
#ifdef USE_INNER_EMMS
	emms
#endif
	ret
682 #endif /* !defined(__DJGPP__) && !defined(__MINGW32__) */