/*
 * (C) Copyright IBM Corporation 2004
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
 * IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

/**
 * \file read_rgba_span_x86.S
 * Optimized routines to transfer pixel data from the framebuffer to a
 * buffer in main memory.
 *
 * \author Ian Romanick <idr@us.ibm.com>
 */

.file "read_rgba_span_x86.S"
#if !defined(__DJGPP__) && !defined(__MINGW32__) /* this one cries for assyntax.h */
/* Kevin F. Quinn 2nd July 2006
 * Replaced data segment constants with text-segment instructions.
 */
#define LOAD_MASK(mvins,m1,m2) \
        pushl $0xff00ff00 ;\
        pushl $0xff00ff00 ;\
        pushl $0xff00ff00 ;\
        pushl $0xff00ff00 ;\
        mvins (%esp), m1 ;\
        pushl $0x00ff0000 ;\
        pushl $0x00ff0000 ;\
        pushl $0x00ff0000 ;\
        pushl $0x00ff0000 ;\
        mvins (%esp), m2 ;\
        addl $32, %esp
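
/* A note on the masks this leaves behind, as used by the swap code below:
 * m1 holds 0xff00ff00 in every dword, which keeps the alpha and green
 * bytes of a BGRA8888_REV pixel in place, while m2 holds 0x00ff0000 in
 * every dword, which isolates the byte that has to trade places with its
 * neighbor two bytes away when red and blue are swapped.
 */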

/* I implemented these as macros because they appear in several places,
 * and I've tweaked them a number of times.  I got tired of changing every
 * place they appear. :)
 */

#define DO_ONE_PIXEL() \
        movl (%ebx), %eax ; \
        addl $4, %ebx ; \
        bswap %eax          /* ARGB -> BGRA */ ; \
        rorl $8, %eax       /* BGRA -> ABGR */ ; \
        movl %eax, (%ecx)   /* ABGR -> R, G, B, A */ ; \
        addl $4, %ecx
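
/* A concrete trace of the swap, assuming %eax holds the pixel 0xAARRGGBB
 * (alpha in the high byte):
 *
 *     0xAARRGGBB  --bswap-->  0xBBGGRRAA  --rorl $8-->  0xAABBGGRR
 *
 * Stored little-endian, 0xAABBGGRR lands in memory as the byte sequence
 * R, G, B, A -- exactly the RGBA layout the caller wants.
 */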

#define DO_ONE_LAST_PIXEL() \
        movl (%ebx), %eax ; \
        bswap %eax          /* ARGB -> BGRA */ ; \
        rorl $8, %eax       /* BGRA -> ABGR */ ; \
        movl %eax, (%ecx)   /* ABGR -> R, G, B, A */ ; \


/**
 * MMX optimized version of the BGRA8888_REV to RGBA copy routine.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
 */
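
/* The stack offsets below are consistent with a cdecl prototype along the
 * lines of the one declared in read_rgba_span_x86.h:
 *
 *    void _generic_read_RGBA_span_BGRA8888_REV_MMX( const unsigned char *src,
 *                                                   unsigned char *dest,
 *                                                   unsigned count );
 *
 * With %ebx saved, the arguments sit at 8, 12, and 16(%esp).
 */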

.globl _generic_read_RGBA_span_BGRA8888_REV_MMX
.hidden _generic_read_RGBA_span_BGRA8888_REV_MMX
.type _generic_read_RGBA_span_BGRA8888_REV_MMX, @function
_generic_read_RGBA_span_BGRA8888_REV_MMX:
        pushl %ebx

#ifdef USE_INNER_EMMS
        emms
#endif
        LOAD_MASK(movq,%mm1,%mm2)

        movl 8(%esp), %ebx      /* source pointer */
        movl 16(%esp), %edx     /* number of pixels to copy */
        movl 12(%esp), %ecx     /* destination pointer */

        testl %edx, %edx
        jle .L20                /* Bail if there's nothing to do. */

        movl %ebx, %eax

        negl %eax
        sarl $2, %eax
        andl $1, %eax
        je .L17

        subl %eax, %edx
        DO_ONE_PIXEL()
.L17:

/* Would it be faster to unroll this loop once and process 4 pixels
 * per pass, instead of just two?
 */

        movl %edx, %eax
        shrl %eax
        jmp .L18
.L19:
        movq (%ebx), %mm0
        addl $8, %ebx

/* These 9 instructions do what PSHUFB (if there were such an
 * instruction) could do in 1. :(
 */
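
/* The dataflow, written per 32-bit pixel (A R G B from high byte to low):
 *
 *    %mm3 = (pix & 0x00ff0000) >> 16    red moved down to byte 0
 *    %mm4 = (pix << 16) & 0x00ff0000    blue moved up to byte 2
 *    %mm0 =  pix & 0xff00ff00           alpha and green stay put
 *    %mm0 | %mm3 | %mm4 = 0xAABBGGRR    i.e. R, G, B, A in memory order
 *
 * The 64-bit shifts leak bytes between the two packed pixels, but the
 * masks throw those bytes away.
 */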

        movq %mm0, %mm3
        movq %mm0, %mm4

        pand %mm2, %mm3
        psllq $16, %mm4
        psrlq $16, %mm3
        pand %mm2, %mm4

        pand %mm1, %mm0
        por %mm4, %mm3
        por %mm3, %mm0

        movq %mm0, (%ecx)
        addl $8, %ecx
        subl $1, %eax
.L18:
        jne .L19

#ifdef USE_INNER_EMMS
        emms
#endif

/* At this point there are either 1 or 0 pixels remaining to be
 * converted.  Convert the last pixel, if needed.
 */

        testl $1, %edx
        je .L20

        DO_ONE_LAST_PIXEL()

.L20:
        popl %ebx
        ret
.size _generic_read_RGBA_span_BGRA8888_REV_MMX, .-_generic_read_RGBA_span_BGRA8888_REV_MMX


/**
 * SSE optimized version of the BGRA8888_REV to RGBA copy routine.  SSE
 * instructions are only actually used to read data from the framebuffer.
 * In practice, the speed-up is pretty small.
 *
 * \todo
 * Do some more testing and determine if there's any reason to have this
 * function in addition to the MMX version.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
 */

.globl _generic_read_RGBA_span_BGRA8888_REV_SSE
.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE
.type _generic_read_RGBA_span_BGRA8888_REV_SSE, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE:
        pushl %esi
        pushl %ebx
        pushl %ebp

#ifdef USE_INNER_EMMS
        emms
#endif

        LOAD_MASK(movq,%mm1,%mm2)

        movl 16(%esp), %ebx     /* source pointer */
        movl 24(%esp), %edx     /* number of pixels to copy */
        movl 20(%esp), %ecx     /* destination pointer */

        testl %edx, %edx
        jle .L35                /* Bail if there's nothing to do. */

        movl %esp, %ebp
        subl $16, %esp
        andl $0xfffffff0, %esp
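
/* %ebp now holds the original stack pointer; the two instructions above
 * carve out a 16-byte scratch area and round %esp down to a 16-byte
 * boundary so the movaps spill in the main loop below cannot fault.
 */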

        movl %ebx, %eax
        movl %edx, %esi

        negl %eax
        andl $15, %eax
        sarl $2, %eax
        cmpl %edx, %eax
        cmovle %eax, %esi

        subl %esi, %edx
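
/* %esi now holds min(count, ((-src) & 15) >> 2): the number of pixels
 * that must be handled one or two at a time before the source pointer
 * reaches a 16-byte boundary for the movaps loads below.  %edx keeps
 * the pixels left for the aligned loop and the tail.
 */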

        testl $1, %esi
        je .L32

        DO_ONE_PIXEL()
.L32:

        testl $2, %esi
        je .L31

        movq (%ebx), %mm0
        addl $8, %ebx

        movq %mm0, %mm3
        movq %mm0, %mm4

        pand %mm2, %mm3
        psllq $16, %mm4
        psrlq $16, %mm3
        pand %mm2, %mm4

        pand %mm1, %mm0
        por %mm4, %mm3
        por %mm3, %mm0

        movq %mm0, (%ecx)
        addl $8, %ecx
.L31:

        movl %edx, %eax
        shrl $2, %eax
        jmp .L33
.L34:
        movaps (%ebx), %xmm0
        addl $16, %ebx

/* This would be so much better if we could just move directly from
 * an SSE register to an MMX register.  Unfortunately, that
 * functionality wasn't introduced until SSE2 with the MOVDQ2Q
 * instruction.
 */

        movaps %xmm0, (%esp)
        movq (%esp), %mm0
        movq 8(%esp), %mm5

        movq %mm0, %mm3
        movq %mm0, %mm4
        movq %mm5, %mm6
        movq %mm5, %mm7

        pand %mm2, %mm3
        pand %mm2, %mm6

        psllq $16, %mm4
        psllq $16, %mm7

        psrlq $16, %mm3
        psrlq $16, %mm6

        pand %mm2, %mm4
        pand %mm2, %mm7

        pand %mm1, %mm0
        pand %mm1, %mm5

        por %mm4, %mm3
        por %mm7, %mm6

        por %mm3, %mm0
        por %mm6, %mm5

        movq %mm0, (%ecx)
        movq %mm5, 8(%ecx)
        addl $16, %ecx

        subl $1, %eax
.L33:
        jne .L34

#ifdef USE_INNER_EMMS
        emms
#endif
        movl %ebp, %esp

/* At this point there are either [0, 3] pixels remaining to be
 * converted.
 */

        testl $2, %edx
        je .L36

        movq (%ebx), %mm0
        addl $8, %ebx

        movq %mm0, %mm3
        movq %mm0, %mm4

        pand %mm2, %mm3
        psllq $16, %mm4
        psrlq $16, %mm3
        pand %mm2, %mm4

        pand %mm1, %mm0
        por %mm4, %mm3
        por %mm3, %mm0

        movq %mm0, (%ecx)
        addl $8, %ecx
.L36:

        testl $1, %edx
        je .L35

        DO_ONE_LAST_PIXEL()
.L35:
        popl %ebp
        popl %ebx
        popl %esi
        ret
.size _generic_read_RGBA_span_BGRA8888_REV_SSE, .-_generic_read_RGBA_span_BGRA8888_REV_SSE


/**
 * SSE2 optimized version of the BGRA8888_REV to RGBA copy routine.
 */

.text
.globl _generic_read_RGBA_span_BGRA8888_REV_SSE2
.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE2
.type _generic_read_RGBA_span_BGRA8888_REV_SSE2, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE2:
        pushl %esi
        pushl %ebx

        LOAD_MASK(movdqu,%xmm1,%xmm2)

        movl 12(%esp), %ebx     /* source pointer */
        movl 20(%esp), %edx     /* number of pixels to copy */
        movl 16(%esp), %ecx     /* destination pointer */

        movl %ebx, %eax
        movl %edx, %esi

        testl %edx, %edx
        jle .L46                /* Bail if there's nothing to do. */
/* If the source pointer isn't a multiple of 16 we have to process
 * a few pixels the "slow" way to get the address aligned for
 * the SSE fetch instructions.
 */
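
/* Concretely: %eax = ((-src) & 15) >> 2 is the number of 4-byte pixels
 * between src and the next 16-byte boundary (0 to 3), and the cmovbe
 * below clamps it to the pixel count so short spans are safe.
 */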

        negl %eax
        andl $15, %eax
        sarl $2, %eax

        cmpl %edx, %eax
        cmovbe %eax, %esi
        subl %esi, %edx

        testl $1, %esi
        je .L41

        DO_ONE_PIXEL()
.L41:
        testl $2, %esi
        je .L40

        movq (%ebx), %xmm0
        addl $8, %ebx

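/* Same red/blue swap as the MMX path, but pslldq and psrldq take a byte
 * count, so the 16-bit shifts become $2 here.  The andps/orps forms are
 * used purely as bitwise operations on integer data.
 */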
        movdqa %xmm0, %xmm3
        movdqa %xmm0, %xmm4
        andps %xmm1, %xmm0

        andps %xmm2, %xmm3
        pslldq $2, %xmm4
        psrldq $2, %xmm3
        andps %xmm2, %xmm4

        orps %xmm4, %xmm3
        orps %xmm3, %xmm0

        movq %xmm0, (%ecx)
        addl $8, %ecx
.L40:

/* Would it be worth having a specialized version of this loop for
 * the case where the destination is 16-byte aligned?  That version
 * would be identical except that it could use movdqa instead of
 * movdqu.
 */

        movl %edx, %eax
        shrl $2, %eax
        jmp .L42
.L43:
        movdqa (%ebx), %xmm0
        addl $16, %ebx

        movdqa %xmm0, %xmm3
        movdqa %xmm0, %xmm4
        andps %xmm1, %xmm0

        andps %xmm2, %xmm3
        pslldq $2, %xmm4
        psrldq $2, %xmm3
        andps %xmm2, %xmm4

        orps %xmm4, %xmm3
        orps %xmm3, %xmm0

        movdqu %xmm0, (%ecx)
        addl $16, %ecx
        subl $1, %eax
.L42:
        jne .L43


/* There may be up to 3 pixels remaining to be copied.  Take care
 * of them now.  We do the 2 pixel case first because the data
 * will be aligned.
 */

        testl $2, %edx
        je .L47

        movq (%ebx), %xmm0
        addl $8, %ebx

        movdqa %xmm0, %xmm3
        movdqa %xmm0, %xmm4
        andps %xmm1, %xmm0

        andps %xmm2, %xmm3
        pslldq $2, %xmm4
        psrldq $2, %xmm3
        andps %xmm2, %xmm4

        orps %xmm4, %xmm3
        orps %xmm3, %xmm0

        movq %xmm0, (%ecx)
        addl $8, %ecx
.L47:

        testl $1, %edx
        je .L46

        DO_ONE_LAST_PIXEL()
.L46:

        popl %ebx
        popl %esi
        ret
.size _generic_read_RGBA_span_BGRA8888_REV_SSE2, .-_generic_read_RGBA_span_BGRA8888_REV_SSE2



#define MASK_565_L      0x07e0f800
#define MASK_565_H      0x0000001f
/* Setting SCALE_ADJUST to 5 gives a perfect match with the
 * classic C implementation in Mesa.  Setting SCALE_ADJUST
 * to 0 is slightly faster but at a small cost to accuracy.
 */
#define SCALE_ADJUST    5
#if SCALE_ADJUST == 5
#define PRESCALE_L      0x00100001
#define PRESCALE_H      0x00000200
#define SCALE_L         0x40C620E8
#define SCALE_H         0x0000839d
#elif SCALE_ADJUST == 0
#define PRESCALE_L      0x00200001
#define PRESCALE_H      0x00000800
#define SCALE_L         0x01040108
#define SCALE_H         0x00000108
#else
#error SCALE_ADJUST must either be 5 or 0.
#endif
#define ALPHA_L         0x00000000
#define ALPHA_H         0x00ff0000
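
/* A sketch of how the constants work for SCALE_ADJUST == 5, using red:
 * the mask leaves r << 11 in its word; pmullw by the red PRESCALE word
 * (0x0001) keeps it there, psrlw $5 makes it r << 6, and pmulhuw by the
 * red SCALE word (0x20E8) gives floor((r << 6) * 0x20E8 / 65536), which
 * agrees with the classic truncating expansion c8 = (c5 * 255) / 31 for
 * every r in [0, 31].  The 6-bit green channel and the blue channel work
 * the same way with their own prescale/scale words.
 */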

/**
 * MMX optimized version of the RGB565 to RGBA copy routine.
 */
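
/* Nothing is pushed before the argument loads below, so the cdecl stack
 * layout is simply 4(%esp) = src, 8(%esp) = dest, 12(%esp) = count,
 * presumably matching a prototype like the one in read_rgba_span_x86.h:
 *
 *    void _generic_read_RGBA_span_RGB565_MMX( const unsigned char *src,
 *                                             unsigned char *dest,
 *                                             unsigned count );
 */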

.text
.globl _generic_read_RGBA_span_RGB565_MMX
.hidden _generic_read_RGBA_span_RGB565_MMX
.type _generic_read_RGBA_span_RGB565_MMX, @function

_generic_read_RGBA_span_RGB565_MMX:

#ifdef USE_INNER_EMMS
        emms
#endif

        movl 4(%esp), %eax      /* source pointer */
        movl 8(%esp), %edx      /* destination pointer */
        movl 12(%esp), %ecx     /* number of pixels to copy */

        pushl $MASK_565_H
        pushl $MASK_565_L
        movq (%esp), %mm5
        pushl $PRESCALE_H
        pushl $PRESCALE_L
        movq (%esp), %mm6
        pushl $SCALE_H
        pushl $SCALE_L
        movq (%esp), %mm7
        pushl $ALPHA_H
        pushl $ALPHA_L
        movq (%esp), %mm3
        addl $32,%esp

        sarl $2, %ecx
        jl .L01         /* Bail if the count is negative.  For counts of
                         * 1-3 the shift leaves ZF set, so the jne below
                         * falls through to the tail code. */
        jmp .L02

.L03:
/* Fetch 4 RGB565 pixels into %mm4.  Distribute the first and
 * second pixels into the four words of %mm0 and %mm2.
 */

        movq (%eax), %mm4
        addl $8, %eax

        pshufw $0x00, %mm4, %mm0
        pshufw $0x55, %mm4, %mm2


/* Mask the pixels so that each word of each register contains only
 * one color component.
 */

        pand %mm5, %mm0
        pand %mm5, %mm2


/* Adjust the component values so that they are as small as possible,
 * but large enough so that we can multiply them by an unsigned 16-bit
 * number and get a value as large as 0x00ff0000.
 */

        pmullw %mm6, %mm0
        pmullw %mm6, %mm2
#if SCALE_ADJUST > 0
        psrlw $SCALE_ADJUST, %mm0
        psrlw $SCALE_ADJUST, %mm2
#endif

/* Scale the input component values so that the 32-bit products lie on
 * the range [0, 0x00ff0000]; pmulhuw then keeps the high 16 bits.  This
 * is the real magic of the whole routine.
 */
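
/* Worked numbers for the red channel at SCALE_ADJUST == 5: for r == 31,
 * the word holds 31 << 6 = 1984, and 1984 * 0x20E8 = 16713216; pmulhuw
 * keeps the high 16 bits, 16713216 >> 16 = 255.  Each component word
 * thus ends up in [0, 0xff], ready for the packuswb below.
 */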

        pmulhuw %mm7, %mm0
        pmulhuw %mm7, %mm2


/* Always set the alpha value to 0xff.
 */

        por %mm3, %mm0
        por %mm3, %mm2


/* Pack the 16-bit values to 8-bit values and store the converted
 * pixel data.
 */

        packuswb %mm2, %mm0
        movq %mm0, (%edx)
        addl $8, %edx

        pshufw $0xaa, %mm4, %mm0
        pshufw $0xff, %mm4, %mm2

        pand %mm5, %mm0
        pand %mm5, %mm2
        pmullw %mm6, %mm0
        pmullw %mm6, %mm2
#if SCALE_ADJUST > 0
        psrlw $SCALE_ADJUST, %mm0
        psrlw $SCALE_ADJUST, %mm2
#endif
        pmulhuw %mm7, %mm0
        pmulhuw %mm7, %mm2

        por %mm3, %mm0
        por %mm3, %mm2

        packuswb %mm2, %mm0

        movq %mm0, (%edx)
        addl $8, %edx

        subl $1, %ecx
.L02:
        jne .L03


/* At this point there can be at most 3 pixels left to process.  If
 * there is either 2 or 3 left, process 2.
 */

        movl 12(%esp), %ecx
        testl $0x02, %ecx
        je .L04

        movd (%eax), %mm4
        addl $4, %eax

        pshufw $0x00, %mm4, %mm0
        pshufw $0x55, %mm4, %mm2

        pand %mm5, %mm0
        pand %mm5, %mm2
        pmullw %mm6, %mm0
        pmullw %mm6, %mm2
#if SCALE_ADJUST > 0
        psrlw $SCALE_ADJUST, %mm0
        psrlw $SCALE_ADJUST, %mm2
#endif
        pmulhuw %mm7, %mm0
        pmulhuw %mm7, %mm2

        por %mm3, %mm0
        por %mm3, %mm2

        packuswb %mm2, %mm0

        movq %mm0, (%edx)
        addl $8, %edx

.L04:
/* At this point there can be at most 1 pixel left to process.
 * Process it if needed.
 */

        testl $0x01, %ecx
        je .L01

        movzwl (%eax), %ecx
        movd %ecx, %mm4

        pshufw $0x00, %mm4, %mm0

        pand %mm5, %mm0
        pmullw %mm6, %mm0
#if SCALE_ADJUST > 0
        psrlw $SCALE_ADJUST, %mm0
#endif
        pmulhuw %mm7, %mm0

        por %mm3, %mm0

        packuswb %mm0, %mm0

        movd %mm0, (%edx)

.L01:
#ifdef USE_INNER_EMMS
        emms
#endif
        ret
#endif /* !defined(__DJGPP__) && !defined(__MINGW32__) */

#if defined (__ELF__) && defined (__linux__)
.section .note.GNU-stack,"",%progbits
#endif