Added MMX optimized version of the RGB565 ReadRGBASpan routine.
/*
 * (C) Copyright IBM Corporation 2004
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
 * IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

/**
 * \file read_rgba_span_x86.S
 * Optimized routines to transfer pixel data from the framebuffer to a
 * buffer in main memory.
 *
 * \author Ian Romanick <idr@us.ibm.com>
 */

	.file	"read_rgba_span_x86.S"
	.section	.rodata
	.align	16
	.type	mask, @object
	.size	mask, 32
mask:
	.long	0xff00ff00
	.long	0xff00ff00
	.long	0xff00ff00
	.long	0xff00ff00
	.long	0x00ff0000
	.long	0x00ff0000
	.long	0x00ff0000
	.long	0x00ff0000

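/* Constants for the BGRA -> RGBA swizzle.  Read as a little-endian dword,
 * a BGRA8888_REV pixel is 0xAARRGGBB.  The first half of the table
 * (0xff00ff00) keeps the alpha and green bytes in place; the second half
 * (0x00ff0000) selects the byte that has to move 16 bits when red and
 * blue trade places.
 */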

/* I implemented these as macros because they appear in quite a few places,
 * and I've tweaked them a number of times.  I got tired of changing every
 * place they appear. :)
 */

#define DO_ONE_PIXEL() \
	movl	(%ebx), %eax ; \
	addl	$4, %ebx ; \
	bswap	%eax          /* ARGB -> BGRA */ ; \
	rorl	$8, %eax      /* BGRA -> ABGR */ ; \
	movl	%eax, (%ecx)  /* ABGR -> R, G, B, A */ ; \
	addl	$4, %ecx

#define DO_ONE_LAST_PIXEL() \
	movl	(%ebx), %eax ; \
	bswap	%eax          /* ARGB -> BGRA */ ; \
	rorl	$8, %eax      /* BGRA -> ABGR */ ; \
	movl	%eax, (%ecx)  /* ABGR -> R, G, B, A */

/**
 * MMX optimized version of the BGRA8888_REV to RGBA copy routine.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
 */

	.text
.globl _generic_read_RGBA_span_BGRA8888_REV_MMX
	.type	_generic_read_RGBA_span_BGRA8888_REV_MMX, @function
_generic_read_RGBA_span_BGRA8888_REV_MMX:
	pushl	%ebx

#ifdef USE_INNER_EMMS
	emms
#endif
	movq	mask, %mm1
	movq	mask+16, %mm2

	movl	8(%esp), %ebx	/* source pointer */
	movl	16(%esp), %edx	/* number of pixels to copy */
	movl	12(%esp), %ecx	/* destination pointer */

	testl	%edx, %edx
	je	.L20		/* Bail if there's nothing to do. */

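	/* The loop below reads 8 bytes at a time, so if the source pointer
	 * is only 4-byte aligned, convert one pixel with the scalar macro
	 * first: (-src >> 2) & 1 is nonzero exactly when src % 8 == 4.
	 */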
	movl	%ebx, %eax

	negl	%eax
	sarl	$2, %eax
	andl	$1, %eax
	je	.L17

	subl	%eax, %edx
	DO_ONE_PIXEL()
.L17:

	/* Would it be faster to unroll this loop once and process 4 pixels
	 * per pass, instead of just two?
	 */

	movl	%edx, %eax
	shrl	%eax
	jmp	.L18
.L19:
	movq	(%ebx), %mm0
	addl	$8, %ebx

	/* These 9 instructions do what PSHUFB (if there were such an
	 * instruction) could do in 1. :(
	 */

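	/* An illustrative trace for one dword (one pixel) of %mm0 with the
	 * value 0xAARRGGBB:
	 *
	 *   mm3 = (pixel & 0x00ff0000) >> 16       -> 0x000000RR
	 *   mm4 = (pixel << 16) & 0x00ff0000       -> 0x00BB0000
	 *   mm0 = (pixel & 0xff00ff00) | mm3 | mm4 -> 0xAABBGGRR
	 *
	 * which is R, G, B, A in memory order.  The 64-bit shifts move bits
	 * between the two pixels, but the masks discard anything that
	 * crosses a dword boundary.
	 */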
	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3
	psllq	$16, %mm4
	psrlq	$16, %mm3
	pand	%mm2, %mm4

	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
	subl	$1, %eax
.L18:
	jne	.L19

#ifdef USE_INNER_EMMS
	emms
#endif

	/* At this point there is at most one pixel remaining to be
	 * converted.  Convert it if needed.
	 */

	testl	$1, %edx
	je	.L20

	DO_ONE_LAST_PIXEL()

.L20:
	popl	%ebx
	ret
	.size	_generic_read_RGBA_span_BGRA8888_REV_MMX, .-_generic_read_RGBA_span_BGRA8888_REV_MMX


/**
 * SSE optimized version of the BGRA8888_REV to RGBA copy routine.  SSE
 * instructions are only actually used to read data from the framebuffer.
 * In practice, the speed-up is pretty small.
 *
 * \todo
 * Do some more testing and determine if there's any reason to have this
 * function in addition to the MMX version.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
 */

.globl _generic_read_RGBA_span_BGRA8888_REV_SSE
	.type	_generic_read_RGBA_span_BGRA8888_REV_SSE, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE:
	pushl	%esi
	pushl	%ebx
	pushl	%ebp

#ifdef USE_INNER_EMMS
	emms
#endif
	movq	mask, %mm1
	movq	mask+16, %mm2

	movl	16(%esp), %ebx	/* source pointer */
	movl	24(%esp), %edx	/* number of pixels to copy */
	movl	20(%esp), %ecx	/* destination pointer */

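	/* Carve a 16-byte aligned scratch slot out of the stack.  MOVAPS
	 * below needs an aligned store address, and the slot is how the
	 * data makes the trip from an XMM register to a pair of MMX
	 * registers.
	 */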
	movl	%esp, %ebp
	subl	$16, %esp
	andl	$0xfffffff0, %esp

	movl	%ebx, %eax
	movl	%edx, %esi

	negl	%eax
	andl	$15, %eax
	sarl	$2, %eax
	cmpl	%edx, %eax
	cmovle	%eax, %esi

	subl	%esi, %edx

	testl	$1, %esi
	je	.L32

	DO_ONE_PIXEL()
.L32:

	testl	$2, %esi
	je	.L31

	movq	(%ebx), %mm0
	addl	$8, %ebx

	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3
	psllq	$16, %mm4
	psrlq	$16, %mm3
	pand	%mm2, %mm4

	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
.L31:

	movl	%edx, %eax
	shrl	$2, %eax
	jmp	.L33
.L34:
	movaps	(%ebx), %xmm0
	addl	$16, %ebx

	/* This would be so much better if we could just move directly from
	 * an SSE register to an MMX register.  Unfortunately, that
	 * functionality wasn't introduced until SSE2 with the MOVDQ2Q
	 * instruction.
	 */

	movaps	%xmm0, (%esp)
	movq	(%esp), %mm0
	movq	8(%esp), %mm5

	movq	%mm0, %mm3
	movq	%mm0, %mm4
	movq	%mm5, %mm6
	movq	%mm5, %mm7

	pand	%mm2, %mm3
	pand	%mm2, %mm6

	psllq	$16, %mm4
	psllq	$16, %mm7

	psrlq	$16, %mm3
	psrlq	$16, %mm6

	pand	%mm2, %mm4
	pand	%mm2, %mm7

	pand	%mm1, %mm0
	pand	%mm1, %mm5

	por	%mm4, %mm3
	por	%mm7, %mm6

	por	%mm3, %mm0
	por	%mm6, %mm5

	movq	%mm0, (%ecx)
	movq	%mm5, 8(%ecx)
	addl	$16, %ecx

	subl	$1, %eax
.L33:
	jne	.L34

#ifdef USE_INNER_EMMS
	emms
#endif
	movl	%ebp, %esp

	/* At this point there are between 0 and 3 pixels remaining to be
	 * converted.
	 */

	testl	$2, %edx
	je	.L36

	movq	(%ebx), %mm0
	addl	$8, %ebx

	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3
	psllq	$16, %mm4
	psrlq	$16, %mm3
	pand	%mm2, %mm4

	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
.L36:

	testl	$1, %edx
	je	.L35

	DO_ONE_LAST_PIXEL()
.L35:
	popl	%ebp
	popl	%ebx
	popl	%esi
	ret
	.size	_generic_read_RGBA_span_BGRA8888_REV_SSE, .-_generic_read_RGBA_span_BGRA8888_REV_SSE


/**
 * SSE2 optimized version of the BGRA8888_REV to RGBA copy routine.
 */

	.text
.globl _generic_read_RGBA_span_BGRA8888_REV_SSE2
	.type	_generic_read_RGBA_span_BGRA8888_REV_SSE2, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE2:
	pushl	%esi
	pushl	%ebx

	movdqa	mask, %xmm1
	movdqa	mask+16, %xmm2

	movl	12(%esp), %ebx	/* source pointer */
	movl	20(%esp), %edx	/* number of pixels to copy */
	movl	16(%esp), %ecx	/* destination pointer */

	movl	%ebx, %eax
	movl	%edx, %esi

	/* If the source pointer isn't a multiple of 16 we have to process
	 * a few pixels the "slow" way to get the address aligned for
	 * the SSE fetch instructions.
	 */

	negl	%eax
	andl	$15, %eax
	sarl	$2, %eax

	cmpl	%edx, %eax
	cmovbe	%eax, %esi
	subl	%esi, %edx

	testl	$1, %esi
	je	.L41

	DO_ONE_PIXEL()
.L41:
	testl	$2, %esi
	je	.L40

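	/* This is the same mask/shift/or swizzle as the MMX version, except
	 * that SSE2's byte-granular PSLLDQ/PSRLDQ shift by 2 bytes (16 bits)
	 * and the bitwise ANDPS/ORPS cover the full 128-bit register.
	 */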
	movq	(%ebx), %xmm0
	addl	$8, %ebx

	movdqa	%xmm0, %xmm3
	movdqa	%xmm0, %xmm4
	andps	%xmm1, %xmm0

	andps	%xmm2, %xmm3
	pslldq	$2, %xmm4
	psrldq	$2, %xmm3
	andps	%xmm2, %xmm4

	orps	%xmm4, %xmm3
	orps	%xmm3, %xmm0

	movq	%xmm0, (%ecx)
	addl	$8, %ecx
.L40:

	/* Would it be worth having a specialized version of this loop for
	 * the case where the destination is 16-byte aligned?  That version
	 * would be identical except that it could use movdqa instead of
	 * movdqu.
	 */

	movl	%edx, %eax
	shrl	$2, %eax
	jmp	.L42
.L43:
	movdqa	(%ebx), %xmm0
	addl	$16, %ebx

	movdqa	%xmm0, %xmm3
	movdqa	%xmm0, %xmm4
	andps	%xmm1, %xmm0

	andps	%xmm2, %xmm3
	pslldq	$2, %xmm4
	psrldq	$2, %xmm3
	andps	%xmm2, %xmm4

	orps	%xmm4, %xmm3
	orps	%xmm3, %xmm0

	movdqu	%xmm0, (%ecx)
	addl	$16, %ecx
	subl	$1, %eax
.L42:
	jne	.L43


	/* There may be up to 3 pixels remaining to be copied.  Take care
	 * of them now.  We do the 2 pixel case first because the data
	 * will be aligned.
	 */

	testl	$2, %edx
	je	.L47

	movq	(%ebx), %xmm0

	movdqa	%xmm0, %xmm3
	movdqa	%xmm0, %xmm4
	andps	%xmm1, %xmm0

	andps	%xmm2, %xmm3
	pslldq	$2, %xmm4
	psrldq	$2, %xmm3
	andps	%xmm2, %xmm4

	orps	%xmm4, %xmm3
	orps	%xmm3, %xmm0

	movq	%xmm0, (%ecx)
.L47:

	testl	$1, %edx
	je	.L46

	DO_ONE_LAST_PIXEL()
.L46:

	popl	%ebx
	popl	%esi
	ret
	.size	_generic_read_RGBA_span_BGRA8888_REV_SSE2, .-_generic_read_RGBA_span_BGRA8888_REV_SSE2



	.section	.rodata

	.align	16
mask_565:
	.word	0xf800
	.word	0x07e0
	.word	0x001f
	.word	0x0000

/* Setting SCALE_ADJUST to 5 gives a perfect match with the classic C
 * implementation in Mesa.  Setting SCALE_ADJUST to 0 is slightly faster
 * but comes at a small cost to accuracy.
 */

#define SCALE_ADJUST 5
#if SCALE_ADJUST == 5
prescale:
	.word	0x0001
	.word	0x0010
	.word	0x0200
	.word	0x0000

scale:
	.word	0x20e8		/* (0x00ff0000 / 0x000007c0) + 1 */
	.word	0x40c5		/* (0x00ff0000 / 0x000003f0) + 1 */
	.word	0x839d		/* (0x00ff0000 / 0x000001f0) + 1 */
	.word	0x0000
#elif SCALE_ADJUST == 0
prescale:
	.word	0x0001
	.word	0x0020
	.word	0x0800
	.word	0x0000

scale:
	.word	0x0108		/* (0x00ff0000 / 0x0000f800) + 1 */
	.word	0x0104		/* (0x00ff0000 / 0x0000fc00) + 1 */
	.word	0x0108		/* (0x00ff0000 / 0x0000f800) + 1 */
	.word	0x0000
#else
#error SCALE_ADJUST must be either 5 or 0.
#endif
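
/* A worked example for the red channel with SCALE_ADJUST == 5: the 5-bit
 * red value r is stored as r << 11.  Multiplying by the prescale word
 * (0x0001) and shifting right by 5 leaves r << 6, whose largest value is
 * 31 << 6 = 0x07c0.  PMULHUW then computes ((r << 6) * 0x20e8) >> 16,
 * which equals (r * 255) / 31 for every r in [0, 31]; e.g. r = 31 gives
 * (0x07c0 * 0x20e8) >> 16 = 0xff.  Roughly, in C:
 *
 *     dst = (src * ((0x00ff0000 / src_max) + 1)) >> 16;
 */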


alpha:	.long	0x00000000
	.long	0x00ff0000
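
/* alpha is OR-ed into the four 16-bit components before packing; it
 * forces the unused fourth word to 0x00ff so that PACKUSWB produces an
 * alpha byte of 0xff.
 */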

/**
 * MMX optimized version of the RGB565 to RGBA copy routine.
 */

	.text
.globl _generic_read_RGBA_span_RGB565_MMX
	.type	_generic_read_RGBA_span_RGB565_MMX, @function

_generic_read_RGBA_span_RGB565_MMX:

#ifdef USE_INNER_EMMS
	emms
#endif

	movl	4(%esp), %eax	/* source pointer */
	movl	8(%esp), %edx	/* destination pointer */
	movl	12(%esp), %ecx	/* number of pixels to copy */

	movq	mask_565, %mm5
	movq	prescale, %mm6
	movq	scale, %mm7

	shrl	$2, %ecx
	jmp	.L02

.L03:
	/* Fetch 4 RGB565 pixels into %mm4.  Broadcast the first pixel to
	 * the four words of %mm0 and the second to %mm2.
	 */

	movq	(%eax), %mm4
	addl	$8, %eax

	pshufw	$0x00, %mm4, %mm0
	pshufw	$0x55, %mm4, %mm2


	/* Mask the pixels so that each word of each register contains only
	 * one color component.
	 */

	pand	%mm5, %mm0
	pand	%mm5, %mm2


	/* Adjust the component values so that they are as small as possible,
	 * but large enough so that we can multiply them by an unsigned 16-bit
	 * number and get a value as large as 0x00ff0000.
	 */

	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif

	/* Scale the input component values to be on the range
	 * [0, 0x00ff0000].  This is the real magic of the whole routine.
	 */

	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2


	/* Always set the alpha value to 0xff.
	 */

	por	alpha, %mm0
	por	alpha, %mm2


	/* Pack the 16-bit values to 8-bit values and store the converted
	 * pixel data.
	 */

	packuswb	%mm2, %mm0
	movq	%mm0, (%edx)
	addl	$8, %edx


	pshufw	$0xaa, %mm4, %mm0
	pshufw	$0xff, %mm4, %mm2

	pand	%mm5, %mm0
	pand	%mm5, %mm2
	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif
	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2

	por	alpha, %mm0
	por	alpha, %mm2

	packuswb	%mm2, %mm0

	movq	%mm0, (%edx)
	addl	$8, %edx

	subl	$1, %ecx
.L02:
	jne	.L03


	/* At this point there can be at most 3 pixels left to process.  If
	 * 2 or 3 are left, process 2 of them now.
	 */

	movl	12(%esp), %ecx
	testl	$0x02, %ecx
	je	.L04

	movd	(%eax), %mm4
	addl	$4, %eax

	pshufw	$0x00, %mm4, %mm0
	pshufw	$0x55, %mm4, %mm2

	pand	%mm5, %mm0
	pand	%mm5, %mm2
	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif
	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2

	por	alpha, %mm0
	por	alpha, %mm2

	packuswb	%mm2, %mm0

	movq	%mm0, (%edx)
	addl	$8, %edx

.L04:
	/* At this point there can be at most 1 pixel left to process.
	 * Process it if needed.
	 */

	testl	$0x01, %ecx
	je	.L01

	movzwl	(%eax), %ecx
	movd	%ecx, %mm4

	pshufw	$0x00, %mm4, %mm0

	pand	%mm5, %mm0
	pmullw	%mm6, %mm0
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
#endif
	pmulhuw	%mm7, %mm0

	por	alpha, %mm0

	packuswb	%mm0, %mm0

	movd	%mm0, (%edx)

.L01:
#ifdef USE_INNER_EMMS
	emms
#endif
	ret