/*
 * (C) Copyright IBM Corporation 2004
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
 * IBM AND/OR THEIR SUPPLIERS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

/**
 * \file read_rgba_span_x86.S
 * Optimized routines to transfer pixel data from the framebuffer to a
 * buffer in main memory.
 *
 * \author Ian Romanick <idr@us.ibm.com>
 */

.file "read_rgba_span_x86.S"
#if !defined(__DJGPP__) && !defined(__MINGW32__) && !defined(__APPLE__) /* this one cries for assyntax.h */
/* Kevin F. Quinn 2nd July 2006
 * Replaced data segment constants with text-segment instructions.
 */
#define LOAD_MASK(mvins,m1,m2) \
	pushl	$0xff00ff00 ;\
	pushl	$0xff00ff00 ;\
	pushl	$0xff00ff00 ;\
	pushl	$0xff00ff00 ;\
	mvins	(%esp), m1 ;\
	pushl	$0x00ff0000 ;\
	pushl	$0x00ff0000 ;\
	pushl	$0x00ff0000 ;\
	pushl	$0x00ff0000 ;\
	mvins	(%esp), m2 ;\
	addl	$32, %esp

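/* LOAD_MASK materializes two constants on the stack and loads them with
 * the supplied move instruction.  A typical use (a sketch of the expansion):
 *
 *     LOAD_MASK(movq,%mm1,%mm2)    // %mm1 = 0xff00ff00ff00ff00
 *                                  // %mm2 = 0x00ff000000ff0000
 */
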
/* I implemented these as macros because they appear in several places,
 * and I've tweaked them a number of times.  I got tired of changing every
 * place they appear. :)
 */

#define DO_ONE_PIXEL() \
	movl	(%ebx), %eax ; \
	addl	$4, %ebx ; \
	bswap	%eax /* ARGB -> BGRA */ ; \
	rorl	$8, %eax /* BGRA -> ABGR */ ; \
	movl	%eax, (%ecx) /* ABGR -> R, G, B, A */ ; \
	addl	$4, %ecx

#define DO_ONE_LAST_PIXEL() \
	movl	(%ebx), %eax ; \
	bswap	%eax /* ARGB -> BGRA */ ; \
	rorl	$8, %eax /* BGRA -> ABGR */ ; \
	movl	%eax, (%ecx) /* ABGR -> R, G, B, A */


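/* A rough C equivalent of the per-pixel swizzle above (a sketch, assuming
 * a little-endian host; src and dst are hypothetical uint32_t pointers):
 *
 *     uint32_t argb = *src++;                      // register value 0xAARRGGBB
 *     uint32_t bgra = __builtin_bswap32(argb);     // 0xBBGGRRAA
 *     uint32_t abgr = (bgra >> 8) | (bgra << 24);  // rotate right 8 -> 0xAABBGGRR
 *     *dst++ = abgr;                               // stored bytes: R, G, B, A
 */
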
/**
 * MMX optimized version of the BGRA8888_REV to RGBA copy routine.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
 */

.globl _generic_read_RGBA_span_BGRA8888_REV_MMX
#ifndef USE_DRICORE
.hidden _generic_read_RGBA_span_BGRA8888_REV_MMX
#endif
.type _generic_read_RGBA_span_BGRA8888_REV_MMX, @function
_generic_read_RGBA_span_BGRA8888_REV_MMX:
	pushl	%ebx

#ifdef USE_INNER_EMMS
	emms
#endif
	LOAD_MASK(movq,%mm1,%mm2)

	movl	8(%esp), %ebx	/* source pointer */
	movl	16(%esp), %edx	/* number of pixels to copy */
	movl	12(%esp), %ecx	/* destination pointer */

	testl	%edx, %edx
	jle	.L20		/* Bail if there's nothing to do. */

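	/* If the source pointer is only 4-byte aligned, convert one leading
	 * pixel the slow way so that the movq loads in the main loop are
	 * 8-byte aligned.
	 */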
	movl	%ebx, %eax

	negl	%eax
	sarl	$2, %eax
	andl	$1, %eax
	je	.L17

	subl	%eax, %edx
	DO_ONE_PIXEL()
.L17:

	/* Would it be faster to unroll this loop once and process 4 pixels
	 * per pass, instead of just two?
	 */

	movl	%edx, %eax
	shrl	%eax
	jmp	.L18
.L19:
	movq	(%ebx), %mm0
	addl	$8, %ebx

	/* These 9 instructions do what PSHUFB (if there were such an
	 * instruction) could do in 1. :(
	 */
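
	/* Per 32-bit pixel, the mask-and-shift sequence computes, in C terms
	 * (a sketch):
	 *
	 *     rgba = (argb & 0xff00ff00)           // A and G stay in place
	 *          | ((argb & 0x00ff0000) >> 16)   // R moves down to byte 0
	 *          | ((argb << 16) & 0x00ff0000);  // B moves up to byte 2
	 */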

	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3
	psllq	$16, %mm4
	psrlq	$16, %mm3
	pand	%mm2, %mm4

	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
	subl	$1, %eax
.L18:
	jne	.L19

#ifdef USE_INNER_EMMS
	emms
#endif

	/* At this point there are either 1 or 0 pixels remaining to be
	 * converted.  Convert the last pixel, if needed.
	 */

	testl	$1, %edx
	je	.L20

	DO_ONE_LAST_PIXEL()

.L20:
	popl	%ebx
	ret
	.size _generic_read_RGBA_span_BGRA8888_REV_MMX, .-_generic_read_RGBA_span_BGRA8888_REV_MMX


/**
 * SSE optimized version of the BGRA8888_REV to RGBA copy routine.  SSE
 * instructions are only actually used to read data from the framebuffer.
 * In practice, the speed-up is pretty small.
 *
 * \todo
 * Do some more testing and determine if there's any reason to have this
 * function in addition to the MMX version.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
 */

.globl _generic_read_RGBA_span_BGRA8888_REV_SSE
#ifndef USE_DRICORE
.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE
#endif
.type _generic_read_RGBA_span_BGRA8888_REV_SSE, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE:
	pushl	%esi
	pushl	%ebx
	pushl	%ebp

#ifdef USE_INNER_EMMS
	emms
#endif

	LOAD_MASK(movq,%mm1,%mm2)

	movl	16(%esp), %ebx	/* source pointer */
	movl	24(%esp), %edx	/* number of pixels to copy */
	movl	20(%esp), %ecx	/* destination pointer */

	testl	%edx, %edx
	jle	.L35		/* Bail if there's nothing to do. */

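	/* Carve out a 16-byte-aligned scratch area on the stack; the movaps
	 * store in the main loop below requires 16-byte alignment.  %ebp
	 * preserves the original stack pointer for the epilogue.
	 */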
	movl	%esp, %ebp
	subl	$16, %esp
	andl	$0xfffffff0, %esp

	movl	%ebx, %eax
	movl	%edx, %esi

	negl	%eax
	andl	$15, %eax
	sarl	$2, %eax
	cmpl	%edx, %eax
	cmovle	%eax, %esi

	subl	%esi, %edx

	testl	$1, %esi
	je	.L32

	DO_ONE_PIXEL()
.L32:

	testl	$2, %esi
	je	.L31

	movq	(%ebx), %mm0
	addl	$8, %ebx

	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3
	psllq	$16, %mm4
	psrlq	$16, %mm3
	pand	%mm2, %mm4

	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
.L31:

	movl	%edx, %eax
	shrl	$2, %eax
	jmp	.L33
.L34:
	movaps	(%ebx), %xmm0
	addl	$16, %ebx

	/* This would be so much better if we could just move directly from
	 * an SSE register to an MMX register.  Unfortunately, that
	 * functionality wasn't introduced until SSE2 with the MOVDQ2Q
	 * instruction.
	 */

	movaps	%xmm0, (%esp)
	movq	(%esp), %mm0
	movq	8(%esp), %mm5

	movq	%mm0, %mm3
	movq	%mm0, %mm4
	movq	%mm5, %mm6
	movq	%mm5, %mm7

	pand	%mm2, %mm3
	pand	%mm2, %mm6

	psllq	$16, %mm4
	psllq	$16, %mm7

	psrlq	$16, %mm3
	psrlq	$16, %mm6

	pand	%mm2, %mm4
	pand	%mm2, %mm7

	pand	%mm1, %mm0
	pand	%mm1, %mm5

	por	%mm4, %mm3
	por	%mm7, %mm6

	por	%mm3, %mm0
	por	%mm6, %mm5

	movq	%mm0, (%ecx)
	movq	%mm5, 8(%ecx)
	addl	$16, %ecx

	subl	$1, %eax
.L33:
	jne	.L34

#ifdef USE_INNER_EMMS
	emms
#endif
	movl	%ebp, %esp

	/* At this point there are either [0, 3] pixels remaining to be
	 * converted.
	 */

	testl	$2, %edx
	je	.L36

	movq	(%ebx), %mm0
	addl	$8, %ebx

	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3
	psllq	$16, %mm4
	psrlq	$16, %mm3
	pand	%mm2, %mm4

	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
.L36:

	testl	$1, %edx
	je	.L35

	DO_ONE_LAST_PIXEL()
.L35:
	popl	%ebp
	popl	%ebx
	popl	%esi
	ret
	.size _generic_read_RGBA_span_BGRA8888_REV_SSE, .-_generic_read_RGBA_span_BGRA8888_REV_SSE


/**
 * SSE2 optimized version of the BGRA8888_REV to RGBA copy routine.
 */

.text
.globl _generic_read_RGBA_span_BGRA8888_REV_SSE2
#ifndef USE_DRICORE
.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE2
#endif
.type _generic_read_RGBA_span_BGRA8888_REV_SSE2, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE2:
	pushl	%esi
	pushl	%ebx

	LOAD_MASK(movdqu,%xmm1,%xmm2)

	movl	12(%esp), %ebx	/* source pointer */
	movl	20(%esp), %edx	/* number of pixels to copy */
	movl	16(%esp), %ecx	/* destination pointer */

	movl	%ebx, %eax
	movl	%edx, %esi

	testl	%edx, %edx
	jle	.L46		/* Bail if there's nothing to do. */

	/* If the source pointer isn't a multiple of 16 we have to process
	 * a few pixels the "slow" way to get the address aligned for
	 * the SSE fetch instructions.
	 */
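
	/* In C terms the leading-pixel count is roughly (a sketch, where src
	 * is the source pointer and n the remaining pixel count):
	 *
	 *     lead = ((0 - (uintptr_t) src) & 15) >> 2;  // dwords to a 16-byte boundary
	 *     if (lead > n) lead = n;
	 */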

	negl	%eax
	andl	$15, %eax
	sarl	$2, %eax

	cmpl	%edx, %eax
	cmovbe	%eax, %esi
	subl	%esi, %edx

	testl	$1, %esi
	je	.L41

	DO_ONE_PIXEL()
.L41:
	testl	$2, %esi
	je	.L40

	movq	(%ebx), %xmm0
	addl	$8, %ebx

	movdqa	%xmm0, %xmm3
	movdqa	%xmm0, %xmm4
	andps	%xmm1, %xmm0

	andps	%xmm2, %xmm3
	pslldq	$2, %xmm4
	psrldq	$2, %xmm3
	andps	%xmm2, %xmm4

	orps	%xmm4, %xmm3
	orps	%xmm3, %xmm0

	movq	%xmm0, (%ecx)
	addl	$8, %ecx
.L40:

	/* Would it be worth having a specialized version of this loop for
	 * the case where the destination is 16-byte aligned?  That version
	 * would be identical except that it could use movdqa instead of
	 * movdqu.
	 */

	movl	%edx, %eax
	shrl	$2, %eax
	jmp	.L42
.L43:
	movdqa	(%ebx), %xmm0
	addl	$16, %ebx

	movdqa	%xmm0, %xmm3
	movdqa	%xmm0, %xmm4
	andps	%xmm1, %xmm0

	andps	%xmm2, %xmm3
	pslldq	$2, %xmm4
	psrldq	$2, %xmm3
	andps	%xmm2, %xmm4

	orps	%xmm4, %xmm3
	orps	%xmm3, %xmm0

	movdqu	%xmm0, (%ecx)
	addl	$16, %ecx
	subl	$1, %eax
.L42:
	jne	.L43


	/* There may be up to 3 pixels remaining to be copied.  Take care
	 * of them now.  We do the 2 pixel case first because the data
	 * will be aligned.
	 */

	testl	$2, %edx
	je	.L47

	movq	(%ebx), %xmm0
	addl	$8, %ebx

	movdqa	%xmm0, %xmm3
	movdqa	%xmm0, %xmm4
	andps	%xmm1, %xmm0

	andps	%xmm2, %xmm3
	pslldq	$2, %xmm4
	psrldq	$2, %xmm3
	andps	%xmm2, %xmm4

	orps	%xmm4, %xmm3
	orps	%xmm3, %xmm0

	movq	%xmm0, (%ecx)
	addl	$8, %ecx
.L47:

	testl	$1, %edx
	je	.L46

	DO_ONE_LAST_PIXEL()
.L46:

	popl	%ebx
	popl	%esi
	ret
	.size _generic_read_RGBA_span_BGRA8888_REV_SSE2, .-_generic_read_RGBA_span_BGRA8888_REV_SSE2



#define MASK_565_L	0x07e0f800
#define MASK_565_H	0x0000001f
/* Setting SCALE_ADJUST to 5 gives a perfect match with the
 * classic C implementation in Mesa.  Setting SCALE_ADJUST
 * to 0 is slightly faster but at a small cost to accuracy.
 */
#define SCALE_ADJUST	5
#if SCALE_ADJUST == 5
#define PRESCALE_L	0x00100001
#define PRESCALE_H	0x00000200
#define SCALE_L		0x40C620E8
#define SCALE_H		0x0000839d
#elif SCALE_ADJUST == 0
#define PRESCALE_L	0x00200001
#define PRESCALE_H	0x00000800
#define SCALE_L		0x01040108
#define SCALE_H		0x00000108
#else
#error SCALE_ADJUST must either be 5 or 0.
#endif
#define ALPHA_L		0x00000000
#define ALPHA_H		0x00ff0000
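
/* For SCALE_ADJUST == 5, the arithmetic for the red channel of one pixel
 * works out as follows (a sketch; green and blue use the other PRESCALE
 * and SCALE words the same way):
 *
 *     uint16_t r  = pixel & 0xf800;               // red field, i.e. r5 << 11
 *     uint16_t t  = (uint16_t)(r * 0x0001) >> 5;  // pmullw PRESCALE, psrlw SCALE_ADJUST
 *     uint8_t  r8 = ((uint32_t)t * 0x20E8) >> 16; // pmulhuw SCALE
 *
 * which expands the 5-bit component to 8 bits (approximately r5 * 255 / 31)
 * using only multiplies and shifts.
 */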

/**
 * MMX optimized version of the RGB565 to RGBA copy routine.
 */

.text
.globl _generic_read_RGBA_span_RGB565_MMX
#ifndef USE_DRICORE
.hidden _generic_read_RGBA_span_RGB565_MMX
#endif
.type _generic_read_RGBA_span_RGB565_MMX, @function

_generic_read_RGBA_span_RGB565_MMX:

#ifdef USE_INNER_EMMS
	emms
#endif

	movl	4(%esp), %eax	/* source pointer */
	movl	8(%esp), %edx	/* destination pointer */
	movl	12(%esp), %ecx	/* number of pixels to copy */

	pushl	$MASK_565_H
	pushl	$MASK_565_L
	movq	(%esp), %mm5
	pushl	$PRESCALE_H
	pushl	$PRESCALE_L
	movq	(%esp), %mm6
	pushl	$SCALE_H
	pushl	$SCALE_L
	movq	(%esp), %mm7
	pushl	$ALPHA_H
	pushl	$ALPHA_L
	movq	(%esp), %mm3
	addl	$32, %esp

	sarl	$2, %ecx
	jl	.L01		/* Bail early if the count is negative. */
	jmp	.L02

.L03:
	/* Fetch 4 RGB565 pixels into %mm4.  Distribute the first and
	 * second pixels into the four words of %mm0 and %mm2.
	 */

	movq	(%eax), %mm4
	addl	$8, %eax

	pshufw	$0x00, %mm4, %mm0
	pshufw	$0x55, %mm4, %mm2


	/* Mask the pixels so that each word of each register contains only
	 * one color component.
	 */

	pand	%mm5, %mm0
	pand	%mm5, %mm2


	/* Adjust the component values so that they are as small as possible,
	 * but large enough so that we can multiply them by an unsigned 16-bit
	 * number and get a value as large as 0x00ff0000.
	 */

	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif

	/* Scale the input component values to be on the range
	 * [0, 0x00ff0000].  This is the real magic of the whole routine.
	 */

	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2


	/* Always set the alpha value to 0xff.
	 */

	por	%mm3, %mm0
	por	%mm3, %mm2


	/* Pack the 16-bit values to 8-bit values and store the converted
	 * pixel data.
	 */

	packuswb	%mm2, %mm0
	movq	%mm0, (%edx)
	addl	$8, %edx

	pshufw	$0xaa, %mm4, %mm0
	pshufw	$0xff, %mm4, %mm2

	pand	%mm5, %mm0
	pand	%mm5, %mm2
	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif
	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2

	por	%mm3, %mm0
	por	%mm3, %mm2

	packuswb	%mm2, %mm0

	movq	%mm0, (%edx)
	addl	$8, %edx

	subl	$1, %ecx
.L02:
	jne	.L03


	/* At this point there can be at most 3 pixels left to process.  If
	 * there are 2 or 3 left, process 2 of them now.
	 */

	movl	12(%esp), %ecx
	testl	$0x02, %ecx
	je	.L04

	movd	(%eax), %mm4
	addl	$4, %eax

	pshufw	$0x00, %mm4, %mm0
	pshufw	$0x55, %mm4, %mm2

	pand	%mm5, %mm0
	pand	%mm5, %mm2
	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif
	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2

	por	%mm3, %mm0
	por	%mm3, %mm2

	packuswb	%mm2, %mm0

	movq	%mm0, (%edx)
	addl	$8, %edx

.L04:
	/* At this point there can be at most 1 pixel left to process.
	 * Process it if needed.
	 */

	testl	$0x01, %ecx
	je	.L01

	movzwl	(%eax), %ecx
	movd	%ecx, %mm4

	pshufw	$0x00, %mm4, %mm0

	pand	%mm5, %mm0
	pmullw	%mm6, %mm0
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
#endif
	pmulhuw	%mm7, %mm0

	por	%mm3, %mm0

	packuswb	%mm0, %mm0

	movd	%mm0, (%edx)

.L01:
#ifdef USE_INNER_EMMS
	emms
#endif
	ret
#endif /* !defined(__DJGPP__) && !defined(__MINGW32__) && !defined(__APPLE__) */

#if defined (__ELF__) && defined (__linux__)
.section .note.GNU-stack,"",%progbits
#endif