Added a few more stubs so that control reaches DestroyDevice().
[mesa.git] / src / mesa / x86 / read_rgba_span_x86.S
/*
 * (C) Copyright IBM Corporation 2004
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
 * IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

/**
 * \file read_rgba_span_x86.S
 * Optimized routines to transfer pixel data from the framebuffer to a
 * buffer in main memory.
 *
 * \author Ian Romanick <idr@us.ibm.com>
 */

/* Control flow enforcement support */
#ifdef HAVE_CET_H
#include <cet.h>
#else
#define _CET_ENDBR
#endif

        .file "read_rgba_span_x86.S"
#if !defined(__MINGW32__) && !defined(__APPLE__) /* this one cries for assyntax.h */
/* Kevin F. Quinn 2nd July 2006
 * Replaced data segment constants with text-segment instructions.
 */
#define LOAD_MASK(mvins,m1,m2) \
        pushl   $0xff00ff00 ;\
        pushl   $0xff00ff00 ;\
        pushl   $0xff00ff00 ;\
        pushl   $0xff00ff00 ;\
        mvins   (%esp), m1 ;\
        pushl   $0x00ff0000 ;\
        pushl   $0x00ff0000 ;\
        pushl   $0x00ff0000 ;\
        pushl   $0x00ff0000 ;\
        mvins   (%esp), m2 ;\
        addl    $32, %esp

/* I implemented these as macros because they appear in several places,
 * and I've tweaked them a number of times.  I got tired of changing every
 * place they appear. :)
 */

#define DO_ONE_PIXEL() \
        movl    (%ebx), %eax ; \
        addl    $4, %ebx ; \
        bswap   %eax          /* ARGB -> BGRA */ ; \
        rorl    $8, %eax      /* BGRA -> ABGR */ ; \
        movl    %eax, (%ecx)  /* ABGR -> R, G, B, A */ ; \
        addl    $4, %ecx

#define DO_ONE_LAST_PIXEL() \
        movl    (%ebx), %eax ; \
        bswap   %eax          /* ARGB -> BGRA */ ; \
        rorl    $8, %eax      /* BGRA -> ABGR */ ; \
        movl    %eax, (%ecx)  /* ABGR -> R, G, B, A */ ;

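/* For reference, a C sketch of what DO_ONE_PIXEL() computes for a single
 * 32-bit pixel.  This is an illustrative model only (it is not part of the
 * build); __builtin_bswap32 is the GCC builtin equivalent of bswap:
 *
 *     #include <stdint.h>
 *
 *     static uint32_t bgra8888_rev_to_rgba(uint32_t p)
 *     {
 *         p = __builtin_bswap32(p);     // bswap:   ARGB -> BGRA
 *         return (p >> 8) | (p << 24);  // rorl $8: BGRA -> ABGR
 *     }
 *
 * Stored little-endian, the resulting dword lands in memory as the bytes
 * R, G, B, A.
 */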

/**
 * MMX optimized version of the BGRA8888_REV to RGBA copy routine.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
 */

.globl _generic_read_RGBA_span_BGRA8888_REV_MMX
.hidden _generic_read_RGBA_span_BGRA8888_REV_MMX
        .type _generic_read_RGBA_span_BGRA8888_REV_MMX, @function
_generic_read_RGBA_span_BGRA8888_REV_MMX:
        _CET_ENDBR
        pushl   %ebx

#ifdef USE_INNER_EMMS
        emms
#endif
        LOAD_MASK(movq,%mm1,%mm2)

        movl    8(%esp), %ebx   /* source pointer */
        movl    16(%esp), %edx  /* number of pixels to copy */
        movl    12(%esp), %ecx  /* destination pointer */

        testl   %edx, %edx
        jle     .L20            /* Bail if there's nothing to do. */

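        /* If the source is 4-byte but not 8-byte aligned, convert one pixel
         * with the scalar macro so that the movq loads in the loop below
         * start on an 8-byte boundary (this assumes a 4-byte aligned
         * source pointer).
         */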
        movl    %ebx, %eax

        negl    %eax
        sarl    $2, %eax
        andl    $1, %eax
        je      .L17

        subl    %eax, %edx
        DO_ONE_PIXEL()
.L17:

        /* Would it be faster to unroll this loop once and process 4 pixels
         * per pass, instead of just two?
         */

        movl    %edx, %eax
        shrl    %eax
        jmp     .L18
.L19:
        movq    (%ebx), %mm0
        addl    $8, %ebx

        /* These 9 instructions do what PSHUFB (if there were such an
         * instruction) could do in 1. :(
         */
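        /* As a sketch (not part of the build), the equivalent of those 9
         * instructions for one 32-bit pixel in C, using the two masks that
         * LOAD_MASK put in m1 (0xff00ff00) and m2 (0x00ff0000):
         *
         *     #include <stdint.h>
         *
         *     static uint32_t swizzle_bgra_to_rgba(uint32_t p)
         *     {
         *         return (p & 0xff00ff00u)           // A and G stay put
         *              | ((p >> 16) & 0x000000ffu)   // R moves to byte 0
         *              | ((p << 16) & 0x00ff0000u);  // B moves to byte 2
         *     }
         *
         * The MMX code below does the same thing on two pixels at a time
         * with pand/psllq/psrlq/por.
         */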

        movq    %mm0, %mm3
        movq    %mm0, %mm4

        pand    %mm2, %mm3
        psllq   $16, %mm4
        psrlq   $16, %mm3
        pand    %mm2, %mm4

        pand    %mm1, %mm0
        por     %mm4, %mm3
        por     %mm3, %mm0

        movq    %mm0, (%ecx)
        addl    $8, %ecx
        subl    $1, %eax
.L18:
        jne     .L19

#ifdef USE_INNER_EMMS
        emms
#endif

        /* At this point there are either 1 or 0 pixels remaining to be
         * converted.  Convert the last pixel, if needed.
         */

        testl   $1, %edx
        je      .L20

        DO_ONE_LAST_PIXEL()

.L20:
        popl    %ebx
        ret
        .size _generic_read_RGBA_span_BGRA8888_REV_MMX, .-_generic_read_RGBA_span_BGRA8888_REV_MMX


/**
 * SSE optimized version of the BGRA8888_REV to RGBA copy routine.  SSE
 * instructions are only actually used to read data from the framebuffer.
 * In practice, the speed-up is pretty small.
 *
 * \todo
 * Do some more testing and determine if there's any reason to have this
 * function in addition to the MMX version.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
 */

.globl _generic_read_RGBA_span_BGRA8888_REV_SSE
.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE
        .type _generic_read_RGBA_span_BGRA8888_REV_SSE, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE:
        _CET_ENDBR
        pushl   %esi
        pushl   %ebx
        pushl   %ebp

#ifdef USE_INNER_EMMS
        emms
#endif

        LOAD_MASK(movq,%mm1,%mm2)

        movl    16(%esp), %ebx  /* source pointer */
        movl    24(%esp), %edx  /* number of pixels to copy */
        movl    20(%esp), %ecx  /* destination pointer */

        testl   %edx, %edx
        jle     .L35            /* Bail if there's nothing to do. */

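        /* Carve out a 16-byte-aligned scratch slot below the stack pointer;
         * %ebp keeps the original %esp so it can be restored later.  The
         * main loop stores an SSE register here and reloads it as two MMX
         * registers.
         */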
        movl    %esp, %ebp
        subl    $16, %esp
        andl    $0xfffffff0, %esp

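        /* Count how many leading pixels (0-3) must be handled before the
         * source pointer is 16-byte aligned, roughly (-(uintptr_t)src & 15) / 4
         * in C, clamped to the pixel count.  The one- and two-pixel cases
         * below take care of them; %edx is left with the pixels for the
         * aligned movaps loop.
         */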
        movl    %ebx, %eax
        movl    %edx, %esi

        negl    %eax
        andl    $15, %eax
        sarl    $2, %eax
        cmpl    %edx, %eax
        cmovle  %eax, %esi

        subl    %esi, %edx

        testl   $1, %esi
        je      .L32

        DO_ONE_PIXEL()
.L32:

        testl   $2, %esi
        je      .L31

        movq    (%ebx), %mm0
        addl    $8, %ebx

        movq    %mm0, %mm3
        movq    %mm0, %mm4

        pand    %mm2, %mm3
        psllq   $16, %mm4
        psrlq   $16, %mm3
        pand    %mm2, %mm4

        pand    %mm1, %mm0
        por     %mm4, %mm3
        por     %mm3, %mm0

        movq    %mm0, (%ecx)
        addl    $8, %ecx
.L31:

        movl    %edx, %eax
        shrl    $2, %eax
        jmp     .L33
.L34:
        movaps  (%ebx), %xmm0
        addl    $16, %ebx

        /* This would be so much better if we could just move directly from
         * an SSE register to an MMX register.  Unfortunately, that
         * functionality wasn't introduced until SSE2 with the MOVDQ2Q
         * instruction.
         */

        movaps  %xmm0, (%esp)
        movq    (%esp), %mm0
        movq    8(%esp), %mm5

        movq    %mm0, %mm3
        movq    %mm0, %mm4
        movq    %mm5, %mm6
        movq    %mm5, %mm7

        pand    %mm2, %mm3
        pand    %mm2, %mm6

        psllq   $16, %mm4
        psllq   $16, %mm7

        psrlq   $16, %mm3
        psrlq   $16, %mm6

        pand    %mm2, %mm4
        pand    %mm2, %mm7

        pand    %mm1, %mm0
        pand    %mm1, %mm5

        por     %mm4, %mm3
        por     %mm7, %mm6

        por     %mm3, %mm0
        por     %mm6, %mm5

        movq    %mm0, (%ecx)
        movq    %mm5, 8(%ecx)
        addl    $16, %ecx

        subl    $1, %eax
.L33:
        jne     .L34

#ifdef USE_INNER_EMMS
        emms
#endif
        movl    %ebp, %esp

        /* At this point there are either [0, 3] pixels remaining to be
         * converted.
         */

        testl   $2, %edx
        je      .L36

        movq    (%ebx), %mm0
        addl    $8, %ebx

        movq    %mm0, %mm3
        movq    %mm0, %mm4

        pand    %mm2, %mm3
        psllq   $16, %mm4
        psrlq   $16, %mm3
        pand    %mm2, %mm4

        pand    %mm1, %mm0
        por     %mm4, %mm3
        por     %mm3, %mm0

        movq    %mm0, (%ecx)
        addl    $8, %ecx
.L36:

        testl   $1, %edx
        je      .L35

        DO_ONE_LAST_PIXEL()
.L35:
        popl    %ebp
        popl    %ebx
        popl    %esi
        ret
        .size _generic_read_RGBA_span_BGRA8888_REV_SSE, .-_generic_read_RGBA_span_BGRA8888_REV_SSE


/**
 * SSE2 optimized version of the BGRA8888_REV to RGBA copy routine.
 */

        .text
.globl _generic_read_RGBA_span_BGRA8888_REV_SSE2
.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE2
        .type _generic_read_RGBA_span_BGRA8888_REV_SSE2, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE2:
        _CET_ENDBR
        pushl   %esi
        pushl   %ebx

        LOAD_MASK(movdqu,%xmm1,%xmm2)

        movl    12(%esp), %ebx  /* source pointer */
        movl    20(%esp), %edx  /* number of pixels to copy */
        movl    16(%esp), %ecx  /* destination pointer */

        movl    %ebx, %eax
        movl    %edx, %esi

        testl   %edx, %edx
        jle     .L46            /* Bail if there's nothing to do. */

        /* If the source pointer isn't a multiple of 16 we have to process
         * a few pixels the "slow" way to get the address aligned for
         * the SSE fetch instructions.
         */
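        /* In C terms, that number of leading "slow" pixels is roughly
         * (-(uintptr_t)src & 15) / 4, clamped to the total pixel count; it
         * ends up in %esi, and %edx is reduced to the pixels handled by the
         * aligned movdqa loop and the tail code.
         */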

        negl    %eax
        andl    $15, %eax
        sarl    $2, %eax

        cmpl    %edx, %eax
        cmovbe  %eax, %esi
        subl    %esi, %edx

        testl   $1, %esi
        je      .L41

        DO_ONE_PIXEL()
.L41:
        testl   $2, %esi
        je      .L40

        movq    (%ebx), %xmm0
        addl    $8, %ebx

        movdqa  %xmm0, %xmm3
        movdqa  %xmm0, %xmm4
        andps   %xmm1, %xmm0

        andps   %xmm2, %xmm3
        pslldq  $2, %xmm4
        psrldq  $2, %xmm3
        andps   %xmm2, %xmm4

        orps    %xmm4, %xmm3
        orps    %xmm3, %xmm0

        movq    %xmm0, (%ecx)
        addl    $8, %ecx
.L40:

        /* Would it be worth having a specialized version of this loop for
         * the case where the destination is 16-byte aligned?  That version
         * would be identical except that it could use movdqa instead of
         * movdqu.
         */

        movl    %edx, %eax
        shrl    $2, %eax
        jmp     .L42
.L43:
        movdqa  (%ebx), %xmm0
        addl    $16, %ebx

        movdqa  %xmm0, %xmm3
        movdqa  %xmm0, %xmm4
        andps   %xmm1, %xmm0

        andps   %xmm2, %xmm3
        pslldq  $2, %xmm4
        psrldq  $2, %xmm3
        andps   %xmm2, %xmm4

        orps    %xmm4, %xmm3
        orps    %xmm3, %xmm0

        movdqu  %xmm0, (%ecx)
        addl    $16, %ecx
        subl    $1, %eax
.L42:
        jne     .L43


        /* There may be up to 3 pixels remaining to be copied.  Take care
         * of them now.  We do the 2 pixel case first because the data
         * will be aligned.
         */

        testl   $2, %edx
        je      .L47

        movq    (%ebx), %xmm0
        addl    $8, %ebx

        movdqa  %xmm0, %xmm3
        movdqa  %xmm0, %xmm4
        andps   %xmm1, %xmm0

        andps   %xmm2, %xmm3
        pslldq  $2, %xmm4
        psrldq  $2, %xmm3
        andps   %xmm2, %xmm4

        orps    %xmm4, %xmm3
        orps    %xmm3, %xmm0

        movq    %xmm0, (%ecx)
        addl    $8, %ecx
.L47:

        testl   $1, %edx
        je      .L46

        DO_ONE_LAST_PIXEL()
.L46:

        popl    %ebx
        popl    %esi
        ret
        .size _generic_read_RGBA_span_BGRA8888_REV_SSE2, .-_generic_read_RGBA_span_BGRA8888_REV_SSE2



#define MASK_565_L      0x07e0f800
#define MASK_565_H      0x0000001f
/* Setting SCALE_ADJUST to 5 gives a perfect match with the
 * classic C implementation in Mesa.  Setting SCALE_ADJUST
 * to 0 is slightly faster but at a small cost to accuracy.
 */
#define SCALE_ADJUST    5
#if SCALE_ADJUST == 5
#define PRESCALE_L      0x00100001
#define PRESCALE_H      0x00000200
#define SCALE_L         0x40C620E8
#define SCALE_H         0x0000839d
#elif SCALE_ADJUST == 0
#define PRESCALE_L      0x00200001
#define PRESCALE_H      0x00000800
#define SCALE_L         0x01040108
#define SCALE_H         0x00000108
#else
#error SCALE_ADJUST must either be 5 or 0.
#endif
#define ALPHA_L         0x00000000
#define ALPHA_H         0x00ff0000
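
/* For reference, a C sketch of the per-component data flow the MMX code
 * below implements with pmullw/psrlw/pmulhuw, shown for the red channel
 * with SCALE_ADJUST == 5.  This is an illustrative model only, not part
 * of the build:
 *
 *     #include <stdint.h>
 *
 *     static uint8_t expand_red_565(uint16_t pixel)
 *     {
 *         uint32_t v = pixel & 0xf800;           // word 0 of the 565 mask
 *         v = (v * 0x0001) & 0xffff;             // pmullw, word 0 of PRESCALE
 *         v >>= 5;                               // psrlw $SCALE_ADJUST
 *         return (uint8_t)((v * 0x20E8) >> 16);  // pmulhuw, word 0 of SCALE
 *     }
 *
 * Green and blue follow the same pattern with their own mask, prescale,
 * and scale words; the alpha word is simply OR'd with 0x00ff.
 */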

/**
 * MMX optimized version of the RGB565 to RGBA copy routine.
 */

        .text
        .globl _generic_read_RGBA_span_RGB565_MMX
        .hidden _generic_read_RGBA_span_RGB565_MMX
        .type _generic_read_RGBA_span_RGB565_MMX, @function

_generic_read_RGBA_span_RGB565_MMX:
        _CET_ENDBR
#ifdef USE_INNER_EMMS
        emms
#endif

        movl    4(%esp), %eax   /* source pointer */
        movl    8(%esp), %edx   /* destination pointer */
        movl    12(%esp), %ecx  /* number of pixels to copy */

        pushl   $MASK_565_H
        pushl   $MASK_565_L
        movq    (%esp), %mm5
        pushl   $PRESCALE_H
        pushl   $PRESCALE_L
        movq    (%esp), %mm6
        pushl   $SCALE_H
        pushl   $SCALE_L
        movq    (%esp), %mm7
        pushl   $ALPHA_H
        pushl   $ALPHA_L
        movq    (%esp), %mm3
        addl    $32, %esp

        sarl    $2, %ecx
        jl      .L01            /* Bail early if the count is negative. */
        jmp     .L02

.L03:
        /* Fetch 4 RGB565 pixels into %mm4.  Distribute the first and
         * second pixels into the four words of %mm0 and %mm2.
         */

        movq    (%eax), %mm4
        addl    $8, %eax

        pshufw  $0x00, %mm4, %mm0
        pshufw  $0x55, %mm4, %mm2


        /* Mask the pixels so that each word of each register contains only
         * one color component.
         */

        pand    %mm5, %mm0
        pand    %mm5, %mm2


        /* Adjust the component values so that they are as small as possible,
         * but large enough so that we can multiply them by an unsigned 16-bit
         * number and get a value as large as 0x00ff0000.
         */

        pmullw  %mm6, %mm0
        pmullw  %mm6, %mm2
#if SCALE_ADJUST > 0
        psrlw   $SCALE_ADJUST, %mm0
        psrlw   $SCALE_ADJUST, %mm2
#endif

        /* Scale the input component values to be on the range
         * [0, 0x00ff0000].  This is the real magic of the whole routine.
         */
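        /* Worked example for red with SCALE_ADJUST == 5: the prescaled
         * value for R = 0x1f is 0x1f << 6 = 1984, and 1984 * 0x20E8 =
         * 0x00ff0600, so pmulhuw (which keeps the high 16 bits of the
         * product) yields 0x00ff.
         */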

        pmulhuw %mm7, %mm0
        pmulhuw %mm7, %mm2


        /* Always set the alpha value to 0xff.
         */

        por     %mm3, %mm0
        por     %mm3, %mm2


        /* Pack the 16-bit values to 8-bit values and store the converted
         * pixel data.
         */

        packuswb %mm2, %mm0
        movq    %mm0, (%edx)
        addl    $8, %edx

        pshufw  $0xaa, %mm4, %mm0
        pshufw  $0xff, %mm4, %mm2

        pand    %mm5, %mm0
        pand    %mm5, %mm2
        pmullw  %mm6, %mm0
        pmullw  %mm6, %mm2
#if SCALE_ADJUST > 0
        psrlw   $SCALE_ADJUST, %mm0
        psrlw   $SCALE_ADJUST, %mm2
#endif
        pmulhuw %mm7, %mm0
        pmulhuw %mm7, %mm2

        por     %mm3, %mm0
        por     %mm3, %mm2

        packuswb %mm2, %mm0

        movq    %mm0, (%edx)
        addl    $8, %edx

        subl    $1, %ecx
.L02:
        jne     .L03


        /* At this point there can be at most 3 pixels left to process.  If
         * there are either 2 or 3 left, process 2.
         */

        movl    12(%esp), %ecx
        testl   $0x02, %ecx
        je      .L04

        movd    (%eax), %mm4
        addl    $4, %eax

        pshufw  $0x00, %mm4, %mm0
        pshufw  $0x55, %mm4, %mm2

        pand    %mm5, %mm0
        pand    %mm5, %mm2
        pmullw  %mm6, %mm0
        pmullw  %mm6, %mm2
#if SCALE_ADJUST > 0
        psrlw   $SCALE_ADJUST, %mm0
        psrlw   $SCALE_ADJUST, %mm2
#endif
        pmulhuw %mm7, %mm0
        pmulhuw %mm7, %mm2

        por     %mm3, %mm0
        por     %mm3, %mm2

        packuswb %mm2, %mm0

        movq    %mm0, (%edx)
        addl    $8, %edx

.L04:
        /* At this point there can be at most 1 pixel left to process.
         * Process it if needed.
         */

        testl   $0x01, %ecx
        je      .L01

        movzwl  (%eax), %ecx
        movd    %ecx, %mm4

        pshufw  $0x00, %mm4, %mm0

        pand    %mm5, %mm0
        pmullw  %mm6, %mm0
#if SCALE_ADJUST > 0
        psrlw   $SCALE_ADJUST, %mm0
#endif
        pmulhuw %mm7, %mm0

        por     %mm3, %mm0

        packuswb %mm0, %mm0

        movd    %mm0, (%edx)

.L01:
#ifdef USE_INNER_EMMS
        emms
#endif
        ret
#endif /* !defined(__MINGW32__) && !defined(__APPLE__) */

#if defined (__ELF__) && defined (__linux__)
        .section .note.GNU-stack,"",%progbits
#endif