Use write posting in the kickoff function too.
[mesa.git] / src / mesa / pipe / tgsi / exec / tgsi_sse2.c
1 /**************************************************************************
2 *
3 * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28 #include "tgsi_platform.h"
29 #include "tgsi_core.h"
30 #include "x86/rtasm/x86sse.h"
31
32 #if defined(__i386__) || defined(__386__)
33
34 #define DUMP_SSE 0
35
36 #if DUMP_SSE
37
38 static void
39 _print_reg(
40 struct x86_reg reg )
41 {
42 switch( reg.file ) {
43 case file_REG32:
44 switch( reg.idx ) {
45 case reg_AX:
46 printf( "EAX" );
47 break;
48 case reg_CX:
49 printf( "ECX" );
50 break;
51 case reg_DX:
52 printf( "EDX" );
53 break;
54 case reg_BX:
55 printf( "EBX" );
56 break;
57 case reg_SP:
58 printf( "ESP" );
59 break;
60 case reg_BP:
61 printf( "EBP" );
62 break;
63 case reg_SI:
64 printf( "ESI" );
65 break;
66 case reg_DI:
67 printf( "EDI" );
68 break;
69 }
70 break;
71 case file_MMX:
72 assert( 0 );
73 break;
74 case file_XMM:
75 printf( "XMM%u", reg.idx );
76 break;
77 case file_x87:
78 assert( 0 );
79 break;
80 }
81 }
82
/**
 * Pad the dump output with spaces so that operands line up in a column.
 *
 * Prints (10 - strlen(op)) spaces after an opcode mnemonic.  Mnemonics that
 * are already 10 characters or longer get no padding; previously the
 * unsigned subtraction wrapped around and printed ~4 billion spaces.
 *
 * \param op  NUL-terminated opcode mnemonic that was just printed.
 */
static void
_fill(
   const char *op )
{
   const size_t column = 10;
   const size_t len = strlen( op );
   size_t count = (len < column) ? column - len : 0;

   while( count-- ) {
      printf( " " );
   }
}
93
/*
 * Dump macros used when DUMP_SSE is non-zero.  Each one starts a new output
 * line; the suffix tells which operands follow the mnemonic:
 * I = unsigned immediate (printed with %u), R = register.
 */
#define DUMP_START() printf( "\nsse-dump start ----------------" )
#define DUMP_END() printf( "\nsse-dump end ----------------\n" )
#define DUMP( OP ) printf( "\n%s", OP )

#define DUMP_I( OP, I ) do {\
   DUMP( OP );\
   _fill( OP );\
   printf( "%u", I );\
} while( 0 )

#define DUMP_R( OP, R0 ) do {\
   DUMP( OP );\
   _fill( OP );\
   _print_reg( R0 );\
} while( 0 )

#define DUMP_RR( OP, R0, R1 ) do {\
   DUMP( OP );\
   _fill( OP );\
   _print_reg( R0 );\
   printf( ", " );\
   _print_reg( R1 );\
} while( 0 )

#define DUMP_RRI( OP, R0, R1, I ) do {\
   DUMP( OP );\
   _fill( OP );\
   _print_reg( R0 );\
   printf( ", " );\
   _print_reg( R1 );\
   printf( ", " );\
   printf( "%u", I );\
} while( 0 )
119
120 #else
121
/* Dumping disabled: every dump macro expands to nothing. */
#define DUMP_START()
#define DUMP_END()
#define DUMP( OP )
#define DUMP_I( OP, I )
#define DUMP_R( OP, R0 )
#define DUMP_RR( OP, R0, R1 )
#define DUMP_RRI( OP, R0, R1, I )
129
130 #endif
131
/* Loop header: iterate CHAN over the four channel indices 0..3. */
#define FOR_EACH_CHANNEL( CHAN )\
   for( CHAN = 0; CHAN < 4; CHAN++ )

/* Non-zero when bit CHAN is set in the write mask of destination 0. */
#define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
   ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))

/* "if" header guarding a statement on destination 0's write mask. */
#define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
   if( IS_DST0_CHANNEL_ENABLED( INST, CHAN ))

/* Loop header visiting only the channels enabled in destination 0. */
#define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
   FOR_EACH_CHANNEL( CHAN )\
      IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )

/* Channel indices. */
#define CHAN_X 0
#define CHAN_Y 1
#define CHAN_Z 2
#define CHAN_W 3

/* Scratch temporary used to pass values to the C helper callbacks. */
#define TEMP_R0 TGSI_EXEC_TEMP_R0
151
152 /**
153 * X86 utility functions.
154 */
155
156 static struct x86_reg
157 make_xmm(
158 unsigned xmm )
159 {
160 return x86_make_reg(
161 file_XMM,
162 (enum x86_reg_name) xmm );
163 }
164
165 /**
166 * X86 register mapping helpers.
167 */
168
169 static struct x86_reg
170 get_const_base( void )
171 {
172 return x86_make_reg(
173 file_REG32,
174 reg_CX );
175 }
176
177 static struct x86_reg
178 get_input_base( void )
179 {
180 return x86_make_reg(
181 file_REG32,
182 reg_AX );
183 }
184
185 static struct x86_reg
186 get_output_base( void )
187 {
188 return x86_make_reg(
189 file_REG32,
190 reg_DX );
191 }
192
193 static struct x86_reg
194 get_temp_base( void )
195 {
196 return x86_make_reg(
197 file_REG32,
198 reg_BX );
199 }
200
201 static struct x86_reg
202 get_coef_base( void )
203 {
204 return get_output_base();
205 }
206
207 /**
208 * Data access helpers.
209 */
210
211 static struct x86_reg
212 get_argument(
213 unsigned index )
214 {
215 return x86_make_disp(
216 x86_make_reg( file_REG32, reg_SP ),
217 (index + 1) * 4 );
218 }
219
220 static struct x86_reg
221 get_const(
222 unsigned vec,
223 unsigned chan )
224 {
225 return x86_make_disp(
226 get_const_base(),
227 (vec * 4 + chan) * 4 );
228 }
229
230 static struct x86_reg
231 get_input(
232 unsigned vec,
233 unsigned chan )
234 {
235 return x86_make_disp(
236 get_input_base(),
237 (vec * 4 + chan) * 16 );
238 }
239
240 static struct x86_reg
241 get_output(
242 unsigned vec,
243 unsigned chan )
244 {
245 return x86_make_disp(
246 get_output_base(),
247 (vec * 4 + chan) * 16 );
248 }
249
250 static struct x86_reg
251 get_temp(
252 unsigned vec,
253 unsigned chan )
254 {
255 return x86_make_disp(
256 get_temp_base(),
257 (vec * 4 + chan) * 16 );
258 }
259
260 static struct x86_reg
261 get_coef(
262 unsigned vec,
263 unsigned chan,
264 unsigned member )
265 {
266 return x86_make_disp(
267 get_coef_base(),
268 ((vec * 3 + member) * 4 + chan) * 4 );
269 }
270
271 /**
272 * X86 rtasm wrappers.
273 */
274
275 static void
276 emit_addps(
277 struct x86_function *func,
278 struct x86_reg dst,
279 struct x86_reg src )
280 {
281 DUMP_RR( "ADDPS", dst, src );
282 sse_addps( func, dst, src );
283 }
284
285 static void
286 emit_andnps(
287 struct x86_function *func,
288 struct x86_reg dst,
289 struct x86_reg src )
290 {
291 DUMP_RR( "ANDNPS", dst, src );
292 sse_andnps( func, dst, src );
293 }
294
295 static void
296 emit_andps(
297 struct x86_function *func,
298 struct x86_reg dst,
299 struct x86_reg src )
300 {
301 DUMP_RR( "ANDPS", dst, src );
302 sse_andps( func, dst, src );
303 }
304
/**
 * Emit a CALL to the C function at \p addr (and log it when dumping is
 * enabled).
 *
 * DUMP_I prints its operand with "%u", so the function pointer is converted
 * to an unsigned integer explicitly instead of being passed to printf as a
 * pointer, which would mismatch the conversion specifier.
 */
static void
emit_call(
   struct x86_function *func,
   void (* addr)() )
{
   DUMP_I( "CALL", (unsigned) (unsigned long) addr );
   x86_call( func, addr );
}
313
314 static void
315 emit_cmpps(
316 struct x86_function *func,
317 struct x86_reg dst,
318 struct x86_reg src,
319 enum sse_cc cc )
320 {
321 DUMP_RRI( "CMPPS", dst, src, cc );
322 sse_cmpps( func, dst, src, cc );
323 }
324
325 static void
326 emit_cvttps2dq(
327 struct x86_function *func,
328 struct x86_reg dst,
329 struct x86_reg src )
330 {
331 DUMP_RR( "CVTTPS2DQ", dst, src );
332 sse2_cvttps2dq( func, dst, src );
333 }
334
335 static void
336 emit_maxps(
337 struct x86_function *func,
338 struct x86_reg dst,
339 struct x86_reg src )
340 {
341 DUMP_RR( "MAXPS", dst, src );
342 sse_maxps( func, dst, src );
343 }
344
345 static void
346 emit_minps(
347 struct x86_function *func,
348 struct x86_reg dst,
349 struct x86_reg src )
350 {
351 DUMP_RR( "MINPS", dst, src );
352 sse_minps( func, dst, src );
353 }
354
355 static void
356 emit_mov(
357 struct x86_function *func,
358 struct x86_reg dst,
359 struct x86_reg src )
360 {
361 DUMP_RR( "MOV", dst, src );
362 x86_mov( func, dst, src );
363 }
364
365 static void
366 emit_movaps(
367 struct x86_function *func,
368 struct x86_reg dst,
369 struct x86_reg src )
370 {
371 DUMP_RR( "MOVAPS", dst, src );
372 sse_movaps( func, dst, src );
373 }
374
375 static void
376 emit_movss(
377 struct x86_function *func,
378 struct x86_reg dst,
379 struct x86_reg src )
380 {
381 DUMP_RR( "MOVSS", dst, src );
382 sse_movss( func, dst, src );
383 }
384
385 static void
386 emit_movups(
387 struct x86_function *func,
388 struct x86_reg dst,
389 struct x86_reg src )
390 {
391 DUMP_RR( "MOVUPS", dst, src );
392 sse_movups( func, dst, src );
393 }
394
395 static void
396 emit_mulps(
397 struct x86_function *func,
398 struct x86_reg dst,
399 struct x86_reg src )
400 {
401 DUMP_RR( "MULPS", dst, src );
402 sse_mulps( func, dst, src );
403 }
404
405 static void
406 emit_or(
407 struct x86_function *func,
408 struct x86_reg dst,
409 struct x86_reg src )
410 {
411 DUMP_RR( "OR", dst, src );
412 x86_or( func, dst, src );
413 }
414
415 static void
416 emit_orps(
417 struct x86_function *func,
418 struct x86_reg dst,
419 struct x86_reg src )
420 {
421 DUMP_RR( "ORPS", dst, src );
422 sse_orps( func, dst, src );
423 }
424
425 static void
426 emit_pmovmskb(
427 struct x86_function *func,
428 struct x86_reg dst,
429 struct x86_reg src )
430 {
431 DUMP_RR( "PMOVMSKB", dst, src );
432 sse_pmovmskb( func, dst, src );
433 }
434
435 static void
436 emit_pop(
437 struct x86_function *func,
438 struct x86_reg dst )
439 {
440 DUMP_R( "POP", dst );
441 x86_pop( func, dst );
442 }
443
444 static void
445 emit_push(
446 struct x86_function *func,
447 struct x86_reg dst )
448 {
449 DUMP_R( "PUSH", dst );
450 x86_push( func, dst );
451 }
452
453 static void
454 emit_rcpps(
455 struct x86_function *func,
456 struct x86_reg dst,
457 struct x86_reg src )
458 {
459 DUMP_RR( "RCPPS", dst, src );
460 sse2_rcpps( func, dst, src );
461 }
462
#ifdef WIN32
/** Emit a RET that takes the immediate \p size (WIN32 builds only). */
static void
emit_retw( struct x86_function *func, unsigned size )
{
   DUMP_I( "RET", size );
   x86_retw( func, size );
}
#else
/** Emit a plain RET (non-WIN32 builds). */
static void
emit_ret( struct x86_function *func )
{
   DUMP( "RET" );
   x86_ret( func );
}
#endif
481
482 static void
483 emit_rsqrtps(
484 struct x86_function *func,
485 struct x86_reg dst,
486 struct x86_reg src )
487 {
488 DUMP_RR( "RSQRTPS", dst, src );
489 sse_rsqrtps( func, dst, src );
490 }
491
492 static void
493 emit_shufps(
494 struct x86_function *func,
495 struct x86_reg dst,
496 struct x86_reg src,
497 unsigned char shuf )
498 {
499 DUMP_RRI( "SHUFPS", dst, src, shuf );
500 sse_shufps( func, dst, src, shuf );
501 }
502
503 static void
504 emit_subps(
505 struct x86_function *func,
506 struct x86_reg dst,
507 struct x86_reg src )
508 {
509 DUMP_RR( "SUBPS", dst, src );
510 sse_subps( func, dst, src );
511 }
512
513 static void
514 emit_xorps(
515 struct x86_function *func,
516 struct x86_reg dst,
517 struct x86_reg src )
518 {
519 DUMP_RR( "XORPS", dst, src );
520 sse_xorps( func, dst, src );
521 }
522
523 /**
524 * Data fetch helpers.
525 */
526
527 static void
528 emit_const(
529 struct x86_function *func,
530 unsigned xmm,
531 unsigned vec,
532 unsigned chan )
533 {
534 emit_movss(
535 func,
536 make_xmm( xmm ),
537 get_const( vec, chan ) );
538 emit_shufps(
539 func,
540 make_xmm( xmm ),
541 make_xmm( xmm ),
542 SHUF( 0, 0, 0, 0 ) );
543 }
544
545 static void
546 emit_inputf(
547 struct x86_function *func,
548 unsigned xmm,
549 unsigned vec,
550 unsigned chan )
551 {
552 emit_movups(
553 func,
554 make_xmm( xmm ),
555 get_input( vec, chan ) );
556 }
557
558 static void
559 emit_output(
560 struct x86_function *func,
561 unsigned xmm,
562 unsigned vec,
563 unsigned chan )
564 {
565 emit_movups(
566 func,
567 get_output( vec, chan ),
568 make_xmm( xmm ) );
569 }
570
571 static void
572 emit_tempf(
573 struct x86_function *func,
574 unsigned xmm,
575 unsigned vec,
576 unsigned chan )
577 {
578 emit_movaps(
579 func,
580 make_xmm( xmm ),
581 get_temp( vec, chan ) );
582 }
583
584 static void
585 emit_coef(
586 struct x86_function *func,
587 unsigned xmm,
588 unsigned vec,
589 unsigned chan,
590 unsigned member )
591 {
592 emit_movss(
593 func,
594 make_xmm( xmm ),
595 get_coef( vec, chan, member ) );
596 emit_shufps(
597 func,
598 make_xmm( xmm ),
599 make_xmm( xmm ),
600 SHUF( 0, 0, 0, 0 ) );
601 }
602
603 /**
604 * Data store helpers.
605 */
606
607 static void
608 emit_inputs(
609 struct x86_function *func,
610 unsigned xmm,
611 unsigned vec,
612 unsigned chan )
613 {
614 emit_movups(
615 func,
616 get_input( vec, chan ),
617 make_xmm( xmm ) );
618 }
619
620 static void
621 emit_temps(
622 struct x86_function *func,
623 unsigned xmm,
624 unsigned vec,
625 unsigned chan )
626 {
627 emit_movaps(
628 func,
629 get_temp( vec, chan ),
630 make_xmm( xmm ) );
631 }
632
633 static void
634 emit_addrs(
635 struct x86_function *func,
636 unsigned xmm,
637 unsigned vec,
638 unsigned chan )
639 {
640 emit_temps(
641 func,
642 xmm,
643 vec + TGSI_EXEC_NUM_TEMPS,
644 chan );
645 }
646
647 /**
648 * Coefficient fetch helpers.
649 */
650
/** Load the a0 coefficient (member 0) for \p vec / \p chan into XMM register \p xmm. */
static void
emit_coef_a0( struct x86_function *func, unsigned xmm, unsigned vec, unsigned chan )
{
   emit_coef( func, xmm, vec, chan, 0 );
}
665
/** Load the dadx coefficient (member 1) for \p vec / \p chan into XMM register \p xmm. */
static void
emit_coef_dadx( struct x86_function *func, unsigned xmm, unsigned vec, unsigned chan )
{
   emit_coef( func, xmm, vec, chan, 1 );
}
680
/** Load the dady coefficient (member 2) for \p vec / \p chan into XMM register \p xmm. */
static void
emit_coef_dady( struct x86_function *func, unsigned xmm, unsigned vec, unsigned chan )
{
   emit_coef( func, xmm, vec, chan, 2 );
}
695
696 /**
697 * Function call helpers.
698 */
699
700 static void
701 emit_push_gp(
702 struct x86_function *func )
703 {
704 emit_push(
705 func,
706 get_const_base() );
707 emit_push(
708 func,
709 get_input_base() );
710 emit_push(
711 func,
712 get_output_base() );
713
714 /* It is important on non-win32 platforms that temp base is pushed last.
715 */
716 emit_push(
717 func,
718 get_temp_base() );
719 }
720
721 static void
722 emit_pop_gp(
723 struct x86_function *func )
724 {
725 /* Restore GP registers in a reverse order.
726 */
727 emit_pop(
728 func,
729 get_temp_base() );
730 emit_pop(
731 func,
732 get_output_base() );
733 emit_pop(
734 func,
735 get_input_base() );
736 emit_pop(
737 func,
738 get_const_base() );
739 }
740
741 static void
742 emit_func_call_dst(
743 struct x86_function *func,
744 unsigned xmm_dst,
745 void (*code)() )
746 {
747 emit_movaps(
748 func,
749 get_temp( TEMP_R0, 0 ),
750 make_xmm( xmm_dst ) );
751
752 emit_push_gp(
753 func );
754
755 #ifdef WIN32
756 emit_push(
757 func,
758 get_temp( TEMP_R0, 0 ) );
759 #endif
760
761 emit_call(
762 func,
763 code );
764
765 emit_pop_gp(
766 func );
767
768 emit_movaps(
769 func,
770 make_xmm( xmm_dst ),
771 get_temp( TEMP_R0, 0 ) );
772 }
773
774 static void
775 emit_func_call_dst_src(
776 struct x86_function *func,
777 unsigned xmm_dst,
778 unsigned xmm_src,
779 void (*code)() )
780 {
781 emit_movaps(
782 func,
783 get_temp( TEMP_R0, 1 ),
784 make_xmm( xmm_src ) );
785
786 emit_func_call_dst(
787 func,
788 xmm_dst,
789 code );
790 }
791
792 /**
793 * Low-level instruction translators.
794 */
795
796 static void
797 emit_abs(
798 struct x86_function *func,
799 unsigned xmm )
800 {
801 emit_andps(
802 func,
803 make_xmm( xmm ),
804 get_temp(
805 TGSI_EXEC_TEMP_7FFFFFFF_I,
806 TGSI_EXEC_TEMP_7FFFFFFF_C ) );
807 }
808
809 static void
810 emit_add(
811 struct x86_function *func,
812 unsigned xmm_dst,
813 unsigned xmm_src )
814 {
815 emit_addps(
816 func,
817 make_xmm( xmm_dst ),
818 make_xmm( xmm_src ) );
819 }
820
821 static void XSTDCALL
822 cos4f(
823 float *store )
824 {
825 #ifdef WIN32
826 store[0] = (float) cos( (double) store[0] );
827 store[1] = (float) cos( (double) store[1] );
828 store[2] = (float) cos( (double) store[2] );
829 store[3] = (float) cos( (double) store[3] );
830 #else
831 const unsigned X = TEMP_R0 * 16;
832 store[X + 0] = cosf( store[X + 0] );
833 store[X + 1] = cosf( store[X + 1] );
834 store[X + 2] = cosf( store[X + 2] );
835 store[X + 3] = cosf( store[X + 3] );
836 #endif
837 }
838
839 static void
840 emit_cos(
841 struct x86_function *func,
842 unsigned xmm_dst )
843 {
844 emit_func_call_dst(
845 func,
846 xmm_dst,
847 cos4f );
848 }
849
850 static void XSTDCALL
851 ex24f(
852 float *store )
853 {
854 #ifdef WIN32
855 store[0] = (float) pow( 2.0, (double) store[0] );
856 store[1] = (float) pow( 2.0, (double) store[1] );
857 store[2] = (float) pow( 2.0, (double) store[2] );
858 store[3] = (float) pow( 2.0, (double) store[3] );
859 #else
860 const unsigned X = TEMP_R0 * 16;
861 store[X + 0] = powf( 2.0f, store[X + 0] );
862 store[X + 1] = powf( 2.0f, store[X + 1] );
863 store[X + 2] = powf( 2.0f, store[X + 2] );
864 store[X + 3] = powf( 2.0f, store[X + 3] );
865 #endif
866 }
867
868 static void
869 emit_ex2(
870 struct x86_function *func,
871 unsigned xmm_dst )
872 {
873 emit_func_call_dst(
874 func,
875 xmm_dst,
876 ex24f );
877 }
878
879 static void
880 emit_f2it(
881 struct x86_function *func,
882 unsigned xmm )
883 {
884 emit_cvttps2dq(
885 func,
886 make_xmm( xmm ),
887 make_xmm( xmm ) );
888 }
889
890 static void XSTDCALL
891 flr4f(
892 float *store )
893 {
894 #ifdef WIN32
895 const unsigned X = 0;
896 #else
897 const unsigned X = TEMP_R0 * 16;
898 #endif
899 store[X + 0] = (float) floor( (double) store[X + 0] );
900 store[X + 1] = (float) floor( (double) store[X + 1] );
901 store[X + 2] = (float) floor( (double) store[X + 2] );
902 store[X + 3] = (float) floor( (double) store[X + 3] );
903 }
904
905 static void
906 emit_flr(
907 struct x86_function *func,
908 unsigned xmm_dst )
909 {
910 emit_func_call_dst(
911 func,
912 xmm_dst,
913 flr4f );
914 }
915
916 static void XSTDCALL
917 frc4f(
918 float *store )
919 {
920 #ifdef WIN32
921 const unsigned X = 0;
922 #else
923 const unsigned X = TEMP_R0 * 16;
924 #endif
925 store[X + 0] -= (float) floor( (double) store[X + 0] );
926 store[X + 1] -= (float) floor( (double) store[X + 1] );
927 store[X + 2] -= (float) floor( (double) store[X + 2] );
928 store[X + 3] -= (float) floor( (double) store[X + 3] );
929 }
930
931 static void
932 emit_frc(
933 struct x86_function *func,
934 unsigned xmm_dst )
935 {
936 emit_func_call_dst(
937 func,
938 xmm_dst,
939 frc4f );
940 }
941
942 static void XSTDCALL
943 lg24f(
944 float *store )
945 {
946 #ifdef WIN32
947 const unsigned X = 0;
948 #else
949 const unsigned X = TEMP_R0 * 16;
950 #endif
951 store[X + 0] = LOG2( store[X + 0] );
952 store[X + 1] = LOG2( store[X + 1] );
953 store[X + 2] = LOG2( store[X + 2] );
954 store[X + 3] = LOG2( store[X + 3] );
955 }
956
957 static void
958 emit_lg2(
959 struct x86_function *func,
960 unsigned xmm_dst )
961 {
962 emit_func_call_dst(
963 func,
964 xmm_dst,
965 lg24f );
966 }
967
968 static void
969 emit_MOV(
970 struct x86_function *func,
971 unsigned xmm_dst,
972 unsigned xmm_src )
973 {
974 emit_movups(
975 func,
976 make_xmm( xmm_dst ),
977 make_xmm( xmm_src ) );
978 }
979
980 static void
981 emit_mul (struct x86_function *func,
982 unsigned xmm_dst,
983 unsigned xmm_src)
984 {
985 emit_mulps(
986 func,
987 make_xmm( xmm_dst ),
988 make_xmm( xmm_src ) );
989 }
990
991 static void
992 emit_neg(
993 struct x86_function *func,
994 unsigned xmm )
995 {
996 emit_xorps(
997 func,
998 make_xmm( xmm ),
999 get_temp(
1000 TGSI_EXEC_TEMP_80000000_I,
1001 TGSI_EXEC_TEMP_80000000_C ) );
1002 }
1003
1004 static void XSTDCALL
1005 pow4f(
1006 float *store )
1007 {
1008 #ifdef WIN32
1009 store[0] = (float) pow( (double) store[0], (double) store[4] );
1010 store[1] = (float) pow( (double) store[1], (double) store[5] );
1011 store[2] = (float) pow( (double) store[2], (double) store[6] );
1012 store[3] = (float) pow( (double) store[3], (double) store[7] );
1013 #else
1014 const unsigned X = TEMP_R0 * 16;
1015 store[X + 0] = powf( store[X + 0], store[X + 4] );
1016 store[X + 1] = powf( store[X + 1], store[X + 5] );
1017 store[X + 2] = powf( store[X + 2], store[X + 6] );
1018 store[X + 3] = powf( store[X + 3], store[X + 7] );
1019 #endif
1020 }
1021
1022 static void
1023 emit_pow(
1024 struct x86_function *func,
1025 unsigned xmm_dst,
1026 unsigned xmm_src )
1027 {
1028 emit_func_call_dst_src(
1029 func,
1030 xmm_dst,
1031 xmm_src,
1032 pow4f );
1033 }
1034
1035 static void
1036 emit_rcp (
1037 struct x86_function *func,
1038 unsigned xmm_dst,
1039 unsigned xmm_src )
1040 {
1041 emit_rcpps(
1042 func,
1043 make_xmm( xmm_dst ),
1044 make_xmm( xmm_src ) );
1045 }
1046
1047 static void
1048 emit_rsqrt(
1049 struct x86_function *func,
1050 unsigned xmm_dst,
1051 unsigned xmm_src )
1052 {
1053 emit_rsqrtps(
1054 func,
1055 make_xmm( xmm_dst ),
1056 make_xmm( xmm_src ) );
1057 }
1058
1059 static void
1060 emit_setsign(
1061 struct x86_function *func,
1062 unsigned xmm )
1063 {
1064 emit_orps(
1065 func,
1066 make_xmm( xmm ),
1067 get_temp(
1068 TGSI_EXEC_TEMP_80000000_I,
1069 TGSI_EXEC_TEMP_80000000_C ) );
1070 }
1071
1072 static void XSTDCALL
1073 sin4f(
1074 float *store )
1075 {
1076 #ifdef WIN32
1077 store[0] = (float) sin( (double) store[0] );
1078 store[1] = (float) sin( (double) store[1] );
1079 store[2] = (float) sin( (double) store[2] );
1080 store[3] = (float) sin( (double) store[3] );
1081 #else
1082 const unsigned X = TEMP_R0 * 16;
1083 store[X + 0] = sinf( store[X + 0] );
1084 store[X + 1] = sinf( store[X + 1] );
1085 store[X + 2] = sinf( store[X + 2] );
1086 store[X + 3] = sinf( store[X + 3] );
1087 #endif
1088 }
1089
1090 static void
1091 emit_sin (struct x86_function *func,
1092 unsigned xmm_dst)
1093 {
1094 emit_func_call_dst(
1095 func,
1096 xmm_dst,
1097 sin4f );
1098 }
1099
1100 static void
1101 emit_sub(
1102 struct x86_function *func,
1103 unsigned xmm_dst,
1104 unsigned xmm_src )
1105 {
1106 emit_subps(
1107 func,
1108 make_xmm( xmm_dst ),
1109 make_xmm( xmm_src ) );
1110 }
1111
1112 /**
1113 * Register fetch.
1114 */
1115
1116 static void
1117 emit_fetch(
1118 struct x86_function *func,
1119 unsigned xmm,
1120 const struct tgsi_full_src_register *reg,
1121 const unsigned chan_index )
1122 {
1123 unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index );
1124
1125 switch( swizzle ) {
1126 case TGSI_EXTSWIZZLE_X:
1127 case TGSI_EXTSWIZZLE_Y:
1128 case TGSI_EXTSWIZZLE_Z:
1129 case TGSI_EXTSWIZZLE_W:
1130 switch( reg->SrcRegister.File ) {
1131 case TGSI_FILE_CONSTANT:
1132 emit_const(
1133 func,
1134 xmm,
1135 reg->SrcRegister.Index,
1136 swizzle );
1137 break;
1138
1139 case TGSI_FILE_INPUT:
1140 emit_inputf(
1141 func,
1142 xmm,
1143 reg->SrcRegister.Index,
1144 swizzle );
1145 break;
1146
1147 case TGSI_FILE_TEMPORARY:
1148 emit_tempf(
1149 func,
1150 xmm,
1151 reg->SrcRegister.Index,
1152 swizzle );
1153 break;
1154
1155 default:
1156 assert( 0 );
1157 }
1158 break;
1159
1160 case TGSI_EXTSWIZZLE_ZERO:
1161 emit_tempf(
1162 func,
1163 xmm,
1164 TGSI_EXEC_TEMP_00000000_I,
1165 TGSI_EXEC_TEMP_00000000_C );
1166 break;
1167
1168 case TGSI_EXTSWIZZLE_ONE:
1169 emit_tempf(
1170 func,
1171 xmm,
1172 TGSI_EXEC_TEMP_ONE_I,
1173 TGSI_EXEC_TEMP_ONE_C );
1174 break;
1175
1176 default:
1177 assert( 0 );
1178 }
1179
1180 switch( tgsi_util_get_full_src_register_sign_mode( reg, chan_index ) ) {
1181 case TGSI_UTIL_SIGN_CLEAR:
1182 emit_abs( func, xmm );
1183 break;
1184
1185 case TGSI_UTIL_SIGN_SET:
1186 emit_setsign( func, xmm );
1187 break;
1188
1189 case TGSI_UTIL_SIGN_TOGGLE:
1190 emit_neg( func, xmm );
1191 break;
1192
1193 case TGSI_UTIL_SIGN_KEEP:
1194 break;
1195 }
1196 }
1197
1198 #define FETCH( FUNC, INST, XMM, INDEX, CHAN )\
1199 emit_fetch( FUNC, XMM, &(INST).FullSrcRegisters[INDEX], CHAN )
1200
1201 /**
1202 * Register store.
1203 */
1204
1205 static void
1206 emit_store(
1207 struct x86_function *func,
1208 unsigned xmm,
1209 const struct tgsi_full_dst_register *reg,
1210 const struct tgsi_full_instruction *inst,
1211 unsigned chan_index )
1212 {
1213 switch( reg->DstRegister.File ) {
1214 case TGSI_FILE_OUTPUT:
1215 emit_output(
1216 func,
1217 xmm,
1218 reg->DstRegister.Index,
1219 chan_index );
1220 break;
1221
1222 case TGSI_FILE_TEMPORARY:
1223 emit_temps(
1224 func,
1225 xmm,
1226 reg->DstRegister.Index,
1227 chan_index );
1228 break;
1229
1230 case TGSI_FILE_ADDRESS:
1231 emit_addrs(
1232 func,
1233 xmm,
1234 reg->DstRegister.Index,
1235 chan_index );
1236 break;
1237
1238 default:
1239 assert( 0 );
1240 }
1241
1242 switch( inst->Instruction.Saturate ) {
1243 case TGSI_SAT_NONE:
1244 break;
1245
1246 case TGSI_SAT_ZERO_ONE:
1247 // assert( 0 );
1248 break;
1249
1250 case TGSI_SAT_MINUS_PLUS_ONE:
1251 assert( 0 );
1252 break;
1253 }
1254 }
1255
1256 #define STORE( FUNC, INST, XMM, INDEX, CHAN )\
1257 emit_store( FUNC, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN )
1258
1259 /**
1260 * High-level instruction translators.
1261 */
1262
1263 static void
1264 emit_kil(
1265 struct x86_function *func,
1266 const struct tgsi_full_src_register *reg )
1267 {
1268 unsigned uniquemask;
1269 unsigned registers[4];
1270 unsigned nextregister = 0;
1271 unsigned firstchan = ~0;
1272 unsigned chan_index;
1273
1274 /* This mask stores component bits that were already tested. Note that
1275 * we test if the value is less than zero, so 1.0 and 0.0 need not to be
1276 * tested. */
1277 uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE);
1278
1279 FOR_EACH_CHANNEL( chan_index ) {
1280 unsigned swizzle;
1281
1282 /* unswizzle channel */
1283 swizzle = tgsi_util_get_full_src_register_extswizzle(
1284 reg,
1285 chan_index );
1286
1287 /* check if the component has not been already tested */
1288 if( !(uniquemask & (1 << swizzle)) ) {
1289 uniquemask |= 1 << swizzle;
1290
1291 /* allocate register */
1292 registers[chan_index] = nextregister;
1293 emit_fetch(
1294 func,
1295 nextregister,
1296 reg,
1297 chan_index );
1298 nextregister++;
1299
1300 /* mark the first channel used */
1301 if( firstchan == ~0 ) {
1302 firstchan = chan_index;
1303 }
1304 }
1305 }
1306
1307 emit_push(
1308 func,
1309 x86_make_reg( file_REG32, reg_AX ) );
1310 emit_push(
1311 func,
1312 x86_make_reg( file_REG32, reg_DX ) );
1313
1314 FOR_EACH_CHANNEL( chan_index ) {
1315 if( uniquemask & (1 << chan_index) ) {
1316 emit_cmpps(
1317 func,
1318 make_xmm( registers[chan_index] ),
1319 get_temp(
1320 TGSI_EXEC_TEMP_00000000_I,
1321 TGSI_EXEC_TEMP_00000000_C ),
1322 cc_LessThan );
1323
1324 if( chan_index == firstchan ) {
1325 emit_pmovmskb(
1326 func,
1327 x86_make_reg( file_REG32, reg_AX ),
1328 make_xmm( registers[chan_index] ) );
1329 }
1330 else {
1331 emit_pmovmskb(
1332 func,
1333 x86_make_reg( file_REG32, reg_DX ),
1334 make_xmm( registers[chan_index] ) );
1335 emit_or(
1336 func,
1337 x86_make_reg( file_REG32, reg_AX ),
1338 x86_make_reg( file_REG32, reg_DX ) );
1339 }
1340 }
1341 }
1342
1343 emit_or(
1344 func,
1345 get_temp(
1346 TGSI_EXEC_TEMP_KILMASK_I,
1347 TGSI_EXEC_TEMP_KILMASK_C ),
1348 x86_make_reg( file_REG32, reg_AX ) );
1349
1350 emit_pop(
1351 func,
1352 x86_make_reg( file_REG32, reg_DX ) );
1353 emit_pop(
1354 func,
1355 x86_make_reg( file_REG32, reg_AX ) );
1356 }
1357
1358 static void
1359 emit_setcc(
1360 struct x86_function *func,
1361 struct tgsi_full_instruction *inst,
1362 enum sse_cc cc )
1363 {
1364 unsigned chan_index;
1365
1366 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1367 FETCH( func, *inst, 0, 0, chan_index );
1368 FETCH( func, *inst, 1, 1, chan_index );
1369 emit_cmpps(
1370 func,
1371 make_xmm( 0 ),
1372 make_xmm( 1 ),
1373 cc );
1374 emit_andps(
1375 func,
1376 make_xmm( 0 ),
1377 get_temp(
1378 TGSI_EXEC_TEMP_ONE_I,
1379 TGSI_EXEC_TEMP_ONE_C ) );
1380 STORE( func, *inst, 0, 0, chan_index );
1381 }
1382 }
1383
1384 static void
1385 emit_cmp(
1386 struct x86_function *func,
1387 struct tgsi_full_instruction *inst )
1388 {
1389 unsigned chan_index;
1390
1391 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1392 FETCH( func, *inst, 0, 0, chan_index );
1393 FETCH( func, *inst, 1, 1, chan_index );
1394 FETCH( func, *inst, 2, 2, chan_index );
1395 emit_cmpps(
1396 func,
1397 make_xmm( 0 ),
1398 get_temp(
1399 TGSI_EXEC_TEMP_00000000_I,
1400 TGSI_EXEC_TEMP_00000000_C ),
1401 cc_LessThan );
1402 emit_andps(
1403 func,
1404 make_xmm( 1 ),
1405 make_xmm( 0 ) );
1406 emit_andnps(
1407 func,
1408 make_xmm( 0 ),
1409 make_xmm( 2 ) );
1410 emit_orps(
1411 func,
1412 make_xmm( 0 ),
1413 make_xmm( 1 ) );
1414 STORE( func, *inst, 0, 0, chan_index );
1415 }
1416 }
1417
1418 static void
1419 emit_instruction(
1420 struct x86_function *func,
1421 struct tgsi_full_instruction *inst )
1422 {
1423 unsigned chan_index;
1424
1425 switch( inst->Instruction.Opcode ) {
1426 case TGSI_OPCODE_ARL:
1427 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1428 FETCH( func, *inst, 0, 0, chan_index );
1429 emit_f2it( func, 0 );
1430 STORE( func, *inst, 0, 0, chan_index );
1431 }
1432 break;
1433
1434 case TGSI_OPCODE_MOV:
1435 /* TGSI_OPCODE_SWZ */
1436 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1437 FETCH( func, *inst, 0, 0, chan_index );
1438 STORE( func, *inst, 0, 0, chan_index );
1439 }
1440 break;
1441
1442 case TGSI_OPCODE_LIT:
1443 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1444 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1445 emit_tempf(
1446 func,
1447 0,
1448 TGSI_EXEC_TEMP_ONE_I,
1449 TGSI_EXEC_TEMP_ONE_C);
1450 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
1451 STORE( func, *inst, 0, 0, CHAN_X );
1452 }
1453 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1454 STORE( func, *inst, 0, 0, CHAN_W );
1455 }
1456 }
1457 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1458 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1459 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
1460 FETCH( func, *inst, 0, 0, CHAN_X );
1461 emit_maxps(
1462 func,
1463 make_xmm( 0 ),
1464 get_temp(
1465 TGSI_EXEC_TEMP_00000000_I,
1466 TGSI_EXEC_TEMP_00000000_C ) );
1467 STORE( func, *inst, 0, 0, CHAN_Y );
1468 }
1469 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1470 FETCH( func, *inst, 1, 0, CHAN_Y );
1471 emit_maxps(
1472 func,
1473 make_xmm( 1 ),
1474 get_temp(
1475 TGSI_EXEC_TEMP_00000000_I,
1476 TGSI_EXEC_TEMP_00000000_C ) );
1477 FETCH( func, *inst, 2, 0, CHAN_W );
1478 emit_minps(
1479 func,
1480 make_xmm( 2 ),
1481 get_temp(
1482 TGSI_EXEC_TEMP_128_I,
1483 TGSI_EXEC_TEMP_128_C ) );
1484 emit_maxps(
1485 func,
1486 make_xmm( 2 ),
1487 get_temp(
1488 TGSI_EXEC_TEMP_MINUS_128_I,
1489 TGSI_EXEC_TEMP_MINUS_128_C ) );
1490 emit_pow( func, 1, 2 );
1491 FETCH( func, *inst, 0, 0, CHAN_X );
1492 emit_xorps(
1493 func,
1494 make_xmm( 2 ),
1495 make_xmm( 2 ) );
1496 emit_cmpps(
1497 func,
1498 make_xmm( 2 ),
1499 make_xmm( 0 ),
1500 cc_LessThanEqual );
1501 emit_andps(
1502 func,
1503 make_xmm( 2 ),
1504 make_xmm( 1 ) );
1505 STORE( func, *inst, 2, 0, CHAN_Z );
1506 }
1507 }
1508 break;
1509
1510 case TGSI_OPCODE_RCP:
1511 /* TGSI_OPCODE_RECIP */
1512 FETCH( func, *inst, 0, 0, CHAN_X );
1513 emit_rcp( func, 0, 0 );
1514 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1515 STORE( func, *inst, 0, 0, chan_index );
1516 }
1517 break;
1518
1519 case TGSI_OPCODE_RSQ:
1520 /* TGSI_OPCODE_RECIPSQRT */
1521 FETCH( func, *inst, 0, 0, CHAN_X );
1522 emit_rsqrt( func, 0, 0 );
1523 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1524 STORE( func, *inst, 0, 0, chan_index );
1525 }
1526 break;
1527
1528 case TGSI_OPCODE_EXP:
1529 assert( 0 );
1530 break;
1531
1532 case TGSI_OPCODE_LOG:
1533 assert( 0 );
1534 break;
1535
1536 case TGSI_OPCODE_MUL:
1537 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1538 FETCH( func, *inst, 0, 0, chan_index );
1539 FETCH( func, *inst, 1, 1, chan_index );
1540 emit_mul( func, 0, 1 );
1541 STORE( func, *inst, 0, 0, chan_index );
1542 }
1543 break;
1544
1545 case TGSI_OPCODE_ADD:
1546 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1547 FETCH( func, *inst, 0, 0, chan_index );
1548 FETCH( func, *inst, 1, 1, chan_index );
1549 emit_add( func, 0, 1 );
1550 STORE( func, *inst, 0, 0, chan_index );
1551 }
1552 break;
1553
1554 case TGSI_OPCODE_DP3:
1555 /* TGSI_OPCODE_DOT3 */
1556 FETCH( func, *inst, 0, 0, CHAN_X );
1557 FETCH( func, *inst, 1, 1, CHAN_X );
1558 emit_mul( func, 0, 1 );
1559 FETCH( func, *inst, 1, 0, CHAN_Y );
1560 FETCH( func, *inst, 2, 1, CHAN_Y );
1561 emit_mul( func, 1, 2 );
1562 emit_add( func, 0, 1 );
1563 FETCH( func, *inst, 1, 0, CHAN_Z );
1564 FETCH( func, *inst, 2, 1, CHAN_Z );
1565 emit_mul( func, 1, 2 );
1566 emit_add( func, 0, 1 );
1567 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1568 STORE( func, *inst, 0, 0, chan_index );
1569 }
1570 break;
1571
1572 case TGSI_OPCODE_DP4:
1573 /* TGSI_OPCODE_DOT4 */
1574 FETCH( func, *inst, 0, 0, CHAN_X );
1575 FETCH( func, *inst, 1, 1, CHAN_X );
1576 emit_mul( func, 0, 1 );
1577 FETCH( func, *inst, 1, 0, CHAN_Y );
1578 FETCH( func, *inst, 2, 1, CHAN_Y );
1579 emit_mul( func, 1, 2 );
1580 emit_add( func, 0, 1 );
1581 FETCH( func, *inst, 1, 0, CHAN_Z );
1582 FETCH( func, *inst, 2, 1, CHAN_Z );
1583 emit_mul(func, 1, 2 );
1584 emit_add(func, 0, 1 );
1585 FETCH( func, *inst, 1, 0, CHAN_W );
1586 FETCH( func, *inst, 2, 1, CHAN_W );
1587 emit_mul( func, 1, 2 );
1588 emit_add( func, 0, 1 );
1589 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1590 STORE( func, *inst, 0, 0, chan_index );
1591 }
1592 break;
1593
1594 case TGSI_OPCODE_DST:
1595 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
1596 emit_tempf(
1597 func,
1598 0,
1599 TGSI_EXEC_TEMP_ONE_I,
1600 TGSI_EXEC_TEMP_ONE_C );
1601 STORE( func, *inst, 0, 0, CHAN_X );
1602 }
1603 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
1604 FETCH( func, *inst, 0, 0, CHAN_Y );
1605 FETCH( func, *inst, 1, 1, CHAN_Y );
1606 emit_mul( func, 0, 1 );
1607 STORE( func, *inst, 0, 0, CHAN_Y );
1608 }
1609 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
1610 FETCH( func, *inst, 0, 0, CHAN_Z );
1611 STORE( func, *inst, 0, 0, CHAN_Z );
1612 }
1613 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
1614 FETCH( func, *inst, 0, 1, CHAN_W );
1615 STORE( func, *inst, 0, 0, CHAN_W );
1616 }
1617 break;
1618
1619 case TGSI_OPCODE_MIN:
1620 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1621 FETCH( func, *inst, 0, 0, chan_index );
1622 FETCH( func, *inst, 1, 1, chan_index );
1623 emit_minps(
1624 func,
1625 make_xmm( 0 ),
1626 make_xmm( 1 ) );
1627 STORE( func, *inst, 0, 0, chan_index );
1628 }
1629 break;
1630
1631 case TGSI_OPCODE_MAX:
1632 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1633 FETCH( func, *inst, 0, 0, chan_index );
1634 FETCH( func, *inst, 1, 1, chan_index );
1635 emit_maxps(
1636 func,
1637 make_xmm( 0 ),
1638 make_xmm( 1 ) );
1639 STORE( func, *inst, 0, 0, chan_index );
1640 }
1641 break;
1642
1643 case TGSI_OPCODE_SLT:
1644 /* TGSI_OPCODE_SETLT */
1645 emit_setcc( func, inst, cc_LessThan );
1646 break;
1647
1648 case TGSI_OPCODE_SGE:
1649 /* TGSI_OPCODE_SETGE */
1650 emit_setcc( func, inst, cc_NotLessThan );
1651 break;
1652
1653 case TGSI_OPCODE_MAD:
1654 /* TGSI_OPCODE_MADD */
1655 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1656 FETCH( func, *inst, 0, 0, chan_index );
1657 FETCH( func, *inst, 1, 1, chan_index );
1658 FETCH( func, *inst, 2, 2, chan_index );
1659 emit_mul( func, 0, 1 );
1660 emit_add( func, 0, 2 );
1661 STORE( func, *inst, 0, 0, chan_index );
1662 }
1663 break;
1664
1665 case TGSI_OPCODE_SUB:
1666 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1667 FETCH( func, *inst, 0, 0, chan_index );
1668 FETCH( func, *inst, 1, 1, chan_index );
1669 emit_sub( func, 0, 1 );
1670 STORE( func, *inst, 0, 0, chan_index );
1671 }
1672 break;
1673
1674 case TGSI_OPCODE_LERP:
1675 /* TGSI_OPCODE_LRP */
1676 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1677 FETCH( func, *inst, 0, 0, chan_index );
1678 FETCH( func, *inst, 1, 1, chan_index );
1679 FETCH( func, *inst, 2, 2, chan_index );
1680 emit_sub( func, 1, 2 );
1681 emit_mul( func, 0, 1 );
1682 emit_add( func, 0, 2 );
1683 STORE( func, *inst, 0, 0, chan_index );
1684 }
1685 break;
1686
1687 case TGSI_OPCODE_CND:
1688 assert( 0 );
1689 break;
1690
1691 case TGSI_OPCODE_CND0:
1692 assert( 0 );
1693 break;
1694
1695 case TGSI_OPCODE_DOT2ADD:
1696 /* TGSI_OPCODE_DP2A */
1697 assert( 0 );
1698 break;
1699
1700 case TGSI_OPCODE_INDEX:
1701 assert( 0 );
1702 break;
1703
1704 case TGSI_OPCODE_NEGATE:
1705 assert( 0 );
1706 break;
1707
1708 case TGSI_OPCODE_FRAC:
1709 /* TGSI_OPCODE_FRC */
1710 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1711 FETCH( func, *inst, 0, 0, chan_index );
1712 emit_frc( func, 0 );
1713 STORE( func, *inst, 0, 0, chan_index );
1714 }
1715 break;
1716
1717 case TGSI_OPCODE_CLAMP:
1718 assert( 0 );
1719 break;
1720
1721 case TGSI_OPCODE_FLOOR:
1722 /* TGSI_OPCODE_FLR */
1723 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1724 FETCH( func, *inst, 0, 0, chan_index );
1725 emit_flr( func, 0 );
1726 STORE( func, *inst, 0, 0, chan_index );
1727 }
1728 break;
1729
1730 case TGSI_OPCODE_ROUND:
1731 assert( 0 );
1732 break;
1733
1734 case TGSI_OPCODE_EXPBASE2:
1735 /* TGSI_OPCODE_EX2 */
1736 FETCH( func, *inst, 0, 0, CHAN_X );
1737 emit_ex2( func, 0 );
1738 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1739 STORE( func, *inst, 0, 0, chan_index );
1740 }
1741 break;
1742
1743 case TGSI_OPCODE_LOGBASE2:
1744 /* TGSI_OPCODE_LG2 */
1745 FETCH( func, *inst, 0, 0, CHAN_X );
1746 emit_lg2( func, 0 );
1747 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1748 STORE( func, *inst, 0, 0, chan_index );
1749 }
1750 break;
1751
1752 case TGSI_OPCODE_POWER:
1753 /* TGSI_OPCODE_POW */
1754 FETCH( func, *inst, 0, 0, CHAN_X );
1755 FETCH( func, *inst, 1, 1, CHAN_X );
1756 emit_pow( func, 0, 1 );
1757 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1758 STORE( func, *inst, 0, 0, chan_index );
1759 }
1760 break;
1761
1762 case TGSI_OPCODE_CROSSPRODUCT:
1763 /* TGSI_OPCODE_XPD */
1764 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1765 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
1766 FETCH( func, *inst, 1, 1, CHAN_Z );
1767 FETCH( func, *inst, 3, 0, CHAN_Z );
1768 }
1769 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1770 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1771 FETCH( func, *inst, 0, 0, CHAN_Y );
1772 FETCH( func, *inst, 4, 1, CHAN_Y );
1773 }
1774 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
1775 emit_MOV( func, 2, 0 );
1776 emit_mul( func, 2, 1 );
1777 emit_MOV( func, 5, 3 );
1778 emit_mul( func, 5, 4 );
1779 emit_sub( func, 2, 5 );
1780 STORE( func, *inst, 2, 0, CHAN_X );
1781 }
1782 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1783 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1784 FETCH( func, *inst, 2, 1, CHAN_X );
1785 FETCH( func, *inst, 5, 0, CHAN_X );
1786 }
1787 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
1788 emit_mul( func, 3, 2 );
1789 emit_mul( func, 1, 5 );
1790 emit_sub( func, 3, 1 );
1791 STORE( func, *inst, 3, 0, CHAN_Y );
1792 }
1793 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
1794 emit_mul( func, 5, 4 );
1795 emit_mul( func, 0, 2 );
1796 emit_sub( func, 5, 0 );
1797 STORE( func, *inst, 5, 0, CHAN_Z );
1798 }
1799 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
1800 FETCH( func, *inst, 0, TGSI_EXEC_TEMP_ONE_I, TGSI_EXEC_TEMP_ONE_C );
1801 STORE( func, *inst, 0, 0, CHAN_W );
1802 }
1803 break;
1804
1805 case TGSI_OPCODE_MULTIPLYMATRIX:
1806 assert( 0 );
1807 break;
1808
1809 case TGSI_OPCODE_ABS:
1810 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1811 FETCH( func, *inst, 0, 0, chan_index );
1812 emit_abs( func, 0) ;
1813
1814 STORE( func, *inst, 0, 0, chan_index );
1815 }
1816 break;
1817
1818 case TGSI_OPCODE_RCC:
1819 assert( 0 );
1820 break;
1821
1822 case TGSI_OPCODE_DPH:
1823 FETCH( func, *inst, 0, 0, CHAN_X );
1824 FETCH( func, *inst, 1, 1, CHAN_X );
1825 emit_mul( func, 0, 1 );
1826 FETCH( func, *inst, 1, 0, CHAN_Y );
1827 FETCH( func, *inst, 2, 1, CHAN_Y );
1828 emit_mul( func, 1, 2 );
1829 emit_add( func, 0, 1 );
1830 FETCH( func, *inst, 1, 0, CHAN_Z );
1831 FETCH( func, *inst, 2, 1, CHAN_Z );
1832 emit_mul( func, 1, 2 );
1833 emit_add( func, 0, 1 );
1834 FETCH( func, *inst, 1, 1, CHAN_W );
1835 emit_add( func, 0, 1 );
1836 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1837 STORE( func, *inst, 0, 0, chan_index );
1838 }
1839 break;
1840
1841 case TGSI_OPCODE_COS:
1842 FETCH( func, *inst, 0, 0, CHAN_X );
1843 emit_cos( func, 0 );
1844 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1845 STORE( func, *inst, 0, 0, chan_index );
1846 }
1847 break;
1848
1849 case TGSI_OPCODE_DDX:
1850 assert( 0 );
1851 break;
1852
1853 case TGSI_OPCODE_DDY:
1854 assert( 0 );
1855 break;
1856
1857 case TGSI_OPCODE_KIL:
1858 emit_kil( func, &inst->FullSrcRegisters[0] );
1859 break;
1860
1861 case TGSI_OPCODE_PK2H:
1862 assert( 0 );
1863 break;
1864
1865 case TGSI_OPCODE_PK2US:
1866 assert( 0 );
1867 break;
1868
1869 case TGSI_OPCODE_PK4B:
1870 assert( 0 );
1871 break;
1872
1873 case TGSI_OPCODE_PK4UB:
1874 assert( 0 );
1875 break;
1876
1877 case TGSI_OPCODE_RFL:
1878 assert( 0 );
1879 break;
1880
1881 case TGSI_OPCODE_SEQ:
1882 assert( 0 );
1883 break;
1884
1885 case TGSI_OPCODE_SFL:
1886 assert( 0 );
1887 break;
1888
1889 case TGSI_OPCODE_SGT:
1890 assert( 0 );
1891 break;
1892
1893 case TGSI_OPCODE_SIN:
1894 FETCH( func, *inst, 0, 0, CHAN_X );
1895 emit_sin( func, 0 );
1896 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1897 STORE( func, *inst, 0, 0, chan_index );
1898 }
1899 break;
1900
1901 case TGSI_OPCODE_SLE:
1902 assert( 0 );
1903 break;
1904
1905 case TGSI_OPCODE_SNE:
1906 assert( 0 );
1907 break;
1908
1909 case TGSI_OPCODE_STR:
1910 assert( 0 );
1911 break;
1912
1913 case TGSI_OPCODE_TEX:
1914 emit_tempf(
1915 func,
1916 0,
1917 TGSI_EXEC_TEMP_ONE_I,
1918 TGSI_EXEC_TEMP_ONE_C );
1919 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1920 STORE( func, *inst, 0, 0, chan_index );
1921 }
1922 break;
1923
1924 case TGSI_OPCODE_TXD:
1925 assert( 0 );
1926 break;
1927
1928 case TGSI_OPCODE_UP2H:
1929 assert( 0 );
1930 break;
1931
1932 case TGSI_OPCODE_UP2US:
1933 assert( 0 );
1934 break;
1935
1936 case TGSI_OPCODE_UP4B:
1937 assert( 0 );
1938 break;
1939
1940 case TGSI_OPCODE_UP4UB:
1941 assert( 0 );
1942 break;
1943
1944 case TGSI_OPCODE_X2D:
1945 assert( 0 );
1946 break;
1947
1948 case TGSI_OPCODE_ARA:
1949 assert( 0 );
1950 break;
1951
1952 case TGSI_OPCODE_ARR:
1953 assert( 0 );
1954 break;
1955
1956 case TGSI_OPCODE_BRA:
1957 assert( 0 );
1958 break;
1959
1960 case TGSI_OPCODE_CAL:
1961 assert( 0 );
1962 break;
1963
1964 case TGSI_OPCODE_RET:
1965 #ifdef WIN32
1966 emit_retw( func, 16 );
1967 #else
1968 emit_ret( func );
1969 #endif
1970 break;
1971
1972 case TGSI_OPCODE_SSG:
1973 assert( 0 );
1974 break;
1975
1976 case TGSI_OPCODE_CMP:
1977 emit_cmp (func, inst);
1978 break;
1979
1980 case TGSI_OPCODE_SCS:
1981 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
1982 FETCH( func, *inst, 0, 0, CHAN_X );
1983 emit_cos( func, 0 );
1984 STORE( func, *inst, 0, 0, CHAN_X );
1985 }
1986 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
1987 FETCH( func, *inst, 0, 0, CHAN_Y );
1988 emit_sin( func, 0 );
1989 STORE( func, *inst, 0, 0, CHAN_Y );
1990 }
1991 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
1992 FETCH( func, *inst, 0, TGSI_EXEC_TEMP_00000000_I, TGSI_EXEC_TEMP_00000000_C );
1993 STORE( func, *inst, 0, 0, CHAN_Z );
1994 }
1995 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
1996 FETCH( func, *inst, 0, TGSI_EXEC_TEMP_ONE_I, TGSI_EXEC_TEMP_ONE_C );
1997 STORE( func, *inst, 0, 0, CHAN_W );
1998 }
1999 break;
2000
2001 case TGSI_OPCODE_TXB:
2002 assert( 0 );
2003 break;
2004
2005 case TGSI_OPCODE_NRM:
2006 assert( 0 );
2007 break;
2008
2009 case TGSI_OPCODE_DIV:
2010 assert( 0 );
2011 break;
2012
2013 case TGSI_OPCODE_DP2:
2014 assert( 0 );
2015 break;
2016
2017 case TGSI_OPCODE_TXL:
2018 assert( 0 );
2019 break;
2020
2021 case TGSI_OPCODE_BRK:
2022 assert( 0 );
2023 break;
2024
2025 case TGSI_OPCODE_IF:
2026 assert( 0 );
2027 break;
2028
2029 case TGSI_OPCODE_LOOP:
2030 assert( 0 );
2031 break;
2032
2033 case TGSI_OPCODE_REP:
2034 assert( 0 );
2035 break;
2036
2037 case TGSI_OPCODE_ELSE:
2038 assert( 0 );
2039 break;
2040
2041 case TGSI_OPCODE_ENDIF:
2042 assert( 0 );
2043 break;
2044
2045 case TGSI_OPCODE_ENDLOOP:
2046 assert( 0 );
2047 break;
2048
2049 case TGSI_OPCODE_ENDREP:
2050 assert( 0 );
2051 break;
2052
2053 case TGSI_OPCODE_PUSHA:
2054 assert( 0 );
2055 break;
2056
2057 case TGSI_OPCODE_POPA:
2058 assert( 0 );
2059 break;
2060
2061 case TGSI_OPCODE_CEIL:
2062 assert( 0 );
2063 break;
2064
2065 case TGSI_OPCODE_I2F:
2066 assert( 0 );
2067 break;
2068
2069 case TGSI_OPCODE_NOT:
2070 assert( 0 );
2071 break;
2072
2073 case TGSI_OPCODE_TRUNC:
2074 assert( 0 );
2075 break;
2076
2077 case TGSI_OPCODE_SHL:
2078 assert( 0 );
2079 break;
2080
2081 case TGSI_OPCODE_SHR:
2082 assert( 0 );
2083 break;
2084
2085 case TGSI_OPCODE_AND:
2086 assert( 0 );
2087 break;
2088
2089 case TGSI_OPCODE_OR:
2090 assert( 0 );
2091 break;
2092
2093 case TGSI_OPCODE_MOD:
2094 assert( 0 );
2095 break;
2096
2097 case TGSI_OPCODE_XOR:
2098 assert( 0 );
2099 break;
2100
2101 case TGSI_OPCODE_SAD:
2102 assert( 0 );
2103 break;
2104
2105 case TGSI_OPCODE_TXF:
2106 assert( 0 );
2107 break;
2108
2109 case TGSI_OPCODE_TXQ:
2110 assert( 0 );
2111 break;
2112
2113 case TGSI_OPCODE_CONT:
2114 assert( 0 );
2115 break;
2116
2117 case TGSI_OPCODE_EMIT:
2118 assert( 0 );
2119 break;
2120
2121 case TGSI_OPCODE_ENDPRIM:
2122 assert( 0 );
2123 break;
2124
2125 default:
2126 assert( 0 );
2127 }
2128 }
2129
2130 static void
2131 emit_declaration(
2132 struct x86_function *func,
2133 struct tgsi_full_declaration *decl )
2134 {
2135 if( decl->Declaration.File == TGSI_FILE_INPUT ) {
2136 unsigned first, last, mask;
2137 unsigned i, j;
2138
2139 assert( decl->Declaration.Declare == TGSI_DECLARE_RANGE );
2140
2141 first = decl->u.DeclarationRange.First;
2142 last = decl->u.DeclarationRange.Last;
2143 mask = decl->Declaration.UsageMask;
2144
2145 /* Do not touch WPOS.xy */
2146 if( first == 0 ) {
2147 mask &= ~TGSI_WRITEMASK_XY;
2148 if( mask == TGSI_WRITEMASK_NONE ) {
2149 first++;
2150 }
2151 }
2152
2153 for( i = first; i <= last; i++ ) {
2154 for( j = 0; j < NUM_CHANNELS; j++ ) {
2155 if( mask & (1 << j) ) {
2156 switch( decl->Interpolation.Interpolate ) {
2157 case TGSI_INTERPOLATE_CONSTANT:
2158 emit_coef_a0( func, 0, i, j );
2159 emit_inputs( func, 0, i, j );
2160 break;
2161
2162 case TGSI_INTERPOLATE_LINEAR:
2163 emit_inputf( func, 0, 0, TGSI_SWIZZLE_X );
2164 emit_coef_dadx( func, 1, i, j );
2165 emit_inputf( func, 2, 0, TGSI_SWIZZLE_Y );
2166 emit_coef_dady( func, 3, i, j );
2167 emit_mul( func, 0, 1 ); /* x * dadx */
2168 emit_coef_a0( func, 4, i, j );
2169 emit_mul( func, 2, 3 ); /* y * dady */
2170 emit_add( func, 0, 4 ); /* x * dadx + a0 */
2171 emit_add( func, 0, 2 ); /* x * dadx + y * dady + a0 */
2172 emit_inputs( func, 0, i, j );
2173 break;
2174
2175 case TGSI_INTERPOLATE_PERSPECTIVE:
2176 emit_inputf( func, 0, 0, TGSI_SWIZZLE_X );
2177 emit_coef_dadx( func, 1, i, j );
2178 emit_inputf( func, 2, 0, TGSI_SWIZZLE_Y );
2179 emit_coef_dady( func, 3, i, j );
2180 emit_mul( func, 0, 1 ); /* x * dadx */
2181 emit_inputf( func, 4, 0, TGSI_SWIZZLE_W );
2182 emit_coef_a0( func, 5, i, j );
2183 emit_rcp( func, 4, 4 ); /* 1.0 / w */
2184 emit_mul( func, 2, 3 ); /* y * dady */
2185 emit_add( func, 0, 5 ); /* x * dadx + a0 */
2186 emit_add( func, 0, 2 ); /* x * dadx + y * dady + a0 */
2187 emit_mul( func, 0, 4 ); /* (x * dadx + y * dady + a0) / w */
2188 emit_inputs( func, 0, i, j );
2189 break;
2190
2191 default:
2192 assert( 0 );
2193 }
2194 }
2195 }
2196 }
2197 }
2198 }
2199
2200 unsigned
2201 tgsi_emit_sse2(
2202 struct tgsi_token *tokens,
2203 struct x86_function *func )
2204 {
2205 struct tgsi_parse_context parse;
2206
2207 DUMP_START();
2208
2209 func->csr = func->store;
2210
2211 emit_mov(
2212 func,
2213 get_input_base(),
2214 get_argument( 0 ) );
2215 emit_mov(
2216 func,
2217 get_output_base(),
2218 get_argument( 1 ) );
2219 emit_mov(
2220 func,
2221 get_const_base(),
2222 get_argument( 2 ) );
2223 emit_mov(
2224 func,
2225 get_temp_base(),
2226 get_argument( 3 ) );
2227
2228 tgsi_parse_init( &parse, tokens );
2229
2230 while( !tgsi_parse_end_of_tokens( &parse ) ) {
2231 tgsi_parse_token( &parse );
2232
2233 switch( parse.FullToken.Token.Type ) {
2234 case TGSI_TOKEN_TYPE_DECLARATION:
2235 break;
2236
2237 case TGSI_TOKEN_TYPE_INSTRUCTION:
2238 emit_instruction(
2239 func,
2240 &parse.FullToken.FullInstruction );
2241 break;
2242
2243 default:
2244 assert( 0 );
2245 }
2246 }
2247
2248 tgsi_parse_free( &parse );
2249
2250 DUMP_END();
2251
2252 return 1;
2253 }
2254
2255 /**
2256 * Fragment shaders are responsible for interpolating shader inputs. Because on
2257 * x86 we have only 4 GP registers, and here we have 5 shader arguments (input,
2258 * output, const, temp and coef), the code is split into two phases --
2259 * DECLARATION and INSTRUCTION phase.
2260 * GP register holding the output argument is aliased with the coeff argument,
2261 * as outputs are not needed in the DECLARATION phase.
2262 */
2263 unsigned
2264 tgsi_emit_sse2_fs(
2265 struct tgsi_token *tokens,
2266 struct x86_function *func )
2267 {
2268 struct tgsi_parse_context parse;
2269 boolean instruction_phase = FALSE;
2270
2271 DUMP_START();
2272
2273 func->csr = func->store;
2274
2275 /* DECLARATION phase, do not load output argument. */
2276 emit_mov(
2277 func,
2278 get_input_base(),
2279 get_argument( 0 ) );
2280 emit_mov(
2281 func,
2282 get_const_base(),
2283 get_argument( 2 ) );
2284 emit_mov(
2285 func,
2286 get_temp_base(),
2287 get_argument( 3 ) );
2288 emit_mov(
2289 func,
2290 get_coef_base(),
2291 get_argument( 4 ) );
2292
2293 tgsi_parse_init( &parse, tokens );
2294
2295 while( !tgsi_parse_end_of_tokens( &parse ) ) {
2296 tgsi_parse_token( &parse );
2297
2298 switch( parse.FullToken.Token.Type ) {
2299 case TGSI_TOKEN_TYPE_DECLARATION:
2300 emit_declaration(
2301 func,
2302 &parse.FullToken.FullDeclaration );
2303 break;
2304
2305 case TGSI_TOKEN_TYPE_INSTRUCTION:
2306 if( !instruction_phase ) {
2307 /* INSTRUCTION phase, overwrite coeff with output. */
2308 instruction_phase = TRUE;
2309 emit_mov(
2310 func,
2311 get_output_base(),
2312 get_argument( 1 ) );
2313 }
2314 emit_instruction(
2315 func,
2316 &parse.FullToken.FullInstruction );
2317 break;
2318
2319 default:
2320 assert( 0 );
2321 }
2322 }
2323
2324 tgsi_parse_free( &parse );
2325
2326 DUMP_END();
2327
2328 return 1;
2329 }
2330
2331 #endif /* i386 */