f8660e7ad14b198d93234f3b777ff426da55703a
[mesa.git] / src / mesa / pipe / tgsi / exec / tgsi_sse2.c
1 /**************************************************************************
2 *
3 * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28 #include "pipe/p_util.h"
29 #include "pipe/p_shader_tokens.h"
30 #include "pipe/tgsi/util/tgsi_parse.h"
31 #include "pipe/tgsi/util/tgsi_util.h"
32 #include "tgsi_exec.h"
33 #include "tgsi_sse2.h"
34
35 #include "x86/rtasm/x86sse.h"
36
37 #if defined(__i386__) || defined(__386__)
38
39 #define DUMP_SSE 0
40
41 #if DUMP_SSE
42
43 static void
44 _print_reg(
45 struct x86_reg reg )
46 {
47 switch( reg.file ) {
48 case file_REG32:
49 switch( reg.idx ) {
50 case reg_AX:
51 printf( "EAX" );
52 break;
53 case reg_CX:
54 printf( "ECX" );
55 break;
56 case reg_DX:
57 printf( "EDX" );
58 break;
59 case reg_BX:
60 printf( "EBX" );
61 break;
62 case reg_SP:
63 printf( "ESP" );
64 break;
65 case reg_BP:
66 printf( "EBP" );
67 break;
68 case reg_SI:
69 printf( "ESI" );
70 break;
71 case reg_DI:
72 printf( "EDI" );
73 break;
74 }
75 break;
76 case file_MMX:
77 assert( 0 );
78 break;
79 case file_XMM:
80 printf( "XMM%u", reg.idx );
81 break;
82 case file_x87:
83 assert( 0 );
84 break;
85 }
86 }
87
/**
 * Pad the dump output after a mnemonic so operands line up.
 *
 * Prints as many spaces as are needed to extend \p op to a 10-character
 * column.  Mnemonics that are already 10 characters or longer get no
 * padding (the unsigned subtraction used to wrap around and print an
 * enormous number of spaces in that case).
 *
 * \param op  NUL-terminated mnemonic that has just been printed.
 */
static void
_fill( const char *op )
{
   const size_t column = 10;
   const size_t len = strlen( op );
   size_t pad = (len < column) ? column - len : 0;

   while( pad-- ) {
      printf( " " );
   }
}
98
/*
 * Dump macros (DUMP_SSE enabled): print a textual trace of the emitted code.
 * Every entry starts on a new line; operands follow the padded mnemonic.
 */
#define DUMP_START() printf( "\nsse-dump start ----------------" )
#define DUMP_END() printf( "\nsse-dump end ----------------\n" )

/* Mnemonic only. */
#define DUMP( OP ) printf( "\n%s", OP )

/* Shared prefix of the operand-taking variants: mnemonic plus padding. */
#define DUMP_OP_( OP )\
   printf( "\n%s", OP );\
   _fill( OP )

/* Mnemonic and one unsigned immediate. */
#define DUMP_I( OP, I ) do {\
   DUMP_OP_( OP );\
   printf( "%u", I ); } while( 0 )

/* Mnemonic and one register. */
#define DUMP_R( OP, R0 ) do {\
   DUMP_OP_( OP );\
   _print_reg( R0 ); } while( 0 )

/* Mnemonic and two registers. */
#define DUMP_RR( OP, R0, R1 ) do {\
   DUMP_OP_( OP );\
   _print_reg( R0 );\
   printf( ", " );\
   _print_reg( R1 ); } while( 0 )

/* Mnemonic, two registers and one unsigned immediate. */
#define DUMP_RRI( OP, R0, R1, I ) do {\
   DUMP_OP_( OP );\
   _print_reg( R0 );\
   printf( ", " );\
   _print_reg( R1 );\
   printf( ", %u", I ); } while( 0 )
124
125 #else
126
/* Dump macros (DUMP_SSE disabled): all variants expand to nothing. */
#define DUMP_START()
#define DUMP_END()
#define DUMP( OP )
#define DUMP_I( OP, I )
#define DUMP_R( OP, R0 )
#define DUMP_RR( OP, R0, R1 )
#define DUMP_RRI( OP, R0, R1, I )
134
135 #endif
136
/* Iterate CHAN over the four vector channels (0..3). */
#define FOR_EACH_CHANNEL( CHAN )\
   for( (CHAN) = 0; (CHAN) < 4; (CHAN)++ )

/* Non-zero when channel CHAN is set in the write mask of destination 0. */
#define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
   ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))

/* "if" head guarding a statement on the write mask of destination 0. */
#define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
   if( IS_DST0_CHANNEL_ENABLED( INST, CHAN ))

/* Loop head visiting only the channels enabled in destination 0. */
#define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
   FOR_EACH_CHANNEL( CHAN )\
      IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )

/* Channel indices. */
#define CHAN_X 0
#define CHAN_Y 1
#define CHAN_Z 2
#define CHAN_W 3

/* Scratch temporary used to exchange data with the C helper callbacks. */
#define TEMP_R0 TGSI_EXEC_TEMP_R0
156
157 /**
158 * X86 utility functions.
159 */
160
161 static struct x86_reg
162 make_xmm(
163 unsigned xmm )
164 {
165 return x86_make_reg(
166 file_XMM,
167 (enum x86_reg_name) xmm );
168 }
169
170 /**
171 * X86 register mapping helpers.
172 */
173
174 static struct x86_reg
175 get_const_base( void )
176 {
177 return x86_make_reg(
178 file_REG32,
179 reg_CX );
180 }
181
182 static struct x86_reg
183 get_input_base( void )
184 {
185 return x86_make_reg(
186 file_REG32,
187 reg_AX );
188 }
189
190 static struct x86_reg
191 get_output_base( void )
192 {
193 return x86_make_reg(
194 file_REG32,
195 reg_DX );
196 }
197
198 static struct x86_reg
199 get_temp_base( void )
200 {
201 return x86_make_reg(
202 file_REG32,
203 reg_BX );
204 }
205
206 static struct x86_reg
207 get_coef_base( void )
208 {
209 return get_output_base();
210 }
211
212 /**
213 * Data access helpers.
214 */
215
216 static struct x86_reg
217 get_argument(
218 unsigned index )
219 {
220 return x86_make_disp(
221 x86_make_reg( file_REG32, reg_SP ),
222 (index + 1) * 4 );
223 }
224
225 static struct x86_reg
226 get_const(
227 unsigned vec,
228 unsigned chan )
229 {
230 return x86_make_disp(
231 get_const_base(),
232 (vec * 4 + chan) * 4 );
233 }
234
235 static struct x86_reg
236 get_input(
237 unsigned vec,
238 unsigned chan )
239 {
240 return x86_make_disp(
241 get_input_base(),
242 (vec * 4 + chan) * 16 );
243 }
244
245 static struct x86_reg
246 get_output(
247 unsigned vec,
248 unsigned chan )
249 {
250 return x86_make_disp(
251 get_output_base(),
252 (vec * 4 + chan) * 16 );
253 }
254
255 static struct x86_reg
256 get_temp(
257 unsigned vec,
258 unsigned chan )
259 {
260 return x86_make_disp(
261 get_temp_base(),
262 (vec * 4 + chan) * 16 );
263 }
264
265 static struct x86_reg
266 get_coef(
267 unsigned vec,
268 unsigned chan,
269 unsigned member )
270 {
271 return x86_make_disp(
272 get_coef_base(),
273 ((vec * 3 + member) * 4 + chan) * 4 );
274 }
275
276 /**
277 * X86 rtasm wrappers.
278 */
279
280 static void
281 emit_addps(
282 struct x86_function *func,
283 struct x86_reg dst,
284 struct x86_reg src )
285 {
286 DUMP_RR( "ADDPS", dst, src );
287 sse_addps( func, dst, src );
288 }
289
290 static void
291 emit_andnps(
292 struct x86_function *func,
293 struct x86_reg dst,
294 struct x86_reg src )
295 {
296 DUMP_RR( "ANDNPS", dst, src );
297 sse_andnps( func, dst, src );
298 }
299
300 static void
301 emit_andps(
302 struct x86_function *func,
303 struct x86_reg dst,
304 struct x86_reg src )
305 {
306 DUMP_RR( "ANDPS", dst, src );
307 sse_andps( func, dst, src );
308 }
309
/**
 * Trace (if DUMP_SSE) and emit a CALL to \p addr through x86_call().
 *
 * \param func  function being assembled.
 * \param addr  C function to call from the generated code.
 */
static void
emit_call( struct x86_function *func, void (* addr)() )
{
   /* DUMP_I prints its operand with "%u"; passing the function pointer
    * itself would be a format/argument mismatch, so convert explicitly.
    * This file is only built for 32-bit x86, where the value fits.
    */
   DUMP_I( "CALL", (unsigned) addr );
   x86_call( func, addr );
}
318
319 static void
320 emit_cmpps(
321 struct x86_function *func,
322 struct x86_reg dst,
323 struct x86_reg src,
324 enum sse_cc cc )
325 {
326 DUMP_RRI( "CMPPS", dst, src, cc );
327 sse_cmpps( func, dst, src, cc );
328 }
329
330 static void
331 emit_cvttps2dq(
332 struct x86_function *func,
333 struct x86_reg dst,
334 struct x86_reg src )
335 {
336 DUMP_RR( "CVTTPS2DQ", dst, src );
337 sse2_cvttps2dq( func, dst, src );
338 }
339
340 static void
341 emit_maxps(
342 struct x86_function *func,
343 struct x86_reg dst,
344 struct x86_reg src )
345 {
346 DUMP_RR( "MAXPS", dst, src );
347 sse_maxps( func, dst, src );
348 }
349
350 static void
351 emit_minps(
352 struct x86_function *func,
353 struct x86_reg dst,
354 struct x86_reg src )
355 {
356 DUMP_RR( "MINPS", dst, src );
357 sse_minps( func, dst, src );
358 }
359
360 static void
361 emit_mov(
362 struct x86_function *func,
363 struct x86_reg dst,
364 struct x86_reg src )
365 {
366 DUMP_RR( "MOV", dst, src );
367 x86_mov( func, dst, src );
368 }
369
370 static void
371 emit_movaps(
372 struct x86_function *func,
373 struct x86_reg dst,
374 struct x86_reg src )
375 {
376 DUMP_RR( "MOVAPS", dst, src );
377 sse_movaps( func, dst, src );
378 }
379
380 static void
381 emit_movss(
382 struct x86_function *func,
383 struct x86_reg dst,
384 struct x86_reg src )
385 {
386 DUMP_RR( "MOVSS", dst, src );
387 sse_movss( func, dst, src );
388 }
389
390 static void
391 emit_movups(
392 struct x86_function *func,
393 struct x86_reg dst,
394 struct x86_reg src )
395 {
396 DUMP_RR( "MOVUPS", dst, src );
397 sse_movups( func, dst, src );
398 }
399
400 static void
401 emit_mulps(
402 struct x86_function *func,
403 struct x86_reg dst,
404 struct x86_reg src )
405 {
406 DUMP_RR( "MULPS", dst, src );
407 sse_mulps( func, dst, src );
408 }
409
410 static void
411 emit_or(
412 struct x86_function *func,
413 struct x86_reg dst,
414 struct x86_reg src )
415 {
416 DUMP_RR( "OR", dst, src );
417 x86_or( func, dst, src );
418 }
419
420 static void
421 emit_orps(
422 struct x86_function *func,
423 struct x86_reg dst,
424 struct x86_reg src )
425 {
426 DUMP_RR( "ORPS", dst, src );
427 sse_orps( func, dst, src );
428 }
429
430 static void
431 emit_pmovmskb(
432 struct x86_function *func,
433 struct x86_reg dst,
434 struct x86_reg src )
435 {
436 DUMP_RR( "PMOVMSKB", dst, src );
437 sse_pmovmskb( func, dst, src );
438 }
439
440 static void
441 emit_pop(
442 struct x86_function *func,
443 struct x86_reg dst )
444 {
445 DUMP_R( "POP", dst );
446 x86_pop( func, dst );
447 }
448
449 static void
450 emit_push(
451 struct x86_function *func,
452 struct x86_reg dst )
453 {
454 DUMP_R( "PUSH", dst );
455 x86_push( func, dst );
456 }
457
458 static void
459 emit_rcpps(
460 struct x86_function *func,
461 struct x86_reg dst,
462 struct x86_reg src )
463 {
464 DUMP_RR( "RCPPS", dst, src );
465 sse2_rcpps( func, dst, src );
466 }
467
#ifdef WIN32
/** Trace (if DUMP_SSE) and emit RET with operand \p size through x86_retw(). */
static void
emit_retw( struct x86_function *func, unsigned size )
{
   DUMP_I( "RET", size );
   x86_retw( func, size );
}
#else
/** Trace (if DUMP_SSE) and emit a plain RET through x86_ret(). */
static void
emit_ret( struct x86_function *func )
{
   DUMP( "RET" );
   x86_ret( func );
}
#endif
486
487 static void
488 emit_rsqrtps(
489 struct x86_function *func,
490 struct x86_reg dst,
491 struct x86_reg src )
492 {
493 DUMP_RR( "RSQRTPS", dst, src );
494 sse_rsqrtps( func, dst, src );
495 }
496
497 static void
498 emit_shufps(
499 struct x86_function *func,
500 struct x86_reg dst,
501 struct x86_reg src,
502 unsigned char shuf )
503 {
504 DUMP_RRI( "SHUFPS", dst, src, shuf );
505 sse_shufps( func, dst, src, shuf );
506 }
507
508 static void
509 emit_subps(
510 struct x86_function *func,
511 struct x86_reg dst,
512 struct x86_reg src )
513 {
514 DUMP_RR( "SUBPS", dst, src );
515 sse_subps( func, dst, src );
516 }
517
518 static void
519 emit_xorps(
520 struct x86_function *func,
521 struct x86_reg dst,
522 struct x86_reg src )
523 {
524 DUMP_RR( "XORPS", dst, src );
525 sse_xorps( func, dst, src );
526 }
527
528 /**
529 * Data fetch helpers.
530 */
531
532 static void
533 emit_const(
534 struct x86_function *func,
535 unsigned xmm,
536 unsigned vec,
537 unsigned chan )
538 {
539 emit_movss(
540 func,
541 make_xmm( xmm ),
542 get_const( vec, chan ) );
543 emit_shufps(
544 func,
545 make_xmm( xmm ),
546 make_xmm( xmm ),
547 SHUF( 0, 0, 0, 0 ) );
548 }
549
550 static void
551 emit_inputf(
552 struct x86_function *func,
553 unsigned xmm,
554 unsigned vec,
555 unsigned chan )
556 {
557 emit_movups(
558 func,
559 make_xmm( xmm ),
560 get_input( vec, chan ) );
561 }
562
563 static void
564 emit_output(
565 struct x86_function *func,
566 unsigned xmm,
567 unsigned vec,
568 unsigned chan )
569 {
570 emit_movups(
571 func,
572 get_output( vec, chan ),
573 make_xmm( xmm ) );
574 }
575
576 static void
577 emit_tempf(
578 struct x86_function *func,
579 unsigned xmm,
580 unsigned vec,
581 unsigned chan )
582 {
583 emit_movaps(
584 func,
585 make_xmm( xmm ),
586 get_temp( vec, chan ) );
587 }
588
589 static void
590 emit_coef(
591 struct x86_function *func,
592 unsigned xmm,
593 unsigned vec,
594 unsigned chan,
595 unsigned member )
596 {
597 emit_movss(
598 func,
599 make_xmm( xmm ),
600 get_coef( vec, chan, member ) );
601 emit_shufps(
602 func,
603 make_xmm( xmm ),
604 make_xmm( xmm ),
605 SHUF( 0, 0, 0, 0 ) );
606 }
607
608 /**
609 * Data store helpers.
610 */
611
612 static void
613 emit_inputs(
614 struct x86_function *func,
615 unsigned xmm,
616 unsigned vec,
617 unsigned chan )
618 {
619 emit_movups(
620 func,
621 get_input( vec, chan ),
622 make_xmm( xmm ) );
623 }
624
625 static void
626 emit_temps(
627 struct x86_function *func,
628 unsigned xmm,
629 unsigned vec,
630 unsigned chan )
631 {
632 emit_movaps(
633 func,
634 get_temp( vec, chan ),
635 make_xmm( xmm ) );
636 }
637
638 static void
639 emit_addrs(
640 struct x86_function *func,
641 unsigned xmm,
642 unsigned vec,
643 unsigned chan )
644 {
645 emit_temps(
646 func,
647 xmm,
648 vec + TGSI_EXEC_NUM_TEMPS,
649 chan );
650 }
651
652 /**
653 * Coefficient fetch helpers.
654 */
655
/**
 * Fetch the a0 coefficient (member 0) of (\p vec, \p chan) into \p xmm.
 */
static void
emit_coef_a0( struct x86_function *func, unsigned xmm, unsigned vec, unsigned chan )
{
   const unsigned member_a0 = 0;

   emit_coef( func, xmm, vec, chan, member_a0 );
}
670
/**
 * Fetch the dadx coefficient (member 1) of (\p vec, \p chan) into \p xmm.
 */
static void
emit_coef_dadx( struct x86_function *func, unsigned xmm, unsigned vec, unsigned chan )
{
   const unsigned member_dadx = 1;

   emit_coef( func, xmm, vec, chan, member_dadx );
}
685
/**
 * Fetch the dady coefficient (member 2) of (\p vec, \p chan) into \p xmm.
 */
static void
emit_coef_dady( struct x86_function *func, unsigned xmm, unsigned vec, unsigned chan )
{
   const unsigned member_dady = 2;

   emit_coef( func, xmm, vec, chan, member_dady );
}
700
701 /**
702 * Function call helpers.
703 */
704
/**
 * Save the four base registers on the stack before calling into C.
 *
 * Push order: const, input, output, temp.  The temp base must stay last:
 * on non-win32 platforms the helper callbacks (cos4f() and friends) take
 * their pointer argument from that final push and index it by TEMP_R0.
 */
static void
emit_push_gp( struct x86_function *func )
{
   emit_push( func, get_const_base() );
   emit_push( func, get_input_base() );
   emit_push( func, get_output_base() );
   emit_push( func, get_temp_base() );
}
725
/**
 * Restore the base registers saved by emit_push_gp(), in reverse order:
 * temp, output, input, const.
 */
static void
emit_pop_gp( struct x86_function *func )
{
   emit_pop( func, get_temp_base() );
   emit_pop( func, get_output_base() );
   emit_pop( func, get_input_base() );
   emit_pop( func, get_const_base() );
}
745
746 static void
747 emit_func_call_dst(
748 struct x86_function *func,
749 unsigned xmm_dst,
750 void (*code)() )
751 {
752 emit_movaps(
753 func,
754 get_temp( TEMP_R0, 0 ),
755 make_xmm( xmm_dst ) );
756
757 emit_push_gp(
758 func );
759
760 #ifdef WIN32
761 emit_push(
762 func,
763 get_temp( TEMP_R0, 0 ) );
764 #endif
765
766 emit_call(
767 func,
768 code );
769
770 emit_pop_gp(
771 func );
772
773 emit_movaps(
774 func,
775 make_xmm( xmm_dst ),
776 get_temp( TEMP_R0, 0 ) );
777 }
778
779 static void
780 emit_func_call_dst_src(
781 struct x86_function *func,
782 unsigned xmm_dst,
783 unsigned xmm_src,
784 void (*code)() )
785 {
786 emit_movaps(
787 func,
788 get_temp( TEMP_R0, 1 ),
789 make_xmm( xmm_src ) );
790
791 emit_func_call_dst(
792 func,
793 xmm_dst,
794 code );
795 }
796
797 /**
798 * Low-level instruction translators.
799 */
800
801 static void
802 emit_abs(
803 struct x86_function *func,
804 unsigned xmm )
805 {
806 emit_andps(
807 func,
808 make_xmm( xmm ),
809 get_temp(
810 TGSI_EXEC_TEMP_7FFFFFFF_I,
811 TGSI_EXEC_TEMP_7FFFFFFF_C ) );
812 }
813
814 static void
815 emit_add(
816 struct x86_function *func,
817 unsigned xmm_dst,
818 unsigned xmm_src )
819 {
820 emit_addps(
821 func,
822 make_xmm( xmm_dst ),
823 make_xmm( xmm_src ) );
824 }
825
826 static void XSTDCALL
827 cos4f(
828 float *store )
829 {
830 #ifdef WIN32
831 store[0] = (float) cos( (double) store[0] );
832 store[1] = (float) cos( (double) store[1] );
833 store[2] = (float) cos( (double) store[2] );
834 store[3] = (float) cos( (double) store[3] );
835 #else
836 const unsigned X = TEMP_R0 * 16;
837 store[X + 0] = cosf( store[X + 0] );
838 store[X + 1] = cosf( store[X + 1] );
839 store[X + 2] = cosf( store[X + 2] );
840 store[X + 3] = cosf( store[X + 3] );
841 #endif
842 }
843
844 static void
845 emit_cos(
846 struct x86_function *func,
847 unsigned xmm_dst )
848 {
849 emit_func_call_dst(
850 func,
851 xmm_dst,
852 cos4f );
853 }
854
855 static void XSTDCALL
856 ex24f(
857 float *store )
858 {
859 #ifdef WIN32
860 store[0] = (float) pow( 2.0, (double) store[0] );
861 store[1] = (float) pow( 2.0, (double) store[1] );
862 store[2] = (float) pow( 2.0, (double) store[2] );
863 store[3] = (float) pow( 2.0, (double) store[3] );
864 #else
865 const unsigned X = TEMP_R0 * 16;
866 store[X + 0] = powf( 2.0f, store[X + 0] );
867 store[X + 1] = powf( 2.0f, store[X + 1] );
868 store[X + 2] = powf( 2.0f, store[X + 2] );
869 store[X + 3] = powf( 2.0f, store[X + 3] );
870 #endif
871 }
872
873 static void
874 emit_ex2(
875 struct x86_function *func,
876 unsigned xmm_dst )
877 {
878 emit_func_call_dst(
879 func,
880 xmm_dst,
881 ex24f );
882 }
883
884 static void
885 emit_f2it(
886 struct x86_function *func,
887 unsigned xmm )
888 {
889 emit_cvttps2dq(
890 func,
891 make_xmm( xmm ),
892 make_xmm( xmm ) );
893 }
894
895 static void XSTDCALL
896 flr4f(
897 float *store )
898 {
899 #ifdef WIN32
900 const unsigned X = 0;
901 #else
902 const unsigned X = TEMP_R0 * 16;
903 #endif
904 store[X + 0] = (float) floor( (double) store[X + 0] );
905 store[X + 1] = (float) floor( (double) store[X + 1] );
906 store[X + 2] = (float) floor( (double) store[X + 2] );
907 store[X + 3] = (float) floor( (double) store[X + 3] );
908 }
909
910 static void
911 emit_flr(
912 struct x86_function *func,
913 unsigned xmm_dst )
914 {
915 emit_func_call_dst(
916 func,
917 xmm_dst,
918 flr4f );
919 }
920
921 static void XSTDCALL
922 frc4f(
923 float *store )
924 {
925 #ifdef WIN32
926 const unsigned X = 0;
927 #else
928 const unsigned X = TEMP_R0 * 16;
929 #endif
930 store[X + 0] -= (float) floor( (double) store[X + 0] );
931 store[X + 1] -= (float) floor( (double) store[X + 1] );
932 store[X + 2] -= (float) floor( (double) store[X + 2] );
933 store[X + 3] -= (float) floor( (double) store[X + 3] );
934 }
935
936 static void
937 emit_frc(
938 struct x86_function *func,
939 unsigned xmm_dst )
940 {
941 emit_func_call_dst(
942 func,
943 xmm_dst,
944 frc4f );
945 }
946
947 static void XSTDCALL
948 lg24f(
949 float *store )
950 {
951 #ifdef WIN32
952 const unsigned X = 0;
953 #else
954 const unsigned X = TEMP_R0 * 16;
955 #endif
956 store[X + 0] = LOG2( store[X + 0] );
957 store[X + 1] = LOG2( store[X + 1] );
958 store[X + 2] = LOG2( store[X + 2] );
959 store[X + 3] = LOG2( store[X + 3] );
960 }
961
962 static void
963 emit_lg2(
964 struct x86_function *func,
965 unsigned xmm_dst )
966 {
967 emit_func_call_dst(
968 func,
969 xmm_dst,
970 lg24f );
971 }
972
973 static void
974 emit_MOV(
975 struct x86_function *func,
976 unsigned xmm_dst,
977 unsigned xmm_src )
978 {
979 emit_movups(
980 func,
981 make_xmm( xmm_dst ),
982 make_xmm( xmm_src ) );
983 }
984
985 static void
986 emit_mul (struct x86_function *func,
987 unsigned xmm_dst,
988 unsigned xmm_src)
989 {
990 emit_mulps(
991 func,
992 make_xmm( xmm_dst ),
993 make_xmm( xmm_src ) );
994 }
995
996 static void
997 emit_neg(
998 struct x86_function *func,
999 unsigned xmm )
1000 {
1001 emit_xorps(
1002 func,
1003 make_xmm( xmm ),
1004 get_temp(
1005 TGSI_EXEC_TEMP_80000000_I,
1006 TGSI_EXEC_TEMP_80000000_C ) );
1007 }
1008
1009 static void XSTDCALL
1010 pow4f(
1011 float *store )
1012 {
1013 #ifdef WIN32
1014 store[0] = (float) pow( (double) store[0], (double) store[4] );
1015 store[1] = (float) pow( (double) store[1], (double) store[5] );
1016 store[2] = (float) pow( (double) store[2], (double) store[6] );
1017 store[3] = (float) pow( (double) store[3], (double) store[7] );
1018 #else
1019 const unsigned X = TEMP_R0 * 16;
1020 store[X + 0] = powf( store[X + 0], store[X + 4] );
1021 store[X + 1] = powf( store[X + 1], store[X + 5] );
1022 store[X + 2] = powf( store[X + 2], store[X + 6] );
1023 store[X + 3] = powf( store[X + 3], store[X + 7] );
1024 #endif
1025 }
1026
1027 static void
1028 emit_pow(
1029 struct x86_function *func,
1030 unsigned xmm_dst,
1031 unsigned xmm_src )
1032 {
1033 emit_func_call_dst_src(
1034 func,
1035 xmm_dst,
1036 xmm_src,
1037 pow4f );
1038 }
1039
1040 static void
1041 emit_rcp (
1042 struct x86_function *func,
1043 unsigned xmm_dst,
1044 unsigned xmm_src )
1045 {
1046 emit_rcpps(
1047 func,
1048 make_xmm( xmm_dst ),
1049 make_xmm( xmm_src ) );
1050 }
1051
1052 static void
1053 emit_rsqrt(
1054 struct x86_function *func,
1055 unsigned xmm_dst,
1056 unsigned xmm_src )
1057 {
1058 emit_rsqrtps(
1059 func,
1060 make_xmm( xmm_dst ),
1061 make_xmm( xmm_src ) );
1062 }
1063
1064 static void
1065 emit_setsign(
1066 struct x86_function *func,
1067 unsigned xmm )
1068 {
1069 emit_orps(
1070 func,
1071 make_xmm( xmm ),
1072 get_temp(
1073 TGSI_EXEC_TEMP_80000000_I,
1074 TGSI_EXEC_TEMP_80000000_C ) );
1075 }
1076
1077 static void XSTDCALL
1078 sin4f(
1079 float *store )
1080 {
1081 #ifdef WIN32
1082 store[0] = (float) sin( (double) store[0] );
1083 store[1] = (float) sin( (double) store[1] );
1084 store[2] = (float) sin( (double) store[2] );
1085 store[3] = (float) sin( (double) store[3] );
1086 #else
1087 const unsigned X = TEMP_R0 * 16;
1088 store[X + 0] = sinf( store[X + 0] );
1089 store[X + 1] = sinf( store[X + 1] );
1090 store[X + 2] = sinf( store[X + 2] );
1091 store[X + 3] = sinf( store[X + 3] );
1092 #endif
1093 }
1094
1095 static void
1096 emit_sin (struct x86_function *func,
1097 unsigned xmm_dst)
1098 {
1099 emit_func_call_dst(
1100 func,
1101 xmm_dst,
1102 sin4f );
1103 }
1104
1105 static void
1106 emit_sub(
1107 struct x86_function *func,
1108 unsigned xmm_dst,
1109 unsigned xmm_src )
1110 {
1111 emit_subps(
1112 func,
1113 make_xmm( xmm_dst ),
1114 make_xmm( xmm_src ) );
1115 }
1116
1117 /**
1118 * Register fetch.
1119 */
1120
1121 static void
1122 emit_fetch(
1123 struct x86_function *func,
1124 unsigned xmm,
1125 const struct tgsi_full_src_register *reg,
1126 const unsigned chan_index )
1127 {
1128 unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index );
1129
1130 switch( swizzle ) {
1131 case TGSI_EXTSWIZZLE_X:
1132 case TGSI_EXTSWIZZLE_Y:
1133 case TGSI_EXTSWIZZLE_Z:
1134 case TGSI_EXTSWIZZLE_W:
1135 switch( reg->SrcRegister.File ) {
1136 case TGSI_FILE_CONSTANT:
1137 emit_const(
1138 func,
1139 xmm,
1140 reg->SrcRegister.Index,
1141 swizzle );
1142 break;
1143
1144 case TGSI_FILE_INPUT:
1145 emit_inputf(
1146 func,
1147 xmm,
1148 reg->SrcRegister.Index,
1149 swizzle );
1150 break;
1151
1152 case TGSI_FILE_TEMPORARY:
1153 emit_tempf(
1154 func,
1155 xmm,
1156 reg->SrcRegister.Index,
1157 swizzle );
1158 break;
1159
1160 default:
1161 assert( 0 );
1162 }
1163 break;
1164
1165 case TGSI_EXTSWIZZLE_ZERO:
1166 emit_tempf(
1167 func,
1168 xmm,
1169 TGSI_EXEC_TEMP_00000000_I,
1170 TGSI_EXEC_TEMP_00000000_C );
1171 break;
1172
1173 case TGSI_EXTSWIZZLE_ONE:
1174 emit_tempf(
1175 func,
1176 xmm,
1177 TGSI_EXEC_TEMP_ONE_I,
1178 TGSI_EXEC_TEMP_ONE_C );
1179 break;
1180
1181 default:
1182 assert( 0 );
1183 }
1184
1185 switch( tgsi_util_get_full_src_register_sign_mode( reg, chan_index ) ) {
1186 case TGSI_UTIL_SIGN_CLEAR:
1187 emit_abs( func, xmm );
1188 break;
1189
1190 case TGSI_UTIL_SIGN_SET:
1191 emit_setsign( func, xmm );
1192 break;
1193
1194 case TGSI_UTIL_SIGN_TOGGLE:
1195 emit_neg( func, xmm );
1196 break;
1197
1198 case TGSI_UTIL_SIGN_KEEP:
1199 break;
1200 }
1201 }
1202
/* Fetch channel CHAN of source operand INDEX of INST into register XMM. */
#define FETCH( FUNC, INST, XMM, INDEX, CHAN )\
   emit_fetch( (FUNC), (XMM), &(INST).FullSrcRegisters[(INDEX)], (CHAN) )
1205
1206 /**
1207 * Register store.
1208 */
1209
1210 static void
1211 emit_store(
1212 struct x86_function *func,
1213 unsigned xmm,
1214 const struct tgsi_full_dst_register *reg,
1215 const struct tgsi_full_instruction *inst,
1216 unsigned chan_index )
1217 {
1218 switch( reg->DstRegister.File ) {
1219 case TGSI_FILE_OUTPUT:
1220 emit_output(
1221 func,
1222 xmm,
1223 reg->DstRegister.Index,
1224 chan_index );
1225 break;
1226
1227 case TGSI_FILE_TEMPORARY:
1228 emit_temps(
1229 func,
1230 xmm,
1231 reg->DstRegister.Index,
1232 chan_index );
1233 break;
1234
1235 case TGSI_FILE_ADDRESS:
1236 emit_addrs(
1237 func,
1238 xmm,
1239 reg->DstRegister.Index,
1240 chan_index );
1241 break;
1242
1243 default:
1244 assert( 0 );
1245 }
1246
1247 switch( inst->Instruction.Saturate ) {
1248 case TGSI_SAT_NONE:
1249 break;
1250
1251 case TGSI_SAT_ZERO_ONE:
1252 // assert( 0 );
1253 break;
1254
1255 case TGSI_SAT_MINUS_PLUS_ONE:
1256 assert( 0 );
1257 break;
1258 }
1259 }
1260
/* Store register XMM to channel CHAN of destination operand INDEX of INST. */
#define STORE( FUNC, INST, XMM, INDEX, CHAN )\
   emit_store( (FUNC), (XMM), &(INST).FullDstRegisters[(INDEX)], &(INST), (CHAN) )
1263
1264 /**
1265 * High-level instruction translators.
1266 */
1267
1268 static void
1269 emit_kil(
1270 struct x86_function *func,
1271 const struct tgsi_full_src_register *reg )
1272 {
1273 unsigned uniquemask;
1274 unsigned registers[4];
1275 unsigned nextregister = 0;
1276 unsigned firstchan = ~0;
1277 unsigned chan_index;
1278
1279 /* This mask stores component bits that were already tested. Note that
1280 * we test if the value is less than zero, so 1.0 and 0.0 need not to be
1281 * tested. */
1282 uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE);
1283
1284 FOR_EACH_CHANNEL( chan_index ) {
1285 unsigned swizzle;
1286
1287 /* unswizzle channel */
1288 swizzle = tgsi_util_get_full_src_register_extswizzle(
1289 reg,
1290 chan_index );
1291
1292 /* check if the component has not been already tested */
1293 if( !(uniquemask & (1 << swizzle)) ) {
1294 uniquemask |= 1 << swizzle;
1295
1296 /* allocate register */
1297 registers[chan_index] = nextregister;
1298 emit_fetch(
1299 func,
1300 nextregister,
1301 reg,
1302 chan_index );
1303 nextregister++;
1304
1305 /* mark the first channel used */
1306 if( firstchan == ~0 ) {
1307 firstchan = chan_index;
1308 }
1309 }
1310 }
1311
1312 emit_push(
1313 func,
1314 x86_make_reg( file_REG32, reg_AX ) );
1315 emit_push(
1316 func,
1317 x86_make_reg( file_REG32, reg_DX ) );
1318
1319 FOR_EACH_CHANNEL( chan_index ) {
1320 if( uniquemask & (1 << chan_index) ) {
1321 emit_cmpps(
1322 func,
1323 make_xmm( registers[chan_index] ),
1324 get_temp(
1325 TGSI_EXEC_TEMP_00000000_I,
1326 TGSI_EXEC_TEMP_00000000_C ),
1327 cc_LessThan );
1328
1329 if( chan_index == firstchan ) {
1330 emit_pmovmskb(
1331 func,
1332 x86_make_reg( file_REG32, reg_AX ),
1333 make_xmm( registers[chan_index] ) );
1334 }
1335 else {
1336 emit_pmovmskb(
1337 func,
1338 x86_make_reg( file_REG32, reg_DX ),
1339 make_xmm( registers[chan_index] ) );
1340 emit_or(
1341 func,
1342 x86_make_reg( file_REG32, reg_AX ),
1343 x86_make_reg( file_REG32, reg_DX ) );
1344 }
1345 }
1346 }
1347
1348 emit_or(
1349 func,
1350 get_temp(
1351 TGSI_EXEC_TEMP_KILMASK_I,
1352 TGSI_EXEC_TEMP_KILMASK_C ),
1353 x86_make_reg( file_REG32, reg_AX ) );
1354
1355 emit_pop(
1356 func,
1357 x86_make_reg( file_REG32, reg_DX ) );
1358 emit_pop(
1359 func,
1360 x86_make_reg( file_REG32, reg_AX ) );
1361 }
1362
1363 static void
1364 emit_setcc(
1365 struct x86_function *func,
1366 struct tgsi_full_instruction *inst,
1367 enum sse_cc cc )
1368 {
1369 unsigned chan_index;
1370
1371 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1372 FETCH( func, *inst, 0, 0, chan_index );
1373 FETCH( func, *inst, 1, 1, chan_index );
1374 emit_cmpps(
1375 func,
1376 make_xmm( 0 ),
1377 make_xmm( 1 ),
1378 cc );
1379 emit_andps(
1380 func,
1381 make_xmm( 0 ),
1382 get_temp(
1383 TGSI_EXEC_TEMP_ONE_I,
1384 TGSI_EXEC_TEMP_ONE_C ) );
1385 STORE( func, *inst, 0, 0, chan_index );
1386 }
1387 }
1388
1389 static void
1390 emit_cmp(
1391 struct x86_function *func,
1392 struct tgsi_full_instruction *inst )
1393 {
1394 unsigned chan_index;
1395
1396 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1397 FETCH( func, *inst, 0, 0, chan_index );
1398 FETCH( func, *inst, 1, 1, chan_index );
1399 FETCH( func, *inst, 2, 2, chan_index );
1400 emit_cmpps(
1401 func,
1402 make_xmm( 0 ),
1403 get_temp(
1404 TGSI_EXEC_TEMP_00000000_I,
1405 TGSI_EXEC_TEMP_00000000_C ),
1406 cc_LessThan );
1407 emit_andps(
1408 func,
1409 make_xmm( 1 ),
1410 make_xmm( 0 ) );
1411 emit_andnps(
1412 func,
1413 make_xmm( 0 ),
1414 make_xmm( 2 ) );
1415 emit_orps(
1416 func,
1417 make_xmm( 0 ),
1418 make_xmm( 1 ) );
1419 STORE( func, *inst, 0, 0, chan_index );
1420 }
1421 }
1422
1423 static void
1424 emit_instruction(
1425 struct x86_function *func,
1426 struct tgsi_full_instruction *inst )
1427 {
1428 unsigned chan_index;
1429
1430 switch( inst->Instruction.Opcode ) {
1431 case TGSI_OPCODE_ARL:
1432 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1433 FETCH( func, *inst, 0, 0, chan_index );
1434 emit_f2it( func, 0 );
1435 STORE( func, *inst, 0, 0, chan_index );
1436 }
1437 break;
1438
1439 case TGSI_OPCODE_MOV:
1440 /* TGSI_OPCODE_SWZ */
1441 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1442 FETCH( func, *inst, 0, 0, chan_index );
1443 STORE( func, *inst, 0, 0, chan_index );
1444 }
1445 break;
1446
1447 case TGSI_OPCODE_LIT:
1448 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1449 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1450 emit_tempf(
1451 func,
1452 0,
1453 TGSI_EXEC_TEMP_ONE_I,
1454 TGSI_EXEC_TEMP_ONE_C);
1455 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
1456 STORE( func, *inst, 0, 0, CHAN_X );
1457 }
1458 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1459 STORE( func, *inst, 0, 0, CHAN_W );
1460 }
1461 }
1462 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1463 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1464 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
1465 FETCH( func, *inst, 0, 0, CHAN_X );
1466 emit_maxps(
1467 func,
1468 make_xmm( 0 ),
1469 get_temp(
1470 TGSI_EXEC_TEMP_00000000_I,
1471 TGSI_EXEC_TEMP_00000000_C ) );
1472 STORE( func, *inst, 0, 0, CHAN_Y );
1473 }
1474 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1475 FETCH( func, *inst, 1, 0, CHAN_Y );
1476 emit_maxps(
1477 func,
1478 make_xmm( 1 ),
1479 get_temp(
1480 TGSI_EXEC_TEMP_00000000_I,
1481 TGSI_EXEC_TEMP_00000000_C ) );
1482 FETCH( func, *inst, 2, 0, CHAN_W );
1483 emit_minps(
1484 func,
1485 make_xmm( 2 ),
1486 get_temp(
1487 TGSI_EXEC_TEMP_128_I,
1488 TGSI_EXEC_TEMP_128_C ) );
1489 emit_maxps(
1490 func,
1491 make_xmm( 2 ),
1492 get_temp(
1493 TGSI_EXEC_TEMP_MINUS_128_I,
1494 TGSI_EXEC_TEMP_MINUS_128_C ) );
1495 emit_pow( func, 1, 2 );
1496 FETCH( func, *inst, 0, 0, CHAN_X );
1497 emit_xorps(
1498 func,
1499 make_xmm( 2 ),
1500 make_xmm( 2 ) );
1501 emit_cmpps(
1502 func,
1503 make_xmm( 2 ),
1504 make_xmm( 0 ),
1505 cc_LessThanEqual );
1506 emit_andps(
1507 func,
1508 make_xmm( 2 ),
1509 make_xmm( 1 ) );
1510 STORE( func, *inst, 2, 0, CHAN_Z );
1511 }
1512 }
1513 break;
1514
1515 case TGSI_OPCODE_RCP:
1516 /* TGSI_OPCODE_RECIP */
1517 FETCH( func, *inst, 0, 0, CHAN_X );
1518 emit_rcp( func, 0, 0 );
1519 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1520 STORE( func, *inst, 0, 0, chan_index );
1521 }
1522 break;
1523
1524 case TGSI_OPCODE_RSQ:
1525 /* TGSI_OPCODE_RECIPSQRT */
1526 FETCH( func, *inst, 0, 0, CHAN_X );
1527 emit_rsqrt( func, 0, 0 );
1528 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1529 STORE( func, *inst, 0, 0, chan_index );
1530 }
1531 break;
1532
1533 case TGSI_OPCODE_EXP:
1534 assert( 0 );
1535 break;
1536
1537 case TGSI_OPCODE_LOG:
1538 assert( 0 );
1539 break;
1540
1541 case TGSI_OPCODE_MUL:
1542 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1543 FETCH( func, *inst, 0, 0, chan_index );
1544 FETCH( func, *inst, 1, 1, chan_index );
1545 emit_mul( func, 0, 1 );
1546 STORE( func, *inst, 0, 0, chan_index );
1547 }
1548 break;
1549
1550 case TGSI_OPCODE_ADD:
1551 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1552 FETCH( func, *inst, 0, 0, chan_index );
1553 FETCH( func, *inst, 1, 1, chan_index );
1554 emit_add( func, 0, 1 );
1555 STORE( func, *inst, 0, 0, chan_index );
1556 }
1557 break;
1558
1559 case TGSI_OPCODE_DP3:
1560 /* TGSI_OPCODE_DOT3 */
1561 FETCH( func, *inst, 0, 0, CHAN_X );
1562 FETCH( func, *inst, 1, 1, CHAN_X );
1563 emit_mul( func, 0, 1 );
1564 FETCH( func, *inst, 1, 0, CHAN_Y );
1565 FETCH( func, *inst, 2, 1, CHAN_Y );
1566 emit_mul( func, 1, 2 );
1567 emit_add( func, 0, 1 );
1568 FETCH( func, *inst, 1, 0, CHAN_Z );
1569 FETCH( func, *inst, 2, 1, CHAN_Z );
1570 emit_mul( func, 1, 2 );
1571 emit_add( func, 0, 1 );
1572 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1573 STORE( func, *inst, 0, 0, chan_index );
1574 }
1575 break;
1576
1577 case TGSI_OPCODE_DP4:
1578 /* TGSI_OPCODE_DOT4 */
1579 FETCH( func, *inst, 0, 0, CHAN_X );
1580 FETCH( func, *inst, 1, 1, CHAN_X );
1581 emit_mul( func, 0, 1 );
1582 FETCH( func, *inst, 1, 0, CHAN_Y );
1583 FETCH( func, *inst, 2, 1, CHAN_Y );
1584 emit_mul( func, 1, 2 );
1585 emit_add( func, 0, 1 );
1586 FETCH( func, *inst, 1, 0, CHAN_Z );
1587 FETCH( func, *inst, 2, 1, CHAN_Z );
1588 emit_mul(func, 1, 2 );
1589 emit_add(func, 0, 1 );
1590 FETCH( func, *inst, 1, 0, CHAN_W );
1591 FETCH( func, *inst, 2, 1, CHAN_W );
1592 emit_mul( func, 1, 2 );
1593 emit_add( func, 0, 1 );
1594 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1595 STORE( func, *inst, 0, 0, chan_index );
1596 }
1597 break;
1598
1599 case TGSI_OPCODE_DST:
1600 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
1601 emit_tempf(
1602 func,
1603 0,
1604 TGSI_EXEC_TEMP_ONE_I,
1605 TGSI_EXEC_TEMP_ONE_C );
1606 STORE( func, *inst, 0, 0, CHAN_X );
1607 }
1608 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
1609 FETCH( func, *inst, 0, 0, CHAN_Y );
1610 FETCH( func, *inst, 1, 1, CHAN_Y );
1611 emit_mul( func, 0, 1 );
1612 STORE( func, *inst, 0, 0, CHAN_Y );
1613 }
1614 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
1615 FETCH( func, *inst, 0, 0, CHAN_Z );
1616 STORE( func, *inst, 0, 0, CHAN_Z );
1617 }
1618 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
1619 FETCH( func, *inst, 0, 1, CHAN_W );
1620 STORE( func, *inst, 0, 0, CHAN_W );
1621 }
1622 break;
1623
1624 case TGSI_OPCODE_MIN:
1625 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1626 FETCH( func, *inst, 0, 0, chan_index );
1627 FETCH( func, *inst, 1, 1, chan_index );
1628 emit_minps(
1629 func,
1630 make_xmm( 0 ),
1631 make_xmm( 1 ) );
1632 STORE( func, *inst, 0, 0, chan_index );
1633 }
1634 break;
1635
1636 case TGSI_OPCODE_MAX:
1637 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1638 FETCH( func, *inst, 0, 0, chan_index );
1639 FETCH( func, *inst, 1, 1, chan_index );
1640 emit_maxps(
1641 func,
1642 make_xmm( 0 ),
1643 make_xmm( 1 ) );
1644 STORE( func, *inst, 0, 0, chan_index );
1645 }
1646 break;
1647
1648 case TGSI_OPCODE_SLT:
1649 /* TGSI_OPCODE_SETLT */
1650 emit_setcc( func, inst, cc_LessThan );
1651 break;
1652
1653 case TGSI_OPCODE_SGE:
1654 /* TGSI_OPCODE_SETGE */
1655 emit_setcc( func, inst, cc_NotLessThan );
1656 break;
1657
1658 case TGSI_OPCODE_MAD:
1659 /* TGSI_OPCODE_MADD */
1660 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1661 FETCH( func, *inst, 0, 0, chan_index );
1662 FETCH( func, *inst, 1, 1, chan_index );
1663 FETCH( func, *inst, 2, 2, chan_index );
1664 emit_mul( func, 0, 1 );
1665 emit_add( func, 0, 2 );
1666 STORE( func, *inst, 0, 0, chan_index );
1667 }
1668 break;
1669
1670 case TGSI_OPCODE_SUB:
1671 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1672 FETCH( func, *inst, 0, 0, chan_index );
1673 FETCH( func, *inst, 1, 1, chan_index );
1674 emit_sub( func, 0, 1 );
1675 STORE( func, *inst, 0, 0, chan_index );
1676 }
1677 break;
1678
1679 case TGSI_OPCODE_LERP:
1680 /* TGSI_OPCODE_LRP */
1681 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1682 FETCH( func, *inst, 0, 0, chan_index );
1683 FETCH( func, *inst, 1, 1, chan_index );
1684 FETCH( func, *inst, 2, 2, chan_index );
1685 emit_sub( func, 1, 2 );
1686 emit_mul( func, 0, 1 );
1687 emit_add( func, 0, 2 );
1688 STORE( func, *inst, 0, 0, chan_index );
1689 }
1690 break;
1691
1692 case TGSI_OPCODE_CND:
1693 assert( 0 );
1694 break;
1695
1696 case TGSI_OPCODE_CND0:
1697 assert( 0 );
1698 break;
1699
1700 case TGSI_OPCODE_DOT2ADD:
1701 /* TGSI_OPCODE_DP2A */
1702 assert( 0 );
1703 break;
1704
1705 case TGSI_OPCODE_INDEX:
1706 assert( 0 );
1707 break;
1708
1709 case TGSI_OPCODE_NEGATE:
1710 assert( 0 );
1711 break;
1712
1713 case TGSI_OPCODE_FRAC:
1714 /* TGSI_OPCODE_FRC */
1715 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1716 FETCH( func, *inst, 0, 0, chan_index );
1717 emit_frc( func, 0 );
1718 STORE( func, *inst, 0, 0, chan_index );
1719 }
1720 break;
1721
1722 case TGSI_OPCODE_CLAMP:
1723 assert( 0 );
1724 break;
1725
1726 case TGSI_OPCODE_FLOOR:
1727 /* TGSI_OPCODE_FLR */
1728 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1729 FETCH( func, *inst, 0, 0, chan_index );
1730 emit_flr( func, 0 );
1731 STORE( func, *inst, 0, 0, chan_index );
1732 }
1733 break;
1734
1735 case TGSI_OPCODE_ROUND:
1736 assert( 0 );
1737 break;
1738
1739 case TGSI_OPCODE_EXPBASE2:
1740 /* TGSI_OPCODE_EX2 */
1741 FETCH( func, *inst, 0, 0, CHAN_X );
1742 emit_ex2( func, 0 );
1743 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1744 STORE( func, *inst, 0, 0, chan_index );
1745 }
1746 break;
1747
1748 case TGSI_OPCODE_LOGBASE2:
1749 /* TGSI_OPCODE_LG2 */
1750 FETCH( func, *inst, 0, 0, CHAN_X );
1751 emit_lg2( func, 0 );
1752 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1753 STORE( func, *inst, 0, 0, chan_index );
1754 }
1755 break;
1756
1757 case TGSI_OPCODE_POWER:
1758 /* TGSI_OPCODE_POW */
1759 FETCH( func, *inst, 0, 0, CHAN_X );
1760 FETCH( func, *inst, 1, 1, CHAN_X );
1761 emit_pow( func, 0, 1 );
1762 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1763 STORE( func, *inst, 0, 0, chan_index );
1764 }
1765 break;
1766
1767 case TGSI_OPCODE_CROSSPRODUCT:
1768 /* TGSI_OPCODE_XPD */
1769 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1770 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
1771 FETCH( func, *inst, 1, 1, CHAN_Z );
1772 FETCH( func, *inst, 3, 0, CHAN_Z );
1773 }
1774 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1775 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1776 FETCH( func, *inst, 0, 0, CHAN_Y );
1777 FETCH( func, *inst, 4, 1, CHAN_Y );
1778 }
1779 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
1780 emit_MOV( func, 2, 0 );
1781 emit_mul( func, 2, 1 );
1782 emit_MOV( func, 5, 3 );
1783 emit_mul( func, 5, 4 );
1784 emit_sub( func, 2, 5 );
1785 STORE( func, *inst, 2, 0, CHAN_X );
1786 }
1787 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1788 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1789 FETCH( func, *inst, 2, 1, CHAN_X );
1790 FETCH( func, *inst, 5, 0, CHAN_X );
1791 }
1792 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
1793 emit_mul( func, 3, 2 );
1794 emit_mul( func, 1, 5 );
1795 emit_sub( func, 3, 1 );
1796 STORE( func, *inst, 3, 0, CHAN_Y );
1797 }
1798 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
1799 emit_mul( func, 5, 4 );
1800 emit_mul( func, 0, 2 );
1801 emit_sub( func, 5, 0 );
1802 STORE( func, *inst, 5, 0, CHAN_Z );
1803 }
1804 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
1805 FETCH( func, *inst, 0, TGSI_EXEC_TEMP_ONE_I, TGSI_EXEC_TEMP_ONE_C );
1806 STORE( func, *inst, 0, 0, CHAN_W );
1807 }
1808 break;
1809
1810 case TGSI_OPCODE_MULTIPLYMATRIX:
1811 assert( 0 );
1812 break;
1813
1814 case TGSI_OPCODE_ABS:
1815 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1816 FETCH( func, *inst, 0, 0, chan_index );
1817 emit_abs( func, 0) ;
1818
1819 STORE( func, *inst, 0, 0, chan_index );
1820 }
1821 break;
1822
1823 case TGSI_OPCODE_RCC:
1824 assert( 0 );
1825 break;
1826
1827 case TGSI_OPCODE_DPH:
1828 FETCH( func, *inst, 0, 0, CHAN_X );
1829 FETCH( func, *inst, 1, 1, CHAN_X );
1830 emit_mul( func, 0, 1 );
1831 FETCH( func, *inst, 1, 0, CHAN_Y );
1832 FETCH( func, *inst, 2, 1, CHAN_Y );
1833 emit_mul( func, 1, 2 );
1834 emit_add( func, 0, 1 );
1835 FETCH( func, *inst, 1, 0, CHAN_Z );
1836 FETCH( func, *inst, 2, 1, CHAN_Z );
1837 emit_mul( func, 1, 2 );
1838 emit_add( func, 0, 1 );
1839 FETCH( func, *inst, 1, 1, CHAN_W );
1840 emit_add( func, 0, 1 );
1841 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1842 STORE( func, *inst, 0, 0, chan_index );
1843 }
1844 break;
1845
1846 case TGSI_OPCODE_COS:
1847 FETCH( func, *inst, 0, 0, CHAN_X );
1848 emit_cos( func, 0 );
1849 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1850 STORE( func, *inst, 0, 0, chan_index );
1851 }
1852 break;
1853
1854 case TGSI_OPCODE_DDX:
1855 assert( 0 );
1856 break;
1857
1858 case TGSI_OPCODE_DDY:
1859 assert( 0 );
1860 break;
1861
1862 case TGSI_OPCODE_KIL:
1863 emit_kil( func, &inst->FullSrcRegisters[0] );
1864 break;
1865
1866 case TGSI_OPCODE_PK2H:
1867 assert( 0 );
1868 break;
1869
1870 case TGSI_OPCODE_PK2US:
1871 assert( 0 );
1872 break;
1873
1874 case TGSI_OPCODE_PK4B:
1875 assert( 0 );
1876 break;
1877
1878 case TGSI_OPCODE_PK4UB:
1879 assert( 0 );
1880 break;
1881
1882 case TGSI_OPCODE_RFL:
1883 assert( 0 );
1884 break;
1885
1886 case TGSI_OPCODE_SEQ:
1887 assert( 0 );
1888 break;
1889
1890 case TGSI_OPCODE_SFL:
1891 assert( 0 );
1892 break;
1893
1894 case TGSI_OPCODE_SGT:
1895 assert( 0 );
1896 break;
1897
1898 case TGSI_OPCODE_SIN:
1899 FETCH( func, *inst, 0, 0, CHAN_X );
1900 emit_sin( func, 0 );
1901 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1902 STORE( func, *inst, 0, 0, chan_index );
1903 }
1904 break;
1905
1906 case TGSI_OPCODE_SLE:
1907 assert( 0 );
1908 break;
1909
1910 case TGSI_OPCODE_SNE:
1911 assert( 0 );
1912 break;
1913
1914 case TGSI_OPCODE_STR:
1915 assert( 0 );
1916 break;
1917
1918 case TGSI_OPCODE_TEX:
1919 emit_tempf(
1920 func,
1921 0,
1922 TGSI_EXEC_TEMP_ONE_I,
1923 TGSI_EXEC_TEMP_ONE_C );
1924 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1925 STORE( func, *inst, 0, 0, chan_index );
1926 }
1927 break;
1928
1929 case TGSI_OPCODE_TXD:
1930 assert( 0 );
1931 break;
1932
1933 case TGSI_OPCODE_UP2H:
1934 assert( 0 );
1935 break;
1936
1937 case TGSI_OPCODE_UP2US:
1938 assert( 0 );
1939 break;
1940
1941 case TGSI_OPCODE_UP4B:
1942 assert( 0 );
1943 break;
1944
1945 case TGSI_OPCODE_UP4UB:
1946 assert( 0 );
1947 break;
1948
1949 case TGSI_OPCODE_X2D:
1950 assert( 0 );
1951 break;
1952
1953 case TGSI_OPCODE_ARA:
1954 assert( 0 );
1955 break;
1956
1957 case TGSI_OPCODE_ARR:
1958 assert( 0 );
1959 break;
1960
1961 case TGSI_OPCODE_BRA:
1962 assert( 0 );
1963 break;
1964
1965 case TGSI_OPCODE_CAL:
1966 assert( 0 );
1967 break;
1968
1969 case TGSI_OPCODE_RET:
1970 case TGSI_OPCODE_END:
1971 #ifdef WIN32
1972 emit_retw( func, 16 );
1973 #else
1974 emit_ret( func );
1975 #endif
1976 break;
1977
1978 case TGSI_OPCODE_SSG:
1979 assert( 0 );
1980 break;
1981
1982 case TGSI_OPCODE_CMP:
1983 emit_cmp (func, inst);
1984 break;
1985
1986 case TGSI_OPCODE_SCS:
1987 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
1988 FETCH( func, *inst, 0, 0, CHAN_X );
1989 emit_cos( func, 0 );
1990 STORE( func, *inst, 0, 0, CHAN_X );
1991 }
1992 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
1993 FETCH( func, *inst, 0, 0, CHAN_Y );
1994 emit_sin( func, 0 );
1995 STORE( func, *inst, 0, 0, CHAN_Y );
1996 }
1997 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
1998 FETCH( func, *inst, 0, TGSI_EXEC_TEMP_00000000_I, TGSI_EXEC_TEMP_00000000_C );
1999 STORE( func, *inst, 0, 0, CHAN_Z );
2000 }
2001 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2002 FETCH( func, *inst, 0, TGSI_EXEC_TEMP_ONE_I, TGSI_EXEC_TEMP_ONE_C );
2003 STORE( func, *inst, 0, 0, CHAN_W );
2004 }
2005 break;
2006
2007 case TGSI_OPCODE_TXB:
2008 assert( 0 );
2009 break;
2010
2011 case TGSI_OPCODE_NRM:
2012 assert( 0 );
2013 break;
2014
2015 case TGSI_OPCODE_DIV:
2016 assert( 0 );
2017 break;
2018
2019 case TGSI_OPCODE_DP2:
2020 assert( 0 );
2021 break;
2022
2023 case TGSI_OPCODE_TXL:
2024 assert( 0 );
2025 break;
2026
2027 case TGSI_OPCODE_BRK:
2028 assert( 0 );
2029 break;
2030
2031 case TGSI_OPCODE_IF:
2032 assert( 0 );
2033 break;
2034
2035 case TGSI_OPCODE_LOOP:
2036 assert( 0 );
2037 break;
2038
2039 case TGSI_OPCODE_REP:
2040 assert( 0 );
2041 break;
2042
2043 case TGSI_OPCODE_ELSE:
2044 assert( 0 );
2045 break;
2046
2047 case TGSI_OPCODE_ENDIF:
2048 assert( 0 );
2049 break;
2050
2051 case TGSI_OPCODE_ENDLOOP:
2052 assert( 0 );
2053 break;
2054
2055 case TGSI_OPCODE_ENDREP:
2056 assert( 0 );
2057 break;
2058
2059 case TGSI_OPCODE_PUSHA:
2060 assert( 0 );
2061 break;
2062
2063 case TGSI_OPCODE_POPA:
2064 assert( 0 );
2065 break;
2066
2067 case TGSI_OPCODE_CEIL:
2068 assert( 0 );
2069 break;
2070
2071 case TGSI_OPCODE_I2F:
2072 assert( 0 );
2073 break;
2074
2075 case TGSI_OPCODE_NOT:
2076 assert( 0 );
2077 break;
2078
2079 case TGSI_OPCODE_TRUNC:
2080 assert( 0 );
2081 break;
2082
2083 case TGSI_OPCODE_SHL:
2084 assert( 0 );
2085 break;
2086
2087 case TGSI_OPCODE_SHR:
2088 assert( 0 );
2089 break;
2090
2091 case TGSI_OPCODE_AND:
2092 assert( 0 );
2093 break;
2094
2095 case TGSI_OPCODE_OR:
2096 assert( 0 );
2097 break;
2098
2099 case TGSI_OPCODE_MOD:
2100 assert( 0 );
2101 break;
2102
2103 case TGSI_OPCODE_XOR:
2104 assert( 0 );
2105 break;
2106
2107 case TGSI_OPCODE_SAD:
2108 assert( 0 );
2109 break;
2110
2111 case TGSI_OPCODE_TXF:
2112 assert( 0 );
2113 break;
2114
2115 case TGSI_OPCODE_TXQ:
2116 assert( 0 );
2117 break;
2118
2119 case TGSI_OPCODE_CONT:
2120 assert( 0 );
2121 break;
2122
2123 case TGSI_OPCODE_EMIT:
2124 assert( 0 );
2125 break;
2126
2127 case TGSI_OPCODE_ENDPRIM:
2128 assert( 0 );
2129 break;
2130
2131 default:
2132 assert( 0 );
2133 }
2134 }
2135
2136 static void
2137 emit_declaration(
2138 struct x86_function *func,
2139 struct tgsi_full_declaration *decl )
2140 {
2141 if( decl->Declaration.File == TGSI_FILE_INPUT ) {
2142 unsigned first, last, mask;
2143 unsigned i, j;
2144
2145 assert( decl->Declaration.Declare == TGSI_DECLARE_RANGE );
2146
2147 first = decl->u.DeclarationRange.First;
2148 last = decl->u.DeclarationRange.Last;
2149 mask = decl->Declaration.UsageMask;
2150
2151 /* Do not touch WPOS.xy */
2152 if( first == 0 ) {
2153 mask &= ~TGSI_WRITEMASK_XY;
2154 if( mask == TGSI_WRITEMASK_NONE ) {
2155 first++;
2156 }
2157 }
2158
2159 for( i = first; i <= last; i++ ) {
2160 for( j = 0; j < NUM_CHANNELS; j++ ) {
2161 if( mask & (1 << j) ) {
2162 switch( decl->Interpolation.Interpolate ) {
2163 case TGSI_INTERPOLATE_CONSTANT:
2164 emit_coef_a0( func, 0, i, j );
2165 emit_inputs( func, 0, i, j );
2166 break;
2167
2168 case TGSI_INTERPOLATE_LINEAR:
2169 emit_inputf( func, 0, 0, TGSI_SWIZZLE_X );
2170 emit_coef_dadx( func, 1, i, j );
2171 emit_inputf( func, 2, 0, TGSI_SWIZZLE_Y );
2172 emit_coef_dady( func, 3, i, j );
2173 emit_mul( func, 0, 1 ); /* x * dadx */
2174 emit_coef_a0( func, 4, i, j );
2175 emit_mul( func, 2, 3 ); /* y * dady */
2176 emit_add( func, 0, 4 ); /* x * dadx + a0 */
2177 emit_add( func, 0, 2 ); /* x * dadx + y * dady + a0 */
2178 emit_inputs( func, 0, i, j );
2179 break;
2180
2181 case TGSI_INTERPOLATE_PERSPECTIVE:
2182 emit_inputf( func, 0, 0, TGSI_SWIZZLE_X );
2183 emit_coef_dadx( func, 1, i, j );
2184 emit_inputf( func, 2, 0, TGSI_SWIZZLE_Y );
2185 emit_coef_dady( func, 3, i, j );
2186 emit_mul( func, 0, 1 ); /* x * dadx */
2187 emit_inputf( func, 4, 0, TGSI_SWIZZLE_W );
2188 emit_coef_a0( func, 5, i, j );
2189 emit_rcp( func, 4, 4 ); /* 1.0 / w */
2190 emit_mul( func, 2, 3 ); /* y * dady */
2191 emit_add( func, 0, 5 ); /* x * dadx + a0 */
2192 emit_add( func, 0, 2 ); /* x * dadx + y * dady + a0 */
2193 emit_mul( func, 0, 4 ); /* (x * dadx + y * dady + a0) / w */
2194 emit_inputs( func, 0, i, j );
2195 break;
2196
2197 default:
2198 assert( 0 );
2199 }
2200 }
2201 }
2202 }
2203 }
2204 }
2205
2206 unsigned
2207 tgsi_emit_sse2(
2208 struct tgsi_token *tokens,
2209 struct x86_function *func )
2210 {
2211 struct tgsi_parse_context parse;
2212
2213 DUMP_START();
2214
2215 func->csr = func->store;
2216
2217 emit_mov(
2218 func,
2219 get_input_base(),
2220 get_argument( 0 ) );
2221 emit_mov(
2222 func,
2223 get_output_base(),
2224 get_argument( 1 ) );
2225 emit_mov(
2226 func,
2227 get_const_base(),
2228 get_argument( 2 ) );
2229 emit_mov(
2230 func,
2231 get_temp_base(),
2232 get_argument( 3 ) );
2233
2234 tgsi_parse_init( &parse, tokens );
2235
2236 while( !tgsi_parse_end_of_tokens( &parse ) ) {
2237 tgsi_parse_token( &parse );
2238
2239 switch( parse.FullToken.Token.Type ) {
2240 case TGSI_TOKEN_TYPE_DECLARATION:
2241 break;
2242
2243 case TGSI_TOKEN_TYPE_INSTRUCTION:
2244 emit_instruction(
2245 func,
2246 &parse.FullToken.FullInstruction );
2247 break;
2248
2249 case TGSI_TOKEN_TYPE_IMMEDIATE:
2250 /* XXX implement this */
2251 assert(0);
2252 break;
2253
2254 default:
2255 assert( 0 );
2256 }
2257 }
2258
2259 tgsi_parse_free( &parse );
2260
2261 DUMP_END();
2262
2263 return 1;
2264 }
2265
2266 /**
2267 * Fragment shaders are responsible for interpolating shader inputs. Because on
2268 * x86 we have only 4 GP registers, and here we have 5 shader arguments (input,
2269 * output, const, temp and coef), the code is split into two phases --
2270 * DECLARATION and INSTRUCTION phase.
2271 * GP register holding the output argument is aliased with the coeff argument,
2272 * as outputs are not needed in the DECLARATION phase.
2273 */
2274 unsigned
2275 tgsi_emit_sse2_fs(
2276 struct tgsi_token *tokens,
2277 struct x86_function *func )
2278 {
2279 struct tgsi_parse_context parse;
2280 boolean instruction_phase = FALSE;
2281
2282 DUMP_START();
2283
2284 func->csr = func->store;
2285
2286 /* DECLARATION phase, do not load output argument. */
2287 emit_mov(
2288 func,
2289 get_input_base(),
2290 get_argument( 0 ) );
2291 emit_mov(
2292 func,
2293 get_const_base(),
2294 get_argument( 2 ) );
2295 emit_mov(
2296 func,
2297 get_temp_base(),
2298 get_argument( 3 ) );
2299 emit_mov(
2300 func,
2301 get_coef_base(),
2302 get_argument( 4 ) );
2303
2304 tgsi_parse_init( &parse, tokens );
2305
2306 while( !tgsi_parse_end_of_tokens( &parse ) ) {
2307 tgsi_parse_token( &parse );
2308
2309 switch( parse.FullToken.Token.Type ) {
2310 case TGSI_TOKEN_TYPE_DECLARATION:
2311 emit_declaration(
2312 func,
2313 &parse.FullToken.FullDeclaration );
2314 break;
2315
2316 case TGSI_TOKEN_TYPE_INSTRUCTION:
2317 if( !instruction_phase ) {
2318 /* INSTRUCTION phase, overwrite coeff with output. */
2319 instruction_phase = TRUE;
2320 emit_mov(
2321 func,
2322 get_output_base(),
2323 get_argument( 1 ) );
2324 }
2325 emit_instruction(
2326 func,
2327 &parse.FullToken.FullInstruction );
2328 break;
2329
2330 case TGSI_TOKEN_TYPE_IMMEDIATE:
2331 /* XXX implement this */
2332 assert(0);
2333 break;
2334
2335 default:
2336 assert( 0 );
2337 }
2338 }
2339
2340 tgsi_parse_free( &parse );
2341
2342 DUMP_END();
2343
2344 return 1;
2345 }
2346
2347 #endif /* i386 */