6df7588c925aeb479efa8c9b49713306b536b1af
[mesa.git] / src / mesa / pipe / tgsi / exec / tgsi_sse2.c
1 /**************************************************************************
2 *
3 * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28 #include "pipe/p_util.h"
29 #include "pipe/p_shader_tokens.h"
30 #include "pipe/tgsi/util/tgsi_parse.h"
31 #include "pipe/tgsi/util/tgsi_util.h"
32 #include "tgsi_exec.h"
33 #include "tgsi_sse2.h"
34
35 #include "x86/rtasm/x86sse.h"
36
37 #if defined(__i386__) || defined(__386__)
38
39 #define DUMP_SSE 0
40
41 #if DUMP_SSE
42
43 static void
44 _print_reg(
45 struct x86_reg reg )
46 {
47 if (reg.mod != mod_REG)
48 debug_printf( "[" );
49
50 switch( reg.file ) {
51 case file_REG32:
52 switch( reg.idx ) {
53 case reg_AX:
54 debug_printf( "EAX" );
55 break;
56 case reg_CX:
57 debug_printf( "ECX" );
58 break;
59 case reg_DX:
60 debug_printf( "EDX" );
61 break;
62 case reg_BX:
63 debug_printf( "EBX" );
64 break;
65 case reg_SP:
66 debug_printf( "ESP" );
67 break;
68 case reg_BP:
69 debug_printf( "EBP" );
70 break;
71 case reg_SI:
72 debug_printf( "ESI" );
73 break;
74 case reg_DI:
75 debug_printf( "EDI" );
76 break;
77 }
78 break;
79 case file_MMX:
80 assert( 0 );
81 break;
82 case file_XMM:
83 debug_printf( "XMM%u", reg.idx );
84 break;
85 case file_x87:
86 assert( 0 );
87 break;
88 }
89
90 if (reg.mod == mod_DISP8 ||
91 reg.mod == mod_DISP32)
92 debug_printf("+%d", reg.disp);
93
94 if (reg.mod != mod_REG)
95 debug_printf( "]" );
96 }
97
/**
 * Pad the dump output with spaces so that operands line up in a column.
 *
 * Prints one space for every character by which `op` is shorter than the
 * 10-character mnemonic column.  Mnemonics of 10 or more characters get no
 * padding; the previous `10 - strlen( op )` computation wrapped around in
 * unsigned arithmetic for longer names and looped almost forever.
 *
 * \param op  NUL-terminated mnemonic that has just been printed.
 */
static void
_fill(
   const char *op )
{
   const size_t column_width = 10;
   size_t printed = strlen( op );

   while( printed < column_width ) {
      debug_printf( " " );
      printed++;
   }
}
108
/*
 * Dump macros, active when DUMP_SSE is non-zero.  Each one prints a newline
 * followed by the mnemonic; the variants taking operands then pad the
 * mnemonic with _fill() and print the operands (I = immediate printed with
 * "%u", R = register/memory operand printed with _print_reg()).
 */
#define DUMP_START() \
   debug_printf( "\nsse-dump start ----------------" )

#define DUMP_END() \
   debug_printf( "\nsse-dump end ----------------\n" )

#define DUMP( OP ) \
   debug_printf( "\n%s", OP )

#define DUMP_I( OP, I ) \
   do { \
      debug_printf( "\n%s", OP ); \
      _fill( OP ); \
      debug_printf( "%u", I ); \
   } while( 0 )

#define DUMP_R( OP, R0 ) \
   do { \
      debug_printf( "\n%s", OP ); \
      _fill( OP ); \
      _print_reg( R0 ); \
   } while( 0 )

#define DUMP_RR( OP, R0, R1 ) \
   do { \
      debug_printf( "\n%s", OP ); \
      _fill( OP ); \
      _print_reg( R0 ); \
      debug_printf( ", " ); \
      _print_reg( R1 ); \
   } while( 0 )

#define DUMP_RRI( OP, R0, R1, I ) \
   do { \
      debug_printf( "\n%s", OP ); \
      _fill( OP ); \
      _print_reg( R0 ); \
      debug_printf( ", " ); \
      _print_reg( R1 ); \
      debug_printf( ", " ); \
      debug_printf( "%u", I ); \
   } while( 0 )
134
135 #else
136
/*
 * Dumping disabled: every dump macro collapses to a no-op expression and
 * none of its arguments are evaluated.
 */
#define DUMP_START()                ((void) 0)
#define DUMP_END()                  ((void) 0)
#define DUMP( OP )                  ((void) 0)
#define DUMP_I( OP, I )             ((void) 0)
#define DUMP_R( OP, R0 )            ((void) 0)
#define DUMP_RR( OP, R0, R1 )       ((void) 0)
#define DUMP_RRI( OP, R0, R1, I )   ((void) 0)
144
145 #endif
146
/* Loop header iterating CHAN over the four channel indices 0..3. */
#define FOR_EACH_CHANNEL( CHAN ) \
   for( (CHAN) = 0; (CHAN) < 4; (CHAN)++ )

/* Non-zero when bit CHAN is set in the write mask of destination 0. */
#define IS_DST0_CHANNEL_ENABLED( INST, CHAN ) \
   ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))

/* `if` header guarding a statement on destination 0 writing CHAN. */
#define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN ) \
   if( IS_DST0_CHANNEL_ENABLED( INST, CHAN ) )

/* Loop header visiting only the channels written by destination 0. */
#define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN ) \
   FOR_EACH_CHANNEL( CHAN ) \
      IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )

/* Channel indices. */
#define CHAN_X   0
#define CHAN_Y   1
#define CHAN_Z   2
#define CHAN_W   3

/* Scratch temporary vector used to pass values to helper functions. */
#define TEMP_R0   TGSI_EXEC_TEMP_R0
166
167 /**
168 * X86 utility functions.
169 */
170
171 static struct x86_reg
172 make_xmm(
173 unsigned xmm )
174 {
175 return x86_make_reg(
176 file_XMM,
177 (enum x86_reg_name) xmm );
178 }
179
180 /**
181 * X86 register mapping helpers.
182 */
183
184 static struct x86_reg
185 get_const_base( void )
186 {
187 return x86_make_reg(
188 file_REG32,
189 reg_CX );
190 }
191
192 static struct x86_reg
193 get_input_base( void )
194 {
195 return x86_make_reg(
196 file_REG32,
197 reg_AX );
198 }
199
200 static struct x86_reg
201 get_output_base( void )
202 {
203 return x86_make_reg(
204 file_REG32,
205 reg_DX );
206 }
207
208 static struct x86_reg
209 get_temp_base( void )
210 {
211 #ifdef WIN32
212 return x86_make_reg(
213 file_REG32,
214 reg_BX );
215 #else
216 return x86_make_reg(
217 file_REG32,
218 reg_SI );
219 #endif
220 }
221
222 static struct x86_reg
223 get_coef_base( void )
224 {
225 return get_output_base();
226 }
227
228 /**
229 * Data access helpers.
230 */
231
232 static struct x86_reg
233 get_argument(
234 unsigned index )
235 {
236 return x86_make_disp(
237 x86_make_reg( file_REG32, reg_SP ),
238 (index + 1) * 4 );
239 }
240
241 static struct x86_reg
242 get_const(
243 unsigned vec,
244 unsigned chan )
245 {
246 return x86_make_disp(
247 get_const_base(),
248 (vec * 4 + chan) * 4 );
249 }
250
251 static struct x86_reg
252 get_input(
253 unsigned vec,
254 unsigned chan )
255 {
256 return x86_make_disp(
257 get_input_base(),
258 (vec * 4 + chan) * 16 );
259 }
260
261 static struct x86_reg
262 get_output(
263 unsigned vec,
264 unsigned chan )
265 {
266 return x86_make_disp(
267 get_output_base(),
268 (vec * 4 + chan) * 16 );
269 }
270
271 static struct x86_reg
272 get_temp(
273 unsigned vec,
274 unsigned chan )
275 {
276 return x86_make_disp(
277 get_temp_base(),
278 (vec * 4 + chan) * 16 );
279 }
280
281 static struct x86_reg
282 get_coef(
283 unsigned vec,
284 unsigned chan,
285 unsigned member )
286 {
287 return x86_make_disp(
288 get_coef_base(),
289 ((vec * 3 + member) * 4 + chan) * 4 );
290 }
291
292 /**
293 * X86 rtasm wrappers.
294 */
295
296 static void
297 emit_addps(
298 struct x86_function *func,
299 struct x86_reg dst,
300 struct x86_reg src )
301 {
302 DUMP_RR( "ADDPS", dst, src );
303 sse_addps( func, dst, src );
304 }
305
306 static void
307 emit_andnps(
308 struct x86_function *func,
309 struct x86_reg dst,
310 struct x86_reg src )
311 {
312 DUMP_RR( "ANDNPS", dst, src );
313 sse_andnps( func, dst, src );
314 }
315
316 static void
317 emit_andps(
318 struct x86_function *func,
319 struct x86_reg dst,
320 struct x86_reg src )
321 {
322 DUMP_RR( "ANDPS", dst, src );
323 sse_andps( func, dst, src );
324 }
325
326 static void
327 emit_call(
328 struct x86_function *func,
329 void (* addr)() )
330 {
331 struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
332
333 DUMP_I( "CALL", addr );
334 x86_mov_reg_imm( func, ecx, (unsigned long) addr );
335 x86_call( func, ecx );
336 }
337
338 static void
339 emit_cmpps(
340 struct x86_function *func,
341 struct x86_reg dst,
342 struct x86_reg src,
343 enum sse_cc cc )
344 {
345 DUMP_RRI( "CMPPS", dst, src, cc );
346 sse_cmpps( func, dst, src, cc );
347 }
348
349 static void
350 emit_cvttps2dq(
351 struct x86_function *func,
352 struct x86_reg dst,
353 struct x86_reg src )
354 {
355 DUMP_RR( "CVTTPS2DQ", dst, src );
356 sse2_cvttps2dq( func, dst, src );
357 }
358
359 static void
360 emit_maxps(
361 struct x86_function *func,
362 struct x86_reg dst,
363 struct x86_reg src )
364 {
365 DUMP_RR( "MAXPS", dst, src );
366 sse_maxps( func, dst, src );
367 }
368
369 static void
370 emit_minps(
371 struct x86_function *func,
372 struct x86_reg dst,
373 struct x86_reg src )
374 {
375 DUMP_RR( "MINPS", dst, src );
376 sse_minps( func, dst, src );
377 }
378
379 static void
380 emit_mov(
381 struct x86_function *func,
382 struct x86_reg dst,
383 struct x86_reg src )
384 {
385 DUMP_RR( "MOV", dst, src );
386 x86_mov( func, dst, src );
387 }
388
389 static void
390 emit_movaps(
391 struct x86_function *func,
392 struct x86_reg dst,
393 struct x86_reg src )
394 {
395 DUMP_RR( "MOVAPS", dst, src );
396 sse_movaps( func, dst, src );
397 }
398
399 static void
400 emit_movss(
401 struct x86_function *func,
402 struct x86_reg dst,
403 struct x86_reg src )
404 {
405 DUMP_RR( "MOVSS", dst, src );
406 sse_movss( func, dst, src );
407 }
408
409 static void
410 emit_movups(
411 struct x86_function *func,
412 struct x86_reg dst,
413 struct x86_reg src )
414 {
415 DUMP_RR( "MOVUPS", dst, src );
416 sse_movups( func, dst, src );
417 }
418
419 static void
420 emit_mulps(
421 struct x86_function *func,
422 struct x86_reg dst,
423 struct x86_reg src )
424 {
425 DUMP_RR( "MULPS", dst, src );
426 sse_mulps( func, dst, src );
427 }
428
429 static void
430 emit_or(
431 struct x86_function *func,
432 struct x86_reg dst,
433 struct x86_reg src )
434 {
435 DUMP_RR( "OR", dst, src );
436 x86_or( func, dst, src );
437 }
438
439 static void
440 emit_orps(
441 struct x86_function *func,
442 struct x86_reg dst,
443 struct x86_reg src )
444 {
445 DUMP_RR( "ORPS", dst, src );
446 sse_orps( func, dst, src );
447 }
448
449 static void
450 emit_pmovmskb(
451 struct x86_function *func,
452 struct x86_reg dst,
453 struct x86_reg src )
454 {
455 DUMP_RR( "PMOVMSKB", dst, src );
456 sse_pmovmskb( func, dst, src );
457 }
458
459 static void
460 emit_pop(
461 struct x86_function *func,
462 struct x86_reg dst )
463 {
464 DUMP_R( "POP", dst );
465 x86_pop( func, dst );
466 }
467
468 static void
469 emit_push(
470 struct x86_function *func,
471 struct x86_reg dst )
472 {
473 DUMP_R( "PUSH", dst );
474 x86_push( func, dst );
475 }
476
477 static void
478 emit_rcpps(
479 struct x86_function *func,
480 struct x86_reg dst,
481 struct x86_reg src )
482 {
483 DUMP_RR( "RCPPS", dst, src );
484 sse2_rcpps( func, dst, src );
485 }
486
#ifdef WIN32

/**
 * Emit RET with an immediate of `size` via x86_retw(), dumping it first.
 * Only built when WIN32 is defined.
 */
static void
emit_retw( struct x86_function *func, unsigned size )
{
   DUMP_I( "RET", size );
   x86_retw( func, size );
}

#else

/**
 * Emit a plain RET via x86_ret(), dumping it first.
 * Only built when WIN32 is not defined.
 */
static void
emit_ret( struct x86_function *func )
{
   DUMP( "RET" );
   x86_ret( func );
}

#endif
505
506 static void
507 emit_rsqrtps(
508 struct x86_function *func,
509 struct x86_reg dst,
510 struct x86_reg src )
511 {
512 DUMP_RR( "RSQRTPS", dst, src );
513 sse_rsqrtps( func, dst, src );
514 }
515
516 static void
517 emit_shufps(
518 struct x86_function *func,
519 struct x86_reg dst,
520 struct x86_reg src,
521 unsigned char shuf )
522 {
523 DUMP_RRI( "SHUFPS", dst, src, shuf );
524 sse_shufps( func, dst, src, shuf );
525 }
526
527 static void
528 emit_subps(
529 struct x86_function *func,
530 struct x86_reg dst,
531 struct x86_reg src )
532 {
533 DUMP_RR( "SUBPS", dst, src );
534 sse_subps( func, dst, src );
535 }
536
537 static void
538 emit_xorps(
539 struct x86_function *func,
540 struct x86_reg dst,
541 struct x86_reg src )
542 {
543 DUMP_RR( "XORPS", dst, src );
544 sse_xorps( func, dst, src );
545 }
546
547 /**
548 * Data fetch helpers.
549 */
550
551 static void
552 emit_const(
553 struct x86_function *func,
554 unsigned xmm,
555 unsigned vec,
556 unsigned chan )
557 {
558 emit_movss(
559 func,
560 make_xmm( xmm ),
561 get_const( vec, chan ) );
562 emit_shufps(
563 func,
564 make_xmm( xmm ),
565 make_xmm( xmm ),
566 SHUF( 0, 0, 0, 0 ) );
567 }
568
569 static void
570 emit_inputf(
571 struct x86_function *func,
572 unsigned xmm,
573 unsigned vec,
574 unsigned chan )
575 {
576 emit_movups(
577 func,
578 make_xmm( xmm ),
579 get_input( vec, chan ) );
580 }
581
582 static void
583 emit_output(
584 struct x86_function *func,
585 unsigned xmm,
586 unsigned vec,
587 unsigned chan )
588 {
589 emit_movups(
590 func,
591 get_output( vec, chan ),
592 make_xmm( xmm ) );
593 }
594
595 static void
596 emit_tempf(
597 struct x86_function *func,
598 unsigned xmm,
599 unsigned vec,
600 unsigned chan )
601 {
602 emit_movaps(
603 func,
604 make_xmm( xmm ),
605 get_temp( vec, chan ) );
606 }
607
608 static void
609 emit_coef(
610 struct x86_function *func,
611 unsigned xmm,
612 unsigned vec,
613 unsigned chan,
614 unsigned member )
615 {
616 emit_movss(
617 func,
618 make_xmm( xmm ),
619 get_coef( vec, chan, member ) );
620 emit_shufps(
621 func,
622 make_xmm( xmm ),
623 make_xmm( xmm ),
624 SHUF( 0, 0, 0, 0 ) );
625 }
626
627 /**
628 * Data store helpers.
629 */
630
631 static void
632 emit_inputs(
633 struct x86_function *func,
634 unsigned xmm,
635 unsigned vec,
636 unsigned chan )
637 {
638 emit_movups(
639 func,
640 get_input( vec, chan ),
641 make_xmm( xmm ) );
642 }
643
644 static void
645 emit_temps(
646 struct x86_function *func,
647 unsigned xmm,
648 unsigned vec,
649 unsigned chan )
650 {
651 emit_movaps(
652 func,
653 get_temp( vec, chan ),
654 make_xmm( xmm ) );
655 }
656
657 static void
658 emit_addrs(
659 struct x86_function *func,
660 unsigned xmm,
661 unsigned vec,
662 unsigned chan )
663 {
664 emit_temps(
665 func,
666 xmm,
667 vec + TGSI_EXEC_NUM_TEMPS,
668 chan );
669 }
670
/**
 * Coefficient fetch helpers.
 */
674
/**
 * Load coefficient member 0 (a0) of (vec, chan) into XMM register `xmm`,
 * replicated across the register.
 */
static void
emit_coef_a0( struct x86_function *func, unsigned xmm, unsigned vec, unsigned chan )
{
   const unsigned member_a0 = 0;

   emit_coef( func, xmm, vec, chan, member_a0 );
}
689
/**
 * Load coefficient member 1 (dadx) of (vec, chan) into XMM register `xmm`,
 * replicated across the register.
 */
static void
emit_coef_dadx( struct x86_function *func, unsigned xmm, unsigned vec, unsigned chan )
{
   const unsigned member_dadx = 1;

   emit_coef( func, xmm, vec, chan, member_dadx );
}
704
/**
 * Load coefficient member 2 (dady) of (vec, chan) into XMM register `xmm`,
 * replicated across the register.
 */
static void
emit_coef_dady( struct x86_function *func, unsigned xmm, unsigned vec, unsigned chan )
{
   const unsigned member_dady = 2;

   emit_coef( func, xmm, vec, chan, member_dady );
}
719
720 /**
721 * Function call helpers.
722 */
723
/**
 * Save the four base registers on the stack: const, input, output, then
 * temp.  emit_pop_gp() restores them in the opposite order.
 */
static void
emit_push_gp( struct x86_function *func )
{
   emit_push( func, get_const_base() );
   emit_push( func, get_input_base() );
   emit_push( func, get_output_base() );

   /* The temp base must stay the last register pushed on non-win32
    * platforms (presumably because the value on top of the stack is what
    * the called helper receives as its argument).
    */
   emit_push( func, get_temp_base() );
}
744
/**
 * Restore the base registers saved by emit_push_gp().
 */
static void
emit_pop_gp( struct x86_function *func )
{
   /* Pops mirror the push order: temp, output, input, const. */
   emit_pop( func, get_temp_base() );
   emit_pop( func, get_output_base() );
   emit_pop( func, get_input_base() );
   emit_pop( func, get_const_base() );
}
764
765 static void
766 emit_func_call_dst(
767 struct x86_function *func,
768 unsigned xmm_dst,
769 void (*code)() )
770 {
771 emit_movaps(
772 func,
773 get_temp( TEMP_R0, 0 ),
774 make_xmm( xmm_dst ) );
775
776 emit_push_gp(
777 func );
778
779 #ifdef WIN32
780 emit_push(
781 func,
782 get_temp( TEMP_R0, 0 ) );
783 #endif
784
785 emit_call(
786 func,
787 code );
788
789 emit_pop_gp(
790 func );
791
792 emit_movaps(
793 func,
794 make_xmm( xmm_dst ),
795 get_temp( TEMP_R0, 0 ) );
796 }
797
798 static void
799 emit_func_call_dst_src(
800 struct x86_function *func,
801 unsigned xmm_dst,
802 unsigned xmm_src,
803 void (*code)() )
804 {
805 emit_movaps(
806 func,
807 get_temp( TEMP_R0, 1 ),
808 make_xmm( xmm_src ) );
809
810 emit_func_call_dst(
811 func,
812 xmm_dst,
813 code );
814 }
815
816 /**
817 * Low-level instruction translators.
818 */
819
820 static void
821 emit_abs(
822 struct x86_function *func,
823 unsigned xmm )
824 {
825 emit_andps(
826 func,
827 make_xmm( xmm ),
828 get_temp(
829 TGSI_EXEC_TEMP_7FFFFFFF_I,
830 TGSI_EXEC_TEMP_7FFFFFFF_C ) );
831 }
832
833 static void
834 emit_add(
835 struct x86_function *func,
836 unsigned xmm_dst,
837 unsigned xmm_src )
838 {
839 emit_addps(
840 func,
841 make_xmm( xmm_dst ),
842 make_xmm( xmm_src ) );
843 }
844
845 static void XSTDCALL
846 cos4f(
847 float *store )
848 {
849 #ifdef WIN32
850 store[0] = (float) cos( (double) store[0] );
851 store[1] = (float) cos( (double) store[1] );
852 store[2] = (float) cos( (double) store[2] );
853 store[3] = (float) cos( (double) store[3] );
854 #else
855 const unsigned X = TEMP_R0 * 16;
856 store[X + 0] = cosf( store[X + 0] );
857 store[X + 1] = cosf( store[X + 1] );
858 store[X + 2] = cosf( store[X + 2] );
859 store[X + 3] = cosf( store[X + 3] );
860 #endif
861 }
862
863 static void
864 emit_cos(
865 struct x86_function *func,
866 unsigned xmm_dst )
867 {
868 emit_func_call_dst(
869 func,
870 xmm_dst,
871 cos4f );
872 }
873
874 static void XSTDCALL
875 ex24f(
876 float *store )
877 {
878 #ifdef WIN32
879 store[0] = (float) pow( 2.0, (double) store[0] );
880 store[1] = (float) pow( 2.0, (double) store[1] );
881 store[2] = (float) pow( 2.0, (double) store[2] );
882 store[3] = (float) pow( 2.0, (double) store[3] );
883 #else
884 const unsigned X = TEMP_R0 * 16;
885 store[X + 0] = powf( 2.0f, store[X + 0] );
886 store[X + 1] = powf( 2.0f, store[X + 1] );
887 store[X + 2] = powf( 2.0f, store[X + 2] );
888 store[X + 3] = powf( 2.0f, store[X + 3] );
889 #endif
890 }
891
892 static void
893 emit_ex2(
894 struct x86_function *func,
895 unsigned xmm_dst )
896 {
897 emit_func_call_dst(
898 func,
899 xmm_dst,
900 ex24f );
901 }
902
903 static void
904 emit_f2it(
905 struct x86_function *func,
906 unsigned xmm )
907 {
908 emit_cvttps2dq(
909 func,
910 make_xmm( xmm ),
911 make_xmm( xmm ) );
912 }
913
914 static void XSTDCALL
915 flr4f(
916 float *store )
917 {
918 #ifdef WIN32
919 const unsigned X = 0;
920 #else
921 const unsigned X = TEMP_R0 * 16;
922 #endif
923 store[X + 0] = (float) floor( (double) store[X + 0] );
924 store[X + 1] = (float) floor( (double) store[X + 1] );
925 store[X + 2] = (float) floor( (double) store[X + 2] );
926 store[X + 3] = (float) floor( (double) store[X + 3] );
927 }
928
929 static void
930 emit_flr(
931 struct x86_function *func,
932 unsigned xmm_dst )
933 {
934 emit_func_call_dst(
935 func,
936 xmm_dst,
937 flr4f );
938 }
939
940 static void XSTDCALL
941 frc4f(
942 float *store )
943 {
944 #ifdef WIN32
945 const unsigned X = 0;
946 #else
947 const unsigned X = TEMP_R0 * 16;
948 #endif
949 store[X + 0] -= (float) floor( (double) store[X + 0] );
950 store[X + 1] -= (float) floor( (double) store[X + 1] );
951 store[X + 2] -= (float) floor( (double) store[X + 2] );
952 store[X + 3] -= (float) floor( (double) store[X + 3] );
953 }
954
955 static void
956 emit_frc(
957 struct x86_function *func,
958 unsigned xmm_dst )
959 {
960 emit_func_call_dst(
961 func,
962 xmm_dst,
963 frc4f );
964 }
965
966 static void XSTDCALL
967 lg24f(
968 float *store )
969 {
970 #ifdef WIN32
971 const unsigned X = 0;
972 #else
973 const unsigned X = TEMP_R0 * 16;
974 #endif
975 store[X + 0] = LOG2( store[X + 0] );
976 store[X + 1] = LOG2( store[X + 1] );
977 store[X + 2] = LOG2( store[X + 2] );
978 store[X + 3] = LOG2( store[X + 3] );
979 }
980
981 static void
982 emit_lg2(
983 struct x86_function *func,
984 unsigned xmm_dst )
985 {
986 emit_func_call_dst(
987 func,
988 xmm_dst,
989 lg24f );
990 }
991
992 static void
993 emit_MOV(
994 struct x86_function *func,
995 unsigned xmm_dst,
996 unsigned xmm_src )
997 {
998 emit_movups(
999 func,
1000 make_xmm( xmm_dst ),
1001 make_xmm( xmm_src ) );
1002 }
1003
1004 static void
1005 emit_mul (struct x86_function *func,
1006 unsigned xmm_dst,
1007 unsigned xmm_src)
1008 {
1009 emit_mulps(
1010 func,
1011 make_xmm( xmm_dst ),
1012 make_xmm( xmm_src ) );
1013 }
1014
1015 static void
1016 emit_neg(
1017 struct x86_function *func,
1018 unsigned xmm )
1019 {
1020 emit_xorps(
1021 func,
1022 make_xmm( xmm ),
1023 get_temp(
1024 TGSI_EXEC_TEMP_80000000_I,
1025 TGSI_EXEC_TEMP_80000000_C ) );
1026 }
1027
1028 static void XSTDCALL
1029 pow4f(
1030 float *store )
1031 {
1032 #ifdef WIN32
1033 store[0] = (float) pow( (double) store[0], (double) store[4] );
1034 store[1] = (float) pow( (double) store[1], (double) store[5] );
1035 store[2] = (float) pow( (double) store[2], (double) store[6] );
1036 store[3] = (float) pow( (double) store[3], (double) store[7] );
1037 #else
1038 const unsigned X = TEMP_R0 * 16;
1039 store[X + 0] = powf( store[X + 0], store[X + 4] );
1040 store[X + 1] = powf( store[X + 1], store[X + 5] );
1041 store[X + 2] = powf( store[X + 2], store[X + 6] );
1042 store[X + 3] = powf( store[X + 3], store[X + 7] );
1043 #endif
1044 }
1045
1046 static void
1047 emit_pow(
1048 struct x86_function *func,
1049 unsigned xmm_dst,
1050 unsigned xmm_src )
1051 {
1052 emit_func_call_dst_src(
1053 func,
1054 xmm_dst,
1055 xmm_src,
1056 pow4f );
1057 }
1058
1059 static void
1060 emit_rcp (
1061 struct x86_function *func,
1062 unsigned xmm_dst,
1063 unsigned xmm_src )
1064 {
1065 emit_rcpps(
1066 func,
1067 make_xmm( xmm_dst ),
1068 make_xmm( xmm_src ) );
1069 }
1070
1071 static void
1072 emit_rsqrt(
1073 struct x86_function *func,
1074 unsigned xmm_dst,
1075 unsigned xmm_src )
1076 {
1077 emit_rsqrtps(
1078 func,
1079 make_xmm( xmm_dst ),
1080 make_xmm( xmm_src ) );
1081 }
1082
1083 static void
1084 emit_setsign(
1085 struct x86_function *func,
1086 unsigned xmm )
1087 {
1088 emit_orps(
1089 func,
1090 make_xmm( xmm ),
1091 get_temp(
1092 TGSI_EXEC_TEMP_80000000_I,
1093 TGSI_EXEC_TEMP_80000000_C ) );
1094 }
1095
1096 static void XSTDCALL
1097 sin4f(
1098 float *store )
1099 {
1100 #ifdef WIN32
1101 store[0] = (float) sin( (double) store[0] );
1102 store[1] = (float) sin( (double) store[1] );
1103 store[2] = (float) sin( (double) store[2] );
1104 store[3] = (float) sin( (double) store[3] );
1105 #else
1106 const unsigned X = TEMP_R0 * 16;
1107 store[X + 0] = sinf( store[X + 0] );
1108 store[X + 1] = sinf( store[X + 1] );
1109 store[X + 2] = sinf( store[X + 2] );
1110 store[X + 3] = sinf( store[X + 3] );
1111 #endif
1112 }
1113
1114 static void
1115 emit_sin (struct x86_function *func,
1116 unsigned xmm_dst)
1117 {
1118 emit_func_call_dst(
1119 func,
1120 xmm_dst,
1121 sin4f );
1122 }
1123
1124 static void
1125 emit_sub(
1126 struct x86_function *func,
1127 unsigned xmm_dst,
1128 unsigned xmm_src )
1129 {
1130 emit_subps(
1131 func,
1132 make_xmm( xmm_dst ),
1133 make_xmm( xmm_src ) );
1134 }
1135
1136 /**
1137 * Register fetch.
1138 */
1139
1140 static void
1141 emit_fetch(
1142 struct x86_function *func,
1143 unsigned xmm,
1144 const struct tgsi_full_src_register *reg,
1145 const unsigned chan_index )
1146 {
1147 unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index );
1148
1149 switch( swizzle ) {
1150 case TGSI_EXTSWIZZLE_X:
1151 case TGSI_EXTSWIZZLE_Y:
1152 case TGSI_EXTSWIZZLE_Z:
1153 case TGSI_EXTSWIZZLE_W:
1154 switch( reg->SrcRegister.File ) {
1155 case TGSI_FILE_CONSTANT:
1156 emit_const(
1157 func,
1158 xmm,
1159 reg->SrcRegister.Index,
1160 swizzle );
1161 break;
1162
1163 case TGSI_FILE_INPUT:
1164 emit_inputf(
1165 func,
1166 xmm,
1167 reg->SrcRegister.Index,
1168 swizzle );
1169 break;
1170
1171 case TGSI_FILE_TEMPORARY:
1172 emit_tempf(
1173 func,
1174 xmm,
1175 reg->SrcRegister.Index,
1176 swizzle );
1177 break;
1178
1179 default:
1180 assert( 0 );
1181 }
1182 break;
1183
1184 case TGSI_EXTSWIZZLE_ZERO:
1185 emit_tempf(
1186 func,
1187 xmm,
1188 TGSI_EXEC_TEMP_00000000_I,
1189 TGSI_EXEC_TEMP_00000000_C );
1190 break;
1191
1192 case TGSI_EXTSWIZZLE_ONE:
1193 emit_tempf(
1194 func,
1195 xmm,
1196 TGSI_EXEC_TEMP_ONE_I,
1197 TGSI_EXEC_TEMP_ONE_C );
1198 break;
1199
1200 default:
1201 assert( 0 );
1202 }
1203
1204 switch( tgsi_util_get_full_src_register_sign_mode( reg, chan_index ) ) {
1205 case TGSI_UTIL_SIGN_CLEAR:
1206 emit_abs( func, xmm );
1207 break;
1208
1209 case TGSI_UTIL_SIGN_SET:
1210 emit_setsign( func, xmm );
1211 break;
1212
1213 case TGSI_UTIL_SIGN_TOGGLE:
1214 emit_neg( func, xmm );
1215 break;
1216
1217 case TGSI_UTIL_SIGN_KEEP:
1218 break;
1219 }
1220 }
1221
/* Fetch channel CHAN of source operand INDEX of INST into register XMM. */
#define FETCH( FUNC, INST, XMM, INDEX, CHAN ) \
   emit_fetch( (FUNC), (XMM), &(INST).FullSrcRegisters[(INDEX)], (CHAN) )
1224
1225 /**
1226 * Register store.
1227 */
1228
1229 static void
1230 emit_store(
1231 struct x86_function *func,
1232 unsigned xmm,
1233 const struct tgsi_full_dst_register *reg,
1234 const struct tgsi_full_instruction *inst,
1235 unsigned chan_index )
1236 {
1237 switch( reg->DstRegister.File ) {
1238 case TGSI_FILE_OUTPUT:
1239 emit_output(
1240 func,
1241 xmm,
1242 reg->DstRegister.Index,
1243 chan_index );
1244 break;
1245
1246 case TGSI_FILE_TEMPORARY:
1247 emit_temps(
1248 func,
1249 xmm,
1250 reg->DstRegister.Index,
1251 chan_index );
1252 break;
1253
1254 case TGSI_FILE_ADDRESS:
1255 emit_addrs(
1256 func,
1257 xmm,
1258 reg->DstRegister.Index,
1259 chan_index );
1260 break;
1261
1262 default:
1263 assert( 0 );
1264 }
1265
1266 switch( inst->Instruction.Saturate ) {
1267 case TGSI_SAT_NONE:
1268 break;
1269
1270 case TGSI_SAT_ZERO_ONE:
1271 // assert( 0 );
1272 break;
1273
1274 case TGSI_SAT_MINUS_PLUS_ONE:
1275 assert( 0 );
1276 break;
1277 }
1278 }
1279
/* Store register XMM to channel CHAN of destination operand INDEX of INST. */
#define STORE( FUNC, INST, XMM, INDEX, CHAN ) \
   emit_store( (FUNC), (XMM), &(INST).FullDstRegisters[(INDEX)], &(INST), (CHAN) )
1282
1283 /**
1284 * High-level instruction translators.
1285 */
1286
1287 static void
1288 emit_kil(
1289 struct x86_function *func,
1290 const struct tgsi_full_src_register *reg )
1291 {
1292 unsigned uniquemask;
1293 unsigned registers[4];
1294 unsigned nextregister = 0;
1295 unsigned firstchan = ~0;
1296 unsigned chan_index;
1297
1298 /* This mask stores component bits that were already tested. Note that
1299 * we test if the value is less than zero, so 1.0 and 0.0 need not to be
1300 * tested. */
1301 uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE);
1302
1303 FOR_EACH_CHANNEL( chan_index ) {
1304 unsigned swizzle;
1305
1306 /* unswizzle channel */
1307 swizzle = tgsi_util_get_full_src_register_extswizzle(
1308 reg,
1309 chan_index );
1310
1311 /* check if the component has not been already tested */
1312 if( !(uniquemask & (1 << swizzle)) ) {
1313 uniquemask |= 1 << swizzle;
1314
1315 /* allocate register */
1316 registers[chan_index] = nextregister;
1317 emit_fetch(
1318 func,
1319 nextregister,
1320 reg,
1321 chan_index );
1322 nextregister++;
1323
1324 /* mark the first channel used */
1325 if( firstchan == ~0 ) {
1326 firstchan = chan_index;
1327 }
1328 }
1329 }
1330
1331 emit_push(
1332 func,
1333 x86_make_reg( file_REG32, reg_AX ) );
1334 emit_push(
1335 func,
1336 x86_make_reg( file_REG32, reg_DX ) );
1337
1338 FOR_EACH_CHANNEL( chan_index ) {
1339 if( uniquemask & (1 << chan_index) ) {
1340 emit_cmpps(
1341 func,
1342 make_xmm( registers[chan_index] ),
1343 get_temp(
1344 TGSI_EXEC_TEMP_00000000_I,
1345 TGSI_EXEC_TEMP_00000000_C ),
1346 cc_LessThan );
1347
1348 if( chan_index == firstchan ) {
1349 emit_pmovmskb(
1350 func,
1351 x86_make_reg( file_REG32, reg_AX ),
1352 make_xmm( registers[chan_index] ) );
1353 }
1354 else {
1355 emit_pmovmskb(
1356 func,
1357 x86_make_reg( file_REG32, reg_DX ),
1358 make_xmm( registers[chan_index] ) );
1359 emit_or(
1360 func,
1361 x86_make_reg( file_REG32, reg_AX ),
1362 x86_make_reg( file_REG32, reg_DX ) );
1363 }
1364 }
1365 }
1366
1367 emit_or(
1368 func,
1369 get_temp(
1370 TGSI_EXEC_TEMP_KILMASK_I,
1371 TGSI_EXEC_TEMP_KILMASK_C ),
1372 x86_make_reg( file_REG32, reg_AX ) );
1373
1374 emit_pop(
1375 func,
1376 x86_make_reg( file_REG32, reg_DX ) );
1377 emit_pop(
1378 func,
1379 x86_make_reg( file_REG32, reg_AX ) );
1380 }
1381
1382 static void
1383 emit_setcc(
1384 struct x86_function *func,
1385 struct tgsi_full_instruction *inst,
1386 enum sse_cc cc )
1387 {
1388 unsigned chan_index;
1389
1390 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1391 FETCH( func, *inst, 0, 0, chan_index );
1392 FETCH( func, *inst, 1, 1, chan_index );
1393 emit_cmpps(
1394 func,
1395 make_xmm( 0 ),
1396 make_xmm( 1 ),
1397 cc );
1398 emit_andps(
1399 func,
1400 make_xmm( 0 ),
1401 get_temp(
1402 TGSI_EXEC_TEMP_ONE_I,
1403 TGSI_EXEC_TEMP_ONE_C ) );
1404 STORE( func, *inst, 0, 0, chan_index );
1405 }
1406 }
1407
1408 static void
1409 emit_cmp(
1410 struct x86_function *func,
1411 struct tgsi_full_instruction *inst )
1412 {
1413 unsigned chan_index;
1414
1415 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1416 FETCH( func, *inst, 0, 0, chan_index );
1417 FETCH( func, *inst, 1, 1, chan_index );
1418 FETCH( func, *inst, 2, 2, chan_index );
1419 emit_cmpps(
1420 func,
1421 make_xmm( 0 ),
1422 get_temp(
1423 TGSI_EXEC_TEMP_00000000_I,
1424 TGSI_EXEC_TEMP_00000000_C ),
1425 cc_LessThan );
1426 emit_andps(
1427 func,
1428 make_xmm( 1 ),
1429 make_xmm( 0 ) );
1430 emit_andnps(
1431 func,
1432 make_xmm( 0 ),
1433 make_xmm( 2 ) );
1434 emit_orps(
1435 func,
1436 make_xmm( 0 ),
1437 make_xmm( 1 ) );
1438 STORE( func, *inst, 0, 0, chan_index );
1439 }
1440 }
1441
1442 static int
1443 emit_instruction(
1444 struct x86_function *func,
1445 struct tgsi_full_instruction *inst )
1446 {
1447 unsigned chan_index;
1448
1449 switch( inst->Instruction.Opcode ) {
1450 case TGSI_OPCODE_ARL:
1451 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1452 FETCH( func, *inst, 0, 0, chan_index );
1453 emit_f2it( func, 0 );
1454 STORE( func, *inst, 0, 0, chan_index );
1455 }
1456 break;
1457
1458 case TGSI_OPCODE_MOV:
1459 /* TGSI_OPCODE_SWZ */
1460 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1461 FETCH( func, *inst, 0, 0, chan_index );
1462 STORE( func, *inst, 0, 0, chan_index );
1463 }
1464 break;
1465
1466 case TGSI_OPCODE_LIT:
1467 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1468 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1469 emit_tempf(
1470 func,
1471 0,
1472 TGSI_EXEC_TEMP_ONE_I,
1473 TGSI_EXEC_TEMP_ONE_C);
1474 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
1475 STORE( func, *inst, 0, 0, CHAN_X );
1476 }
1477 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1478 STORE( func, *inst, 0, 0, CHAN_W );
1479 }
1480 }
1481 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1482 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1483 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
1484 FETCH( func, *inst, 0, 0, CHAN_X );
1485 emit_maxps(
1486 func,
1487 make_xmm( 0 ),
1488 get_temp(
1489 TGSI_EXEC_TEMP_00000000_I,
1490 TGSI_EXEC_TEMP_00000000_C ) );
1491 STORE( func, *inst, 0, 0, CHAN_Y );
1492 }
1493 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1494 FETCH( func, *inst, 1, 0, CHAN_Y );
1495 emit_maxps(
1496 func,
1497 make_xmm( 1 ),
1498 get_temp(
1499 TGSI_EXEC_TEMP_00000000_I,
1500 TGSI_EXEC_TEMP_00000000_C ) );
1501 FETCH( func, *inst, 2, 0, CHAN_W );
1502 emit_minps(
1503 func,
1504 make_xmm( 2 ),
1505 get_temp(
1506 TGSI_EXEC_TEMP_128_I,
1507 TGSI_EXEC_TEMP_128_C ) );
1508 emit_maxps(
1509 func,
1510 make_xmm( 2 ),
1511 get_temp(
1512 TGSI_EXEC_TEMP_MINUS_128_I,
1513 TGSI_EXEC_TEMP_MINUS_128_C ) );
1514 emit_pow( func, 1, 2 );
1515 FETCH( func, *inst, 0, 0, CHAN_X );
1516 emit_xorps(
1517 func,
1518 make_xmm( 2 ),
1519 make_xmm( 2 ) );
1520 emit_cmpps(
1521 func,
1522 make_xmm( 2 ),
1523 make_xmm( 0 ),
1524 cc_LessThanEqual );
1525 emit_andps(
1526 func,
1527 make_xmm( 2 ),
1528 make_xmm( 1 ) );
1529 STORE( func, *inst, 2, 0, CHAN_Z );
1530 }
1531 }
1532 break;
1533
1534 case TGSI_OPCODE_RCP:
1535 /* TGSI_OPCODE_RECIP */
1536 FETCH( func, *inst, 0, 0, CHAN_X );
1537 emit_rcp( func, 0, 0 );
1538 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1539 STORE( func, *inst, 0, 0, chan_index );
1540 }
1541 break;
1542
1543 case TGSI_OPCODE_RSQ:
1544 /* TGSI_OPCODE_RECIPSQRT */
1545 FETCH( func, *inst, 0, 0, CHAN_X );
1546 emit_rsqrt( func, 0, 0 );
1547 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1548 STORE( func, *inst, 0, 0, chan_index );
1549 }
1550 break;
1551
1552 case TGSI_OPCODE_EXP:
1553 return 0;
1554 break;
1555
1556 case TGSI_OPCODE_LOG:
1557 return 0;
1558 break;
1559
1560 case TGSI_OPCODE_MUL:
1561 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1562 FETCH( func, *inst, 0, 0, chan_index );
1563 FETCH( func, *inst, 1, 1, chan_index );
1564 emit_mul( func, 0, 1 );
1565 STORE( func, *inst, 0, 0, chan_index );
1566 }
1567 break;
1568
1569 case TGSI_OPCODE_ADD:
1570 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1571 FETCH( func, *inst, 0, 0, chan_index );
1572 FETCH( func, *inst, 1, 1, chan_index );
1573 emit_add( func, 0, 1 );
1574 STORE( func, *inst, 0, 0, chan_index );
1575 }
1576 break;
1577
1578 case TGSI_OPCODE_DP3:
1579 /* TGSI_OPCODE_DOT3 */
1580 FETCH( func, *inst, 0, 0, CHAN_X );
1581 FETCH( func, *inst, 1, 1, CHAN_X );
1582 emit_mul( func, 0, 1 );
1583 FETCH( func, *inst, 1, 0, CHAN_Y );
1584 FETCH( func, *inst, 2, 1, CHAN_Y );
1585 emit_mul( func, 1, 2 );
1586 emit_add( func, 0, 1 );
1587 FETCH( func, *inst, 1, 0, CHAN_Z );
1588 FETCH( func, *inst, 2, 1, CHAN_Z );
1589 emit_mul( func, 1, 2 );
1590 emit_add( func, 0, 1 );
1591 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1592 STORE( func, *inst, 0, 0, chan_index );
1593 }
1594 break;
1595
1596 case TGSI_OPCODE_DP4:
1597 /* TGSI_OPCODE_DOT4 */
1598 FETCH( func, *inst, 0, 0, CHAN_X );
1599 FETCH( func, *inst, 1, 1, CHAN_X );
1600 emit_mul( func, 0, 1 );
1601 FETCH( func, *inst, 1, 0, CHAN_Y );
1602 FETCH( func, *inst, 2, 1, CHAN_Y );
1603 emit_mul( func, 1, 2 );
1604 emit_add( func, 0, 1 );
1605 FETCH( func, *inst, 1, 0, CHAN_Z );
1606 FETCH( func, *inst, 2, 1, CHAN_Z );
1607 emit_mul(func, 1, 2 );
1608 emit_add(func, 0, 1 );
1609 FETCH( func, *inst, 1, 0, CHAN_W );
1610 FETCH( func, *inst, 2, 1, CHAN_W );
1611 emit_mul( func, 1, 2 );
1612 emit_add( func, 0, 1 );
1613 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1614 STORE( func, *inst, 0, 0, chan_index );
1615 }
1616 break;
1617
1618 case TGSI_OPCODE_DST:
1619 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
1620 emit_tempf(
1621 func,
1622 0,
1623 TGSI_EXEC_TEMP_ONE_I,
1624 TGSI_EXEC_TEMP_ONE_C );
1625 STORE( func, *inst, 0, 0, CHAN_X );
1626 }
1627 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
1628 FETCH( func, *inst, 0, 0, CHAN_Y );
1629 FETCH( func, *inst, 1, 1, CHAN_Y );
1630 emit_mul( func, 0, 1 );
1631 STORE( func, *inst, 0, 0, CHAN_Y );
1632 }
1633 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
1634 FETCH( func, *inst, 0, 0, CHAN_Z );
1635 STORE( func, *inst, 0, 0, CHAN_Z );
1636 }
1637 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
1638 FETCH( func, *inst, 0, 1, CHAN_W );
1639 STORE( func, *inst, 0, 0, CHAN_W );
1640 }
1641 break;
1642
1643 case TGSI_OPCODE_MIN:
1644 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1645 FETCH( func, *inst, 0, 0, chan_index );
1646 FETCH( func, *inst, 1, 1, chan_index );
1647 emit_minps(
1648 func,
1649 make_xmm( 0 ),
1650 make_xmm( 1 ) );
1651 STORE( func, *inst, 0, 0, chan_index );
1652 }
1653 break;
1654
1655 case TGSI_OPCODE_MAX:
1656 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1657 FETCH( func, *inst, 0, 0, chan_index );
1658 FETCH( func, *inst, 1, 1, chan_index );
1659 emit_maxps(
1660 func,
1661 make_xmm( 0 ),
1662 make_xmm( 1 ) );
1663 STORE( func, *inst, 0, 0, chan_index );
1664 }
1665 break;
1666
1667 case TGSI_OPCODE_SLT:
1668 /* TGSI_OPCODE_SETLT */
1669 emit_setcc( func, inst, cc_LessThan );
1670 break;
1671
1672 case TGSI_OPCODE_SGE:
1673 /* TGSI_OPCODE_SETGE */
1674 emit_setcc( func, inst, cc_NotLessThan );
1675 break;
1676
1677 case TGSI_OPCODE_MAD:
1678 /* TGSI_OPCODE_MADD */
1679 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1680 FETCH( func, *inst, 0, 0, chan_index );
1681 FETCH( func, *inst, 1, 1, chan_index );
1682 FETCH( func, *inst, 2, 2, chan_index );
1683 emit_mul( func, 0, 1 );
1684 emit_add( func, 0, 2 );
1685 STORE( func, *inst, 0, 0, chan_index );
1686 }
1687 break;
1688
1689 case TGSI_OPCODE_SUB:
1690 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1691 FETCH( func, *inst, 0, 0, chan_index );
1692 FETCH( func, *inst, 1, 1, chan_index );
1693 emit_sub( func, 0, 1 );
1694 STORE( func, *inst, 0, 0, chan_index );
1695 }
1696 break;
1697
1698 case TGSI_OPCODE_LERP:
1699 /* TGSI_OPCODE_LRP */
1700 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1701 FETCH( func, *inst, 0, 0, chan_index );
1702 FETCH( func, *inst, 1, 1, chan_index );
1703 FETCH( func, *inst, 2, 2, chan_index );
1704 emit_sub( func, 1, 2 );
1705 emit_mul( func, 0, 1 );
1706 emit_add( func, 0, 2 );
1707 STORE( func, *inst, 0, 0, chan_index );
1708 }
1709 break;
1710
1711 case TGSI_OPCODE_CND:
1712 return 0;
1713 break;
1714
1715 case TGSI_OPCODE_CND0:
1716 return 0;
1717 break;
1718
1719 case TGSI_OPCODE_DOT2ADD:
1720 /* TGSI_OPCODE_DP2A */
1721 return 0;
1722 break;
1723
1724 case TGSI_OPCODE_INDEX:
1725 return 0;
1726 break;
1727
1728 case TGSI_OPCODE_NEGATE:
1729 return 0;
1730 break;
1731
1732 case TGSI_OPCODE_FRAC:
1733 /* TGSI_OPCODE_FRC */
1734 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1735 FETCH( func, *inst, 0, 0, chan_index );
1736 emit_frc( func, 0 );
1737 STORE( func, *inst, 0, 0, chan_index );
1738 }
1739 break;
1740
1741 case TGSI_OPCODE_CLAMP:
1742 return 0;
1743 break;
1744
1745 case TGSI_OPCODE_FLOOR:
1746 /* TGSI_OPCODE_FLR */
1747 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1748 FETCH( func, *inst, 0, 0, chan_index );
1749 emit_flr( func, 0 );
1750 STORE( func, *inst, 0, 0, chan_index );
1751 }
1752 break;
1753
1754 case TGSI_OPCODE_ROUND:
1755 return 0;
1756 break;
1757
1758 case TGSI_OPCODE_EXPBASE2:
1759 /* TGSI_OPCODE_EX2 */
1760 FETCH( func, *inst, 0, 0, CHAN_X );
1761 emit_ex2( func, 0 );
1762 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1763 STORE( func, *inst, 0, 0, chan_index );
1764 }
1765 break;
1766
1767 case TGSI_OPCODE_LOGBASE2:
1768 /* TGSI_OPCODE_LG2 */
1769 FETCH( func, *inst, 0, 0, CHAN_X );
1770 emit_lg2( func, 0 );
1771 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1772 STORE( func, *inst, 0, 0, chan_index );
1773 }
1774 break;
1775
1776 case TGSI_OPCODE_POWER:
1777 /* TGSI_OPCODE_POW */
1778 FETCH( func, *inst, 0, 0, CHAN_X );
1779 FETCH( func, *inst, 1, 1, CHAN_X );
1780 emit_pow( func, 0, 1 );
1781 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1782 STORE( func, *inst, 0, 0, chan_index );
1783 }
1784 break;
1785
1786 case TGSI_OPCODE_CROSSPRODUCT:
1787 /* TGSI_OPCODE_XPD */
1788 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1789 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
1790 FETCH( func, *inst, 1, 1, CHAN_Z );
1791 FETCH( func, *inst, 3, 0, CHAN_Z );
1792 }
1793 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1794 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1795 FETCH( func, *inst, 0, 0, CHAN_Y );
1796 FETCH( func, *inst, 4, 1, CHAN_Y );
1797 }
1798 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
1799 emit_MOV( func, 2, 0 );
1800 emit_mul( func, 2, 1 );
1801 emit_MOV( func, 5, 3 );
1802 emit_mul( func, 5, 4 );
1803 emit_sub( func, 2, 5 );
1804 STORE( func, *inst, 2, 0, CHAN_X );
1805 }
1806 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1807 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1808 FETCH( func, *inst, 2, 1, CHAN_X );
1809 FETCH( func, *inst, 5, 0, CHAN_X );
1810 }
1811 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
1812 emit_mul( func, 3, 2 );
1813 emit_mul( func, 1, 5 );
1814 emit_sub( func, 3, 1 );
1815 STORE( func, *inst, 3, 0, CHAN_Y );
1816 }
1817 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
1818 emit_mul( func, 5, 4 );
1819 emit_mul( func, 0, 2 );
1820 emit_sub( func, 5, 0 );
1821 STORE( func, *inst, 5, 0, CHAN_Z );
1822 }
1823 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
1824 FETCH( func, *inst, 0, TGSI_EXEC_TEMP_ONE_I, TGSI_EXEC_TEMP_ONE_C );
1825 STORE( func, *inst, 0, 0, CHAN_W );
1826 }
1827 break;
1828
1829 case TGSI_OPCODE_MULTIPLYMATRIX:
1830 return 0;
1831 break;
1832
1833 case TGSI_OPCODE_ABS:
1834 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1835 FETCH( func, *inst, 0, 0, chan_index );
1836 emit_abs( func, 0) ;
1837
1838 STORE( func, *inst, 0, 0, chan_index );
1839 }
1840 break;
1841
1842 case TGSI_OPCODE_RCC:
1843 return 0;
1844 break;
1845
1846 case TGSI_OPCODE_DPH:
1847 FETCH( func, *inst, 0, 0, CHAN_X );
1848 FETCH( func, *inst, 1, 1, CHAN_X );
1849 emit_mul( func, 0, 1 );
1850 FETCH( func, *inst, 1, 0, CHAN_Y );
1851 FETCH( func, *inst, 2, 1, CHAN_Y );
1852 emit_mul( func, 1, 2 );
1853 emit_add( func, 0, 1 );
1854 FETCH( func, *inst, 1, 0, CHAN_Z );
1855 FETCH( func, *inst, 2, 1, CHAN_Z );
1856 emit_mul( func, 1, 2 );
1857 emit_add( func, 0, 1 );
1858 FETCH( func, *inst, 1, 1, CHAN_W );
1859 emit_add( func, 0, 1 );
1860 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1861 STORE( func, *inst, 0, 0, chan_index );
1862 }
1863 break;
1864
1865 case TGSI_OPCODE_COS:
1866 FETCH( func, *inst, 0, 0, CHAN_X );
1867 emit_cos( func, 0 );
1868 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1869 STORE( func, *inst, 0, 0, chan_index );
1870 }
1871 break;
1872
1873 case TGSI_OPCODE_DDX:
1874 return 0;
1875 break;
1876
1877 case TGSI_OPCODE_DDY:
1878 return 0;
1879 break;
1880
1881 case TGSI_OPCODE_KIL:
1882 emit_kil( func, &inst->FullSrcRegisters[0] );
1883 break;
1884
1885 case TGSI_OPCODE_PK2H:
1886 return 0;
1887 break;
1888
1889 case TGSI_OPCODE_PK2US:
1890 return 0;
1891 break;
1892
1893 case TGSI_OPCODE_PK4B:
1894 return 0;
1895 break;
1896
1897 case TGSI_OPCODE_PK4UB:
1898 return 0;
1899 break;
1900
1901 case TGSI_OPCODE_RFL:
1902 return 0;
1903 break;
1904
1905 case TGSI_OPCODE_SEQ:
1906 return 0;
1907 break;
1908
1909 case TGSI_OPCODE_SFL:
1910 return 0;
1911 break;
1912
1913 case TGSI_OPCODE_SGT:
1914 return 0;
1915 break;
1916
1917 case TGSI_OPCODE_SIN:
1918 FETCH( func, *inst, 0, 0, CHAN_X );
1919 emit_sin( func, 0 );
1920 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1921 STORE( func, *inst, 0, 0, chan_index );
1922 }
1923 break;
1924
1925 case TGSI_OPCODE_SLE:
1926 return 0;
1927 break;
1928
1929 case TGSI_OPCODE_SNE:
1930 return 0;
1931 break;
1932
1933 case TGSI_OPCODE_STR:
1934 return 0;
1935 break;
1936
1937 case TGSI_OPCODE_TEX:
1938 if (0) {
1939 /* Disable dummy texture code:
1940 */
1941 emit_tempf(
1942 func,
1943 0,
1944 TGSI_EXEC_TEMP_ONE_I,
1945 TGSI_EXEC_TEMP_ONE_C );
1946 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1947 STORE( func, *inst, 0, 0, chan_index );
1948 }
1949 }
1950 else {
1951 return 0;
1952 }
1953 break;
1954
1955 case TGSI_OPCODE_TXD:
1956 return 0;
1957 break;
1958
1959 case TGSI_OPCODE_UP2H:
1960 return 0;
1961 break;
1962
1963 case TGSI_OPCODE_UP2US:
1964 return 0;
1965 break;
1966
1967 case TGSI_OPCODE_UP4B:
1968 return 0;
1969 break;
1970
1971 case TGSI_OPCODE_UP4UB:
1972 return 0;
1973 break;
1974
1975 case TGSI_OPCODE_X2D:
1976 return 0;
1977 break;
1978
1979 case TGSI_OPCODE_ARA:
1980 return 0;
1981 break;
1982
1983 case TGSI_OPCODE_ARR:
1984 return 0;
1985 break;
1986
1987 case TGSI_OPCODE_BRA:
1988 return 0;
1989 break;
1990
1991 case TGSI_OPCODE_CAL:
1992 return 0;
1993 break;
1994
1995 case TGSI_OPCODE_RET:
1996 case TGSI_OPCODE_END:
1997 #ifdef WIN32
1998 emit_retw( func, 16 );
1999 #else
2000 emit_ret( func );
2001 #endif
2002 break;
2003
2004 case TGSI_OPCODE_SSG:
2005 return 0;
2006 break;
2007
2008 case TGSI_OPCODE_CMP:
2009 emit_cmp (func, inst);
2010 break;
2011
2012 case TGSI_OPCODE_SCS:
2013 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
2014 FETCH( func, *inst, 0, 0, CHAN_X );
2015 emit_cos( func, 0 );
2016 STORE( func, *inst, 0, 0, CHAN_X );
2017 }
2018 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2019 FETCH( func, *inst, 0, 0, CHAN_Y );
2020 emit_sin( func, 0 );
2021 STORE( func, *inst, 0, 0, CHAN_Y );
2022 }
2023 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2024 FETCH( func, *inst, 0, TGSI_EXEC_TEMP_00000000_I, TGSI_EXEC_TEMP_00000000_C );
2025 STORE( func, *inst, 0, 0, CHAN_Z );
2026 }
2027 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2028 FETCH( func, *inst, 0, TGSI_EXEC_TEMP_ONE_I, TGSI_EXEC_TEMP_ONE_C );
2029 STORE( func, *inst, 0, 0, CHAN_W );
2030 }
2031 break;
2032
2033 case TGSI_OPCODE_TXB:
2034 return 0;
2035 break;
2036
2037 case TGSI_OPCODE_NRM:
2038 return 0;
2039 break;
2040
2041 case TGSI_OPCODE_DIV:
2042 return 0;
2043 break;
2044
2045 case TGSI_OPCODE_DP2:
2046 return 0;
2047 break;
2048
2049 case TGSI_OPCODE_TXL:
2050 return 0;
2051 break;
2052
2053 case TGSI_OPCODE_BRK:
2054 return 0;
2055 break;
2056
2057 case TGSI_OPCODE_IF:
2058 return 0;
2059 break;
2060
2061 case TGSI_OPCODE_LOOP:
2062 return 0;
2063 break;
2064
2065 case TGSI_OPCODE_REP:
2066 return 0;
2067 break;
2068
2069 case TGSI_OPCODE_ELSE:
2070 return 0;
2071 break;
2072
2073 case TGSI_OPCODE_ENDIF:
2074 return 0;
2075 break;
2076
2077 case TGSI_OPCODE_ENDLOOP:
2078 return 0;
2079 break;
2080
2081 case TGSI_OPCODE_ENDREP:
2082 return 0;
2083 break;
2084
2085 case TGSI_OPCODE_PUSHA:
2086 return 0;
2087 break;
2088
2089 case TGSI_OPCODE_POPA:
2090 return 0;
2091 break;
2092
2093 case TGSI_OPCODE_CEIL:
2094 return 0;
2095 break;
2096
2097 case TGSI_OPCODE_I2F:
2098 return 0;
2099 break;
2100
2101 case TGSI_OPCODE_NOT:
2102 return 0;
2103 break;
2104
2105 case TGSI_OPCODE_TRUNC:
2106 return 0;
2107 break;
2108
2109 case TGSI_OPCODE_SHL:
2110 return 0;
2111 break;
2112
2113 case TGSI_OPCODE_SHR:
2114 return 0;
2115 break;
2116
2117 case TGSI_OPCODE_AND:
2118 return 0;
2119 break;
2120
2121 case TGSI_OPCODE_OR:
2122 return 0;
2123 break;
2124
2125 case TGSI_OPCODE_MOD:
2126 return 0;
2127 break;
2128
2129 case TGSI_OPCODE_XOR:
2130 return 0;
2131 break;
2132
2133 case TGSI_OPCODE_SAD:
2134 return 0;
2135 break;
2136
2137 case TGSI_OPCODE_TXF:
2138 return 0;
2139 break;
2140
2141 case TGSI_OPCODE_TXQ:
2142 return 0;
2143 break;
2144
2145 case TGSI_OPCODE_CONT:
2146 return 0;
2147 break;
2148
2149 case TGSI_OPCODE_EMIT:
2150 return 0;
2151 break;
2152
2153 case TGSI_OPCODE_ENDPRIM:
2154 return 0;
2155 break;
2156
2157 default:
2158 return 0;
2159 }
2160
2161 return 1;
2162 }
2163
2164 static void
2165 emit_declaration(
2166 struct x86_function *func,
2167 struct tgsi_full_declaration *decl )
2168 {
2169 if( decl->Declaration.File == TGSI_FILE_INPUT ) {
2170 unsigned first, last, mask;
2171 unsigned i, j;
2172
2173 assert( decl->Declaration.Declare == TGSI_DECLARE_RANGE );
2174
2175 first = decl->u.DeclarationRange.First;
2176 last = decl->u.DeclarationRange.Last;
2177 mask = decl->Declaration.UsageMask;
2178
2179 for( i = first; i <= last; i++ ) {
2180 for( j = 0; j < NUM_CHANNELS; j++ ) {
2181 if( mask & (1 << j) ) {
2182 switch( decl->Interpolation.Interpolate ) {
2183 case TGSI_INTERPOLATE_CONSTANT:
2184 emit_coef_a0( func, 0, i, j );
2185 emit_inputs( func, 0, i, j );
2186 break;
2187
2188 case TGSI_INTERPOLATE_LINEAR:
2189 emit_inputf( func, 0, 0, TGSI_SWIZZLE_X );
2190 emit_coef_dadx( func, 1, i, j );
2191 emit_inputf( func, 2, 0, TGSI_SWIZZLE_Y );
2192 emit_coef_dady( func, 3, i, j );
2193 emit_mul( func, 0, 1 ); /* x * dadx */
2194 emit_coef_a0( func, 4, i, j );
2195 emit_mul( func, 2, 3 ); /* y * dady */
2196 emit_add( func, 0, 4 ); /* x * dadx + a0 */
2197 emit_add( func, 0, 2 ); /* x * dadx + y * dady + a0 */
2198 emit_inputs( func, 0, i, j );
2199 break;
2200
2201 case TGSI_INTERPOLATE_PERSPECTIVE:
2202 emit_inputf( func, 0, 0, TGSI_SWIZZLE_X );
2203 emit_coef_dadx( func, 1, i, j );
2204 emit_inputf( func, 2, 0, TGSI_SWIZZLE_Y );
2205 emit_coef_dady( func, 3, i, j );
2206 emit_mul( func, 0, 1 ); /* x * dadx */
2207 emit_inputf( func, 4, 0, TGSI_SWIZZLE_W );
2208 emit_coef_a0( func, 5, i, j );
2209 emit_rcp( func, 4, 4 ); /* 1.0 / w */
2210 emit_mul( func, 2, 3 ); /* y * dady */
2211 emit_add( func, 0, 5 ); /* x * dadx + a0 */
2212 emit_add( func, 0, 2 ); /* x * dadx + y * dady + a0 */
2213 emit_mul( func, 0, 4 ); /* (x * dadx + y * dady + a0) / w */
2214 emit_inputs( func, 0, i, j );
2215 break;
2216
2217 default:
2218 assert( 0 );
2219 break;
2220 }
2221 }
2222 }
2223 }
2224 }
2225 }
2226
2227 unsigned
2228 tgsi_emit_sse2(
2229 struct tgsi_token *tokens,
2230 struct x86_function *func )
2231 {
2232 struct tgsi_parse_context parse;
2233 unsigned ok = 1;
2234
2235 DUMP_START();
2236
2237 func->csr = func->store;
2238
2239 emit_mov(
2240 func,
2241 get_input_base(),
2242 get_argument( 0 ) );
2243 emit_mov(
2244 func,
2245 get_output_base(),
2246 get_argument( 1 ) );
2247 emit_mov(
2248 func,
2249 get_const_base(),
2250 get_argument( 2 ) );
2251 emit_mov(
2252 func,
2253 get_temp_base(),
2254 get_argument( 3 ) );
2255
2256 tgsi_parse_init( &parse, tokens );
2257
2258 while( !tgsi_parse_end_of_tokens( &parse ) && ok ) {
2259 tgsi_parse_token( &parse );
2260
2261 switch( parse.FullToken.Token.Type ) {
2262 case TGSI_TOKEN_TYPE_DECLARATION:
2263 break;
2264
2265 case TGSI_TOKEN_TYPE_INSTRUCTION:
2266 ok = emit_instruction(
2267 func,
2268 &parse.FullToken.FullInstruction );
2269
2270 if (!ok) {
2271 debug_printf("failed to translate tgsi opcode %d\n",
2272 parse.FullToken.FullInstruction.Instruction.Opcode );
2273 }
2274 break;
2275
2276 case TGSI_TOKEN_TYPE_IMMEDIATE:
2277 /* XXX implement this */
2278 ok = 0;
2279 debug_printf("failed to emit immediate value\n");
2280 break;
2281
2282 default:
2283 assert( 0 );
2284 ok = 0;
2285 break;
2286 }
2287 }
2288
2289 tgsi_parse_free( &parse );
2290
2291 DUMP_END();
2292
2293 return ok;
2294 }
2295
2296 /**
2297 * Fragment shaders are responsible for interpolating shader inputs. Because on
2298 * x86 we have only 4 GP registers, and here we have 5 shader arguments (input,
2299 * output, const, temp and coef), the code is split into two phases --
2300 * DECLARATION and INSTRUCTION phase.
2301 * GP register holding the output argument is aliased with the coeff argument,
2302 * as outputs are not needed in the DECLARATION phase.
2303 */
2304 unsigned
2305 tgsi_emit_sse2_fs(
2306 struct tgsi_token *tokens,
2307 struct x86_function *func )
2308 {
2309 struct tgsi_parse_context parse;
2310 boolean instruction_phase = FALSE;
2311
2312 DUMP_START();
2313
2314 func->csr = func->store;
2315
2316 /* DECLARATION phase, do not load output argument. */
2317 emit_mov(
2318 func,
2319 get_input_base(),
2320 get_argument( 0 ) );
2321 emit_mov(
2322 func,
2323 get_const_base(),
2324 get_argument( 2 ) );
2325 emit_mov(
2326 func,
2327 get_temp_base(),
2328 get_argument( 3 ) );
2329 emit_mov(
2330 func,
2331 get_coef_base(),
2332 get_argument( 4 ) );
2333
2334 tgsi_parse_init( &parse, tokens );
2335
2336 while( !tgsi_parse_end_of_tokens( &parse ) ) {
2337 tgsi_parse_token( &parse );
2338
2339 switch( parse.FullToken.Token.Type ) {
2340 case TGSI_TOKEN_TYPE_DECLARATION:
2341 emit_declaration(
2342 func,
2343 &parse.FullToken.FullDeclaration );
2344 break;
2345
2346 case TGSI_TOKEN_TYPE_INSTRUCTION:
2347 if( !instruction_phase ) {
2348 /* INSTRUCTION phase, overwrite coeff with output. */
2349 instruction_phase = TRUE;
2350 emit_mov(
2351 func,
2352 get_output_base(),
2353 get_argument( 1 ) );
2354 }
2355 emit_instruction(
2356 func,
2357 &parse.FullToken.FullInstruction );
2358 break;
2359
2360 case TGSI_TOKEN_TYPE_IMMEDIATE:
2361 /* XXX implement this */
2362 assert(0);
2363 break;
2364
2365 default:
2366 assert( 0 );
2367 }
2368 }
2369
2370 tgsi_parse_free( &parse );
2371
2372 DUMP_END();
2373
2374 return 1;
2375 }
2376
2377 #endif /* i386 */