src/gallium/auxiliary/tgsi/tgsi_sse2.c
1 /**************************************************************************
2 *
3 * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28 #include "pipe/p_util.h"
29 #include "pipe/p_shader_tokens.h"
30 #include "tgsi/tgsi_parse.h"
31 #include "tgsi/tgsi_util.h"
32 #include "tgsi_exec.h"
33 #include "tgsi_sse2.h"
34
35 #include "rtasm/rtasm_x86sse.h"
36
37 #ifdef PIPE_ARCH_X86
38
     39 /* Use a Newton-Raphson refinement step for 1/sqrt() (see emit_rsqrt()).
     40  *
     41  * Enabling this costs about 100 fps (close to 10%) in gears:
42 */
43 #define HIGH_PRECISION 1
44
45
46 #define FOR_EACH_CHANNEL( CHAN )\
47 for( CHAN = 0; CHAN < 4; CHAN++ )
48
49 #define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
50 ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))
51
52 #define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
53 if( IS_DST0_CHANNEL_ENABLED( INST, CHAN ))
54
55 #define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
56 FOR_EACH_CHANNEL( CHAN )\
57 IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )
58
59 #define CHAN_X 0
60 #define CHAN_Y 1
61 #define CHAN_Z 2
62 #define CHAN_W 3
63
64 #define TEMP_R0 TGSI_EXEC_TEMP_R0
65
66 /**
67 * X86 utility functions.
68 */
69
70 static struct x86_reg
71 make_xmm(
72 unsigned xmm )
73 {
74 return x86_make_reg(
75 file_XMM,
76 (enum x86_reg_name) xmm );
77 }
78
79 /**
80 * X86 register mapping helpers.
81 */
82
83 static struct x86_reg
84 get_const_base( void )
85 {
86 return x86_make_reg(
87 file_REG32,
88 reg_CX );
89 }
90
91 static struct x86_reg
92 get_input_base( void )
93 {
94 return x86_make_reg(
95 file_REG32,
96 reg_AX );
97 }
98
99 static struct x86_reg
100 get_output_base( void )
101 {
102 return x86_make_reg(
103 file_REG32,
104 reg_DX );
105 }
106
107 static struct x86_reg
108 get_temp_base( void )
109 {
110 return x86_make_reg(
111 file_REG32,
112 reg_BX );
113 }
114
115 static struct x86_reg
116 get_coef_base( void )
117 {
118 return get_output_base();
119 }
120
121 static struct x86_reg
122 get_immediate_base( void )
123 {
124 return x86_make_reg(
125 file_REG32,
126 reg_DI );
127 }
128
129
130 /**
131 * Data access helpers.
132 */
133
134
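/* Added commentary on the memory layouts assumed by the helpers below:
 * constants, immediates and interpolation coefficients are stored as
 * plain floats, so one channel is 4 bytes from the next and is fetched
 * with movss + shufps broadcast.  Inputs, outputs and temporaries are
 * stored in SoA form, one 16-byte (4-float) vector per channel holding
 * that channel's value for the whole quad/vertex group, and are moved
 * with movaps/movups.
 */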
135 static struct x86_reg
136 get_immediate(
137 unsigned vec,
138 unsigned chan )
139 {
140 return x86_make_disp(
141 get_immediate_base(),
142 (vec * 4 + chan) * 4 );
143 }
144
145 static struct x86_reg
146 get_const(
147 unsigned vec,
148 unsigned chan )
149 {
150 return x86_make_disp(
151 get_const_base(),
152 (vec * 4 + chan) * 4 );
153 }
154
155 static struct x86_reg
156 get_input(
157 unsigned vec,
158 unsigned chan )
159 {
160 return x86_make_disp(
161 get_input_base(),
162 (vec * 4 + chan) * 16 );
163 }
164
165 static struct x86_reg
166 get_output(
167 unsigned vec,
168 unsigned chan )
169 {
170 return x86_make_disp(
171 get_output_base(),
172 (vec * 4 + chan) * 16 );
173 }
174
175 static struct x86_reg
176 get_temp(
177 unsigned vec,
178 unsigned chan )
179 {
180 return x86_make_disp(
181 get_temp_base(),
182 (vec * 4 + chan) * 16 );
183 }
184
185 static struct x86_reg
186 get_coef(
187 unsigned vec,
188 unsigned chan,
189 unsigned member )
190 {
191 return x86_make_disp(
192 get_coef_base(),
193 ((vec * 3 + member) * 4 + chan) * 4 );
194 }
195
196
197 static void
198 emit_ret(
199 struct x86_function *func )
200 {
201 x86_ret( func );
202 }
203
204
205 /**
206 * Data fetch helpers.
207 */
208
209 /**
210 * Copy a shader constant to xmm register
211 * \param xmm the destination xmm register
212 * \param vec the src const buffer index
213 * \param chan src channel to fetch (X, Y, Z or W)
214 */
215 static void
216 emit_const(
217 struct x86_function *func,
218 unsigned xmm,
219 unsigned vec,
220 unsigned chan )
221 {
222 sse_movss(
223 func,
224 make_xmm( xmm ),
225 get_const( vec, chan ) );
226 sse_shufps(
227 func,
228 make_xmm( xmm ),
229 make_xmm( xmm ),
230 SHUF( 0, 0, 0, 0 ) );
231 }
232
233 static void
234 emit_immediate(
235 struct x86_function *func,
236 unsigned xmm,
237 unsigned vec,
238 unsigned chan )
239 {
240 sse_movss(
241 func,
242 make_xmm( xmm ),
243 get_immediate( vec, chan ) );
244 sse_shufps(
245 func,
246 make_xmm( xmm ),
247 make_xmm( xmm ),
248 SHUF( 0, 0, 0, 0 ) );
249 }
250
251
252 /**
253 * Copy a shader input to xmm register
254 * \param xmm the destination xmm register
255 * \param vec the src input attrib
256 * \param chan src channel to fetch (X, Y, Z or W)
257 */
258 static void
259 emit_inputf(
260 struct x86_function *func,
261 unsigned xmm,
262 unsigned vec,
263 unsigned chan )
264 {
265 sse_movups(
266 func,
267 make_xmm( xmm ),
268 get_input( vec, chan ) );
269 }
270
271 /**
272 * Store an xmm register to a shader output
273 * \param xmm the source xmm register
274 * \param vec the dest output attrib
    275  * \param chan the dest channel to store (X, Y, Z or W)
276 */
277 static void
278 emit_output(
279 struct x86_function *func,
280 unsigned xmm,
281 unsigned vec,
282 unsigned chan )
283 {
284 sse_movups(
285 func,
286 get_output( vec, chan ),
287 make_xmm( xmm ) );
288 }
289
290 /**
291 * Copy a shader temporary to xmm register
292 * \param xmm the destination xmm register
293 * \param vec the src temp register
294 * \param chan src channel to fetch (X, Y, Z or W)
295 */
296 static void
297 emit_tempf(
298 struct x86_function *func,
299 unsigned xmm,
300 unsigned vec,
301 unsigned chan )
302 {
303 sse_movaps(
304 func,
305 make_xmm( xmm ),
306 get_temp( vec, chan ) );
307 }
308
309 /**
310 * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
311 * \param xmm the destination xmm register
312 * \param vec the src input/attribute coefficient index
313 * \param chan src channel to fetch (X, Y, Z or W)
314 * \param member 0=a0, 1=dadx, 2=dady
315 */
316 static void
317 emit_coef(
318 struct x86_function *func,
319 unsigned xmm,
320 unsigned vec,
321 unsigned chan,
322 unsigned member )
323 {
324 sse_movss(
325 func,
326 make_xmm( xmm ),
327 get_coef( vec, chan, member ) );
328 sse_shufps(
329 func,
330 make_xmm( xmm ),
331 make_xmm( xmm ),
332 SHUF( 0, 0, 0, 0 ) );
333 }
334
335 /**
336 * Data store helpers.
337 */
338
339 static void
340 emit_inputs(
341 struct x86_function *func,
342 unsigned xmm,
343 unsigned vec,
344 unsigned chan )
345 {
346 sse_movups(
347 func,
348 get_input( vec, chan ),
349 make_xmm( xmm ) );
350 }
351
352 static void
353 emit_temps(
354 struct x86_function *func,
355 unsigned xmm,
356 unsigned vec,
357 unsigned chan )
358 {
359 sse_movaps(
360 func,
361 get_temp( vec, chan ),
362 make_xmm( xmm ) );
363 }
364
365 static void
366 emit_addrs(
367 struct x86_function *func,
368 unsigned xmm,
369 unsigned vec,
370 unsigned chan )
371 {
372 emit_temps(
373 func,
374 xmm,
375 vec + TGSI_EXEC_NUM_TEMPS,
376 chan );
377 }
378
379 /**
    380  * Coefficient fetch helpers.
381 */
382
383 static void
384 emit_coef_a0(
385 struct x86_function *func,
386 unsigned xmm,
387 unsigned vec,
388 unsigned chan )
389 {
390 emit_coef(
391 func,
392 xmm,
393 vec,
394 chan,
395 0 );
396 }
397
398 static void
399 emit_coef_dadx(
400 struct x86_function *func,
401 unsigned xmm,
402 unsigned vec,
403 unsigned chan )
404 {
405 emit_coef(
406 func,
407 xmm,
408 vec,
409 chan,
410 1 );
411 }
412
413 static void
414 emit_coef_dady(
415 struct x86_function *func,
416 unsigned xmm,
417 unsigned vec,
418 unsigned chan )
419 {
420 emit_coef(
421 func,
422 xmm,
423 vec,
424 chan,
425 2 );
426 }
427
428 /**
429 * Function call helpers.
430 */
431
432 static void
433 emit_push_gp(
434 struct x86_function *func )
435 {
436 x86_push(
437 func,
438 x86_make_reg( file_REG32, reg_AX) );
439 x86_push(
440 func,
441 x86_make_reg( file_REG32, reg_CX) );
442 x86_push(
443 func,
444 x86_make_reg( file_REG32, reg_DX) );
445 }
446
447 static void
    448 emit_pop_gp(
449 struct x86_function *func )
450 {
451 /* Restore GP registers in a reverse order.
452 */
453 x86_pop(
454 func,
455 x86_make_reg( file_REG32, reg_DX) );
456 x86_pop(
457 func,
458 x86_make_reg( file_REG32, reg_CX) );
459 x86_pop(
460 func,
461 x86_make_reg( file_REG32, reg_AX) );
462 }
463
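/* Added commentary: emit a call out to a C helper such as cos4f() or
 * pow4f() below.  The destination xmm register is spilled to the
 * TEMP_R0 scratch temporary, eax/ecx/edx (in use as base pointers) are
 * saved, and the helper is called cdecl-style with one argument -- the
 * address of that 4-float scratch area -- which it rewrites in place.
 * The result is then reloaded into the xmm register and the GP
 * registers are restored.  emit_func_call_dst_src() additionally spills
 * the source operand into the second scratch slot (see pow4f()).
 */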
464 static void
465 emit_func_call_dst(
466 struct x86_function *func,
467 unsigned xmm_dst,
468 void (PIPE_CDECL *code)() )
469 {
470 sse_movaps(
471 func,
472 get_temp( TEMP_R0, 0 ),
473 make_xmm( xmm_dst ) );
474
475 emit_push_gp(
476 func );
477
478 {
479 struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
480
481 x86_lea(
482 func,
483 ecx,
484 get_temp( TEMP_R0, 0 ) );
485
486 x86_push( func, ecx );
487 x86_mov_reg_imm( func, ecx, (unsigned long) code );
488 x86_call( func, ecx );
489 x86_pop(func, ecx );
490 }
491
492
    493    emit_pop_gp(
494 func );
495
496 sse_movaps(
497 func,
498 make_xmm( xmm_dst ),
499 get_temp( TEMP_R0, 0 ) );
500 }
501
502 static void
503 emit_func_call_dst_src(
504 struct x86_function *func,
505 unsigned xmm_dst,
506 unsigned xmm_src,
507 void (PIPE_CDECL *code)() )
508 {
509 sse_movaps(
510 func,
511 get_temp( TEMP_R0, 1 ),
512 make_xmm( xmm_src ) );
513
514 emit_func_call_dst(
515 func,
516 xmm_dst,
517 code );
518 }
519
520 /**
521 * Low-level instruction translators.
522 */
523
524 static void
525 emit_abs(
526 struct x86_function *func,
527 unsigned xmm )
528 {
529 sse_andps(
530 func,
531 make_xmm( xmm ),
532 get_temp(
533 TGSI_EXEC_TEMP_7FFFFFFF_I,
534 TGSI_EXEC_TEMP_7FFFFFFF_C ) );
535 }
536
537 static void
538 emit_add(
539 struct x86_function *func,
540 unsigned xmm_dst,
541 unsigned xmm_src )
542 {
543 sse_addps(
544 func,
545 make_xmm( xmm_dst ),
546 make_xmm( xmm_src ) );
547 }
548
549 static void PIPE_CDECL
550 cos4f(
551 float *store )
552 {
553 const unsigned X = 0;
554
555 store[X + 0] = cosf( store[X + 0] );
556 store[X + 1] = cosf( store[X + 1] );
557 store[X + 2] = cosf( store[X + 2] );
558 store[X + 3] = cosf( store[X + 3] );
559 }
560
561 static void
562 emit_cos(
563 struct x86_function *func,
564 unsigned xmm_dst )
565 {
566 emit_func_call_dst(
567 func,
568 xmm_dst,
569 cos4f );
570 }
571
572 static void PIPE_CDECL
573 ex24f(
574 float *store )
575 {
576 const unsigned X = 0;
577
578 store[X + 0] = powf( 2.0f, store[X + 0] );
579 store[X + 1] = powf( 2.0f, store[X + 1] );
580 store[X + 2] = powf( 2.0f, store[X + 2] );
581 store[X + 3] = powf( 2.0f, store[X + 3] );
582 }
583
584 static void
585 emit_ex2(
586 struct x86_function *func,
587 unsigned xmm_dst )
588 {
589 emit_func_call_dst(
590 func,
591 xmm_dst,
592 ex24f );
593 }
594
595 static void
596 emit_f2it(
597 struct x86_function *func,
598 unsigned xmm )
599 {
600 sse2_cvttps2dq(
601 func,
602 make_xmm( xmm ),
603 make_xmm( xmm ) );
604 }
605
606 static void PIPE_CDECL
607 flr4f(
608 float *store )
609 {
610 const unsigned X = 0;
611
612 store[X + 0] = floorf( store[X + 0] );
613 store[X + 1] = floorf( store[X + 1] );
614 store[X + 2] = floorf( store[X + 2] );
615 store[X + 3] = floorf( store[X + 3] );
616 }
617
618 static void
619 emit_flr(
620 struct x86_function *func,
621 unsigned xmm_dst )
622 {
623 emit_func_call_dst(
624 func,
625 xmm_dst,
626 flr4f );
627 }
628
629 static void PIPE_CDECL
630 frc4f(
631 float *store )
632 {
633 const unsigned X = 0;
634
635 store[X + 0] -= floorf( store[X + 0] );
636 store[X + 1] -= floorf( store[X + 1] );
637 store[X + 2] -= floorf( store[X + 2] );
638 store[X + 3] -= floorf( store[X + 3] );
639 }
640
641 static void
642 emit_frc(
643 struct x86_function *func,
644 unsigned xmm_dst )
645 {
646 emit_func_call_dst(
647 func,
648 xmm_dst,
649 frc4f );
650 }
651
652 static void PIPE_CDECL
653 lg24f(
654 float *store )
655 {
656 const unsigned X = 0;
657
658 store[X + 0] = LOG2( store[X + 0] );
659 store[X + 1] = LOG2( store[X + 1] );
660 store[X + 2] = LOG2( store[X + 2] );
661 store[X + 3] = LOG2( store[X + 3] );
662 }
663
664 static void
665 emit_lg2(
666 struct x86_function *func,
667 unsigned xmm_dst )
668 {
669 emit_func_call_dst(
670 func,
671 xmm_dst,
672 lg24f );
673 }
674
675 static void
676 emit_MOV(
677 struct x86_function *func,
678 unsigned xmm_dst,
679 unsigned xmm_src )
680 {
681 sse_movups(
682 func,
683 make_xmm( xmm_dst ),
684 make_xmm( xmm_src ) );
685 }
686
687 static void
688 emit_mul (struct x86_function *func,
689 unsigned xmm_dst,
690 unsigned xmm_src)
691 {
692 sse_mulps(
693 func,
694 make_xmm( xmm_dst ),
695 make_xmm( xmm_src ) );
696 }
697
698 static void
699 emit_neg(
700 struct x86_function *func,
701 unsigned xmm )
702 {
703 sse_xorps(
704 func,
705 make_xmm( xmm ),
706 get_temp(
707 TGSI_EXEC_TEMP_80000000_I,
708 TGSI_EXEC_TEMP_80000000_C ) );
709 }
710
711 static void PIPE_CDECL
712 pow4f(
713 float *store )
714 {
715 const unsigned X = 0;
716
717 store[X + 0] = powf( store[X + 0], store[X + 4] );
718 store[X + 1] = powf( store[X + 1], store[X + 5] );
719 store[X + 2] = powf( store[X + 2], store[X + 6] );
720 store[X + 3] = powf( store[X + 3], store[X + 7] );
721 }
722
723 static void
724 emit_pow(
725 struct x86_function *func,
726 unsigned xmm_dst,
727 unsigned xmm_src )
728 {
729 emit_func_call_dst_src(
730 func,
731 xmm_dst,
732 xmm_src,
733 pow4f );
734 }
735
736 static void
737 emit_rcp (
738 struct x86_function *func,
739 unsigned xmm_dst,
740 unsigned xmm_src )
741 {
742 /* On Intel CPUs at least, this is only accurate to 12 bits -- not
743 * good enough. Need to either emit a proper divide or use the
744 * iterative technique described below in emit_rsqrt().
745 */
746 sse2_rcpps(
747 func,
748 make_xmm( xmm_dst ),
749 make_xmm( xmm_src ) );
750 }
751
752 static void
753 emit_rsqrt(
754 struct x86_function *func,
755 unsigned xmm_dst,
756 unsigned xmm_src )
757 {
758 #if HIGH_PRECISION
759 /* Although rsqrtps() and rcpps() are low precision on some/all SSE
    760    * implementations, it is possible to improve their precision at
    761    * fairly low cost, using a Newton-Raphson step, as below:
    762    *
    763    * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)                    (for RCP)
    764    * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a)) * rsqrtps(a)]  (for RSQ)
765 *
766 * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
767 */
768 {
769 struct x86_reg dst = make_xmm( xmm_dst );
770 struct x86_reg src = make_xmm( xmm_src );
771 struct x86_reg tmp0 = make_xmm( 2 );
772 struct x86_reg tmp1 = make_xmm( 3 );
773
774 assert( xmm_dst != xmm_src );
775 assert( xmm_dst != 2 && xmm_dst != 3 );
776 assert( xmm_src != 2 && xmm_src != 3 );
777
778 sse_movaps( func, dst, get_temp( TGSI_EXEC_TEMP_HALF_I, TGSI_EXEC_TEMP_HALF_C ) );
779 sse_movaps( func, tmp0, get_temp( TGSI_EXEC_TEMP_THREE_I, TGSI_EXEC_TEMP_THREE_C ) );
780 sse_rsqrtps( func, tmp1, src );
781 sse_mulps( func, src, tmp1 );
782 sse_mulps( func, dst, tmp1 );
783 sse_mulps( func, src, tmp1 );
784 sse_subps( func, tmp0, src );
785 sse_mulps( func, dst, tmp0 );
786 }
787 #else
788 /* On Intel CPUs at least, this is only accurate to 12 bits -- not
789 * good enough.
790 */
791 sse_rsqrtps(
792 func,
793 make_xmm( xmm_dst ),
794 make_xmm( xmm_src ) );
795 #endif
796 }
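/* For reference, a scalar sketch (illustration only, not part of the
 * generated code) of what the HIGH_PRECISION sequence above computes per
 * channel, where approx_rsqrt() stands for the ~12-bit rsqrtps estimate:
 *
 *    static float rsqrt_nr( float a )
 *    {
 *       float y = approx_rsqrt( a );              // tmp1 = rsqrtps(src)
 *       return (0.5f * y) * (3.0f - (a * y) * y); // dst = half * y * (three - a*y*y)
 *    }
 *
 * The emitted code uses xmm_src as scratch (it ends up holding a*y*y),
 * hence the assertions that xmm_dst and xmm_src are distinct registers.
 */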
797
798 static void
799 emit_setsign(
800 struct x86_function *func,
801 unsigned xmm )
802 {
803 sse_orps(
804 func,
805 make_xmm( xmm ),
806 get_temp(
807 TGSI_EXEC_TEMP_80000000_I,
808 TGSI_EXEC_TEMP_80000000_C ) );
809 }
810
811 static void PIPE_CDECL
812 sin4f(
813 float *store )
814 {
815 const unsigned X = 0;
816
817 store[X + 0] = sinf( store[X + 0] );
818 store[X + 1] = sinf( store[X + 1] );
819 store[X + 2] = sinf( store[X + 2] );
820 store[X + 3] = sinf( store[X + 3] );
821 }
822
823 static void
824 emit_sin (struct x86_function *func,
825 unsigned xmm_dst)
826 {
827 emit_func_call_dst(
828 func,
829 xmm_dst,
830 sin4f );
831 }
832
833 static void
834 emit_sub(
835 struct x86_function *func,
836 unsigned xmm_dst,
837 unsigned xmm_src )
838 {
839 sse_subps(
840 func,
841 make_xmm( xmm_dst ),
842 make_xmm( xmm_src ) );
843 }
844
845 /**
846 * Register fetch.
847 */
848
849 static void
850 emit_fetch(
851 struct x86_function *func,
852 unsigned xmm,
853 const struct tgsi_full_src_register *reg,
854 const unsigned chan_index )
855 {
856 unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index );
857
858 switch( swizzle ) {
859 case TGSI_EXTSWIZZLE_X:
860 case TGSI_EXTSWIZZLE_Y:
861 case TGSI_EXTSWIZZLE_Z:
862 case TGSI_EXTSWIZZLE_W:
863 switch( reg->SrcRegister.File ) {
864 case TGSI_FILE_CONSTANT:
865 emit_const(
866 func,
867 xmm,
868 reg->SrcRegister.Index,
869 swizzle );
870 break;
871
872 case TGSI_FILE_IMMEDIATE:
873 emit_immediate(
874 func,
875 xmm,
876 reg->SrcRegister.Index,
877 swizzle );
878 break;
879
880 case TGSI_FILE_INPUT:
881 emit_inputf(
882 func,
883 xmm,
884 reg->SrcRegister.Index,
885 swizzle );
886 break;
887
888 case TGSI_FILE_TEMPORARY:
889 emit_tempf(
890 func,
891 xmm,
892 reg->SrcRegister.Index,
893 swizzle );
894 break;
895
896 default:
897 assert( 0 );
898 }
899 break;
900
901 case TGSI_EXTSWIZZLE_ZERO:
902 emit_tempf(
903 func,
904 xmm,
905 TGSI_EXEC_TEMP_00000000_I,
906 TGSI_EXEC_TEMP_00000000_C );
907 break;
908
909 case TGSI_EXTSWIZZLE_ONE:
910 emit_tempf(
911 func,
912 xmm,
913 TGSI_EXEC_TEMP_ONE_I,
914 TGSI_EXEC_TEMP_ONE_C );
915 break;
916
917 default:
918 assert( 0 );
919 }
920
921 switch( tgsi_util_get_full_src_register_sign_mode( reg, chan_index ) ) {
922 case TGSI_UTIL_SIGN_CLEAR:
923 emit_abs( func, xmm );
924 break;
925
926 case TGSI_UTIL_SIGN_SET:
927 emit_setsign( func, xmm );
928 break;
929
930 case TGSI_UTIL_SIGN_TOGGLE:
931 emit_neg( func, xmm );
932 break;
933
934 case TGSI_UTIL_SIGN_KEEP:
935 break;
936 }
937 }
938
939 #define FETCH( FUNC, INST, XMM, INDEX, CHAN )\
940 emit_fetch( FUNC, XMM, &(INST).FullSrcRegisters[INDEX], CHAN )
941
942 /**
943 * Register store.
944 */
945
946 static void
947 emit_store(
948 struct x86_function *func,
949 unsigned xmm,
950 const struct tgsi_full_dst_register *reg,
951 const struct tgsi_full_instruction *inst,
952 unsigned chan_index )
953 {
954 switch( reg->DstRegister.File ) {
955 case TGSI_FILE_OUTPUT:
956 emit_output(
957 func,
958 xmm,
959 reg->DstRegister.Index,
960 chan_index );
961 break;
962
963 case TGSI_FILE_TEMPORARY:
964 emit_temps(
965 func,
966 xmm,
967 reg->DstRegister.Index,
968 chan_index );
969 break;
970
971 case TGSI_FILE_ADDRESS:
972 emit_addrs(
973 func,
974 xmm,
975 reg->DstRegister.Index,
976 chan_index );
977 break;
978
979 default:
980 assert( 0 );
981 }
982
983 switch( inst->Instruction.Saturate ) {
984 case TGSI_SAT_NONE:
985 break;
986
987 case TGSI_SAT_ZERO_ONE:
    988       /* XXX: saturate not implemented yet -- the result is stored unclamped */
989 break;
990
991 case TGSI_SAT_MINUS_PLUS_ONE:
992 assert( 0 );
993 break;
994 }
995 }
996
997 #define STORE( FUNC, INST, XMM, INDEX, CHAN )\
998 emit_store( FUNC, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN )
999
1000 /**
1001 * High-level instruction translators.
1002 */
1003
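/* Added commentary: emit code for TGSI_OPCODE_KIL.  Each distinct source
 * component (components swizzled to 0.0 or 1.0 are skipped, as they can
 * never be negative) is compared against zero with cmpps(LessThan), the
 * per-component masks are transferred to GP registers with pmovmskb and
 * OR'ed together, and the combined mask is OR'ed into the
 * TGSI_EXEC_TEMP_KILMASK scratch slot to mark the killed pixels.
 */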
1004 static void
1005 emit_kil(
1006 struct x86_function *func,
1007 const struct tgsi_full_src_register *reg )
1008 {
1009 unsigned uniquemask;
1010 unsigned registers[4];
1011 unsigned nextregister = 0;
1012 unsigned firstchan = ~0;
1013 unsigned chan_index;
1014
1015 /* This mask stores component bits that were already tested. Note that
   1016     * we test whether the value is less than zero, so 1.0 and 0.0 need not
   1017     * be tested. */
1018 uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE);
1019
1020 FOR_EACH_CHANNEL( chan_index ) {
1021 unsigned swizzle;
1022
1023 /* unswizzle channel */
1024 swizzle = tgsi_util_get_full_src_register_extswizzle(
1025 reg,
1026 chan_index );
1027
1028 /* check if the component has not been already tested */
1029 if( !(uniquemask & (1 << swizzle)) ) {
1030 uniquemask |= 1 << swizzle;
1031
1032 /* allocate register */
   1033          registers[swizzle] = nextregister;
1034 emit_fetch(
1035 func,
1036 nextregister,
1037 reg,
1038 chan_index );
1039 nextregister++;
1040
   1041          /* remember the lowest swizzle value fetched (handled first below) */
   1042          if( firstchan == ~0 || swizzle < firstchan ) {
   1043             firstchan = swizzle;
   1044          }
1045 }
1046 }
1047
1048 x86_push(
1049 func,
1050 x86_make_reg( file_REG32, reg_AX ) );
1051 x86_push(
1052 func,
1053 x86_make_reg( file_REG32, reg_DX ) );
1054
1055 FOR_EACH_CHANNEL( chan_index ) {
1056 if( uniquemask & (1 << chan_index) ) {
1057 sse_cmpps(
1058 func,
1059 make_xmm( registers[chan_index] ),
1060 get_temp(
1061 TGSI_EXEC_TEMP_00000000_I,
1062 TGSI_EXEC_TEMP_00000000_C ),
1063 cc_LessThan );
1064
1065 if( chan_index == firstchan ) {
1066 sse_pmovmskb(
1067 func,
1068 x86_make_reg( file_REG32, reg_AX ),
1069 make_xmm( registers[chan_index] ) );
1070 }
1071 else {
1072 sse_pmovmskb(
1073 func,
1074 x86_make_reg( file_REG32, reg_DX ),
1075 make_xmm( registers[chan_index] ) );
1076 x86_or(
1077 func,
1078 x86_make_reg( file_REG32, reg_AX ),
1079 x86_make_reg( file_REG32, reg_DX ) );
1080 }
1081 }
1082 }
1083
1084 x86_or(
1085 func,
1086 get_temp(
1087 TGSI_EXEC_TEMP_KILMASK_I,
1088 TGSI_EXEC_TEMP_KILMASK_C ),
1089 x86_make_reg( file_REG32, reg_AX ) );
1090
1091 x86_pop(
1092 func,
1093 x86_make_reg( file_REG32, reg_DX ) );
1094 x86_pop(
1095 func,
1096 x86_make_reg( file_REG32, reg_AX ) );
1097 }
1098
1099
1100 static void
1101 emit_kilp(
1102 struct x86_function *func )
1103 {
1104 /* XXX todo / fix me */
1105 }
1106
1107
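/* Added commentary: emit code for the SLT/SGE-style "set on condition"
 * opcodes.  Per enabled channel this effectively computes
 *
 *    dst = (src0 <cc> src1) ? 1.0f : 0.0f;
 *
 * by AND'ing the all-ones/all-zeroes cmpps mask with the 1.0 vector kept
 * in the TGSI_EXEC_TEMP_ONE scratch temporary.
 */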
1108 static void
1109 emit_setcc(
1110 struct x86_function *func,
1111 struct tgsi_full_instruction *inst,
1112 enum sse_cc cc )
1113 {
1114 unsigned chan_index;
1115
1116 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1117 FETCH( func, *inst, 0, 0, chan_index );
1118 FETCH( func, *inst, 1, 1, chan_index );
1119 sse_cmpps(
1120 func,
1121 make_xmm( 0 ),
1122 make_xmm( 1 ),
1123 cc );
1124 sse_andps(
1125 func,
1126 make_xmm( 0 ),
1127 get_temp(
1128 TGSI_EXEC_TEMP_ONE_I,
1129 TGSI_EXEC_TEMP_ONE_C ) );
1130 STORE( func, *inst, 0, 0, chan_index );
1131 }
1132 }
1133
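/* Added commentary: emit code for TGSI_OPCODE_CMP.  Per enabled channel
 * this is a branchless select,
 *
 *    dst = (src0 < 0.0f) ? src1 : src2;
 *
 * built from the cmpps less-than mask with andps/andnps/orps.
 */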
1134 static void
1135 emit_cmp(
1136 struct x86_function *func,
1137 struct tgsi_full_instruction *inst )
1138 {
1139 unsigned chan_index;
1140
1141 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1142 FETCH( func, *inst, 0, 0, chan_index );
1143 FETCH( func, *inst, 1, 1, chan_index );
1144 FETCH( func, *inst, 2, 2, chan_index );
1145 sse_cmpps(
1146 func,
1147 make_xmm( 0 ),
1148 get_temp(
1149 TGSI_EXEC_TEMP_00000000_I,
1150 TGSI_EXEC_TEMP_00000000_C ),
1151 cc_LessThan );
1152 sse_andps(
1153 func,
1154 make_xmm( 1 ),
1155 make_xmm( 0 ) );
1156 sse_andnps(
1157 func,
1158 make_xmm( 0 ),
1159 make_xmm( 2 ) );
1160 sse_orps(
1161 func,
1162 make_xmm( 0 ),
1163 make_xmm( 1 ) );
1164 STORE( func, *inst, 0, 0, chan_index );
1165 }
1166 }
1167
1168 static int
1169 emit_instruction(
1170 struct x86_function *func,
1171 struct tgsi_full_instruction *inst )
1172 {
1173 unsigned chan_index;
1174
1175 switch( inst->Instruction.Opcode ) {
1176 case TGSI_OPCODE_ARL:
1177 #if 0
1178 /* XXX this isn't working properly (see glean vertProg1 test) */
1179 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1180 FETCH( func, *inst, 0, 0, chan_index );
1181 emit_f2it( func, 0 );
1182 STORE( func, *inst, 0, 0, chan_index );
1183 }
1184 #else
1185 return 0;
1186 #endif
1187 break;
1188
1189 case TGSI_OPCODE_MOV:
1190 case TGSI_OPCODE_SWZ:
1191 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1192 FETCH( func, *inst, 0, 0, chan_index );
1193 STORE( func, *inst, 0, 0, chan_index );
1194 }
1195 break;
1196
1197 case TGSI_OPCODE_LIT:
1198 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1199 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1200 emit_tempf(
1201 func,
1202 0,
1203 TGSI_EXEC_TEMP_ONE_I,
1204 TGSI_EXEC_TEMP_ONE_C);
1205 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
1206 STORE( func, *inst, 0, 0, CHAN_X );
1207 }
1208 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1209 STORE( func, *inst, 0, 0, CHAN_W );
1210 }
1211 }
1212 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1213 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1214 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
1215 FETCH( func, *inst, 0, 0, CHAN_X );
1216 sse_maxps(
1217 func,
1218 make_xmm( 0 ),
1219 get_temp(
1220 TGSI_EXEC_TEMP_00000000_I,
1221 TGSI_EXEC_TEMP_00000000_C ) );
1222 STORE( func, *inst, 0, 0, CHAN_Y );
1223 }
1224 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1225 /* XMM[1] = SrcReg[0].yyyy */
1226 FETCH( func, *inst, 1, 0, CHAN_Y );
1227 /* XMM[1] = max(XMM[1], 0) */
1228 sse_maxps(
1229 func,
1230 make_xmm( 1 ),
1231 get_temp(
1232 TGSI_EXEC_TEMP_00000000_I,
1233 TGSI_EXEC_TEMP_00000000_C ) );
1234 /* XMM[2] = SrcReg[0].wwww */
1235 FETCH( func, *inst, 2, 0, CHAN_W );
1236 /* XMM[2] = min(XMM[2], 128.0) */
1237 sse_minps(
1238 func,
1239 make_xmm( 2 ),
1240 get_temp(
1241 TGSI_EXEC_TEMP_128_I,
1242 TGSI_EXEC_TEMP_128_C ) );
1243 /* XMM[2] = max(XMM[2], -128.0) */
1244 sse_maxps(
1245 func,
1246 make_xmm( 2 ),
1247 get_temp(
1248 TGSI_EXEC_TEMP_MINUS_128_I,
1249 TGSI_EXEC_TEMP_MINUS_128_C ) );
1250 emit_pow( func, 1, 2 );
1251 FETCH( func, *inst, 0, 0, CHAN_X );
1252 sse_xorps(
1253 func,
1254 make_xmm( 2 ),
1255 make_xmm( 2 ) );
1256 sse_cmpps(
1257 func,
1258 make_xmm( 2 ),
1259 make_xmm( 0 ),
1260 cc_LessThanEqual );
1261 sse_andps(
1262 func,
1263 make_xmm( 2 ),
1264 make_xmm( 1 ) );
1265 STORE( func, *inst, 2, 0, CHAN_Z );
1266 }
1267 }
1268 break;
1269
1270 case TGSI_OPCODE_RCP:
1271 /* TGSI_OPCODE_RECIP */
1272 FETCH( func, *inst, 0, 0, CHAN_X );
1273 emit_rcp( func, 0, 0 );
1274 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1275 STORE( func, *inst, 0, 0, chan_index );
1276 }
1277 break;
1278
1279 case TGSI_OPCODE_RSQ:
1280 /* TGSI_OPCODE_RECIPSQRT */
1281 FETCH( func, *inst, 0, 0, CHAN_X );
1282 emit_rsqrt( func, 1, 0 );
1283 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1284 STORE( func, *inst, 1, 0, chan_index );
1285 }
1286 break;
1287
1288 case TGSI_OPCODE_EXP:
1289 return 0;
1290 break;
1291
1292 case TGSI_OPCODE_LOG:
1293 return 0;
1294 break;
1295
1296 case TGSI_OPCODE_MUL:
1297 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1298 FETCH( func, *inst, 0, 0, chan_index );
1299 FETCH( func, *inst, 1, 1, chan_index );
1300 emit_mul( func, 0, 1 );
1301 STORE( func, *inst, 0, 0, chan_index );
1302 }
1303 break;
1304
1305 case TGSI_OPCODE_ADD:
1306 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1307 FETCH( func, *inst, 0, 0, chan_index );
1308 FETCH( func, *inst, 1, 1, chan_index );
1309 emit_add( func, 0, 1 );
1310 STORE( func, *inst, 0, 0, chan_index );
1311 }
1312 break;
1313
1314 case TGSI_OPCODE_DP3:
1315 /* TGSI_OPCODE_DOT3 */
1316 FETCH( func, *inst, 0, 0, CHAN_X );
1317 FETCH( func, *inst, 1, 1, CHAN_X );
1318 emit_mul( func, 0, 1 );
1319 FETCH( func, *inst, 1, 0, CHAN_Y );
1320 FETCH( func, *inst, 2, 1, CHAN_Y );
1321 emit_mul( func, 1, 2 );
1322 emit_add( func, 0, 1 );
1323 FETCH( func, *inst, 1, 0, CHAN_Z );
1324 FETCH( func, *inst, 2, 1, CHAN_Z );
1325 emit_mul( func, 1, 2 );
1326 emit_add( func, 0, 1 );
1327 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1328 STORE( func, *inst, 0, 0, chan_index );
1329 }
1330 break;
1331
1332 case TGSI_OPCODE_DP4:
1333 /* TGSI_OPCODE_DOT4 */
1334 FETCH( func, *inst, 0, 0, CHAN_X );
1335 FETCH( func, *inst, 1, 1, CHAN_X );
1336 emit_mul( func, 0, 1 );
1337 FETCH( func, *inst, 1, 0, CHAN_Y );
1338 FETCH( func, *inst, 2, 1, CHAN_Y );
1339 emit_mul( func, 1, 2 );
1340 emit_add( func, 0, 1 );
1341 FETCH( func, *inst, 1, 0, CHAN_Z );
1342 FETCH( func, *inst, 2, 1, CHAN_Z );
   1343       emit_mul( func, 1, 2 );
   1344       emit_add( func, 0, 1 );
1345 FETCH( func, *inst, 1, 0, CHAN_W );
1346 FETCH( func, *inst, 2, 1, CHAN_W );
1347 emit_mul( func, 1, 2 );
1348 emit_add( func, 0, 1 );
1349 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1350 STORE( func, *inst, 0, 0, chan_index );
1351 }
1352 break;
1353
1354 case TGSI_OPCODE_DST:
1355 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
1356 emit_tempf(
1357 func,
1358 0,
1359 TGSI_EXEC_TEMP_ONE_I,
1360 TGSI_EXEC_TEMP_ONE_C );
1361 STORE( func, *inst, 0, 0, CHAN_X );
1362 }
1363 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
1364 FETCH( func, *inst, 0, 0, CHAN_Y );
1365 FETCH( func, *inst, 1, 1, CHAN_Y );
1366 emit_mul( func, 0, 1 );
1367 STORE( func, *inst, 0, 0, CHAN_Y );
1368 }
1369 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
1370 FETCH( func, *inst, 0, 0, CHAN_Z );
1371 STORE( func, *inst, 0, 0, CHAN_Z );
1372 }
1373 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
1374 FETCH( func, *inst, 0, 1, CHAN_W );
1375 STORE( func, *inst, 0, 0, CHAN_W );
1376 }
1377 break;
1378
1379 case TGSI_OPCODE_MIN:
1380 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1381 FETCH( func, *inst, 0, 0, chan_index );
1382 FETCH( func, *inst, 1, 1, chan_index );
1383 sse_minps(
1384 func,
1385 make_xmm( 0 ),
1386 make_xmm( 1 ) );
1387 STORE( func, *inst, 0, 0, chan_index );
1388 }
1389 break;
1390
1391 case TGSI_OPCODE_MAX:
1392 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1393 FETCH( func, *inst, 0, 0, chan_index );
1394 FETCH( func, *inst, 1, 1, chan_index );
1395 sse_maxps(
1396 func,
1397 make_xmm( 0 ),
1398 make_xmm( 1 ) );
1399 STORE( func, *inst, 0, 0, chan_index );
1400 }
1401 break;
1402
1403 case TGSI_OPCODE_SLT:
1404 /* TGSI_OPCODE_SETLT */
1405 emit_setcc( func, inst, cc_LessThan );
1406 break;
1407
1408 case TGSI_OPCODE_SGE:
1409 /* TGSI_OPCODE_SETGE */
1410 emit_setcc( func, inst, cc_NotLessThan );
1411 break;
1412
1413 case TGSI_OPCODE_MAD:
1414 /* TGSI_OPCODE_MADD */
1415 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1416 FETCH( func, *inst, 0, 0, chan_index );
1417 FETCH( func, *inst, 1, 1, chan_index );
1418 FETCH( func, *inst, 2, 2, chan_index );
1419 emit_mul( func, 0, 1 );
1420 emit_add( func, 0, 2 );
1421 STORE( func, *inst, 0, 0, chan_index );
1422 }
1423 break;
1424
1425 case TGSI_OPCODE_SUB:
1426 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1427 FETCH( func, *inst, 0, 0, chan_index );
1428 FETCH( func, *inst, 1, 1, chan_index );
1429 emit_sub( func, 0, 1 );
1430 STORE( func, *inst, 0, 0, chan_index );
1431 }
1432 break;
1433
1434 case TGSI_OPCODE_LERP:
1435 /* TGSI_OPCODE_LRP */
1436 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1437 FETCH( func, *inst, 0, 0, chan_index );
1438 FETCH( func, *inst, 1, 1, chan_index );
1439 FETCH( func, *inst, 2, 2, chan_index );
1440 emit_sub( func, 1, 2 );
1441 emit_mul( func, 0, 1 );
1442 emit_add( func, 0, 2 );
1443 STORE( func, *inst, 0, 0, chan_index );
1444 }
1445 break;
1446
1447 case TGSI_OPCODE_CND:
1448 return 0;
1449 break;
1450
1451 case TGSI_OPCODE_CND0:
1452 return 0;
1453 break;
1454
1455 case TGSI_OPCODE_DOT2ADD:
1456 /* TGSI_OPCODE_DP2A */
1457 return 0;
1458 break;
1459
1460 case TGSI_OPCODE_INDEX:
1461 return 0;
1462 break;
1463
1464 case TGSI_OPCODE_NEGATE:
1465 return 0;
1466 break;
1467
1468 case TGSI_OPCODE_FRAC:
1469 /* TGSI_OPCODE_FRC */
1470 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1471 FETCH( func, *inst, 0, 0, chan_index );
1472 emit_frc( func, 0 );
1473 STORE( func, *inst, 0, 0, chan_index );
1474 }
1475 break;
1476
1477 case TGSI_OPCODE_CLAMP:
1478 return 0;
1479 break;
1480
1481 case TGSI_OPCODE_FLOOR:
1482 /* TGSI_OPCODE_FLR */
1483 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1484 FETCH( func, *inst, 0, 0, chan_index );
1485 emit_flr( func, 0 );
1486 STORE( func, *inst, 0, 0, chan_index );
1487 }
1488 break;
1489
1490 case TGSI_OPCODE_ROUND:
1491 return 0;
1492 break;
1493
1494 case TGSI_OPCODE_EXPBASE2:
1495 /* TGSI_OPCODE_EX2 */
1496 FETCH( func, *inst, 0, 0, CHAN_X );
1497 emit_ex2( func, 0 );
1498 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1499 STORE( func, *inst, 0, 0, chan_index );
1500 }
1501 break;
1502
1503 case TGSI_OPCODE_LOGBASE2:
1504 /* TGSI_OPCODE_LG2 */
1505 FETCH( func, *inst, 0, 0, CHAN_X );
1506 emit_lg2( func, 0 );
1507 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1508 STORE( func, *inst, 0, 0, chan_index );
1509 }
1510 break;
1511
1512 case TGSI_OPCODE_POWER:
1513 /* TGSI_OPCODE_POW */
1514 FETCH( func, *inst, 0, 0, CHAN_X );
1515 FETCH( func, *inst, 1, 1, CHAN_X );
1516 emit_pow( func, 0, 1 );
1517 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1518 STORE( func, *inst, 0, 0, chan_index );
1519 }
1520 break;
1521
1522 case TGSI_OPCODE_CROSSPRODUCT:
1523 /* TGSI_OPCODE_XPD */
1524 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1525 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
1526 FETCH( func, *inst, 1, 1, CHAN_Z );
1527 FETCH( func, *inst, 3, 0, CHAN_Z );
1528 }
1529 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1530 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1531 FETCH( func, *inst, 0, 0, CHAN_Y );
1532 FETCH( func, *inst, 4, 1, CHAN_Y );
1533 }
1534 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
1535 emit_MOV( func, 2, 0 );
1536 emit_mul( func, 2, 1 );
1537 emit_MOV( func, 5, 3 );
1538 emit_mul( func, 5, 4 );
1539 emit_sub( func, 2, 5 );
1540 STORE( func, *inst, 2, 0, CHAN_X );
1541 }
1542 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1543 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1544 FETCH( func, *inst, 2, 1, CHAN_X );
1545 FETCH( func, *inst, 5, 0, CHAN_X );
1546 }
1547 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
1548 emit_mul( func, 3, 2 );
1549 emit_mul( func, 1, 5 );
1550 emit_sub( func, 3, 1 );
1551 STORE( func, *inst, 3, 0, CHAN_Y );
1552 }
1553 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
1554 emit_mul( func, 5, 4 );
1555 emit_mul( func, 0, 2 );
1556 emit_sub( func, 5, 0 );
1557 STORE( func, *inst, 5, 0, CHAN_Z );
1558 }
1559 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
1560 emit_tempf(
1561 func,
1562 0,
1563 TGSI_EXEC_TEMP_ONE_I,
1564 TGSI_EXEC_TEMP_ONE_C );
1565 STORE( func, *inst, 0, 0, CHAN_W );
1566 }
1567 break;
1568
1569 case TGSI_OPCODE_MULTIPLYMATRIX:
1570 return 0;
1571 break;
1572
1573 case TGSI_OPCODE_ABS:
1574 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1575 FETCH( func, *inst, 0, 0, chan_index );
   1576          emit_abs( func, 0 );
1577
1578 STORE( func, *inst, 0, 0, chan_index );
1579 }
1580 break;
1581
1582 case TGSI_OPCODE_RCC:
1583 return 0;
1584 break;
1585
1586 case TGSI_OPCODE_DPH:
1587 FETCH( func, *inst, 0, 0, CHAN_X );
1588 FETCH( func, *inst, 1, 1, CHAN_X );
1589 emit_mul( func, 0, 1 );
1590 FETCH( func, *inst, 1, 0, CHAN_Y );
1591 FETCH( func, *inst, 2, 1, CHAN_Y );
1592 emit_mul( func, 1, 2 );
1593 emit_add( func, 0, 1 );
1594 FETCH( func, *inst, 1, 0, CHAN_Z );
1595 FETCH( func, *inst, 2, 1, CHAN_Z );
1596 emit_mul( func, 1, 2 );
1597 emit_add( func, 0, 1 );
1598 FETCH( func, *inst, 1, 1, CHAN_W );
1599 emit_add( func, 0, 1 );
1600 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1601 STORE( func, *inst, 0, 0, chan_index );
1602 }
1603 break;
1604
1605 case TGSI_OPCODE_COS:
1606 FETCH( func, *inst, 0, 0, CHAN_X );
1607 emit_cos( func, 0 );
1608 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1609 STORE( func, *inst, 0, 0, chan_index );
1610 }
1611 break;
1612
1613 case TGSI_OPCODE_DDX:
1614 return 0;
1615 break;
1616
1617 case TGSI_OPCODE_DDY:
1618 return 0;
1619 break;
1620
1621 case TGSI_OPCODE_KILP:
1622 /* predicated kill */
1623 emit_kilp( func );
1624 return 0; /* XXX fix me */
1625 break;
1626
1627 case TGSI_OPCODE_KIL:
1628 /* conditional kill */
1629 emit_kil( func, &inst->FullSrcRegisters[0] );
1630 break;
1631
1632 case TGSI_OPCODE_PK2H:
1633 return 0;
1634 break;
1635
1636 case TGSI_OPCODE_PK2US:
1637 return 0;
1638 break;
1639
1640 case TGSI_OPCODE_PK4B:
1641 return 0;
1642 break;
1643
1644 case TGSI_OPCODE_PK4UB:
1645 return 0;
1646 break;
1647
1648 case TGSI_OPCODE_RFL:
1649 return 0;
1650 break;
1651
1652 case TGSI_OPCODE_SEQ:
1653 return 0;
1654 break;
1655
1656 case TGSI_OPCODE_SFL:
1657 return 0;
1658 break;
1659
1660 case TGSI_OPCODE_SGT:
1661 return 0;
1662 break;
1663
1664 case TGSI_OPCODE_SIN:
1665 FETCH( func, *inst, 0, 0, CHAN_X );
1666 emit_sin( func, 0 );
1667 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1668 STORE( func, *inst, 0, 0, chan_index );
1669 }
1670 break;
1671
1672 case TGSI_OPCODE_SLE:
1673 return 0;
1674 break;
1675
1676 case TGSI_OPCODE_SNE:
1677 return 0;
1678 break;
1679
1680 case TGSI_OPCODE_STR:
1681 return 0;
1682 break;
1683
1684 case TGSI_OPCODE_TEX:
1685 if (0) {
1686 /* Disable dummy texture code:
1687 */
1688 emit_tempf(
1689 func,
1690 0,
1691 TGSI_EXEC_TEMP_ONE_I,
1692 TGSI_EXEC_TEMP_ONE_C );
1693 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1694 STORE( func, *inst, 0, 0, chan_index );
1695 }
1696 }
1697 else {
1698 return 0;
1699 }
1700 break;
1701
1702 case TGSI_OPCODE_TXD:
1703 return 0;
1704 break;
1705
1706 case TGSI_OPCODE_UP2H:
1707 return 0;
1708 break;
1709
1710 case TGSI_OPCODE_UP2US:
1711 return 0;
1712 break;
1713
1714 case TGSI_OPCODE_UP4B:
1715 return 0;
1716 break;
1717
1718 case TGSI_OPCODE_UP4UB:
1719 return 0;
1720 break;
1721
1722 case TGSI_OPCODE_X2D:
1723 return 0;
1724 break;
1725
1726 case TGSI_OPCODE_ARA:
1727 return 0;
1728 break;
1729
1730 case TGSI_OPCODE_ARR:
1731 return 0;
1732 break;
1733
1734 case TGSI_OPCODE_BRA:
1735 return 0;
1736 break;
1737
1738 case TGSI_OPCODE_CAL:
1739 return 0;
1740 break;
1741
1742 case TGSI_OPCODE_RET:
1743 emit_ret( func );
1744 break;
1745
1746 case TGSI_OPCODE_END:
1747 break;
1748
1749 case TGSI_OPCODE_SSG:
1750 return 0;
1751 break;
1752
1753 case TGSI_OPCODE_CMP:
1754 emit_cmp (func, inst);
1755 break;
1756
1757 case TGSI_OPCODE_SCS:
1758 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
1759 FETCH( func, *inst, 0, 0, CHAN_X );
1760 emit_cos( func, 0 );
1761 STORE( func, *inst, 0, 0, CHAN_X );
1762 }
1763 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
1764 FETCH( func, *inst, 0, 0, CHAN_X );
1765 emit_sin( func, 0 );
1766 STORE( func, *inst, 0, 0, CHAN_Y );
1767 }
1768 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
1769 emit_tempf(
1770 func,
1771 0,
1772 TGSI_EXEC_TEMP_00000000_I,
1773 TGSI_EXEC_TEMP_00000000_C );
1774 STORE( func, *inst, 0, 0, CHAN_Z );
1775 }
1776 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
1777 emit_tempf(
1778 func,
1779 0,
1780 TGSI_EXEC_TEMP_ONE_I,
1781 TGSI_EXEC_TEMP_ONE_C );
1782 STORE( func, *inst, 0, 0, CHAN_W );
1783 }
1784 break;
1785
1786 case TGSI_OPCODE_TXB:
1787 return 0;
1788 break;
1789
1790 case TGSI_OPCODE_NRM:
1791 return 0;
1792 break;
1793
1794 case TGSI_OPCODE_DIV:
1795 return 0;
1796 break;
1797
1798 case TGSI_OPCODE_DP2:
1799 return 0;
1800 break;
1801
1802 case TGSI_OPCODE_TXL:
1803 return 0;
1804 break;
1805
1806 case TGSI_OPCODE_BRK:
1807 return 0;
1808 break;
1809
1810 case TGSI_OPCODE_IF:
1811 return 0;
1812 break;
1813
1814 case TGSI_OPCODE_LOOP:
1815 return 0;
1816 break;
1817
1818 case TGSI_OPCODE_REP:
1819 return 0;
1820 break;
1821
1822 case TGSI_OPCODE_ELSE:
1823 return 0;
1824 break;
1825
1826 case TGSI_OPCODE_ENDIF:
1827 return 0;
1828 break;
1829
1830 case TGSI_OPCODE_ENDLOOP:
1831 return 0;
1832 break;
1833
1834 case TGSI_OPCODE_ENDREP:
1835 return 0;
1836 break;
1837
1838 case TGSI_OPCODE_PUSHA:
1839 return 0;
1840 break;
1841
1842 case TGSI_OPCODE_POPA:
1843 return 0;
1844 break;
1845
1846 case TGSI_OPCODE_CEIL:
1847 return 0;
1848 break;
1849
1850 case TGSI_OPCODE_I2F:
1851 return 0;
1852 break;
1853
1854 case TGSI_OPCODE_NOT:
1855 return 0;
1856 break;
1857
1858 case TGSI_OPCODE_TRUNC:
1859 return 0;
1860 break;
1861
1862 case TGSI_OPCODE_SHL:
1863 return 0;
1864 break;
1865
1866 case TGSI_OPCODE_SHR:
1867 return 0;
1868 break;
1869
1870 case TGSI_OPCODE_AND:
1871 return 0;
1872 break;
1873
1874 case TGSI_OPCODE_OR:
1875 return 0;
1876 break;
1877
1878 case TGSI_OPCODE_MOD:
1879 return 0;
1880 break;
1881
1882 case TGSI_OPCODE_XOR:
1883 return 0;
1884 break;
1885
1886 case TGSI_OPCODE_SAD:
1887 return 0;
1888 break;
1889
1890 case TGSI_OPCODE_TXF:
1891 return 0;
1892 break;
1893
1894 case TGSI_OPCODE_TXQ:
1895 return 0;
1896 break;
1897
1898 case TGSI_OPCODE_CONT:
1899 return 0;
1900 break;
1901
1902 case TGSI_OPCODE_EMIT:
1903 return 0;
1904 break;
1905
1906 case TGSI_OPCODE_ENDPRIM:
1907 return 0;
1908 break;
1909
1910 default:
1911 return 0;
1912 }
1913
1914 return 1;
1915 }
1916
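/* Added commentary: emit code for an input declaration (fragment shaders
 * only).  For each declared channel the interpolated value is in effect
 *
 *    a = a0 + dadx * x + dady * y;              // LINEAR
 *    a = (a0 + dadx * x + dady * y) * (1 / w);  // PERSPECTIVE
 *
 * where x, y and w are read from temp register 0 (presumably pre-loaded
 * with the fragment position by the caller), and the result is written
 * back to the input register so the instruction phase can fetch it.
 */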
1917 static void
1918 emit_declaration(
1919 struct x86_function *func,
1920 struct tgsi_full_declaration *decl )
1921 {
1922 if( decl->Declaration.File == TGSI_FILE_INPUT ) {
1923 unsigned first, last, mask;
1924 unsigned i, j;
1925
1926 first = decl->DeclarationRange.First;
1927 last = decl->DeclarationRange.Last;
1928 mask = decl->Declaration.UsageMask;
1929
1930 for( i = first; i <= last; i++ ) {
1931 for( j = 0; j < NUM_CHANNELS; j++ ) {
1932 if( mask & (1 << j) ) {
1933 switch( decl->Declaration.Interpolate ) {
1934 case TGSI_INTERPOLATE_CONSTANT:
1935 emit_coef_a0( func, 0, i, j );
1936 emit_inputs( func, 0, i, j );
1937 break;
1938
1939 case TGSI_INTERPOLATE_LINEAR:
1940 emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
1941 emit_coef_dadx( func, 1, i, j );
1942 emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
1943 emit_coef_dady( func, 3, i, j );
1944 emit_mul( func, 0, 1 ); /* x * dadx */
1945 emit_coef_a0( func, 4, i, j );
1946 emit_mul( func, 2, 3 ); /* y * dady */
1947 emit_add( func, 0, 4 ); /* x * dadx + a0 */
1948 emit_add( func, 0, 2 ); /* x * dadx + y * dady + a0 */
1949 emit_inputs( func, 0, i, j );
1950 break;
1951
1952 case TGSI_INTERPOLATE_PERSPECTIVE:
1953 emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
1954 emit_coef_dadx( func, 1, i, j );
1955 emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
1956 emit_coef_dady( func, 3, i, j );
1957 emit_mul( func, 0, 1 ); /* x * dadx */
1958 emit_tempf( func, 4, 0, TGSI_SWIZZLE_W );
1959 emit_coef_a0( func, 5, i, j );
1960 emit_rcp( func, 4, 4 ); /* 1.0 / w */
1961 emit_mul( func, 2, 3 ); /* y * dady */
1962 emit_add( func, 0, 5 ); /* x * dadx + a0 */
1963 emit_add( func, 0, 2 ); /* x * dadx + y * dady + a0 */
1964 emit_mul( func, 0, 4 ); /* (x * dadx + y * dady + a0) / w */
1965 emit_inputs( func, 0, i, j );
1966 break;
1967
1968 default:
1969 assert( 0 );
1970 break;
1971 }
1972 }
1973 }
1974 }
1975 }
1976 }
1977
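/* Added commentary: convert four vertices worth of input data from AoS
 * to SoA layout -- conceptually a 4x4 float transpose per attribute:
 *
 *    AoS:  v0.xyzw   v1.xyzw   v2.xyzw   v3.xyzw    (one stride apart)
 *    SoA:  xxxx      yyyy      zzzz      wwww       (four 16-byte rows)
 *
 * implemented with movlps/movhps loads plus shufps, looping once per
 * input attribute.  EBX is saved/restored because it serves as the AoS
 * pointer.
 */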
1978 static void aos_to_soa( struct x86_function *func,
1979 uint arg_aos,
1980 uint arg_soa,
1981 uint arg_num,
1982 uint arg_stride )
1983 {
1984 struct x86_reg soa_input = x86_make_reg( file_REG32, reg_AX );
1985 struct x86_reg aos_input = x86_make_reg( file_REG32, reg_BX );
1986 struct x86_reg num_inputs = x86_make_reg( file_REG32, reg_CX );
1987 struct x86_reg stride = x86_make_reg( file_REG32, reg_DX );
1988 int inner_loop;
1989
1990
1991 /* Save EBX */
1992 x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
1993
1994 x86_mov( func, aos_input, x86_fn_arg( func, arg_aos ) );
1995 x86_mov( func, soa_input, x86_fn_arg( func, arg_soa ) );
1996 x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) );
1997 x86_mov( func, stride, x86_fn_arg( func, arg_stride ) );
1998
1999 /* do */
2000 inner_loop = x86_get_label( func );
2001 {
2002 x86_push( func, aos_input );
2003 sse_movlps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
2004 sse_movlps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
2005 x86_add( func, aos_input, stride );
2006 sse_movhps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
2007 sse_movhps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
2008 x86_add( func, aos_input, stride );
2009 sse_movlps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
2010 sse_movlps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
2011 x86_add( func, aos_input, stride );
2012 sse_movhps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
2013 sse_movhps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
2014 x86_pop( func, aos_input );
2015
2016 sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
2017 sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
2018 sse_shufps( func, make_xmm( 0 ), make_xmm( 1 ), 0x88 );
2019 sse_shufps( func, make_xmm( 2 ), make_xmm( 1 ), 0xdd );
2020 sse_shufps( func, make_xmm( 3 ), make_xmm( 4 ), 0x88 );
2021 sse_shufps( func, make_xmm( 5 ), make_xmm( 4 ), 0xdd );
2022
2023 sse_movups( func, x86_make_disp( soa_input, 0 ), make_xmm( 0 ) );
2024 sse_movups( func, x86_make_disp( soa_input, 16 ), make_xmm( 2 ) );
2025 sse_movups( func, x86_make_disp( soa_input, 32 ), make_xmm( 3 ) );
2026 sse_movups( func, x86_make_disp( soa_input, 48 ), make_xmm( 5 ) );
2027
2028 /* Advance to next input */
2029 x86_lea( func, aos_input, x86_make_disp(aos_input, 16) );
2030 x86_lea( func, soa_input, x86_make_disp(soa_input, 64) );
2031 }
2032 /* while --num_inputs */
2033 x86_dec( func, num_inputs );
2034 x86_jcc( func, cc_NE, inner_loop );
2035
2036 /* Restore EBX */
2037 x86_pop( func, aos_input );
2038 }
2039
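/* Added commentary: the inverse of aos_to_soa() -- read the xxxx/yyyy/
 * zzzz/wwww SoA rows, transpose them with unpcklps/unpckhps, and scatter
 * the resulting per-vertex xyzw vectors to the strided AoS output array
 * with movlps/movhps stores.
 */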
2040 static void soa_to_aos( struct x86_function *func, uint aos, uint soa, uint num, uint stride )
2041 {
2042 struct x86_reg soa_output;
2043 struct x86_reg aos_output;
2044 struct x86_reg num_outputs;
2045 struct x86_reg temp;
2046 int inner_loop;
2047
2048 soa_output = x86_make_reg( file_REG32, reg_AX );
2049 aos_output = x86_make_reg( file_REG32, reg_BX );
2050 num_outputs = x86_make_reg( file_REG32, reg_CX );
2051 temp = x86_make_reg( file_REG32, reg_DX );
2052
2053 /* Save EBX */
2054 x86_push( func, aos_output );
2055
2056 x86_mov( func, soa_output, x86_fn_arg( func, soa ) );
2057 x86_mov( func, aos_output, x86_fn_arg( func, aos ) );
2058 x86_mov( func, num_outputs, x86_fn_arg( func, num ) );
2059
2060 /* do */
2061 inner_loop = x86_get_label( func );
2062 {
2063 sse_movups( func, make_xmm( 0 ), x86_make_disp( soa_output, 0 ) );
2064 sse_movups( func, make_xmm( 1 ), x86_make_disp( soa_output, 16 ) );
2065 sse_movups( func, make_xmm( 3 ), x86_make_disp( soa_output, 32 ) );
2066 sse_movups( func, make_xmm( 4 ), x86_make_disp( soa_output, 48 ) );
2067
2068 sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
2069 sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
2070 sse_unpcklps( func, make_xmm( 0 ), make_xmm( 1 ) );
2071 sse_unpckhps( func, make_xmm( 2 ), make_xmm( 1 ) );
2072 sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) );
2073 sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) );
2074
2075 x86_mov( func, temp, x86_fn_arg( func, stride ) );
2076 x86_push( func, aos_output );
2077 sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
2078 sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
2079 x86_add( func, aos_output, temp );
2080 sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
2081 sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
2082 x86_add( func, aos_output, temp );
2083 sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
2084 sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
2085 x86_add( func, aos_output, temp );
2086 sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
2087 sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
2088 x86_pop( func, aos_output );
2089
2090 /* Advance to next output */
2091 x86_lea( func, aos_output, x86_make_disp(aos_output, 16) );
2092 x86_lea( func, soa_output, x86_make_disp(soa_output, 64) );
2093 }
2094 /* while --num_outputs */
2095 x86_dec( func, num_outputs );
2096 x86_jcc( func, cc_NE, inner_loop );
2097
2098 /* Restore EBX */
2099 x86_pop( func, aos_output );
2100 }
2101
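/* Illustration (added commentary -- the authoritative prototype lives in
 * the callers): judging from the x86_fn_arg() loads below, the generated
 * fragment-shader function is invoked cdecl-style with six pointer
 * arguments, roughly
 *
 *    shader( inputs,        arg 1: SoA input registers
 *            outputs,       arg 2: SoA output registers
 *            consts,        arg 3: constant buffer
 *            temps,         arg 4: temporaries, incl. TGSI_EXEC_TEMP_* slots
 *            coefs,         arg 5: a0/dadx/dady interpolation coefficients
 *            immediates );  arg 6: buffer filled in by tgsi_emit_sse2()
 *
 * The vertex-shader variant passes immediates as argument 5 (it has no
 * coefficients) and, when do_swizzles is set, additionally receives AoS
 * input/output pointers, counts and strides as arguments 6..11, consumed
 * only by aos_to_soa()/soa_to_aos().
 */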
2102 /**
2103 * Translate a TGSI vertex/fragment shader to SSE2 code.
2104 * Slightly different things are done for vertex vs. fragment shaders.
2105 *
2106 * Note that fragment shaders are responsible for interpolating shader
   2107  * inputs.  Because only a few x86 GP registers are usable as base pointers
   2108  * while there are more shader arguments (input, output, const, temp, coef
   2109  * and immediates), the code is split into two phases -- DECLARATION and
   2110  * INSTRUCTION.  The GP register holding the output argument is aliased
   2111  * with the coef argument, as outputs are not needed in the DECLARATION phase.
2112 *
2113 * \param tokens the TGSI input shader
2114 * \param func the output SSE code/function
2115 * \param immediates buffer to place immediates, later passed to SSE func
   2116  * \return 1 for success, 0 if translation failed
2117 */
2118 unsigned
2119 tgsi_emit_sse2(
2120 const struct tgsi_token *tokens,
2121 struct x86_function *func,
2122 float (*immediates)[4],
2123 boolean do_swizzles )
2124 {
2125 struct tgsi_parse_context parse;
2126 boolean instruction_phase = FALSE;
2127 unsigned ok = 1;
2128 uint num_immediates = 0;
2129
2130 func->csr = func->store;
2131
2132 tgsi_parse_init( &parse, tokens );
2133
2134 /* Can't just use EDI, EBX without save/restoring them:
2135 */
2136 x86_push(
2137 func,
2138 get_immediate_base() );
2139
2140 x86_push(
2141 func,
2142 get_temp_base() );
2143
2144
2145 /*
2146 * Different function args for vertex/fragment shaders:
2147 */
2148 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
2149 /* DECLARATION phase, do not load output argument. */
2150 x86_mov(
2151 func,
2152 get_input_base(),
2153 x86_fn_arg( func, 1 ) );
2154 /* skipping outputs argument here */
2155 x86_mov(
2156 func,
2157 get_const_base(),
2158 x86_fn_arg( func, 3 ) );
2159 x86_mov(
2160 func,
2161 get_temp_base(),
2162 x86_fn_arg( func, 4 ) );
2163 x86_mov(
2164 func,
2165 get_coef_base(),
2166 x86_fn_arg( func, 5 ) );
2167 x86_mov(
2168 func,
2169 get_immediate_base(),
2170 x86_fn_arg( func, 6 ) );
2171 }
2172 else {
2173 assert(parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX);
2174
2175 if (do_swizzles)
2176 aos_to_soa( func,
2177 6, /* aos_input */
2178 1, /* machine->input */
2179 7, /* num_inputs */
2180 8 ); /* input_stride */
2181
2182 x86_mov(
2183 func,
2184 get_input_base(),
2185 x86_fn_arg( func, 1 ) );
2186 x86_mov(
2187 func,
2188 get_output_base(),
2189 x86_fn_arg( func, 2 ) );
2190 x86_mov(
2191 func,
2192 get_const_base(),
2193 x86_fn_arg( func, 3 ) );
2194 x86_mov(
2195 func,
2196 get_temp_base(),
2197 x86_fn_arg( func, 4 ) );
2198 x86_mov(
2199 func,
2200 get_immediate_base(),
2201 x86_fn_arg( func, 5 ) );
2202 }
2203
2204 while( !tgsi_parse_end_of_tokens( &parse ) && ok ) {
2205 tgsi_parse_token( &parse );
2206
2207 switch( parse.FullToken.Token.Type ) {
2208 case TGSI_TOKEN_TYPE_DECLARATION:
2209 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
2210 emit_declaration(
2211 func,
2212 &parse.FullToken.FullDeclaration );
2213 }
2214 break;
2215
2216 case TGSI_TOKEN_TYPE_INSTRUCTION:
2217 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
2218 if( !instruction_phase ) {
2219 /* INSTRUCTION phase, overwrite coeff with output. */
2220 instruction_phase = TRUE;
2221 x86_mov(
2222 func,
2223 get_output_base(),
2224 x86_fn_arg( func, 2 ) );
2225 }
2226 }
2227
2228 ok = emit_instruction(
2229 func,
2230 &parse.FullToken.FullInstruction );
2231
2232 if (!ok) {
2233 debug_printf("failed to translate tgsi opcode %d to SSE (%s)\n",
2234 parse.FullToken.FullInstruction.Instruction.Opcode,
2235 parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX ?
2236 "vertex shader" : "fragment shader");
2237 }
2238 break;
2239
2240 case TGSI_TOKEN_TYPE_IMMEDIATE:
2241 /* simply copy the immediate values into the next immediates[] slot */
2242 {
2243 const uint size = parse.FullToken.FullImmediate.Immediate.Size - 1;
2244 uint i;
2245 assert(size <= 4);
2246 assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES);
2247 for( i = 0; i < size; i++ ) {
2248 immediates[num_immediates][i] =
2249 parse.FullToken.FullImmediate.u.ImmediateFloat32[i].Float;
2250 }
2251 #if 0
2252 debug_printf("SSE FS immediate[%d] = %f %f %f %f\n",
2253 num_immediates,
2254 immediates[num_immediates][0],
2255 immediates[num_immediates][1],
2256 immediates[num_immediates][2],
2257 immediates[num_immediates][3]);
2258 #endif
2259 num_immediates++;
2260 }
2261 break;
2262
2263 default:
2264 ok = 0;
2265 assert( 0 );
2266 }
2267 }
2268
2269 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
2270 if (do_swizzles)
2271 soa_to_aos( func, 9, 2, 10, 11 );
2272 }
2273
2274 /* Can't just use EBX, EDI without save/restoring them:
2275 */
2276 x86_pop(
2277 func,
2278 get_temp_base() );
2279
2280 x86_pop(
2281 func,
2282 get_immediate_base() );
2283
2284 emit_ret( func );
2285
2286 tgsi_parse_free( &parse );
2287
2288 return ok;
2289 }
2290
2291 #endif /* PIPE_ARCH_X86 */