tgsi: Fix ARL opcode in SSE2 codegen.
[mesa.git] src/gallium/auxiliary/tgsi/tgsi_sse2.c
1 /**************************************************************************
2 *
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28 #include "pipe/p_util.h"
29 #include "pipe/p_shader_tokens.h"
30 #include "tgsi/tgsi_parse.h"
31 #include "tgsi/tgsi_util.h"
32 #include "tgsi_exec.h"
33 #include "tgsi_sse2.h"
34
35 #include "rtasm/rtasm_x86sse.h"
36
37 #ifdef PIPE_ARCH_X86
38
39 /* for 1/sqrt()
40 *
41 * Enabling the high-precision path costs about 100 fps (close to 10%) in gears:
42 */
43 #define HIGH_PRECISION 1
44
45
46 #define FOR_EACH_CHANNEL( CHAN )\
47 for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)
48
49 #define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
50 ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))
51
52 #define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
53 if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))
54
55 #define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
56 FOR_EACH_CHANNEL( CHAN )\
57 IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )
58
59 #define CHAN_X 0
60 #define CHAN_Y 1
61 #define CHAN_Z 2
62 #define CHAN_W 3
63
64 #define TEMP_R0 TGSI_EXEC_TEMP_R0
65 #define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
66
67 /**
68 * X86 utility functions.
69 */
70
71 static struct x86_reg
72 make_xmm(
73 unsigned xmm )
74 {
75 return x86_make_reg(
76 file_XMM,
77 (enum x86_reg_name) xmm );
78 }
79
80 /**
81 * X86 register mapping helpers.
82 */
83
84 static struct x86_reg
85 get_const_base( void )
86 {
87 return x86_make_reg(
88 file_REG32,
89 reg_CX );
90 }
91
92 static struct x86_reg
93 get_input_base( void )
94 {
95 return x86_make_reg(
96 file_REG32,
97 reg_AX );
98 }
99
100 static struct x86_reg
101 get_output_base( void )
102 {
103 return x86_make_reg(
104 file_REG32,
105 reg_DX );
106 }
107
108 static struct x86_reg
109 get_temp_base( void )
110 {
111 return x86_make_reg(
112 file_REG32,
113 reg_BX );
114 }
115
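/* Note that the coefficient pointer shares its GP register with the output
* pointer: outputs are not written during the fragment shader's DECLARATION
* (interpolation) phase, and coefficients are no longer needed once the
* INSTRUCTION phase begins, so the register can be reused (see
* tgsi_emit_sse2() below).
*/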
116 static struct x86_reg
117 get_coef_base( void )
118 {
119 return get_output_base();
120 }
121
122 static struct x86_reg
123 get_immediate_base( void )
124 {
125 return x86_make_reg(
126 file_REG32,
127 reg_DI );
128 }
129
130
131 /**
132 * Data access helpers.
133 */
134
135
136 static struct x86_reg
137 get_immediate(
138 unsigned vec,
139 unsigned chan )
140 {
141 return x86_make_disp(
142 get_immediate_base(),
143 (vec * 4 + chan) * 4 );
144 }
145
146 static struct x86_reg
147 get_const(
148 unsigned vec,
149 unsigned chan )
150 {
151 return x86_make_disp(
152 get_const_base(),
153 (vec * 4 + chan) * 4 );
154 }
155
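/* Inputs, outputs and temporaries are stored in SoA form: each channel of a
* register holds a quad of four floats, hence the 16-byte stride below.
* Constants and immediates above hold a single float per channel, hence
* their 4-byte stride.
*/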
156 static struct x86_reg
157 get_input(
158 unsigned vec,
159 unsigned chan )
160 {
161 return x86_make_disp(
162 get_input_base(),
163 (vec * 4 + chan) * 16 );
164 }
165
166 static struct x86_reg
167 get_output(
168 unsigned vec,
169 unsigned chan )
170 {
171 return x86_make_disp(
172 get_output_base(),
173 (vec * 4 + chan) * 16 );
174 }
175
176 static struct x86_reg
177 get_temp(
178 unsigned vec,
179 unsigned chan )
180 {
181 return x86_make_disp(
182 get_temp_base(),
183 (vec * 4 + chan) * 16 );
184 }
185
186 static struct x86_reg
187 get_coef(
188 unsigned vec,
189 unsigned chan,
190 unsigned member )
191 {
192 return x86_make_disp(
193 get_coef_base(),
194 ((vec * 3 + member) * 4 + chan) * 4 );
195 }
196
197
198 static void
199 emit_ret(
200 struct x86_function *func )
201 {
202 x86_ret( func );
203 }
204
205
206 /**
207 * Data fetch helpers.
208 */
209
210 /**
211 * Copy a shader constant to xmm register
212 * \param xmm the destination xmm register
213 * \param vec the src const buffer index
214 * \param chan src channel to fetch (X, Y, Z or W)
215 * If 'indirect' is nonzero, the constant index is offset per quad element by the ADDR register. */
216 static void
217 emit_const(
218 struct x86_function *func,
219 uint xmm,
220 int vec,
221 uint chan,
222 uint indirect,
223 uint indirectFile,
224 int indirectIndex )
225 {
226 if (indirect) {
227 struct x86_reg r0 = get_input_base();
228 struct x86_reg r1 = get_output_base();
229 uint i;
230
231 assert( indirectFile == TGSI_FILE_ADDRESS );
232 assert( indirectIndex == 0 );
233
234 x86_push( func, r0 );
235 x86_push( func, r1 );
236
237 for (i = 0; i < QUAD_SIZE; i++) {
238 x86_lea( func, r0, get_const( vec, chan ) );
239 x86_mov( func, r1, x86_make_disp( get_temp( TEMP_ADDR, CHAN_X ), i * 4 ) );
240
241 /* Quick hack to multiply by 16 (four self-adds == shift left by 4) -- need to add SHL to rtasm.
242 */
243 x86_add( func, r1, r1 );
244 x86_add( func, r1, r1 );
245 x86_add( func, r1, r1 );
246 x86_add( func, r1, r1 );
247
248 x86_add( func, r0, r1 );
249 x86_mov( func, r1, x86_deref( r0 ) );
250 x86_mov( func, x86_make_disp( get_temp( TEMP_R0, CHAN_X ), i * 4 ), r1 );
251 }
252
253 x86_pop( func, r1 );
254 x86_pop( func, r0 );
255
256 sse_movaps(
257 func,
258 make_xmm( xmm ),
259 get_temp( TEMP_R0, CHAN_X ) );
260 }
261 else {
262 assert( vec >= 0 );
263
264 sse_movss(
265 func,
266 make_xmm( xmm ),
267 get_const( vec, chan ) );
268 sse_shufps(
269 func,
270 make_xmm( xmm ),
271 make_xmm( xmm ),
272 SHUF( 0, 0, 0, 0 ) );
273 }
274 }
275
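/* Load a scalar immediate into the low channel and broadcast (splat) it to
* all four channels of the xmm register (movss + shufps 0,0,0,0).
*/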
276 static void
277 emit_immediate(
278 struct x86_function *func,
279 unsigned xmm,
280 unsigned vec,
281 unsigned chan )
282 {
283 sse_movss(
284 func,
285 make_xmm( xmm ),
286 get_immediate( vec, chan ) );
287 sse_shufps(
288 func,
289 make_xmm( xmm ),
290 make_xmm( xmm ),
291 SHUF( 0, 0, 0, 0 ) );
292 }
293
294
295 /**
296 * Copy a shader input to xmm register
297 * \param xmm the destination xmm register
298 * \param vec the src input attrib
299 * \param chan src channel to fetch (X, Y, Z or W)
300 */
301 static void
302 emit_inputf(
303 struct x86_function *func,
304 unsigned xmm,
305 unsigned vec,
306 unsigned chan )
307 {
308 sse_movups(
309 func,
310 make_xmm( xmm ),
311 get_input( vec, chan ) );
312 }
313
314 /**
315 * Store an xmm register to a shader output
316 * \param xmm the source xmm register
317 * \param vec the dest output attrib
318 * \param chan the dest channel to store (X, Y, Z or W)
319 */
320 static void
321 emit_output(
322 struct x86_function *func,
323 unsigned xmm,
324 unsigned vec,
325 unsigned chan )
326 {
327 sse_movups(
328 func,
329 get_output( vec, chan ),
330 make_xmm( xmm ) );
331 }
332
333 /**
334 * Copy a shader temporary to xmm register
335 * \param xmm the destination xmm register
336 * \param vec the src temp register
337 * \param chan src channel to fetch (X, Y, Z or W)
338 */
339 static void
340 emit_tempf(
341 struct x86_function *func,
342 unsigned xmm,
343 unsigned vec,
344 unsigned chan )
345 {
346 sse_movaps(
347 func,
348 make_xmm( xmm ),
349 get_temp( vec, chan ) );
350 }
351
352 /**
353 * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
354 * \param xmm the destination xmm register
355 * \param vec the src input/attribute coefficient index
356 * \param chan src channel to fetch (X, Y, Z or W)
357 * \param member 0=a0, 1=dadx, 2=dady
358 */
359 static void
360 emit_coef(
361 struct x86_function *func,
362 unsigned xmm,
363 unsigned vec,
364 unsigned chan,
365 unsigned member )
366 {
367 sse_movss(
368 func,
369 make_xmm( xmm ),
370 get_coef( vec, chan, member ) );
371 sse_shufps(
372 func,
373 make_xmm( xmm ),
374 make_xmm( xmm ),
375 SHUF( 0, 0, 0, 0 ) );
376 }
377
378 /**
379 * Data store helpers.
380 */
381
382 static void
383 emit_inputs(
384 struct x86_function *func,
385 unsigned xmm,
386 unsigned vec,
387 unsigned chan )
388 {
389 sse_movups(
390 func,
391 get_input( vec, chan ),
392 make_xmm( xmm ) );
393 }
394
395 static void
396 emit_temps(
397 struct x86_function *func,
398 unsigned xmm,
399 unsigned vec,
400 unsigned chan )
401 {
402 sse_movaps(
403 func,
404 get_temp( vec, chan ),
405 make_xmm( xmm ) );
406 }
407
408 static void
409 emit_addrs(
410 struct x86_function *func,
411 unsigned xmm,
412 unsigned vec,
413 unsigned chan )
414 {
415 assert( vec == 0 );
416
417 emit_temps(
418 func,
419 xmm,
420 vec + TGSI_EXEC_TEMP_ADDR,
421 chan );
422 }
423
424 /**
425 * Coefficient fetch helpers.
426 */
427
428 static void
429 emit_coef_a0(
430 struct x86_function *func,
431 unsigned xmm,
432 unsigned vec,
433 unsigned chan )
434 {
435 emit_coef(
436 func,
437 xmm,
438 vec,
439 chan,
440 0 );
441 }
442
443 static void
444 emit_coef_dadx(
445 struct x86_function *func,
446 unsigned xmm,
447 unsigned vec,
448 unsigned chan )
449 {
450 emit_coef(
451 func,
452 xmm,
453 vec,
454 chan,
455 1 );
456 }
457
458 static void
459 emit_coef_dady(
460 struct x86_function *func,
461 unsigned xmm,
462 unsigned vec,
463 unsigned chan )
464 {
465 emit_coef(
466 func,
467 xmm,
468 vec,
469 chan,
470 2 );
471 }
472
473 /**
474 * Function call helpers.
475 */
476
477 static void
478 emit_push_gp(
479 struct x86_function *func )
480 {
481 x86_push(
482 func,
483 x86_make_reg( file_REG32, reg_AX) );
484 x86_push(
485 func,
486 x86_make_reg( file_REG32, reg_CX) );
487 x86_push(
488 func,
489 x86_make_reg( file_REG32, reg_DX) );
490 }
491
492 static void
493 x86_pop_gp(
494 struct x86_function *func )
495 {
496 /* Restore GP registers in reverse order.
497 */
498 x86_pop(
499 func,
500 x86_make_reg( file_REG32, reg_DX) );
501 x86_pop(
502 func,
503 x86_make_reg( file_REG32, reg_CX) );
504 x86_pop(
505 func,
506 x86_make_reg( file_REG32, reg_AX) );
507 }
508
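/* Call out to a C helper that operates in place on a quad of floats.
* The xmm operand is spilled to TEMP_R0 in memory, the caller-saved GP
* registers are pushed, and the address of TEMP_R0 is passed as the single
* cdecl stack argument. The result is reloaded from TEMP_R0 afterwards.
*/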
509 static void
510 emit_func_call_dst(
511 struct x86_function *func,
512 unsigned xmm_dst,
513 void (PIPE_CDECL *code)() )
514 {
515 sse_movaps(
516 func,
517 get_temp( TEMP_R0, 0 ),
518 make_xmm( xmm_dst ) );
519
520 emit_push_gp(
521 func );
522
523 {
524 struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
525
526 x86_lea(
527 func,
528 ecx,
529 get_temp( TEMP_R0, 0 ) );
530
531 x86_push( func, ecx );
532 x86_mov_reg_imm( func, ecx, (unsigned long) code );
533 x86_call( func, ecx );
534 x86_pop(func, ecx );
535 }
536
537
538 x86_pop_gp(
539 func );
540
541 sse_movaps(
542 func,
543 make_xmm( xmm_dst ),
544 get_temp( TEMP_R0, 0 ) );
545 }
546
547 static void
548 emit_func_call_dst_src(
549 struct x86_function *func,
550 unsigned xmm_dst,
551 unsigned xmm_src,
552 void (PIPE_CDECL *code)() )
553 {
554 sse_movaps(
555 func,
556 get_temp( TEMP_R0, 1 ),
557 make_xmm( xmm_src ) );
558
559 emit_func_call_dst(
560 func,
561 xmm_dst,
562 code );
563 }
564
565 /**
566 * Low-level instruction translators.
567 */
568
569 static void
570 emit_abs(
571 struct x86_function *func,
572 unsigned xmm )
573 {
574 sse_andps(
575 func,
576 make_xmm( xmm ),
577 get_temp(
578 TGSI_EXEC_TEMP_7FFFFFFF_I,
579 TGSI_EXEC_TEMP_7FFFFFFF_C ) );
580 }
581
582 static void
583 emit_add(
584 struct x86_function *func,
585 unsigned xmm_dst,
586 unsigned xmm_src )
587 {
588 sse_addps(
589 func,
590 make_xmm( xmm_dst ),
591 make_xmm( xmm_src ) );
592 }
593
594 static void PIPE_CDECL
595 cos4f(
596 float *store )
597 {
598 const unsigned X = 0;
599
600 store[X + 0] = cosf( store[X + 0] );
601 store[X + 1] = cosf( store[X + 1] );
602 store[X + 2] = cosf( store[X + 2] );
603 store[X + 3] = cosf( store[X + 3] );
604 }
605
606 static void
607 emit_cos(
608 struct x86_function *func,
609 unsigned xmm_dst )
610 {
611 emit_func_call_dst(
612 func,
613 xmm_dst,
614 cos4f );
615 }
616
617 static void PIPE_CDECL
618 ex24f(
619 float *store )
620 {
621 const unsigned X = 0;
622
623 store[X + 0] = powf( 2.0f, store[X + 0] );
624 store[X + 1] = powf( 2.0f, store[X + 1] );
625 store[X + 2] = powf( 2.0f, store[X + 2] );
626 store[X + 3] = powf( 2.0f, store[X + 3] );
627 }
628
629 static void
630 emit_ex2(
631 struct x86_function *func,
632 unsigned xmm_dst )
633 {
634 emit_func_call_dst(
635 func,
636 xmm_dst,
637 ex24f );
638 }
639
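/* Convert four packed floats to 32-bit integers, truncating toward zero
* (cvttps2dq).
*/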
640 static void
641 emit_f2it(
642 struct x86_function *func,
643 unsigned xmm )
644 {
645 sse2_cvttps2dq(
646 func,
647 make_xmm( xmm ),
648 make_xmm( xmm ) );
649 }
650
651 static void PIPE_CDECL
652 flr4f(
653 float *store )
654 {
655 const unsigned X = 0;
656
657 store[X + 0] = floorf( store[X + 0] );
658 store[X + 1] = floorf( store[X + 1] );
659 store[X + 2] = floorf( store[X + 2] );
660 store[X + 3] = floorf( store[X + 3] );
661 }
662
663 static void
664 emit_flr(
665 struct x86_function *func,
666 unsigned xmm_dst )
667 {
668 emit_func_call_dst(
669 func,
670 xmm_dst,
671 flr4f );
672 }
673
674 static void PIPE_CDECL
675 frc4f(
676 float *store )
677 {
678 const unsigned X = 0;
679
680 store[X + 0] -= floorf( store[X + 0] );
681 store[X + 1] -= floorf( store[X + 1] );
682 store[X + 2] -= floorf( store[X + 2] );
683 store[X + 3] -= floorf( store[X + 3] );
684 }
685
686 static void
687 emit_frc(
688 struct x86_function *func,
689 unsigned xmm_dst )
690 {
691 emit_func_call_dst(
692 func,
693 xmm_dst,
694 frc4f );
695 }
696
697 static void PIPE_CDECL
698 lg24f(
699 float *store )
700 {
701 const unsigned X = 0;
702
703 store[X + 0] = LOG2( store[X + 0] );
704 store[X + 1] = LOG2( store[X + 1] );
705 store[X + 2] = LOG2( store[X + 2] );
706 store[X + 3] = LOG2( store[X + 3] );
707 }
708
709 static void
710 emit_lg2(
711 struct x86_function *func,
712 unsigned xmm_dst )
713 {
714 emit_func_call_dst(
715 func,
716 xmm_dst,
717 lg24f );
718 }
719
720 static void
721 emit_MOV(
722 struct x86_function *func,
723 unsigned xmm_dst,
724 unsigned xmm_src )
725 {
726 sse_movups(
727 func,
728 make_xmm( xmm_dst ),
729 make_xmm( xmm_src ) );
730 }
731
732 static void
733 emit_mul (struct x86_function *func,
734 unsigned xmm_dst,
735 unsigned xmm_src)
736 {
737 sse_mulps(
738 func,
739 make_xmm( xmm_dst ),
740 make_xmm( xmm_src ) );
741 }
742
743 static void
744 emit_neg(
745 struct x86_function *func,
746 unsigned xmm )
747 {
748 sse_xorps(
749 func,
750 make_xmm( xmm ),
751 get_temp(
752 TGSI_EXEC_TEMP_80000000_I,
753 TGSI_EXEC_TEMP_80000000_C ) );
754 }
755
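/* Note: the exponent quad is stored immediately after the base quad
* (TEMP_R0 channel 1, see emit_func_call_dst_src()), hence store[X + 4..7].
*/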
756 static void PIPE_CDECL
757 pow4f(
758 float *store )
759 {
760 const unsigned X = 0;
761
762 store[X + 0] = powf( store[X + 0], store[X + 4] );
763 store[X + 1] = powf( store[X + 1], store[X + 5] );
764 store[X + 2] = powf( store[X + 2], store[X + 6] );
765 store[X + 3] = powf( store[X + 3], store[X + 7] );
766 }
767
768 static void
769 emit_pow(
770 struct x86_function *func,
771 unsigned xmm_dst,
772 unsigned xmm_src )
773 {
774 emit_func_call_dst_src(
775 func,
776 xmm_dst,
777 xmm_src,
778 pow4f );
779 }
780
781 static void
782 emit_rcp (
783 struct x86_function *func,
784 unsigned xmm_dst,
785 unsigned xmm_src )
786 {
787 /* On Intel CPUs at least, this is only accurate to 12 bits -- not
788 * good enough. Need to either emit a proper divide or use the
789 * iterative technique described below in emit_rsqrt().
790 */
791 sse2_rcpps(
792 func,
793 make_xmm( xmm_dst ),
794 make_xmm( xmm_src ) );
795 }
796
797 static void
798 emit_rsqrt(
799 struct x86_function *func,
800 unsigned xmm_dst,
801 unsigned xmm_src )
802 {
803 #if HIGH_PRECISION
804 /* Although rsqrtps() and rcpps() are low precision on some/all SSE
805 * implementations, it is possible to improve their precision at
806 * fairly low cost, using a Newton-Raphson step, as below:
807 *
808 * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)                    (for RCP)
809 * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a)) * rsqrtps(a)]  (for RSQ)
810 *
811 * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
812 */
813 {
814 struct x86_reg dst = make_xmm( xmm_dst );
815 struct x86_reg src = make_xmm( xmm_src );
816 struct x86_reg tmp0 = make_xmm( 2 );
817 struct x86_reg tmp1 = make_xmm( 3 );
818
819 assert( xmm_dst != xmm_src );
820 assert( xmm_dst != 2 && xmm_dst != 3 );
821 assert( xmm_src != 2 && xmm_src != 3 );
822
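/* With r = rsqrtps(src), the sequence below computes
* dst = 0.5 * r * (3.0 - src * r * r):
* dst starts as 0.5 and tmp0 as 3.0; src is progressively overwritten with
* src * r and then src * r * r before the subtract.
*/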
823 sse_movaps( func, dst, get_temp( TGSI_EXEC_TEMP_HALF_I, TGSI_EXEC_TEMP_HALF_C ) );
824 sse_movaps( func, tmp0, get_temp( TGSI_EXEC_TEMP_THREE_I, TGSI_EXEC_TEMP_THREE_C ) );
825 sse_rsqrtps( func, tmp1, src );
826 sse_mulps( func, src, tmp1 );
827 sse_mulps( func, dst, tmp1 );
828 sse_mulps( func, src, tmp1 );
829 sse_subps( func, tmp0, src );
830 sse_mulps( func, dst, tmp0 );
831 }
832 #else
833 /* On Intel CPUs at least, this is only accurate to 12 bits -- not
834 * good enough.
835 */
836 sse_rsqrtps(
837 func,
838 make_xmm( xmm_dst ),
839 make_xmm( xmm_src ) );
840 #endif
841 }
842
843 static void
844 emit_setsign(
845 struct x86_function *func,
846 unsigned xmm )
847 {
848 sse_orps(
849 func,
850 make_xmm( xmm ),
851 get_temp(
852 TGSI_EXEC_TEMP_80000000_I,
853 TGSI_EXEC_TEMP_80000000_C ) );
854 }
855
856 static void PIPE_CDECL
857 sin4f(
858 float *store )
859 {
860 const unsigned X = 0;
861
862 store[X + 0] = sinf( store[X + 0] );
863 store[X + 1] = sinf( store[X + 1] );
864 store[X + 2] = sinf( store[X + 2] );
865 store[X + 3] = sinf( store[X + 3] );
866 }
867
868 static void
869 emit_sin (struct x86_function *func,
870 unsigned xmm_dst)
871 {
872 emit_func_call_dst(
873 func,
874 xmm_dst,
875 sin4f );
876 }
877
878 static void
879 emit_sub(
880 struct x86_function *func,
881 unsigned xmm_dst,
882 unsigned xmm_src )
883 {
884 sse_subps(
885 func,
886 make_xmm( xmm_dst ),
887 make_xmm( xmm_src ) );
888 }
889
890 /**
891 * Register fetch.
892 */
893
894 static void
895 emit_fetch(
896 struct x86_function *func,
897 unsigned xmm,
898 const struct tgsi_full_src_register *reg,
899 const unsigned chan_index )
900 {
901 unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index );
902
903 switch (swizzle) {
904 case TGSI_EXTSWIZZLE_X:
905 case TGSI_EXTSWIZZLE_Y:
906 case TGSI_EXTSWIZZLE_Z:
907 case TGSI_EXTSWIZZLE_W:
908 switch (reg->SrcRegister.File) {
909 case TGSI_FILE_CONSTANT:
910 emit_const(
911 func,
912 xmm,
913 reg->SrcRegister.Index,
914 swizzle,
915 reg->SrcRegister.Indirect,
916 reg->SrcRegisterInd.File,
917 reg->SrcRegisterInd.Index );
918 break;
919
920 case TGSI_FILE_IMMEDIATE:
921 emit_immediate(
922 func,
923 xmm,
924 reg->SrcRegister.Index,
925 swizzle );
926 break;
927
928 case TGSI_FILE_INPUT:
929 emit_inputf(
930 func,
931 xmm,
932 reg->SrcRegister.Index,
933 swizzle );
934 break;
935
936 case TGSI_FILE_TEMPORARY:
937 emit_tempf(
938 func,
939 xmm,
940 reg->SrcRegister.Index,
941 swizzle );
942 break;
943
944 default:
945 assert( 0 );
946 }
947 break;
948
949 case TGSI_EXTSWIZZLE_ZERO:
950 emit_tempf(
951 func,
952 xmm,
953 TGSI_EXEC_TEMP_00000000_I,
954 TGSI_EXEC_TEMP_00000000_C );
955 break;
956
957 case TGSI_EXTSWIZZLE_ONE:
958 emit_tempf(
959 func,
960 xmm,
961 TGSI_EXEC_TEMP_ONE_I,
962 TGSI_EXEC_TEMP_ONE_C );
963 break;
964
965 default:
966 assert( 0 );
967 }
968
969 switch( tgsi_util_get_full_src_register_sign_mode( reg, chan_index ) ) {
970 case TGSI_UTIL_SIGN_CLEAR:
971 emit_abs( func, xmm );
972 break;
973
974 case TGSI_UTIL_SIGN_SET:
975 emit_setsign( func, xmm );
976 break;
977
978 case TGSI_UTIL_SIGN_TOGGLE:
979 emit_neg( func, xmm );
980 break;
981
982 case TGSI_UTIL_SIGN_KEEP:
983 break;
984 }
985 }
986
987 #define FETCH( FUNC, INST, XMM, INDEX, CHAN )\
988 emit_fetch( FUNC, XMM, &(INST).FullSrcRegisters[INDEX], CHAN )
989
990 /**
991 * Register store.
992 */
993
994 static void
995 emit_store(
996 struct x86_function *func,
997 unsigned xmm,
998 const struct tgsi_full_dst_register *reg,
999 const struct tgsi_full_instruction *inst,
1000 unsigned chan_index )
1001 {
1002 switch( reg->DstRegister.File ) {
1003 case TGSI_FILE_OUTPUT:
1004 emit_output(
1005 func,
1006 xmm,
1007 reg->DstRegister.Index,
1008 chan_index );
1009 break;
1010
1011 case TGSI_FILE_TEMPORARY:
1012 emit_temps(
1013 func,
1014 xmm,
1015 reg->DstRegister.Index,
1016 chan_index );
1017 break;
1018
1019 case TGSI_FILE_ADDRESS:
1020 emit_addrs(
1021 func,
1022 xmm,
1023 reg->DstRegister.Index,
1024 chan_index );
1025 break;
1026
1027 default:
1028 assert( 0 );
1029 }
1030
1031 switch( inst->Instruction.Saturate ) {
1032 case TGSI_SAT_NONE:
1033 break;
1034
1035 case TGSI_SAT_ZERO_ONE:
1036 /* assert( 0 ); */
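/* XXX: [0,1] saturation is not implemented yet; results are stored unclamped. */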
1037 break;
1038
1039 case TGSI_SAT_MINUS_PLUS_ONE:
1040 assert( 0 );
1041 break;
1042 }
1043 }
1044
1045 #define STORE( FUNC, INST, XMM, INDEX, CHAN )\
1046 emit_store( FUNC, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN )
1047
1048 /**
1049 * High-level instruction translators.
1050 */
1051
1052 static void
1053 emit_kil(
1054 struct x86_function *func,
1055 const struct tgsi_full_src_register *reg )
1056 {
1057 unsigned uniquemask;
1058 unsigned registers[4];
1059 unsigned nextregister = 0;
1060 unsigned firstchan = ~0;
1061 unsigned chan_index;
1062
1063 /* This mask stores component bits that were already tested. Note that
1064 * we test whether the value is less than zero, so 1.0 and 0.0 need not be
1065 * tested. */
1066 uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE);
1067
1068 FOR_EACH_CHANNEL( chan_index ) {
1069 unsigned swizzle;
1070
1071 /* unswizzle channel */
1072 swizzle = tgsi_util_get_full_src_register_extswizzle(
1073 reg,
1074 chan_index );
1075
1076 /* check if the component has not been already tested */
1077 if( !(uniquemask & (1 << swizzle)) ) {
1078 uniquemask |= 1 << swizzle;
1079
1080 /* allocate register */
1081 registers[chan_index] = nextregister;
1082 emit_fetch(
1083 func,
1084 nextregister,
1085 reg,
1086 chan_index );
1087 nextregister++;
1088
1089 /* mark the first channel used */
1090 if( firstchan == ~0 ) {
1091 firstchan = chan_index;
1092 }
1093 }
1094 }
1095
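/* Compare each fetched channel against zero, gather the resulting masks
* with pmovmskb, OR the per-channel masks together in EAX, and finally OR
* the combined mask into the persistent kill mask.
*/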
1096 x86_push(
1097 func,
1098 x86_make_reg( file_REG32, reg_AX ) );
1099 x86_push(
1100 func,
1101 x86_make_reg( file_REG32, reg_DX ) );
1102
1103 FOR_EACH_CHANNEL( chan_index ) {
1104 if( uniquemask & (1 << chan_index) ) {
1105 sse_cmpps(
1106 func,
1107 make_xmm( registers[chan_index] ),
1108 get_temp(
1109 TGSI_EXEC_TEMP_00000000_I,
1110 TGSI_EXEC_TEMP_00000000_C ),
1111 cc_LessThan );
1112
1113 if( chan_index == firstchan ) {
1114 sse_pmovmskb(
1115 func,
1116 x86_make_reg( file_REG32, reg_AX ),
1117 make_xmm( registers[chan_index] ) );
1118 }
1119 else {
1120 sse_pmovmskb(
1121 func,
1122 x86_make_reg( file_REG32, reg_DX ),
1123 make_xmm( registers[chan_index] ) );
1124 x86_or(
1125 func,
1126 x86_make_reg( file_REG32, reg_AX ),
1127 x86_make_reg( file_REG32, reg_DX ) );
1128 }
1129 }
1130 }
1131
1132 x86_or(
1133 func,
1134 get_temp(
1135 TGSI_EXEC_TEMP_KILMASK_I,
1136 TGSI_EXEC_TEMP_KILMASK_C ),
1137 x86_make_reg( file_REG32, reg_AX ) );
1138
1139 x86_pop(
1140 func,
1141 x86_make_reg( file_REG32, reg_DX ) );
1142 x86_pop(
1143 func,
1144 x86_make_reg( file_REG32, reg_AX ) );
1145 }
1146
1147
1148 static void
1149 emit_kilp(
1150 struct x86_function *func )
1151 {
1152 /* XXX todo / fix me */
1153 }
1154
1155
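/* Shared helper for the SLT/SGE-style comparison opcodes: compare src0
* against src1 per channel, then AND the resulting all-ones/all-zeros mask
* with 1.0f so the destination receives 1.0 or 0.0.
*/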
1156 static void
1157 emit_setcc(
1158 struct x86_function *func,
1159 struct tgsi_full_instruction *inst,
1160 enum sse_cc cc )
1161 {
1162 unsigned chan_index;
1163
1164 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1165 FETCH( func, *inst, 0, 0, chan_index );
1166 FETCH( func, *inst, 1, 1, chan_index );
1167 sse_cmpps(
1168 func,
1169 make_xmm( 0 ),
1170 make_xmm( 1 ),
1171 cc );
1172 sse_andps(
1173 func,
1174 make_xmm( 0 ),
1175 get_temp(
1176 TGSI_EXEC_TEMP_ONE_I,
1177 TGSI_EXEC_TEMP_ONE_C ) );
1178 STORE( func, *inst, 0, 0, chan_index );
1179 }
1180 }
1181
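/* CMP: dst = (src0 < 0.0) ? src1 : src2, per channel. Implemented with a
* compare-against-zero mask followed by an andps/andnps select and an orps
* merge.
*/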
1182 static void
1183 emit_cmp(
1184 struct x86_function *func,
1185 struct tgsi_full_instruction *inst )
1186 {
1187 unsigned chan_index;
1188
1189 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1190 FETCH( func, *inst, 0, 0, chan_index );
1191 FETCH( func, *inst, 1, 1, chan_index );
1192 FETCH( func, *inst, 2, 2, chan_index );
1193 sse_cmpps(
1194 func,
1195 make_xmm( 0 ),
1196 get_temp(
1197 TGSI_EXEC_TEMP_00000000_I,
1198 TGSI_EXEC_TEMP_00000000_C ),
1199 cc_LessThan );
1200 sse_andps(
1201 func,
1202 make_xmm( 1 ),
1203 make_xmm( 0 ) );
1204 sse_andnps(
1205 func,
1206 make_xmm( 0 ),
1207 make_xmm( 2 ) );
1208 sse_orps(
1209 func,
1210 make_xmm( 0 ),
1211 make_xmm( 1 ) );
1212 STORE( func, *inst, 0, 0, chan_index );
1213 }
1214 }
1215
1216 static int
1217 emit_instruction(
1218 struct x86_function *func,
1219 struct tgsi_full_instruction *inst )
1220 {
1221 unsigned chan_index;
1222
1223 switch (inst->Instruction.Opcode) {
1224 case TGSI_OPCODE_ARL:
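/* Convert the selected source channels to integers (truncating via
* cvttps2dq) and store the result in the destination, which for ARL is
* the address register file.
*/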
1225 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1226 FETCH( func, *inst, 0, 0, chan_index );
1227 emit_f2it( func, 0 );
1228 STORE( func, *inst, 0, 0, chan_index );
1229 }
1230 break;
1231
1232 case TGSI_OPCODE_MOV:
1233 case TGSI_OPCODE_SWZ:
1234 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1235 FETCH( func, *inst, 0, 0, chan_index );
1236 STORE( func, *inst, 0, 0, chan_index );
1237 }
1238 break;
1239
1240 case TGSI_OPCODE_LIT:
1241 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1242 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1243 emit_tempf(
1244 func,
1245 0,
1246 TGSI_EXEC_TEMP_ONE_I,
1247 TGSI_EXEC_TEMP_ONE_C);
1248 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
1249 STORE( func, *inst, 0, 0, CHAN_X );
1250 }
1251 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1252 STORE( func, *inst, 0, 0, CHAN_W );
1253 }
1254 }
1255 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1256 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1257 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
1258 FETCH( func, *inst, 0, 0, CHAN_X );
1259 sse_maxps(
1260 func,
1261 make_xmm( 0 ),
1262 get_temp(
1263 TGSI_EXEC_TEMP_00000000_I,
1264 TGSI_EXEC_TEMP_00000000_C ) );
1265 STORE( func, *inst, 0, 0, CHAN_Y );
1266 }
1267 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1268 /* XMM[1] = SrcReg[0].yyyy */
1269 FETCH( func, *inst, 1, 0, CHAN_Y );
1270 /* XMM[1] = max(XMM[1], 0) */
1271 sse_maxps(
1272 func,
1273 make_xmm( 1 ),
1274 get_temp(
1275 TGSI_EXEC_TEMP_00000000_I,
1276 TGSI_EXEC_TEMP_00000000_C ) );
1277 /* XMM[2] = SrcReg[0].wwww */
1278 FETCH( func, *inst, 2, 0, CHAN_W );
1279 /* XMM[2] = min(XMM[2], 128.0) */
1280 sse_minps(
1281 func,
1282 make_xmm( 2 ),
1283 get_temp(
1284 TGSI_EXEC_TEMP_128_I,
1285 TGSI_EXEC_TEMP_128_C ) );
1286 /* XMM[2] = max(XMM[2], -128.0) */
1287 sse_maxps(
1288 func,
1289 make_xmm( 2 ),
1290 get_temp(
1291 TGSI_EXEC_TEMP_MINUS_128_I,
1292 TGSI_EXEC_TEMP_MINUS_128_C ) );
1293 emit_pow( func, 1, 2 );
1294 FETCH( func, *inst, 0, 0, CHAN_X );
1295 sse_xorps(
1296 func,
1297 make_xmm( 2 ),
1298 make_xmm( 2 ) );
1299 sse_cmpps(
1300 func,
1301 make_xmm( 2 ),
1302 make_xmm( 0 ),
1303 cc_LessThanEqual );
1304 sse_andps(
1305 func,
1306 make_xmm( 2 ),
1307 make_xmm( 1 ) );
1308 STORE( func, *inst, 2, 0, CHAN_Z );
1309 }
1310 }
1311 break;
1312
1313 case TGSI_OPCODE_RCP:
1314 /* TGSI_OPCODE_RECIP */
1315 FETCH( func, *inst, 0, 0, CHAN_X );
1316 emit_rcp( func, 0, 0 );
1317 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1318 STORE( func, *inst, 0, 0, chan_index );
1319 }
1320 break;
1321
1322 case TGSI_OPCODE_RSQ:
1323 /* TGSI_OPCODE_RECIPSQRT */
1324 FETCH( func, *inst, 0, 0, CHAN_X );
1325 emit_rsqrt( func, 1, 0 );
1326 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1327 STORE( func, *inst, 1, 0, chan_index );
1328 }
1329 break;
1330
1331 case TGSI_OPCODE_EXP:
1332 return 0;
1333 break;
1334
1335 case TGSI_OPCODE_LOG:
1336 return 0;
1337 break;
1338
1339 case TGSI_OPCODE_MUL:
1340 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1341 FETCH( func, *inst, 0, 0, chan_index );
1342 FETCH( func, *inst, 1, 1, chan_index );
1343 emit_mul( func, 0, 1 );
1344 STORE( func, *inst, 0, 0, chan_index );
1345 }
1346 break;
1347
1348 case TGSI_OPCODE_ADD:
1349 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1350 FETCH( func, *inst, 0, 0, chan_index );
1351 FETCH( func, *inst, 1, 1, chan_index );
1352 emit_add( func, 0, 1 );
1353 STORE( func, *inst, 0, 0, chan_index );
1354 }
1355 break;
1356
1357 case TGSI_OPCODE_DP3:
1358 /* TGSI_OPCODE_DOT3 */
1359 FETCH( func, *inst, 0, 0, CHAN_X );
1360 FETCH( func, *inst, 1, 1, CHAN_X );
1361 emit_mul( func, 0, 1 );
1362 FETCH( func, *inst, 1, 0, CHAN_Y );
1363 FETCH( func, *inst, 2, 1, CHAN_Y );
1364 emit_mul( func, 1, 2 );
1365 emit_add( func, 0, 1 );
1366 FETCH( func, *inst, 1, 0, CHAN_Z );
1367 FETCH( func, *inst, 2, 1, CHAN_Z );
1368 emit_mul( func, 1, 2 );
1369 emit_add( func, 0, 1 );
1370 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1371 STORE( func, *inst, 0, 0, chan_index );
1372 }
1373 break;
1374
1375 case TGSI_OPCODE_DP4:
1376 /* TGSI_OPCODE_DOT4 */
1377 FETCH( func, *inst, 0, 0, CHAN_X );
1378 FETCH( func, *inst, 1, 1, CHAN_X );
1379 emit_mul( func, 0, 1 );
1380 FETCH( func, *inst, 1, 0, CHAN_Y );
1381 FETCH( func, *inst, 2, 1, CHAN_Y );
1382 emit_mul( func, 1, 2 );
1383 emit_add( func, 0, 1 );
1384 FETCH( func, *inst, 1, 0, CHAN_Z );
1385 FETCH( func, *inst, 2, 1, CHAN_Z );
1386 emit_mul( func, 1, 2 );
1387 emit_add( func, 0, 1 );
1388 FETCH( func, *inst, 1, 0, CHAN_W );
1389 FETCH( func, *inst, 2, 1, CHAN_W );
1390 emit_mul( func, 1, 2 );
1391 emit_add( func, 0, 1 );
1392 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1393 STORE( func, *inst, 0, 0, chan_index );
1394 }
1395 break;
1396
1397 case TGSI_OPCODE_DST:
1398 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
1399 emit_tempf(
1400 func,
1401 0,
1402 TGSI_EXEC_TEMP_ONE_I,
1403 TGSI_EXEC_TEMP_ONE_C );
1404 STORE( func, *inst, 0, 0, CHAN_X );
1405 }
1406 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
1407 FETCH( func, *inst, 0, 0, CHAN_Y );
1408 FETCH( func, *inst, 1, 1, CHAN_Y );
1409 emit_mul( func, 0, 1 );
1410 STORE( func, *inst, 0, 0, CHAN_Y );
1411 }
1412 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
1413 FETCH( func, *inst, 0, 0, CHAN_Z );
1414 STORE( func, *inst, 0, 0, CHAN_Z );
1415 }
1416 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
1417 FETCH( func, *inst, 0, 1, CHAN_W );
1418 STORE( func, *inst, 0, 0, CHAN_W );
1419 }
1420 break;
1421
1422 case TGSI_OPCODE_MIN:
1423 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1424 FETCH( func, *inst, 0, 0, chan_index );
1425 FETCH( func, *inst, 1, 1, chan_index );
1426 sse_minps(
1427 func,
1428 make_xmm( 0 ),
1429 make_xmm( 1 ) );
1430 STORE( func, *inst, 0, 0, chan_index );
1431 }
1432 break;
1433
1434 case TGSI_OPCODE_MAX:
1435 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1436 FETCH( func, *inst, 0, 0, chan_index );
1437 FETCH( func, *inst, 1, 1, chan_index );
1438 sse_maxps(
1439 func,
1440 make_xmm( 0 ),
1441 make_xmm( 1 ) );
1442 STORE( func, *inst, 0, 0, chan_index );
1443 }
1444 break;
1445
1446 case TGSI_OPCODE_SLT:
1447 /* TGSI_OPCODE_SETLT */
1448 emit_setcc( func, inst, cc_LessThan );
1449 break;
1450
1451 case TGSI_OPCODE_SGE:
1452 /* TGSI_OPCODE_SETGE */
1453 emit_setcc( func, inst, cc_NotLessThan );
1454 break;
1455
1456 case TGSI_OPCODE_MAD:
1457 /* TGSI_OPCODE_MADD */
1458 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1459 FETCH( func, *inst, 0, 0, chan_index );
1460 FETCH( func, *inst, 1, 1, chan_index );
1461 FETCH( func, *inst, 2, 2, chan_index );
1462 emit_mul( func, 0, 1 );
1463 emit_add( func, 0, 2 );
1464 STORE( func, *inst, 0, 0, chan_index );
1465 }
1466 break;
1467
1468 case TGSI_OPCODE_SUB:
1469 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1470 FETCH( func, *inst, 0, 0, chan_index );
1471 FETCH( func, *inst, 1, 1, chan_index );
1472 emit_sub( func, 0, 1 );
1473 STORE( func, *inst, 0, 0, chan_index );
1474 }
1475 break;
1476
1477 case TGSI_OPCODE_LERP:
1478 /* TGSI_OPCODE_LRP */
1479 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1480 FETCH( func, *inst, 0, 0, chan_index );
1481 FETCH( func, *inst, 1, 1, chan_index );
1482 FETCH( func, *inst, 2, 2, chan_index );
1483 emit_sub( func, 1, 2 );
1484 emit_mul( func, 0, 1 );
1485 emit_add( func, 0, 2 );
1486 STORE( func, *inst, 0, 0, chan_index );
1487 }
1488 break;
1489
1490 case TGSI_OPCODE_CND:
1491 return 0;
1492 break;
1493
1494 case TGSI_OPCODE_CND0:
1495 return 0;
1496 break;
1497
1498 case TGSI_OPCODE_DOT2ADD:
1499 /* TGSI_OPCODE_DP2A */
1500 return 0;
1501 break;
1502
1503 case TGSI_OPCODE_INDEX:
1504 return 0;
1505 break;
1506
1507 case TGSI_OPCODE_NEGATE:
1508 return 0;
1509 break;
1510
1511 case TGSI_OPCODE_FRAC:
1512 /* TGSI_OPCODE_FRC */
1513 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1514 FETCH( func, *inst, 0, 0, chan_index );
1515 emit_frc( func, 0 );
1516 STORE( func, *inst, 0, 0, chan_index );
1517 }
1518 break;
1519
1520 case TGSI_OPCODE_CLAMP:
1521 return 0;
1522 break;
1523
1524 case TGSI_OPCODE_FLOOR:
1525 /* TGSI_OPCODE_FLR */
1526 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1527 FETCH( func, *inst, 0, 0, chan_index );
1528 emit_flr( func, 0 );
1529 STORE( func, *inst, 0, 0, chan_index );
1530 }
1531 break;
1532
1533 case TGSI_OPCODE_ROUND:
1534 return 0;
1535 break;
1536
1537 case TGSI_OPCODE_EXPBASE2:
1538 /* TGSI_OPCODE_EX2 */
1539 FETCH( func, *inst, 0, 0, CHAN_X );
1540 emit_ex2( func, 0 );
1541 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1542 STORE( func, *inst, 0, 0, chan_index );
1543 }
1544 break;
1545
1546 case TGSI_OPCODE_LOGBASE2:
1547 /* TGSI_OPCODE_LG2 */
1548 FETCH( func, *inst, 0, 0, CHAN_X );
1549 emit_lg2( func, 0 );
1550 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1551 STORE( func, *inst, 0, 0, chan_index );
1552 }
1553 break;
1554
1555 case TGSI_OPCODE_POWER:
1556 /* TGSI_OPCODE_POW */
1557 FETCH( func, *inst, 0, 0, CHAN_X );
1558 FETCH( func, *inst, 1, 1, CHAN_X );
1559 emit_pow( func, 0, 1 );
1560 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1561 STORE( func, *inst, 0, 0, chan_index );
1562 }
1563 break;
1564
1565 case TGSI_OPCODE_CROSSPRODUCT:
1566 /* TGSI_OPCODE_XPD */
1567 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1568 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
1569 FETCH( func, *inst, 1, 1, CHAN_Z );
1570 FETCH( func, *inst, 3, 0, CHAN_Z );
1571 }
1572 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1573 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1574 FETCH( func, *inst, 0, 0, CHAN_Y );
1575 FETCH( func, *inst, 4, 1, CHAN_Y );
1576 }
1577 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
1578 emit_MOV( func, 2, 0 );
1579 emit_mul( func, 2, 1 );
1580 emit_MOV( func, 5, 3 );
1581 emit_mul( func, 5, 4 );
1582 emit_sub( func, 2, 5 );
1583 STORE( func, *inst, 2, 0, CHAN_X );
1584 }
1585 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1586 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1587 FETCH( func, *inst, 2, 1, CHAN_X );
1588 FETCH( func, *inst, 5, 0, CHAN_X );
1589 }
1590 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
1591 emit_mul( func, 3, 2 );
1592 emit_mul( func, 1, 5 );
1593 emit_sub( func, 3, 1 );
1594 STORE( func, *inst, 3, 0, CHAN_Y );
1595 }
1596 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
1597 emit_mul( func, 5, 4 );
1598 emit_mul( func, 0, 2 );
1599 emit_sub( func, 5, 0 );
1600 STORE( func, *inst, 5, 0, CHAN_Z );
1601 }
1602 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
1603 emit_tempf(
1604 func,
1605 0,
1606 TGSI_EXEC_TEMP_ONE_I,
1607 TGSI_EXEC_TEMP_ONE_C );
1608 STORE( func, *inst, 0, 0, CHAN_W );
1609 }
1610 break;
1611
1612 case TGSI_OPCODE_MULTIPLYMATRIX:
1613 return 0;
1614 break;
1615
1616 case TGSI_OPCODE_ABS:
1617 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1618 FETCH( func, *inst, 0, 0, chan_index );
1619 emit_abs( func, 0 );
1620
1621 STORE( func, *inst, 0, 0, chan_index );
1622 }
1623 break;
1624
1625 case TGSI_OPCODE_RCC:
1626 return 0;
1627 break;
1628
1629 case TGSI_OPCODE_DPH:
1630 FETCH( func, *inst, 0, 0, CHAN_X );
1631 FETCH( func, *inst, 1, 1, CHAN_X );
1632 emit_mul( func, 0, 1 );
1633 FETCH( func, *inst, 1, 0, CHAN_Y );
1634 FETCH( func, *inst, 2, 1, CHAN_Y );
1635 emit_mul( func, 1, 2 );
1636 emit_add( func, 0, 1 );
1637 FETCH( func, *inst, 1, 0, CHAN_Z );
1638 FETCH( func, *inst, 2, 1, CHAN_Z );
1639 emit_mul( func, 1, 2 );
1640 emit_add( func, 0, 1 );
1641 FETCH( func, *inst, 1, 1, CHAN_W );
1642 emit_add( func, 0, 1 );
1643 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1644 STORE( func, *inst, 0, 0, chan_index );
1645 }
1646 break;
1647
1648 case TGSI_OPCODE_COS:
1649 FETCH( func, *inst, 0, 0, CHAN_X );
1650 emit_cos( func, 0 );
1651 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1652 STORE( func, *inst, 0, 0, chan_index );
1653 }
1654 break;
1655
1656 case TGSI_OPCODE_DDX:
1657 return 0;
1658 break;
1659
1660 case TGSI_OPCODE_DDY:
1661 return 0;
1662 break;
1663
1664 case TGSI_OPCODE_KILP:
1665 /* predicated kill */
1666 emit_kilp( func );
1667 return 0; /* XXX fix me */
1668 break;
1669
1670 case TGSI_OPCODE_KIL:
1671 /* conditional kill */
1672 emit_kil( func, &inst->FullSrcRegisters[0] );
1673 break;
1674
1675 case TGSI_OPCODE_PK2H:
1676 return 0;
1677 break;
1678
1679 case TGSI_OPCODE_PK2US:
1680 return 0;
1681 break;
1682
1683 case TGSI_OPCODE_PK4B:
1684 return 0;
1685 break;
1686
1687 case TGSI_OPCODE_PK4UB:
1688 return 0;
1689 break;
1690
1691 case TGSI_OPCODE_RFL:
1692 return 0;
1693 break;
1694
1695 case TGSI_OPCODE_SEQ:
1696 return 0;
1697 break;
1698
1699 case TGSI_OPCODE_SFL:
1700 return 0;
1701 break;
1702
1703 case TGSI_OPCODE_SGT:
1704 return 0;
1705 break;
1706
1707 case TGSI_OPCODE_SIN:
1708 FETCH( func, *inst, 0, 0, CHAN_X );
1709 emit_sin( func, 0 );
1710 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1711 STORE( func, *inst, 0, 0, chan_index );
1712 }
1713 break;
1714
1715 case TGSI_OPCODE_SLE:
1716 return 0;
1717 break;
1718
1719 case TGSI_OPCODE_SNE:
1720 return 0;
1721 break;
1722
1723 case TGSI_OPCODE_STR:
1724 return 0;
1725 break;
1726
1727 case TGSI_OPCODE_TEX:
1728 if (0) {
1729 /* Disable dummy texture code:
1730 */
1731 emit_tempf(
1732 func,
1733 0,
1734 TGSI_EXEC_TEMP_ONE_I,
1735 TGSI_EXEC_TEMP_ONE_C );
1736 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1737 STORE( func, *inst, 0, 0, chan_index );
1738 }
1739 }
1740 else {
1741 return 0;
1742 }
1743 break;
1744
1745 case TGSI_OPCODE_TXD:
1746 return 0;
1747 break;
1748
1749 case TGSI_OPCODE_UP2H:
1750 return 0;
1751 break;
1752
1753 case TGSI_OPCODE_UP2US:
1754 return 0;
1755 break;
1756
1757 case TGSI_OPCODE_UP4B:
1758 return 0;
1759 break;
1760
1761 case TGSI_OPCODE_UP4UB:
1762 return 0;
1763 break;
1764
1765 case TGSI_OPCODE_X2D:
1766 return 0;
1767 break;
1768
1769 case TGSI_OPCODE_ARA:
1770 return 0;
1771 break;
1772
1773 case TGSI_OPCODE_ARR:
1774 return 0;
1775 break;
1776
1777 case TGSI_OPCODE_BRA:
1778 return 0;
1779 break;
1780
1781 case TGSI_OPCODE_CAL:
1782 return 0;
1783 break;
1784
1785 case TGSI_OPCODE_RET:
1786 emit_ret( func );
1787 break;
1788
1789 case TGSI_OPCODE_END:
1790 break;
1791
1792 case TGSI_OPCODE_SSG:
1793 return 0;
1794 break;
1795
1796 case TGSI_OPCODE_CMP:
1797 emit_cmp (func, inst);
1798 break;
1799
1800 case TGSI_OPCODE_SCS:
1801 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
1802 FETCH( func, *inst, 0, 0, CHAN_X );
1803 emit_cos( func, 0 );
1804 STORE( func, *inst, 0, 0, CHAN_X );
1805 }
1806 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
1807 FETCH( func, *inst, 0, 0, CHAN_X );
1808 emit_sin( func, 0 );
1809 STORE( func, *inst, 0, 0, CHAN_Y );
1810 }
1811 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
1812 emit_tempf(
1813 func,
1814 0,
1815 TGSI_EXEC_TEMP_00000000_I,
1816 TGSI_EXEC_TEMP_00000000_C );
1817 STORE( func, *inst, 0, 0, CHAN_Z );
1818 }
1819 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
1820 emit_tempf(
1821 func,
1822 0,
1823 TGSI_EXEC_TEMP_ONE_I,
1824 TGSI_EXEC_TEMP_ONE_C );
1825 STORE( func, *inst, 0, 0, CHAN_W );
1826 }
1827 break;
1828
1829 case TGSI_OPCODE_TXB:
1830 return 0;
1831 break;
1832
1833 case TGSI_OPCODE_NRM:
1834 return 0;
1835 break;
1836
1837 case TGSI_OPCODE_DIV:
1838 return 0;
1839 break;
1840
1841 case TGSI_OPCODE_DP2:
1842 return 0;
1843 break;
1844
1845 case TGSI_OPCODE_TXL:
1846 return 0;
1847 break;
1848
1849 case TGSI_OPCODE_BRK:
1850 return 0;
1851 break;
1852
1853 case TGSI_OPCODE_IF:
1854 return 0;
1855 break;
1856
1857 case TGSI_OPCODE_LOOP:
1858 return 0;
1859 break;
1860
1861 case TGSI_OPCODE_REP:
1862 return 0;
1863 break;
1864
1865 case TGSI_OPCODE_ELSE:
1866 return 0;
1867 break;
1868
1869 case TGSI_OPCODE_ENDIF:
1870 return 0;
1871 break;
1872
1873 case TGSI_OPCODE_ENDLOOP:
1874 return 0;
1875 break;
1876
1877 case TGSI_OPCODE_ENDREP:
1878 return 0;
1879 break;
1880
1881 case TGSI_OPCODE_PUSHA:
1882 return 0;
1883 break;
1884
1885 case TGSI_OPCODE_POPA:
1886 return 0;
1887 break;
1888
1889 case TGSI_OPCODE_CEIL:
1890 return 0;
1891 break;
1892
1893 case TGSI_OPCODE_I2F:
1894 return 0;
1895 break;
1896
1897 case TGSI_OPCODE_NOT:
1898 return 0;
1899 break;
1900
1901 case TGSI_OPCODE_TRUNC:
1902 return 0;
1903 break;
1904
1905 case TGSI_OPCODE_SHL:
1906 return 0;
1907 break;
1908
1909 case TGSI_OPCODE_SHR:
1910 return 0;
1911 break;
1912
1913 case TGSI_OPCODE_AND:
1914 return 0;
1915 break;
1916
1917 case TGSI_OPCODE_OR:
1918 return 0;
1919 break;
1920
1921 case TGSI_OPCODE_MOD:
1922 return 0;
1923 break;
1924
1925 case TGSI_OPCODE_XOR:
1926 return 0;
1927 break;
1928
1929 case TGSI_OPCODE_SAD:
1930 return 0;
1931 break;
1932
1933 case TGSI_OPCODE_TXF:
1934 return 0;
1935 break;
1936
1937 case TGSI_OPCODE_TXQ:
1938 return 0;
1939 break;
1940
1941 case TGSI_OPCODE_CONT:
1942 return 0;
1943 break;
1944
1945 case TGSI_OPCODE_EMIT:
1946 return 0;
1947 break;
1948
1949 case TGSI_OPCODE_ENDPRIM:
1950 return 0;
1951 break;
1952
1953 default:
1954 return 0;
1955 }
1956
1957 return 1;
1958 }
1959
1960 static void
1961 emit_declaration(
1962 struct x86_function *func,
1963 struct tgsi_full_declaration *decl )
1964 {
1965 if( decl->Declaration.File == TGSI_FILE_INPUT ) {
1966 unsigned first, last, mask;
1967 unsigned i, j;
1968
1969 first = decl->DeclarationRange.First;
1970 last = decl->DeclarationRange.Last;
1971 mask = decl->Declaration.UsageMask;
1972
1973 for( i = first; i <= last; i++ ) {
1974 for( j = 0; j < NUM_CHANNELS; j++ ) {
1975 if( mask & (1 << j) ) {
1976 switch( decl->Declaration.Interpolate ) {
1977 case TGSI_INTERPOLATE_CONSTANT:
1978 emit_coef_a0( func, 0, i, j );
1979 emit_inputs( func, 0, i, j );
1980 break;
1981
1982 case TGSI_INTERPOLATE_LINEAR:
1983 emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
1984 emit_coef_dadx( func, 1, i, j );
1985 emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
1986 emit_coef_dady( func, 3, i, j );
1987 emit_mul( func, 0, 1 ); /* x * dadx */
1988 emit_coef_a0( func, 4, i, j );
1989 emit_mul( func, 2, 3 ); /* y * dady */
1990 emit_add( func, 0, 4 ); /* x * dadx + a0 */
1991 emit_add( func, 0, 2 ); /* x * dadx + y * dady + a0 */
1992 emit_inputs( func, 0, i, j );
1993 break;
1994
1995 case TGSI_INTERPOLATE_PERSPECTIVE:
1996 emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
1997 emit_coef_dadx( func, 1, i, j );
1998 emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
1999 emit_coef_dady( func, 3, i, j );
2000 emit_mul( func, 0, 1 ); /* x * dadx */
2001 emit_tempf( func, 4, 0, TGSI_SWIZZLE_W );
2002 emit_coef_a0( func, 5, i, j );
2003 emit_rcp( func, 4, 4 ); /* 1.0 / w */
2004 emit_mul( func, 2, 3 ); /* y * dady */
2005 emit_add( func, 0, 5 ); /* x * dadx + a0 */
2006 emit_add( func, 0, 2 ); /* x * dadx + y * dady + a0 */
2007 emit_mul( func, 0, 4 ); /* (x * dadx + y * dady + a0) / w */
2008 emit_inputs( func, 0, i, j );
2009 break;
2010
2011 default:
2012 assert( 0 );
2013 break;
2014 }
2015 }
2016 }
2017 }
2018 }
2019 }
2020
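/* Reorder vertex data from AoS layout ((x,y,z,w) per vertex, 'stride' bytes
* apart) into the SoA quads the generated code expects (xxxx yyyy zzzz wwww),
* one input attribute per loop iteration, using a movlps/movhps/shufps 4x4
* transpose.
*/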
2021 static void aos_to_soa( struct x86_function *func,
2022 uint arg_aos,
2023 uint arg_soa,
2024 uint arg_num,
2025 uint arg_stride )
2026 {
2027 struct x86_reg soa_input = x86_make_reg( file_REG32, reg_AX );
2028 struct x86_reg aos_input = x86_make_reg( file_REG32, reg_BX );
2029 struct x86_reg num_inputs = x86_make_reg( file_REG32, reg_CX );
2030 struct x86_reg stride = x86_make_reg( file_REG32, reg_DX );
2031 int inner_loop;
2032
2033
2034 /* Save EBX */
2035 x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
2036
2037 x86_mov( func, aos_input, x86_fn_arg( func, arg_aos ) );
2038 x86_mov( func, soa_input, x86_fn_arg( func, arg_soa ) );
2039 x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) );
2040 x86_mov( func, stride, x86_fn_arg( func, arg_stride ) );
2041
2042 /* do */
2043 inner_loop = x86_get_label( func );
2044 {
2045 x86_push( func, aos_input );
2046 sse_movlps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
2047 sse_movlps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
2048 x86_add( func, aos_input, stride );
2049 sse_movhps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
2050 sse_movhps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
2051 x86_add( func, aos_input, stride );
2052 sse_movlps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
2053 sse_movlps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
2054 x86_add( func, aos_input, stride );
2055 sse_movhps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
2056 sse_movhps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
2057 x86_pop( func, aos_input );
2058
2059 sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
2060 sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
2061 sse_shufps( func, make_xmm( 0 ), make_xmm( 1 ), 0x88 );
2062 sse_shufps( func, make_xmm( 2 ), make_xmm( 1 ), 0xdd );
2063 sse_shufps( func, make_xmm( 3 ), make_xmm( 4 ), 0x88 );
2064 sse_shufps( func, make_xmm( 5 ), make_xmm( 4 ), 0xdd );
2065
2066 sse_movups( func, x86_make_disp( soa_input, 0 ), make_xmm( 0 ) );
2067 sse_movups( func, x86_make_disp( soa_input, 16 ), make_xmm( 2 ) );
2068 sse_movups( func, x86_make_disp( soa_input, 32 ), make_xmm( 3 ) );
2069 sse_movups( func, x86_make_disp( soa_input, 48 ), make_xmm( 5 ) );
2070
2071 /* Advance to next input */
2072 x86_lea( func, aos_input, x86_make_disp(aos_input, 16) );
2073 x86_lea( func, soa_input, x86_make_disp(soa_input, 64) );
2074 }
2075 /* while --num_inputs */
2076 x86_dec( func, num_inputs );
2077 x86_jcc( func, cc_NE, inner_loop );
2078
2079 /* Restore EBX */
2080 x86_pop( func, aos_input );
2081 }
2082
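/* Inverse of aos_to_soa(): transpose the SoA result quads back into AoS
* vertex layout with unpcklps/unpckhps and write them out at the given
* stride.
*/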
2083 static void soa_to_aos( struct x86_function *func, uint aos, uint soa, uint num, uint stride )
2084 {
2085 struct x86_reg soa_output;
2086 struct x86_reg aos_output;
2087 struct x86_reg num_outputs;
2088 struct x86_reg temp;
2089 int inner_loop;
2090
2091 soa_output = x86_make_reg( file_REG32, reg_AX );
2092 aos_output = x86_make_reg( file_REG32, reg_BX );
2093 num_outputs = x86_make_reg( file_REG32, reg_CX );
2094 temp = x86_make_reg( file_REG32, reg_DX );
2095
2096 /* Save EBX */
2097 x86_push( func, aos_output );
2098
2099 x86_mov( func, soa_output, x86_fn_arg( func, soa ) );
2100 x86_mov( func, aos_output, x86_fn_arg( func, aos ) );
2101 x86_mov( func, num_outputs, x86_fn_arg( func, num ) );
2102
2103 /* do */
2104 inner_loop = x86_get_label( func );
2105 {
2106 sse_movups( func, make_xmm( 0 ), x86_make_disp( soa_output, 0 ) );
2107 sse_movups( func, make_xmm( 1 ), x86_make_disp( soa_output, 16 ) );
2108 sse_movups( func, make_xmm( 3 ), x86_make_disp( soa_output, 32 ) );
2109 sse_movups( func, make_xmm( 4 ), x86_make_disp( soa_output, 48 ) );
2110
2111 sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
2112 sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
2113 sse_unpcklps( func, make_xmm( 0 ), make_xmm( 1 ) );
2114 sse_unpckhps( func, make_xmm( 2 ), make_xmm( 1 ) );
2115 sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) );
2116 sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) );
2117
2118 x86_mov( func, temp, x86_fn_arg( func, stride ) );
2119 x86_push( func, aos_output );
2120 sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
2121 sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
2122 x86_add( func, aos_output, temp );
2123 sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
2124 sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
2125 x86_add( func, aos_output, temp );
2126 sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
2127 sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
2128 x86_add( func, aos_output, temp );
2129 sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
2130 sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
2131 x86_pop( func, aos_output );
2132
2133 /* Advance to next output */
2134 x86_lea( func, aos_output, x86_make_disp(aos_output, 16) );
2135 x86_lea( func, soa_output, x86_make_disp(soa_output, 64) );
2136 }
2137 /* while --num_outputs */
2138 x86_dec( func, num_outputs );
2139 x86_jcc( func, cc_NE, inner_loop );
2140
2141 /* Restore EBX */
2142 x86_pop( func, aos_output );
2143 }
2144
2145 /**
2146 * Translate a TGSI vertex/fragment shader to SSE2 code.
2147 * Slightly different things are done for vertex vs. fragment shaders.
2148 *
2149 * Note that fragment shaders are responsible for interpolating shader
2150 * inputs. Because on x86 only 4 general-purpose registers are available to
2151 * hold these pointers, and here we have 5 shader arguments (input, output,
2152 * const, temp and coef), the code is split into two phases -- DECLARATION and
2153 * INSTRUCTION. The GP register holding the output argument is aliased with
2154 * the coef argument, as outputs are not needed in the DECLARATION phase.
2155 *
2156 * \param tokens the TGSI input shader
2157 * \param func the output SSE code/function
2158 * \param immediates buffer to place immediates, later passed to SSE func
2159 * \return 1 for success, 0 if translation failed
2160 */
2161 unsigned
2162 tgsi_emit_sse2(
2163 const struct tgsi_token *tokens,
2164 struct x86_function *func,
2165 float (*immediates)[4],
2166 boolean do_swizzles )
2167 {
2168 struct tgsi_parse_context parse;
2169 boolean instruction_phase = FALSE;
2170 unsigned ok = 1;
2171 uint num_immediates = 0;
2172
2173 func->csr = func->store;
2174
2175 tgsi_parse_init( &parse, tokens );
2176
2177 /* Can't just use EDI, EBX without saving/restoring them:
2178 */
2179 x86_push(
2180 func,
2181 get_immediate_base() );
2182
2183 x86_push(
2184 func,
2185 get_temp_base() );
2186
2187
2188 /*
2189 * Different function args for vertex/fragment shaders:
2190 */
2191 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
2192 /* DECLARATION phase, do not load output argument. */
2193 x86_mov(
2194 func,
2195 get_input_base(),
2196 x86_fn_arg( func, 1 ) );
2197 /* skipping outputs argument here */
2198 x86_mov(
2199 func,
2200 get_const_base(),
2201 x86_fn_arg( func, 3 ) );
2202 x86_mov(
2203 func,
2204 get_temp_base(),
2205 x86_fn_arg( func, 4 ) );
2206 x86_mov(
2207 func,
2208 get_coef_base(),
2209 x86_fn_arg( func, 5 ) );
2210 x86_mov(
2211 func,
2212 get_immediate_base(),
2213 x86_fn_arg( func, 6 ) );
2214 }
2215 else {
2216 assert(parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX);
2217
2218 if (do_swizzles)
2219 aos_to_soa( func,
2220 6, /* aos_input */
2221 1, /* machine->input */
2222 7, /* num_inputs */
2223 8 ); /* input_stride */
2224
2225 x86_mov(
2226 func,
2227 get_input_base(),
2228 x86_fn_arg( func, 1 ) );
2229 x86_mov(
2230 func,
2231 get_output_base(),
2232 x86_fn_arg( func, 2 ) );
2233 x86_mov(
2234 func,
2235 get_const_base(),
2236 x86_fn_arg( func, 3 ) );
2237 x86_mov(
2238 func,
2239 get_temp_base(),
2240 x86_fn_arg( func, 4 ) );
2241 x86_mov(
2242 func,
2243 get_immediate_base(),
2244 x86_fn_arg( func, 5 ) );
2245 }
2246
2247 while( !tgsi_parse_end_of_tokens( &parse ) && ok ) {
2248 tgsi_parse_token( &parse );
2249
2250 switch( parse.FullToken.Token.Type ) {
2251 case TGSI_TOKEN_TYPE_DECLARATION:
2252 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
2253 emit_declaration(
2254 func,
2255 &parse.FullToken.FullDeclaration );
2256 }
2257 break;
2258
2259 case TGSI_TOKEN_TYPE_INSTRUCTION:
2260 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
2261 if( !instruction_phase ) {
2262 /* INSTRUCTION phase, overwrite coeff with output. */
2263 instruction_phase = TRUE;
2264 x86_mov(
2265 func,
2266 get_output_base(),
2267 x86_fn_arg( func, 2 ) );
2268 }
2269 }
2270
2271 ok = emit_instruction(
2272 func,
2273 &parse.FullToken.FullInstruction );
2274
2275 if (!ok) {
2276 debug_printf("failed to translate tgsi opcode %d to SSE (%s)\n",
2277 parse.FullToken.FullInstruction.Instruction.Opcode,
2278 parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX ?
2279 "vertex shader" : "fragment shader");
2280 }
2281 break;
2282
2283 case TGSI_TOKEN_TYPE_IMMEDIATE:
2284 /* simply copy the immediate values into the next immediates[] slot */
2285 {
2286 const uint size = parse.FullToken.FullImmediate.Immediate.Size - 1;
2287 uint i;
2288 assert(size <= 4);
2289 assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES);
2290 for( i = 0; i < size; i++ ) {
2291 immediates[num_immediates][i] =
2292 parse.FullToken.FullImmediate.u.ImmediateFloat32[i].Float;
2293 }
2294 #if 0
2295 debug_printf("SSE FS immediate[%d] = %f %f %f %f\n",
2296 num_immediates,
2297 immediates[num_immediates][0],
2298 immediates[num_immediates][1],
2299 immediates[num_immediates][2],
2300 immediates[num_immediates][3]);
2301 #endif
2302 num_immediates++;
2303 }
2304 break;
2305
2306 default:
2307 ok = 0;
2308 assert( 0 );
2309 }
2310 }
2311
2312 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
2313 if (do_swizzles)
2314 soa_to_aos( func, 9, 2, 10, 11 );
2315 }
2316
2317 /* Can't just use EBX, EDI without saving/restoring them:
2318 */
2319 x86_pop(
2320 func,
2321 get_temp_base() );
2322
2323 x86_pop(
2324 func,
2325 get_immediate_base() );
2326
2327 emit_ret( func );
2328
2329 tgsi_parse_free( &parse );
2330
2331 return ok;
2332 }
2333
2334 #endif /* PIPE_ARCH_X86 */