tgsi: reduce x86 reg usage in tgsi_sse generated programs
[mesa.git] / src / gallium / auxiliary / tgsi / tgsi_sse2.c
1 /**************************************************************************
2 *
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28 #include "pipe/p_config.h"
29
30 #if defined(PIPE_ARCH_X86)
31
32 #include "util/u_debug.h"
33 #include "pipe/p_shader_tokens.h"
34 #include "util/u_math.h"
35 #include "util/u_memory.h"
36 #if defined(PIPE_ARCH_SSE)
37 #include "util/u_sse.h"
38 #endif
39 #include "tgsi/tgsi_parse.h"
40 #include "tgsi/tgsi_util.h"
41 #include "tgsi_exec.h"
42 #include "tgsi_sse2.h"
43
44 #include "rtasm/rtasm_x86sse.h"
45
46 /* for 1/sqrt()
47 *
48 * This costs about 100fps (close to 10%) in gears:
49 */
50 #define HIGH_PRECISION 1
51
52 #define FAST_MATH 1
53
54
/* Iterate CHAN over the four channels (X, Y, Z, W). */
#define FOR_EACH_CHANNEL( CHAN )\
   for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)

/* Test bit CHAN of the first destination register's writemask. */
#define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
   ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))

#define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
   if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))

/* Iterate CHAN over only those channels enabled in DST0's writemask. */
#define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
   FOR_EACH_CHANNEL( CHAN )\
   IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )

/* Channel indices within a 4-component register. */
#define CHAN_X 0
#define CHAN_Y 1
#define CHAN_Z 2
#define CHAN_W 3

/* Short aliases for reserved tgsi_exec temporary-register slots. */
#define TEMP_ONE_I TGSI_EXEC_TEMP_ONE_I
#define TEMP_ONE_C TGSI_EXEC_TEMP_ONE_C

#define TEMP_R0 TGSI_EXEC_TEMP_R0
#define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
#define TEMP_EXEC_MASK_I TGSI_EXEC_MASK_I
#define TEMP_EXEC_MASK_C TGSI_EXEC_MASK_C
81
82 /**
83 * X86 utility functions.
84 */
85
86 static struct x86_reg
87 make_xmm(
88 unsigned xmm )
89 {
90 return x86_make_reg(
91 file_XMM,
92 (enum x86_reg_name) xmm );
93 }
94
95 /**
96 * X86 register mapping helpers.
97 */
98
99 static struct x86_reg
100 get_const_base( void )
101 {
102 return x86_make_reg(
103 file_REG32,
104 reg_CX );
105 }
106
107 static struct x86_reg
108 get_machine_base( void )
109 {
110 return x86_make_reg(
111 file_REG32,
112 reg_AX );
113 }
114
115 static struct x86_reg
116 get_input_base( void )
117 {
118 return x86_make_disp(
119 get_machine_base(),
120 Offset(struct tgsi_exec_machine, Inputs) );
121 }
122
123 static struct x86_reg
124 get_output_base( void )
125 {
126 return x86_make_disp(
127 get_machine_base(),
128 Offset(struct tgsi_exec_machine, Outputs) );
129 }
130
131 static struct x86_reg
132 get_temp_base( void )
133 {
134 return x86_make_disp(
135 get_machine_base(),
136 Offset(struct tgsi_exec_machine, Temps) );
137 }
138
139 static struct x86_reg
140 get_coef_base( void )
141 {
142 return x86_make_reg(
143 file_REG32,
144 reg_BX );
145 }
146
147 static struct x86_reg
148 get_immediate_base( void )
149 {
150 return x86_make_reg(
151 file_REG32,
152 reg_DX );
153 }
154
155
156 /**
157 * Data access helpers.
158 */
159
160
161 static struct x86_reg
162 get_immediate(
163 unsigned vec,
164 unsigned chan )
165 {
166 return x86_make_disp(
167 get_immediate_base(),
168 (vec * 4 + chan) * 4 );
169 }
170
171 static struct x86_reg
172 get_const(
173 unsigned vec,
174 unsigned chan )
175 {
176 return x86_make_disp(
177 get_const_base(),
178 (vec * 4 + chan) * 4 );
179 }
180
181 static struct x86_reg
182 get_input(
183 unsigned vec,
184 unsigned chan )
185 {
186 return x86_make_disp(
187 get_input_base(),
188 (vec * 4 + chan) * 16 );
189 }
190
191 static struct x86_reg
192 get_output(
193 unsigned vec,
194 unsigned chan )
195 {
196 return x86_make_disp(
197 get_output_base(),
198 (vec * 4 + chan) * 16 );
199 }
200
201 static struct x86_reg
202 get_temp(
203 unsigned vec,
204 unsigned chan )
205 {
206 return x86_make_disp(
207 get_temp_base(),
208 (vec * 4 + chan) * 16 );
209 }
210
211 static struct x86_reg
212 get_coef(
213 unsigned vec,
214 unsigned chan,
215 unsigned member )
216 {
217 return x86_make_disp(
218 get_coef_base(),
219 ((vec * 3 + member) * 4 + chan) * 4 );
220 }
221
222
/* Emit a return instruction, ending the generated function. */
static void
emit_ret(
   struct x86_function *func )
{
   x86_ret( func );
}
229
230
231 /**
232 * Data fetch helpers.
233 */
234
/**
 * Copy a shader constant to xmm register
 * \param xmm the destination xmm register
 * \param vec the src const buffer index
 * \param chan src channel to fetch (X, Y, Z or W)
 * \param indirect non-zero when loading CONST[ADDR+vec] instead of CONST[vec]
 * \param indirectFile register file of the indirect index (must be ADDRESS)
 * \param indirectIndex index within indirectFile (must be 0)
 */
static void
emit_const(
   struct x86_function *func,
   uint xmm,
   int vec,
   uint chan,
   uint indirect,
   uint indirectFile,
   int indirectIndex )
{
   if (indirect) {
      /* 'vec' is the offset from the address register's value.
       * We're loading CONST[ADDR+vec] into an xmm register.
       */
      /* NOTE(review): r0/r1 reuse the registers returned by
       * get_input_base()/get_output_base() as scratch; they are
       * pushed/popped around the loop below so callers are unaffected. */
      struct x86_reg r0 = get_input_base();
      struct x86_reg r1 = get_output_base();
      uint i;

      assert( indirectFile == TGSI_FILE_ADDRESS );
      assert( indirectIndex == 0 );

      x86_push( func, r0 );
      x86_push( func, r1 );

      /*
       * Loop over the four pixels or vertices in the quad.
       * Get the value of the address (offset) register for pixel/vertex[i],
       * add it to the src offset and index into the constant buffer.
       * Note that we're working on SOA data.
       * If any of the pixel/vertex execution channels are unused their
       * values will be garbage. It's very important that we don't use
       * those garbage values as indexes into the constant buffer since
       * that'll cause segfaults.
       * The solution is to bitwise-AND the offset with the execution mask
       * register whose values are either 0 or ~0.
       * The caller must setup the execution mask register to indicate
       * which channels are valid/alive before running the shader.
       * The execution mask will also figure into loops and conditionals
       * someday.
       */
      for (i = 0; i < QUAD_SIZE; i++) {
         /* r1 = address register[i] */
         x86_mov( func, r1, x86_make_disp( get_temp( TEMP_ADDR, CHAN_X ), i * 4 ) );
         /* r0 = execution mask[i] */
         x86_mov( func, r0, x86_make_disp( get_temp( TEMP_EXEC_MASK_I, TEMP_EXEC_MASK_C ), i * 4 ) );
         /* r1 = r1 & r0 */
         x86_and( func, r1, r0 );
         /* r0 = 'vec', the offset */
         x86_lea( func, r0, get_const( vec, chan ) );

         /* Quick hack to multiply r1 by 16 -- need to add SHL to rtasm.
          */
         x86_add( func, r1, r1 );
         x86_add( func, r1, r1 );
         x86_add( func, r1, r1 );
         x86_add( func, r1, r1 );

         x86_add( func, r0, r1 );  /* r0 = r0 + r1 */
         x86_mov( func, r1, x86_deref( r0 ) );
         /* Stash the gathered value in scratch temp TEMP_R0[i]. */
         x86_mov( func, x86_make_disp( get_temp( TEMP_R0, CHAN_X ), i * 4 ), r1 );
      }

      x86_pop( func, r1 );
      x86_pop( func, r0 );

      /* Load all four gathered values as one aligned vector. */
      sse_movaps(
         func,
         make_xmm( xmm ),
         get_temp( TEMP_R0, CHAN_X ) );
   }
   else {
      /* 'vec' is the index into the src register file, such as TEMP[vec] */
      assert( vec >= 0 );

      /* Load the scalar constant, then splat it to all four SOA slots. */
      sse_movss(
         func,
         make_xmm( xmm ),
         get_const( vec, chan ) );
      sse_shufps(
         func,
         make_xmm( xmm ),
         make_xmm( xmm ),
         SHUF( 0, 0, 0, 0 ) );
   }
}
326
327 static void
328 emit_immediate(
329 struct x86_function *func,
330 unsigned xmm,
331 unsigned vec,
332 unsigned chan )
333 {
334 sse_movss(
335 func,
336 make_xmm( xmm ),
337 get_immediate( vec, chan ) );
338 sse_shufps(
339 func,
340 make_xmm( xmm ),
341 make_xmm( xmm ),
342 SHUF( 0, 0, 0, 0 ) );
343 }
344
345
/**
 * Copy a shader input to xmm register
 * \param xmm the destination xmm register
 * \param vec the src input attrib
 * \param chan src channel to fetch (X, Y, Z or W)
 */
static void
emit_inputf(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   sse_movups( func, make_xmm( xmm ), get_input( vec, chan ) );
}
364
/**
 * Store an xmm register to a shader output
 * \param xmm the source xmm register
 * \param vec the dest output attrib
 * \param chan dest channel to store (X, Y, Z or W)
 */
static void
emit_output(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   sse_movups( func, get_output( vec, chan ), make_xmm( xmm ) );
}
383
/**
 * Copy a shader temporary to xmm register
 * \param xmm the destination xmm register
 * \param vec the src temp register
 * \param chan src channel to fetch (X, Y, Z or W)
 */
static void
emit_tempf(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   sse_movaps( func, make_xmm( xmm ), get_temp( vec, chan ) );
}
402
403 /**
404 * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
405 * \param xmm the destination xmm register
406 * \param vec the src input/attribute coefficient index
407 * \param chan src channel to fetch (X, Y, Z or W)
408 * \param member 0=a0, 1=dadx, 2=dady
409 */
410 static void
411 emit_coef(
412 struct x86_function *func,
413 unsigned xmm,
414 unsigned vec,
415 unsigned chan,
416 unsigned member )
417 {
418 sse_movss(
419 func,
420 make_xmm( xmm ),
421 get_coef( vec, chan, member ) );
422 sse_shufps(
423 func,
424 make_xmm( xmm ),
425 make_xmm( xmm ),
426 SHUF( 0, 0, 0, 0 ) );
427 }
428
429 /**
430 * Data store helpers.
431 */
432
/* Store an xmm register back to shader input 'vec', channel 'chan'. */
static void
emit_inputs(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   sse_movups( func, get_input( vec, chan ), make_xmm( xmm ) );
}
445
/* Store an xmm register back to shader temp 'vec', channel 'chan'. */
static void
emit_temps(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   sse_movaps( func, get_temp( vec, chan ), make_xmm( xmm ) );
}
458
459 static void
460 emit_addrs(
461 struct x86_function *func,
462 unsigned xmm,
463 unsigned vec,
464 unsigned chan )
465 {
466 assert( vec == 0 );
467
468 emit_temps(
469 func,
470 xmm,
471 vec + TGSI_EXEC_TEMP_ADDR,
472 chan );
473 }
474
475 /**
476 * Coefficent fetch helpers.
477 */
478
/* Fetch the a0 (plane constant) coefficient of attrib 'vec'. */
static void
emit_coef_a0(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   emit_coef( func, xmm, vec, chan, 0 );
}
493
/* Fetch the dadx (x-derivative) coefficient of attrib 'vec'. */
static void
emit_coef_dadx(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   emit_coef( func, xmm, vec, chan, 1 );
}
508
/* Fetch the dady (y-derivative) coefficient of attrib 'vec'. */
static void
emit_coef_dady(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   emit_coef( func, xmm, vec, chan, 2 );
}
523
524 /**
525 * Function call helpers.
526 */
527
/**
 * Emit a call from generated code to the C helper 'code'.
 *
 * The helper receives a single argument: a pointer to the TEMP_R0
 * scratch area, which callers use to pass operands and receive results.
 *
 * NOTE: In gcc, if the destination uses the SSE intrinsics, then it must be
 * defined with __attribute__((force_align_arg_pointer)), as we do not guarantee
 * that the stack pointer is 16 byte aligned, as expected.
 *
 * \param xmm_save number of live xmm registers to preserve across the call
 * \param xmm_dst  xmm register receiving the result (excluded from the save set)
 */
static void
emit_func_call(
   struct x86_function *func,
   unsigned xmm_save,
   unsigned xmm_dst,
   void (PIPE_CDECL *code)() )
{
   struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
   unsigned i, n;
   unsigned xmm_mask;

   /* Bitmask of the xmm registers to save */
   xmm_mask = (1 << xmm_save) - 1;
   xmm_mask &= ~(1 << xmm_dst);

   /* Preserve the GP registers the generated code keeps live
    * (EAX/ECX/EDX -- machine, const and immediate base pointers). */
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_AX) );
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_CX) );
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_DX) );

   /* Store XMM regs to the stack
    */
   /* First count how many slots we need. */
   for(i = 0, n = 0; i < 8; ++i)
      if(xmm_mask & (1 << i))
         ++n;

   x86_sub_imm(
      func,
      x86_make_reg( file_REG32, reg_SP ),
      n*16);

   /* Spill each saved xmm register into its 16-byte slot. */
   for(i = 0, n = 0; i < 8; ++i)
      if(xmm_mask & (1 << i)) {
         sse_movups(
            func,
            x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ),
            make_xmm( i ) );
         ++n;
      }

   /* Load the address of the buffer we use for passing arguments and
    * receiving results:
    */
   x86_lea(
      func,
      ecx,
      get_temp( TEMP_R0, 0 ) );

   /* Push actual function arguments (currently just the pointer to
    * the buffer above), and call the function:
    */
   x86_push( func, ecx );
   x86_mov_reg_imm( func, ecx, (unsigned long) code );
   x86_call( func, ecx );
   x86_pop(func, ecx );


   /* Pop the saved XMM regs:
    */
   for(i = 0, n = 0; i < 8; ++i)
      if(xmm_mask & (1 << i)) {
         sse_movups(
            func,
            make_xmm( i ),
            x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ) );
         ++n;
      }

   /* Release the spill area ('n' again equals the slot count here). */
   x86_add_imm(
      func,
      x86_make_reg( file_REG32, reg_SP ),
      n*16);

   /* Restore GP registers in a reverse order.
    */
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_DX) );
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_CX) );
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_AX) );
}
623
624
625 static void
626 emit_func_call_dst_src1(
627 struct x86_function *func,
628 unsigned xmm_save,
629 unsigned xmm_dst,
630 unsigned xmm_src0,
631 void (PIPE_CDECL *code)() )
632 {
633 /* Store our input parameters (in xmm regs) to the buffer we use
634 * for passing arguments. We will pass a pointer to this buffer as
635 * the actual function argument.
636 */
637 sse_movaps(
638 func,
639 get_temp( TEMP_R0, 0 ),
640 make_xmm( xmm_src0 ) );
641
642 emit_func_call(
643 func,
644 xmm_save,
645 xmm_dst,
646 code );
647
648 sse_movaps(
649 func,
650 make_xmm( xmm_dst ),
651 get_temp( TEMP_R0, 0 ) );
652 }
653
654
655 static void
656 emit_func_call_dst_src2(
657 struct x86_function *func,
658 unsigned xmm_save,
659 unsigned xmm_dst,
660 unsigned xmm_src0,
661 unsigned xmm_src1,
662 void (PIPE_CDECL *code)() )
663 {
664 /* Store two inputs to parameter buffer.
665 */
666 sse_movaps(
667 func,
668 get_temp( TEMP_R0, 0 ),
669 make_xmm( xmm_src0 ) );
670
671 sse_movaps(
672 func,
673 get_temp( TEMP_R0, 1 ),
674 make_xmm( xmm_src1 ) );
675
676
677 /* Emit the call
678 */
679 emit_func_call(
680 func,
681 xmm_save,
682 xmm_dst,
683 code );
684
685 /* Retrieve the results:
686 */
687 sse_movaps(
688 func,
689 make_xmm( xmm_dst ),
690 get_temp( TEMP_R0, 0 ) );
691 }
692
693
694
695
696
697 #if defined(PIPE_ARCH_SSE)
698
699 /*
700 * Fast SSE2 implementation of special math functions.
701 */
702
703 #define POLY0(x, c0) _mm_set1_ps(c0)
704 #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
705 #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
706 #define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
707 #define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
708 #define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
709
710 #define EXP_POLY_DEGREE 3
711 #define LOG_POLY_DEGREE 5
712
/**
 * Fast SIMD approximation of 2^x for four floats at once.
 *
 * Splits x into integer and fractional parts; the integer part is
 * applied by constructing the float exponent bits directly, the
 * fractional part by a minimax polynomial (degree EXP_POLY_DEGREE).
 *
 * See http://www.devmaster.net/forums/showthread.php?p=43580
 */
static INLINE __m128
exp2f4(__m128 x)
{
   __m128i ipart;
   __m128 fpart, expipart, expfpart;

   /* Clamp x to the representable exponent range of a float. */
   x = _mm_min_ps(x, _mm_set1_ps( 129.00000f));
   x = _mm_max_ps(x, _mm_set1_ps(-126.99999f));

   /* ipart = int(x - 0.5) */
   ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f)));

   /* fpart = x - ipart */
   fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart));

   /* expipart = (float) (1 << ipart) */
   expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23));

   /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
#if EXP_POLY_DEGREE == 5
   expfpart = POLY5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f);
#elif EXP_POLY_DEGREE == 4
   expfpart = POLY4(fpart, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 5.2011464e-2f, 1.3534167e-2f);
#elif EXP_POLY_DEGREE == 3
   expfpart = POLY3(fpart, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 7.8024521e-2f);
#elif EXP_POLY_DEGREE == 2
   expfpart = POLY2(fpart, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f);
#else
#error
#endif

   return _mm_mul_ps(expipart, expfpart);
}
749
750
/**
 * Fast SIMD approximation of log2(x) for four floats at once.
 *
 * Extracts the float exponent directly from the bit pattern and
 * approximates log2 of the mantissa with a minimax polynomial
 * (degree LOG_POLY_DEGREE).
 *
 * See http://www.devmaster.net/forums/showthread.php?p=43580
 */
static INLINE __m128
log2f4(__m128 x)
{
   __m128i expmask = _mm_set1_epi32(0x7f800000);
   __m128i mantmask = _mm_set1_epi32(0x007fffff);
   __m128 one = _mm_set1_ps(1.0f);

   __m128i i = _mm_castps_si128(x);

   /* exp = (float) exponent(x) */
   __m128 exp = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, expmask), 23), _mm_set1_epi32(127)));

   /* mant = (float) mantissa(x), normalized into [1, 2[ */
   __m128 mant = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, mantmask)), one);

   __m128 logmant;

   /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
    * These coefficients can be generate with
    * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
    */
#if LOG_POLY_DEGREE == 6
   logmant = POLY5(mant, 3.11578814719469302614f, -3.32419399085241980044f, 2.59883907202499966007f, -1.23152682416275988241f, 0.318212422185251071475f, -0.0344359067839062357313f);
#elif LOG_POLY_DEGREE == 5
   logmant = POLY4(mant, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
   logmant = POLY3(mant, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
   logmant = POLY2(mant, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
#else
#error
#endif

   /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
   logmant = _mm_mul_ps(logmant, _mm_sub_ps(mant, one));

   return _mm_add_ps(logmant, exp);
}
792
793
794 static INLINE __m128
795 powf4(__m128 x, __m128 y)
796 {
797 return exp2f4(_mm_mul_ps(log2f4(x), y));
798 }
799
800 #endif /* PIPE_ARCH_SSE */
801
802
803
804 /**
805 * Low-level instruction translators.
806 */
807
808 static void
809 emit_abs(
810 struct x86_function *func,
811 unsigned xmm )
812 {
813 sse_andps(
814 func,
815 make_xmm( xmm ),
816 get_temp(
817 TGSI_EXEC_TEMP_7FFFFFFF_I,
818 TGSI_EXEC_TEMP_7FFFFFFF_C ) );
819 }
820
/* xmm_dst += xmm_src (four packed floats). */
static void
emit_add(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   sse_addps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
832
833 static void PIPE_CDECL
834 cos4f(
835 float *store )
836 {
837 store[0] = cosf( store[0] );
838 store[1] = cosf( store[1] );
839 store[2] = cosf( store[2] );
840 store[3] = cosf( store[3] );
841 }
842
843 static void
844 emit_cos(
845 struct x86_function *func,
846 unsigned xmm_save,
847 unsigned xmm_dst )
848 {
849 emit_func_call_dst_src1(
850 func,
851 xmm_save,
852 xmm_dst,
853 xmm_dst,
854 cos4f );
855 }
856
857 static void PIPE_CDECL
858 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
859 __attribute__((force_align_arg_pointer))
860 #endif
861 ex24f(
862 float *store )
863 {
864 #if defined(PIPE_ARCH_SSE)
865 _mm_store_ps(&store[0], exp2f4( _mm_load_ps(&store[0]) ));
866 #else
867 store[0] = util_fast_exp2( store[0] );
868 store[1] = util_fast_exp2( store[1] );
869 store[2] = util_fast_exp2( store[2] );
870 store[3] = util_fast_exp2( store[3] );
871 #endif
872 }
873
874 static void
875 emit_ex2(
876 struct x86_function *func,
877 unsigned xmm_save,
878 unsigned xmm_dst )
879 {
880 emit_func_call_dst_src1(
881 func,
882 xmm_save,
883 xmm_dst,
884 xmm_dst,
885 ex24f );
886 }
887
888 static void
889 emit_f2it(
890 struct x86_function *func,
891 unsigned xmm )
892 {
893 sse2_cvttps2dq(
894 func,
895 make_xmm( xmm ),
896 make_xmm( xmm ) );
897 }
898
899 static void
900 emit_i2f(
901 struct x86_function *func,
902 unsigned xmm )
903 {
904 sse2_cvtdq2ps(
905 func,
906 make_xmm( xmm ),
907 make_xmm( xmm ) );
908 }
909
910 static void PIPE_CDECL
911 flr4f(
912 float *store )
913 {
914 store[0] = floorf( store[0] );
915 store[1] = floorf( store[1] );
916 store[2] = floorf( store[2] );
917 store[3] = floorf( store[3] );
918 }
919
920 static void
921 emit_flr(
922 struct x86_function *func,
923 unsigned xmm_save,
924 unsigned xmm_dst )
925 {
926 emit_func_call_dst_src1(
927 func,
928 xmm_save,
929 xmm_dst,
930 xmm_dst,
931 flr4f );
932 }
933
934 static void PIPE_CDECL
935 frc4f(
936 float *store )
937 {
938 store[0] -= floorf( store[0] );
939 store[1] -= floorf( store[1] );
940 store[2] -= floorf( store[2] );
941 store[3] -= floorf( store[3] );
942 }
943
944 static void
945 emit_frc(
946 struct x86_function *func,
947 unsigned xmm_save,
948 unsigned xmm_dst )
949 {
950 emit_func_call_dst_src1(
951 func,
952 xmm_save,
953 xmm_dst,
954 xmm_dst,
955 frc4f );
956 }
957
958 static void PIPE_CDECL
959 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
960 __attribute__((force_align_arg_pointer))
961 #endif
962 lg24f(
963 float *store )
964 {
965 #if defined(PIPE_ARCH_SSE)
966 _mm_store_ps(&store[0], log2f4( _mm_load_ps(&store[0]) ));
967 #else
968 store[0] = util_fast_log2( store[0] );
969 store[1] = util_fast_log2( store[1] );
970 store[2] = util_fast_log2( store[2] );
971 store[3] = util_fast_log2( store[3] );
972 #endif
973 }
974
975 static void
976 emit_lg2(
977 struct x86_function *func,
978 unsigned xmm_save,
979 unsigned xmm_dst )
980 {
981 emit_func_call_dst_src1(
982 func,
983 xmm_save,
984 xmm_dst,
985 xmm_dst,
986 lg24f );
987 }
988
/* Register-to-register move of four packed floats. */
static void
emit_MOV(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   sse_movups( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
1000
/* xmm_dst *= xmm_src (four packed floats). */
static void
emit_mul (struct x86_function *func,
          unsigned xmm_dst,
          unsigned xmm_src)
{
   sse_mulps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
1011
1012 static void
1013 emit_neg(
1014 struct x86_function *func,
1015 unsigned xmm )
1016 {
1017 sse_xorps(
1018 func,
1019 make_xmm( xmm ),
1020 get_temp(
1021 TGSI_EXEC_TEMP_80000000_I,
1022 TGSI_EXEC_TEMP_80000000_C ) );
1023 }
1024
1025 static void PIPE_CDECL
1026 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
1027 __attribute__((force_align_arg_pointer))
1028 #endif
1029 pow4f(
1030 float *store )
1031 {
1032 #if defined(PIPE_ARCH_SSE)
1033 _mm_store_ps(&store[0], powf4( _mm_load_ps(&store[0]), _mm_load_ps(&store[4]) ));
1034 #else
1035 store[0] = util_fast_pow( store[0], store[4] );
1036 store[1] = util_fast_pow( store[1], store[5] );
1037 store[2] = util_fast_pow( store[2], store[6] );
1038 store[3] = util_fast_pow( store[3], store[7] );
1039 #endif
1040 }
1041
1042 static void
1043 emit_pow(
1044 struct x86_function *func,
1045 unsigned xmm_save,
1046 unsigned xmm_dst,
1047 unsigned xmm_src0,
1048 unsigned xmm_src1 )
1049 {
1050 emit_func_call_dst_src2(
1051 func,
1052 xmm_save,
1053 xmm_dst,
1054 xmm_src0,
1055 xmm_src1,
1056 pow4f );
1057 }
1058
/* Reciprocal via rcpps.
 *
 * On Intel CPUs at least, this is only accurate to 12 bits -- not
 * good enough.  Need to either emit a proper divide or use the
 * iterative technique described below in emit_rsqrt().
 */
static void
emit_rcp (
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   sse2_rcpps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
1074
1075 static void PIPE_CDECL
1076 rnd4f(
1077 float *store )
1078 {
1079 store[0] = floorf( store[0] + 0.5f );
1080 store[1] = floorf( store[1] + 0.5f );
1081 store[2] = floorf( store[2] + 0.5f );
1082 store[3] = floorf( store[3] + 0.5f );
1083 }
1084
1085 static void
1086 emit_rnd(
1087 struct x86_function *func,
1088 unsigned xmm_save,
1089 unsigned xmm_dst )
1090 {
1091 emit_func_call_dst_src1(
1092 func,
1093 xmm_save,
1094 xmm_dst,
1095 xmm_dst,
1096 rnd4f );
1097 }
1098
/**
 * Reciprocal square root.
 *
 * NOTE(review): in the HIGH_PRECISION path this clobbers xmm2, xmm3
 * AND the source register xmm_src (see the asserts below) -- callers
 * must not rely on any of them afterwards.
 */
static void
emit_rsqrt(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
#if HIGH_PRECISION
   /* Although rsqrtps() and rcpps() are low precision on some/all SSE
    * implementations, it is possible to improve its precision at
    * fairly low cost, using a newton/raphson step, as below:
    *
    * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
    * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
    *
    * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
    */
   {
      struct x86_reg dst = make_xmm( xmm_dst );
      struct x86_reg src = make_xmm( xmm_src );
      struct x86_reg tmp0 = make_xmm( 2 );
      struct x86_reg tmp1 = make_xmm( 3 );

      /* The fixed scratch registers must not alias dst or src. */
      assert( xmm_dst != xmm_src );
      assert( xmm_dst != 2 && xmm_dst != 3 );
      assert( xmm_src != 2 && xmm_src != 3 );

      sse_movaps( func, dst, get_temp( TGSI_EXEC_TEMP_HALF_I, TGSI_EXEC_TEMP_HALF_C ) );
      sse_movaps( func, tmp0, get_temp( TGSI_EXEC_TEMP_THREE_I, TGSI_EXEC_TEMP_THREE_C ) );
      sse_rsqrtps( func, tmp1, src );                /* tmp1 = rsqrtps(a)            */
      sse_mulps( func, src, tmp1 );                  /* src  = a * tmp1              */
      sse_mulps( func, dst, tmp1 );                  /* dst  = 0.5 * tmp1            */
      sse_mulps( func, src, tmp1 );                  /* src  = a * tmp1 * tmp1       */
      sse_subps( func, tmp0, src );                  /* tmp0 = 3.0 - src             */
      sse_mulps( func, dst, tmp0 );                  /* dst  = 0.5 * tmp1 * tmp0     */
   }
#else
   /* On Intel CPUs at least, this is only accurate to 12 bits -- not
    * good enough.
    */
   sse_rsqrtps(
      func,
      make_xmm( xmm_dst ),
      make_xmm( xmm_src ) );
#endif
}
1144
1145 static void
1146 emit_setsign(
1147 struct x86_function *func,
1148 unsigned xmm )
1149 {
1150 sse_orps(
1151 func,
1152 make_xmm( xmm ),
1153 get_temp(
1154 TGSI_EXEC_TEMP_80000000_I,
1155 TGSI_EXEC_TEMP_80000000_C ) );
1156 }
1157
1158 static void PIPE_CDECL
1159 sgn4f(
1160 float *store )
1161 {
1162 store[0] = store[0] < 0.0f ? -1.0f : store[0] > 0.0f ? 1.0f : 0.0f;
1163 store[1] = store[1] < 0.0f ? -1.0f : store[1] > 0.0f ? 1.0f : 0.0f;
1164 store[2] = store[2] < 0.0f ? -1.0f : store[2] > 0.0f ? 1.0f : 0.0f;
1165 store[3] = store[3] < 0.0f ? -1.0f : store[3] > 0.0f ? 1.0f : 0.0f;
1166 }
1167
1168 static void
1169 emit_sgn(
1170 struct x86_function *func,
1171 unsigned xmm_save,
1172 unsigned xmm_dst )
1173 {
1174 emit_func_call_dst_src1(
1175 func,
1176 xmm_save,
1177 xmm_dst,
1178 xmm_dst,
1179 sgn4f );
1180 }
1181
1182 static void PIPE_CDECL
1183 sin4f(
1184 float *store )
1185 {
1186 store[0] = sinf( store[0] );
1187 store[1] = sinf( store[1] );
1188 store[2] = sinf( store[2] );
1189 store[3] = sinf( store[3] );
1190 }
1191
1192 static void
1193 emit_sin (struct x86_function *func,
1194 unsigned xmm_save,
1195 unsigned xmm_dst)
1196 {
1197 emit_func_call_dst_src1(
1198 func,
1199 xmm_save,
1200 xmm_dst,
1201 xmm_dst,
1202 sin4f );
1203 }
1204
/* xmm_dst -= xmm_src (four packed floats). */
static void
emit_sub(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   sse_subps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
1216
1217 /**
1218 * Register fetch.
1219 */
1220
/**
 * Emit code to fetch one channel of a source operand into xmm register
 * 'xmm', honoring the operand's (extended) swizzle and sign mode.
 *
 * \param reg         the full source register description
 * \param chan_index  which destination channel is being computed;
 *                    the swizzle maps it to the actual source channel
 */
static void
emit_fetch(
   struct x86_function *func,
   unsigned xmm,
   const struct tgsi_full_src_register *reg,
   const unsigned chan_index )
{
   unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index );

   switch (swizzle) {
   case TGSI_EXTSWIZZLE_X:
   case TGSI_EXTSWIZZLE_Y:
   case TGSI_EXTSWIZZLE_Z:
   case TGSI_EXTSWIZZLE_W:
      /* Ordinary component: dispatch on the source register file. */
      switch (reg->SrcRegister.File) {
      case TGSI_FILE_CONSTANT:
         emit_const(
            func,
            xmm,
            reg->SrcRegister.Index,
            swizzle,
            reg->SrcRegister.Indirect,
            reg->SrcRegisterInd.File,
            reg->SrcRegisterInd.Index );
         break;

      case TGSI_FILE_IMMEDIATE:
         emit_immediate(
            func,
            xmm,
            reg->SrcRegister.Index,
            swizzle );
         break;

      case TGSI_FILE_INPUT:
         emit_inputf(
            func,
            xmm,
            reg->SrcRegister.Index,
            swizzle );
         break;

      case TGSI_FILE_TEMPORARY:
         emit_tempf(
            func,
            xmm,
            reg->SrcRegister.Index,
            swizzle );
         break;

      default:
         assert( 0 );
      }
      break;

   case TGSI_EXTSWIZZLE_ZERO:
      /* Constant 0.0 comes from a pre-initialized temp slot. */
      emit_tempf(
         func,
         xmm,
         TGSI_EXEC_TEMP_00000000_I,
         TGSI_EXEC_TEMP_00000000_C );
      break;

   case TGSI_EXTSWIZZLE_ONE:
      /* Constant 1.0 comes from a pre-initialized temp slot. */
      emit_tempf(
         func,
         xmm,
         TEMP_ONE_I,
         TEMP_ONE_C );
      break;

   default:
      assert( 0 );
   }

   /* Apply the operand's sign mode to the fetched value. */
   switch( tgsi_util_get_full_src_register_sign_mode( reg, chan_index ) ) {
   case TGSI_UTIL_SIGN_CLEAR:
      emit_abs( func, xmm );
      break;

   case TGSI_UTIL_SIGN_SET:
      emit_setsign( func, xmm );
      break;

   case TGSI_UTIL_SIGN_TOGGLE:
      emit_neg( func, xmm );
      break;

   case TGSI_UTIL_SIGN_KEEP:
      break;
   }
}
1313
/* Fetch channel CHAN of source operand INDEX of INST into xmm XMM. */
#define FETCH( FUNC, INST, XMM, INDEX, CHAN )\
   emit_fetch( FUNC, XMM, &(INST).FullSrcRegisters[INDEX], CHAN )
1316
1317 /**
1318 * Register store.
1319 */
1320
/**
 * Emit code to store xmm register 'xmm' into one channel of a
 * destination operand (output, temporary or address register).
 *
 * NOTE(review): the saturate switch below runs after the value has
 * already been stored and emits no clamping code -- TGSI_SAT_ZERO_ONE
 * is silently ignored (its assert is commented out) and
 * TGSI_SAT_MINUS_PLUS_ONE asserts.  Saturation is effectively
 * unimplemented in this backend.
 */
static void
emit_store(
   struct x86_function *func,
   unsigned xmm,
   const struct tgsi_full_dst_register *reg,
   const struct tgsi_full_instruction *inst,
   unsigned chan_index )
{
   switch( reg->DstRegister.File ) {
   case TGSI_FILE_OUTPUT:
      emit_output(
         func,
         xmm,
         reg->DstRegister.Index,
         chan_index );
      break;

   case TGSI_FILE_TEMPORARY:
      emit_temps(
         func,
         xmm,
         reg->DstRegister.Index,
         chan_index );
      break;

   case TGSI_FILE_ADDRESS:
      emit_addrs(
         func,
         xmm,
         reg->DstRegister.Index,
         chan_index );
      break;

   default:
      assert( 0 );
   }

   switch( inst->Instruction.Saturate ) {
   case TGSI_SAT_NONE:
      break;

   case TGSI_SAT_ZERO_ONE:
      /* assert( 0 ); */
      break;

   case TGSI_SAT_MINUS_PLUS_ONE:
      assert( 0 );
      break;
   }
}
1371
/* Store xmm XMM into channel CHAN of destination operand INDEX of INST. */
#define STORE( FUNC, INST, XMM, INDEX, CHAN )\
   emit_store( FUNC, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN )
1374
1375 /**
1376 * High-level instruction translators.
1377 */
1378
/**
 * Emit code for TGSI_OPCODE_KIL (conditional kill).
 *
 * Fetches each distinct source component, compares it against zero, and
 * ORs the resulting per-pixel sign masks into the exec machine's kill
 * mask temp (TGSI_EXEC_TEMP_KILMASK).  EAX/EDX are used as scratch and
 * preserved with push/pop around the mask accumulation.
 *
 * NOTE(review): the first loop fills registers[] indexed by *channel*,
 * while the second loop walks set bits of uniquemask, which are indexed
 * by *swizzle component*.  For a swizzle like .yyyy these two indexings
 * disagree (bit 1 set, but registers[1] never written) — verify against
 * callers / extswizzle semantics; TODO confirm.
 */
static void
emit_kil(
   struct x86_function *func,
   const struct tgsi_full_src_register *reg )
{
   unsigned uniquemask;
   unsigned registers[4];   /* xmm reg allocated for each tested channel */
   unsigned nextregister = 0;
   unsigned firstchan = ~0; /* ~0 = "no channel tested yet" sentinel */
   unsigned chan_index;

   /* This mask stores component bits that were already tested. Note that
    * we test if the value is less than zero, so 1.0 and 0.0 need not to be
    * tested. */
   uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE);

   FOR_EACH_CHANNEL( chan_index ) {
      unsigned swizzle;

      /* unswizzle channel */
      swizzle = tgsi_util_get_full_src_register_extswizzle(
         reg,
         chan_index );

      /* check if the component has not been already tested */
      if( !(uniquemask & (1 << swizzle)) ) {
         uniquemask |= 1 << swizzle;

         /* allocate register */
         registers[chan_index] = nextregister;
         emit_fetch(
            func,
            nextregister,
            reg,
            chan_index );
         nextregister++;

         /* mark the first channel used */
         if( firstchan == ~0 ) {
            firstchan = chan_index;
         }
      }
   }

   /* Preserve scratch GPRs used for mask accumulation. */
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_AX ) );
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_DX ) );

   FOR_EACH_CHANNEL( chan_index ) {
      if( uniquemask & (1 << chan_index) ) {
         /* xmm = all-ones lanes where value < 0 (pixels to kill) */
         sse_cmpps(
            func,
            make_xmm( registers[chan_index] ),
            get_temp(
               TGSI_EXEC_TEMP_00000000_I,
               TGSI_EXEC_TEMP_00000000_C ),
            cc_LessThan );

         if( chan_index == firstchan ) {
            /* first mask initialises EAX ... */
            sse_pmovmskb(
               func,
               x86_make_reg( file_REG32, reg_AX ),
               make_xmm( registers[chan_index] ) );
         }
         else {
            /* ... subsequent masks are ORed in via EDX */
            sse_pmovmskb(
               func,
               x86_make_reg( file_REG32, reg_DX ),
               make_xmm( registers[chan_index] ) );
            x86_or(
               func,
               x86_make_reg( file_REG32, reg_AX ),
               x86_make_reg( file_REG32, reg_DX ) );
         }
      }
   }

   /* Accumulate into the machine's persistent kill mask. */
   x86_or(
      func,
      get_temp(
         TGSI_EXEC_TEMP_KILMASK_I,
         TGSI_EXEC_TEMP_KILMASK_C ),
      x86_make_reg( file_REG32, reg_AX ) );

   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_DX ) );
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_AX ) );
}
1473
1474
/**
 * Emit code for TGSI_OPCODE_KILP (predicated kill).
 * Not implemented: emits nothing; the KILP case in emit_instruction()
 * returns 0 afterwards so the program falls back to the interpreter.
 */
static void
emit_kilp(
   struct x86_function *func )
{
   /* XXX todo / fix me */
}
1481
1482
1483 static void
1484 emit_setcc(
1485 struct x86_function *func,
1486 struct tgsi_full_instruction *inst,
1487 enum sse_cc cc )
1488 {
1489 unsigned chan_index;
1490
1491 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1492 FETCH( func, *inst, 0, 0, chan_index );
1493 FETCH( func, *inst, 1, 1, chan_index );
1494 sse_cmpps(
1495 func,
1496 make_xmm( 0 ),
1497 make_xmm( 1 ),
1498 cc );
1499 sse_andps(
1500 func,
1501 make_xmm( 0 ),
1502 get_temp(
1503 TEMP_ONE_I,
1504 TEMP_ONE_C ) );
1505 STORE( func, *inst, 0, 0, chan_index );
1506 }
1507 }
1508
1509 static void
1510 emit_cmp(
1511 struct x86_function *func,
1512 struct tgsi_full_instruction *inst )
1513 {
1514 unsigned chan_index;
1515
1516 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1517 FETCH( func, *inst, 0, 0, chan_index );
1518 FETCH( func, *inst, 1, 1, chan_index );
1519 FETCH( func, *inst, 2, 2, chan_index );
1520 sse_cmpps(
1521 func,
1522 make_xmm( 0 ),
1523 get_temp(
1524 TGSI_EXEC_TEMP_00000000_I,
1525 TGSI_EXEC_TEMP_00000000_C ),
1526 cc_LessThan );
1527 sse_andps(
1528 func,
1529 make_xmm( 1 ),
1530 make_xmm( 0 ) );
1531 sse_andnps(
1532 func,
1533 make_xmm( 0 ),
1534 make_xmm( 2 ) );
1535 sse_orps(
1536 func,
1537 make_xmm( 0 ),
1538 make_xmm( 1 ) );
1539 STORE( func, *inst, 0, 0, chan_index );
1540 }
1541 }
1542
1543
1544 /**
1545 * Check if inst src/dest regs use indirect addressing into temporary
1546 * register file.
1547 */
1548 static boolean
1549 indirect_temp_reference(const struct tgsi_full_instruction *inst)
1550 {
1551 uint i;
1552 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1553 const struct tgsi_full_src_register *reg = &inst->FullSrcRegisters[i];
1554 if (reg->SrcRegister.File == TGSI_FILE_TEMPORARY &&
1555 reg->SrcRegister.Indirect)
1556 return TRUE;
1557 }
1558 for (i = 0; i < inst->Instruction.NumDstRegs; i++) {
1559 const struct tgsi_full_dst_register *reg = &inst->FullDstRegisters[i];
1560 if (reg->DstRegister.File == TGSI_FILE_TEMPORARY &&
1561 reg->DstRegister.Indirect)
1562 return TRUE;
1563 }
1564 return FALSE;
1565 }
1566
1567
1568 static int
1569 emit_instruction(
1570 struct x86_function *func,
1571 struct tgsi_full_instruction *inst )
1572 {
1573 unsigned chan_index;
1574
1575 /* we can't handle indirect addressing into temp register file yet */
1576 if (indirect_temp_reference(inst))
1577 return FALSE;
1578
1579 switch (inst->Instruction.Opcode) {
1580 case TGSI_OPCODE_ARL:
1581 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1582 FETCH( func, *inst, 0, 0, chan_index );
1583 emit_flr(func, 0, 0);
1584 emit_f2it( func, 0 );
1585 STORE( func, *inst, 0, 0, chan_index );
1586 }
1587 break;
1588
1589 case TGSI_OPCODE_MOV:
1590 case TGSI_OPCODE_SWZ:
1591 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1592 FETCH( func, *inst, 0, 0, chan_index );
1593 STORE( func, *inst, 0, 0, chan_index );
1594 }
1595 break;
1596
1597 case TGSI_OPCODE_LIT:
1598 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1599 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1600 emit_tempf(
1601 func,
1602 0,
1603 TEMP_ONE_I,
1604 TEMP_ONE_C);
1605 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
1606 STORE( func, *inst, 0, 0, CHAN_X );
1607 }
1608 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1609 STORE( func, *inst, 0, 0, CHAN_W );
1610 }
1611 }
1612 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1613 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1614 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
1615 FETCH( func, *inst, 0, 0, CHAN_X );
1616 sse_maxps(
1617 func,
1618 make_xmm( 0 ),
1619 get_temp(
1620 TGSI_EXEC_TEMP_00000000_I,
1621 TGSI_EXEC_TEMP_00000000_C ) );
1622 STORE( func, *inst, 0, 0, CHAN_Y );
1623 }
1624 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1625 /* XMM[1] = SrcReg[0].yyyy */
1626 FETCH( func, *inst, 1, 0, CHAN_Y );
1627 /* XMM[1] = max(XMM[1], 0) */
1628 sse_maxps(
1629 func,
1630 make_xmm( 1 ),
1631 get_temp(
1632 TGSI_EXEC_TEMP_00000000_I,
1633 TGSI_EXEC_TEMP_00000000_C ) );
1634 /* XMM[2] = SrcReg[0].wwww */
1635 FETCH( func, *inst, 2, 0, CHAN_W );
1636 /* XMM[2] = min(XMM[2], 128.0) */
1637 sse_minps(
1638 func,
1639 make_xmm( 2 ),
1640 get_temp(
1641 TGSI_EXEC_TEMP_128_I,
1642 TGSI_EXEC_TEMP_128_C ) );
1643 /* XMM[2] = max(XMM[2], -128.0) */
1644 sse_maxps(
1645 func,
1646 make_xmm( 2 ),
1647 get_temp(
1648 TGSI_EXEC_TEMP_MINUS_128_I,
1649 TGSI_EXEC_TEMP_MINUS_128_C ) );
1650 emit_pow( func, 3, 1, 1, 2 );
1651 FETCH( func, *inst, 0, 0, CHAN_X );
1652 sse_xorps(
1653 func,
1654 make_xmm( 2 ),
1655 make_xmm( 2 ) );
1656 sse_cmpps(
1657 func,
1658 make_xmm( 2 ),
1659 make_xmm( 0 ),
1660 cc_LessThan );
1661 sse_andps(
1662 func,
1663 make_xmm( 2 ),
1664 make_xmm( 1 ) );
1665 STORE( func, *inst, 2, 0, CHAN_Z );
1666 }
1667 }
1668 break;
1669
1670 case TGSI_OPCODE_RCP:
1671 /* TGSI_OPCODE_RECIP */
1672 FETCH( func, *inst, 0, 0, CHAN_X );
1673 emit_rcp( func, 0, 0 );
1674 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1675 STORE( func, *inst, 0, 0, chan_index );
1676 }
1677 break;
1678
1679 case TGSI_OPCODE_RSQ:
1680 /* TGSI_OPCODE_RECIPSQRT */
1681 FETCH( func, *inst, 0, 0, CHAN_X );
1682 emit_abs( func, 0 );
1683 emit_rsqrt( func, 1, 0 );
1684 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1685 STORE( func, *inst, 1, 0, chan_index );
1686 }
1687 break;
1688
1689 case TGSI_OPCODE_EXP:
1690 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1691 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1692 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1693 FETCH( func, *inst, 0, 0, CHAN_X );
1694 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1695 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1696 emit_MOV( func, 1, 0 );
1697 emit_flr( func, 2, 1 );
1698 /* dst.x = ex2(floor(src.x)) */
1699 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
1700 emit_MOV( func, 2, 1 );
1701 emit_ex2( func, 3, 2 );
1702 STORE( func, *inst, 2, 0, CHAN_X );
1703 }
1704 /* dst.y = src.x - floor(src.x) */
1705 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1706 emit_MOV( func, 2, 0 );
1707 emit_sub( func, 2, 1 );
1708 STORE( func, *inst, 2, 0, CHAN_Y );
1709 }
1710 }
1711 /* dst.z = ex2(src.x) */
1712 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1713 emit_ex2( func, 3, 0 );
1714 STORE( func, *inst, 0, 0, CHAN_Z );
1715 }
1716 }
1717 /* dst.w = 1.0 */
1718 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
1719 emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
1720 STORE( func, *inst, 0, 0, CHAN_W );
1721 }
1722 break;
1723
1724 case TGSI_OPCODE_LOG:
1725 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1726 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1727 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1728 FETCH( func, *inst, 0, 0, CHAN_X );
1729 emit_abs( func, 0 );
1730 emit_MOV( func, 1, 0 );
1731 emit_lg2( func, 2, 1 );
1732 /* dst.z = lg2(abs(src.x)) */
1733 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1734 STORE( func, *inst, 1, 0, CHAN_Z );
1735 }
1736 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1737 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1738 emit_flr( func, 2, 1 );
1739 /* dst.x = floor(lg2(abs(src.x))) */
1740 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
1741 STORE( func, *inst, 1, 0, CHAN_X );
1742 }
1743 /* dst.x = abs(src)/ex2(floor(lg2(abs(src.x)))) */
1744 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1745 emit_ex2( func, 2, 1 );
1746 emit_rcp( func, 1, 1 );
1747 emit_mul( func, 0, 1 );
1748 STORE( func, *inst, 0, 0, CHAN_Y );
1749 }
1750 }
1751 }
1752 /* dst.w = 1.0 */
1753 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
1754 emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
1755 STORE( func, *inst, 0, 0, CHAN_W );
1756 }
1757 break;
1758
1759 case TGSI_OPCODE_MUL:
1760 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1761 FETCH( func, *inst, 0, 0, chan_index );
1762 FETCH( func, *inst, 1, 1, chan_index );
1763 emit_mul( func, 0, 1 );
1764 STORE( func, *inst, 0, 0, chan_index );
1765 }
1766 break;
1767
1768 case TGSI_OPCODE_ADD:
1769 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1770 FETCH( func, *inst, 0, 0, chan_index );
1771 FETCH( func, *inst, 1, 1, chan_index );
1772 emit_add( func, 0, 1 );
1773 STORE( func, *inst, 0, 0, chan_index );
1774 }
1775 break;
1776
1777 case TGSI_OPCODE_DP3:
1778 /* TGSI_OPCODE_DOT3 */
1779 FETCH( func, *inst, 0, 0, CHAN_X );
1780 FETCH( func, *inst, 1, 1, CHAN_X );
1781 emit_mul( func, 0, 1 );
1782 FETCH( func, *inst, 1, 0, CHAN_Y );
1783 FETCH( func, *inst, 2, 1, CHAN_Y );
1784 emit_mul( func, 1, 2 );
1785 emit_add( func, 0, 1 );
1786 FETCH( func, *inst, 1, 0, CHAN_Z );
1787 FETCH( func, *inst, 2, 1, CHAN_Z );
1788 emit_mul( func, 1, 2 );
1789 emit_add( func, 0, 1 );
1790 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1791 STORE( func, *inst, 0, 0, chan_index );
1792 }
1793 break;
1794
1795 case TGSI_OPCODE_DP4:
1796 /* TGSI_OPCODE_DOT4 */
1797 FETCH( func, *inst, 0, 0, CHAN_X );
1798 FETCH( func, *inst, 1, 1, CHAN_X );
1799 emit_mul( func, 0, 1 );
1800 FETCH( func, *inst, 1, 0, CHAN_Y );
1801 FETCH( func, *inst, 2, 1, CHAN_Y );
1802 emit_mul( func, 1, 2 );
1803 emit_add( func, 0, 1 );
1804 FETCH( func, *inst, 1, 0, CHAN_Z );
1805 FETCH( func, *inst, 2, 1, CHAN_Z );
1806 emit_mul(func, 1, 2 );
1807 emit_add(func, 0, 1 );
1808 FETCH( func, *inst, 1, 0, CHAN_W );
1809 FETCH( func, *inst, 2, 1, CHAN_W );
1810 emit_mul( func, 1, 2 );
1811 emit_add( func, 0, 1 );
1812 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1813 STORE( func, *inst, 0, 0, chan_index );
1814 }
1815 break;
1816
1817 case TGSI_OPCODE_DST:
1818 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
1819 emit_tempf(
1820 func,
1821 0,
1822 TEMP_ONE_I,
1823 TEMP_ONE_C );
1824 STORE( func, *inst, 0, 0, CHAN_X );
1825 }
1826 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
1827 FETCH( func, *inst, 0, 0, CHAN_Y );
1828 FETCH( func, *inst, 1, 1, CHAN_Y );
1829 emit_mul( func, 0, 1 );
1830 STORE( func, *inst, 0, 0, CHAN_Y );
1831 }
1832 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
1833 FETCH( func, *inst, 0, 0, CHAN_Z );
1834 STORE( func, *inst, 0, 0, CHAN_Z );
1835 }
1836 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
1837 FETCH( func, *inst, 0, 1, CHAN_W );
1838 STORE( func, *inst, 0, 0, CHAN_W );
1839 }
1840 break;
1841
1842 case TGSI_OPCODE_MIN:
1843 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1844 FETCH( func, *inst, 0, 0, chan_index );
1845 FETCH( func, *inst, 1, 1, chan_index );
1846 sse_minps(
1847 func,
1848 make_xmm( 0 ),
1849 make_xmm( 1 ) );
1850 STORE( func, *inst, 0, 0, chan_index );
1851 }
1852 break;
1853
1854 case TGSI_OPCODE_MAX:
1855 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1856 FETCH( func, *inst, 0, 0, chan_index );
1857 FETCH( func, *inst, 1, 1, chan_index );
1858 sse_maxps(
1859 func,
1860 make_xmm( 0 ),
1861 make_xmm( 1 ) );
1862 STORE( func, *inst, 0, 0, chan_index );
1863 }
1864 break;
1865
1866 case TGSI_OPCODE_SLT:
1867 /* TGSI_OPCODE_SETLT */
1868 emit_setcc( func, inst, cc_LessThan );
1869 break;
1870
1871 case TGSI_OPCODE_SGE:
1872 /* TGSI_OPCODE_SETGE */
1873 emit_setcc( func, inst, cc_NotLessThan );
1874 break;
1875
1876 case TGSI_OPCODE_MAD:
1877 /* TGSI_OPCODE_MADD */
1878 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1879 FETCH( func, *inst, 0, 0, chan_index );
1880 FETCH( func, *inst, 1, 1, chan_index );
1881 FETCH( func, *inst, 2, 2, chan_index );
1882 emit_mul( func, 0, 1 );
1883 emit_add( func, 0, 2 );
1884 STORE( func, *inst, 0, 0, chan_index );
1885 }
1886 break;
1887
1888 case TGSI_OPCODE_SUB:
1889 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1890 FETCH( func, *inst, 0, 0, chan_index );
1891 FETCH( func, *inst, 1, 1, chan_index );
1892 emit_sub( func, 0, 1 );
1893 STORE( func, *inst, 0, 0, chan_index );
1894 }
1895 break;
1896
1897 case TGSI_OPCODE_LERP:
1898 /* TGSI_OPCODE_LRP */
1899 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1900 FETCH( func, *inst, 0, 0, chan_index );
1901 FETCH( func, *inst, 1, 1, chan_index );
1902 FETCH( func, *inst, 2, 2, chan_index );
1903 emit_sub( func, 1, 2 );
1904 emit_mul( func, 0, 1 );
1905 emit_add( func, 0, 2 );
1906 STORE( func, *inst, 0, 0, chan_index );
1907 }
1908 break;
1909
1910 case TGSI_OPCODE_CND:
1911 return 0;
1912 break;
1913
1914 case TGSI_OPCODE_CND0:
1915 return 0;
1916 break;
1917
1918 case TGSI_OPCODE_DOT2ADD:
1919 /* TGSI_OPCODE_DP2A */
1920 FETCH( func, *inst, 0, 0, CHAN_X ); /* xmm0 = src[0].x */
1921 FETCH( func, *inst, 1, 1, CHAN_X ); /* xmm1 = src[1].x */
1922 emit_mul( func, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
1923 FETCH( func, *inst, 1, 0, CHAN_Y ); /* xmm1 = src[0].y */
1924 FETCH( func, *inst, 2, 1, CHAN_Y ); /* xmm2 = src[1].y */
1925 emit_mul( func, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
1926 emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
1927 FETCH( func, *inst, 1, 2, CHAN_X ); /* xmm1 = src[2].x */
1928 emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
1929 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1930 STORE( func, *inst, 0, 0, chan_index ); /* dest[ch] = xmm0 */
1931 }
1932 break;
1933
1934 case TGSI_OPCODE_INDEX:
1935 return 0;
1936 break;
1937
1938 case TGSI_OPCODE_NEGATE:
1939 return 0;
1940 break;
1941
1942 case TGSI_OPCODE_FRAC:
1943 /* TGSI_OPCODE_FRC */
1944 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1945 FETCH( func, *inst, 0, 0, chan_index );
1946 emit_frc( func, 0, 0 );
1947 STORE( func, *inst, 0, 0, chan_index );
1948 }
1949 break;
1950
1951 case TGSI_OPCODE_CLAMP:
1952 return 0;
1953 break;
1954
1955 case TGSI_OPCODE_FLOOR:
1956 /* TGSI_OPCODE_FLR */
1957 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1958 FETCH( func, *inst, 0, 0, chan_index );
1959 emit_flr( func, 0, 0 );
1960 STORE( func, *inst, 0, 0, chan_index );
1961 }
1962 break;
1963
1964 case TGSI_OPCODE_ROUND:
1965 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1966 FETCH( func, *inst, 0, 0, chan_index );
1967 emit_rnd( func, 0, 0 );
1968 STORE( func, *inst, 0, 0, chan_index );
1969 }
1970 break;
1971
1972 case TGSI_OPCODE_EXPBASE2:
1973 /* TGSI_OPCODE_EX2 */
1974 FETCH( func, *inst, 0, 0, CHAN_X );
1975 emit_ex2( func, 0, 0 );
1976 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1977 STORE( func, *inst, 0, 0, chan_index );
1978 }
1979 break;
1980
1981 case TGSI_OPCODE_LOGBASE2:
1982 /* TGSI_OPCODE_LG2 */
1983 FETCH( func, *inst, 0, 0, CHAN_X );
1984 emit_lg2( func, 0, 0 );
1985 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1986 STORE( func, *inst, 0, 0, chan_index );
1987 }
1988 break;
1989
1990 case TGSI_OPCODE_POWER:
1991 /* TGSI_OPCODE_POW */
1992 FETCH( func, *inst, 0, 0, CHAN_X );
1993 FETCH( func, *inst, 1, 1, CHAN_X );
1994 emit_pow( func, 0, 0, 0, 1 );
1995 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1996 STORE( func, *inst, 0, 0, chan_index );
1997 }
1998 break;
1999
2000 case TGSI_OPCODE_CROSSPRODUCT:
2001 /* TGSI_OPCODE_XPD */
2002 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
2003 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
2004 FETCH( func, *inst, 1, 1, CHAN_Z );
2005 FETCH( func, *inst, 3, 0, CHAN_Z );
2006 }
2007 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
2008 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
2009 FETCH( func, *inst, 0, 0, CHAN_Y );
2010 FETCH( func, *inst, 4, 1, CHAN_Y );
2011 }
2012 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
2013 emit_MOV( func, 2, 0 );
2014 emit_mul( func, 2, 1 );
2015 emit_MOV( func, 5, 3 );
2016 emit_mul( func, 5, 4 );
2017 emit_sub( func, 2, 5 );
2018 STORE( func, *inst, 2, 0, CHAN_X );
2019 }
2020 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
2021 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
2022 FETCH( func, *inst, 2, 1, CHAN_X );
2023 FETCH( func, *inst, 5, 0, CHAN_X );
2024 }
2025 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2026 emit_mul( func, 3, 2 );
2027 emit_mul( func, 1, 5 );
2028 emit_sub( func, 3, 1 );
2029 STORE( func, *inst, 3, 0, CHAN_Y );
2030 }
2031 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2032 emit_mul( func, 5, 4 );
2033 emit_mul( func, 0, 2 );
2034 emit_sub( func, 5, 0 );
2035 STORE( func, *inst, 5, 0, CHAN_Z );
2036 }
2037 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2038 emit_tempf(
2039 func,
2040 0,
2041 TEMP_ONE_I,
2042 TEMP_ONE_C );
2043 STORE( func, *inst, 0, 0, CHAN_W );
2044 }
2045 break;
2046
2047 case TGSI_OPCODE_MULTIPLYMATRIX:
2048 return 0;
2049 break;
2050
2051 case TGSI_OPCODE_ABS:
2052 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2053 FETCH( func, *inst, 0, 0, chan_index );
2054 emit_abs( func, 0) ;
2055
2056 STORE( func, *inst, 0, 0, chan_index );
2057 }
2058 break;
2059
2060 case TGSI_OPCODE_RCC:
2061 return 0;
2062 break;
2063
2064 case TGSI_OPCODE_DPH:
2065 FETCH( func, *inst, 0, 0, CHAN_X );
2066 FETCH( func, *inst, 1, 1, CHAN_X );
2067 emit_mul( func, 0, 1 );
2068 FETCH( func, *inst, 1, 0, CHAN_Y );
2069 FETCH( func, *inst, 2, 1, CHAN_Y );
2070 emit_mul( func, 1, 2 );
2071 emit_add( func, 0, 1 );
2072 FETCH( func, *inst, 1, 0, CHAN_Z );
2073 FETCH( func, *inst, 2, 1, CHAN_Z );
2074 emit_mul( func, 1, 2 );
2075 emit_add( func, 0, 1 );
2076 FETCH( func, *inst, 1, 1, CHAN_W );
2077 emit_add( func, 0, 1 );
2078 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2079 STORE( func, *inst, 0, 0, chan_index );
2080 }
2081 break;
2082
2083 case TGSI_OPCODE_COS:
2084 FETCH( func, *inst, 0, 0, CHAN_X );
2085 emit_cos( func, 0, 0 );
2086 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2087 STORE( func, *inst, 0, 0, chan_index );
2088 }
2089 break;
2090
2091 case TGSI_OPCODE_DDX:
2092 return 0;
2093 break;
2094
2095 case TGSI_OPCODE_DDY:
2096 return 0;
2097 break;
2098
2099 case TGSI_OPCODE_KILP:
2100 /* predicated kill */
2101 emit_kilp( func );
2102 return 0; /* XXX fix me */
2103 break;
2104
2105 case TGSI_OPCODE_KIL:
2106 /* conditional kill */
2107 emit_kil( func, &inst->FullSrcRegisters[0] );
2108 break;
2109
2110 case TGSI_OPCODE_PK2H:
2111 return 0;
2112 break;
2113
2114 case TGSI_OPCODE_PK2US:
2115 return 0;
2116 break;
2117
2118 case TGSI_OPCODE_PK4B:
2119 return 0;
2120 break;
2121
2122 case TGSI_OPCODE_PK4UB:
2123 return 0;
2124 break;
2125
2126 case TGSI_OPCODE_RFL:
2127 return 0;
2128 break;
2129
2130 case TGSI_OPCODE_SEQ:
2131 return 0;
2132 break;
2133
2134 case TGSI_OPCODE_SFL:
2135 return 0;
2136 break;
2137
2138 case TGSI_OPCODE_SGT:
2139 return 0;
2140 break;
2141
2142 case TGSI_OPCODE_SIN:
2143 FETCH( func, *inst, 0, 0, CHAN_X );
2144 emit_sin( func, 0, 0 );
2145 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2146 STORE( func, *inst, 0, 0, chan_index );
2147 }
2148 break;
2149
2150 case TGSI_OPCODE_SLE:
2151 return 0;
2152 break;
2153
2154 case TGSI_OPCODE_SNE:
2155 return 0;
2156 break;
2157
2158 case TGSI_OPCODE_STR:
2159 return 0;
2160 break;
2161
2162 case TGSI_OPCODE_TEX:
2163 if (0) {
2164 /* Disable dummy texture code:
2165 */
2166 emit_tempf(
2167 func,
2168 0,
2169 TEMP_ONE_I,
2170 TEMP_ONE_C );
2171 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2172 STORE( func, *inst, 0, 0, chan_index );
2173 }
2174 }
2175 else {
2176 return 0;
2177 }
2178 break;
2179
2180 case TGSI_OPCODE_TXD:
2181 return 0;
2182 break;
2183
2184 case TGSI_OPCODE_UP2H:
2185 return 0;
2186 break;
2187
2188 case TGSI_OPCODE_UP2US:
2189 return 0;
2190 break;
2191
2192 case TGSI_OPCODE_UP4B:
2193 return 0;
2194 break;
2195
2196 case TGSI_OPCODE_UP4UB:
2197 return 0;
2198 break;
2199
2200 case TGSI_OPCODE_X2D:
2201 return 0;
2202 break;
2203
2204 case TGSI_OPCODE_ARA:
2205 return 0;
2206 break;
2207
2208 case TGSI_OPCODE_ARR:
2209 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2210 FETCH( func, *inst, 0, 0, chan_index );
2211 emit_rnd( func, 0, 0 );
2212 emit_f2it( func, 0 );
2213 STORE( func, *inst, 0, 0, chan_index );
2214 }
2215 break;
2216
2217 case TGSI_OPCODE_BRA:
2218 return 0;
2219 break;
2220
2221 case TGSI_OPCODE_CAL:
2222 return 0;
2223 break;
2224
2225 case TGSI_OPCODE_RET:
2226 emit_ret( func );
2227 break;
2228
2229 case TGSI_OPCODE_END:
2230 break;
2231
2232 case TGSI_OPCODE_SSG:
2233 /* TGSI_OPCODE_SGN */
2234 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2235 FETCH( func, *inst, 0, 0, chan_index );
2236 emit_sgn( func, 0, 0 );
2237 STORE( func, *inst, 0, 0, chan_index );
2238 }
2239 break;
2240
2241 case TGSI_OPCODE_CMP:
2242 emit_cmp (func, inst);
2243 break;
2244
2245 case TGSI_OPCODE_SCS:
2246 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
2247 FETCH( func, *inst, 0, 0, CHAN_X );
2248 emit_cos( func, 0, 0 );
2249 STORE( func, *inst, 0, 0, CHAN_X );
2250 }
2251 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2252 FETCH( func, *inst, 0, 0, CHAN_X );
2253 emit_sin( func, 0, 0 );
2254 STORE( func, *inst, 0, 0, CHAN_Y );
2255 }
2256 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2257 emit_tempf(
2258 func,
2259 0,
2260 TGSI_EXEC_TEMP_00000000_I,
2261 TGSI_EXEC_TEMP_00000000_C );
2262 STORE( func, *inst, 0, 0, CHAN_Z );
2263 }
2264 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2265 emit_tempf(
2266 func,
2267 0,
2268 TEMP_ONE_I,
2269 TEMP_ONE_C );
2270 STORE( func, *inst, 0, 0, CHAN_W );
2271 }
2272 break;
2273
2274 case TGSI_OPCODE_TXB:
2275 return 0;
2276 break;
2277
2278 case TGSI_OPCODE_NRM:
2279 /* fall-through */
2280 case TGSI_OPCODE_NRM4:
2281 /* 3 or 4-component normalization */
2282 {
2283 uint dims = (inst->Instruction.Opcode == TGSI_OPCODE_NRM) ? 3 : 4;
2284
2285 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) ||
2286 IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y) ||
2287 IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z) ||
2288 (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 4)) {
2289
2290 /* NOTE: Cannot use xmm regs 2/3 here (see emit_rsqrt() above). */
2291
2292 /* xmm4 = src.x */
2293 /* xmm0 = src.x * src.x */
2294 FETCH(func, *inst, 0, 0, CHAN_X);
2295 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
2296 emit_MOV(func, 4, 0);
2297 }
2298 emit_mul(func, 0, 0);
2299
2300 /* xmm5 = src.y */
2301 /* xmm0 = xmm0 + src.y * src.y */
2302 FETCH(func, *inst, 1, 0, CHAN_Y);
2303 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2304 emit_MOV(func, 5, 1);
2305 }
2306 emit_mul(func, 1, 1);
2307 emit_add(func, 0, 1);
2308
2309 /* xmm6 = src.z */
2310 /* xmm0 = xmm0 + src.z * src.z */
2311 FETCH(func, *inst, 1, 0, CHAN_Z);
2312 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2313 emit_MOV(func, 6, 1);
2314 }
2315 emit_mul(func, 1, 1);
2316 emit_add(func, 0, 1);
2317
2318 if (dims == 4) {
2319 /* xmm7 = src.w */
2320 /* xmm0 = xmm0 + src.w * src.w */
2321 FETCH(func, *inst, 1, 0, CHAN_W);
2322 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W)) {
2323 emit_MOV(func, 7, 1);
2324 }
2325 emit_mul(func, 1, 1);
2326 emit_add(func, 0, 1);
2327 }
2328
2329 /* xmm1 = 1 / sqrt(xmm0) */
2330 emit_rsqrt(func, 1, 0);
2331
2332 /* dst.x = xmm1 * src.x */
2333 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
2334 emit_mul(func, 4, 1);
2335 STORE(func, *inst, 4, 0, CHAN_X);
2336 }
2337
2338 /* dst.y = xmm1 * src.y */
2339 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2340 emit_mul(func, 5, 1);
2341 STORE(func, *inst, 5, 0, CHAN_Y);
2342 }
2343
2344 /* dst.z = xmm1 * src.z */
2345 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2346 emit_mul(func, 6, 1);
2347 STORE(func, *inst, 6, 0, CHAN_Z);
2348 }
2349
2350 /* dst.w = xmm1 * src.w */
2351 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) && dims == 4) {
2352 emit_mul(func, 7, 1);
2353 STORE(func, *inst, 7, 0, CHAN_W);
2354 }
2355 }
2356
2357 /* dst0.w = 1.0 */
2358 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 3) {
2359 emit_tempf(func, 0, TEMP_ONE_I, TEMP_ONE_C);
2360 STORE(func, *inst, 0, 0, CHAN_W);
2361 }
2362 }
2363 break;
2364
2365 case TGSI_OPCODE_DIV:
2366 return 0;
2367 break;
2368
2369 case TGSI_OPCODE_DP2:
2370 FETCH( func, *inst, 0, 0, CHAN_X ); /* xmm0 = src[0].x */
2371 FETCH( func, *inst, 1, 1, CHAN_X ); /* xmm1 = src[1].x */
2372 emit_mul( func, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
2373 FETCH( func, *inst, 1, 0, CHAN_Y ); /* xmm1 = src[0].y */
2374 FETCH( func, *inst, 2, 1, CHAN_Y ); /* xmm2 = src[1].y */
2375 emit_mul( func, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
2376 emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2377 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2378 STORE( func, *inst, 0, 0, chan_index ); /* dest[ch] = xmm0 */
2379 }
2380 break;
2381
2382 case TGSI_OPCODE_TXL:
2383 return 0;
2384 break;
2385
2386 case TGSI_OPCODE_BRK:
2387 return 0;
2388 break;
2389
2390 case TGSI_OPCODE_IF:
2391 return 0;
2392 break;
2393
2394 case TGSI_OPCODE_LOOP:
2395 return 0;
2396 break;
2397
2398 case TGSI_OPCODE_REP:
2399 return 0;
2400 break;
2401
2402 case TGSI_OPCODE_ELSE:
2403 return 0;
2404 break;
2405
2406 case TGSI_OPCODE_ENDIF:
2407 return 0;
2408 break;
2409
2410 case TGSI_OPCODE_ENDLOOP:
2411 return 0;
2412 break;
2413
2414 case TGSI_OPCODE_ENDREP:
2415 return 0;
2416 break;
2417
2418 case TGSI_OPCODE_PUSHA:
2419 return 0;
2420 break;
2421
2422 case TGSI_OPCODE_POPA:
2423 return 0;
2424 break;
2425
2426 case TGSI_OPCODE_CEIL:
2427 return 0;
2428 break;
2429
2430 case TGSI_OPCODE_I2F:
2431 return 0;
2432 break;
2433
2434 case TGSI_OPCODE_NOT:
2435 return 0;
2436 break;
2437
2438 case TGSI_OPCODE_TRUNC:
2439 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2440 FETCH( func, *inst, 0, 0, chan_index );
2441 emit_f2it( func, 0 );
2442 emit_i2f( func, 0 );
2443 STORE( func, *inst, 0, 0, chan_index );
2444 }
2445 break;
2446
2447 case TGSI_OPCODE_SHL:
2448 return 0;
2449 break;
2450
2451 case TGSI_OPCODE_SHR:
2452 return 0;
2453 break;
2454
2455 case TGSI_OPCODE_AND:
2456 return 0;
2457 break;
2458
2459 case TGSI_OPCODE_OR:
2460 return 0;
2461 break;
2462
2463 case TGSI_OPCODE_MOD:
2464 return 0;
2465 break;
2466
2467 case TGSI_OPCODE_XOR:
2468 return 0;
2469 break;
2470
2471 case TGSI_OPCODE_SAD:
2472 return 0;
2473 break;
2474
2475 case TGSI_OPCODE_TXF:
2476 return 0;
2477 break;
2478
2479 case TGSI_OPCODE_TXQ:
2480 return 0;
2481 break;
2482
2483 case TGSI_OPCODE_CONT:
2484 return 0;
2485 break;
2486
2487 case TGSI_OPCODE_EMIT:
2488 return 0;
2489 break;
2490
2491 case TGSI_OPCODE_ENDPRIM:
2492 return 0;
2493 break;
2494
2495 default:
2496 return 0;
2497 }
2498
2499 return 1;
2500 }
2501
/**
 * Emit code to compute interpolated fragment shader inputs.
 *
 * For each declared INPUT register/channel, generates code that evaluates
 * the plane equation a0 + x*dadx + y*dady (optionally divided by w for
 * perspective) and writes the result into the machine's input array.
 * Declarations for other register files emit nothing.
 *
 * NOTE(review): machine temp[0] is read via emit_tempf(func, n, 0, ...)
 * for the fragment x/y/w values — presumably set up by the caller before
 * the shader body runs; verify against the invocation code.
 */
static void
emit_declaration(
   struct x86_function *func,
   struct tgsi_full_declaration *decl )
{
   if( decl->Declaration.File == TGSI_FILE_INPUT ) {
      unsigned first, last, mask;
      unsigned i, j;

      first = decl->DeclarationRange.First;
      last = decl->DeclarationRange.Last;
      mask = decl->Declaration.UsageMask;

      for( i = first; i <= last; i++ ) {
         for( j = 0; j < NUM_CHANNELS; j++ ) {
            if( mask & (1 << j) ) {
               switch( decl->Declaration.Interpolate ) {
               case TGSI_INTERPOLATE_CONSTANT:
                  /* input = a0 (no per-fragment variation) */
                  emit_coef_a0( func, 0, i, j );
                  emit_inputs( func, 0, i, j );
                  break;

               case TGSI_INTERPOLATE_LINEAR:
                  /* input = a0 + x*dadx + y*dady */
                  emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
                  emit_coef_dadx( func, 1, i, j );
                  emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
                  emit_coef_dady( func, 3, i, j );
                  emit_mul( func, 0, 1 );    /* x * dadx */
                  emit_coef_a0( func, 4, i, j );
                  emit_mul( func, 2, 3 );    /* y * dady */
                  emit_add( func, 0, 4 );    /* x * dadx + a0 */
                  emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
                  emit_inputs( func, 0, i, j );
                  break;

               case TGSI_INTERPOLATE_PERSPECTIVE:
                  /* input = (a0 + x*dadx + y*dady) / w */
                  emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
                  emit_coef_dadx( func, 1, i, j );
                  emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
                  emit_coef_dady( func, 3, i, j );
                  emit_mul( func, 0, 1 );    /* x * dadx */
                  emit_tempf( func, 4, 0, TGSI_SWIZZLE_W );
                  emit_coef_a0( func, 5, i, j );
                  emit_rcp( func, 4, 4 );    /* 1.0 / w */
                  emit_mul( func, 2, 3 );    /* y * dady */
                  emit_add( func, 0, 5 );    /* x * dadx + a0 */
                  emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
                  emit_mul( func, 0, 4 );    /* (x * dadx + y * dady + a0) / w */
                  emit_inputs( func, 0, i, j );
                  break;

               default:
                  assert( 0 );
                  break;
               }
            }
         }
      }
   }
}
2562
2563 static void aos_to_soa( struct x86_function *func,
2564 uint arg_aos,
2565 uint arg_machine,
2566 uint arg_num,
2567 uint arg_stride )
2568 {
2569 struct x86_reg soa_input = x86_make_reg( file_REG32, reg_AX );
2570 struct x86_reg aos_input = x86_make_reg( file_REG32, reg_BX );
2571 struct x86_reg num_inputs = x86_make_reg( file_REG32, reg_CX );
2572 struct x86_reg stride = x86_make_reg( file_REG32, reg_DX );
2573 int inner_loop;
2574
2575
2576 /* Save EBX */
2577 x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
2578
2579 x86_mov( func, aos_input, x86_fn_arg( func, arg_aos ) );
2580 x86_mov( func, soa_input, x86_fn_arg( func, arg_machine ) );
2581 x86_lea( func, soa_input,
2582 x86_make_disp( soa_input,
2583 Offset(struct tgsi_exec_machine, Inputs) ) );
2584 x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) );
2585 x86_mov( func, stride, x86_fn_arg( func, arg_stride ) );
2586
2587 /* do */
2588 inner_loop = x86_get_label( func );
2589 {
2590 x86_push( func, aos_input );
2591 sse_movlps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
2592 sse_movlps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
2593 x86_add( func, aos_input, stride );
2594 sse_movhps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
2595 sse_movhps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
2596 x86_add( func, aos_input, stride );
2597 sse_movlps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
2598 sse_movlps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
2599 x86_add( func, aos_input, stride );
2600 sse_movhps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
2601 sse_movhps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
2602 x86_pop( func, aos_input );
2603
2604 sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
2605 sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
2606 sse_shufps( func, make_xmm( 0 ), make_xmm( 1 ), 0x88 );
2607 sse_shufps( func, make_xmm( 2 ), make_xmm( 1 ), 0xdd );
2608 sse_shufps( func, make_xmm( 3 ), make_xmm( 4 ), 0x88 );
2609 sse_shufps( func, make_xmm( 5 ), make_xmm( 4 ), 0xdd );
2610
2611 sse_movups( func, x86_make_disp( soa_input, 0 ), make_xmm( 0 ) );
2612 sse_movups( func, x86_make_disp( soa_input, 16 ), make_xmm( 2 ) );
2613 sse_movups( func, x86_make_disp( soa_input, 32 ), make_xmm( 3 ) );
2614 sse_movups( func, x86_make_disp( soa_input, 48 ), make_xmm( 5 ) );
2615
2616 /* Advance to next input */
2617 x86_lea( func, aos_input, x86_make_disp(aos_input, 16) );
2618 x86_lea( func, soa_input, x86_make_disp(soa_input, 64) );
2619 }
2620 /* while --num_inputs */
2621 x86_dec( func, num_inputs );
2622 x86_jcc( func, cc_NE, inner_loop );
2623
2624 /* Restore EBX */
2625 x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
2626 }
2627
2628 static void soa_to_aos( struct x86_function *func,
2629 uint arg_aos,
2630 uint arg_machine,
2631 uint arg_num,
2632 uint arg_stride )
2633 {
2634 struct x86_reg soa_output = x86_make_reg( file_REG32, reg_AX );
2635 struct x86_reg aos_output = x86_make_reg( file_REG32, reg_BX );
2636 struct x86_reg num_outputs = x86_make_reg( file_REG32, reg_CX );
2637 struct x86_reg temp = x86_make_reg( file_REG32, reg_DX );
2638 int inner_loop;
2639
2640 /* Save EBX */
2641 x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
2642
2643 x86_mov( func, aos_output, x86_fn_arg( func, arg_aos ) );
2644 x86_mov( func, soa_output, x86_fn_arg( func, arg_machine ) );
2645 x86_lea( func, soa_output,
2646 x86_make_disp( soa_output,
2647 Offset(struct tgsi_exec_machine, Outputs) ) );
2648 x86_mov( func, num_outputs, x86_fn_arg( func, arg_num ) );
2649
2650 /* do */
2651 inner_loop = x86_get_label( func );
2652 {
2653 sse_movups( func, make_xmm( 0 ), x86_make_disp( soa_output, 0 ) );
2654 sse_movups( func, make_xmm( 1 ), x86_make_disp( soa_output, 16 ) );
2655 sse_movups( func, make_xmm( 3 ), x86_make_disp( soa_output, 32 ) );
2656 sse_movups( func, make_xmm( 4 ), x86_make_disp( soa_output, 48 ) );
2657
2658 sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
2659 sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
2660 sse_unpcklps( func, make_xmm( 0 ), make_xmm( 1 ) );
2661 sse_unpckhps( func, make_xmm( 2 ), make_xmm( 1 ) );
2662 sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) );
2663 sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) );
2664
2665 x86_mov( func, temp, x86_fn_arg( func, arg_stride ) );
2666 x86_push( func, aos_output );
2667 sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
2668 sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
2669 x86_add( func, aos_output, temp );
2670 sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
2671 sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
2672 x86_add( func, aos_output, temp );
2673 sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
2674 sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
2675 x86_add( func, aos_output, temp );
2676 sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
2677 sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
2678 x86_pop( func, aos_output );
2679
2680 /* Advance to next output */
2681 x86_lea( func, aos_output, x86_make_disp(aos_output, 16) );
2682 x86_lea( func, soa_output, x86_make_disp(soa_output, 64) );
2683 }
2684 /* while --num_outputs */
2685 x86_dec( func, num_outputs );
2686 x86_jcc( func, cc_NE, inner_loop );
2687
2688 /* Restore EBX */
2689 x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
2690 }
2691
2692 /**
2693 * Translate a TGSI vertex/fragment shader to SSE2 code.
2694 * Slightly different things are done for vertex vs. fragment shaders.
2695 *
2696 * \param tokens the TGSI input shader
2697 * \param func the output SSE code/function
2698 * \param immediates buffer to place immediates, later passed to SSE func
 * \return 1 for success, 0 if translation failed
2700 */
2701 unsigned
2702 tgsi_emit_sse2(
2703 const struct tgsi_token *tokens,
2704 struct x86_function *func,
2705 float (*immediates)[4],
2706 boolean do_swizzles )
2707 {
2708 struct tgsi_parse_context parse;
2709 unsigned ok = 1;
2710 uint num_immediates = 0;
2711
2712 util_init_math();
2713
2714 func->csr = func->store;
2715
2716 tgsi_parse_init( &parse, tokens );
2717
2718 /* Can't just use EDI, EBX without save/restoring them:
2719 */
2720 x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
2721 x86_push( func, x86_make_reg( file_REG32, reg_DI ) );
2722
2723 /*
2724 * Different function args for vertex/fragment shaders:
2725 */
2726 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
2727 if (do_swizzles)
2728 aos_to_soa( func,
2729 4, /* aos_input */
2730 1, /* machine */
2731 5, /* num_inputs */
2732 6 ); /* input_stride */
2733 }
2734
2735 x86_mov(
2736 func,
2737 get_machine_base(),
2738 x86_fn_arg( func, 1 ) );
2739 x86_mov(
2740 func,
2741 get_const_base(),
2742 x86_fn_arg( func, 2 ) );
2743 x86_mov(
2744 func,
2745 get_immediate_base(),
2746 x86_fn_arg( func, 3 ) );
2747
2748 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
2749 x86_mov(
2750 func,
2751 get_coef_base(),
2752 x86_fn_arg( func, 4 ) );
2753 }
2754
2755
2756 while( !tgsi_parse_end_of_tokens( &parse ) && ok ) {
2757 tgsi_parse_token( &parse );
2758
2759 switch( parse.FullToken.Token.Type ) {
2760 case TGSI_TOKEN_TYPE_DECLARATION:
2761 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
2762 emit_declaration(
2763 func,
2764 &parse.FullToken.FullDeclaration );
2765 }
2766 break;
2767
2768 case TGSI_TOKEN_TYPE_INSTRUCTION:
2769 ok = emit_instruction(
2770 func,
2771 &parse.FullToken.FullInstruction );
2772
2773 if (!ok) {
2774 debug_printf("failed to translate tgsi opcode %d to SSE (%s)\n",
2775 parse.FullToken.FullInstruction.Instruction.Opcode,
2776 parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX ?
2777 "vertex shader" : "fragment shader");
2778 }
2779 break;
2780
2781 case TGSI_TOKEN_TYPE_IMMEDIATE:
2782 /* simply copy the immediate values into the next immediates[] slot */
2783 {
2784 const uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
2785 uint i;
2786 assert(size <= 4);
2787 assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES);
2788 for( i = 0; i < size; i++ ) {
2789 immediates[num_immediates][i] =
2790 parse.FullToken.FullImmediate.u.ImmediateFloat32[i].Float;
2791 }
2792 #if 0
2793 debug_printf("SSE FS immediate[%d] = %f %f %f %f\n",
2794 num_immediates,
2795 immediates[num_immediates][0],
2796 immediates[num_immediates][1],
2797 immediates[num_immediates][2],
2798 immediates[num_immediates][3]);
2799 #endif
2800 num_immediates++;
2801 }
2802 break;
2803
2804 default:
2805 ok = 0;
2806 assert( 0 );
2807 }
2808 }
2809
2810 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
2811 if (do_swizzles)
2812 soa_to_aos( func,
2813 7, /* aos_output */
2814 1, /* machine */
2815 8, /* num_outputs */
2816 9 ); /* output_stride */
2817 }
2818
2819 /* Can't just use EBX, EDI without save/restoring them:
2820 */
2821 x86_pop( func, x86_make_reg( file_REG32, reg_DI ) );
2822 x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
2823
2824 emit_ret( func );
2825
2826 tgsi_parse_free( &parse );
2827
2828 return ok;
2829 }
2830
2831 #endif /* PIPE_ARCH_X86 */
2832