src/gallium/auxiliary/tgsi/tgsi_sse2.c
1 /**************************************************************************
2 *
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28 #include "pipe/p_config.h"
29
30 #if defined(PIPE_ARCH_X86)
31
32 #include "pipe/p_debug.h"
33 #include "pipe/p_shader_tokens.h"
34 #include "util/u_math.h"
35 #if defined(PIPE_ARCH_SSE)
36 #include "util/u_sse.h"
37 #endif
38 #include "tgsi/tgsi_parse.h"
39 #include "tgsi/tgsi_util.h"
40 #include "tgsi_exec.h"
41 #include "tgsi_sse2.h"
42
43 #include "rtasm/rtasm_x86sse.h"
44
  45 /* For the HIGH_PRECISION 1/sqrt() path below.
  46 *
  47 * Enabling it costs about 100 fps (close to 10%) in gears.
  48 */
49 #define HIGH_PRECISION 1
50
51 #define FAST_MATH 1
52
53
54 #define FOR_EACH_CHANNEL( CHAN )\
55 for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)
56
57 #define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
58 ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))
59
60 #define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
61 if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))
62
63 #define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
64 FOR_EACH_CHANNEL( CHAN )\
65 IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )
66
67 #define CHAN_X 0
68 #define CHAN_Y 1
69 #define CHAN_Z 2
70 #define CHAN_W 3
71
72 #define TEMP_ONE_I TGSI_EXEC_TEMP_ONE_I
73 #define TEMP_ONE_C TGSI_EXEC_TEMP_ONE_C
74
75 #define TEMP_R0 TGSI_EXEC_TEMP_R0
76 #define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
77 #define TEMP_EXEC_MASK_I TGSI_EXEC_MASK_I
78 #define TEMP_EXEC_MASK_C TGSI_EXEC_MASK_C
79
80
81 /**
82 * X86 utility functions.
83 */
84
85 static struct x86_reg
86 make_xmm(
87 unsigned xmm )
88 {
89 return x86_make_reg(
90 file_XMM,
91 (enum x86_reg_name) xmm );
92 }
93
94 /**
95 * X86 register mapping helpers.
96 */
97
98 static struct x86_reg
99 get_const_base( void )
100 {
101 return x86_make_reg(
102 file_REG32,
103 reg_CX );
104 }
105
106 static struct x86_reg
107 get_input_base( void )
108 {
109 return x86_make_reg(
110 file_REG32,
111 reg_AX );
112 }
113
114 static struct x86_reg
115 get_output_base( void )
116 {
117 return x86_make_reg(
118 file_REG32,
119 reg_DX );
120 }
121
122 static struct x86_reg
123 get_temp_base( void )
124 {
125 return x86_make_reg(
126 file_REG32,
127 reg_BX );
128 }
129
130 static struct x86_reg
131 get_coef_base( void )
132 {
133 return get_output_base();
134 }
135
136 static struct x86_reg
137 get_immediate_base( void )
138 {
139 return x86_make_reg(
140 file_REG32,
141 reg_DI );
142 }
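/* Summary of the register mapping above, as set up by the code that calls
 * the generated shader (the prologue is not shown in this section):
 * eax -> inputs, ecx -> constants, edx -> outputs (reused for the
 * interpolation coefficients), ebx -> temporaries, edi -> immediates.
 */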
143
144
145 /**
146 * Data access helpers.
147 */
148
149
150 static struct x86_reg
151 get_immediate(
152 unsigned vec,
153 unsigned chan )
154 {
155 return x86_make_disp(
156 get_immediate_base(),
157 (vec * 4 + chan) * 4 );
158 }
159
160 static struct x86_reg
161 get_const(
162 unsigned vec,
163 unsigned chan )
164 {
165 return x86_make_disp(
166 get_const_base(),
167 (vec * 4 + chan) * 4 );
168 }
169
170 static struct x86_reg
171 get_input(
172 unsigned vec,
173 unsigned chan )
174 {
175 return x86_make_disp(
176 get_input_base(),
177 (vec * 4 + chan) * 16 );
178 }
179
180 static struct x86_reg
181 get_output(
182 unsigned vec,
183 unsigned chan )
184 {
185 return x86_make_disp(
186 get_output_base(),
187 (vec * 4 + chan) * 16 );
188 }
189
190 static struct x86_reg
191 get_temp(
192 unsigned vec,
193 unsigned chan )
194 {
195 return x86_make_disp(
196 get_temp_base(),
197 (vec * 4 + chan) * 16 );
198 }
199
200 static struct x86_reg
201 get_coef(
202 unsigned vec,
203 unsigned chan,
204 unsigned member )
205 {
206 return x86_make_disp(
207 get_coef_base(),
208 ((vec * 3 + member) * 4 + chan) * 4 );
209 }
210
211
212 static void
213 emit_ret(
214 struct x86_function *func )
215 {
216 x86_ret( func );
217 }
218
219
220 /**
221 * Data fetch helpers.
222 */
223
224 /**
225 * Copy a shader constant to xmm register
226 * \param xmm the destination xmm register
227 * \param vec the src const buffer index
228 * \param chan src channel to fetch (X, Y, Z or W)
229 */
230 static void
231 emit_const(
232 struct x86_function *func,
233 uint xmm,
234 int vec,
235 uint chan,
236 uint indirect,
237 uint indirectFile,
238 int indirectIndex )
239 {
240 if (indirect) {
241 /* 'vec' is the offset from the address register's value.
242 * We're loading CONST[ADDR+vec] into an xmm register.
243 */
244 struct x86_reg r0 = get_input_base();
245 struct x86_reg r1 = get_output_base();
246 uint i;
247
248 assert( indirectFile == TGSI_FILE_ADDRESS );
249 assert( indirectIndex == 0 );
250
251 x86_push( func, r0 );
252 x86_push( func, r1 );
253
254 /*
255 * Loop over the four pixels or vertices in the quad.
256 * Get the value of the address (offset) register for pixel/vertex[i],
257 * add it to the src offset and index into the constant buffer.
258 * Note that we're working on SOA data.
259 * If any of the pixel/vertex execution channels are unused their
260 * values will be garbage. It's very important that we don't use
261 * those garbage values as indexes into the constant buffer since
262 * that'll cause segfaults.
263 * The solution is to bitwise-AND the offset with the execution mask
264 * register whose values are either 0 or ~0.
 265 * The caller must set up the execution mask register to indicate
266 * which channels are valid/alive before running the shader.
267 * The execution mask will also figure into loops and conditionals
268 * someday.
269 */
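      /* In C terms, the loop below computes roughly the following (the
       * names are illustrative only):
       *
       *    for (i = 0; i < QUAD_SIZE; i++) {
       *       int a = addr[i] & exec_mask[i];          // masked offset
       *       tmp_r0[i] = const_buf[(vec + a) * 4 + chan];
       *    }
       *
       * since each constant vector is 4 floats (16 bytes) and the masked
       * address register value is scaled by 16 via the repeated adds.
       */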
270 for (i = 0; i < QUAD_SIZE; i++) {
271 /* r1 = address register[i] */
272 x86_mov( func, r1, x86_make_disp( get_temp( TEMP_ADDR, CHAN_X ), i * 4 ) );
273 /* r0 = execution mask[i] */
274 x86_mov( func, r0, x86_make_disp( get_temp( TEMP_EXEC_MASK_I, TEMP_EXEC_MASK_C ), i * 4 ) );
275 /* r1 = r1 & r0 */
276 x86_and( func, r1, r0 );
277 /* r0 = 'vec', the offset */
278 x86_lea( func, r0, get_const( vec, chan ) );
279
280 /* Quick hack to multiply r1 by 16 -- need to add SHL to rtasm.
281 */
282 x86_add( func, r1, r1 );
283 x86_add( func, r1, r1 );
284 x86_add( func, r1, r1 );
285 x86_add( func, r1, r1 );
286
287 x86_add( func, r0, r1 ); /* r0 = r0 + r1 */
288 x86_mov( func, r1, x86_deref( r0 ) );
289 x86_mov( func, x86_make_disp( get_temp( TEMP_R0, CHAN_X ), i * 4 ), r1 );
290 }
291
292 x86_pop( func, r1 );
293 x86_pop( func, r0 );
294
295 sse_movaps(
296 func,
297 make_xmm( xmm ),
298 get_temp( TEMP_R0, CHAN_X ) );
299 }
300 else {
301 /* 'vec' is the index into the src register file, such as TEMP[vec] */
302 assert( vec >= 0 );
303
304 sse_movss(
305 func,
306 make_xmm( xmm ),
307 get_const( vec, chan ) );
308 sse_shufps(
309 func,
310 make_xmm( xmm ),
311 make_xmm( xmm ),
312 SHUF( 0, 0, 0, 0 ) );
313 }
314 }
315
316 static void
317 emit_immediate(
318 struct x86_function *func,
319 unsigned xmm,
320 unsigned vec,
321 unsigned chan )
322 {
323 sse_movss(
324 func,
325 make_xmm( xmm ),
326 get_immediate( vec, chan ) );
327 sse_shufps(
328 func,
329 make_xmm( xmm ),
330 make_xmm( xmm ),
331 SHUF( 0, 0, 0, 0 ) );
332 }
333
334
335 /**
336 * Copy a shader input to xmm register
337 * \param xmm the destination xmm register
338 * \param vec the src input attrib
339 * \param chan src channel to fetch (X, Y, Z or W)
340 */
341 static void
342 emit_inputf(
343 struct x86_function *func,
344 unsigned xmm,
345 unsigned vec,
346 unsigned chan )
347 {
348 sse_movups(
349 func,
350 make_xmm( xmm ),
351 get_input( vec, chan ) );
352 }
353
354 /**
355 * Store an xmm register to a shader output
356 * \param xmm the source xmm register
357 * \param vec the dest output attrib
 358 * \param chan the dest channel to store (X, Y, Z or W)
359 */
360 static void
361 emit_output(
362 struct x86_function *func,
363 unsigned xmm,
364 unsigned vec,
365 unsigned chan )
366 {
367 sse_movups(
368 func,
369 get_output( vec, chan ),
370 make_xmm( xmm ) );
371 }
372
373 /**
374 * Copy a shader temporary to xmm register
375 * \param xmm the destination xmm register
376 * \param vec the src temp register
377 * \param chan src channel to fetch (X, Y, Z or W)
378 */
379 static void
380 emit_tempf(
381 struct x86_function *func,
382 unsigned xmm,
383 unsigned vec,
384 unsigned chan )
385 {
386 sse_movaps(
387 func,
388 make_xmm( xmm ),
389 get_temp( vec, chan ) );
390 }
391
392 /**
393 * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
394 * \param xmm the destination xmm register
395 * \param vec the src input/attribute coefficient index
396 * \param chan src channel to fetch (X, Y, Z or W)
397 * \param member 0=a0, 1=dadx, 2=dady
398 */
399 static void
400 emit_coef(
401 struct x86_function *func,
402 unsigned xmm,
403 unsigned vec,
404 unsigned chan,
405 unsigned member )
406 {
407 sse_movss(
408 func,
409 make_xmm( xmm ),
410 get_coef( vec, chan, member ) );
411 sse_shufps(
412 func,
413 make_xmm( xmm ),
414 make_xmm( xmm ),
415 SHUF( 0, 0, 0, 0 ) );
416 }
417
418 /**
419 * Data store helpers.
420 */
421
422 static void
423 emit_inputs(
424 struct x86_function *func,
425 unsigned xmm,
426 unsigned vec,
427 unsigned chan )
428 {
429 sse_movups(
430 func,
431 get_input( vec, chan ),
432 make_xmm( xmm ) );
433 }
434
435 static void
436 emit_temps(
437 struct x86_function *func,
438 unsigned xmm,
439 unsigned vec,
440 unsigned chan )
441 {
442 sse_movaps(
443 func,
444 get_temp( vec, chan ),
445 make_xmm( xmm ) );
446 }
447
448 static void
449 emit_addrs(
450 struct x86_function *func,
451 unsigned xmm,
452 unsigned vec,
453 unsigned chan )
454 {
455 assert( vec == 0 );
456
457 emit_temps(
458 func,
459 xmm,
460 vec + TGSI_EXEC_TEMP_ADDR,
461 chan );
462 }
463
464 /**
 465 * Coefficient fetch helpers.
466 */
467
468 static void
469 emit_coef_a0(
470 struct x86_function *func,
471 unsigned xmm,
472 unsigned vec,
473 unsigned chan )
474 {
475 emit_coef(
476 func,
477 xmm,
478 vec,
479 chan,
480 0 );
481 }
482
483 static void
484 emit_coef_dadx(
485 struct x86_function *func,
486 unsigned xmm,
487 unsigned vec,
488 unsigned chan )
489 {
490 emit_coef(
491 func,
492 xmm,
493 vec,
494 chan,
495 1 );
496 }
497
498 static void
499 emit_coef_dady(
500 struct x86_function *func,
501 unsigned xmm,
502 unsigned vec,
503 unsigned chan )
504 {
505 emit_coef(
506 func,
507 xmm,
508 vec,
509 chan,
510 2 );
511 }
512
513 /**
514 * Function call helpers.
515 */
516
517 /**
 518 * NOTE: In gcc, if the called function uses SSE intrinsics, then it must
 519 * be defined with __attribute__((force_align_arg_pointer)), as we do not
 520 * guarantee that the stack pointer is 16-byte aligned, as those intrinsics expect.
521 */
522 static void
523 emit_func_call_dst(
524 struct x86_function *func,
525 unsigned xmm_save,
526 unsigned xmm_dst,
527 void (PIPE_CDECL *code)() )
528 {
529 struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
530 unsigned i, n;
531 unsigned xmm_mask;
532
533 /* Bitmask of the xmm registers to save */
534 xmm_mask = (1 << xmm_save) - 1;
535 xmm_mask &= ~(1 << xmm_dst);
536
537 sse_movaps(
538 func,
539 get_temp( TEMP_R0, 0 ),
540 make_xmm( xmm_dst ) );
541
542 x86_push(
543 func,
544 x86_make_reg( file_REG32, reg_AX) );
545 x86_push(
546 func,
547 x86_make_reg( file_REG32, reg_CX) );
548 x86_push(
549 func,
550 x86_make_reg( file_REG32, reg_DX) );
551
552 for(i = 0, n = 0; i < 8; ++i)
553 if(xmm_mask & (1 << i))
554 ++n;
555
556 x86_sub_imm(
557 func,
558 x86_make_reg( file_REG32, reg_SP ),
559 n*16);
560
561 for(i = 0, n = 0; i < 8; ++i)
562 if(xmm_mask & (1 << i)) {
563 sse_movups(
564 func,
565 x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ),
566 make_xmm( i ) );
567 ++n;
568 }
569
570 x86_lea(
571 func,
572 ecx,
573 get_temp( TEMP_R0, 0 ) );
574
575 x86_push( func, ecx );
576 x86_mov_reg_imm( func, ecx, (unsigned long) code );
577 x86_call( func, ecx );
578 x86_pop(func, ecx );
579
580 for(i = 0, n = 0; i < 8; ++i)
581 if(xmm_mask & (1 << i)) {
582 sse_movups(
583 func,
584 make_xmm( i ),
585 x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ) );
586 ++n;
587 }
588
589 x86_add_imm(
590 func,
591 x86_make_reg( file_REG32, reg_SP ),
592 n*16);
593
594 /* Restore GP registers in a reverse order.
595 */
596 x86_pop(
597 func,
598 x86_make_reg( file_REG32, reg_DX) );
599 x86_pop(
600 func,
601 x86_make_reg( file_REG32, reg_CX) );
602 x86_pop(
603 func,
604 x86_make_reg( file_REG32, reg_AX) );
605
606 sse_movaps(
607 func,
608 make_xmm( xmm_dst ),
609 get_temp( TEMP_R0, 0 ) );
610 }
611
612 static void
613 emit_func_call_dst_src(
614 struct x86_function *func,
615 unsigned xmm_save,
616 unsigned xmm_dst,
617 unsigned xmm_src,
618 void (PIPE_CDECL *code)() )
619 {
620 sse_movaps(
621 func,
622 get_temp( TEMP_R0, 1 ),
623 make_xmm( xmm_src ) );
624
625 emit_func_call_dst(
626 func,
627 xmm_save,
628 xmm_dst,
629 code );
630 }
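/* Callout ABI implied by the two helpers above: 'code' is a cdecl function
 * taking a single float* argument that points at TEMP_R0 in the temp file.
 * The destination quad is passed and returned in store[0..3];
 * emit_func_call_dst_src() additionally passes the source quad in
 * store[4..7] (cf. pow4f() below).
 */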
631
632
633 #if defined(PIPE_ARCH_SSE)
634
635 /*
636 * Fast SSE2 implementation of special math functions.
637 */
638
639 #define POLY0(x, c0) _mm_set1_ps(c0)
640 #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
641 #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
642 #define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
643 #define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
644 #define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
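/* POLYn(x, c0..cn) evaluates the degree-n polynomial
 * c0 + c1*x + ... + cn*x^n by Horner's rule, entirely in SSE registers.
 */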
645
646 #define EXP_POLY_DEGREE 3
647 #define LOG_POLY_DEGREE 5
648
649 /**
650 * See http://www.devmaster.net/forums/showthread.php?p=43580
651 */
652 static INLINE __m128
653 exp2f4(__m128 x)
654 {
655 __m128i ipart;
656 __m128 fpart, expipart, expfpart;
657
658 x = _mm_min_ps(x, _mm_set1_ps( 129.00000f));
659 x = _mm_max_ps(x, _mm_set1_ps(-126.99999f));
660
661 /* ipart = int(x - 0.5) */
662 ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f)));
663
664 /* fpart = x - ipart */
665 fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart));
666
667 /* expipart = (float) (1 << ipart) */
668 expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23));
669
670 /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
671 #if EXP_POLY_DEGREE == 5
672 expfpart = POLY5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f);
673 #elif EXP_POLY_DEGREE == 4
674 expfpart = POLY4(fpart, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 5.2011464e-2f, 1.3534167e-2f);
675 #elif EXP_POLY_DEGREE == 3
676 expfpart = POLY3(fpart, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 7.8024521e-2f);
677 #elif EXP_POLY_DEGREE == 2
678 expfpart = POLY2(fpart, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f);
679 #else
680 #error
681 #endif
682
683 return _mm_mul_ps(expipart, expfpart);
684 }
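/* Worked example for the exponent trick above: for ipart == 3,
 * (3 + 127) << 23 == 0x41000000, which is the IEEE-754 single-precision
 * bit pattern of 8.0f == 2^3.  The polynomial then supplies the 2^fpart
 * factor, so exp2f4(x) == 2^ipart * 2^fpart.
 */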
685
686
687 /**
688 * See http://www.devmaster.net/forums/showthread.php?p=43580
689 */
690 static INLINE __m128
691 log2f4(__m128 x)
692 {
693 __m128i expmask = _mm_set1_epi32(0x7f800000);
694 __m128i mantmask = _mm_set1_epi32(0x007fffff);
695 __m128 one = _mm_set1_ps(1.0f);
696
697 __m128i i = _mm_castps_si128(x);
698
699 /* exp = (float) exponent(x) */
700 __m128 exp = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, expmask), 23), _mm_set1_epi32(127)));
701
702 /* mant = (float) mantissa(x) */
703 __m128 mant = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, mantmask)), one);
704
705 __m128 logmant;
706
707 /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
 708 * These coefficients can be generated with
709 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
710 */
711 #if LOG_POLY_DEGREE == 6
712 logmant = POLY5(mant, 3.11578814719469302614f, -3.32419399085241980044f, 2.59883907202499966007f, -1.23152682416275988241f, 0.318212422185251071475f, -0.0344359067839062357313f);
713 #elif LOG_POLY_DEGREE == 5
714 logmant = POLY4(mant, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
715 #elif LOG_POLY_DEGREE == 4
716 logmant = POLY3(mant, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
717 #elif LOG_POLY_DEGREE == 3
718 logmant = POLY2(mant, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
719 #else
720 #error
721 #endif
722
 723 /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0 */
724 logmant = _mm_mul_ps(logmant, _mm_sub_ps(mant, one));
725
726 return _mm_add_ps(logmant, exp);
727 }
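/* Worked example: for x == 8.0f the exponent field holds 130, so
 * exp == 130 - 127 == 3 and mant == 1.0.  logmant is multiplied by
 * (mant - one) and so evaluates to exactly 0 at mant == 1, giving
 * log2f4(8.0) == 3.0.
 */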
728
729
730 static INLINE __m128
731 powf4(__m128 x, __m128 y)
732 {
733 return exp2f4(_mm_mul_ps(log2f4(x), y));
734 }
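/* Note: powf4() uses the identity pow(x, y) == 2^(y * log2(x)) and is
 * therefore only meaningful for x > 0: log2f4() simply reads the bit
 * pattern, so the sign bit is ignored and x == 0 yields -127 rather
 * than -inf.
 */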
735
736 #endif /* PIPE_ARCH_SSE */
737
738
739
740 /**
741 * Low-level instruction translators.
742 */
743
744 static void
745 emit_abs(
746 struct x86_function *func,
747 unsigned xmm )
748 {
749 sse_andps(
750 func,
751 make_xmm( xmm ),
752 get_temp(
753 TGSI_EXEC_TEMP_7FFFFFFF_I,
754 TGSI_EXEC_TEMP_7FFFFFFF_C ) );
755 }
756
757 static void
758 emit_add(
759 struct x86_function *func,
760 unsigned xmm_dst,
761 unsigned xmm_src )
762 {
763 sse_addps(
764 func,
765 make_xmm( xmm_dst ),
766 make_xmm( xmm_src ) );
767 }
768
769 static void PIPE_CDECL
770 cos4f(
771 float *store )
772 {
773 store[0] = cosf( store[0] );
774 store[1] = cosf( store[1] );
775 store[2] = cosf( store[2] );
776 store[3] = cosf( store[3] );
777 }
778
779 static void
780 emit_cos(
781 struct x86_function *func,
782 unsigned xmm_save,
783 unsigned xmm_dst )
784 {
785 emit_func_call_dst(
786 func,
787 xmm_save,
788 xmm_dst,
789 cos4f );
790 }
791
792 static void PIPE_CDECL
793 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
794 __attribute__((force_align_arg_pointer))
795 #endif
796 ex24f(
797 float *store )
798 {
799 #if defined(PIPE_ARCH_SSE)
800 _mm_store_ps(&store[0], exp2f4( _mm_load_ps(&store[0]) ));
801 #else
802 store[0] = util_fast_exp2( store[0] );
803 store[1] = util_fast_exp2( store[1] );
804 store[2] = util_fast_exp2( store[2] );
805 store[3] = util_fast_exp2( store[3] );
806 #endif
807 }
808
809 static void
810 emit_ex2(
811 struct x86_function *func,
812 unsigned xmm_save,
813 unsigned xmm_dst )
814 {
815 emit_func_call_dst(
816 func,
817 xmm_save,
818 xmm_dst,
819 ex24f );
820 }
821
822 static void
823 emit_f2it(
824 struct x86_function *func,
825 unsigned xmm )
826 {
827 sse2_cvttps2dq(
828 func,
829 make_xmm( xmm ),
830 make_xmm( xmm ) );
831 }
832
833 static void
834 emit_i2f(
835 struct x86_function *func,
836 unsigned xmm )
837 {
838 sse2_cvtdq2ps(
839 func,
840 make_xmm( xmm ),
841 make_xmm( xmm ) );
842 }
843
844 static void PIPE_CDECL
845 flr4f(
846 float *store )
847 {
848 store[0] = floorf( store[0] );
849 store[1] = floorf( store[1] );
850 store[2] = floorf( store[2] );
851 store[3] = floorf( store[3] );
852 }
853
854 static void
855 emit_flr(
856 struct x86_function *func,
857 unsigned xmm_save,
858 unsigned xmm_dst )
859 {
860 emit_func_call_dst(
861 func,
862 xmm_save,
863 xmm_dst,
864 flr4f );
865 }
866
867 static void PIPE_CDECL
868 frc4f(
869 float *store )
870 {
871 store[0] -= floorf( store[0] );
872 store[1] -= floorf( store[1] );
873 store[2] -= floorf( store[2] );
874 store[3] -= floorf( store[3] );
875 }
876
877 static void
878 emit_frc(
879 struct x86_function *func,
880 unsigned xmm_save,
881 unsigned xmm_dst )
882 {
883 emit_func_call_dst(
884 func,
885 xmm_save,
886 xmm_dst,
887 frc4f );
888 }
889
890 static void PIPE_CDECL
891 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
892 __attribute__((force_align_arg_pointer))
893 #endif
894 lg24f(
895 float *store )
896 {
897 #if defined(PIPE_ARCH_SSE)
898 _mm_store_ps(&store[0], log2f4( _mm_load_ps(&store[0]) ));
899 #else
900 store[0] = util_fast_log2( store[0] );
901 store[1] = util_fast_log2( store[1] );
902 store[2] = util_fast_log2( store[2] );
903 store[3] = util_fast_log2( store[3] );
904 #endif
905 }
906
907 static void
908 emit_lg2(
909 struct x86_function *func,
910 unsigned xmm_save,
911 unsigned xmm_dst )
912 {
913 emit_func_call_dst(
914 func,
915 xmm_save,
916 xmm_dst,
917 lg24f );
918 }
919
920 static void
921 emit_MOV(
922 struct x86_function *func,
923 unsigned xmm_dst,
924 unsigned xmm_src )
925 {
926 sse_movups(
927 func,
928 make_xmm( xmm_dst ),
929 make_xmm( xmm_src ) );
930 }
931
932 static void
933 emit_mul (struct x86_function *func,
934 unsigned xmm_dst,
935 unsigned xmm_src)
936 {
937 sse_mulps(
938 func,
939 make_xmm( xmm_dst ),
940 make_xmm( xmm_src ) );
941 }
942
943 static void
944 emit_neg(
945 struct x86_function *func,
946 unsigned xmm )
947 {
948 sse_xorps(
949 func,
950 make_xmm( xmm ),
951 get_temp(
952 TGSI_EXEC_TEMP_80000000_I,
953 TGSI_EXEC_TEMP_80000000_C ) );
954 }
955
956 static void PIPE_CDECL
957 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
958 __attribute__((force_align_arg_pointer))
959 #endif
960 pow4f(
961 float *store )
962 {
963 #if defined(PIPE_ARCH_SSE)
964 _mm_store_ps(&store[0], powf4( _mm_load_ps(&store[0]), _mm_load_ps(&store[4]) ));
965 #else
966 store[0] = util_fast_pow( store[0], store[4] );
967 store[1] = util_fast_pow( store[1], store[5] );
968 store[2] = util_fast_pow( store[2], store[6] );
969 store[3] = util_fast_pow( store[3], store[7] );
970 #endif
971 }
972
973 static void
974 emit_pow(
975 struct x86_function *func,
976 unsigned xmm_save,
977 unsigned xmm_dst,
978 unsigned xmm_src )
979 {
980 emit_func_call_dst_src(
981 func,
982 xmm_save,
983 xmm_dst,
984 xmm_src,
985 pow4f );
986 }
987
988 static void
989 emit_rcp (
990 struct x86_function *func,
991 unsigned xmm_dst,
992 unsigned xmm_src )
993 {
994 /* On Intel CPUs at least, this is only accurate to 12 bits -- not
995 * good enough. Need to either emit a proper divide or use the
996 * iterative technique described below in emit_rsqrt().
997 */
998 sse2_rcpps(
999 func,
1000 make_xmm( xmm_dst ),
1001 make_xmm( xmm_src ) );
1002 }
1003
1004 static void PIPE_CDECL
1005 rnd4f(
1006 float *store )
1007 {
1008 store[0] = floorf( store[0] + 0.5f );
1009 store[1] = floorf( store[1] + 0.5f );
1010 store[2] = floorf( store[2] + 0.5f );
1011 store[3] = floorf( store[3] + 0.5f );
1012 }
1013
1014 static void
1015 emit_rnd(
1016 struct x86_function *func,
1017 unsigned xmm_save,
1018 unsigned xmm_dst )
1019 {
1020 emit_func_call_dst(
1021 func,
1022 xmm_save,
1023 xmm_dst,
1024 rnd4f );
1025 }
1026
1027 static void
1028 emit_rsqrt(
1029 struct x86_function *func,
1030 unsigned xmm_dst,
1031 unsigned xmm_src )
1032 {
1033 #if HIGH_PRECISION
 1034 /* Although rsqrtps() and rcpps() are low precision on some/all SSE
 1035 * implementations, it is possible to improve their precision at
 1036 * fairly low cost, using a Newton-Raphson step, as below:
 1037 *
 1038 * for rcpps():   x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
 1039 * for rsqrtps(): x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a)) * rsqrtps(a)]
1040 *
1041 * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
1042 */
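   /* Derivation of the rsqrt step used here: with y0 = rsqrtps(a) and
    * f(y) = 1/y^2 - a, Newton-Raphson gives
    *    y1 = y0 * (3 - a * y0^2) / 2,
    * roughly doubling the ~12 accurate bits of the hardware estimate.
    */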
1043 {
1044 struct x86_reg dst = make_xmm( xmm_dst );
1045 struct x86_reg src = make_xmm( xmm_src );
1046 struct x86_reg tmp0 = make_xmm( 2 );
1047 struct x86_reg tmp1 = make_xmm( 3 );
1048
1049 assert( xmm_dst != xmm_src );
1050 assert( xmm_dst != 2 && xmm_dst != 3 );
1051 assert( xmm_src != 2 && xmm_src != 3 );
1052
1053 sse_movaps( func, dst, get_temp( TGSI_EXEC_TEMP_HALF_I, TGSI_EXEC_TEMP_HALF_C ) );
1054 sse_movaps( func, tmp0, get_temp( TGSI_EXEC_TEMP_THREE_I, TGSI_EXEC_TEMP_THREE_C ) );
1055 sse_rsqrtps( func, tmp1, src );
1056 sse_mulps( func, src, tmp1 );
1057 sse_mulps( func, dst, tmp1 );
1058 sse_mulps( func, src, tmp1 );
1059 sse_subps( func, tmp0, src );
1060 sse_mulps( func, dst, tmp0 );
1061 }
1062 #else
1063 /* On Intel CPUs at least, this is only accurate to 12 bits -- not
1064 * good enough.
1065 */
1066 sse_rsqrtps(
1067 func,
1068 make_xmm( xmm_dst ),
1069 make_xmm( xmm_src ) );
1070 #endif
1071 }
1072
1073 static void
1074 emit_setsign(
1075 struct x86_function *func,
1076 unsigned xmm )
1077 {
1078 sse_orps(
1079 func,
1080 make_xmm( xmm ),
1081 get_temp(
1082 TGSI_EXEC_TEMP_80000000_I,
1083 TGSI_EXEC_TEMP_80000000_C ) );
1084 }
1085
1086 static void PIPE_CDECL
1087 sgn4f(
1088 float *store )
1089 {
1090 store[0] = store[0] < 0.0f ? -1.0f : store[0] > 0.0f ? 1.0f : 0.0f;
1091 store[1] = store[1] < 0.0f ? -1.0f : store[1] > 0.0f ? 1.0f : 0.0f;
1092 store[2] = store[2] < 0.0f ? -1.0f : store[2] > 0.0f ? 1.0f : 0.0f;
1093 store[3] = store[3] < 0.0f ? -1.0f : store[3] > 0.0f ? 1.0f : 0.0f;
1094 }
1095
1096 static void
1097 emit_sgn(
1098 struct x86_function *func,
1099 unsigned xmm_save,
1100 unsigned xmm_dst )
1101 {
1102 emit_func_call_dst(
1103 func,
1104 xmm_save,
1105 xmm_dst,
1106 sgn4f );
1107 }
1108
1109 static void PIPE_CDECL
1110 sin4f(
1111 float *store )
1112 {
1113 store[0] = sinf( store[0] );
1114 store[1] = sinf( store[1] );
1115 store[2] = sinf( store[2] );
1116 store[3] = sinf( store[3] );
1117 }
1118
1119 static void
1120 emit_sin (struct x86_function *func,
1121 unsigned xmm_save,
1122 unsigned xmm_dst)
1123 {
1124 emit_func_call_dst(
1125 func,
1126 xmm_save,
1127 xmm_dst,
1128 sin4f );
1129 }
1130
1131 static void
1132 emit_sub(
1133 struct x86_function *func,
1134 unsigned xmm_dst,
1135 unsigned xmm_src )
1136 {
1137 sse_subps(
1138 func,
1139 make_xmm( xmm_dst ),
1140 make_xmm( xmm_src ) );
1141 }
1142
1143 /**
1144 * Register fetch.
1145 */
1146
1147 static void
1148 emit_fetch(
1149 struct x86_function *func,
1150 unsigned xmm,
1151 const struct tgsi_full_src_register *reg,
1152 const unsigned chan_index )
1153 {
1154 unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index );
1155
1156 switch (swizzle) {
1157 case TGSI_EXTSWIZZLE_X:
1158 case TGSI_EXTSWIZZLE_Y:
1159 case TGSI_EXTSWIZZLE_Z:
1160 case TGSI_EXTSWIZZLE_W:
1161 switch (reg->SrcRegister.File) {
1162 case TGSI_FILE_CONSTANT:
1163 emit_const(
1164 func,
1165 xmm,
1166 reg->SrcRegister.Index,
1167 swizzle,
1168 reg->SrcRegister.Indirect,
1169 reg->SrcRegisterInd.File,
1170 reg->SrcRegisterInd.Index );
1171 break;
1172
1173 case TGSI_FILE_IMMEDIATE:
1174 emit_immediate(
1175 func,
1176 xmm,
1177 reg->SrcRegister.Index,
1178 swizzle );
1179 break;
1180
1181 case TGSI_FILE_INPUT:
1182 emit_inputf(
1183 func,
1184 xmm,
1185 reg->SrcRegister.Index,
1186 swizzle );
1187 break;
1188
1189 case TGSI_FILE_TEMPORARY:
1190 emit_tempf(
1191 func,
1192 xmm,
1193 reg->SrcRegister.Index,
1194 swizzle );
1195 break;
1196
1197 default:
1198 assert( 0 );
1199 }
1200 break;
1201
1202 case TGSI_EXTSWIZZLE_ZERO:
1203 emit_tempf(
1204 func,
1205 xmm,
1206 TGSI_EXEC_TEMP_00000000_I,
1207 TGSI_EXEC_TEMP_00000000_C );
1208 break;
1209
1210 case TGSI_EXTSWIZZLE_ONE:
1211 emit_tempf(
1212 func,
1213 xmm,
1214 TEMP_ONE_I,
1215 TEMP_ONE_C );
1216 break;
1217
1218 default:
1219 assert( 0 );
1220 }
1221
1222 switch( tgsi_util_get_full_src_register_sign_mode( reg, chan_index ) ) {
1223 case TGSI_UTIL_SIGN_CLEAR:
1224 emit_abs( func, xmm );
1225 break;
1226
1227 case TGSI_UTIL_SIGN_SET:
1228 emit_setsign( func, xmm );
1229 break;
1230
1231 case TGSI_UTIL_SIGN_TOGGLE:
1232 emit_neg( func, xmm );
1233 break;
1234
1235 case TGSI_UTIL_SIGN_KEEP:
1236 break;
1237 }
1238 }
1239
1240 #define FETCH( FUNC, INST, XMM, INDEX, CHAN )\
1241 emit_fetch( FUNC, XMM, &(INST).FullSrcRegisters[INDEX], CHAN )
1242
1243 /**
1244 * Register store.
1245 */
1246
1247 static void
1248 emit_store(
1249 struct x86_function *func,
1250 unsigned xmm,
1251 const struct tgsi_full_dst_register *reg,
1252 const struct tgsi_full_instruction *inst,
1253 unsigned chan_index )
1254 {
1255 switch( reg->DstRegister.File ) {
1256 case TGSI_FILE_OUTPUT:
1257 emit_output(
1258 func,
1259 xmm,
1260 reg->DstRegister.Index,
1261 chan_index );
1262 break;
1263
1264 case TGSI_FILE_TEMPORARY:
1265 emit_temps(
1266 func,
1267 xmm,
1268 reg->DstRegister.Index,
1269 chan_index );
1270 break;
1271
1272 case TGSI_FILE_ADDRESS:
1273 emit_addrs(
1274 func,
1275 xmm,
1276 reg->DstRegister.Index,
1277 chan_index );
1278 break;
1279
1280 default:
1281 assert( 0 );
1282 }
1283
1284 switch( inst->Instruction.Saturate ) {
1285 case TGSI_SAT_NONE:
1286 break;
1287
 1288 case TGSI_SAT_ZERO_ONE:
 1289 /* XXX: saturation not implemented; values are stored unclamped */
 1290 break;
1291
1292 case TGSI_SAT_MINUS_PLUS_ONE:
1293 assert( 0 );
1294 break;
1295 }
1296 }
1297
1298 #define STORE( FUNC, INST, XMM, INDEX, CHAN )\
1299 emit_store( FUNC, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN )
1300
1301 /**
1302 * High-level instruction translators.
1303 */
1304
1305 static void
1306 emit_kil(
1307 struct x86_function *func,
1308 const struct tgsi_full_src_register *reg )
1309 {
1310 unsigned uniquemask;
 1311 unsigned registers[4];
 unsigned fetchedmask = 0; /* bit per chan_index that actually fetched a register */
1312 unsigned nextregister = 0;
1313 unsigned firstchan = ~0;
1314 unsigned chan_index;
1315
 1316 /* This mask stores swizzle components that were already tested. Note that
 1317 * we test if the value is less than zero, so the constant swizzles ONE and
 1318 * ZERO need not be tested. */
1319 uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE);
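   /* Example: for KIL src.xxww only two fetches are emitted -- chan 0
    * claims swizzle X and chan 2 claims swizzle W; chans 1 and 3 find
    * their swizzle already in uniquemask and are skipped.
    */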
1320
1321 FOR_EACH_CHANNEL( chan_index ) {
1322 unsigned swizzle;
1323
1324 /* unswizzle channel */
1325 swizzle = tgsi_util_get_full_src_register_extswizzle(
1326 reg,
1327 chan_index );
1328
1329 /* check if the component has not been already tested */
1330 if( !(uniquemask & (1 << swizzle)) ) {
 1331 uniquemask |= 1 << swizzle;
 fetchedmask |= 1 << chan_index;
1332
1333 /* allocate register */
1334 registers[chan_index] = nextregister;
1335 emit_fetch(
1336 func,
1337 nextregister,
1338 reg,
1339 chan_index );
1340 nextregister++;
1341
1342 /* mark the first channel used */
1343 if( firstchan == ~0 ) {
1344 firstchan = chan_index;
1345 }
1346 }
1347 }
1348
1349 x86_push(
1350 func,
1351 x86_make_reg( file_REG32, reg_AX ) );
1352 x86_push(
1353 func,
1354 x86_make_reg( file_REG32, reg_DX ) );
1355
1356 FOR_EACH_CHANNEL( chan_index ) {
 1357 if( fetchedmask & (1 << chan_index) ) { /* uniquemask is indexed by swizzle, not chan, and could select an uninitialized registers[] entry */
1358 sse_cmpps(
1359 func,
1360 make_xmm( registers[chan_index] ),
1361 get_temp(
1362 TGSI_EXEC_TEMP_00000000_I,
1363 TGSI_EXEC_TEMP_00000000_C ),
1364 cc_LessThan );
1365
1366 if( chan_index == firstchan ) {
1367 sse_pmovmskb(
1368 func,
1369 x86_make_reg( file_REG32, reg_AX ),
1370 make_xmm( registers[chan_index] ) );
1371 }
1372 else {
1373 sse_pmovmskb(
1374 func,
1375 x86_make_reg( file_REG32, reg_DX ),
1376 make_xmm( registers[chan_index] ) );
1377 x86_or(
1378 func,
1379 x86_make_reg( file_REG32, reg_AX ),
1380 x86_make_reg( file_REG32, reg_DX ) );
1381 }
1382 }
1383 }
1384
1385 x86_or(
1386 func,
1387 get_temp(
1388 TGSI_EXEC_TEMP_KILMASK_I,
1389 TGSI_EXEC_TEMP_KILMASK_C ),
1390 x86_make_reg( file_REG32, reg_AX ) );
1391
1392 x86_pop(
1393 func,
1394 x86_make_reg( file_REG32, reg_DX ) );
1395 x86_pop(
1396 func,
1397 x86_make_reg( file_REG32, reg_AX ) );
1398 }
1399
1400
1401 static void
1402 emit_kilp(
1403 struct x86_function *func )
1404 {
1405 /* XXX todo / fix me */
1406 }
1407
1408
1409 static void
1410 emit_setcc(
1411 struct x86_function *func,
1412 struct tgsi_full_instruction *inst,
1413 enum sse_cc cc )
1414 {
1415 unsigned chan_index;
1416
1417 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1418 FETCH( func, *inst, 0, 0, chan_index );
1419 FETCH( func, *inst, 1, 1, chan_index );
1420 sse_cmpps(
1421 func,
1422 make_xmm( 0 ),
1423 make_xmm( 1 ),
1424 cc );
1425 sse_andps(
1426 func,
1427 make_xmm( 0 ),
1428 get_temp(
1429 TEMP_ONE_I,
1430 TEMP_ONE_C ) );
1431 STORE( func, *inst, 0, 0, chan_index );
1432 }
1433 }
1434
1435 static void
1436 emit_cmp(
1437 struct x86_function *func,
1438 struct tgsi_full_instruction *inst )
1439 {
1440 unsigned chan_index;
1441
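   /* CMP: dst = (src0 < 0) ? src1 : src2, computed branch-free with a
    * mask m = (src0 < 0) as dst = (m & src1) | (~m & src2).
    */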
1442 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1443 FETCH( func, *inst, 0, 0, chan_index );
1444 FETCH( func, *inst, 1, 1, chan_index );
1445 FETCH( func, *inst, 2, 2, chan_index );
1446 sse_cmpps(
1447 func,
1448 make_xmm( 0 ),
1449 get_temp(
1450 TGSI_EXEC_TEMP_00000000_I,
1451 TGSI_EXEC_TEMP_00000000_C ),
1452 cc_LessThan );
1453 sse_andps(
1454 func,
1455 make_xmm( 1 ),
1456 make_xmm( 0 ) );
1457 sse_andnps(
1458 func,
1459 make_xmm( 0 ),
1460 make_xmm( 2 ) );
1461 sse_orps(
1462 func,
1463 make_xmm( 0 ),
1464 make_xmm( 1 ) );
1465 STORE( func, *inst, 0, 0, chan_index );
1466 }
1467 }
1468
1469 static int
1470 emit_instruction(
1471 struct x86_function *func,
1472 struct tgsi_full_instruction *inst )
1473 {
1474 unsigned chan_index;
1475
1476 switch (inst->Instruction.Opcode) {
1477 case TGSI_OPCODE_ARL:
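      /* Note: emit_f2it() uses cvttps2dq, which truncates toward zero;
       * for negative inputs this differs from the floor() behaviour that
       * ARL nominally specifies.
       */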
1478 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1479 FETCH( func, *inst, 0, 0, chan_index );
1480 emit_f2it( func, 0 );
1481 STORE( func, *inst, 0, 0, chan_index );
1482 }
1483 break;
1484
1485 case TGSI_OPCODE_MOV:
1486 case TGSI_OPCODE_SWZ:
1487 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1488 FETCH( func, *inst, 0, 0, chan_index );
1489 STORE( func, *inst, 0, 0, chan_index );
1490 }
1491 break;
1492
1493 case TGSI_OPCODE_LIT:
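      /* LIT computes, per enabled channel (as emitted below):
       *   dst.x = 1
       *   dst.y = max(src.x, 0)
       *   dst.z = (src.x >= 0) ? pow(max(src.y, 0), clamp(src.w, -128, 128)) : 0
       *   dst.w = 1
       */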
1494 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1495 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1496 emit_tempf(
1497 func,
1498 0,
1499 TEMP_ONE_I,
1500 TEMP_ONE_C);
1501 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
1502 STORE( func, *inst, 0, 0, CHAN_X );
1503 }
1504 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1505 STORE( func, *inst, 0, 0, CHAN_W );
1506 }
1507 }
1508 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1509 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1510 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
1511 FETCH( func, *inst, 0, 0, CHAN_X );
1512 sse_maxps(
1513 func,
1514 make_xmm( 0 ),
1515 get_temp(
1516 TGSI_EXEC_TEMP_00000000_I,
1517 TGSI_EXEC_TEMP_00000000_C ) );
1518 STORE( func, *inst, 0, 0, CHAN_Y );
1519 }
1520 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1521 /* XMM[1] = SrcReg[0].yyyy */
1522 FETCH( func, *inst, 1, 0, CHAN_Y );
1523 /* XMM[1] = max(XMM[1], 0) */
1524 sse_maxps(
1525 func,
1526 make_xmm( 1 ),
1527 get_temp(
1528 TGSI_EXEC_TEMP_00000000_I,
1529 TGSI_EXEC_TEMP_00000000_C ) );
1530 /* XMM[2] = SrcReg[0].wwww */
1531 FETCH( func, *inst, 2, 0, CHAN_W );
1532 /* XMM[2] = min(XMM[2], 128.0) */
1533 sse_minps(
1534 func,
1535 make_xmm( 2 ),
1536 get_temp(
1537 TGSI_EXEC_TEMP_128_I,
1538 TGSI_EXEC_TEMP_128_C ) );
1539 /* XMM[2] = max(XMM[2], -128.0) */
1540 sse_maxps(
1541 func,
1542 make_xmm( 2 ),
1543 get_temp(
1544 TGSI_EXEC_TEMP_MINUS_128_I,
1545 TGSI_EXEC_TEMP_MINUS_128_C ) );
1546 emit_pow( func, 3, 1, 2 );
1547 FETCH( func, *inst, 0, 0, CHAN_X );
1548 sse_xorps(
1549 func,
1550 make_xmm( 2 ),
1551 make_xmm( 2 ) );
1552 sse_cmpps(
1553 func,
1554 make_xmm( 2 ),
1555 make_xmm( 0 ),
1556 cc_LessThanEqual );
1557 sse_andps(
1558 func,
1559 make_xmm( 2 ),
1560 make_xmm( 1 ) );
1561 STORE( func, *inst, 2, 0, CHAN_Z );
1562 }
1563 }
1564 break;
1565
1566 case TGSI_OPCODE_RCP:
1567 /* TGSI_OPCODE_RECIP */
1568 FETCH( func, *inst, 0, 0, CHAN_X );
1569 emit_rcp( func, 0, 0 );
1570 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1571 STORE( func, *inst, 0, 0, chan_index );
1572 }
1573 break;
1574
1575 case TGSI_OPCODE_RSQ:
1576 /* TGSI_OPCODE_RECIPSQRT */
1577 FETCH( func, *inst, 0, 0, CHAN_X );
1578 emit_rsqrt( func, 1, 0 );
1579 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1580 STORE( func, *inst, 1, 0, chan_index );
1581 }
1582 break;
1583
1584 case TGSI_OPCODE_EXP:
1585 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1586 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1587 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1588 FETCH( func, *inst, 0, 0, CHAN_X );
1589 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1590 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1591 emit_MOV( func, 1, 0 );
1592 emit_flr( func, 2, 1 );
1593 /* dst.x = ex2(floor(src.x)) */
1594 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
1595 emit_MOV( func, 2, 1 );
1596 emit_ex2( func, 3, 2 );
1597 STORE( func, *inst, 2, 0, CHAN_X );
1598 }
1599 /* dst.y = src.x - floor(src.x) */
1600 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1601 emit_MOV( func, 2, 0 );
1602 emit_sub( func, 2, 1 );
1603 STORE( func, *inst, 2, 0, CHAN_Y );
1604 }
1605 }
1606 /* dst.z = ex2(src.x) */
1607 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1608 emit_ex2( func, 3, 0 );
1609 STORE( func, *inst, 0, 0, CHAN_Z );
1610 }
1611 }
1612 /* dst.w = 1.0 */
1613 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
1614 emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
1615 STORE( func, *inst, 0, 0, CHAN_W );
1616 }
1617 break;
1618
1619 case TGSI_OPCODE_LOG:
1620 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1621 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1622 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1623 FETCH( func, *inst, 0, 0, CHAN_X );
1624 emit_abs( func, 0 );
1625 emit_MOV( func, 1, 0 );
1626 emit_lg2( func, 2, 1 );
1627 /* dst.z = lg2(abs(src.x)) */
1628 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1629 STORE( func, *inst, 1, 0, CHAN_Z );
1630 }
1631 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1632 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1633 emit_flr( func, 2, 1 );
1634 /* dst.x = floor(lg2(abs(src.x))) */
1635 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
1636 STORE( func, *inst, 1, 0, CHAN_X );
1637 }
 1638 /* dst.y = abs(src.x)/ex2(floor(lg2(abs(src.x)))) */
1639 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1640 emit_ex2( func, 2, 1 );
1641 emit_rcp( func, 1, 1 );
1642 emit_mul( func, 0, 1 );
1643 STORE( func, *inst, 0, 0, CHAN_Y );
1644 }
1645 }
1646 }
1647 /* dst.w = 1.0 */
1648 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
1649 emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
1650 STORE( func, *inst, 0, 0, CHAN_W );
1651 }
1652 break;
1653
1654 case TGSI_OPCODE_MUL:
1655 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1656 FETCH( func, *inst, 0, 0, chan_index );
1657 FETCH( func, *inst, 1, 1, chan_index );
1658 emit_mul( func, 0, 1 );
1659 STORE( func, *inst, 0, 0, chan_index );
1660 }
1661 break;
1662
1663 case TGSI_OPCODE_ADD:
1664 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1665 FETCH( func, *inst, 0, 0, chan_index );
1666 FETCH( func, *inst, 1, 1, chan_index );
1667 emit_add( func, 0, 1 );
1668 STORE( func, *inst, 0, 0, chan_index );
1669 }
1670 break;
1671
1672 case TGSI_OPCODE_DP3:
1673 /* TGSI_OPCODE_DOT3 */
1674 FETCH( func, *inst, 0, 0, CHAN_X );
1675 FETCH( func, *inst, 1, 1, CHAN_X );
1676 emit_mul( func, 0, 1 );
1677 FETCH( func, *inst, 1, 0, CHAN_Y );
1678 FETCH( func, *inst, 2, 1, CHAN_Y );
1679 emit_mul( func, 1, 2 );
1680 emit_add( func, 0, 1 );
1681 FETCH( func, *inst, 1, 0, CHAN_Z );
1682 FETCH( func, *inst, 2, 1, CHAN_Z );
1683 emit_mul( func, 1, 2 );
1684 emit_add( func, 0, 1 );
1685 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1686 STORE( func, *inst, 0, 0, chan_index );
1687 }
1688 break;
1689
1690 case TGSI_OPCODE_DP4:
1691 /* TGSI_OPCODE_DOT4 */
1692 FETCH( func, *inst, 0, 0, CHAN_X );
1693 FETCH( func, *inst, 1, 1, CHAN_X );
1694 emit_mul( func, 0, 1 );
1695 FETCH( func, *inst, 1, 0, CHAN_Y );
1696 FETCH( func, *inst, 2, 1, CHAN_Y );
1697 emit_mul( func, 1, 2 );
1698 emit_add( func, 0, 1 );
1699 FETCH( func, *inst, 1, 0, CHAN_Z );
1700 FETCH( func, *inst, 2, 1, CHAN_Z );
1701 emit_mul(func, 1, 2 );
1702 emit_add(func, 0, 1 );
1703 FETCH( func, *inst, 1, 0, CHAN_W );
1704 FETCH( func, *inst, 2, 1, CHAN_W );
1705 emit_mul( func, 1, 2 );
1706 emit_add( func, 0, 1 );
1707 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1708 STORE( func, *inst, 0, 0, chan_index );
1709 }
1710 break;
1711
1712 case TGSI_OPCODE_DST:
1713 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
1714 emit_tempf(
1715 func,
1716 0,
1717 TEMP_ONE_I,
1718 TEMP_ONE_C );
1719 STORE( func, *inst, 0, 0, CHAN_X );
1720 }
1721 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
1722 FETCH( func, *inst, 0, 0, CHAN_Y );
1723 FETCH( func, *inst, 1, 1, CHAN_Y );
1724 emit_mul( func, 0, 1 );
1725 STORE( func, *inst, 0, 0, CHAN_Y );
1726 }
1727 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
1728 FETCH( func, *inst, 0, 0, CHAN_Z );
1729 STORE( func, *inst, 0, 0, CHAN_Z );
1730 }
1731 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
1732 FETCH( func, *inst, 0, 1, CHAN_W );
1733 STORE( func, *inst, 0, 0, CHAN_W );
1734 }
1735 break;
1736
1737 case TGSI_OPCODE_MIN:
1738 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1739 FETCH( func, *inst, 0, 0, chan_index );
1740 FETCH( func, *inst, 1, 1, chan_index );
1741 sse_minps(
1742 func,
1743 make_xmm( 0 ),
1744 make_xmm( 1 ) );
1745 STORE( func, *inst, 0, 0, chan_index );
1746 }
1747 break;
1748
1749 case TGSI_OPCODE_MAX:
1750 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1751 FETCH( func, *inst, 0, 0, chan_index );
1752 FETCH( func, *inst, 1, 1, chan_index );
1753 sse_maxps(
1754 func,
1755 make_xmm( 0 ),
1756 make_xmm( 1 ) );
1757 STORE( func, *inst, 0, 0, chan_index );
1758 }
1759 break;
1760
1761 case TGSI_OPCODE_SLT:
1762 /* TGSI_OPCODE_SETLT */
1763 emit_setcc( func, inst, cc_LessThan );
1764 break;
1765
1766 case TGSI_OPCODE_SGE:
1767 /* TGSI_OPCODE_SETGE */
1768 emit_setcc( func, inst, cc_NotLessThan );
1769 break;
1770
1771 case TGSI_OPCODE_MAD:
1772 /* TGSI_OPCODE_MADD */
1773 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1774 FETCH( func, *inst, 0, 0, chan_index );
1775 FETCH( func, *inst, 1, 1, chan_index );
1776 FETCH( func, *inst, 2, 2, chan_index );
1777 emit_mul( func, 0, 1 );
1778 emit_add( func, 0, 2 );
1779 STORE( func, *inst, 0, 0, chan_index );
1780 }
1781 break;
1782
1783 case TGSI_OPCODE_SUB:
1784 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1785 FETCH( func, *inst, 0, 0, chan_index );
1786 FETCH( func, *inst, 1, 1, chan_index );
1787 emit_sub( func, 0, 1 );
1788 STORE( func, *inst, 0, 0, chan_index );
1789 }
1790 break;
1791
1792 case TGSI_OPCODE_LERP:
1793 /* TGSI_OPCODE_LRP */
1794 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1795 FETCH( func, *inst, 0, 0, chan_index );
1796 FETCH( func, *inst, 1, 1, chan_index );
1797 FETCH( func, *inst, 2, 2, chan_index );
1798 emit_sub( func, 1, 2 );
1799 emit_mul( func, 0, 1 );
1800 emit_add( func, 0, 2 );
1801 STORE( func, *inst, 0, 0, chan_index );
1802 }
1803 break;
1804
1805 case TGSI_OPCODE_CND:
1806 return 0;
1807 break;
1808
1809 case TGSI_OPCODE_CND0:
1810 return 0;
1811 break;
1812
1813 case TGSI_OPCODE_DOT2ADD:
1814 /* TGSI_OPCODE_DP2A */
1815 FETCH( func, *inst, 0, 0, CHAN_X ); /* xmm0 = src[0].x */
1816 FETCH( func, *inst, 1, 1, CHAN_X ); /* xmm1 = src[1].x */
1817 emit_mul( func, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
1818 FETCH( func, *inst, 1, 0, CHAN_Y ); /* xmm1 = src[0].y */
1819 FETCH( func, *inst, 2, 1, CHAN_Y ); /* xmm2 = src[1].y */
1820 emit_mul( func, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
1821 emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
1822 FETCH( func, *inst, 1, 2, CHAN_X ); /* xmm1 = src[2].x */
1823 emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
1824 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1825 STORE( func, *inst, 0, 0, chan_index ); /* dest[ch] = xmm0 */
1826 }
1827 break;
1828
1829 case TGSI_OPCODE_INDEX:
1830 return 0;
1831 break;
1832
1833 case TGSI_OPCODE_NEGATE:
1834 return 0;
1835 break;
1836
1837 case TGSI_OPCODE_FRAC:
1838 /* TGSI_OPCODE_FRC */
1839 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1840 FETCH( func, *inst, 0, 0, chan_index );
1841 emit_frc( func, 0, 0 );
1842 STORE( func, *inst, 0, 0, chan_index );
1843 }
1844 break;
1845
1846 case TGSI_OPCODE_CLAMP:
1847 return 0;
1848 break;
1849
1850 case TGSI_OPCODE_FLOOR:
1851 /* TGSI_OPCODE_FLR */
1852 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1853 FETCH( func, *inst, 0, 0, chan_index );
1854 emit_flr( func, 0, 0 );
1855 STORE( func, *inst, 0, 0, chan_index );
1856 }
1857 break;
1858
1859 case TGSI_OPCODE_ROUND:
1860 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1861 FETCH( func, *inst, 0, 0, chan_index );
1862 emit_rnd( func, 0, 0 );
1863 STORE( func, *inst, 0, 0, chan_index );
1864 }
1865 break;
1866
1867 case TGSI_OPCODE_EXPBASE2:
1868 /* TGSI_OPCODE_EX2 */
1869 FETCH( func, *inst, 0, 0, CHAN_X );
1870 emit_ex2( func, 0, 0 );
1871 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1872 STORE( func, *inst, 0, 0, chan_index );
1873 }
1874 break;
1875
1876 case TGSI_OPCODE_LOGBASE2:
1877 /* TGSI_OPCODE_LG2 */
1878 FETCH( func, *inst, 0, 0, CHAN_X );
1879 emit_lg2( func, 0, 0 );
1880 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1881 STORE( func, *inst, 0, 0, chan_index );
1882 }
1883 break;
1884
1885 case TGSI_OPCODE_POWER:
1886 /* TGSI_OPCODE_POW */
1887 FETCH( func, *inst, 0, 0, CHAN_X );
1888 FETCH( func, *inst, 1, 1, CHAN_X );
1889 emit_pow( func, 0, 0, 1 );
1890 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1891 STORE( func, *inst, 0, 0, chan_index );
1892 }
1893 break;
1894
1895 case TGSI_OPCODE_CROSSPRODUCT:
1896 /* TGSI_OPCODE_XPD */
1897 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1898 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
1899 FETCH( func, *inst, 1, 1, CHAN_Z );
1900 FETCH( func, *inst, 3, 0, CHAN_Z );
1901 }
1902 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1903 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1904 FETCH( func, *inst, 0, 0, CHAN_Y );
1905 FETCH( func, *inst, 4, 1, CHAN_Y );
1906 }
1907 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
1908 emit_MOV( func, 2, 0 );
1909 emit_mul( func, 2, 1 );
1910 emit_MOV( func, 5, 3 );
1911 emit_mul( func, 5, 4 );
1912 emit_sub( func, 2, 5 );
1913 STORE( func, *inst, 2, 0, CHAN_X );
1914 }
1915 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1916 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1917 FETCH( func, *inst, 2, 1, CHAN_X );
1918 FETCH( func, *inst, 5, 0, CHAN_X );
1919 }
1920 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
1921 emit_mul( func, 3, 2 );
1922 emit_mul( func, 1, 5 );
1923 emit_sub( func, 3, 1 );
1924 STORE( func, *inst, 3, 0, CHAN_Y );
1925 }
1926 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
1927 emit_mul( func, 5, 4 );
1928 emit_mul( func, 0, 2 );
1929 emit_sub( func, 5, 0 );
1930 STORE( func, *inst, 5, 0, CHAN_Z );
1931 }
1932 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
1933 emit_tempf(
1934 func,
1935 0,
1936 TEMP_ONE_I,
1937 TEMP_ONE_C );
1938 STORE( func, *inst, 0, 0, CHAN_W );
1939 }
1940 break;
1941
1942 case TGSI_OPCODE_MULTIPLYMATRIX:
1943 return 0;
1944 break;
1945
1946 case TGSI_OPCODE_ABS:
1947 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1948 FETCH( func, *inst, 0, 0, chan_index );
 1949 emit_abs( func, 0 );
1951 STORE( func, *inst, 0, 0, chan_index );
1952 }
1953 break;
1954
1955 case TGSI_OPCODE_RCC:
1956 return 0;
1957 break;
1958
1959 case TGSI_OPCODE_DPH:
1960 FETCH( func, *inst, 0, 0, CHAN_X );
1961 FETCH( func, *inst, 1, 1, CHAN_X );
1962 emit_mul( func, 0, 1 );
1963 FETCH( func, *inst, 1, 0, CHAN_Y );
1964 FETCH( func, *inst, 2, 1, CHAN_Y );
1965 emit_mul( func, 1, 2 );
1966 emit_add( func, 0, 1 );
1967 FETCH( func, *inst, 1, 0, CHAN_Z );
1968 FETCH( func, *inst, 2, 1, CHAN_Z );
1969 emit_mul( func, 1, 2 );
1970 emit_add( func, 0, 1 );
1971 FETCH( func, *inst, 1, 1, CHAN_W );
1972 emit_add( func, 0, 1 );
1973 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1974 STORE( func, *inst, 0, 0, chan_index );
1975 }
1976 break;
1977
1978 case TGSI_OPCODE_COS:
1979 FETCH( func, *inst, 0, 0, CHAN_X );
1980 emit_cos( func, 0, 0 );
1981 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1982 STORE( func, *inst, 0, 0, chan_index );
1983 }
1984 break;
1985
1986 case TGSI_OPCODE_DDX:
1987 return 0;
1988 break;
1989
1990 case TGSI_OPCODE_DDY:
1991 return 0;
1992 break;
1993
1994 case TGSI_OPCODE_KILP:
1995 /* predicated kill */
1996 emit_kilp( func );
1997 return 0; /* XXX fix me */
1998 break;
1999
2000 case TGSI_OPCODE_KIL:
2001 /* conditional kill */
2002 emit_kil( func, &inst->FullSrcRegisters[0] );
2003 break;
2004
2005 case TGSI_OPCODE_PK2H:
2006 return 0;
2007 break;
2008
2009 case TGSI_OPCODE_PK2US:
2010 return 0;
2011 break;
2012
2013 case TGSI_OPCODE_PK4B:
2014 return 0;
2015 break;
2016
2017 case TGSI_OPCODE_PK4UB:
2018 return 0;
2019 break;
2020
2021 case TGSI_OPCODE_RFL:
2022 return 0;
2023 break;
2024
2025 case TGSI_OPCODE_SEQ:
2026 return 0;
2027 break;
2028
2029 case TGSI_OPCODE_SFL:
2030 return 0;
2031 break;
2032
2033 case TGSI_OPCODE_SGT:
2034 return 0;
2035 break;
2036
2037 case TGSI_OPCODE_SIN:
2038 FETCH( func, *inst, 0, 0, CHAN_X );
2039 emit_sin( func, 0, 0 );
2040 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2041 STORE( func, *inst, 0, 0, chan_index );
2042 }
2043 break;
2044
2045 case TGSI_OPCODE_SLE:
2046 return 0;
2047 break;
2048
2049 case TGSI_OPCODE_SNE:
2050 return 0;
2051 break;
2052
2053 case TGSI_OPCODE_STR:
2054 return 0;
2055 break;
2056
2057 case TGSI_OPCODE_TEX:
2058 if (0) {
2059 /* Disable dummy texture code:
2060 */
2061 emit_tempf(
2062 func,
2063 0,
2064 TEMP_ONE_I,
2065 TEMP_ONE_C );
2066 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2067 STORE( func, *inst, 0, 0, chan_index );
2068 }
2069 }
2070 else {
2071 return 0;
2072 }
2073 break;
2074
2075 case TGSI_OPCODE_TXD:
2076 return 0;
2077 break;
2078
2079 case TGSI_OPCODE_UP2H:
2080 return 0;
2081 break;
2082
2083 case TGSI_OPCODE_UP2US:
2084 return 0;
2085 break;
2086
2087 case TGSI_OPCODE_UP4B:
2088 return 0;
2089 break;
2090
2091 case TGSI_OPCODE_UP4UB:
2092 return 0;
2093 break;
2094
2095 case TGSI_OPCODE_X2D:
2096 return 0;
2097 break;
2098
2099 case TGSI_OPCODE_ARA:
2100 return 0;
2101 break;
2102
2103 case TGSI_OPCODE_ARR:
2104 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2105 FETCH( func, *inst, 0, 0, chan_index );
2106 emit_rnd( func, 0, 0 );
2107 emit_f2it( func, 0 );
2108 STORE( func, *inst, 0, 0, chan_index );
2109 }
2110 break;
2111
2112 case TGSI_OPCODE_BRA:
2113 return 0;
2114 break;
2115
2116 case TGSI_OPCODE_CAL:
2117 return 0;
2118 break;
2119
2120 case TGSI_OPCODE_RET:
2121 emit_ret( func );
2122 break;
2123
2124 case TGSI_OPCODE_END:
2125 break;
2126
2127 case TGSI_OPCODE_SSG:
2128 /* TGSI_OPCODE_SGN */
2129 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2130 FETCH( func, *inst, 0, 0, chan_index );
2131 emit_sgn( func, 0, 0 );
2132 STORE( func, *inst, 0, 0, chan_index );
2133 }
2134 break;
2135
2136 case TGSI_OPCODE_CMP:
2137 emit_cmp (func, inst);
2138 break;
2139
2140 case TGSI_OPCODE_SCS:
2141 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
2142 FETCH( func, *inst, 0, 0, CHAN_X );
2143 emit_cos( func, 0, 0 );
2144 STORE( func, *inst, 0, 0, CHAN_X );
2145 }
2146 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2147 FETCH( func, *inst, 0, 0, CHAN_X );
2148 emit_sin( func, 0, 0 );
2149 STORE( func, *inst, 0, 0, CHAN_Y );
2150 }
2151 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2152 emit_tempf(
2153 func,
2154 0,
2155 TGSI_EXEC_TEMP_00000000_I,
2156 TGSI_EXEC_TEMP_00000000_C );
2157 STORE( func, *inst, 0, 0, CHAN_Z );
2158 }
2159 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2160 emit_tempf(
2161 func,
2162 0,
2163 TEMP_ONE_I,
2164 TEMP_ONE_C );
2165 STORE( func, *inst, 0, 0, CHAN_W );
2166 }
2167 break;
2168
2169 case TGSI_OPCODE_TXB:
2170 return 0;
2171 break;
2172
2173 case TGSI_OPCODE_NRM:
2174 /* fall-through */
2175 case TGSI_OPCODE_NRM4:
2176 /* 3 or 4-component normalization */
2177 {
2178 uint dims = (inst->Instruction.Opcode == TGSI_OPCODE_NRM) ? 3 : 4;
2179 /* note: cannot use xmm regs 2/3 here (see emit_rsqrt() above) */
2180 FETCH( func, *inst, 4, 0, CHAN_X ); /* xmm4 = src[0].x */
2181 FETCH( func, *inst, 5, 0, CHAN_Y ); /* xmm5 = src[0].y */
2182 FETCH( func, *inst, 6, 0, CHAN_Z ); /* xmm6 = src[0].z */
2183 if (dims == 4) {
2184 FETCH( func, *inst, 7, 0, CHAN_W ); /* xmm7 = src[0].w */
2185 }
 2186 emit_MOV( func, 0, 4 ); /* xmm0 = xmm4 */
 2187 emit_mul( func, 0, 4 ); /* xmm0 *= xmm4 */
 2188 emit_MOV( func, 1, 5 ); /* xmm1 = xmm5 */
 2189 emit_mul( func, 1, 5 ); /* xmm1 *= xmm5 */
 2190 emit_add( func, 0, 1 ); /* xmm0 += xmm1 */
 2191 emit_MOV( func, 1, 6 ); /* xmm1 = xmm6 */
 2192 emit_mul( func, 1, 6 ); /* xmm1 *= xmm6 */
 2193 emit_add( func, 0, 1 ); /* xmm0 += xmm1 */
 2194 if (dims == 4) {
 2195 emit_MOV( func, 1, 7 ); /* xmm1 = xmm7 */
 2196 emit_mul( func, 1, 7 ); /* xmm1 *= xmm7 */
 2197 emit_add( func, 0, 1 ); /* xmm0 += xmm1 (was emit_add(func, 0, 0), which doubled xmm0 and dropped the w term) */
2198 }
2199 emit_rsqrt( func, 1, 0 ); /* xmm1 = 1/sqrt(xmm0) */
2200 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2201 if (chan_index < dims) {
2202 emit_mul( func, 4+chan_index, 1); /* xmm[4+ch] *= xmm1 */
2203 STORE( func, *inst, 4+chan_index, 0, chan_index );
2204 }
2205 }
2206 }
2207 break;
2208
2209 case TGSI_OPCODE_DIV:
2210 return 0;
2211 break;
2212
2213 case TGSI_OPCODE_DP2:
2214 FETCH( func, *inst, 0, 0, CHAN_X ); /* xmm0 = src[0].x */
2215 FETCH( func, *inst, 1, 1, CHAN_X ); /* xmm1 = src[1].x */
2216 emit_mul( func, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
2217 FETCH( func, *inst, 1, 0, CHAN_Y ); /* xmm1 = src[0].y */
2218 FETCH( func, *inst, 2, 1, CHAN_Y ); /* xmm2 = src[1].y */
2219 emit_mul( func, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
2220 emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2221 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2222 STORE( func, *inst, 0, 0, chan_index ); /* dest[ch] = xmm0 */
2223 }
2224 break;
2225
2226 case TGSI_OPCODE_TXL:
2227 return 0;
2228 break;
2229
2230 case TGSI_OPCODE_BRK:
2231 return 0;
2232 break;
2233
2234 case TGSI_OPCODE_IF:
2235 return 0;
2236 break;
2237
2238 case TGSI_OPCODE_LOOP:
2239 return 0;
2240 break;
2241
2242 case TGSI_OPCODE_REP:
2243 return 0;
2244 break;
2245
2246 case TGSI_OPCODE_ELSE:
2247 return 0;
2248 break;
2249
2250 case TGSI_OPCODE_ENDIF:
2251 return 0;
2252 break;
2253
2254 case TGSI_OPCODE_ENDLOOP:
2255 return 0;
2256 break;
2257
2258 case TGSI_OPCODE_ENDREP:
2259 return 0;
2260 break;
2261
2262 case TGSI_OPCODE_PUSHA:
2263 return 0;
2264 break;
2265
2266 case TGSI_OPCODE_POPA:
2267 return 0;
2268 break;
2269
2270 case TGSI_OPCODE_CEIL:
2271 return 0;
2272 break;
2273
2274 case TGSI_OPCODE_I2F:
2275 return 0;
2276 break;
2277
2278 case TGSI_OPCODE_NOT:
2279 return 0;
2280 break;
2281
2282 case TGSI_OPCODE_TRUNC:
2283 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2284 FETCH( func, *inst, 0, 0, chan_index );
2285 emit_f2it( func, 0 );
2286 emit_i2f( func, 0 );
2287 STORE( func, *inst, 0, 0, chan_index );
2288 }
2289 break;
2290
2291 case TGSI_OPCODE_SHL:
2292 return 0;
2293 break;
2294
2295 case TGSI_OPCODE_SHR:
2296 return 0;
2297 break;
2298
2299 case TGSI_OPCODE_AND:
2300 return 0;
2301 break;
2302
2303 case TGSI_OPCODE_OR:
2304 return 0;
2305 break;
2306
2307 case TGSI_OPCODE_MOD:
2308 return 0;
2309 break;
2310
2311 case TGSI_OPCODE_XOR:
2312 return 0;
2313 break;
2314
2315 case TGSI_OPCODE_SAD:
2316 return 0;
2317 break;
2318
2319 case TGSI_OPCODE_TXF:
2320 return 0;
2321 break;
2322
2323 case TGSI_OPCODE_TXQ:
2324 return 0;
2325 break;
2326
2327 case TGSI_OPCODE_CONT:
2328 return 0;
2329 break;
2330
2331 case TGSI_OPCODE_EMIT:
2332 return 0;
2333 break;
2334
2335 case TGSI_OPCODE_ENDPRIM:
2336 return 0;
2337 break;
2338
2339 default:
2340 return 0;
2341 }
2342
2343 return 1;
2344 }

/**
 * Emit code to compute a fragment shader input, interpolating the
 * plane coefficients (a0, dadx, dady) according to the input's
 * declared interpolation mode.
 */
static void
emit_declaration(
   struct x86_function *func,
   struct tgsi_full_declaration *decl )
{
   if( decl->Declaration.File == TGSI_FILE_INPUT ) {
      unsigned first, last, mask;
      unsigned i, j;

      first = decl->DeclarationRange.First;
      last = decl->DeclarationRange.Last;
      mask = decl->Declaration.UsageMask;

      for( i = first; i <= last; i++ ) {
         for( j = 0; j < NUM_CHANNELS; j++ ) {
            if( mask & (1 << j) ) {
               switch( decl->Declaration.Interpolate ) {
               case TGSI_INTERPOLATE_CONSTANT:
                  emit_coef_a0( func, 0, i, j );
                  emit_inputs( func, 0, i, j );
                  break;

               case TGSI_INTERPOLATE_LINEAR:
                  emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
                  emit_coef_dadx( func, 1, i, j );
                  emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
                  emit_coef_dady( func, 3, i, j );
                  emit_mul( func, 0, 1 );    /* x * dadx */
                  emit_coef_a0( func, 4, i, j );
                  emit_mul( func, 2, 3 );    /* y * dady */
                  emit_add( func, 0, 4 );    /* x * dadx + a0 */
                  emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
                  emit_inputs( func, 0, i, j );
                  break;

               case TGSI_INTERPOLATE_PERSPECTIVE:
                  emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
                  emit_coef_dadx( func, 1, i, j );
                  emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
                  emit_coef_dady( func, 3, i, j );
                  emit_mul( func, 0, 1 );    /* x * dadx */
                  emit_tempf( func, 4, 0, TGSI_SWIZZLE_W );
                  emit_coef_a0( func, 5, i, j );
                  emit_rcp( func, 4, 4 );    /* 1.0 / w */
                  emit_mul( func, 2, 3 );    /* y * dady */
                  emit_add( func, 0, 5 );    /* x * dadx + a0 */
                  emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
                  emit_mul( func, 0, 4 );    /* (x * dadx + y * dady + a0) / w */
                  emit_inputs( func, 0, i, j );
                  break;

               default:
                  assert( 0 );
                  break;
               }
            }
         }
      }
   }
}
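
/* For reference -- illustrative only: the per-channel math the three
 * interpolation modes above compute.  The helper names are ours, not
 * part of the TGSI machine.
 */
#if 0
static float
interp_constant( float a0 )
{
   return a0;
}

static float
interp_linear( float a0, float dadx, float dady, float x, float y )
{
   return a0 + x * dadx + y * dady;
}

static float
interp_perspective( float a0, float dadx, float dady,
                    float x, float y, float w )
{
   return (a0 + x * dadx + y * dady) * (1.0f / w);
}
#endif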

/**
 * Transpose vertex data from AoS (array-of-structures, one xyzw
 * vector per vertex) to the SoA (structure-of-arrays) layout the
 * generated code expects: four x's, four y's, four z's, four w's
 * per attribute.
 */
static void aos_to_soa( struct x86_function *func,
                        uint arg_aos,
                        uint arg_soa,
                        uint arg_num,
                        uint arg_stride )
{
   struct x86_reg soa_input = x86_make_reg( file_REG32, reg_AX );
   struct x86_reg aos_input = x86_make_reg( file_REG32, reg_BX );
   struct x86_reg num_inputs = x86_make_reg( file_REG32, reg_CX );
   struct x86_reg stride = x86_make_reg( file_REG32, reg_DX );
   int inner_loop;

   /* Save EBX */
   x86_push( func, x86_make_reg( file_REG32, reg_BX ) );

   x86_mov( func, aos_input, x86_fn_arg( func, arg_aos ) );
   x86_mov( func, soa_input, x86_fn_arg( func, arg_soa ) );
   x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) );
   x86_mov( func, stride, x86_fn_arg( func, arg_stride ) );

   /* do */
   inner_loop = x86_get_label( func );
   {
      /* Gather one attribute from four vertices:
       * xmm0 = x0 y0 x1 y1, xmm3 = z0 w0 z1 w1,
       * xmm1 = x2 y2 x3 y3, xmm4 = z2 w2 z3 w3
       */
      x86_push( func, aos_input );
      sse_movlps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
      sse_movlps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
      x86_add( func, aos_input, stride );
      sse_movhps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
      sse_movhps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
      x86_add( func, aos_input, stride );
      sse_movlps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
      sse_movlps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
      x86_add( func, aos_input, stride );
      sse_movhps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
      sse_movhps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
      x86_pop( func, aos_input );

      /* Shuffle into per-channel vectors:
       * xmm0 = all x, xmm2 = all y, xmm3 = all z, xmm5 = all w
       */
      sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
      sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
      sse_shufps( func, make_xmm( 0 ), make_xmm( 1 ), 0x88 );
      sse_shufps( func, make_xmm( 2 ), make_xmm( 1 ), 0xdd );
      sse_shufps( func, make_xmm( 3 ), make_xmm( 4 ), 0x88 );
      sse_shufps( func, make_xmm( 5 ), make_xmm( 4 ), 0xdd );

      sse_movups( func, x86_make_disp( soa_input, 0 ), make_xmm( 0 ) );
      sse_movups( func, x86_make_disp( soa_input, 16 ), make_xmm( 2 ) );
      sse_movups( func, x86_make_disp( soa_input, 32 ), make_xmm( 3 ) );
      sse_movups( func, x86_make_disp( soa_input, 48 ), make_xmm( 5 ) );

      /* Advance to next input */
      x86_lea( func, aos_input, x86_make_disp(aos_input, 16) );
      x86_lea( func, soa_input, x86_make_disp(soa_input, 64) );
   }
   /* while --num_inputs */
   x86_dec( func, num_inputs );
   x86_jcc( func, cc_NE, inner_loop );

   /* Restore EBX */
   x86_pop( func, aos_input );
}
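
/* For reference -- illustrative only: the equivalent plain-C transpose
 * for one attribute of four vertices, ignoring the stride walking the
 * generated code performs.
 */
#if 0
static void
aos_to_soa_ref( const float aos[4][4], float soa[4][4] )
{
   unsigned chan, vert;
   for (chan = 0; chan < 4; chan++)
      for (vert = 0; vert < 4; vert++)
         soa[chan][vert] = aos[vert][chan];
}
#endif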

/**
 * The inverse of aos_to_soa(): transpose shader outputs from SoA
 * back to one xyzw vector per vertex.
 */
static void soa_to_aos( struct x86_function *func, uint aos, uint soa, uint num, uint stride )
{
   struct x86_reg soa_output;
   struct x86_reg aos_output;
   struct x86_reg num_outputs;
   struct x86_reg temp;
   int inner_loop;

   soa_output = x86_make_reg( file_REG32, reg_AX );
   aos_output = x86_make_reg( file_REG32, reg_BX );
   num_outputs = x86_make_reg( file_REG32, reg_CX );
   temp = x86_make_reg( file_REG32, reg_DX );

   /* Save EBX */
   x86_push( func, aos_output );

   x86_mov( func, soa_output, x86_fn_arg( func, soa ) );
   x86_mov( func, aos_output, x86_fn_arg( func, aos ) );
   x86_mov( func, num_outputs, x86_fn_arg( func, num ) );

   /* do */
   inner_loop = x86_get_label( func );
   {
      /* xmm0 = all x, xmm1 = all y, xmm3 = all z, xmm4 = all w */
      sse_movups( func, make_xmm( 0 ), x86_make_disp( soa_output, 0 ) );
      sse_movups( func, make_xmm( 1 ), x86_make_disp( soa_output, 16 ) );
      sse_movups( func, make_xmm( 3 ), x86_make_disp( soa_output, 32 ) );
      sse_movups( func, make_xmm( 4 ), x86_make_disp( soa_output, 48 ) );

      /* Interleave back into per-vertex order:
       * xmm0 = x0 y0 x1 y1, xmm2 = x2 y2 x3 y3,
       * xmm3 = z0 w0 z1 w1, xmm5 = z2 w2 z3 w3
       */
      sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
      sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
      sse_unpcklps( func, make_xmm( 0 ), make_xmm( 1 ) );
      sse_unpckhps( func, make_xmm( 2 ), make_xmm( 1 ) );
      sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) );
      sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) );

      /* Scatter one xyzw vector per vertex, stepping by the stride. */
      x86_mov( func, temp, x86_fn_arg( func, stride ) );
      x86_push( func, aos_output );
      sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
      sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
      x86_add( func, aos_output, temp );
      sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
      sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
      x86_add( func, aos_output, temp );
      sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
      sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
      x86_add( func, aos_output, temp );
      sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
      sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
      x86_pop( func, aos_output );

      /* Advance to next output */
      x86_lea( func, aos_output, x86_make_disp(aos_output, 16) );
      x86_lea( func, soa_output, x86_make_disp(soa_output, 64) );
   }
   /* while --num_outputs */
   x86_dec( func, num_outputs );
   x86_jcc( func, cc_NE, inner_loop );

   /* Restore EBX */
   x86_pop( func, aos_output );
}
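
/* For reference -- illustrative only: soa_to_aos() is the exact
 * inverse of the aos_to_soa() transpose above.
 */
#if 0
static void
soa_to_aos_ref( const float soa[4][4], float aos[4][4] )
{
   unsigned chan, vert;
   for (chan = 0; chan < 4; chan++)
      for (vert = 0; vert < 4; vert++)
         aos[vert][chan] = soa[chan][vert];
}
#endif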

/**
 * Translate a TGSI vertex/fragment shader to SSE2 code.
 * Slightly different things are done for vertex vs. fragment shaders.
 *
 * Note that fragment shaders are responsible for interpolating shader
 * inputs.  Because on x86 we have only 4 GP registers free here, and
 * five shader arguments (input, output, const, temp and coef), the
 * code is split into two phases -- a DECLARATION and an INSTRUCTION
 * phase.  The GP register holding the output argument is aliased with
 * the coef argument, as outputs are not needed in the DECLARATION
 * phase.
 *
 * \param tokens  the TGSI input shader
 * \param func  the output SSE code/function
 * \param immediates  buffer to place immediates, later passed to SSE func
 * \param do_swizzles  for vertex shaders, convert inputs from AoS to SoA
 *                     layout before running and outputs back afterwards
 * \return 1 for success, 0 if translation failed
 */
unsigned
tgsi_emit_sse2(
   const struct tgsi_token *tokens,
   struct x86_function *func,
   float (*immediates)[4],
   boolean do_swizzles )
{
   struct tgsi_parse_context parse;
   boolean instruction_phase = FALSE;
   unsigned ok = 1;
   uint num_immediates = 0;

   util_init_math();

   func->csr = func->store;

   tgsi_parse_init( &parse, tokens );

   /* Can't just use EDI, EBX without save/restoring them:
    */
   x86_push(
      func,
      get_immediate_base() );

   x86_push(
      func,
      get_temp_base() );

   /*
    * Different function args for vertex/fragment shaders:
    */
   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
      /* DECLARATION phase, do not load output argument. */
      x86_mov(
         func,
         get_input_base(),
         x86_fn_arg( func, 1 ) );
      /* skipping outputs argument here */
      x86_mov(
         func,
         get_const_base(),
         x86_fn_arg( func, 3 ) );
      x86_mov(
         func,
         get_temp_base(),
         x86_fn_arg( func, 4 ) );
      x86_mov(
         func,
         get_coef_base(),
         x86_fn_arg( func, 5 ) );
      x86_mov(
         func,
         get_immediate_base(),
         x86_fn_arg( func, 6 ) );
   }
   else {
      assert(parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX);

      if (do_swizzles)
         aos_to_soa( func,
                     6,     /* aos_input */
                     1,     /* machine->input */
                     7,     /* num_inputs */
                     8 );   /* input_stride */

      x86_mov(
         func,
         get_input_base(),
         x86_fn_arg( func, 1 ) );
      x86_mov(
         func,
         get_output_base(),
         x86_fn_arg( func, 2 ) );
      x86_mov(
         func,
         get_const_base(),
         x86_fn_arg( func, 3 ) );
      x86_mov(
         func,
         get_temp_base(),
         x86_fn_arg( func, 4 ) );
      x86_mov(
         func,
         get_immediate_base(),
         x86_fn_arg( func, 5 ) );
   }

   while( !tgsi_parse_end_of_tokens( &parse ) && ok ) {
      tgsi_parse_token( &parse );

      switch( parse.FullToken.Token.Type ) {
      case TGSI_TOKEN_TYPE_DECLARATION:
         if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
            emit_declaration(
               func,
               &parse.FullToken.FullDeclaration );
         }
         break;

      case TGSI_TOKEN_TYPE_INSTRUCTION:
         if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
            if( !instruction_phase ) {
               /* INSTRUCTION phase, overwrite coeff with output. */
               instruction_phase = TRUE;
               x86_mov(
                  func,
                  get_output_base(),
                  x86_fn_arg( func, 2 ) );
            }
         }

         ok = emit_instruction(
            func,
            &parse.FullToken.FullInstruction );

         if (!ok) {
            debug_printf("failed to translate tgsi opcode %d to SSE (%s)\n",
                         parse.FullToken.FullInstruction.Instruction.Opcode,
                         parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX ?
                         "vertex shader" : "fragment shader");
         }
         break;

      case TGSI_TOKEN_TYPE_IMMEDIATE:
         /* simply copy the immediate values into the next immediates[] slot */
         {
            const uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
            uint i;
            assert(size <= 4);
            assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES);
            for( i = 0; i < size; i++ ) {
               immediates[num_immediates][i] =
                  parse.FullToken.FullImmediate.u.ImmediateFloat32[i].Float;
            }
#if 0
            debug_printf("SSE FS immediate[%d] = %f %f %f %f\n",
                         num_immediates,
                         immediates[num_immediates][0],
                         immediates[num_immediates][1],
                         immediates[num_immediates][2],
                         immediates[num_immediates][3]);
#endif
            num_immediates++;
         }
         break;

      default:
         ok = 0;
         assert( 0 );
      }
   }

   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
      if (do_swizzles)
         soa_to_aos( func,
                     9,     /* aos_output */
                     2,     /* machine->output */
                     10,    /* num_outputs */
                     11 );  /* output_stride */
   }

   /* Can't just use EBX, EDI without save/restoring them:
    */
   x86_pop(
      func,
      get_temp_base() );

   x86_pop(
      func,
      get_immediate_base() );

   emit_ret( func );

   tgsi_parse_free( &parse );

   return ok;
}
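
/* For reference -- a minimal sketch, illustrative only, of driving this
 * translator, assuming the rtasm x86_init_func()/x86_get_func()/
 * x86_release_func() entry points:
 */
#if 0
static void
example_translate( const struct tgsi_token *tokens )
{
   struct x86_function func;
   float immediates[TGSI_EXEC_NUM_IMMEDIATES][4];

   x86_init_func( &func );
   if (tgsi_emit_sse2( tokens, &func, immediates, TRUE )) {
      /* cast x86_get_func( &func ) to the expected shader
       * signature and call it with the machine's register file
       */
   }
   x86_release_func( &func );
}
#endif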

#endif /* PIPE_ARCH_X86 */