3ce2c1c27bdf9885ff1f4c2d75ae2f42be710e51
[mesa.git] / src / gallium / auxiliary / tgsi / tgsi_sse2.c
1 /**************************************************************************
2 *
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28 #include "pipe/p_config.h"
29
30 #if defined(PIPE_ARCH_X86) && defined(PIPE_ARCH_SSE)
31
32 #include "pipe/p_debug.h"
33 #include "pipe/p_shader_tokens.h"
34 #include "util/u_math.h"
35 #include "util/u_sse.h"
36 #include "tgsi/tgsi_parse.h"
37 #include "tgsi/tgsi_util.h"
38 #include "tgsi_exec.h"
39 #include "tgsi_sse2.h"
40
41 #include "rtasm/rtasm_x86sse.h"
42
43 /* for 1/sqrt()
44 *
45 * This costs about 100fps (close to 10%) in gears:
46 */
47 #define HIGH_PRECISION 1
48
49 #define FAST_MATH 1
50
51
/* Iterate CHAN over all four SOA channels (x, y, z, w). */
#define FOR_EACH_CHANNEL( CHAN )\
   for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)

/* Non-zero if channel CHAN is set in dst register 0's writemask. */
#define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
   ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))

#define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
   if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))

/* Iterate CHAN over only the channels enabled in dst register 0's writemask. */
#define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
   FOR_EACH_CHANNEL( CHAN )\
   IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )

#define CHAN_X  0
#define CHAN_Y  1
#define CHAN_Z  2
#define CHAN_W  3

/* Shorthands for well-known slots in the tgsi_exec temporary file. */
#define TEMP_ONE_I   TGSI_EXEC_TEMP_ONE_I
#define TEMP_ONE_C   TGSI_EXEC_TEMP_ONE_C

#define TEMP_R0   TGSI_EXEC_TEMP_R0
#define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
#define TEMP_EXEC_MASK_I TGSI_EXEC_MASK_I
#define TEMP_EXEC_MASK_C TGSI_EXEC_MASK_C
78
79 /**
80 * X86 utility functions.
81 */
82
/** Wrap an xmm register index as an x86_reg operand. */
static struct x86_reg
make_xmm(
   unsigned xmm )
{
   return x86_make_reg(
      file_XMM,
      (enum x86_reg_name) xmm );
}

/**
 * X86 register mapping helpers.
 *
 * The generated code addresses each register file through a fixed GP
 * register (set up by the caller of the generated function).
 */

/** ECX points at the constant buffer. */
static struct x86_reg
get_const_base( void )
{
   return x86_make_reg(
      file_REG32,
      reg_CX );
}

/** EAX points at the shader inputs. */
static struct x86_reg
get_input_base( void )
{
   return x86_make_reg(
      file_REG32,
      reg_AX );
}

/** EDX points at the shader outputs. */
static struct x86_reg
get_output_base( void )
{
   return x86_make_reg(
      file_REG32,
      reg_DX );
}

/** EBX points at the temporary registers. */
static struct x86_reg
get_temp_base( void )
{
   return x86_make_reg(
      file_REG32,
      reg_BX );
}

/** Interpolation coefficients share the output base register (EDX). */
static struct x86_reg
get_coef_base( void )
{
   return get_output_base();
}

/** EDI points at the immediates. */
static struct x86_reg
get_immediate_base( void )
{
   return x86_make_reg(
      file_REG32,
      reg_DI );
}
141
142
143 /**
144 * Data access helpers.
145 */
146
147
/** Address of immediate [vec] channel [chan] -- one float (4 bytes) per channel. */
static struct x86_reg
get_immediate(
   unsigned vec,
   unsigned chan )
{
   return x86_make_disp(
      get_immediate_base(),
      (vec * 4 + chan) * 4 );
}

/** Address of constant [vec] channel [chan] -- one float (4 bytes) per channel. */
static struct x86_reg
get_const(
   unsigned vec,
   unsigned chan )
{
   return x86_make_disp(
      get_const_base(),
      (vec * 4 + chan) * 4 );
}

/** Address of input [vec] channel [chan] -- SOA quad: 16 bytes per channel. */
static struct x86_reg
get_input(
   unsigned vec,
   unsigned chan )
{
   return x86_make_disp(
      get_input_base(),
      (vec * 4 + chan) * 16 );
}

/** Address of output [vec] channel [chan] -- SOA quad: 16 bytes per channel. */
static struct x86_reg
get_output(
   unsigned vec,
   unsigned chan )
{
   return x86_make_disp(
      get_output_base(),
      (vec * 4 + chan) * 16 );
}

/** Address of temporary [vec] channel [chan] -- SOA quad: 16 bytes per channel. */
static struct x86_reg
get_temp(
   unsigned vec,
   unsigned chan )
{
   return x86_make_disp(
      get_temp_base(),
      (vec * 4 + chan) * 16 );
}

/** Address of coefficient: member 0=a0, 1=dadx, 2=dady; one float per channel. */
static struct x86_reg
get_coef(
   unsigned vec,
   unsigned chan,
   unsigned member )
{
   return x86_make_disp(
      get_coef_base(),
      ((vec * 3 + member) * 4 + chan) * 4 );
}


/** Emit a function return instruction. */
static void
emit_ret(
   struct x86_function *func )
{
   x86_ret( func );
}
216
217
218 /**
219 * Data fetch helpers.
220 */
221
/**
 * Copy a shader constant to xmm register
 * \param xmm the destination xmm register
 * \param vec the src const buffer index
 * \param chan src channel to fetch (X, Y, Z or W)
 * \param indirect non-zero for indirect addressing, i.e. CONST[ADDR+vec]
 * \param indirectFile register file of the indirect index (must be ADDRESS)
 * \param indirectIndex register index of the indirect index (must be 0)
 */
static void
emit_const(
   struct x86_function *func,
   uint xmm,
   int vec,
   uint chan,
   uint indirect,
   uint indirectFile,
   int indirectIndex )
{
   if (indirect) {
      /* 'vec' is the offset from the address register's value.
       * We're loading CONST[ADDR+vec] into an xmm register.
       */
      struct x86_reg r0 = get_input_base();
      struct x86_reg r1 = get_output_base();
      uint i;

      assert( indirectFile == TGSI_FILE_ADDRESS );
      assert( indirectIndex == 0 );

      /* Borrow the input/output base registers as scratch; restored below. */
      x86_push( func, r0 );
      x86_push( func, r1 );

      /*
       * Loop over the four pixels or vertices in the quad.
       * Get the value of the address (offset) register for pixel/vertex[i],
       * add it to the src offset and index into the constant buffer.
       * Note that we're working on SOA data.
       * If any of the pixel/vertex execution channels are unused their
       * values will be garbage. It's very important that we don't use
       * those garbage values as indexes into the constant buffer since
       * that'll cause segfaults.
       * The solution is to bitwise-AND the offset with the execution mask
       * register whose values are either 0 or ~0.
       * The caller must setup the execution mask register to indicate
       * which channels are valid/alive before running the shader.
       * The execution mask will also figure into loops and conditionals
       * someday.
       */
      for (i = 0; i < QUAD_SIZE; i++) {
         /* r1 = address register[i] */
         x86_mov( func, r1, x86_make_disp( get_temp( TEMP_ADDR, CHAN_X ), i * 4 ) );
         /* r0 = execution mask[i] */
         x86_mov( func, r0, x86_make_disp( get_temp( TEMP_EXEC_MASK_I, TEMP_EXEC_MASK_C ), i * 4 ) );
         /* r1 = r1 & r0 */
         x86_and( func, r1, r0 );
         /* r0 = 'vec', the offset */
         x86_lea( func, r0, get_const( vec, chan ) );

         /* Quick hack to multiply r1 by 16 -- need to add SHL to rtasm.
          */
         x86_add( func, r1, r1 );
         x86_add( func, r1, r1 );
         x86_add( func, r1, r1 );
         x86_add( func, r1, r1 );

         x86_add( func, r0, r1 );  /* r0 = r0 + r1 */
         x86_mov( func, r1, x86_deref( r0 ) );
         x86_mov( func, x86_make_disp( get_temp( TEMP_R0, CHAN_X ), i * 4 ), r1 );
      }

      x86_pop( func, r1 );
      x86_pop( func, r0 );

      /* The four gathered floats now sit in TEMP_R0.x; load them. */
      sse_movaps(
         func,
         make_xmm( xmm ),
         get_temp( TEMP_R0, CHAN_X ) );
   }
   else {
      /* 'vec' is the index into the src register file, such as TEMP[vec] */
      assert( vec >= 0 );

      /* Load the scalar, then broadcast it across all four lanes. */
      sse_movss(
         func,
         make_xmm( xmm ),
         get_const( vec, chan ) );
      sse_shufps(
         func,
         make_xmm( xmm ),
         make_xmm( xmm ),
         SHUF( 0, 0, 0, 0 ) );
   }
}
313
/**
 * Copy a shader immediate to an xmm register, broadcast to all four lanes.
 * \param xmm the destination xmm register
 * \param vec the src immediate index
 * \param chan src channel to fetch (X, Y, Z or W)
 */
static void
emit_immediate(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   sse_movss(
      func,
      make_xmm( xmm ),
      get_immediate( vec, chan ) );
   sse_shufps(
      func,
      make_xmm( xmm ),
      make_xmm( xmm ),
      SHUF( 0, 0, 0, 0 ) );
}


/**
 * Copy a shader input to xmm register
 * \param xmm the destination xmm register
 * \param vec the src input attrib
 * \param chan src channel to fetch (X, Y, Z or W)
 */
static void
emit_inputf(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   /* movups: unaligned load of a full SOA quad (4 floats). */
   sse_movups(
      func,
      make_xmm( xmm ),
      get_input( vec, chan ) );
}

/**
 * Store an xmm register to a shader output
 * \param xmm the source xmm register
 * \param vec the dest output attrib
 * \param chan src dest channel to store (X, Y, Z or W)
 */
static void
emit_output(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   /* movups: unaligned store of a full SOA quad (4 floats). */
   sse_movups(
      func,
      get_output( vec, chan ),
      make_xmm( xmm ) );
}

/**
 * Copy a shader temporary to xmm register
 * \param xmm the destination xmm register
 * \param vec the src temp register
 * \param chan src channel to fetch (X, Y, Z or W)
 */
static void
emit_tempf(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   /* movaps: the temporary file is 16-byte aligned. */
   sse_movaps(
      func,
      make_xmm( xmm ),
      get_temp( vec, chan ) );
}

/**
 * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
 * \param xmm the destination xmm register
 * \param vec the src input/attribute coefficient index
 * \param chan src channel to fetch (X, Y, Z or W)
 * \param member 0=a0, 1=dadx, 2=dady
 */
static void
emit_coef(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan,
   unsigned member )
{
   /* Load the scalar coefficient and broadcast it across the quad. */
   sse_movss(
      func,
      make_xmm( xmm ),
      get_coef( vec, chan, member ) );
   sse_shufps(
      func,
      make_xmm( xmm ),
      make_xmm( xmm ),
      SHUF( 0, 0, 0, 0 ) );
}
415
416 /**
417 * Data store helpers.
418 */
419
/** Store an xmm register to shader input [vec].chan (unaligned store). */
static void
emit_inputs(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   sse_movups(
      func,
      get_input( vec, chan ),
      make_xmm( xmm ) );
}

/** Store an xmm register to shader temporary [vec].chan (aligned store). */
static void
emit_temps(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   sse_movaps(
      func,
      get_temp( vec, chan ),
      make_xmm( xmm ) );
}

/** Store an xmm register to the address register (kept in the temp file). */
static void
emit_addrs(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   /* Only one address register is supported. */
   assert( vec == 0 );

   emit_temps(
      func,
      xmm,
      vec + TGSI_EXEC_TEMP_ADDR,
      chan );
}

/**
 * Coefficient fetch helpers.
 */

/** Load xmm with the a0 (constant) coefficient of input [vec].chan. */
static void
emit_coef_a0(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   emit_coef(
      func,
      xmm,
      vec,
      chan,
      0 );
}

/** Load xmm with the dadx (x-derivative) coefficient of input [vec].chan. */
static void
emit_coef_dadx(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   emit_coef(
      func,
      xmm,
      vec,
      chan,
      1 );
}

/** Load xmm with the dady (y-derivative) coefficient of input [vec].chan. */
static void
emit_coef_dady(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   emit_coef(
      func,
      xmm,
      vec,
      chan,
      2 );
}
510
511 /**
512 * Function call helpers.
513 */
514
515 /**
516 * NOTE: In gcc, if the destination uses the SSE intrinsics, then it must be
517 * defined with __attribute__((force_align_arg_pointer)), as we do not guarantee
518 * that the stack pointer is 16 byte aligned, as expected.
519 */
520 static void
521 emit_func_call_dst(
522 struct x86_function *func,
523 unsigned xmm_save,
524 unsigned xmm_dst,
525 void (PIPE_CDECL *code)() )
526 {
527 struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
528 unsigned i, n, xmm;
529 unsigned xmm_mask;
530
531 /* Bitmask of the xmm registers to save */
532 xmm_mask = (1 << xmm_save) - 1;
533 xmm_mask &= ~(1 << xmm_dst);
534
535 sse_movaps(
536 func,
537 get_temp( TEMP_R0, 0 ),
538 make_xmm( xmm_dst ) );
539
540 x86_push(
541 func,
542 x86_make_reg( file_REG32, reg_AX) );
543 x86_push(
544 func,
545 x86_make_reg( file_REG32, reg_CX) );
546 x86_push(
547 func,
548 x86_make_reg( file_REG32, reg_DX) );
549
550 for(i = 0, n = 0; i < 8; ++i)
551 if(xmm_mask & (1 << i))
552 ++n;
553
554 x86_sub_imm(
555 func,
556 x86_make_reg( file_REG32, reg_SP ),
557 n*16);
558
559 for(i = 0, n = 0; i < 8; ++i)
560 if(xmm_mask & (1 << i)) {
561 sse_movups(
562 func,
563 x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ),
564 make_xmm( xmm ) );
565 ++n;
566 }
567
568 x86_lea(
569 func,
570 ecx,
571 get_temp( TEMP_R0, 0 ) );
572
573 x86_push( func, ecx );
574 x86_mov_reg_imm( func, ecx, (unsigned long) code );
575 x86_call( func, ecx );
576 x86_pop(func, ecx );
577
578 for(i = 0, n = 0; i < 8; ++i)
579 if(xmm_mask & (1 << i)) {
580 sse_movups(
581 func,
582 make_xmm( xmm ),
583 x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ) );
584 ++n;
585 }
586
587 x86_add_imm(
588 func,
589 x86_make_reg( file_REG32, reg_SP ),
590 n*16);
591
592 /* Restore GP registers in a reverse order.
593 */
594 x86_pop(
595 func,
596 x86_make_reg( file_REG32, reg_DX) );
597 x86_pop(
598 func,
599 x86_make_reg( file_REG32, reg_CX) );
600 x86_pop(
601 func,
602 x86_make_reg( file_REG32, reg_AX) );
603
604 sse_movaps(
605 func,
606 make_xmm( xmm_dst ),
607 get_temp( TEMP_R0, 0 ) );
608 }
609
/**
 * Like emit_func_call_dst(), but for two-operand helpers: the second
 * operand is spilled to TEMP_R0 channel 1 first, where the helper
 * finds it at store[4..7] (cf. pow4f below).
 */
static void
emit_func_call_dst_src(
   struct x86_function *func,
   unsigned xmm_save,
   unsigned xmm_dst,
   unsigned xmm_src,
   void (PIPE_CDECL *code)() )
{
   sse_movaps(
      func,
      get_temp( TEMP_R0, 1 ),
      make_xmm( xmm_src ) );

   emit_func_call_dst(
      func,
      xmm_save,
      xmm_dst,
      code );
}
629
/*
 * Fast SSE2 implementation of special math functions.
 */

/* POLYn(x, c0..cn): evaluate a degree-n polynomial in packed float x
 * via Horner's scheme, broadcasting each scalar coefficient.
 */
#define POLY0(x, c0) _mm_set1_ps(c0)
#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))

/* Polynomial degrees for the exp2/log2 approximations below:
 * higher degree = better accuracy at the cost of extra multiplies.
 */
#define EXP_POLY_DEGREE 3
#define LOG_POLY_DEGREE 5
643
/**
 * Approximate 2^x for four packed floats.
 *
 * Splits x into integer and fractional parts; the integer part is handled
 * exactly by constructing the float exponent bits, the fractional part by
 * a minimax polynomial of degree EXP_POLY_DEGREE.
 *
 * See http://www.devmaster.net/forums/showthread.php?p=43580
 */
static INLINE __m128
exp2f4(__m128 x)
{
   __m128i ipart;
   __m128 fpart, expipart, expfpart;

   /* Clamp x so the biased exponent constructed below stays in range. */
   x = _mm_min_ps(x, _mm_set1_ps( 129.00000f));
   x = _mm_max_ps(x, _mm_set1_ps(-126.99999f));

   /* ipart = int(x - 0.5) */
   ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f)));

   /* fpart = x - ipart */
   fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart));

   /* expipart = (float) (1 << ipart) */
   expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23));

   /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
#if EXP_POLY_DEGREE == 5
   expfpart = POLY5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f);
#elif EXP_POLY_DEGREE == 4
   expfpart = POLY4(fpart, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 5.2011464e-2f, 1.3534167e-2f);
#elif EXP_POLY_DEGREE == 3
   expfpart = POLY3(fpart, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 7.8024521e-2f);
#elif EXP_POLY_DEGREE == 2
   expfpart = POLY2(fpart, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f);
#else
#error
#endif

   /* 2^x = 2^ipart * 2^fpart */
   return _mm_mul_ps(expipart, expfpart);
}
680
/**
 * Approximate log2(x) for four packed floats.
 *
 * Extracts the float exponent directly from the IEEE-754 bits and fits the
 * mantissa in [1, 2[ with a minimax polynomial of degree LOG_POLY_DEGREE.
 *
 * See http://www.devmaster.net/forums/showthread.php?p=43580
 */
static INLINE __m128
log2f4(__m128 x)
{
   __m128i expmask = _mm_set1_epi32(0x7f800000);
   __m128i mantmask = _mm_set1_epi32(0x007fffff);
   __m128 one = _mm_set1_ps(1.0f);

   __m128i i = _mm_castps_si128(x);

   /* exp = (float) exponent(x) */
   __m128 exp = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, expmask), 23), _mm_set1_epi32(127)));

   /* mant = (float) mantissa(x) */
   __m128 mant = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, mantmask)), one);

   __m128 logmant;

   /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
    * These coefficients can be generate with
    * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
    */
#if LOG_POLY_DEGREE == 6
   logmant = POLY5(mant, 3.11578814719469302614f, -3.32419399085241980044f, 2.59883907202499966007f, -1.23152682416275988241f, 0.318212422185251071475f, -0.0344359067839062357313f);
#elif LOG_POLY_DEGREE == 5
   logmant = POLY4(mant, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
   logmant = POLY3(mant, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
   logmant = POLY2(mant, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
#else
#error
#endif

   /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
   logmant = _mm_mul_ps(logmant, _mm_sub_ps(mant, one));

   return _mm_add_ps(logmant, exp);
}
722
/** Approximate x^y for packed floats via x^y = 2^(y * log2(x)). */
static INLINE __m128
powf4(__m128 x, __m128 y)
{
   return exp2f4(_mm_mul_ps(log2f4(x), y));
}
728
729
730 /**
731 * Low-level instruction translators.
732 */
733
/** Emit |xmm|: clear the sign bit by ANDing with 0x7fffffff. */
static void
emit_abs(
   struct x86_function *func,
   unsigned xmm )
{
   sse_andps(
      func,
      make_xmm( xmm ),
      get_temp(
         TGSI_EXEC_TEMP_7FFFFFFF_I,
         TGSI_EXEC_TEMP_7FFFFFFF_C ) );
}

/** Emit xmm_dst += xmm_src (packed float add). */
static void
emit_add(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   sse_addps(
      func,
      make_xmm( xmm_dst ),
      make_xmm( xmm_src ) );
}
758
759 static void PIPE_CDECL
760 cos4f(
761 float *store )
762 {
763 store[0] = cosf( store[0] );
764 store[1] = cosf( store[1] );
765 store[2] = cosf( store[2] );
766 store[3] = cosf( store[3] );
767 }
768
/** Emit a call to cos4f() on xmm_dst. */
static void
emit_cos(
   struct x86_function *func,
   unsigned xmm_save,
   unsigned xmm_dst )
{
   emit_func_call_dst(
      func,
      xmm_save,
      xmm_dst,
      cos4f );
}

/** Helper called from generated code: 2^x of each float in the quad, in place. */
static void PIPE_CDECL
#if defined(PIPE_CC_GCC)
__attribute__((force_align_arg_pointer))
#endif
ex24f(
   float *store )
{
   _mm_store_ps(&store[0], exp2f4( _mm_load_ps(&store[0]) ));
}

/** Emit a call to ex24f() on xmm_dst. */
static void
emit_ex2(
   struct x86_function *func,
   unsigned xmm_save,
   unsigned xmm_dst )
{
   emit_func_call_dst(
      func,
      xmm_save,
      xmm_dst,
      ex24f );
}
804
/** Emit float-to-int conversion of xmm (truncate toward zero). */
static void
emit_f2it(
   struct x86_function *func,
   unsigned xmm )
{
   sse2_cvttps2dq(
      func,
      make_xmm( xmm ),
      make_xmm( xmm ) );
}

/** Emit int-to-float conversion of xmm. */
static void
emit_i2f(
   struct x86_function *func,
   unsigned xmm )
{
   sse2_cvtdq2ps(
      func,
      make_xmm( xmm ),
      make_xmm( xmm ) );
}

/** Helper called from generated code: floor of each float in the quad, in place. */
static void PIPE_CDECL
flr4f(
   float *store )
{
   store[0] = floorf( store[0] );
   store[1] = floorf( store[1] );
   store[2] = floorf( store[2] );
   store[3] = floorf( store[3] );
}

/** Emit a call to flr4f() on xmm_dst. */
static void
emit_flr(
   struct x86_function *func,
   unsigned xmm_save,
   unsigned xmm_dst )
{
   emit_func_call_dst(
      func,
      xmm_save,
      xmm_dst,
      flr4f );
}
849
850 static void PIPE_CDECL
851 frc4f(
852 float *store )
853 {
854 store[0] -= floorf( store[0] );
855 store[1] -= floorf( store[1] );
856 store[2] -= floorf( store[2] );
857 store[3] -= floorf( store[3] );
858 }
859
/** Emit a call to frc4f() on xmm_dst. */
static void
emit_frc(
   struct x86_function *func,
   unsigned xmm_save,
   unsigned xmm_dst )
{
   emit_func_call_dst(
      func,
      xmm_save,
      xmm_dst,
      frc4f );
}

/** Helper called from generated code: log2 of each float in the quad, in place. */
static void PIPE_CDECL
#if defined(PIPE_CC_GCC)
__attribute__((force_align_arg_pointer))
#endif
lg24f(
   float *store )
{
   _mm_store_ps(&store[0], log2f4( _mm_load_ps(&store[0]) ));
}

/** Emit a call to lg24f() on xmm_dst. */
static void
emit_lg2(
   struct x86_function *func,
   unsigned xmm_save,
   unsigned xmm_dst )
{
   emit_func_call_dst(
      func,
      xmm_save,
      xmm_dst,
      lg24f );
}
895
/** Emit a register-to-register move: xmm_dst = xmm_src. */
static void
emit_MOV(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   sse_movups(
      func,
      make_xmm( xmm_dst ),
      make_xmm( xmm_src ) );
}

/** Emit xmm_dst *= xmm_src (packed float multiply). */
static void
emit_mul (struct x86_function *func,
          unsigned xmm_dst,
          unsigned xmm_src)
{
   sse_mulps(
      func,
      make_xmm( xmm_dst ),
      make_xmm( xmm_src ) );
}

/** Emit -xmm: flip the sign bit by XORing with 0x80000000. */
static void
emit_neg(
   struct x86_function *func,
   unsigned xmm )
{
   sse_xorps(
      func,
      make_xmm( xmm ),
      get_temp(
         TGSI_EXEC_TEMP_80000000_I,
         TGSI_EXEC_TEMP_80000000_C ) );
}
931
/** Helper called from generated code: store[0..3] = store[0..3] ^ store[4..7]. */
static void PIPE_CDECL
#if defined(PIPE_CC_GCC)
__attribute__((force_align_arg_pointer))
#endif
pow4f(
   float *store )
{
#if 1
   /* SSE approximation -- see powf4()/exp2f4()/log2f4() above. */
   _mm_store_ps(&store[0], powf4( _mm_load_ps(&store[0]), _mm_load_ps(&store[4]) ));
#else
   /* Exact scalar fallback. */
   store[0] = powf( store[0], store[4] );
   store[1] = powf( store[1], store[5] );
   store[2] = powf( store[2], store[6] );
   store[3] = powf( store[3], store[7] );
#endif
}

/** Emit a call to pow4f(): xmm_dst = xmm_dst ^ xmm_src. */
static void
emit_pow(
   struct x86_function *func,
   unsigned xmm_save,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   emit_func_call_dst_src(
      func,
      xmm_save,
      xmm_dst,
      xmm_src,
      pow4f );
}
963
/** Emit xmm_dst = approximate 1/xmm_src. */
static void
emit_rcp (
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   /* On Intel CPUs at least, this is only accurate to 12 bits -- not
    * good enough.  Need to either emit a proper divide or use the
    * iterative technique described below in emit_rsqrt().
    */
   sse2_rcpps(
      func,
      make_xmm( xmm_dst ),
      make_xmm( xmm_src ) );
}

/** Emit xmm_dst = 1/sqrt(xmm_src).  Clobbers xmm_src and xmm 2/3. */
static void
emit_rsqrt(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
#if HIGH_PRECISION
   /* Although rsqrtps() and rcpps() are low precision on some/all SSE
    * implementations, it is possible to improve its precision at
    * fairly low cost, using a newton/raphson step, as below:
    *
    * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
    * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
    *
    * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
    */
   {
      struct x86_reg dst = make_xmm( xmm_dst );
      struct x86_reg src = make_xmm( xmm_src );
      struct x86_reg tmp0 = make_xmm( 2 );
      struct x86_reg tmp1 = make_xmm( 3 );

      /* xmm 2 and 3 are used as scratch; callers must keep them free. */
      assert( xmm_dst != xmm_src );
      assert( xmm_dst != 2 && xmm_dst != 3 );
      assert( xmm_src != 2 && xmm_src != 3 );

      sse_movaps(  func, dst,  get_temp( TGSI_EXEC_TEMP_HALF_I, TGSI_EXEC_TEMP_HALF_C ) );
      sse_movaps(  func, tmp0, get_temp( TGSI_EXEC_TEMP_THREE_I, TGSI_EXEC_TEMP_THREE_C ) );
      sse_rsqrtps( func, tmp1, src  );
      sse_mulps(   func, src,  tmp1 );
      sse_mulps(   func, dst,  tmp1 );
      sse_mulps(   func, src,  tmp1 );
      sse_subps(   func, tmp0, src  );
      sse_mulps(   func, dst,  tmp0 );
   }
#else
   /* On Intel CPUs at least, this is only accurate to 12 bits -- not
    * good enough.
    */
   sse_rsqrtps(
      func,
      make_xmm( xmm_dst ),
      make_xmm( xmm_src ) );
#endif
}
1025
/** Emit -|xmm|: force the sign bit on by ORing with 0x80000000. */
static void
emit_setsign(
   struct x86_function *func,
   unsigned xmm )
{
   sse_orps(
      func,
      make_xmm( xmm ),
      get_temp(
         TGSI_EXEC_TEMP_80000000_I,
         TGSI_EXEC_TEMP_80000000_C ) );
}

/** Helper called from generated code: sine of each float in the quad, in place. */
static void PIPE_CDECL
sin4f(
   float *store )
{
   store[0] = sinf( store[0] );
   store[1] = sinf( store[1] );
   store[2] = sinf( store[2] );
   store[3] = sinf( store[3] );
}
1048
/** Emit a call to sin4f() on xmm_dst. */
static void
emit_sin (struct x86_function *func,
          unsigned xmm_save,
          unsigned xmm_dst)
{
   emit_func_call_dst(
      func,
      xmm_save,
      xmm_dst,
      sin4f );
}

/** Emit xmm_dst -= xmm_src (packed float subtract). */
static void
emit_sub(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   sse_subps(
      func,
      make_xmm( xmm_dst ),
      make_xmm( xmm_src ) );
}
1072
1073 /**
1074 * Register fetch.
1075 */
1076
/**
 * Emit code to load src register channel chan_index into xmm register xmm,
 * applying the register's extended swizzle (including the ZERO/ONE
 * pseudo-channels) and its sign mode (abs / negate / set-sign).
 */
static void
emit_fetch(
   struct x86_function *func,
   unsigned xmm,
   const struct tgsi_full_src_register *reg,
   const unsigned chan_index )
{
   unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index );

   switch (swizzle) {
   case TGSI_EXTSWIZZLE_X:
   case TGSI_EXTSWIZZLE_Y:
   case TGSI_EXTSWIZZLE_Z:
   case TGSI_EXTSWIZZLE_W:
      /* A real channel: dispatch on the source register file. */
      switch (reg->SrcRegister.File) {
      case TGSI_FILE_CONSTANT:
         emit_const(
            func,
            xmm,
            reg->SrcRegister.Index,
            swizzle,
            reg->SrcRegister.Indirect,
            reg->SrcRegisterInd.File,
            reg->SrcRegisterInd.Index );
         break;

      case TGSI_FILE_IMMEDIATE:
         emit_immediate(
            func,
            xmm,
            reg->SrcRegister.Index,
            swizzle );
         break;

      case TGSI_FILE_INPUT:
         emit_inputf(
            func,
            xmm,
            reg->SrcRegister.Index,
            swizzle );
         break;

      case TGSI_FILE_TEMPORARY:
         emit_tempf(
            func,
            xmm,
            reg->SrcRegister.Index,
            swizzle );
         break;

      default:
         assert( 0 );
      }
      break;

   case TGSI_EXTSWIZZLE_ZERO:
      /* Pseudo-channel: load the constant 0.0 quad. */
      emit_tempf(
         func,
         xmm,
         TGSI_EXEC_TEMP_00000000_I,
         TGSI_EXEC_TEMP_00000000_C );
      break;

   case TGSI_EXTSWIZZLE_ONE:
      /* Pseudo-channel: load the constant 1.0 quad. */
      emit_tempf(
         func,
         xmm,
         TEMP_ONE_I,
         TEMP_ONE_C );
      break;

   default:
      assert( 0 );
   }

   /* Apply the register's sign mode to the fetched value. */
   switch( tgsi_util_get_full_src_register_sign_mode( reg, chan_index ) ) {
   case TGSI_UTIL_SIGN_CLEAR:
      emit_abs( func, xmm );
      break;

   case TGSI_UTIL_SIGN_SET:
      emit_setsign( func, xmm );
      break;

   case TGSI_UTIL_SIGN_TOGGLE:
      emit_neg( func, xmm );
      break;

   case TGSI_UTIL_SIGN_KEEP:
      break;
   }
}

/* Fetch src operand INDEX, channel CHAN of instruction INST into xmm XMM. */
#define FETCH( FUNC, INST, XMM, INDEX, CHAN )\
   emit_fetch( FUNC, XMM, &(INST).FullSrcRegisters[INDEX], CHAN )
1172
1173 /**
1174 * Register store.
1175 */
1176
/**
 * Emit code to store xmm register xmm into dst register channel chan_index.
 * NOTE(review): saturation is not implemented -- TGSI_SAT_ZERO_ONE is
 * silently accepted (see the commented-out assert) and
 * TGSI_SAT_MINUS_PLUS_ONE asserts.
 */
static void
emit_store(
   struct x86_function *func,
   unsigned xmm,
   const struct tgsi_full_dst_register *reg,
   const struct tgsi_full_instruction *inst,
   unsigned chan_index )
{
   switch( reg->DstRegister.File ) {
   case TGSI_FILE_OUTPUT:
      emit_output(
         func,
         xmm,
         reg->DstRegister.Index,
         chan_index );
      break;

   case TGSI_FILE_TEMPORARY:
      emit_temps(
         func,
         xmm,
         reg->DstRegister.Index,
         chan_index );
      break;

   case TGSI_FILE_ADDRESS:
      emit_addrs(
         func,
         xmm,
         reg->DstRegister.Index,
         chan_index );
      break;

   default:
      assert( 0 );
   }

   switch( inst->Instruction.Saturate ) {
   case TGSI_SAT_NONE:
      break;

   case TGSI_SAT_ZERO_ONE:
      /* assert( 0 ); */
      break;

   case TGSI_SAT_MINUS_PLUS_ONE:
      assert( 0 );
      break;
   }
}

/* Store xmm XMM to dst operand INDEX, channel CHAN of instruction INST. */
#define STORE( FUNC, INST, XMM, INDEX, CHAN )\
   emit_store( FUNC, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN )
1230
1231 /**
1232 * High-level instruction translators.
1233 */
1234
1235 static void
1236 emit_kil(
1237 struct x86_function *func,
1238 const struct tgsi_full_src_register *reg )
1239 {
1240 unsigned uniquemask;
1241 unsigned registers[4];
1242 unsigned nextregister = 0;
1243 unsigned firstchan = ~0;
1244 unsigned chan_index;
1245
1246 /* This mask stores component bits that were already tested. Note that
1247 * we test if the value is less than zero, so 1.0 and 0.0 need not to be
1248 * tested. */
1249 uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE);
1250
1251 FOR_EACH_CHANNEL( chan_index ) {
1252 unsigned swizzle;
1253
1254 /* unswizzle channel */
1255 swizzle = tgsi_util_get_full_src_register_extswizzle(
1256 reg,
1257 chan_index );
1258
1259 /* check if the component has not been already tested */
1260 if( !(uniquemask & (1 << swizzle)) ) {
1261 uniquemask |= 1 << swizzle;
1262
1263 /* allocate register */
1264 registers[chan_index] = nextregister;
1265 emit_fetch(
1266 func,
1267 nextregister,
1268 reg,
1269 chan_index );
1270 nextregister++;
1271
1272 /* mark the first channel used */
1273 if( firstchan == ~0 ) {
1274 firstchan = chan_index;
1275 }
1276 }
1277 }
1278
1279 x86_push(
1280 func,
1281 x86_make_reg( file_REG32, reg_AX ) );
1282 x86_push(
1283 func,
1284 x86_make_reg( file_REG32, reg_DX ) );
1285
1286 FOR_EACH_CHANNEL( chan_index ) {
1287 if( uniquemask & (1 << chan_index) ) {
1288 sse_cmpps(
1289 func,
1290 make_xmm( registers[chan_index] ),
1291 get_temp(
1292 TGSI_EXEC_TEMP_00000000_I,
1293 TGSI_EXEC_TEMP_00000000_C ),
1294 cc_LessThan );
1295
1296 if( chan_index == firstchan ) {
1297 sse_pmovmskb(
1298 func,
1299 x86_make_reg( file_REG32, reg_AX ),
1300 make_xmm( registers[chan_index] ) );
1301 }
1302 else {
1303 sse_pmovmskb(
1304 func,
1305 x86_make_reg( file_REG32, reg_DX ),
1306 make_xmm( registers[chan_index] ) );
1307 x86_or(
1308 func,
1309 x86_make_reg( file_REG32, reg_AX ),
1310 x86_make_reg( file_REG32, reg_DX ) );
1311 }
1312 }
1313 }
1314
1315 x86_or(
1316 func,
1317 get_temp(
1318 TGSI_EXEC_TEMP_KILMASK_I,
1319 TGSI_EXEC_TEMP_KILMASK_C ),
1320 x86_make_reg( file_REG32, reg_AX ) );
1321
1322 x86_pop(
1323 func,
1324 x86_make_reg( file_REG32, reg_DX ) );
1325 x86_pop(
1326 func,
1327 x86_make_reg( file_REG32, reg_AX ) );
1328 }
1329
1330
1331 static void
1332 emit_kilp(
1333 struct x86_function *func )
1334 {
1335 /* XXX todo / fix me */
1336 }
1337
1338
/**
 * Emit code for the SLT/SGE/SEQ/... family: per enabled channel,
 * dst = (src0 <cc> src1) ? 1.0 : 0.0.
 *
 * cmpps yields all-ones/all-zeros lanes; ANDing with 1.0 converts
 * that mask to the canonical 1.0/0.0 result.
 */
static void
emit_setcc(
   struct x86_function *func,
   struct tgsi_full_instruction *inst,
   enum sse_cc cc )
{
   unsigned chan_index;

   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
      FETCH( func, *inst, 0, 0, chan_index );
      FETCH( func, *inst, 1, 1, chan_index );
      sse_cmpps(
         func,
         make_xmm( 0 ),
         make_xmm( 1 ),
         cc );
      sse_andps(
         func,
         make_xmm( 0 ),
         get_temp(
            TEMP_ONE_I,
            TEMP_ONE_C ) );
      STORE( func, *inst, 0, 0, chan_index );
   }
}

/**
 * Emit code for the CMP instruction: per enabled channel,
 * dst = (src0 < 0) ? src1 : src2, implemented with a cmpps mask
 * and an and/andn/or select.
 */
static void
emit_cmp(
   struct x86_function *func,
   struct tgsi_full_instruction *inst )
{
   unsigned chan_index;

   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
      FETCH( func, *inst, 0, 0, chan_index );
      FETCH( func, *inst, 1, 1, chan_index );
      FETCH( func, *inst, 2, 2, chan_index );
      /* xmm0 = mask of lanes where src0 < 0 */
      sse_cmpps(
         func,
         make_xmm( 0 ),
         get_temp(
            TGSI_EXEC_TEMP_00000000_I,
            TGSI_EXEC_TEMP_00000000_C ),
         cc_LessThan );
      /* xmm1 = src1 where mask set */
      sse_andps(
         func,
         make_xmm( 1 ),
         make_xmm( 0 ) );
      /* xmm0 = src2 where mask clear */
      sse_andnps(
         func,
         make_xmm( 0 ),
         make_xmm( 2 ) );
      /* combine the two halves of the select */
      sse_orps(
         func,
         make_xmm( 0 ),
         make_xmm( 1 ) );
      STORE( func, *inst, 0, 0, chan_index );
   }
}
1398
1399 static int
1400 emit_instruction(
1401 struct x86_function *func,
1402 struct tgsi_full_instruction *inst )
1403 {
1404 unsigned chan_index;
1405
1406 switch (inst->Instruction.Opcode) {
1407 case TGSI_OPCODE_ARL:
1408 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1409 FETCH( func, *inst, 0, 0, chan_index );
1410 emit_f2it( func, 0 );
1411 STORE( func, *inst, 0, 0, chan_index );
1412 }
1413 break;
1414
1415 case TGSI_OPCODE_MOV:
1416 case TGSI_OPCODE_SWZ:
1417 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1418 FETCH( func, *inst, 0, 0, chan_index );
1419 STORE( func, *inst, 0, 0, chan_index );
1420 }
1421 break;
1422
1423 case TGSI_OPCODE_LIT:
1424 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1425 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1426 emit_tempf(
1427 func,
1428 0,
1429 TEMP_ONE_I,
1430 TEMP_ONE_C);
1431 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
1432 STORE( func, *inst, 0, 0, CHAN_X );
1433 }
1434 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1435 STORE( func, *inst, 0, 0, CHAN_W );
1436 }
1437 }
1438 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1439 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1440 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
1441 FETCH( func, *inst, 0, 0, CHAN_X );
1442 sse_maxps(
1443 func,
1444 make_xmm( 0 ),
1445 get_temp(
1446 TGSI_EXEC_TEMP_00000000_I,
1447 TGSI_EXEC_TEMP_00000000_C ) );
1448 STORE( func, *inst, 0, 0, CHAN_Y );
1449 }
1450 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1451 /* XMM[1] = SrcReg[0].yyyy */
1452 FETCH( func, *inst, 1, 0, CHAN_Y );
1453 /* XMM[1] = max(XMM[1], 0) */
1454 sse_maxps(
1455 func,
1456 make_xmm( 1 ),
1457 get_temp(
1458 TGSI_EXEC_TEMP_00000000_I,
1459 TGSI_EXEC_TEMP_00000000_C ) );
1460 /* XMM[2] = SrcReg[0].wwww */
1461 FETCH( func, *inst, 2, 0, CHAN_W );
1462 /* XMM[2] = min(XMM[2], 128.0) */
1463 sse_minps(
1464 func,
1465 make_xmm( 2 ),
1466 get_temp(
1467 TGSI_EXEC_TEMP_128_I,
1468 TGSI_EXEC_TEMP_128_C ) );
1469 /* XMM[2] = max(XMM[2], -128.0) */
1470 sse_maxps(
1471 func,
1472 make_xmm( 2 ),
1473 get_temp(
1474 TGSI_EXEC_TEMP_MINUS_128_I,
1475 TGSI_EXEC_TEMP_MINUS_128_C ) );
1476 emit_pow( func, 3, 1, 2 );
1477 FETCH( func, *inst, 0, 0, CHAN_X );
1478 sse_xorps(
1479 func,
1480 make_xmm( 2 ),
1481 make_xmm( 2 ) );
1482 sse_cmpps(
1483 func,
1484 make_xmm( 2 ),
1485 make_xmm( 0 ),
1486 cc_LessThanEqual );
1487 sse_andps(
1488 func,
1489 make_xmm( 2 ),
1490 make_xmm( 1 ) );
1491 STORE( func, *inst, 2, 0, CHAN_Z );
1492 }
1493 }
1494 break;
1495
1496 case TGSI_OPCODE_RCP:
1497 /* TGSI_OPCODE_RECIP */
1498 FETCH( func, *inst, 0, 0, CHAN_X );
1499 emit_rcp( func, 0, 0 );
1500 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1501 STORE( func, *inst, 0, 0, chan_index );
1502 }
1503 break;
1504
1505 case TGSI_OPCODE_RSQ:
1506 /* TGSI_OPCODE_RECIPSQRT */
1507 FETCH( func, *inst, 0, 0, CHAN_X );
1508 emit_rsqrt( func, 1, 0 );
1509 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1510 STORE( func, *inst, 1, 0, chan_index );
1511 }
1512 break;
1513
1514 case TGSI_OPCODE_EXP:
1515 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1516 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1517 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1518 FETCH( func, *inst, 0, 0, CHAN_X );
1519 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1520 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1521 emit_MOV( func, 1, 0 );
1522 emit_flr( func, 2, 1 );
1523 /* dst.x = ex2(floor(src.x)) */
1524 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
1525 emit_MOV( func, 2, 1 );
1526 emit_ex2( func, 3, 2 );
1527 STORE( func, *inst, 2, 0, CHAN_X );
1528 }
1529 /* dst.y = src.x - floor(src.x) */
1530 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1531 emit_MOV( func, 2, 0 );
1532 emit_sub( func, 2, 1 );
1533 STORE( func, *inst, 2, 0, CHAN_Y );
1534 }
1535 }
1536 /* dst.z = ex2(src.x) */
1537 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1538 emit_ex2( func, 3, 0 );
1539 STORE( func, *inst, 0, 0, CHAN_Z );
1540 }
1541 }
1542 /* dst.w = 1.0 */
1543 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
1544 emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
1545 STORE( func, *inst, 0, 0, CHAN_W );
1546 }
1547 break;
1548
1549 case TGSI_OPCODE_LOG:
1550 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1551 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1552 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1553 FETCH( func, *inst, 0, 0, CHAN_X );
1554 emit_abs( func, 0 );
1555 emit_MOV( func, 1, 0 );
1556 emit_lg2( func, 2, 1 );
1557 /* dst.z = lg2(abs(src.x)) */
1558 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1559 STORE( func, *inst, 1, 0, CHAN_Z );
1560 }
1561 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1562 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1563 emit_flr( func, 2, 1 );
1564 /* dst.x = floor(lg2(abs(src.x))) */
1565 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
1566 STORE( func, *inst, 1, 0, CHAN_X );
1567 }
1568 /* dst.x = abs(src)/ex2(floor(lg2(abs(src.x)))) */
1569 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1570 emit_ex2( func, 2, 1 );
1571 emit_rcp( func, 1, 1 );
1572 emit_mul( func, 0, 1 );
1573 STORE( func, *inst, 0, 0, CHAN_Y );
1574 }
1575 }
1576 }
1577 /* dst.w = 1.0 */
1578 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
1579 emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
1580 STORE( func, *inst, 0, 0, CHAN_W );
1581 }
1582 break;
1583
1584 case TGSI_OPCODE_MUL:
1585 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1586 FETCH( func, *inst, 0, 0, chan_index );
1587 FETCH( func, *inst, 1, 1, chan_index );
1588 emit_mul( func, 0, 1 );
1589 STORE( func, *inst, 0, 0, chan_index );
1590 }
1591 break;
1592
1593 case TGSI_OPCODE_ADD:
1594 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1595 FETCH( func, *inst, 0, 0, chan_index );
1596 FETCH( func, *inst, 1, 1, chan_index );
1597 emit_add( func, 0, 1 );
1598 STORE( func, *inst, 0, 0, chan_index );
1599 }
1600 break;
1601
1602 case TGSI_OPCODE_DP3:
1603 /* TGSI_OPCODE_DOT3 */
1604 FETCH( func, *inst, 0, 0, CHAN_X );
1605 FETCH( func, *inst, 1, 1, CHAN_X );
1606 emit_mul( func, 0, 1 );
1607 FETCH( func, *inst, 1, 0, CHAN_Y );
1608 FETCH( func, *inst, 2, 1, CHAN_Y );
1609 emit_mul( func, 1, 2 );
1610 emit_add( func, 0, 1 );
1611 FETCH( func, *inst, 1, 0, CHAN_Z );
1612 FETCH( func, *inst, 2, 1, CHAN_Z );
1613 emit_mul( func, 1, 2 );
1614 emit_add( func, 0, 1 );
1615 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1616 STORE( func, *inst, 0, 0, chan_index );
1617 }
1618 break;
1619
1620 case TGSI_OPCODE_DP4:
1621 /* TGSI_OPCODE_DOT4 */
1622 FETCH( func, *inst, 0, 0, CHAN_X );
1623 FETCH( func, *inst, 1, 1, CHAN_X );
1624 emit_mul( func, 0, 1 );
1625 FETCH( func, *inst, 1, 0, CHAN_Y );
1626 FETCH( func, *inst, 2, 1, CHAN_Y );
1627 emit_mul( func, 1, 2 );
1628 emit_add( func, 0, 1 );
1629 FETCH( func, *inst, 1, 0, CHAN_Z );
1630 FETCH( func, *inst, 2, 1, CHAN_Z );
1631 emit_mul(func, 1, 2 );
1632 emit_add(func, 0, 1 );
1633 FETCH( func, *inst, 1, 0, CHAN_W );
1634 FETCH( func, *inst, 2, 1, CHAN_W );
1635 emit_mul( func, 1, 2 );
1636 emit_add( func, 0, 1 );
1637 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1638 STORE( func, *inst, 0, 0, chan_index );
1639 }
1640 break;
1641
1642 case TGSI_OPCODE_DST:
1643 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
1644 emit_tempf(
1645 func,
1646 0,
1647 TEMP_ONE_I,
1648 TEMP_ONE_C );
1649 STORE( func, *inst, 0, 0, CHAN_X );
1650 }
1651 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
1652 FETCH( func, *inst, 0, 0, CHAN_Y );
1653 FETCH( func, *inst, 1, 1, CHAN_Y );
1654 emit_mul( func, 0, 1 );
1655 STORE( func, *inst, 0, 0, CHAN_Y );
1656 }
1657 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
1658 FETCH( func, *inst, 0, 0, CHAN_Z );
1659 STORE( func, *inst, 0, 0, CHAN_Z );
1660 }
1661 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
1662 FETCH( func, *inst, 0, 1, CHAN_W );
1663 STORE( func, *inst, 0, 0, CHAN_W );
1664 }
1665 break;
1666
1667 case TGSI_OPCODE_MIN:
1668 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1669 FETCH( func, *inst, 0, 0, chan_index );
1670 FETCH( func, *inst, 1, 1, chan_index );
1671 sse_minps(
1672 func,
1673 make_xmm( 0 ),
1674 make_xmm( 1 ) );
1675 STORE( func, *inst, 0, 0, chan_index );
1676 }
1677 break;
1678
1679 case TGSI_OPCODE_MAX:
1680 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1681 FETCH( func, *inst, 0, 0, chan_index );
1682 FETCH( func, *inst, 1, 1, chan_index );
1683 sse_maxps(
1684 func,
1685 make_xmm( 0 ),
1686 make_xmm( 1 ) );
1687 STORE( func, *inst, 0, 0, chan_index );
1688 }
1689 break;
1690
1691 case TGSI_OPCODE_SLT:
1692 /* TGSI_OPCODE_SETLT */
1693 emit_setcc( func, inst, cc_LessThan );
1694 break;
1695
1696 case TGSI_OPCODE_SGE:
1697 /* TGSI_OPCODE_SETGE */
1698 emit_setcc( func, inst, cc_NotLessThan );
1699 break;
1700
1701 case TGSI_OPCODE_MAD:
1702 /* TGSI_OPCODE_MADD */
1703 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1704 FETCH( func, *inst, 0, 0, chan_index );
1705 FETCH( func, *inst, 1, 1, chan_index );
1706 FETCH( func, *inst, 2, 2, chan_index );
1707 emit_mul( func, 0, 1 );
1708 emit_add( func, 0, 2 );
1709 STORE( func, *inst, 0, 0, chan_index );
1710 }
1711 break;
1712
1713 case TGSI_OPCODE_SUB:
1714 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1715 FETCH( func, *inst, 0, 0, chan_index );
1716 FETCH( func, *inst, 1, 1, chan_index );
1717 emit_sub( func, 0, 1 );
1718 STORE( func, *inst, 0, 0, chan_index );
1719 }
1720 break;
1721
1722 case TGSI_OPCODE_LERP:
1723 /* TGSI_OPCODE_LRP */
1724 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1725 FETCH( func, *inst, 0, 0, chan_index );
1726 FETCH( func, *inst, 1, 1, chan_index );
1727 FETCH( func, *inst, 2, 2, chan_index );
1728 emit_sub( func, 1, 2 );
1729 emit_mul( func, 0, 1 );
1730 emit_add( func, 0, 2 );
1731 STORE( func, *inst, 0, 0, chan_index );
1732 }
1733 break;
1734
1735 case TGSI_OPCODE_CND:
1736 return 0;
1737 break;
1738
1739 case TGSI_OPCODE_CND0:
1740 return 0;
1741 break;
1742
1743 case TGSI_OPCODE_DOT2ADD:
1744 /* TGSI_OPCODE_DP2A */
1745 FETCH( func, *inst, 0, 0, CHAN_X ); /* xmm0 = src[0].x */
1746 FETCH( func, *inst, 1, 1, CHAN_X ); /* xmm1 = src[1].x */
1747 emit_mul( func, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
1748 FETCH( func, *inst, 1, 0, CHAN_Y ); /* xmm1 = src[0].y */
1749 FETCH( func, *inst, 2, 1, CHAN_Y ); /* xmm2 = src[1].y */
1750 emit_mul( func, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
1751 emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
1752 FETCH( func, *inst, 1, 2, CHAN_X ); /* xmm1 = src[2].x */
1753 emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
1754 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1755 STORE( func, *inst, 0, 0, chan_index ); /* dest[ch] = xmm0 */
1756 }
1757 break;
1758
1759 case TGSI_OPCODE_INDEX:
1760 return 0;
1761 break;
1762
1763 case TGSI_OPCODE_NEGATE:
1764 return 0;
1765 break;
1766
1767 case TGSI_OPCODE_FRAC:
1768 /* TGSI_OPCODE_FRC */
1769 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1770 FETCH( func, *inst, 0, 0, chan_index );
1771 emit_frc( func, 0, 0 );
1772 STORE( func, *inst, 0, 0, chan_index );
1773 }
1774 break;
1775
1776 case TGSI_OPCODE_CLAMP:
1777 return 0;
1778 break;
1779
1780 case TGSI_OPCODE_FLOOR:
1781 /* TGSI_OPCODE_FLR */
1782 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1783 FETCH( func, *inst, 0, 0, chan_index );
1784 emit_flr( func, 0, 0 );
1785 STORE( func, *inst, 0, 0, chan_index );
1786 }
1787 break;
1788
1789 case TGSI_OPCODE_ROUND:
1790 return 0;
1791 break;
1792
1793 case TGSI_OPCODE_EXPBASE2:
1794 /* TGSI_OPCODE_EX2 */
1795 FETCH( func, *inst, 0, 0, CHAN_X );
1796 emit_ex2( func, 0, 0 );
1797 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1798 STORE( func, *inst, 0, 0, chan_index );
1799 }
1800 break;
1801
1802 case TGSI_OPCODE_LOGBASE2:
1803 /* TGSI_OPCODE_LG2 */
1804 FETCH( func, *inst, 0, 0, CHAN_X );
1805 emit_lg2( func, 0, 0 );
1806 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1807 STORE( func, *inst, 0, 0, chan_index );
1808 }
1809 break;
1810
1811 case TGSI_OPCODE_POWER:
1812 /* TGSI_OPCODE_POW */
1813 FETCH( func, *inst, 0, 0, CHAN_X );
1814 FETCH( func, *inst, 1, 1, CHAN_X );
1815 emit_pow( func, 0, 0, 1 );
1816 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1817 STORE( func, *inst, 0, 0, chan_index );
1818 }
1819 break;
1820
1821 case TGSI_OPCODE_CROSSPRODUCT:
1822 /* TGSI_OPCODE_XPD */
1823 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1824 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
1825 FETCH( func, *inst, 1, 1, CHAN_Z );
1826 FETCH( func, *inst, 3, 0, CHAN_Z );
1827 }
1828 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1829 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1830 FETCH( func, *inst, 0, 0, CHAN_Y );
1831 FETCH( func, *inst, 4, 1, CHAN_Y );
1832 }
1833 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
1834 emit_MOV( func, 2, 0 );
1835 emit_mul( func, 2, 1 );
1836 emit_MOV( func, 5, 3 );
1837 emit_mul( func, 5, 4 );
1838 emit_sub( func, 2, 5 );
1839 STORE( func, *inst, 2, 0, CHAN_X );
1840 }
1841 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1842 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1843 FETCH( func, *inst, 2, 1, CHAN_X );
1844 FETCH( func, *inst, 5, 0, CHAN_X );
1845 }
1846 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
1847 emit_mul( func, 3, 2 );
1848 emit_mul( func, 1, 5 );
1849 emit_sub( func, 3, 1 );
1850 STORE( func, *inst, 3, 0, CHAN_Y );
1851 }
1852 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
1853 emit_mul( func, 5, 4 );
1854 emit_mul( func, 0, 2 );
1855 emit_sub( func, 5, 0 );
1856 STORE( func, *inst, 5, 0, CHAN_Z );
1857 }
1858 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
1859 emit_tempf(
1860 func,
1861 0,
1862 TEMP_ONE_I,
1863 TEMP_ONE_C );
1864 STORE( func, *inst, 0, 0, CHAN_W );
1865 }
1866 break;
1867
1868 case TGSI_OPCODE_MULTIPLYMATRIX:
1869 return 0;
1870 break;
1871
1872 case TGSI_OPCODE_ABS:
1873 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1874 FETCH( func, *inst, 0, 0, chan_index );
1875 emit_abs( func, 0) ;
1876
1877 STORE( func, *inst, 0, 0, chan_index );
1878 }
1879 break;
1880
1881 case TGSI_OPCODE_RCC:
1882 return 0;
1883 break;
1884
1885 case TGSI_OPCODE_DPH:
1886 FETCH( func, *inst, 0, 0, CHAN_X );
1887 FETCH( func, *inst, 1, 1, CHAN_X );
1888 emit_mul( func, 0, 1 );
1889 FETCH( func, *inst, 1, 0, CHAN_Y );
1890 FETCH( func, *inst, 2, 1, CHAN_Y );
1891 emit_mul( func, 1, 2 );
1892 emit_add( func, 0, 1 );
1893 FETCH( func, *inst, 1, 0, CHAN_Z );
1894 FETCH( func, *inst, 2, 1, CHAN_Z );
1895 emit_mul( func, 1, 2 );
1896 emit_add( func, 0, 1 );
1897 FETCH( func, *inst, 1, 1, CHAN_W );
1898 emit_add( func, 0, 1 );
1899 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1900 STORE( func, *inst, 0, 0, chan_index );
1901 }
1902 break;
1903
1904 case TGSI_OPCODE_COS:
1905 FETCH( func, *inst, 0, 0, CHAN_X );
1906 emit_cos( func, 0, 0 );
1907 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1908 STORE( func, *inst, 0, 0, chan_index );
1909 }
1910 break;
1911
1912 case TGSI_OPCODE_DDX:
1913 return 0;
1914 break;
1915
1916 case TGSI_OPCODE_DDY:
1917 return 0;
1918 break;
1919
1920 case TGSI_OPCODE_KILP:
1921 /* predicated kill */
1922 emit_kilp( func );
1923 return 0; /* XXX fix me */
1924 break;
1925
1926 case TGSI_OPCODE_KIL:
1927 /* conditional kill */
1928 emit_kil( func, &inst->FullSrcRegisters[0] );
1929 break;
1930
1931 case TGSI_OPCODE_PK2H:
1932 return 0;
1933 break;
1934
1935 case TGSI_OPCODE_PK2US:
1936 return 0;
1937 break;
1938
1939 case TGSI_OPCODE_PK4B:
1940 return 0;
1941 break;
1942
1943 case TGSI_OPCODE_PK4UB:
1944 return 0;
1945 break;
1946
1947 case TGSI_OPCODE_RFL:
1948 return 0;
1949 break;
1950
1951 case TGSI_OPCODE_SEQ:
1952 return 0;
1953 break;
1954
1955 case TGSI_OPCODE_SFL:
1956 return 0;
1957 break;
1958
1959 case TGSI_OPCODE_SGT:
1960 return 0;
1961 break;
1962
1963 case TGSI_OPCODE_SIN:
1964 FETCH( func, *inst, 0, 0, CHAN_X );
1965 emit_sin( func, 0, 0 );
1966 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1967 STORE( func, *inst, 0, 0, chan_index );
1968 }
1969 break;
1970
1971 case TGSI_OPCODE_SLE:
1972 return 0;
1973 break;
1974
1975 case TGSI_OPCODE_SNE:
1976 return 0;
1977 break;
1978
1979 case TGSI_OPCODE_STR:
1980 return 0;
1981 break;
1982
1983 case TGSI_OPCODE_TEX:
1984 if (0) {
1985 /* Disable dummy texture code:
1986 */
1987 emit_tempf(
1988 func,
1989 0,
1990 TEMP_ONE_I,
1991 TEMP_ONE_C );
1992 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1993 STORE( func, *inst, 0, 0, chan_index );
1994 }
1995 }
1996 else {
1997 return 0;
1998 }
1999 break;
2000
2001 case TGSI_OPCODE_TXD:
2002 return 0;
2003 break;
2004
2005 case TGSI_OPCODE_UP2H:
2006 return 0;
2007 break;
2008
2009 case TGSI_OPCODE_UP2US:
2010 return 0;
2011 break;
2012
2013 case TGSI_OPCODE_UP4B:
2014 return 0;
2015 break;
2016
2017 case TGSI_OPCODE_UP4UB:
2018 return 0;
2019 break;
2020
2021 case TGSI_OPCODE_X2D:
2022 return 0;
2023 break;
2024
2025 case TGSI_OPCODE_ARA:
2026 return 0;
2027 break;
2028
2029 case TGSI_OPCODE_ARR:
2030 return 0;
2031 break;
2032
2033 case TGSI_OPCODE_BRA:
2034 return 0;
2035 break;
2036
2037 case TGSI_OPCODE_CAL:
2038 return 0;
2039 break;
2040
2041 case TGSI_OPCODE_RET:
2042 emit_ret( func );
2043 break;
2044
2045 case TGSI_OPCODE_END:
2046 break;
2047
2048 case TGSI_OPCODE_SSG:
2049 return 0;
2050 break;
2051
2052 case TGSI_OPCODE_CMP:
2053 emit_cmp (func, inst);
2054 break;
2055
2056 case TGSI_OPCODE_SCS:
2057 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
2058 FETCH( func, *inst, 0, 0, CHAN_X );
2059 emit_cos( func, 0, 0 );
2060 STORE( func, *inst, 0, 0, CHAN_X );
2061 }
2062 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2063 FETCH( func, *inst, 0, 0, CHAN_X );
2064 emit_sin( func, 0, 0 );
2065 STORE( func, *inst, 0, 0, CHAN_Y );
2066 }
2067 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2068 emit_tempf(
2069 func,
2070 0,
2071 TGSI_EXEC_TEMP_00000000_I,
2072 TGSI_EXEC_TEMP_00000000_C );
2073 STORE( func, *inst, 0, 0, CHAN_Z );
2074 }
2075 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2076 emit_tempf(
2077 func,
2078 0,
2079 TEMP_ONE_I,
2080 TEMP_ONE_C );
2081 STORE( func, *inst, 0, 0, CHAN_W );
2082 }
2083 break;
2084
2085 case TGSI_OPCODE_TXB:
2086 return 0;
2087 break;
2088
2089 case TGSI_OPCODE_NRM:
2090 /* fall-through */
2091 case TGSI_OPCODE_NRM4:
2092 /* 3 or 4-component normalization */
2093 {
2094 uint dims = (inst->Instruction.Opcode == TGSI_OPCODE_NRM) ? 3 : 4;
2095 /* note: cannot use xmm regs 2/3 here (see emit_rsqrt() above) */
2096 FETCH( func, *inst, 4, 0, CHAN_X ); /* xmm4 = src[0].x */
2097 FETCH( func, *inst, 5, 0, CHAN_Y ); /* xmm5 = src[0].y */
2098 FETCH( func, *inst, 6, 0, CHAN_Z ); /* xmm6 = src[0].z */
2099 if (dims == 4) {
2100 FETCH( func, *inst, 7, 0, CHAN_W ); /* xmm7 = src[0].w */
2101 }
2102 emit_MOV( func, 0, 4 ); /* xmm0 = xmm3 */
2103 emit_mul( func, 0, 4 ); /* xmm0 *= xmm3 */
2104 emit_MOV( func, 1, 5 ); /* xmm1 = xmm4 */
2105 emit_mul( func, 1, 5 ); /* xmm1 *= xmm4 */
2106 emit_add( func, 0, 1 ); /* xmm0 += xmm1 */
2107 emit_MOV( func, 1, 6 ); /* xmm1 = xmm5 */
2108 emit_mul( func, 1, 6 ); /* xmm1 *= xmm5 */
2109 emit_add( func, 0, 1 ); /* xmm0 += xmm1 */
2110 if (dims == 4) {
2111 emit_MOV( func, 1, 7 ); /* xmm1 = xmm7 */
2112 emit_mul( func, 1, 7 ); /* xmm1 *= xmm7 */
2113 emit_add( func, 0, 0 ); /* xmm0 += xmm1 */
2114 }
2115 emit_rsqrt( func, 1, 0 ); /* xmm1 = 1/sqrt(xmm0) */
2116 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2117 if (chan_index < dims) {
2118 emit_mul( func, 4+chan_index, 1); /* xmm[4+ch] *= xmm1 */
2119 STORE( func, *inst, 4+chan_index, 0, chan_index );
2120 }
2121 }
2122 }
2123 break;
2124
2125 case TGSI_OPCODE_DIV:
2126 return 0;
2127 break;
2128
2129 case TGSI_OPCODE_DP2:
2130 FETCH( func, *inst, 0, 0, CHAN_X ); /* xmm0 = src[0].x */
2131 FETCH( func, *inst, 1, 1, CHAN_X ); /* xmm1 = src[1].x */
2132 emit_mul( func, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
2133 FETCH( func, *inst, 1, 0, CHAN_Y ); /* xmm1 = src[0].y */
2134 FETCH( func, *inst, 2, 1, CHAN_Y ); /* xmm2 = src[1].y */
2135 emit_mul( func, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
2136 emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2137 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2138 STORE( func, *inst, 0, 0, chan_index ); /* dest[ch] = xmm0 */
2139 }
2140 break;
2141
2142 case TGSI_OPCODE_TXL:
2143 return 0;
2144 break;
2145
2146 case TGSI_OPCODE_BRK:
2147 return 0;
2148 break;
2149
2150 case TGSI_OPCODE_IF:
2151 return 0;
2152 break;
2153
2154 case TGSI_OPCODE_LOOP:
2155 return 0;
2156 break;
2157
2158 case TGSI_OPCODE_REP:
2159 return 0;
2160 break;
2161
2162 case TGSI_OPCODE_ELSE:
2163 return 0;
2164 break;
2165
2166 case TGSI_OPCODE_ENDIF:
2167 return 0;
2168 break;
2169
2170 case TGSI_OPCODE_ENDLOOP:
2171 return 0;
2172 break;
2173
2174 case TGSI_OPCODE_ENDREP:
2175 return 0;
2176 break;
2177
2178 case TGSI_OPCODE_PUSHA:
2179 return 0;
2180 break;
2181
2182 case TGSI_OPCODE_POPA:
2183 return 0;
2184 break;
2185
2186 case TGSI_OPCODE_CEIL:
2187 return 0;
2188 break;
2189
2190 case TGSI_OPCODE_I2F:
2191 return 0;
2192 break;
2193
2194 case TGSI_OPCODE_NOT:
2195 return 0;
2196 break;
2197
2198 case TGSI_OPCODE_TRUNC:
2199 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2200 FETCH( func, *inst, 0, 0, chan_index );
2201 emit_f2it( func, 0 );
2202 emit_i2f( func, 0 );
2203 STORE( func, *inst, 0, 0, chan_index );
2204 }
2205 break;
2206
2207 case TGSI_OPCODE_SHL:
2208 return 0;
2209 break;
2210
2211 case TGSI_OPCODE_SHR:
2212 return 0;
2213 break;
2214
2215 case TGSI_OPCODE_AND:
2216 return 0;
2217 break;
2218
2219 case TGSI_OPCODE_OR:
2220 return 0;
2221 break;
2222
2223 case TGSI_OPCODE_MOD:
2224 return 0;
2225 break;
2226
2227 case TGSI_OPCODE_XOR:
2228 return 0;
2229 break;
2230
2231 case TGSI_OPCODE_SAD:
2232 return 0;
2233 break;
2234
2235 case TGSI_OPCODE_TXF:
2236 return 0;
2237 break;
2238
2239 case TGSI_OPCODE_TXQ:
2240 return 0;
2241 break;
2242
2243 case TGSI_OPCODE_CONT:
2244 return 0;
2245 break;
2246
2247 case TGSI_OPCODE_EMIT:
2248 return 0;
2249 break;
2250
2251 case TGSI_OPCODE_ENDPRIM:
2252 return 0;
2253 break;
2254
2255 default:
2256 return 0;
2257 }
2258
2259 return 1;
2260 }
2261
/**
 * Emit per-fragment interpolation code for one fragment-shader input
 * declaration.  For each declared attribute and each channel named in
 * the usage mask, the input value is evaluated from the plane
 * coefficients (a0, dadx, dady) and written to the machine's input
 * array via emit_inputs().
 *
 * Temp reg 0 is used as the fragment position: x in channel X, y in
 * channel Y, and w in channel W (w only read for perspective
 * interpolation -- see the SWIZZLE fetches below).
 */
static void
emit_declaration(
   struct x86_function *func,
   struct tgsi_full_declaration *decl )
{
   if( decl->Declaration.File == TGSI_FILE_INPUT ) {
      unsigned first, last, mask;
      unsigned i, j;

      first = decl->DeclarationRange.First;
      last = decl->DeclarationRange.Last;
      mask = decl->Declaration.UsageMask;

      /* one interpolation per (attribute, enabled channel) pair */
      for( i = first; i <= last; i++ ) {
         for( j = 0; j < NUM_CHANNELS; j++ ) {
            if( mask & (1 << j) ) {
               switch( decl->Declaration.Interpolate ) {
               case TGSI_INTERPOLATE_CONSTANT:
                  /* input = a0 (flat -- no positional variation) */
                  emit_coef_a0( func, 0, i, j );
                  emit_inputs( func, 0, i, j );
                  break;

               case TGSI_INTERPOLATE_LINEAR:
                  /* input = x * dadx + y * dady + a0 */
                  emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
                  emit_coef_dadx( func, 1, i, j );
                  emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
                  emit_coef_dady( func, 3, i, j );
                  emit_mul( func, 0, 1 );    /* x * dadx */
                  emit_coef_a0( func, 4, i, j );
                  emit_mul( func, 2, 3 );    /* y * dady */
                  emit_add( func, 0, 4 );    /* x * dadx + a0 */
                  emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
                  emit_inputs( func, 0, i, j );
                  break;

               case TGSI_INTERPOLATE_PERSPECTIVE:
                  /* input = (x * dadx + y * dady + a0) / w */
                  emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
                  emit_coef_dadx( func, 1, i, j );
                  emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
                  emit_coef_dady( func, 3, i, j );
                  emit_mul( func, 0, 1 );    /* x * dadx */
                  emit_tempf( func, 4, 0, TGSI_SWIZZLE_W );
                  emit_coef_a0( func, 5, i, j );
                  emit_rcp( func, 4, 4 );    /* 1.0 / w */
                  emit_mul( func, 2, 3 );    /* y * dady */
                  emit_add( func, 0, 5 );    /* x * dadx + a0 */
                  emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
                  emit_mul( func, 0, 4 );    /* (x * dadx + y * dady + a0) / w */
                  emit_inputs( func, 0, i, j );
                  break;

               default:
                  /* unknown interpolation mode */
                  assert( 0 );
                  break;
               }
            }
         }
      }
   }
}
2322
2323 static void aos_to_soa( struct x86_function *func,
2324 uint arg_aos,
2325 uint arg_soa,
2326 uint arg_num,
2327 uint arg_stride )
2328 {
2329 struct x86_reg soa_input = x86_make_reg( file_REG32, reg_AX );
2330 struct x86_reg aos_input = x86_make_reg( file_REG32, reg_BX );
2331 struct x86_reg num_inputs = x86_make_reg( file_REG32, reg_CX );
2332 struct x86_reg stride = x86_make_reg( file_REG32, reg_DX );
2333 int inner_loop;
2334
2335
2336 /* Save EBX */
2337 x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
2338
2339 x86_mov( func, aos_input, x86_fn_arg( func, arg_aos ) );
2340 x86_mov( func, soa_input, x86_fn_arg( func, arg_soa ) );
2341 x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) );
2342 x86_mov( func, stride, x86_fn_arg( func, arg_stride ) );
2343
2344 /* do */
2345 inner_loop = x86_get_label( func );
2346 {
2347 x86_push( func, aos_input );
2348 sse_movlps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
2349 sse_movlps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
2350 x86_add( func, aos_input, stride );
2351 sse_movhps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
2352 sse_movhps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
2353 x86_add( func, aos_input, stride );
2354 sse_movlps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
2355 sse_movlps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
2356 x86_add( func, aos_input, stride );
2357 sse_movhps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
2358 sse_movhps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
2359 x86_pop( func, aos_input );
2360
2361 sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
2362 sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
2363 sse_shufps( func, make_xmm( 0 ), make_xmm( 1 ), 0x88 );
2364 sse_shufps( func, make_xmm( 2 ), make_xmm( 1 ), 0xdd );
2365 sse_shufps( func, make_xmm( 3 ), make_xmm( 4 ), 0x88 );
2366 sse_shufps( func, make_xmm( 5 ), make_xmm( 4 ), 0xdd );
2367
2368 sse_movups( func, x86_make_disp( soa_input, 0 ), make_xmm( 0 ) );
2369 sse_movups( func, x86_make_disp( soa_input, 16 ), make_xmm( 2 ) );
2370 sse_movups( func, x86_make_disp( soa_input, 32 ), make_xmm( 3 ) );
2371 sse_movups( func, x86_make_disp( soa_input, 48 ), make_xmm( 5 ) );
2372
2373 /* Advance to next input */
2374 x86_lea( func, aos_input, x86_make_disp(aos_input, 16) );
2375 x86_lea( func, soa_input, x86_make_disp(soa_input, 64) );
2376 }
2377 /* while --num_inputs */
2378 x86_dec( func, num_inputs );
2379 x86_jcc( func, cc_NE, inner_loop );
2380
2381 /* Restore EBX */
2382 x86_pop( func, aos_input );
2383 }
2384
2385 static void soa_to_aos( struct x86_function *func, uint aos, uint soa, uint num, uint stride )
2386 {
2387 struct x86_reg soa_output;
2388 struct x86_reg aos_output;
2389 struct x86_reg num_outputs;
2390 struct x86_reg temp;
2391 int inner_loop;
2392
2393 soa_output = x86_make_reg( file_REG32, reg_AX );
2394 aos_output = x86_make_reg( file_REG32, reg_BX );
2395 num_outputs = x86_make_reg( file_REG32, reg_CX );
2396 temp = x86_make_reg( file_REG32, reg_DX );
2397
2398 /* Save EBX */
2399 x86_push( func, aos_output );
2400
2401 x86_mov( func, soa_output, x86_fn_arg( func, soa ) );
2402 x86_mov( func, aos_output, x86_fn_arg( func, aos ) );
2403 x86_mov( func, num_outputs, x86_fn_arg( func, num ) );
2404
2405 /* do */
2406 inner_loop = x86_get_label( func );
2407 {
2408 sse_movups( func, make_xmm( 0 ), x86_make_disp( soa_output, 0 ) );
2409 sse_movups( func, make_xmm( 1 ), x86_make_disp( soa_output, 16 ) );
2410 sse_movups( func, make_xmm( 3 ), x86_make_disp( soa_output, 32 ) );
2411 sse_movups( func, make_xmm( 4 ), x86_make_disp( soa_output, 48 ) );
2412
2413 sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
2414 sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
2415 sse_unpcklps( func, make_xmm( 0 ), make_xmm( 1 ) );
2416 sse_unpckhps( func, make_xmm( 2 ), make_xmm( 1 ) );
2417 sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) );
2418 sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) );
2419
2420 x86_mov( func, temp, x86_fn_arg( func, stride ) );
2421 x86_push( func, aos_output );
2422 sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
2423 sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
2424 x86_add( func, aos_output, temp );
2425 sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
2426 sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
2427 x86_add( func, aos_output, temp );
2428 sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
2429 sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
2430 x86_add( func, aos_output, temp );
2431 sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
2432 sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
2433 x86_pop( func, aos_output );
2434
2435 /* Advance to next output */
2436 x86_lea( func, aos_output, x86_make_disp(aos_output, 16) );
2437 x86_lea( func, soa_output, x86_make_disp(soa_output, 64) );
2438 }
2439 /* while --num_outputs */
2440 x86_dec( func, num_outputs );
2441 x86_jcc( func, cc_NE, inner_loop );
2442
2443 /* Restore EBX */
2444 x86_pop( func, aos_output );
2445 }
2446
/**
 * Translate a TGSI vertex/fragment shader to SSE2 code.
 * Slightly different things are done for vertex vs. fragment shaders.
 *
 * Note that fragment shaders are responsible for interpolating shader
 * inputs. Because on x86 we have only 4 GP registers, and here we
 * have 5 shader arguments (input, output, const, temp and coef), the
 * code is split into two phases -- DECLARATION and INSTRUCTION phase.
 * GP register holding the output argument is aliased with the coeff
 * argument, as outputs are not needed in the DECLARATION phase.
 *
 * \param tokens  the TGSI input shader
 * \param func  the output SSE code/function
 * \param immediates  buffer to place immediates, later passed to SSE func
 * \param do_swizzles  vertex shaders only: if TRUE, emit AoS->SoA input
 *                     conversion on entry and SoA->AoS output conversion
 *                     on exit
 * \return 1 for success, 0 if translation failed
 */
unsigned
tgsi_emit_sse2(
   const struct tgsi_token *tokens,
   struct x86_function *func,
   float (*immediates)[4],
   boolean do_swizzles )
{
   struct tgsi_parse_context parse;
   boolean instruction_phase = FALSE;
   unsigned ok = 1;
   uint num_immediates = 0;

   util_init_math();

   /* start emitting at the top of the code buffer */
   func->csr = func->store;

   tgsi_parse_init( &parse, tokens );

   /* Can't just use EDI, EBX without save/restoring them:
    */
   x86_push(
      func,
      get_immediate_base() );

   x86_push(
      func,
      get_temp_base() );


   /*
    * Different function args for vertex/fragment shaders:
    */
   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
      /* DECLARATION phase, do not load output argument. */
      x86_mov(
         func,
         get_input_base(),
         x86_fn_arg( func, 1 ) );
      /* skipping outputs argument here -- loaded later, when the
       * INSTRUCTION phase overwrites the coef register */
      x86_mov(
         func,
         get_const_base(),
         x86_fn_arg( func, 3 ) );
      x86_mov(
         func,
         get_temp_base(),
         x86_fn_arg( func, 4 ) );
      x86_mov(
         func,
         get_coef_base(),
         x86_fn_arg( func, 5 ) );
      x86_mov(
         func,
         get_immediate_base(),
         x86_fn_arg( func, 6 ) );
   }
   else {
      assert(parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX);

      /* convert AoS vertex inputs to the SoA layout the generated
       * code operates on */
      if (do_swizzles)
         aos_to_soa( func,
                     6,  /* aos_input */
                     1,  /* machine->input */
                     7,  /* num_inputs */
                     8 );  /* input_stride */

      x86_mov(
         func,
         get_input_base(),
         x86_fn_arg( func, 1 ) );
      x86_mov(
         func,
         get_output_base(),
         x86_fn_arg( func, 2 ) );
      x86_mov(
         func,
         get_const_base(),
         x86_fn_arg( func, 3 ) );
      x86_mov(
         func,
         get_temp_base(),
         x86_fn_arg( func, 4 ) );
      x86_mov(
         func,
         get_immediate_base(),
         x86_fn_arg( func, 5 ) );
   }

   /* translate tokens one by one; stop on first unsupported opcode */
   while( !tgsi_parse_end_of_tokens( &parse ) && ok ) {
      tgsi_parse_token( &parse );

      switch( parse.FullToken.Token.Type ) {
      case TGSI_TOKEN_TYPE_DECLARATION:
         if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
            emit_declaration(
               func,
               &parse.FullToken.FullDeclaration );
         }
         break;

      case TGSI_TOKEN_TYPE_INSTRUCTION:
         if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
            if( !instruction_phase ) {
               /* INSTRUCTION phase, overwrite coeff with output. */
               instruction_phase = TRUE;
               x86_mov(
                  func,
                  get_output_base(),
                  x86_fn_arg( func, 2 ) );
            }
         }

         ok = emit_instruction(
            func,
            &parse.FullToken.FullInstruction );

         if (!ok) {
            debug_printf("failed to translate tgsi opcode %d to SSE (%s)\n",
                         parse.FullToken.FullInstruction.Instruction.Opcode,
                         parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX ?
                         "vertex shader" : "fragment shader");
         }
         break;

      case TGSI_TOKEN_TYPE_IMMEDIATE:
         /* simply copy the immediate values into the next immediates[] slot */
         {
            /* Size includes the token header, hence the -1 */
            const uint size = parse.FullToken.FullImmediate.Immediate.Size - 1;
            uint i;
            assert(size <= 4);
            assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES);
            for( i = 0; i < size; i++ ) {
               immediates[num_immediates][i] =
                  parse.FullToken.FullImmediate.u.ImmediateFloat32[i].Float;
            }
#if 0
            debug_printf("SSE FS immediate[%d] = %f %f %f %f\n",
                         num_immediates,
                         immediates[num_immediates][0],
                         immediates[num_immediates][1],
                         immediates[num_immediates][2],
                         immediates[num_immediates][3]);
#endif
            num_immediates++;
         }
         break;

      default:
         ok = 0;
         assert( 0 );
      }
   }

   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
      /* convert SoA results back to the caller's AoS output layout */
      if (do_swizzles)
         soa_to_aos( func, 9, 2, 10, 11 );
   }

   /* Can't just use EBX, EDI without save/restoring them:
    */
   x86_pop(
      func,
      get_temp_base() );

   x86_pop(
      func,
      get_immediate_base() );

   emit_ret( func );

   tgsi_parse_free( &parse );

   return ok;
}
2637
#endif /* PIPE_ARCH_X86 && PIPE_ARCH_SSE */
2639