1 /**************************************************************************
2 *
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28 #include "pipe/p_config.h"
29
30 #if defined(PIPE_ARCH_X86) && defined(PIPE_ARCH_SSE)
31
32 #include "pipe/p_debug.h"
33 #include "pipe/p_shader_tokens.h"
34 #include "util/u_math.h"
35 #include "util/u_sse.h"
36 #include "tgsi/tgsi_parse.h"
37 #include "tgsi/tgsi_util.h"
38 #include "tgsi_exec.h"
39 #include "tgsi_sse2.h"
40
41 #include "rtasm/rtasm_x86sse.h"
42
43 /* Use a Newton-Raphson refinement step for 1/sqrt().
44 *
45 * This costs about 100 fps (close to 10%) in gears:
46 */
47 #define HIGH_PRECISION 1
48
49 #define FAST_MATH 1
50
51
52 #define FOR_EACH_CHANNEL( CHAN )\
53 for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)
54
55 #define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
56 ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))
57
58 #define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
59 if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))
60
61 #define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
62 FOR_EACH_CHANNEL( CHAN )\
63 IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )
64
65 #define CHAN_X 0
66 #define CHAN_Y 1
67 #define CHAN_Z 2
68 #define CHAN_W 3
69
70 #define TEMP_ONE_I TGSI_EXEC_TEMP_ONE_I
71 #define TEMP_ONE_C TGSI_EXEC_TEMP_ONE_C
72
73 #define TEMP_R0 TGSI_EXEC_TEMP_R0
74 #define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
75
76 /**
77 * X86 utility functions.
78 */
79
80 static struct x86_reg
81 make_xmm(
82 unsigned xmm )
83 {
84 return x86_make_reg(
85 file_XMM,
86 (enum x86_reg_name) xmm );
87 }
88
89 /**
90 * X86 register mapping helpers.
91 */
92
93 static struct x86_reg
94 get_const_base( void )
95 {
96 return x86_make_reg(
97 file_REG32,
98 reg_CX );
99 }
100
101 static struct x86_reg
102 get_input_base( void )
103 {
104 return x86_make_reg(
105 file_REG32,
106 reg_AX );
107 }
108
109 static struct x86_reg
110 get_output_base( void )
111 {
112 return x86_make_reg(
113 file_REG32,
114 reg_DX );
115 }
116
117 static struct x86_reg
118 get_temp_base( void )
119 {
120 return x86_make_reg(
121 file_REG32,
122 reg_BX );
123 }
124
125 static struct x86_reg
126 get_coef_base( void )
127 {
128 return get_output_base();
129 }
130
131 static struct x86_reg
132 get_immediate_base( void )
133 {
134 return x86_make_reg(
135 file_REG32,
136 reg_DI );
137 }
138
139
140 /**
141 * Data access helpers.
142 */
143
144
145 static struct x86_reg
146 get_immediate(
147 unsigned vec,
148 unsigned chan )
149 {
150 return x86_make_disp(
151 get_immediate_base(),
152 (vec * 4 + chan) * 4 );
153 }
154
155 static struct x86_reg
156 get_const(
157 unsigned vec,
158 unsigned chan )
159 {
160 return x86_make_disp(
161 get_const_base(),
162 (vec * 4 + chan) * 4 );
163 }
164
165 static struct x86_reg
166 get_input(
167 unsigned vec,
168 unsigned chan )
169 {
170 return x86_make_disp(
171 get_input_base(),
172 (vec * 4 + chan) * 16 );
173 }
174
175 static struct x86_reg
176 get_output(
177 unsigned vec,
178 unsigned chan )
179 {
180 return x86_make_disp(
181 get_output_base(),
182 (vec * 4 + chan) * 16 );
183 }
184
185 static struct x86_reg
186 get_temp(
187 unsigned vec,
188 unsigned chan )
189 {
190 return x86_make_disp(
191 get_temp_base(),
192 (vec * 4 + chan) * 16 );
193 }
194
195 static struct x86_reg
196 get_coef(
197 unsigned vec,
198 unsigned chan,
199 unsigned member )
200 {
201 return x86_make_disp(
202 get_coef_base(),
203 ((vec * 3 + member) * 4 + chan) * 4 );
204 }
205
206
207 static void
208 emit_ret(
209 struct x86_function *func )
210 {
211 x86_ret( func );
212 }
213
214
215 /**
216 * Data fetch helpers.
217 */
218
219 /**
220 * Copy a shader constant to xmm register
221 * \param xmm the destination xmm register
222 * \param vec the src const buffer index
223 * \param chan src channel to fetch (X, Y, Z or W)
224 */
225 static void
226 emit_const(
227 struct x86_function *func,
228 uint xmm,
229 int vec,
230 uint chan,
231 uint indirect,
232 uint indirectFile,
233 int indirectIndex )
234 {
235 if (indirect) {
236 struct x86_reg r0 = get_input_base();
237 struct x86_reg r1 = get_output_base();
238 uint i;
239
240 assert( indirectFile == TGSI_FILE_ADDRESS );
241 assert( indirectIndex == 0 );
242
243 x86_push( func, r0 );
244 x86_push( func, r1 );
245
246 for (i = 0; i < QUAD_SIZE; i++) {
247 x86_lea( func, r0, get_const( vec, chan ) );
248 x86_mov( func, r1, x86_make_disp( get_temp( TEMP_ADDR, CHAN_X ), i * 4 ) );
249
250 /* Quick hack to multiply by 16 -- need to add SHL to rtasm.
251 */
252 x86_add( func, r1, r1 );
253 x86_add( func, r1, r1 );
254 x86_add( func, r1, r1 );
255 x86_add( func, r1, r1 );
256
257 x86_add( func, r0, r1 );
258 x86_mov( func, r1, x86_deref( r0 ) );
259 x86_mov( func, x86_make_disp( get_temp( TEMP_R0, CHAN_X ), i * 4 ), r1 );
260 }
261
262 x86_pop( func, r1 );
263 x86_pop( func, r0 );
264
265 sse_movaps(
266 func,
267 make_xmm( xmm ),
268 get_temp( TEMP_R0, CHAN_X ) );
269 }
270 else {
271 assert( vec >= 0 );
272
273 sse_movss(
274 func,
275 make_xmm( xmm ),
276 get_const( vec, chan ) );
277 sse_shufps(
278 func,
279 make_xmm( xmm ),
280 make_xmm( xmm ),
281 SHUF( 0, 0, 0, 0 ) );
282 }
283 }
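
/* With indirect addressing, the loop above effectively computes, for each
 * of the four quad elements i (a reading of the emitted code, not extra
 * code):
 *
 *    TEMP_R0.x[i] = CONST[ vec + ADDR.x[i] ].chan
 *
 * and the final movaps loads those four gathered values into the
 * destination xmm register.
 */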
284
285 static void
286 emit_immediate(
287 struct x86_function *func,
288 unsigned xmm,
289 unsigned vec,
290 unsigned chan )
291 {
292 sse_movss(
293 func,
294 make_xmm( xmm ),
295 get_immediate( vec, chan ) );
296 sse_shufps(
297 func,
298 make_xmm( xmm ),
299 make_xmm( xmm ),
300 SHUF( 0, 0, 0, 0 ) );
301 }
302
303
304 /**
305 * Copy a shader input to xmm register
306 * \param xmm the destination xmm register
307 * \param vec the src input attrib
308 * \param chan src channel to fetch (X, Y, Z or W)
309 */
310 static void
311 emit_inputf(
312 struct x86_function *func,
313 unsigned xmm,
314 unsigned vec,
315 unsigned chan )
316 {
317 sse_movups(
318 func,
319 make_xmm( xmm ),
320 get_input( vec, chan ) );
321 }
322
323 /**
324 * Store an xmm register to a shader output
325 * \param xmm the source xmm register
326 * \param vec the dest output attrib
327 * \param chan the dest channel to store (X, Y, Z or W)
328 */
329 static void
330 emit_output(
331 struct x86_function *func,
332 unsigned xmm,
333 unsigned vec,
334 unsigned chan )
335 {
336 sse_movups(
337 func,
338 get_output( vec, chan ),
339 make_xmm( xmm ) );
340 }
341
342 /**
343 * Copy a shader temporary to xmm register
344 * \param xmm the destination xmm register
345 * \param vec the src temp register
346 * \param chan src channel to fetch (X, Y, Z or W)
347 */
348 static void
349 emit_tempf(
350 struct x86_function *func,
351 unsigned xmm,
352 unsigned vec,
353 unsigned chan )
354 {
355 sse_movaps(
356 func,
357 make_xmm( xmm ),
358 get_temp( vec, chan ) );
359 }
360
361 /**
362 * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
363 * \param xmm the destination xmm register
364 * \param vec the src input/attribute coefficient index
365 * \param chan src channel to fetch (X, Y, Z or W)
366 * \param member 0=a0, 1=dadx, 2=dady
367 */
368 static void
369 emit_coef(
370 struct x86_function *func,
371 unsigned xmm,
372 unsigned vec,
373 unsigned chan,
374 unsigned member )
375 {
376 sse_movss(
377 func,
378 make_xmm( xmm ),
379 get_coef( vec, chan, member ) );
380 sse_shufps(
381 func,
382 make_xmm( xmm ),
383 make_xmm( xmm ),
384 SHUF( 0, 0, 0, 0 ) );
385 }
386
387 /**
388 * Data store helpers.
389 */
390
391 static void
392 emit_inputs(
393 struct x86_function *func,
394 unsigned xmm,
395 unsigned vec,
396 unsigned chan )
397 {
398 sse_movups(
399 func,
400 get_input( vec, chan ),
401 make_xmm( xmm ) );
402 }
403
404 static void
405 emit_temps(
406 struct x86_function *func,
407 unsigned xmm,
408 unsigned vec,
409 unsigned chan )
410 {
411 sse_movaps(
412 func,
413 get_temp( vec, chan ),
414 make_xmm( xmm ) );
415 }
416
417 static void
418 emit_addrs(
419 struct x86_function *func,
420 unsigned xmm,
421 unsigned vec,
422 unsigned chan )
423 {
424 assert( vec == 0 );
425
426 emit_temps(
427 func,
428 xmm,
429 vec + TGSI_EXEC_TEMP_ADDR,
430 chan );
431 }
432
433 /**
434 * Coefficient fetch helpers.
435 */
436
437 static void
438 emit_coef_a0(
439 struct x86_function *func,
440 unsigned xmm,
441 unsigned vec,
442 unsigned chan )
443 {
444 emit_coef(
445 func,
446 xmm,
447 vec,
448 chan,
449 0 );
450 }
451
452 static void
453 emit_coef_dadx(
454 struct x86_function *func,
455 unsigned xmm,
456 unsigned vec,
457 unsigned chan )
458 {
459 emit_coef(
460 func,
461 xmm,
462 vec,
463 chan,
464 1 );
465 }
466
467 static void
468 emit_coef_dady(
469 struct x86_function *func,
470 unsigned xmm,
471 unsigned vec,
472 unsigned chan )
473 {
474 emit_coef(
475 func,
476 xmm,
477 vec,
478 chan,
479 2 );
480 }
481
482 /**
483 * Function call helpers.
484 */
485
486 /**
487 * NOTE: With gcc, if the called function uses SSE intrinsics, it must be
488 * defined with __attribute__((force_align_arg_pointer)), as we do not guarantee
489 * that the stack pointer is 16-byte aligned as it expects.
490 */
491 static void
492 emit_func_call_dst(
493 struct x86_function *func,
494 unsigned xmm_save,
495 unsigned xmm_dst,
496 void (PIPE_CDECL *code)() )
497 {
498 struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
499 unsigned i, n;
500 unsigned xmm_mask;
501
502 /* Bitmask of the xmm registers to save */
503 xmm_mask = (1 << xmm_save) - 1;
504 xmm_mask &= ~(1 << xmm_dst);
505
506 sse_movaps(
507 func,
508 get_temp( TEMP_R0, 0 ),
509 make_xmm( xmm_dst ) );
510
511 x86_push(
512 func,
513 x86_make_reg( file_REG32, reg_AX) );
514 x86_push(
515 func,
516 x86_make_reg( file_REG32, reg_CX) );
517 x86_push(
518 func,
519 x86_make_reg( file_REG32, reg_DX) );
520
521 for(i = 0, n = 0; i < 8; ++i)
522 if(xmm_mask & (1 << i))
523 ++n;
524
525 x86_sub_imm(
526 func,
527 x86_make_reg( file_REG32, reg_SP ),
528 n*16);
529
530 for(i = 0, n = 0; i < 8; ++i)
531 if(xmm_mask & (1 << i)) {
532 sse_movups(
533 func,
534 x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ),
535 make_xmm( i ) );
536 ++n;
537 }
538
539 x86_lea(
540 func,
541 ecx,
542 get_temp( TEMP_R0, 0 ) );
543
544 x86_push( func, ecx );
545 x86_mov_reg_imm( func, ecx, (unsigned long) code );
546 x86_call( func, ecx );
547 x86_pop(func, ecx );
548
549 for(i = 0, n = 0; i < 8; ++i)
550 if(xmm_mask & (1 << i)) {
551 sse_movups(
552 func,
553 make_xmm( i ),
554 x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ) );
555 ++n;
556 }
557
558 x86_add_imm(
559 func,
560 x86_make_reg( file_REG32, reg_SP ),
561 n*16);
562
563 /* Restore GP registers in reverse order.
564 */
565 x86_pop(
566 func,
567 x86_make_reg( file_REG32, reg_DX) );
568 x86_pop(
569 func,
570 x86_make_reg( file_REG32, reg_CX) );
571 x86_pop(
572 func,
573 x86_make_reg( file_REG32, reg_AX) );
574
575 sse_movaps(
576 func,
577 make_xmm( xmm_dst ),
578 get_temp( TEMP_R0, 0 ) );
579 }
580
581 static void
582 emit_func_call_dst_src(
583 struct x86_function *func,
584 unsigned xmm_save,
585 unsigned xmm_dst,
586 unsigned xmm_src,
587 void (PIPE_CDECL *code)() )
588 {
589 sse_movaps(
590 func,
591 get_temp( TEMP_R0, 1 ),
592 make_xmm( xmm_src ) );
593
594 emit_func_call_dst(
595 func,
596 xmm_save,
597 xmm_dst,
598 code );
599 }
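
/* A sketch of the calling convention the two emitters above establish (our
 * reading of the code, not an official interface description): the
 * generated code spills xmm_dst to TEMP_R0[0..3] (and, for dst/src calls,
 * xmm_src to TEMP_R0[4..7]), loads a pointer to that storage into ECX,
 * pushes it as the single PIPE_CDECL argument and calls the helper, which
 * reads and overwrites store[0..3] in place -- see cos4f() and pow4f()
 * below.
 */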
600
601 /*
602 * Fast SSE2 implementation of special math functions.
603 */
604
605 #define POLY0(x, c0) _mm_set1_ps(c0)
606 #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
607 #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
608 #define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
609 #define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
610 #define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
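
/* These macros expand to a Horner-form evaluation, e.g.
 *
 *    POLY2(x, c0, c1, c2) == c0 + x*(c1 + x*c2)
 *
 * so a degree-n polynomial costs n mulps/addps pairs per vector.
 */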
611
612 #define EXP_POLY_DEGREE 3
613 #define LOG_POLY_DEGREE 5
614
615 /**
616 * See http://www.devmaster.net/forums/showthread.php?p=43580
617 */
618 static INLINE __m128
619 exp2f4(__m128 x)
620 {
621 __m128i ipart;
622 __m128 fpart, expipart, expfpart;
623
624 x = _mm_min_ps(x, _mm_set1_ps( 129.00000f));
625 x = _mm_max_ps(x, _mm_set1_ps(-126.99999f));
626
627 /* ipart = int(x - 0.5) */
628 ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f)));
629
630 /* fpart = x - ipart */
631 fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart));
632
633 /* expipart = (float) (1 << ipart) */
634 expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23));
635
636 /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
637 #if EXP_POLY_DEGREE == 5
638 expfpart = POLY5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f);
639 #elif EXP_POLY_DEGREE == 4
640 expfpart = POLY4(fpart, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 5.2011464e-2f, 1.3534167e-2f);
641 #elif EXP_POLY_DEGREE == 3
642 expfpart = POLY3(fpart, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 7.8024521e-2f);
643 #elif EXP_POLY_DEGREE == 2
644 expfpart = POLY2(fpart, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f);
645 #else
646 #error
647 #endif
648
649 return _mm_mul_ps(expipart, expfpart);
650 }
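
#if 0
/* A rough scalar sketch of the decomposition used by exp2f4() above, for
 * illustration only (never compiled).  The helper name is ours, the SSE
 * rounding details are glossed over, and a 32-bit int with IEEE single
 * precision floats is assumed.
 */
static float
exp2f_scalar_sketch( float x )
{
   union { float f; int i; } pow2int;
   float ipart, fpart;

   if (x >  129.0f)     x =  129.0f;
   if (x < -126.99999f) x = -126.99999f;

   ipart = floorf( x );           /* integer part */
   fpart = x - ipart;             /* fractional part */

   /* build 2^ipart directly in the float exponent field */
   pow2int.i = ((int) ipart + 127) << 23;

   /* approximate 2^fpart with the same degree-3 polynomial as above */
   return pow2int.f * (9.9992520e-1f +
                       fpart * (6.9583356e-1f +
                                fpart * (2.2606716e-1f +
                                         fpart * 7.8024521e-2f)));
}
#endif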
651
652 /**
653 * See http://www.devmaster.net/forums/showthread.php?p=43580
654 */
655 static INLINE __m128
656 log2f4(__m128 x)
657 {
658 __m128i expmask = _mm_set1_epi32(0x7f800000);
659 __m128i mantmask = _mm_set1_epi32(0x007fffff);
660 __m128 one = _mm_set1_ps(1.0f);
661
662 __m128i i = _mm_castps_si128(x);
663
664 /* exp = (float) exponent(x) */
665 __m128 exp = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, expmask), 23), _mm_set1_epi32(127)));
666
667 /* mant = (float) mantissa(x) */
668 __m128 mant = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, mantmask)), one);
669
670 __m128 logmant;
671
672 /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
673 * These coefficients can be generated with
674 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
675 */
676 #if LOG_POLY_DEGREE == 6
677 logmant = POLY5(mant, 3.11578814719469302614f, -3.32419399085241980044f, 2.59883907202499966007f, -1.23152682416275988241f, 0.318212422185251071475f, -0.0344359067839062357313f);
678 #elif LOG_POLY_DEGREE == 5
679 logmant = POLY4(mant, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
680 #elif LOG_POLY_DEGREE == 4
681 logmant = POLY3(mant, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
682 #elif LOG_POLY_DEGREE == 3
683 logmant = POLY2(mant, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
684 #else
685 #error
686 #endif
687
688 /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0 */
689 logmant = _mm_mul_ps(logmant, _mm_sub_ps(mant, one));
690
691 return _mm_add_ps(logmant, exp);
692 }
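
/* Worked example: for x = 8.0f the exponent field yields exp = 3.0 and the
 * mantissa reduces to mant = 1.0, so logmant = poly(1.0) * (1.0 - 1.0) = 0
 * and the result is exactly 3.0 = log2(8).
 */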
693
694 static INLINE __m128
695 powf4(__m128 x, __m128 y)
696 {
697 return exp2f4(_mm_mul_ps(log2f4(x), y));
698 }
699
700
701 /**
702 * Low-level instruction translators.
703 */
704
705 static void
706 emit_abs(
707 struct x86_function *func,
708 unsigned xmm )
709 {
710 sse_andps(
711 func,
712 make_xmm( xmm ),
713 get_temp(
714 TGSI_EXEC_TEMP_7FFFFFFF_I,
715 TGSI_EXEC_TEMP_7FFFFFFF_C ) );
716 }
717
718 static void
719 emit_add(
720 struct x86_function *func,
721 unsigned xmm_dst,
722 unsigned xmm_src )
723 {
724 sse_addps(
725 func,
726 make_xmm( xmm_dst ),
727 make_xmm( xmm_src ) );
728 }
729
730 static void PIPE_CDECL
731 cos4f(
732 float *store )
733 {
734 store[0] = cosf( store[0] );
735 store[1] = cosf( store[1] );
736 store[2] = cosf( store[2] );
737 store[3] = cosf( store[3] );
738 }
739
740 static void
741 emit_cos(
742 struct x86_function *func,
743 unsigned xmm_save,
744 unsigned xmm_dst )
745 {
746 emit_func_call_dst(
747 func,
748 xmm_save,
749 xmm_dst,
750 cos4f );
751 }
752
753 static void PIPE_CDECL
754 #if defined(PIPE_CC_GCC)
755 __attribute__((force_align_arg_pointer))
756 #endif
757 ex24f(
758 float *store )
759 {
760 _mm_store_ps(&store[0], exp2f4( _mm_load_ps(&store[0]) ));
761 }
762
763 static void
764 emit_ex2(
765 struct x86_function *func,
766 unsigned xmm_save,
767 unsigned xmm_dst )
768 {
769 emit_func_call_dst(
770 func,
771 xmm_save,
772 xmm_dst,
773 ex24f );
774 }
775
776 static void
777 emit_f2it(
778 struct x86_function *func,
779 unsigned xmm )
780 {
781 sse2_cvttps2dq(
782 func,
783 make_xmm( xmm ),
784 make_xmm( xmm ) );
785 }
786
787 static void PIPE_CDECL
788 flr4f(
789 float *store )
790 {
791 store[0] = floorf( store[0] );
792 store[1] = floorf( store[1] );
793 store[2] = floorf( store[2] );
794 store[3] = floorf( store[3] );
795 }
796
797 static void
798 emit_flr(
799 struct x86_function *func,
800 unsigned xmm_save,
801 unsigned xmm_dst )
802 {
803 emit_func_call_dst(
804 func,
805 xmm_save,
806 xmm_dst,
807 flr4f );
808 }
809
810 static void PIPE_CDECL
811 frc4f(
812 float *store )
813 {
814 store[0] -= floorf( store[0] );
815 store[1] -= floorf( store[1] );
816 store[2] -= floorf( store[2] );
817 store[3] -= floorf( store[3] );
818 }
819
820 static void
821 emit_frc(
822 struct x86_function *func,
823 unsigned xmm_save,
824 unsigned xmm_dst )
825 {
826 emit_func_call_dst(
827 func,
828 xmm_save,
829 xmm_dst,
830 frc4f );
831 }
832
833 static void PIPE_CDECL
834 #if defined(PIPE_CC_GCC)
835 __attribute__((force_align_arg_pointer))
836 #endif
837 lg24f(
838 float *store )
839 {
840 _mm_store_ps(&store[0], log2f4( _mm_load_ps(&store[0]) ));
841 }
842
843 static void
844 emit_lg2(
845 struct x86_function *func,
846 unsigned xmm_save,
847 unsigned xmm_dst )
848 {
849 emit_func_call_dst(
850 func,
851 xmm_save,
852 xmm_dst,
853 lg24f );
854 }
855
856 static void
857 emit_MOV(
858 struct x86_function *func,
859 unsigned xmm_dst,
860 unsigned xmm_src )
861 {
862 sse_movups(
863 func,
864 make_xmm( xmm_dst ),
865 make_xmm( xmm_src ) );
866 }
867
868 static void
869 emit_mul (struct x86_function *func,
870 unsigned xmm_dst,
871 unsigned xmm_src)
872 {
873 sse_mulps(
874 func,
875 make_xmm( xmm_dst ),
876 make_xmm( xmm_src ) );
877 }
878
879 static void
880 emit_neg(
881 struct x86_function *func,
882 unsigned xmm )
883 {
884 sse_xorps(
885 func,
886 make_xmm( xmm ),
887 get_temp(
888 TGSI_EXEC_TEMP_80000000_I,
889 TGSI_EXEC_TEMP_80000000_C ) );
890 }
891
892 static void PIPE_CDECL
893 #if defined(PIPE_CC_GCC)
894 __attribute__((force_align_arg_pointer))
895 #endif
896 pow4f(
897 float *store )
898 {
899 #if 1
900 _mm_store_ps(&store[0], powf4( _mm_load_ps(&store[0]), _mm_load_ps(&store[4]) ));
901 #else
902 store[0] = powf( store[0], store[4] );
903 store[1] = powf( store[1], store[5] );
904 store[2] = powf( store[2], store[6] );
905 store[3] = powf( store[3], store[7] );
906 #endif
907 }
908
909 static void
910 emit_pow(
911 struct x86_function *func,
912 unsigned xmm_save,
913 unsigned xmm_dst,
914 unsigned xmm_src )
915 {
916 emit_func_call_dst_src(
917 func,
918 xmm_save,
919 xmm_dst,
920 xmm_src,
921 pow4f );
922 }
923
924 static void
925 emit_rcp (
926 struct x86_function *func,
927 unsigned xmm_dst,
928 unsigned xmm_src )
929 {
930 /* On Intel CPUs at least, this is only accurate to 12 bits -- not
931 * good enough. Need to either emit a proper divide or use the
932 * iterative technique described below in emit_rsqrt().
933 */
934 sse2_rcpps(
935 func,
936 make_xmm( xmm_dst ),
937 make_xmm( xmm_src ) );
938 }
939
940 static void
941 emit_rsqrt(
942 struct x86_function *func,
943 unsigned xmm_dst,
944 unsigned xmm_src )
945 {
946 #if HIGH_PRECISION
947 /* Although rsqrtps() and rcpps() are low precision on some/all SSE
948 * implementations, it is possible to improve their precision at
949 * fairly low cost, using a Newton-Raphson step, as below:
950 *
951 * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
952 * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
953 *
954 * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
955 */
956 {
957 struct x86_reg dst = make_xmm( xmm_dst );
958 struct x86_reg src = make_xmm( xmm_src );
959 struct x86_reg tmp0 = make_xmm( 2 );
960 struct x86_reg tmp1 = make_xmm( 3 );
961
962 assert( xmm_dst != xmm_src );
963 assert( xmm_dst != 2 && xmm_dst != 3 );
964 assert( xmm_src != 2 && xmm_src != 3 );
965
966 sse_movaps( func, dst, get_temp( TGSI_EXEC_TEMP_HALF_I, TGSI_EXEC_TEMP_HALF_C ) );
967 sse_movaps( func, tmp0, get_temp( TGSI_EXEC_TEMP_THREE_I, TGSI_EXEC_TEMP_THREE_C ) );
968 sse_rsqrtps( func, tmp1, src );
969 sse_mulps( func, src, tmp1 );
970 sse_mulps( func, dst, tmp1 );
971 sse_mulps( func, src, tmp1 );
972 sse_subps( func, tmp0, src );
973 sse_mulps( func, dst, tmp0 );
974 }
975 #else
976 /* On Intel CPUs at least, this is only accurate to 12 bits -- not
977 * good enough.
978 */
979 sse_rsqrtps(
980 func,
981 make_xmm( xmm_dst ),
982 make_xmm( xmm_src ) );
983 #endif
984 }
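
#if 0
/* Scalar sketch of the Newton-Raphson refinement emitted above, for
 * illustration only (never compiled); rsqrt_est stands for the ~12-bit
 * rsqrtps() estimate and the name is ours:
 */
static float
rsqrt_refine_sketch( float a, float rsqrt_est )
{
   return 0.5f * rsqrt_est * (3.0f - (a * rsqrt_est) * rsqrt_est);
}
#endif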
985
986 static void
987 emit_setsign(
988 struct x86_function *func,
989 unsigned xmm )
990 {
991 sse_orps(
992 func,
993 make_xmm( xmm ),
994 get_temp(
995 TGSI_EXEC_TEMP_80000000_I,
996 TGSI_EXEC_TEMP_80000000_C ) );
997 }
998
999 static void PIPE_CDECL
1000 sin4f(
1001 float *store )
1002 {
1003 store[0] = sinf( store[0] );
1004 store[1] = sinf( store[1] );
1005 store[2] = sinf( store[2] );
1006 store[3] = sinf( store[3] );
1007 }
1008
1009 static void
1010 emit_sin (struct x86_function *func,
1011 unsigned xmm_save,
1012 unsigned xmm_dst)
1013 {
1014 emit_func_call_dst(
1015 func,
1016 xmm_save,
1017 xmm_dst,
1018 sin4f );
1019 }
1020
1021 static void
1022 emit_sub(
1023 struct x86_function *func,
1024 unsigned xmm_dst,
1025 unsigned xmm_src )
1026 {
1027 sse_subps(
1028 func,
1029 make_xmm( xmm_dst ),
1030 make_xmm( xmm_src ) );
1031 }
1032
1033 /**
1034 * Register fetch.
1035 */
1036
1037 static void
1038 emit_fetch(
1039 struct x86_function *func,
1040 unsigned xmm,
1041 const struct tgsi_full_src_register *reg,
1042 const unsigned chan_index )
1043 {
1044 unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index );
1045
1046 switch (swizzle) {
1047 case TGSI_EXTSWIZZLE_X:
1048 case TGSI_EXTSWIZZLE_Y:
1049 case TGSI_EXTSWIZZLE_Z:
1050 case TGSI_EXTSWIZZLE_W:
1051 switch (reg->SrcRegister.File) {
1052 case TGSI_FILE_CONSTANT:
1053 emit_const(
1054 func,
1055 xmm,
1056 reg->SrcRegister.Index,
1057 swizzle,
1058 reg->SrcRegister.Indirect,
1059 reg->SrcRegisterInd.File,
1060 reg->SrcRegisterInd.Index );
1061 break;
1062
1063 case TGSI_FILE_IMMEDIATE:
1064 emit_immediate(
1065 func,
1066 xmm,
1067 reg->SrcRegister.Index,
1068 swizzle );
1069 break;
1070
1071 case TGSI_FILE_INPUT:
1072 emit_inputf(
1073 func,
1074 xmm,
1075 reg->SrcRegister.Index,
1076 swizzle );
1077 break;
1078
1079 case TGSI_FILE_TEMPORARY:
1080 emit_tempf(
1081 func,
1082 xmm,
1083 reg->SrcRegister.Index,
1084 swizzle );
1085 break;
1086
1087 default:
1088 assert( 0 );
1089 }
1090 break;
1091
1092 case TGSI_EXTSWIZZLE_ZERO:
1093 emit_tempf(
1094 func,
1095 xmm,
1096 TGSI_EXEC_TEMP_00000000_I,
1097 TGSI_EXEC_TEMP_00000000_C );
1098 break;
1099
1100 case TGSI_EXTSWIZZLE_ONE:
1101 emit_tempf(
1102 func,
1103 xmm,
1104 TEMP_ONE_I,
1105 TEMP_ONE_C );
1106 break;
1107
1108 default:
1109 assert( 0 );
1110 }
1111
1112 switch( tgsi_util_get_full_src_register_sign_mode( reg, chan_index ) ) {
1113 case TGSI_UTIL_SIGN_CLEAR:
1114 emit_abs( func, xmm );
1115 break;
1116
1117 case TGSI_UTIL_SIGN_SET:
1118 emit_setsign( func, xmm );
1119 break;
1120
1121 case TGSI_UTIL_SIGN_TOGGLE:
1122 emit_neg( func, xmm );
1123 break;
1124
1125 case TGSI_UTIL_SIGN_KEEP:
1126 break;
1127 }
1128 }
1129
1130 #define FETCH( FUNC, INST, XMM, INDEX, CHAN )\
1131 emit_fetch( FUNC, XMM, &(INST).FullSrcRegisters[INDEX], CHAN )
1132
1133 /**
1134 * Register store.
1135 */
1136
1137 static void
1138 emit_store(
1139 struct x86_function *func,
1140 unsigned xmm,
1141 const struct tgsi_full_dst_register *reg,
1142 const struct tgsi_full_instruction *inst,
1143 unsigned chan_index )
1144 {
1145 switch( reg->DstRegister.File ) {
1146 case TGSI_FILE_OUTPUT:
1147 emit_output(
1148 func,
1149 xmm,
1150 reg->DstRegister.Index,
1151 chan_index );
1152 break;
1153
1154 case TGSI_FILE_TEMPORARY:
1155 emit_temps(
1156 func,
1157 xmm,
1158 reg->DstRegister.Index,
1159 chan_index );
1160 break;
1161
1162 case TGSI_FILE_ADDRESS:
1163 emit_addrs(
1164 func,
1165 xmm,
1166 reg->DstRegister.Index,
1167 chan_index );
1168 break;
1169
1170 default:
1171 assert( 0 );
1172 }
1173
1174 switch( inst->Instruction.Saturate ) {
1175 case TGSI_SAT_NONE:
1176 break;
1177
1178 case TGSI_SAT_ZERO_ONE:
1179 /* assert( 0 ); */
1180 break;
1181
1182 case TGSI_SAT_MINUS_PLUS_ONE:
1183 assert( 0 );
1184 break;
1185 }
1186 }
1187
1188 #define STORE( FUNC, INST, XMM, INDEX, CHAN )\
1189 emit_store( FUNC, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN )
1190
1191 /**
1192 * High-level instruction translators.
1193 */
1194
1195 static void
1196 emit_kil(
1197 struct x86_function *func,
1198 const struct tgsi_full_src_register *reg )
1199 {
1200 unsigned uniquemask;
1201 unsigned registers[4];
1202 unsigned nextregister = 0;
1203 unsigned firstchan = ~0;
1204 unsigned chan_index;
1205
1206 /* This mask stores component bits that were already tested. Note that
1207 * we test whether the value is less than zero, so 1.0 and 0.0 need not be
1208 * tested. */
1209 uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE);
1210
1211 FOR_EACH_CHANNEL( chan_index ) {
1212 unsigned swizzle;
1213
1214 /* unswizzle channel */
1215 swizzle = tgsi_util_get_full_src_register_extswizzle(
1216 reg,
1217 chan_index );
1218
1219 /* check if the component has not been already tested */
1220 if( !(uniquemask & (1 << swizzle)) ) {
1221 uniquemask |= 1 << swizzle;
1222
1223 /* allocate register */
1224 registers[chan_index] = nextregister;
1225 emit_fetch(
1226 func,
1227 nextregister,
1228 reg,
1229 chan_index );
1230 nextregister++;
1231
1232 /* mark the first channel used */
1233 if( firstchan == ~0 ) {
1234 firstchan = chan_index;
1235 }
1236 }
1237 }
1238
1239 x86_push(
1240 func,
1241 x86_make_reg( file_REG32, reg_AX ) );
1242 x86_push(
1243 func,
1244 x86_make_reg( file_REG32, reg_DX ) );
1245
1246 FOR_EACH_CHANNEL( chan_index ) {
1247 if( uniquemask & (1 << chan_index) ) {
1248 sse_cmpps(
1249 func,
1250 make_xmm( registers[chan_index] ),
1251 get_temp(
1252 TGSI_EXEC_TEMP_00000000_I,
1253 TGSI_EXEC_TEMP_00000000_C ),
1254 cc_LessThan );
1255
1256 if( chan_index == firstchan ) {
1257 sse_pmovmskb(
1258 func,
1259 x86_make_reg( file_REG32, reg_AX ),
1260 make_xmm( registers[chan_index] ) );
1261 }
1262 else {
1263 sse_pmovmskb(
1264 func,
1265 x86_make_reg( file_REG32, reg_DX ),
1266 make_xmm( registers[chan_index] ) );
1267 x86_or(
1268 func,
1269 x86_make_reg( file_REG32, reg_AX ),
1270 x86_make_reg( file_REG32, reg_DX ) );
1271 }
1272 }
1273 }
1274
1275 x86_or(
1276 func,
1277 get_temp(
1278 TGSI_EXEC_TEMP_KILMASK_I,
1279 TGSI_EXEC_TEMP_KILMASK_C ),
1280 x86_make_reg( file_REG32, reg_AX ) );
1281
1282 x86_pop(
1283 func,
1284 x86_make_reg( file_REG32, reg_DX ) );
1285 x86_pop(
1286 func,
1287 x86_make_reg( file_REG32, reg_AX ) );
1288 }
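
/* The mask accumulated in AX above has four bits set for every quad element
 * whose tested component is negative (pmovmskb picks up the sign bit of each
 * byte of the comparison result); OR-ing it into TEMP_KILMASK marks those
 * pixels as killed.
 */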
1289
1290
1291 static void
1292 emit_kilp(
1293 struct x86_function *func )
1294 {
1295 /* XXX todo / fix me */
1296 }
1297
1298
1299 static void
1300 emit_setcc(
1301 struct x86_function *func,
1302 struct tgsi_full_instruction *inst,
1303 enum sse_cc cc )
1304 {
1305 unsigned chan_index;
1306
1307 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1308 FETCH( func, *inst, 0, 0, chan_index );
1309 FETCH( func, *inst, 1, 1, chan_index );
1310 sse_cmpps(
1311 func,
1312 make_xmm( 0 ),
1313 make_xmm( 1 ),
1314 cc );
1315 sse_andps(
1316 func,
1317 make_xmm( 0 ),
1318 get_temp(
1319 TEMP_ONE_I,
1320 TEMP_ONE_C ) );
1321 STORE( func, *inst, 0, 0, chan_index );
1322 }
1323 }
1324
1325 static void
1326 emit_cmp(
1327 struct x86_function *func,
1328 struct tgsi_full_instruction *inst )
1329 {
1330 unsigned chan_index;
1331
1332 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1333 FETCH( func, *inst, 0, 0, chan_index );
1334 FETCH( func, *inst, 1, 1, chan_index );
1335 FETCH( func, *inst, 2, 2, chan_index );
1336 sse_cmpps(
1337 func,
1338 make_xmm( 0 ),
1339 get_temp(
1340 TGSI_EXEC_TEMP_00000000_I,
1341 TGSI_EXEC_TEMP_00000000_C ),
1342 cc_LessThan );
1343 sse_andps(
1344 func,
1345 make_xmm( 1 ),
1346 make_xmm( 0 ) );
1347 sse_andnps(
1348 func,
1349 make_xmm( 0 ),
1350 make_xmm( 2 ) );
1351 sse_orps(
1352 func,
1353 make_xmm( 0 ),
1354 make_xmm( 1 ) );
1355 STORE( func, *inst, 0, 0, chan_index );
1356 }
1357 }
1358
1359 static int
1360 emit_instruction(
1361 struct x86_function *func,
1362 struct tgsi_full_instruction *inst )
1363 {
1364 unsigned chan_index;
1365
1366 switch (inst->Instruction.Opcode) {
1367 case TGSI_OPCODE_ARL:
1368 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1369 FETCH( func, *inst, 0, 0, chan_index );
1370 emit_f2it( func, 0 );
1371 STORE( func, *inst, 0, 0, chan_index );
1372 }
1373 break;
1374
1375 case TGSI_OPCODE_MOV:
1376 case TGSI_OPCODE_SWZ:
1377 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1378 FETCH( func, *inst, 0, 0, chan_index );
1379 STORE( func, *inst, 0, 0, chan_index );
1380 }
1381 break;
1382
1383 case TGSI_OPCODE_LIT:
1384 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1385 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1386 emit_tempf(
1387 func,
1388 0,
1389 TEMP_ONE_I,
1390 TEMP_ONE_C);
1391 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
1392 STORE( func, *inst, 0, 0, CHAN_X );
1393 }
1394 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1395 STORE( func, *inst, 0, 0, CHAN_W );
1396 }
1397 }
1398 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1399 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1400 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
1401 FETCH( func, *inst, 0, 0, CHAN_X );
1402 sse_maxps(
1403 func,
1404 make_xmm( 0 ),
1405 get_temp(
1406 TGSI_EXEC_TEMP_00000000_I,
1407 TGSI_EXEC_TEMP_00000000_C ) );
1408 STORE( func, *inst, 0, 0, CHAN_Y );
1409 }
1410 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1411 /* XMM[1] = SrcReg[0].yyyy */
1412 FETCH( func, *inst, 1, 0, CHAN_Y );
1413 /* XMM[1] = max(XMM[1], 0) */
1414 sse_maxps(
1415 func,
1416 make_xmm( 1 ),
1417 get_temp(
1418 TGSI_EXEC_TEMP_00000000_I,
1419 TGSI_EXEC_TEMP_00000000_C ) );
1420 /* XMM[2] = SrcReg[0].wwww */
1421 FETCH( func, *inst, 2, 0, CHAN_W );
1422 /* XMM[2] = min(XMM[2], 128.0) */
1423 sse_minps(
1424 func,
1425 make_xmm( 2 ),
1426 get_temp(
1427 TGSI_EXEC_TEMP_128_I,
1428 TGSI_EXEC_TEMP_128_C ) );
1429 /* XMM[2] = max(XMM[2], -128.0) */
1430 sse_maxps(
1431 func,
1432 make_xmm( 2 ),
1433 get_temp(
1434 TGSI_EXEC_TEMP_MINUS_128_I,
1435 TGSI_EXEC_TEMP_MINUS_128_C ) );
1436 emit_pow( func, 3, 1, 2 );
1437 FETCH( func, *inst, 0, 0, CHAN_X );
1438 sse_xorps(
1439 func,
1440 make_xmm( 2 ),
1441 make_xmm( 2 ) );
1442 sse_cmpps(
1443 func,
1444 make_xmm( 2 ),
1445 make_xmm( 0 ),
1446 cc_LessThanEqual );
1447 sse_andps(
1448 func,
1449 make_xmm( 2 ),
1450 make_xmm( 1 ) );
1451 STORE( func, *inst, 2, 0, CHAN_Z );
1452 }
1453 }
1454 break;
1455
1456 case TGSI_OPCODE_RCP:
1457 /* TGSI_OPCODE_RECIP */
1458 FETCH( func, *inst, 0, 0, CHAN_X );
1459 emit_rcp( func, 0, 0 );
1460 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1461 STORE( func, *inst, 0, 0, chan_index );
1462 }
1463 break;
1464
1465 case TGSI_OPCODE_RSQ:
1466 /* TGSI_OPCODE_RECIPSQRT */
1467 FETCH( func, *inst, 0, 0, CHAN_X );
1468 emit_rsqrt( func, 1, 0 );
1469 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1470 STORE( func, *inst, 1, 0, chan_index );
1471 }
1472 break;
1473
1474 case TGSI_OPCODE_EXP:
1475 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1476 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1477 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1478 FETCH( func, *inst, 0, 0, CHAN_X );
1479 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1480 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1481 emit_MOV( func, 1, 0 );
1482 emit_flr( func, 2, 1 );
1483 /* dst.x = ex2(floor(src.x)) */
1484 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
1485 emit_MOV( func, 2, 1 );
1486 emit_ex2( func, 3, 2 );
1487 STORE( func, *inst, 2, 0, CHAN_X );
1488 }
1489 /* dst.y = src.x - floor(src.x) */
1490 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1491 emit_MOV( func, 2, 0 );
1492 emit_sub( func, 2, 1 );
1493 STORE( func, *inst, 2, 0, CHAN_Y );
1494 }
1495 }
1496 /* dst.z = ex2(src.x) */
1497 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1498 emit_ex2( func, 3, 0 );
1499 STORE( func, *inst, 0, 0, CHAN_Z );
1500 }
1501 }
1502 /* dst.w = 1.0 */
1503 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
1504 emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
1505 STORE( func, *inst, 0, 0, CHAN_W );
1506 }
1507 break;
1508
1509 case TGSI_OPCODE_LOG:
1510 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1511 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1512 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1513 FETCH( func, *inst, 0, 0, CHAN_X );
1514 emit_abs( func, 0 );
1515 emit_MOV( func, 1, 0 );
1516 emit_lg2( func, 2, 1 );
1517 /* dst.z = lg2(abs(src.x)) */
1518 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1519 STORE( func, *inst, 1, 0, CHAN_Z );
1520 }
1521 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1522 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1523 emit_flr( func, 2, 1 );
1524 /* dst.x = floor(lg2(abs(src.x))) */
1525 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
1526 STORE( func, *inst, 1, 0, CHAN_X );
1527 }
1528 /* dst.y = abs(src.x)/ex2(floor(lg2(abs(src.x)))) */
1529 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1530 emit_ex2( func, 2, 1 );
1531 emit_rcp( func, 1, 1 );
1532 emit_mul( func, 0, 1 );
1533 STORE( func, *inst, 0, 0, CHAN_Y );
1534 }
1535 }
1536 }
1537 /* dst.w = 1.0 */
1538 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
1539 emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
1540 STORE( func, *inst, 0, 0, CHAN_W );
1541 }
1542 break;
1543
1544 case TGSI_OPCODE_MUL:
1545 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1546 FETCH( func, *inst, 0, 0, chan_index );
1547 FETCH( func, *inst, 1, 1, chan_index );
1548 emit_mul( func, 0, 1 );
1549 STORE( func, *inst, 0, 0, chan_index );
1550 }
1551 break;
1552
1553 case TGSI_OPCODE_ADD:
1554 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1555 FETCH( func, *inst, 0, 0, chan_index );
1556 FETCH( func, *inst, 1, 1, chan_index );
1557 emit_add( func, 0, 1 );
1558 STORE( func, *inst, 0, 0, chan_index );
1559 }
1560 break;
1561
1562 case TGSI_OPCODE_DP3:
1563 /* TGSI_OPCODE_DOT3 */
1564 FETCH( func, *inst, 0, 0, CHAN_X );
1565 FETCH( func, *inst, 1, 1, CHAN_X );
1566 emit_mul( func, 0, 1 );
1567 FETCH( func, *inst, 1, 0, CHAN_Y );
1568 FETCH( func, *inst, 2, 1, CHAN_Y );
1569 emit_mul( func, 1, 2 );
1570 emit_add( func, 0, 1 );
1571 FETCH( func, *inst, 1, 0, CHAN_Z );
1572 FETCH( func, *inst, 2, 1, CHAN_Z );
1573 emit_mul( func, 1, 2 );
1574 emit_add( func, 0, 1 );
1575 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1576 STORE( func, *inst, 0, 0, chan_index );
1577 }
1578 break;
1579
1580 case TGSI_OPCODE_DP4:
1581 /* TGSI_OPCODE_DOT4 */
1582 FETCH( func, *inst, 0, 0, CHAN_X );
1583 FETCH( func, *inst, 1, 1, CHAN_X );
1584 emit_mul( func, 0, 1 );
1585 FETCH( func, *inst, 1, 0, CHAN_Y );
1586 FETCH( func, *inst, 2, 1, CHAN_Y );
1587 emit_mul( func, 1, 2 );
1588 emit_add( func, 0, 1 );
1589 FETCH( func, *inst, 1, 0, CHAN_Z );
1590 FETCH( func, *inst, 2, 1, CHAN_Z );
1591 emit_mul(func, 1, 2 );
1592 emit_add(func, 0, 1 );
1593 FETCH( func, *inst, 1, 0, CHAN_W );
1594 FETCH( func, *inst, 2, 1, CHAN_W );
1595 emit_mul( func, 1, 2 );
1596 emit_add( func, 0, 1 );
1597 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1598 STORE( func, *inst, 0, 0, chan_index );
1599 }
1600 break;
1601
1602 case TGSI_OPCODE_DST:
1603 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
1604 emit_tempf(
1605 func,
1606 0,
1607 TEMP_ONE_I,
1608 TEMP_ONE_C );
1609 STORE( func, *inst, 0, 0, CHAN_X );
1610 }
1611 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
1612 FETCH( func, *inst, 0, 0, CHAN_Y );
1613 FETCH( func, *inst, 1, 1, CHAN_Y );
1614 emit_mul( func, 0, 1 );
1615 STORE( func, *inst, 0, 0, CHAN_Y );
1616 }
1617 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
1618 FETCH( func, *inst, 0, 0, CHAN_Z );
1619 STORE( func, *inst, 0, 0, CHAN_Z );
1620 }
1621 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
1622 FETCH( func, *inst, 0, 1, CHAN_W );
1623 STORE( func, *inst, 0, 0, CHAN_W );
1624 }
1625 break;
1626
1627 case TGSI_OPCODE_MIN:
1628 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1629 FETCH( func, *inst, 0, 0, chan_index );
1630 FETCH( func, *inst, 1, 1, chan_index );
1631 sse_minps(
1632 func,
1633 make_xmm( 0 ),
1634 make_xmm( 1 ) );
1635 STORE( func, *inst, 0, 0, chan_index );
1636 }
1637 break;
1638
1639 case TGSI_OPCODE_MAX:
1640 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1641 FETCH( func, *inst, 0, 0, chan_index );
1642 FETCH( func, *inst, 1, 1, chan_index );
1643 sse_maxps(
1644 func,
1645 make_xmm( 0 ),
1646 make_xmm( 1 ) );
1647 STORE( func, *inst, 0, 0, chan_index );
1648 }
1649 break;
1650
1651 case TGSI_OPCODE_SLT:
1652 /* TGSI_OPCODE_SETLT */
1653 emit_setcc( func, inst, cc_LessThan );
1654 break;
1655
1656 case TGSI_OPCODE_SGE:
1657 /* TGSI_OPCODE_SETGE */
1658 emit_setcc( func, inst, cc_NotLessThan );
1659 break;
1660
1661 case TGSI_OPCODE_MAD:
1662 /* TGSI_OPCODE_MADD */
1663 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1664 FETCH( func, *inst, 0, 0, chan_index );
1665 FETCH( func, *inst, 1, 1, chan_index );
1666 FETCH( func, *inst, 2, 2, chan_index );
1667 emit_mul( func, 0, 1 );
1668 emit_add( func, 0, 2 );
1669 STORE( func, *inst, 0, 0, chan_index );
1670 }
1671 break;
1672
1673 case TGSI_OPCODE_SUB:
1674 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1675 FETCH( func, *inst, 0, 0, chan_index );
1676 FETCH( func, *inst, 1, 1, chan_index );
1677 emit_sub( func, 0, 1 );
1678 STORE( func, *inst, 0, 0, chan_index );
1679 }
1680 break;
1681
1682 case TGSI_OPCODE_LERP:
1683 /* TGSI_OPCODE_LRP */
1684 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1685 FETCH( func, *inst, 0, 0, chan_index );
1686 FETCH( func, *inst, 1, 1, chan_index );
1687 FETCH( func, *inst, 2, 2, chan_index );
1688 emit_sub( func, 1, 2 );
1689 emit_mul( func, 0, 1 );
1690 emit_add( func, 0, 2 );
1691 STORE( func, *inst, 0, 0, chan_index );
1692 }
1693 break;
1694
1695 case TGSI_OPCODE_CND:
1696 return 0;
1697 break;
1698
1699 case TGSI_OPCODE_CND0:
1700 return 0;
1701 break;
1702
1703 case TGSI_OPCODE_DOT2ADD:
1704 /* TGSI_OPCODE_DP2A */
1705 return 0;
1706 break;
1707
1708 case TGSI_OPCODE_INDEX:
1709 return 0;
1710 break;
1711
1712 case TGSI_OPCODE_NEGATE:
1713 return 0;
1714 break;
1715
1716 case TGSI_OPCODE_FRAC:
1717 /* TGSI_OPCODE_FRC */
1718 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1719 FETCH( func, *inst, 0, 0, chan_index );
1720 emit_frc( func, 0, 0 );
1721 STORE( func, *inst, 0, 0, chan_index );
1722 }
1723 break;
1724
1725 case TGSI_OPCODE_CLAMP:
1726 return 0;
1727 break;
1728
1729 case TGSI_OPCODE_FLOOR:
1730 /* TGSI_OPCODE_FLR */
1731 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1732 FETCH( func, *inst, 0, 0, chan_index );
1733 emit_flr( func, 0, 0 );
1734 STORE( func, *inst, 0, 0, chan_index );
1735 }
1736 break;
1737
1738 case TGSI_OPCODE_ROUND:
1739 return 0;
1740 break;
1741
1742 case TGSI_OPCODE_EXPBASE2:
1743 /* TGSI_OPCODE_EX2 */
1744 FETCH( func, *inst, 0, 0, CHAN_X );
1745 emit_ex2( func, 0, 0 );
1746 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1747 STORE( func, *inst, 0, 0, chan_index );
1748 }
1749 break;
1750
1751 case TGSI_OPCODE_LOGBASE2:
1752 /* TGSI_OPCODE_LG2 */
1753 FETCH( func, *inst, 0, 0, CHAN_X );
1754 emit_lg2( func, 0, 0 );
1755 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1756 STORE( func, *inst, 0, 0, chan_index );
1757 }
1758 break;
1759
1760 case TGSI_OPCODE_POWER:
1761 /* TGSI_OPCODE_POW */
1762 FETCH( func, *inst, 0, 0, CHAN_X );
1763 FETCH( func, *inst, 1, 1, CHAN_X );
1764 emit_pow( func, 0, 0, 1 );
1765 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1766 STORE( func, *inst, 0, 0, chan_index );
1767 }
1768 break;
1769
1770 case TGSI_OPCODE_CROSSPRODUCT:
1771 /* TGSI_OPCODE_XPD */
1772 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1773 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
1774 FETCH( func, *inst, 1, 1, CHAN_Z );
1775 FETCH( func, *inst, 3, 0, CHAN_Z );
1776 }
1777 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1778 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1779 FETCH( func, *inst, 0, 0, CHAN_Y );
1780 FETCH( func, *inst, 4, 1, CHAN_Y );
1781 }
1782 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
1783 emit_MOV( func, 2, 0 );
1784 emit_mul( func, 2, 1 );
1785 emit_MOV( func, 5, 3 );
1786 emit_mul( func, 5, 4 );
1787 emit_sub( func, 2, 5 );
1788 STORE( func, *inst, 2, 0, CHAN_X );
1789 }
1790 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1791 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1792 FETCH( func, *inst, 2, 1, CHAN_X );
1793 FETCH( func, *inst, 5, 0, CHAN_X );
1794 }
1795 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
1796 emit_mul( func, 3, 2 );
1797 emit_mul( func, 1, 5 );
1798 emit_sub( func, 3, 1 );
1799 STORE( func, *inst, 3, 0, CHAN_Y );
1800 }
1801 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
1802 emit_mul( func, 5, 4 );
1803 emit_mul( func, 0, 2 );
1804 emit_sub( func, 5, 0 );
1805 STORE( func, *inst, 5, 0, CHAN_Z );
1806 }
1807 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
1808 emit_tempf(
1809 func,
1810 0,
1811 TEMP_ONE_I,
1812 TEMP_ONE_C );
1813 STORE( func, *inst, 0, 0, CHAN_W );
1814 }
1815 break;
1816
1817 case TGSI_OPCODE_MULTIPLYMATRIX:
1818 return 0;
1819 break;
1820
1821 case TGSI_OPCODE_ABS:
1822 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1823 FETCH( func, *inst, 0, 0, chan_index );
1824 emit_abs( func, 0) ;
1825
1826 STORE( func, *inst, 0, 0, chan_index );
1827 }
1828 break;
1829
1830 case TGSI_OPCODE_RCC:
1831 return 0;
1832 break;
1833
1834 case TGSI_OPCODE_DPH:
1835 FETCH( func, *inst, 0, 0, CHAN_X );
1836 FETCH( func, *inst, 1, 1, CHAN_X );
1837 emit_mul( func, 0, 1 );
1838 FETCH( func, *inst, 1, 0, CHAN_Y );
1839 FETCH( func, *inst, 2, 1, CHAN_Y );
1840 emit_mul( func, 1, 2 );
1841 emit_add( func, 0, 1 );
1842 FETCH( func, *inst, 1, 0, CHAN_Z );
1843 FETCH( func, *inst, 2, 1, CHAN_Z );
1844 emit_mul( func, 1, 2 );
1845 emit_add( func, 0, 1 );
1846 FETCH( func, *inst, 1, 1, CHAN_W );
1847 emit_add( func, 0, 1 );
1848 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1849 STORE( func, *inst, 0, 0, chan_index );
1850 }
1851 break;
1852
1853 case TGSI_OPCODE_COS:
1854 FETCH( func, *inst, 0, 0, CHAN_X );
1855 emit_cos( func, 0, 0 );
1856 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1857 STORE( func, *inst, 0, 0, chan_index );
1858 }
1859 break;
1860
1861 case TGSI_OPCODE_DDX:
1862 return 0;
1863 break;
1864
1865 case TGSI_OPCODE_DDY:
1866 return 0;
1867 break;
1868
1869 case TGSI_OPCODE_KILP:
1870 /* predicated kill */
1871 emit_kilp( func );
1872 return 0; /* XXX fix me */
1873 break;
1874
1875 case TGSI_OPCODE_KIL:
1876 /* conditional kill */
1877 emit_kil( func, &inst->FullSrcRegisters[0] );
1878 break;
1879
1880 case TGSI_OPCODE_PK2H:
1881 return 0;
1882 break;
1883
1884 case TGSI_OPCODE_PK2US:
1885 return 0;
1886 break;
1887
1888 case TGSI_OPCODE_PK4B:
1889 return 0;
1890 break;
1891
1892 case TGSI_OPCODE_PK4UB:
1893 return 0;
1894 break;
1895
1896 case TGSI_OPCODE_RFL:
1897 return 0;
1898 break;
1899
1900 case TGSI_OPCODE_SEQ:
1901 return 0;
1902 break;
1903
1904 case TGSI_OPCODE_SFL:
1905 return 0;
1906 break;
1907
1908 case TGSI_OPCODE_SGT:
1909 return 0;
1910 break;
1911
1912 case TGSI_OPCODE_SIN:
1913 FETCH( func, *inst, 0, 0, CHAN_X );
1914 emit_sin( func, 0, 0 );
1915 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1916 STORE( func, *inst, 0, 0, chan_index );
1917 }
1918 break;
1919
1920 case TGSI_OPCODE_SLE:
1921 return 0;
1922 break;
1923
1924 case TGSI_OPCODE_SNE:
1925 return 0;
1926 break;
1927
1928 case TGSI_OPCODE_STR:
1929 return 0;
1930 break;
1931
1932 case TGSI_OPCODE_TEX:
1933 if (0) {
1934 /* Disable dummy texture code:
1935 */
1936 emit_tempf(
1937 func,
1938 0,
1939 TEMP_ONE_I,
1940 TEMP_ONE_C );
1941 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1942 STORE( func, *inst, 0, 0, chan_index );
1943 }
1944 }
1945 else {
1946 return 0;
1947 }
1948 break;
1949
1950 case TGSI_OPCODE_TXD:
1951 return 0;
1952 break;
1953
1954 case TGSI_OPCODE_UP2H:
1955 return 0;
1956 break;
1957
1958 case TGSI_OPCODE_UP2US:
1959 return 0;
1960 break;
1961
1962 case TGSI_OPCODE_UP4B:
1963 return 0;
1964 break;
1965
1966 case TGSI_OPCODE_UP4UB:
1967 return 0;
1968 break;
1969
1970 case TGSI_OPCODE_X2D:
1971 return 0;
1972 break;
1973
1974 case TGSI_OPCODE_ARA:
1975 return 0;
1976 break;
1977
1978 case TGSI_OPCODE_ARR:
1979 return 0;
1980 break;
1981
1982 case TGSI_OPCODE_BRA:
1983 return 0;
1984 break;
1985
1986 case TGSI_OPCODE_CAL:
1987 return 0;
1988 break;
1989
1990 case TGSI_OPCODE_RET:
1991 emit_ret( func );
1992 break;
1993
1994 case TGSI_OPCODE_END:
1995 break;
1996
1997 case TGSI_OPCODE_SSG:
1998 return 0;
1999 break;
2000
2001 case TGSI_OPCODE_CMP:
2002 emit_cmp (func, inst);
2003 break;
2004
2005 case TGSI_OPCODE_SCS:
2006 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
2007 FETCH( func, *inst, 0, 0, CHAN_X );
2008 emit_cos( func, 0, 0 );
2009 STORE( func, *inst, 0, 0, CHAN_X );
2010 }
2011 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2012 FETCH( func, *inst, 0, 0, CHAN_X );
2013 emit_sin( func, 0, 0 );
2014 STORE( func, *inst, 0, 0, CHAN_Y );
2015 }
2016 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2017 emit_tempf(
2018 func,
2019 0,
2020 TGSI_EXEC_TEMP_00000000_I,
2021 TGSI_EXEC_TEMP_00000000_C );
2022 STORE( func, *inst, 0, 0, CHAN_Z );
2023 }
2024 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2025 emit_tempf(
2026 func,
2027 0,
2028 TEMP_ONE_I,
2029 TEMP_ONE_C );
2030 STORE( func, *inst, 0, 0, CHAN_W );
2031 }
2032 break;
2033
2034 case TGSI_OPCODE_TXB:
2035 return 0;
2036 break;
2037
2038 case TGSI_OPCODE_NRM:
2039 return 0;
2040 break;
2041
2042 case TGSI_OPCODE_DIV:
2043 return 0;
2044 break;
2045
2046 case TGSI_OPCODE_DP2:
2047 return 0;
2048 break;
2049
2050 case TGSI_OPCODE_TXL:
2051 return 0;
2052 break;
2053
2054 case TGSI_OPCODE_BRK:
2055 return 0;
2056 break;
2057
2058 case TGSI_OPCODE_IF:
2059 return 0;
2060 break;
2061
2062 case TGSI_OPCODE_LOOP:
2063 return 0;
2064 break;
2065
2066 case TGSI_OPCODE_REP:
2067 return 0;
2068 break;
2069
2070 case TGSI_OPCODE_ELSE:
2071 return 0;
2072 break;
2073
2074 case TGSI_OPCODE_ENDIF:
2075 return 0;
2076 break;
2077
2078 case TGSI_OPCODE_ENDLOOP:
2079 return 0;
2080 break;
2081
2082 case TGSI_OPCODE_ENDREP:
2083 return 0;
2084 break;
2085
2086 case TGSI_OPCODE_PUSHA:
2087 return 0;
2088 break;
2089
2090 case TGSI_OPCODE_POPA:
2091 return 0;
2092 break;
2093
2094 case TGSI_OPCODE_CEIL:
2095 return 0;
2096 break;
2097
2098 case TGSI_OPCODE_I2F:
2099 return 0;
2100 break;
2101
2102 case TGSI_OPCODE_NOT:
2103 return 0;
2104 break;
2105
2106 case TGSI_OPCODE_TRUNC:
2107 return 0;
2108 break;
2109
2110 case TGSI_OPCODE_SHL:
2111 return 0;
2112 break;
2113
2114 case TGSI_OPCODE_SHR:
2115 return 0;
2116 break;
2117
2118 case TGSI_OPCODE_AND:
2119 return 0;
2120 break;
2121
2122 case TGSI_OPCODE_OR:
2123 return 0;
2124 break;
2125
2126 case TGSI_OPCODE_MOD:
2127 return 0;
2128 break;
2129
2130 case TGSI_OPCODE_XOR:
2131 return 0;
2132 break;
2133
2134 case TGSI_OPCODE_SAD:
2135 return 0;
2136 break;
2137
2138 case TGSI_OPCODE_TXF:
2139 return 0;
2140 break;
2141
2142 case TGSI_OPCODE_TXQ:
2143 return 0;
2144 break;
2145
2146 case TGSI_OPCODE_CONT:
2147 return 0;
2148 break;
2149
2150 case TGSI_OPCODE_EMIT:
2151 return 0;
2152 break;
2153
2154 case TGSI_OPCODE_ENDPRIM:
2155 return 0;
2156 break;
2157
2158 default:
2159 return 0;
2160 }
2161
2162 return 1;
2163 }
2164
2165 static void
2166 emit_declaration(
2167 struct x86_function *func,
2168 struct tgsi_full_declaration *decl )
2169 {
2170 if( decl->Declaration.File == TGSI_FILE_INPUT ) {
2171 unsigned first, last, mask;
2172 unsigned i, j;
2173
2174 first = decl->DeclarationRange.First;
2175 last = decl->DeclarationRange.Last;
2176 mask = decl->Declaration.UsageMask;
2177
2178 for( i = first; i <= last; i++ ) {
2179 for( j = 0; j < NUM_CHANNELS; j++ ) {
2180 if( mask & (1 << j) ) {
2181 switch( decl->Declaration.Interpolate ) {
2182 case TGSI_INTERPOLATE_CONSTANT:
2183 emit_coef_a0( func, 0, i, j );
2184 emit_inputs( func, 0, i, j );
2185 break;
2186
2187 case TGSI_INTERPOLATE_LINEAR:
2188 emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
2189 emit_coef_dadx( func, 1, i, j );
2190 emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
2191 emit_coef_dady( func, 3, i, j );
2192 emit_mul( func, 0, 1 ); /* x * dadx */
2193 emit_coef_a0( func, 4, i, j );
2194 emit_mul( func, 2, 3 ); /* y * dady */
2195 emit_add( func, 0, 4 ); /* x * dadx + a0 */
2196 emit_add( func, 0, 2 ); /* x * dadx + y * dady + a0 */
2197 emit_inputs( func, 0, i, j );
2198 break;
2199
2200 case TGSI_INTERPOLATE_PERSPECTIVE:
2201 emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
2202 emit_coef_dadx( func, 1, i, j );
2203 emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
2204 emit_coef_dady( func, 3, i, j );
2205 emit_mul( func, 0, 1 ); /* x * dadx */
2206 emit_tempf( func, 4, 0, TGSI_SWIZZLE_W );
2207 emit_coef_a0( func, 5, i, j );
2208 emit_rcp( func, 4, 4 ); /* 1.0 / w */
2209 emit_mul( func, 2, 3 ); /* y * dady */
2210 emit_add( func, 0, 5 ); /* x * dadx + a0 */
2211 emit_add( func, 0, 2 ); /* x * dadx + y * dady + a0 */
2212 emit_mul( func, 0, 4 ); /* (x * dadx + y * dady + a0) / w */
2213 emit_inputs( func, 0, i, j );
2214 break;
2215
2216 default:
2217 assert( 0 );
2218 break;
2219 }
2220 }
2221 }
2222 }
2223 }
2224 }
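
/* In other words, for LINEAR interpolation each enabled input channel is
 * computed as
 *
 *    input = a0 + dadx * x + dady * y
 *
 * while PERSPECTIVE additionally multiplies that result by 1/w, the
 * reciprocal of the W value fetched alongside x and y from temp 0.
 */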
2225
2226 static void aos_to_soa( struct x86_function *func,
2227 uint arg_aos,
2228 uint arg_soa,
2229 uint arg_num,
2230 uint arg_stride )
2231 {
2232 struct x86_reg soa_input = x86_make_reg( file_REG32, reg_AX );
2233 struct x86_reg aos_input = x86_make_reg( file_REG32, reg_BX );
2234 struct x86_reg num_inputs = x86_make_reg( file_REG32, reg_CX );
2235 struct x86_reg stride = x86_make_reg( file_REG32, reg_DX );
2236 int inner_loop;
2237
2238
2239 /* Save EBX */
2240 x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
2241
2242 x86_mov( func, aos_input, x86_fn_arg( func, arg_aos ) );
2243 x86_mov( func, soa_input, x86_fn_arg( func, arg_soa ) );
2244 x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) );
2245 x86_mov( func, stride, x86_fn_arg( func, arg_stride ) );
2246
2247 /* do */
2248 inner_loop = x86_get_label( func );
2249 {
2250 x86_push( func, aos_input );
2251 sse_movlps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
2252 sse_movlps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
2253 x86_add( func, aos_input, stride );
2254 sse_movhps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
2255 sse_movhps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
2256 x86_add( func, aos_input, stride );
2257 sse_movlps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
2258 sse_movlps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
2259 x86_add( func, aos_input, stride );
2260 sse_movhps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
2261 sse_movhps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
2262 x86_pop( func, aos_input );
2263
2264 sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
2265 sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
2266 sse_shufps( func, make_xmm( 0 ), make_xmm( 1 ), 0x88 );
2267 sse_shufps( func, make_xmm( 2 ), make_xmm( 1 ), 0xdd );
2268 sse_shufps( func, make_xmm( 3 ), make_xmm( 4 ), 0x88 );
2269 sse_shufps( func, make_xmm( 5 ), make_xmm( 4 ), 0xdd );
2270
2271 sse_movups( func, x86_make_disp( soa_input, 0 ), make_xmm( 0 ) );
2272 sse_movups( func, x86_make_disp( soa_input, 16 ), make_xmm( 2 ) );
2273 sse_movups( func, x86_make_disp( soa_input, 32 ), make_xmm( 3 ) );
2274 sse_movups( func, x86_make_disp( soa_input, 48 ), make_xmm( 5 ) );
2275
2276 /* Advance to next input */
2277 x86_lea( func, aos_input, x86_make_disp(aos_input, 16) );
2278 x86_lea( func, soa_input, x86_make_disp(soa_input, 64) );
2279 }
2280 /* while --num_inputs */
2281 x86_dec( func, num_inputs );
2282 x86_jcc( func, cc_NE, inner_loop );
2283
2284 /* Restore EBX */
2285 x86_pop( func, aos_input );
2286 }
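
/* The shuffle sequence above is a 4x4 transpose.  For each input attribute,
 * given four vertices v0..v3 laid out AoS as (x,y,z,w), the stores produce
 * the SoA layout expected by the shader machine (byte offsets):
 *
 *    soa[ 0..15] = x0 x1 x2 x3
 *    soa[16..31] = y0 y1 y2 y3
 *    soa[32..47] = z0 z1 z2 z3
 *    soa[48..63] = w0 w1 w2 w3
 */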
2287
2288 static void soa_to_aos( struct x86_function *func, uint aos, uint soa, uint num, uint stride )
2289 {
2290 struct x86_reg soa_output;
2291 struct x86_reg aos_output;
2292 struct x86_reg num_outputs;
2293 struct x86_reg temp;
2294 int inner_loop;
2295
2296 soa_output = x86_make_reg( file_REG32, reg_AX );
2297 aos_output = x86_make_reg( file_REG32, reg_BX );
2298 num_outputs = x86_make_reg( file_REG32, reg_CX );
2299 temp = x86_make_reg( file_REG32, reg_DX );
2300
2301 /* Save EBX */
2302 x86_push( func, aos_output );
2303
2304 x86_mov( func, soa_output, x86_fn_arg( func, soa ) );
2305 x86_mov( func, aos_output, x86_fn_arg( func, aos ) );
2306 x86_mov( func, num_outputs, x86_fn_arg( func, num ) );
2307
2308 /* do */
2309 inner_loop = x86_get_label( func );
2310 {
2311 sse_movups( func, make_xmm( 0 ), x86_make_disp( soa_output, 0 ) );
2312 sse_movups( func, make_xmm( 1 ), x86_make_disp( soa_output, 16 ) );
2313 sse_movups( func, make_xmm( 3 ), x86_make_disp( soa_output, 32 ) );
2314 sse_movups( func, make_xmm( 4 ), x86_make_disp( soa_output, 48 ) );
2315
2316 sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
2317 sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
2318 sse_unpcklps( func, make_xmm( 0 ), make_xmm( 1 ) );
2319 sse_unpckhps( func, make_xmm( 2 ), make_xmm( 1 ) );
2320 sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) );
2321 sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) );
2322
2323 x86_mov( func, temp, x86_fn_arg( func, stride ) );
2324 x86_push( func, aos_output );
2325 sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
2326 sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
2327 x86_add( func, aos_output, temp );
2328 sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
2329 sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
2330 x86_add( func, aos_output, temp );
2331 sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
2332 sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
2333 x86_add( func, aos_output, temp );
2334 sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
2335 sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
2336 x86_pop( func, aos_output );
2337
2338 /* Advance to next output */
2339 x86_lea( func, aos_output, x86_make_disp(aos_output, 16) );
2340 x86_lea( func, soa_output, x86_make_disp(soa_output, 64) );
2341 }
2342 /* while --num_outputs */
2343 x86_dec( func, num_outputs );
2344 x86_jcc( func, cc_NE, inner_loop );
2345
2346 /* Restore EBX */
2347 x86_pop( func, aos_output );
2348 }
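
/* This is the inverse of aos_to_soa(): the unpack sequence transposes the
 * xxxx/yyyy/zzzz/wwww vectors back into per-vertex (x,y,z,w) form, and the
 * movlps/movhps stores write each vertex out at the caller-supplied stride.
 */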
2349
2350 /**
2351 * Translate a TGSI vertex/fragment shader to SSE2 code.
2352 * Slightly different things are done for vertex vs. fragment shaders.
2353 *
2354 * Note that fragment shaders are responsible for interpolating shader
2355 * inputs. Because on x86 only 4 GP registers are available here and we
2356 * have 5 shader arguments (input, output, const, temp and coef), the
2357 * code is split into two phases -- a DECLARATION and an INSTRUCTION phase.
2358 * The GP register holding the output argument is aliased with the coef
2359 * argument, as outputs are not needed in the DECLARATION phase.
2360 *
2361 * \param tokens the TGSI input shader
2362 * \param func the output SSE code/function
2363 * \param immediates buffer to place immediates, later passed to SSE func
2364 * \return 1 for success, 0 if translation failed
2365 */
2366 unsigned
2367 tgsi_emit_sse2(
2368 const struct tgsi_token *tokens,
2369 struct x86_function *func,
2370 float (*immediates)[4],
2371 boolean do_swizzles )
2372 {
2373 struct tgsi_parse_context parse;
2374 boolean instruction_phase = FALSE;
2375 unsigned ok = 1;
2376 uint num_immediates = 0;
2377
2378 util_init_math();
2379
2380 func->csr = func->store;
2381
2382 tgsi_parse_init( &parse, tokens );
2383
2384 /* Can't just use EDI, EBX without save/restoring them:
2385 */
2386 x86_push(
2387 func,
2388 get_immediate_base() );
2389
2390 x86_push(
2391 func,
2392 get_temp_base() );
2393
2394
2395 /*
2396 * Different function args for vertex/fragment shaders:
2397 */
2398 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
2399 /* DECLARATION phase, do not load output argument. */
2400 x86_mov(
2401 func,
2402 get_input_base(),
2403 x86_fn_arg( func, 1 ) );
2404 /* skipping outputs argument here */
2405 x86_mov(
2406 func,
2407 get_const_base(),
2408 x86_fn_arg( func, 3 ) );
2409 x86_mov(
2410 func,
2411 get_temp_base(),
2412 x86_fn_arg( func, 4 ) );
2413 x86_mov(
2414 func,
2415 get_coef_base(),
2416 x86_fn_arg( func, 5 ) );
2417 x86_mov(
2418 func,
2419 get_immediate_base(),
2420 x86_fn_arg( func, 6 ) );
2421 }
2422 else {
2423 assert(parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX);
2424
2425 if (do_swizzles)
2426 aos_to_soa( func,
2427 6, /* aos_input */
2428 1, /* machine->input */
2429 7, /* num_inputs */
2430 8 ); /* input_stride */
2431
2432 x86_mov(
2433 func,
2434 get_input_base(),
2435 x86_fn_arg( func, 1 ) );
2436 x86_mov(
2437 func,
2438 get_output_base(),
2439 x86_fn_arg( func, 2 ) );
2440 x86_mov(
2441 func,
2442 get_const_base(),
2443 x86_fn_arg( func, 3 ) );
2444 x86_mov(
2445 func,
2446 get_temp_base(),
2447 x86_fn_arg( func, 4 ) );
2448 x86_mov(
2449 func,
2450 get_immediate_base(),
2451 x86_fn_arg( func, 5 ) );
2452 }
2453
2454 while( !tgsi_parse_end_of_tokens( &parse ) && ok ) {
2455 tgsi_parse_token( &parse );
2456
2457 switch( parse.FullToken.Token.Type ) {
2458 case TGSI_TOKEN_TYPE_DECLARATION:
2459 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
2460 emit_declaration(
2461 func,
2462 &parse.FullToken.FullDeclaration );
2463 }
2464 break;
2465
2466 case TGSI_TOKEN_TYPE_INSTRUCTION:
2467 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
2468 if( !instruction_phase ) {
2469 /* INSTRUCTION phase, overwrite coeff with output. */
2470 instruction_phase = TRUE;
2471 x86_mov(
2472 func,
2473 get_output_base(),
2474 x86_fn_arg( func, 2 ) );
2475 }
2476 }
2477
2478 ok = emit_instruction(
2479 func,
2480 &parse.FullToken.FullInstruction );
2481
2482 if (!ok) {
2483 debug_printf("failed to translate tgsi opcode %d to SSE (%s)\n",
2484 parse.FullToken.FullInstruction.Instruction.Opcode,
2485 parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX ?
2486 "vertex shader" : "fragment shader");
2487 }
2488 break;
2489
2490 case TGSI_TOKEN_TYPE_IMMEDIATE:
2491 /* simply copy the immediate values into the next immediates[] slot */
2492 {
2493 const uint size = parse.FullToken.FullImmediate.Immediate.Size - 1;
2494 uint i;
2495 assert(size <= 4);
2496 assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES);
2497 for( i = 0; i < size; i++ ) {
2498 immediates[num_immediates][i] =
2499 parse.FullToken.FullImmediate.u.ImmediateFloat32[i].Float;
2500 }
2501 #if 0
2502 debug_printf("SSE FS immediate[%d] = %f %f %f %f\n",
2503 num_immediates,
2504 immediates[num_immediates][0],
2505 immediates[num_immediates][1],
2506 immediates[num_immediates][2],
2507 immediates[num_immediates][3]);
2508 #endif
2509 num_immediates++;
2510 }
2511 break;
2512
2513 default:
2514 ok = 0;
2515 assert( 0 );
2516 }
2517 }
2518
2519 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
2520 if (do_swizzles)
2521 soa_to_aos( func, 9, 2, 10, 11 );
2522 }
2523
2524 /* Can't just use EBX, EDI without save/restoring them:
2525 */
2526 x86_pop(
2527 func,
2528 get_temp_base() );
2529
2530 x86_pop(
2531 func,
2532 get_immediate_base() );
2533
2534 emit_ret( func );
2535
2536 tgsi_parse_free( &parse );
2537
2538 return ok;
2539 }
2540
2541 #endif /* PIPE_ARCH_X86 && PIPE_ARCH_SSE */
2542