[mesa.git] src/gallium/auxiliary/tgsi/tgsi_sse2.c
(commit: cell: Moved X86 checks to wrap #include section so that Cell targets will compile...)
1 /**************************************************************************
2 *
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28 #ifdef PIPE_ARCH_X86
29
30 #include "pipe/p_debug.h"
31 #include "pipe/p_shader_tokens.h"
32 #include "util/u_math.h"
33 #include "util/u_sse.h"
34 #include "tgsi/tgsi_parse.h"
35 #include "tgsi/tgsi_util.h"
36 #include "tgsi_exec.h"
37 #include "tgsi_sse2.h"
38
39 #include "rtasm/rtasm_x86sse.h"
40
   41 /* Use a higher-precision 1/sqrt() (see emit_rsqrt()).
   42  *
   43  * This costs about 100 fps (close to 10%) in gears:
44 */
45 #define HIGH_PRECISION 1
46
47 #define FAST_MATH 1
48
49
50 #define FOR_EACH_CHANNEL( CHAN )\
51 for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)
52
53 #define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
54 ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))
55
56 #define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
57 if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))
58
59 #define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
60 FOR_EACH_CHANNEL( CHAN )\
61 IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )
62
63 #define CHAN_X 0
64 #define CHAN_Y 1
65 #define CHAN_Z 2
66 #define CHAN_W 3
67
68 #define TEMP_ONE_I TGSI_EXEC_TEMP_ONE_I
69 #define TEMP_ONE_C TGSI_EXEC_TEMP_ONE_C
70
71 #define TEMP_R0 TGSI_EXEC_TEMP_R0
72 #define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
73
74 /**
75 * X86 utility functions.
76 */
77
78 static struct x86_reg
79 make_xmm(
80 unsigned xmm )
81 {
82 return x86_make_reg(
83 file_XMM,
84 (enum x86_reg_name) xmm );
85 }
86
87 /**
88 * X86 register mapping helpers.
89 */
90
91 static struct x86_reg
92 get_const_base( void )
93 {
94 return x86_make_reg(
95 file_REG32,
96 reg_CX );
97 }
98
99 static struct x86_reg
100 get_input_base( void )
101 {
102 return x86_make_reg(
103 file_REG32,
104 reg_AX );
105 }
106
107 static struct x86_reg
108 get_output_base( void )
109 {
110 return x86_make_reg(
111 file_REG32,
112 reg_DX );
113 }
114
115 static struct x86_reg
116 get_temp_base( void )
117 {
118 return x86_make_reg(
119 file_REG32,
120 reg_BX );
121 }
122
123 static struct x86_reg
124 get_coef_base( void )
125 {
126 return get_output_base();
127 }
128
129 static struct x86_reg
130 get_immediate_base( void )
131 {
132 return x86_make_reg(
133 file_REG32,
134 reg_DI );
135 }
136
137
138 /**
139 * Data access helpers.
140 */
141
142
143 static struct x86_reg
144 get_immediate(
145 unsigned vec,
146 unsigned chan )
147 {
148 return x86_make_disp(
149 get_immediate_base(),
150 (vec * 4 + chan) * 4 );
151 }
152
153 static struct x86_reg
154 get_const(
155 unsigned vec,
156 unsigned chan )
157 {
158 return x86_make_disp(
159 get_const_base(),
160 (vec * 4 + chan) * 4 );
161 }
162
163 static struct x86_reg
164 get_input(
165 unsigned vec,
166 unsigned chan )
167 {
168 return x86_make_disp(
169 get_input_base(),
170 (vec * 4 + chan) * 16 );
171 }
172
173 static struct x86_reg
174 get_output(
175 unsigned vec,
176 unsigned chan )
177 {
178 return x86_make_disp(
179 get_output_base(),
180 (vec * 4 + chan) * 16 );
181 }
182
183 static struct x86_reg
184 get_temp(
185 unsigned vec,
186 unsigned chan )
187 {
188 return x86_make_disp(
189 get_temp_base(),
190 (vec * 4 + chan) * 16 );
191 }
192
193 static struct x86_reg
194 get_coef(
195 unsigned vec,
196 unsigned chan,
197 unsigned member )
198 {
199 return x86_make_disp(
200 get_coef_base(),
201 ((vec * 3 + member) * 4 + chan) * 4 );
202 }
203
204
205 static void
206 emit_ret(
207 struct x86_function *func )
208 {
209 x86_ret( func );
210 }
211
212
213 /**
214 * Data fetch helpers.
215 */
216
217 /**
218 * Copy a shader constant to xmm register
219 * \param xmm the destination xmm register
220 * \param vec the src const buffer index
221 * \param chan src channel to fetch (X, Y, Z or W)
222 */
223 static void
224 emit_const(
225 struct x86_function *func,
226 uint xmm,
227 int vec,
228 uint chan,
229 uint indirect,
230 uint indirectFile,
231 int indirectIndex )
232 {
233 if (indirect) {
234 struct x86_reg r0 = get_input_base();
235 struct x86_reg r1 = get_output_base();
236 uint i;
237
238 assert( indirectFile == TGSI_FILE_ADDRESS );
239 assert( indirectIndex == 0 );
240
241 x86_push( func, r0 );
242 x86_push( func, r1 );
243
244 for (i = 0; i < QUAD_SIZE; i++) {
245 x86_lea( func, r0, get_const( vec, chan ) );
246 x86_mov( func, r1, x86_make_disp( get_temp( TEMP_ADDR, CHAN_X ), i * 4 ) );
247
248 /* Quick hack to multiply by 16 -- need to add SHL to rtasm.
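          * The value fetched from TEMP_ADDR is an element index; constants
          * are vec4s of floats (16 bytes each), so it is scaled by 16 to
          * turn it into a byte offset.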
249 */
250 x86_add( func, r1, r1 );
251 x86_add( func, r1, r1 );
252 x86_add( func, r1, r1 );
253 x86_add( func, r1, r1 );
254
255 x86_add( func, r0, r1 );
256 x86_mov( func, r1, x86_deref( r0 ) );
257 x86_mov( func, x86_make_disp( get_temp( TEMP_R0, CHAN_X ), i * 4 ), r1 );
258 }
259
260 x86_pop( func, r1 );
261 x86_pop( func, r0 );
262
263 sse_movaps(
264 func,
265 make_xmm( xmm ),
266 get_temp( TEMP_R0, CHAN_X ) );
267 }
268 else {
269 assert( vec >= 0 );
270
271 sse_movss(
272 func,
273 make_xmm( xmm ),
274 get_const( vec, chan ) );
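      /* Broadcast the scalar just loaded across all four lanes. */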
275 sse_shufps(
276 func,
277 make_xmm( xmm ),
278 make_xmm( xmm ),
279 SHUF( 0, 0, 0, 0 ) );
280 }
281 }
282
283 static void
284 emit_immediate(
285 struct x86_function *func,
286 unsigned xmm,
287 unsigned vec,
288 unsigned chan )
289 {
290 sse_movss(
291 func,
292 make_xmm( xmm ),
293 get_immediate( vec, chan ) );
294 sse_shufps(
295 func,
296 make_xmm( xmm ),
297 make_xmm( xmm ),
298 SHUF( 0, 0, 0, 0 ) );
299 }
300
301
302 /**
303 * Copy a shader input to xmm register
304 * \param xmm the destination xmm register
305 * \param vec the src input attrib
306 * \param chan src channel to fetch (X, Y, Z or W)
307 */
308 static void
309 emit_inputf(
310 struct x86_function *func,
311 unsigned xmm,
312 unsigned vec,
313 unsigned chan )
314 {
315 sse_movups(
316 func,
317 make_xmm( xmm ),
318 get_input( vec, chan ) );
319 }
320
321 /**
322 * Store an xmm register to a shader output
323 * \param xmm the source xmm register
324 * \param vec the dest output attrib
  326  * \param chan the dest channel to store (X, Y, Z or W)
326 */
327 static void
328 emit_output(
329 struct x86_function *func,
330 unsigned xmm,
331 unsigned vec,
332 unsigned chan )
333 {
334 sse_movups(
335 func,
336 get_output( vec, chan ),
337 make_xmm( xmm ) );
338 }
339
340 /**
341 * Copy a shader temporary to xmm register
342 * \param xmm the destination xmm register
343 * \param vec the src temp register
344 * \param chan src channel to fetch (X, Y, Z or W)
345 */
346 static void
347 emit_tempf(
348 struct x86_function *func,
349 unsigned xmm,
350 unsigned vec,
351 unsigned chan )
352 {
353 sse_movaps(
354 func,
355 make_xmm( xmm ),
356 get_temp( vec, chan ) );
357 }
358
359 /**
360 * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
361 * \param xmm the destination xmm register
362 * \param vec the src input/attribute coefficient index
363 * \param chan src channel to fetch (X, Y, Z or W)
364 * \param member 0=a0, 1=dadx, 2=dady
365 */
366 static void
367 emit_coef(
368 struct x86_function *func,
369 unsigned xmm,
370 unsigned vec,
371 unsigned chan,
372 unsigned member )
373 {
374 sse_movss(
375 func,
376 make_xmm( xmm ),
377 get_coef( vec, chan, member ) );
378 sse_shufps(
379 func,
380 make_xmm( xmm ),
381 make_xmm( xmm ),
382 SHUF( 0, 0, 0, 0 ) );
383 }
384
385 /**
386 * Data store helpers.
387 */
388
389 static void
390 emit_inputs(
391 struct x86_function *func,
392 unsigned xmm,
393 unsigned vec,
394 unsigned chan )
395 {
396 sse_movups(
397 func,
398 get_input( vec, chan ),
399 make_xmm( xmm ) );
400 }
401
402 static void
403 emit_temps(
404 struct x86_function *func,
405 unsigned xmm,
406 unsigned vec,
407 unsigned chan )
408 {
409 sse_movaps(
410 func,
411 get_temp( vec, chan ),
412 make_xmm( xmm ) );
413 }
414
415 static void
416 emit_addrs(
417 struct x86_function *func,
418 unsigned xmm,
419 unsigned vec,
420 unsigned chan )
421 {
422 assert( vec == 0 );
423
424 emit_temps(
425 func,
426 xmm,
427 vec + TGSI_EXEC_TEMP_ADDR,
428 chan );
429 }
430
431 /**
  432  * Coefficient fetch helpers.
433 */
434
435 static void
436 emit_coef_a0(
437 struct x86_function *func,
438 unsigned xmm,
439 unsigned vec,
440 unsigned chan )
441 {
442 emit_coef(
443 func,
444 xmm,
445 vec,
446 chan,
447 0 );
448 }
449
450 static void
451 emit_coef_dadx(
452 struct x86_function *func,
453 unsigned xmm,
454 unsigned vec,
455 unsigned chan )
456 {
457 emit_coef(
458 func,
459 xmm,
460 vec,
461 chan,
462 1 );
463 }
464
465 static void
466 emit_coef_dady(
467 struct x86_function *func,
468 unsigned xmm,
469 unsigned vec,
470 unsigned chan )
471 {
472 emit_coef(
473 func,
474 xmm,
475 vec,
476 chan,
477 2 );
478 }
479
480 /**
481 * Function call helpers.
482 */
483
484 /**
  485  * NOTE: With gcc, if the called function uses SSE intrinsics, it must be
  486  * defined with __attribute__((force_align_arg_pointer)), as we do not
  487  * guarantee that the stack pointer is 16-byte aligned, as expected.
488 */
489 static void
490 emit_func_call_dst(
491 struct x86_function *func,
492 unsigned xmm_save,
493 unsigned xmm_dst,
494 void (PIPE_CDECL *code)() )
495 {
496 struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
  497    unsigned i, n;
498 unsigned xmm_mask;
499
500 /* Bitmask of the xmm registers to save */
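   /* xmm_save is the number of live xmm registers; every register below it
    * except xmm_dst (which is passed to the callee through memory in
    * TEMP_R0) must be preserved across the call.
    */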
501 xmm_mask = (1 << xmm_save) - 1;
502 xmm_mask &= ~(1 << xmm_dst);
503
504 sse_movaps(
505 func,
506 get_temp( TEMP_R0, 0 ),
507 make_xmm( xmm_dst ) );
508
509 x86_push(
510 func,
511 x86_make_reg( file_REG32, reg_AX) );
512 x86_push(
513 func,
514 x86_make_reg( file_REG32, reg_CX) );
515 x86_push(
516 func,
517 x86_make_reg( file_REG32, reg_DX) );
518
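   /* Count how many xmm registers must be preserved, reserve stack space
    * for them, and spill each one before the call.
    */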
519 for(i = 0, n = 0; i < 8; ++i)
520 if(xmm_mask & (1 << i))
521 ++n;
522
523 x86_sub_imm(
524 func,
525 x86_make_reg( file_REG32, reg_SP ),
526 n*16);
527
528 for(i = 0, n = 0; i < 8; ++i)
529 if(xmm_mask & (1 << i)) {
530 sse_movups(
531 func,
532 x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ),
  533             make_xmm( i ) );
534 ++n;
535 }
536
537 x86_lea(
538 func,
539 ecx,
540 get_temp( TEMP_R0, 0 ) );
541
542 x86_push( func, ecx );
543 x86_mov_reg_imm( func, ecx, (unsigned long) code );
544 x86_call( func, ecx );
545 x86_pop(func, ecx );
546
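   /* Reload the saved xmm registers and release the stack space. */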
547 for(i = 0, n = 0; i < 8; ++i)
548 if(xmm_mask & (1 << i)) {
549 sse_movups(
550 func,
  551             make_xmm( i ),
552 x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ) );
553 ++n;
554 }
555
556 x86_add_imm(
557 func,
558 x86_make_reg( file_REG32, reg_SP ),
559 n*16);
560
  561    /* Restore GP registers in reverse order.
562 */
563 x86_pop(
564 func,
565 x86_make_reg( file_REG32, reg_DX) );
566 x86_pop(
567 func,
568 x86_make_reg( file_REG32, reg_CX) );
569 x86_pop(
570 func,
571 x86_make_reg( file_REG32, reg_AX) );
572
573 sse_movaps(
574 func,
575 make_xmm( xmm_dst ),
576 get_temp( TEMP_R0, 0 ) );
577 }
578
579 static void
580 emit_func_call_dst_src(
581 struct x86_function *func,
582 unsigned xmm_save,
583 unsigned xmm_dst,
584 unsigned xmm_src,
585 void (PIPE_CDECL *code)() )
586 {
587 sse_movaps(
588 func,
589 get_temp( TEMP_R0, 1 ),
590 make_xmm( xmm_src ) );
591
592 emit_func_call_dst(
593 func,
594 xmm_save,
595 xmm_dst,
596 code );
597 }
598
599 /*
600 * Fast SSE2 implementation of special math functions.
601 */
602
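/* Horner-style polynomial evaluation on four packed floats:
 * POLYn(x, c0, ..., cn) computes c0 + x*(c1 + x*(c2 + ...)) per lane.
 */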
603 #define POLY0(x, c0) _mm_set1_ps(c0)
604 #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
605 #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
606 #define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
607 #define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
608 #define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
609
610 #define EXP_POLY_DEGREE 3
611 #define LOG_POLY_DEGREE 5
612
613 /**
614 * See http://www.devmaster.net/forums/showthread.php?p=43580
615 */
616 static INLINE __m128
617 exp2f4(__m128 x)
618 {
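   /* Split x into integer and fractional parts, so 2^x = 2^ipart * 2^fpart:
    * 2^ipart is built directly in the float exponent field, and 2^fpart is
    * approximated with a minimax polynomial on [-0.5, 0.5).
    */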
619 __m128i ipart;
620 __m128 fpart, expipart, expfpart;
621
622 x = _mm_min_ps(x, _mm_set1_ps( 129.00000f));
623 x = _mm_max_ps(x, _mm_set1_ps(-126.99999f));
624
625 /* ipart = int(x - 0.5) */
626 ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f)));
627
628 /* fpart = x - ipart */
629 fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart));
630
631 /* expipart = (float) (1 << ipart) */
632 expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23));
633
634 /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
635 #if EXP_POLY_DEGREE == 5
636 expfpart = POLY5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f);
637 #elif EXP_POLY_DEGREE == 4
638 expfpart = POLY4(fpart, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 5.2011464e-2f, 1.3534167e-2f);
639 #elif EXP_POLY_DEGREE == 3
640 expfpart = POLY3(fpart, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 7.8024521e-2f);
641 #elif EXP_POLY_DEGREE == 2
642 expfpart = POLY2(fpart, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f);
643 #else
644 #error
645 #endif
646
647 return _mm_mul_ps(expipart, expfpart);
648 }
649
650 /**
651 * See http://www.devmaster.net/forums/showthread.php?p=43580
652 */
653 static INLINE __m128
654 log2f4(__m128 x)
655 {
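   /* Decompose x = 2^exp * mant with mant in [1, 2):
    * log2(x) = exp + log2(mant), and log2(mant) is approximated below by a
    * minimax polynomial multiplied by (mant - 1) so that log2(1) == 0.
    */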
656 __m128i expmask = _mm_set1_epi32(0x7f800000);
657 __m128i mantmask = _mm_set1_epi32(0x007fffff);
658 __m128 one = _mm_set1_ps(1.0f);
659
660 __m128i i = _mm_castps_si128(x);
661
662 /* exp = (float) exponent(x) */
663 __m128 exp = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, expmask), 23), _mm_set1_epi32(127)));
664
665 /* mant = (float) mantissa(x) */
666 __m128 mant = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, mantmask)), one);
667
668 __m128 logmant;
669
670 /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
  671     * These coefficients can be generated with
672 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
673 */
674 #if LOG_POLY_DEGREE == 6
675 logmant = POLY5(mant, 3.11578814719469302614f, -3.32419399085241980044f, 2.59883907202499966007f, -1.23152682416275988241f, 0.318212422185251071475f, -0.0344359067839062357313f);
676 #elif LOG_POLY_DEGREE == 5
677 logmant = POLY4(mant, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
678 #elif LOG_POLY_DEGREE == 4
679 logmant = POLY3(mant, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
680 #elif LOG_POLY_DEGREE == 3
681 logmant = POLY2(mant, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
682 #else
683 #error
684 #endif
685
  686    /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0 */
687 logmant = _mm_mul_ps(logmant, _mm_sub_ps(mant, one));
688
689 return _mm_add_ps(logmant, exp);
690 }
691
692 static INLINE __m128
693 powf4(__m128 x, __m128 y)
694 {
695 return exp2f4(_mm_mul_ps(log2f4(x), y));
696 }
697
698
699 /**
700 * Low-level instruction translators.
701 */
702
703 static void
704 emit_abs(
705 struct x86_function *func,
706 unsigned xmm )
707 {
708 sse_andps(
709 func,
710 make_xmm( xmm ),
711 get_temp(
712 TGSI_EXEC_TEMP_7FFFFFFF_I,
713 TGSI_EXEC_TEMP_7FFFFFFF_C ) );
714 }
715
716 static void
717 emit_add(
718 struct x86_function *func,
719 unsigned xmm_dst,
720 unsigned xmm_src )
721 {
722 sse_addps(
723 func,
724 make_xmm( xmm_dst ),
725 make_xmm( xmm_src ) );
726 }
727
728 static void PIPE_CDECL
729 cos4f(
730 float *store )
731 {
732 store[0] = cosf( store[0] );
733 store[1] = cosf( store[1] );
734 store[2] = cosf( store[2] );
735 store[3] = cosf( store[3] );
736 }
737
738 static void
739 emit_cos(
740 struct x86_function *func,
741 unsigned xmm_save,
742 unsigned xmm_dst )
743 {
744 emit_func_call_dst(
745 func,
746 xmm_save,
747 xmm_dst,
748 cos4f );
749 }
750
751 static void PIPE_CDECL
752 #if defined(PIPE_CC_GCC)
753 __attribute__((force_align_arg_pointer))
754 #endif
755 ex24f(
756 float *store )
757 {
758 _mm_store_ps(&store[0], exp2f4( _mm_load_ps(&store[0]) ));
759 }
760
761 static void
762 emit_ex2(
763 struct x86_function *func,
764 unsigned xmm_save,
765 unsigned xmm_dst )
766 {
767 emit_func_call_dst(
768 func,
769 xmm_save,
770 xmm_dst,
771 ex24f );
772 }
773
774 static void
775 emit_f2it(
776 struct x86_function *func,
777 unsigned xmm )
778 {
779 sse2_cvttps2dq(
780 func,
781 make_xmm( xmm ),
782 make_xmm( xmm ) );
783 }
784
785 static void PIPE_CDECL
786 flr4f(
787 float *store )
788 {
789 store[0] = floorf( store[0] );
790 store[1] = floorf( store[1] );
791 store[2] = floorf( store[2] );
792 store[3] = floorf( store[3] );
793 }
794
795 static void
796 emit_flr(
797 struct x86_function *func,
798 unsigned xmm_save,
799 unsigned xmm_dst )
800 {
801 emit_func_call_dst(
802 func,
803 xmm_save,
804 xmm_dst,
805 flr4f );
806 }
807
808 static void PIPE_CDECL
809 frc4f(
810 float *store )
811 {
812 store[0] -= floorf( store[0] );
813 store[1] -= floorf( store[1] );
814 store[2] -= floorf( store[2] );
815 store[3] -= floorf( store[3] );
816 }
817
818 static void
819 emit_frc(
820 struct x86_function *func,
821 unsigned xmm_save,
822 unsigned xmm_dst )
823 {
824 emit_func_call_dst(
825 func,
826 xmm_save,
827 xmm_dst,
828 frc4f );
829 }
830
831 static void PIPE_CDECL
832 #if defined(PIPE_CC_GCC)
833 __attribute__((force_align_arg_pointer))
834 #endif
835 lg24f(
836 float *store )
837 {
838 _mm_store_ps(&store[0], log2f4( _mm_load_ps(&store[0]) ));
839 }
840
841 static void
842 emit_lg2(
843 struct x86_function *func,
844 unsigned xmm_save,
845 unsigned xmm_dst )
846 {
847 emit_func_call_dst(
848 func,
849 xmm_save,
850 xmm_dst,
851 lg24f );
852 }
853
854 static void
855 emit_MOV(
856 struct x86_function *func,
857 unsigned xmm_dst,
858 unsigned xmm_src )
859 {
860 sse_movups(
861 func,
862 make_xmm( xmm_dst ),
863 make_xmm( xmm_src ) );
864 }
865
866 static void
867 emit_mul (struct x86_function *func,
868 unsigned xmm_dst,
869 unsigned xmm_src)
870 {
871 sse_mulps(
872 func,
873 make_xmm( xmm_dst ),
874 make_xmm( xmm_src ) );
875 }
876
877 static void
878 emit_neg(
879 struct x86_function *func,
880 unsigned xmm )
881 {
882 sse_xorps(
883 func,
884 make_xmm( xmm ),
885 get_temp(
886 TGSI_EXEC_TEMP_80000000_I,
887 TGSI_EXEC_TEMP_80000000_C ) );
888 }
889
890 static void PIPE_CDECL
891 #if defined(PIPE_CC_GCC)
892 __attribute__((force_align_arg_pointer))
893 #endif
894 pow4f(
895 float *store )
896 {
897 #if 1
898 _mm_store_ps(&store[0], powf4( _mm_load_ps(&store[0]), _mm_load_ps(&store[4]) ));
899 #else
900 store[0] = powf( store[0], store[4] );
901 store[1] = powf( store[1], store[5] );
902 store[2] = powf( store[2], store[6] );
903 store[3] = powf( store[3], store[7] );
904 #endif
905 }
906
907 static void
908 emit_pow(
909 struct x86_function *func,
910 unsigned xmm_save,
911 unsigned xmm_dst,
912 unsigned xmm_src )
913 {
914 emit_func_call_dst_src(
915 func,
916 xmm_save,
917 xmm_dst,
918 xmm_src,
919 pow4f );
920 }
921
922 static void
923 emit_rcp (
924 struct x86_function *func,
925 unsigned xmm_dst,
926 unsigned xmm_src )
927 {
928 /* On Intel CPUs at least, this is only accurate to 12 bits -- not
929 * good enough. Need to either emit a proper divide or use the
930 * iterative technique described below in emit_rsqrt().
931 */
932 sse2_rcpps(
933 func,
934 make_xmm( xmm_dst ),
935 make_xmm( xmm_src ) );
936 }
937
938 static void
939 emit_rsqrt(
940 struct x86_function *func,
941 unsigned xmm_dst,
942 unsigned xmm_src )
943 {
944 #if HIGH_PRECISION
945 /* Although rsqrtps() and rcpps() are low precision on some/all SSE
  946     * implementations, it is possible to improve their precision at
  947     * fairly low cost, using a Newton-Raphson step, as below:
  948     *
  949     * rcp:   x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
  950     * rsqrt: x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a)) * rsqrtps(a)]
951 *
952 * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
953 */
954 {
955 struct x86_reg dst = make_xmm( xmm_dst );
956 struct x86_reg src = make_xmm( xmm_src );
957 struct x86_reg tmp0 = make_xmm( 2 );
958 struct x86_reg tmp1 = make_xmm( 3 );
959
960 assert( xmm_dst != xmm_src );
961 assert( xmm_dst != 2 && xmm_dst != 3 );
962 assert( xmm_src != 2 && xmm_src != 3 );
963
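      /* dst = 0.5, tmp0 = 3.0, tmp1 = rsqrtps(a); the sequence below
       * computes 0.5 * tmp1 * (3.0 - a * tmp1 * tmp1) into dst, using the
       * src register as scratch (it is clobbered).
       */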
964 sse_movaps( func, dst, get_temp( TGSI_EXEC_TEMP_HALF_I, TGSI_EXEC_TEMP_HALF_C ) );
965 sse_movaps( func, tmp0, get_temp( TGSI_EXEC_TEMP_THREE_I, TGSI_EXEC_TEMP_THREE_C ) );
966 sse_rsqrtps( func, tmp1, src );
967 sse_mulps( func, src, tmp1 );
968 sse_mulps( func, dst, tmp1 );
969 sse_mulps( func, src, tmp1 );
970 sse_subps( func, tmp0, src );
971 sse_mulps( func, dst, tmp0 );
972 }
973 #else
974 /* On Intel CPUs at least, this is only accurate to 12 bits -- not
975 * good enough.
976 */
977 sse_rsqrtps(
978 func,
979 make_xmm( xmm_dst ),
980 make_xmm( xmm_src ) );
981 #endif
982 }
983
984 static void
985 emit_setsign(
986 struct x86_function *func,
987 unsigned xmm )
988 {
989 sse_orps(
990 func,
991 make_xmm( xmm ),
992 get_temp(
993 TGSI_EXEC_TEMP_80000000_I,
994 TGSI_EXEC_TEMP_80000000_C ) );
995 }
996
997 static void PIPE_CDECL
998 sin4f(
999 float *store )
1000 {
1001 store[0] = sinf( store[0] );
1002 store[1] = sinf( store[1] );
1003 store[2] = sinf( store[2] );
1004 store[3] = sinf( store[3] );
1005 }
1006
1007 static void
1008 emit_sin (struct x86_function *func,
1009 unsigned xmm_save,
1010 unsigned xmm_dst)
1011 {
1012 emit_func_call_dst(
1013 func,
1014 xmm_save,
1015 xmm_dst,
1016 sin4f );
1017 }
1018
1019 static void
1020 emit_sub(
1021 struct x86_function *func,
1022 unsigned xmm_dst,
1023 unsigned xmm_src )
1024 {
1025 sse_subps(
1026 func,
1027 make_xmm( xmm_dst ),
1028 make_xmm( xmm_src ) );
1029 }
1030
1031 /**
1032 * Register fetch.
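 *
 * Load one channel of a TGSI source operand into the given xmm register,
 * honoring extended swizzles (including the ZERO/ONE selectors) and the
 * sign modes (absolute, set-sign, negate).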
1033 */
1034
1035 static void
1036 emit_fetch(
1037 struct x86_function *func,
1038 unsigned xmm,
1039 const struct tgsi_full_src_register *reg,
1040 const unsigned chan_index )
1041 {
1042 unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index );
1043
1044 switch (swizzle) {
1045 case TGSI_EXTSWIZZLE_X:
1046 case TGSI_EXTSWIZZLE_Y:
1047 case TGSI_EXTSWIZZLE_Z:
1048 case TGSI_EXTSWIZZLE_W:
1049 switch (reg->SrcRegister.File) {
1050 case TGSI_FILE_CONSTANT:
1051 emit_const(
1052 func,
1053 xmm,
1054 reg->SrcRegister.Index,
1055 swizzle,
1056 reg->SrcRegister.Indirect,
1057 reg->SrcRegisterInd.File,
1058 reg->SrcRegisterInd.Index );
1059 break;
1060
1061 case TGSI_FILE_IMMEDIATE:
1062 emit_immediate(
1063 func,
1064 xmm,
1065 reg->SrcRegister.Index,
1066 swizzle );
1067 break;
1068
1069 case TGSI_FILE_INPUT:
1070 emit_inputf(
1071 func,
1072 xmm,
1073 reg->SrcRegister.Index,
1074 swizzle );
1075 break;
1076
1077 case TGSI_FILE_TEMPORARY:
1078 emit_tempf(
1079 func,
1080 xmm,
1081 reg->SrcRegister.Index,
1082 swizzle );
1083 break;
1084
1085 default:
1086 assert( 0 );
1087 }
1088 break;
1089
1090 case TGSI_EXTSWIZZLE_ZERO:
1091 emit_tempf(
1092 func,
1093 xmm,
1094 TGSI_EXEC_TEMP_00000000_I,
1095 TGSI_EXEC_TEMP_00000000_C );
1096 break;
1097
1098 case TGSI_EXTSWIZZLE_ONE:
1099 emit_tempf(
1100 func,
1101 xmm,
1102 TEMP_ONE_I,
1103 TEMP_ONE_C );
1104 break;
1105
1106 default:
1107 assert( 0 );
1108 }
1109
1110 switch( tgsi_util_get_full_src_register_sign_mode( reg, chan_index ) ) {
1111 case TGSI_UTIL_SIGN_CLEAR:
1112 emit_abs( func, xmm );
1113 break;
1114
1115 case TGSI_UTIL_SIGN_SET:
1116 emit_setsign( func, xmm );
1117 break;
1118
1119 case TGSI_UTIL_SIGN_TOGGLE:
1120 emit_neg( func, xmm );
1121 break;
1122
1123 case TGSI_UTIL_SIGN_KEEP:
1124 break;
1125 }
1126 }
1127
1128 #define FETCH( FUNC, INST, XMM, INDEX, CHAN )\
1129 emit_fetch( FUNC, XMM, &(INST).FullSrcRegisters[INDEX], CHAN )
1130
1131 /**
1132 * Register store.
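 *
 * Write an xmm register to one channel of the destination operand.
 * Saturation is not implemented: ZERO_ONE is currently ignored and
 * MINUS_PLUS_ONE asserts.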
1133 */
1134
1135 static void
1136 emit_store(
1137 struct x86_function *func,
1138 unsigned xmm,
1139 const struct tgsi_full_dst_register *reg,
1140 const struct tgsi_full_instruction *inst,
1141 unsigned chan_index )
1142 {
1143 switch( reg->DstRegister.File ) {
1144 case TGSI_FILE_OUTPUT:
1145 emit_output(
1146 func,
1147 xmm,
1148 reg->DstRegister.Index,
1149 chan_index );
1150 break;
1151
1152 case TGSI_FILE_TEMPORARY:
1153 emit_temps(
1154 func,
1155 xmm,
1156 reg->DstRegister.Index,
1157 chan_index );
1158 break;
1159
1160 case TGSI_FILE_ADDRESS:
1161 emit_addrs(
1162 func,
1163 xmm,
1164 reg->DstRegister.Index,
1165 chan_index );
1166 break;
1167
1168 default:
1169 assert( 0 );
1170 }
1171
1172 switch( inst->Instruction.Saturate ) {
1173 case TGSI_SAT_NONE:
1174 break;
1175
1176 case TGSI_SAT_ZERO_ONE:
1177 /* assert( 0 ); */
1178 break;
1179
1180 case TGSI_SAT_MINUS_PLUS_ONE:
1181 assert( 0 );
1182 break;
1183 }
1184 }
1185
1186 #define STORE( FUNC, INST, XMM, INDEX, CHAN )\
1187 emit_store( FUNC, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN )
1188
1189 /**
1190 * High-level instruction translators.
1191 */
1192
1193 static void
1194 emit_kil(
1195 struct x86_function *func,
1196 const struct tgsi_full_src_register *reg )
1197 {
1198 unsigned uniquemask;
1199 unsigned registers[4];
1200 unsigned nextregister = 0;
1201 unsigned firstchan = ~0;
1202 unsigned chan_index;
1203
1204 /* This mask stores component bits that were already tested. Note that
 1205     * we test if the value is less than zero, so 1.0 and 0.0 need not be
1206 * tested. */
1207 uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE);
1208
1209 FOR_EACH_CHANNEL( chan_index ) {
1210 unsigned swizzle;
1211
1212 /* unswizzle channel */
1213 swizzle = tgsi_util_get_full_src_register_extswizzle(
1214 reg,
1215 chan_index );
1216
1217 /* check if the component has not been already tested */
1218 if( !(uniquemask & (1 << swizzle)) ) {
1219 uniquemask |= 1 << swizzle;
1220
1221 /* allocate register */
1222 registers[chan_index] = nextregister;
1223 emit_fetch(
1224 func,
1225 nextregister,
1226 reg,
1227 chan_index );
1228 nextregister++;
1229
1230 /* mark the first channel used */
1231 if( firstchan == ~0 ) {
1232 firstchan = chan_index;
1233 }
1234 }
1235 }
1236
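   /* Compare each fetched channel against zero, OR the resulting pmovmskb
    * masks together, and accumulate them into the persistent kill mask in
    * TGSI_EXEC_TEMP_KILMASK.
    */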
1237 x86_push(
1238 func,
1239 x86_make_reg( file_REG32, reg_AX ) );
1240 x86_push(
1241 func,
1242 x86_make_reg( file_REG32, reg_DX ) );
1243
1244 FOR_EACH_CHANNEL( chan_index ) {
1245 if( uniquemask & (1 << chan_index) ) {
1246 sse_cmpps(
1247 func,
1248 make_xmm( registers[chan_index] ),
1249 get_temp(
1250 TGSI_EXEC_TEMP_00000000_I,
1251 TGSI_EXEC_TEMP_00000000_C ),
1252 cc_LessThan );
1253
1254 if( chan_index == firstchan ) {
1255 sse_pmovmskb(
1256 func,
1257 x86_make_reg( file_REG32, reg_AX ),
1258 make_xmm( registers[chan_index] ) );
1259 }
1260 else {
1261 sse_pmovmskb(
1262 func,
1263 x86_make_reg( file_REG32, reg_DX ),
1264 make_xmm( registers[chan_index] ) );
1265 x86_or(
1266 func,
1267 x86_make_reg( file_REG32, reg_AX ),
1268 x86_make_reg( file_REG32, reg_DX ) );
1269 }
1270 }
1271 }
1272
1273 x86_or(
1274 func,
1275 get_temp(
1276 TGSI_EXEC_TEMP_KILMASK_I,
1277 TGSI_EXEC_TEMP_KILMASK_C ),
1278 x86_make_reg( file_REG32, reg_AX ) );
1279
1280 x86_pop(
1281 func,
1282 x86_make_reg( file_REG32, reg_DX ) );
1283 x86_pop(
1284 func,
1285 x86_make_reg( file_REG32, reg_AX ) );
1286 }
1287
1288
1289 static void
1290 emit_kilp(
1291 struct x86_function *func )
1292 {
1293 /* XXX todo / fix me */
1294 }
1295
1296
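/* SLT/SGE-style comparisons: cmpps yields an all-ones/all-zeros mask per
 * channel, which is ANDed with 1.0f to produce the TGSI 1.0/0.0 result.
 */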
1297 static void
1298 emit_setcc(
1299 struct x86_function *func,
1300 struct tgsi_full_instruction *inst,
1301 enum sse_cc cc )
1302 {
1303 unsigned chan_index;
1304
1305 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1306 FETCH( func, *inst, 0, 0, chan_index );
1307 FETCH( func, *inst, 1, 1, chan_index );
1308 sse_cmpps(
1309 func,
1310 make_xmm( 0 ),
1311 make_xmm( 1 ),
1312 cc );
1313 sse_andps(
1314 func,
1315 make_xmm( 0 ),
1316 get_temp(
1317 TEMP_ONE_I,
1318 TEMP_ONE_C ) );
1319 STORE( func, *inst, 0, 0, chan_index );
1320 }
1321 }
1322
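/* CMP: per channel, dst = (src0 < 0.0) ? src1 : src2, implemented with a
 * cmpps mask plus andps/andnps/orps selection.
 */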
1323 static void
1324 emit_cmp(
1325 struct x86_function *func,
1326 struct tgsi_full_instruction *inst )
1327 {
1328 unsigned chan_index;
1329
1330 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1331 FETCH( func, *inst, 0, 0, chan_index );
1332 FETCH( func, *inst, 1, 1, chan_index );
1333 FETCH( func, *inst, 2, 2, chan_index );
1334 sse_cmpps(
1335 func,
1336 make_xmm( 0 ),
1337 get_temp(
1338 TGSI_EXEC_TEMP_00000000_I,
1339 TGSI_EXEC_TEMP_00000000_C ),
1340 cc_LessThan );
1341 sse_andps(
1342 func,
1343 make_xmm( 1 ),
1344 make_xmm( 0 ) );
1345 sse_andnps(
1346 func,
1347 make_xmm( 0 ),
1348 make_xmm( 2 ) );
1349 sse_orps(
1350 func,
1351 make_xmm( 0 ),
1352 make_xmm( 1 ) );
1353 STORE( func, *inst, 0, 0, chan_index );
1354 }
1355 }
1356
1357 static int
1358 emit_instruction(
1359 struct x86_function *func,
1360 struct tgsi_full_instruction *inst )
1361 {
1362 unsigned chan_index;
1363
1364 switch (inst->Instruction.Opcode) {
1365 case TGSI_OPCODE_ARL:
1366 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1367 FETCH( func, *inst, 0, 0, chan_index );
1368 emit_f2it( func, 0 );
1369 STORE( func, *inst, 0, 0, chan_index );
1370 }
1371 break;
1372
1373 case TGSI_OPCODE_MOV:
1374 case TGSI_OPCODE_SWZ:
1375 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1376 FETCH( func, *inst, 0, 0, chan_index );
1377 STORE( func, *inst, 0, 0, chan_index );
1378 }
1379 break;
1380
1381 case TGSI_OPCODE_LIT:
1382 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1383 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1384 emit_tempf(
1385 func,
1386 0,
1387 TEMP_ONE_I,
1388 TEMP_ONE_C);
1389 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
1390 STORE( func, *inst, 0, 0, CHAN_X );
1391 }
1392 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1393 STORE( func, *inst, 0, 0, CHAN_W );
1394 }
1395 }
1396 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1397 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1398 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
1399 FETCH( func, *inst, 0, 0, CHAN_X );
1400 sse_maxps(
1401 func,
1402 make_xmm( 0 ),
1403 get_temp(
1404 TGSI_EXEC_TEMP_00000000_I,
1405 TGSI_EXEC_TEMP_00000000_C ) );
1406 STORE( func, *inst, 0, 0, CHAN_Y );
1407 }
1408 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1409 /* XMM[1] = SrcReg[0].yyyy */
1410 FETCH( func, *inst, 1, 0, CHAN_Y );
1411 /* XMM[1] = max(XMM[1], 0) */
1412 sse_maxps(
1413 func,
1414 make_xmm( 1 ),
1415 get_temp(
1416 TGSI_EXEC_TEMP_00000000_I,
1417 TGSI_EXEC_TEMP_00000000_C ) );
1418 /* XMM[2] = SrcReg[0].wwww */
1419 FETCH( func, *inst, 2, 0, CHAN_W );
1420 /* XMM[2] = min(XMM[2], 128.0) */
1421 sse_minps(
1422 func,
1423 make_xmm( 2 ),
1424 get_temp(
1425 TGSI_EXEC_TEMP_128_I,
1426 TGSI_EXEC_TEMP_128_C ) );
1427 /* XMM[2] = max(XMM[2], -128.0) */
1428 sse_maxps(
1429 func,
1430 make_xmm( 2 ),
1431 get_temp(
1432 TGSI_EXEC_TEMP_MINUS_128_I,
1433 TGSI_EXEC_TEMP_MINUS_128_C ) );
1434 emit_pow( func, 3, 1, 2 );
1435 FETCH( func, *inst, 0, 0, CHAN_X );
1436 sse_xorps(
1437 func,
1438 make_xmm( 2 ),
1439 make_xmm( 2 ) );
1440 sse_cmpps(
1441 func,
1442 make_xmm( 2 ),
1443 make_xmm( 0 ),
1444 cc_LessThanEqual );
1445 sse_andps(
1446 func,
1447 make_xmm( 2 ),
1448 make_xmm( 1 ) );
1449 STORE( func, *inst, 2, 0, CHAN_Z );
1450 }
1451 }
1452 break;
1453
1454 case TGSI_OPCODE_RCP:
1455 /* TGSI_OPCODE_RECIP */
1456 FETCH( func, *inst, 0, 0, CHAN_X );
1457 emit_rcp( func, 0, 0 );
1458 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1459 STORE( func, *inst, 0, 0, chan_index );
1460 }
1461 break;
1462
1463 case TGSI_OPCODE_RSQ:
1464 /* TGSI_OPCODE_RECIPSQRT */
1465 FETCH( func, *inst, 0, 0, CHAN_X );
1466 emit_rsqrt( func, 1, 0 );
1467 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1468 STORE( func, *inst, 1, 0, chan_index );
1469 }
1470 break;
1471
1472 case TGSI_OPCODE_EXP:
1473 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1474 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1475 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1476 FETCH( func, *inst, 0, 0, CHAN_X );
1477 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1478 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1479 emit_MOV( func, 1, 0 );
1480 emit_flr( func, 2, 1 );
1481 /* dst.x = ex2(floor(src.x)) */
1482 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
1483 emit_MOV( func, 2, 1 );
1484 emit_ex2( func, 3, 2 );
1485 STORE( func, *inst, 2, 0, CHAN_X );
1486 }
1487 /* dst.y = src.x - floor(src.x) */
1488 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1489 emit_MOV( func, 2, 0 );
1490 emit_sub( func, 2, 1 );
1491 STORE( func, *inst, 2, 0, CHAN_Y );
1492 }
1493 }
1494 /* dst.z = ex2(src.x) */
1495 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1496 emit_ex2( func, 3, 0 );
1497 STORE( func, *inst, 0, 0, CHAN_Z );
1498 }
1499 }
1500 /* dst.w = 1.0 */
1501 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
1502 emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
1503 STORE( func, *inst, 0, 0, CHAN_W );
1504 }
1505 break;
1506
1507 case TGSI_OPCODE_LOG:
1508 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1509 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1510 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1511 FETCH( func, *inst, 0, 0, CHAN_X );
1512 emit_abs( func, 0 );
1513 emit_MOV( func, 1, 0 );
1514 emit_lg2( func, 2, 1 );
1515 /* dst.z = lg2(abs(src.x)) */
1516 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1517 STORE( func, *inst, 1, 0, CHAN_Z );
1518 }
1519 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1520 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1521 emit_flr( func, 2, 1 );
1522 /* dst.x = floor(lg2(abs(src.x))) */
1523 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
1524 STORE( func, *inst, 1, 0, CHAN_X );
1525 }
 1526          /* dst.y = abs(src.x) / ex2(floor(lg2(abs(src.x)))) */
1527 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1528 emit_ex2( func, 2, 1 );
1529 emit_rcp( func, 1, 1 );
1530 emit_mul( func, 0, 1 );
1531 STORE( func, *inst, 0, 0, CHAN_Y );
1532 }
1533 }
1534 }
1535 /* dst.w = 1.0 */
1536 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
1537 emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
1538 STORE( func, *inst, 0, 0, CHAN_W );
1539 }
1540 break;
1541
1542 case TGSI_OPCODE_MUL:
1543 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1544 FETCH( func, *inst, 0, 0, chan_index );
1545 FETCH( func, *inst, 1, 1, chan_index );
1546 emit_mul( func, 0, 1 );
1547 STORE( func, *inst, 0, 0, chan_index );
1548 }
1549 break;
1550
1551 case TGSI_OPCODE_ADD:
1552 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1553 FETCH( func, *inst, 0, 0, chan_index );
1554 FETCH( func, *inst, 1, 1, chan_index );
1555 emit_add( func, 0, 1 );
1556 STORE( func, *inst, 0, 0, chan_index );
1557 }
1558 break;
1559
1560 case TGSI_OPCODE_DP3:
1561 /* TGSI_OPCODE_DOT3 */
1562 FETCH( func, *inst, 0, 0, CHAN_X );
1563 FETCH( func, *inst, 1, 1, CHAN_X );
1564 emit_mul( func, 0, 1 );
1565 FETCH( func, *inst, 1, 0, CHAN_Y );
1566 FETCH( func, *inst, 2, 1, CHAN_Y );
1567 emit_mul( func, 1, 2 );
1568 emit_add( func, 0, 1 );
1569 FETCH( func, *inst, 1, 0, CHAN_Z );
1570 FETCH( func, *inst, 2, 1, CHAN_Z );
1571 emit_mul( func, 1, 2 );
1572 emit_add( func, 0, 1 );
1573 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1574 STORE( func, *inst, 0, 0, chan_index );
1575 }
1576 break;
1577
1578 case TGSI_OPCODE_DP4:
1579 /* TGSI_OPCODE_DOT4 */
1580 FETCH( func, *inst, 0, 0, CHAN_X );
1581 FETCH( func, *inst, 1, 1, CHAN_X );
1582 emit_mul( func, 0, 1 );
1583 FETCH( func, *inst, 1, 0, CHAN_Y );
1584 FETCH( func, *inst, 2, 1, CHAN_Y );
1585 emit_mul( func, 1, 2 );
1586 emit_add( func, 0, 1 );
1587 FETCH( func, *inst, 1, 0, CHAN_Z );
1588 FETCH( func, *inst, 2, 1, CHAN_Z );
 1589       emit_mul( func, 1, 2 );
 1590       emit_add( func, 0, 1 );
1591 FETCH( func, *inst, 1, 0, CHAN_W );
1592 FETCH( func, *inst, 2, 1, CHAN_W );
1593 emit_mul( func, 1, 2 );
1594 emit_add( func, 0, 1 );
1595 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1596 STORE( func, *inst, 0, 0, chan_index );
1597 }
1598 break;
1599
1600 case TGSI_OPCODE_DST:
1601 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
1602 emit_tempf(
1603 func,
1604 0,
1605 TEMP_ONE_I,
1606 TEMP_ONE_C );
1607 STORE( func, *inst, 0, 0, CHAN_X );
1608 }
1609 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
1610 FETCH( func, *inst, 0, 0, CHAN_Y );
1611 FETCH( func, *inst, 1, 1, CHAN_Y );
1612 emit_mul( func, 0, 1 );
1613 STORE( func, *inst, 0, 0, CHAN_Y );
1614 }
1615 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
1616 FETCH( func, *inst, 0, 0, CHAN_Z );
1617 STORE( func, *inst, 0, 0, CHAN_Z );
1618 }
1619 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
1620 FETCH( func, *inst, 0, 1, CHAN_W );
1621 STORE( func, *inst, 0, 0, CHAN_W );
1622 }
1623 break;
1624
1625 case TGSI_OPCODE_MIN:
1626 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1627 FETCH( func, *inst, 0, 0, chan_index );
1628 FETCH( func, *inst, 1, 1, chan_index );
1629 sse_minps(
1630 func,
1631 make_xmm( 0 ),
1632 make_xmm( 1 ) );
1633 STORE( func, *inst, 0, 0, chan_index );
1634 }
1635 break;
1636
1637 case TGSI_OPCODE_MAX:
1638 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1639 FETCH( func, *inst, 0, 0, chan_index );
1640 FETCH( func, *inst, 1, 1, chan_index );
1641 sse_maxps(
1642 func,
1643 make_xmm( 0 ),
1644 make_xmm( 1 ) );
1645 STORE( func, *inst, 0, 0, chan_index );
1646 }
1647 break;
1648
1649 case TGSI_OPCODE_SLT:
1650 /* TGSI_OPCODE_SETLT */
1651 emit_setcc( func, inst, cc_LessThan );
1652 break;
1653
1654 case TGSI_OPCODE_SGE:
1655 /* TGSI_OPCODE_SETGE */
1656 emit_setcc( func, inst, cc_NotLessThan );
1657 break;
1658
1659 case TGSI_OPCODE_MAD:
1660 /* TGSI_OPCODE_MADD */
1661 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1662 FETCH( func, *inst, 0, 0, chan_index );
1663 FETCH( func, *inst, 1, 1, chan_index );
1664 FETCH( func, *inst, 2, 2, chan_index );
1665 emit_mul( func, 0, 1 );
1666 emit_add( func, 0, 2 );
1667 STORE( func, *inst, 0, 0, chan_index );
1668 }
1669 break;
1670
1671 case TGSI_OPCODE_SUB:
1672 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1673 FETCH( func, *inst, 0, 0, chan_index );
1674 FETCH( func, *inst, 1, 1, chan_index );
1675 emit_sub( func, 0, 1 );
1676 STORE( func, *inst, 0, 0, chan_index );
1677 }
1678 break;
1679
1680 case TGSI_OPCODE_LERP:
1681 /* TGSI_OPCODE_LRP */
1682 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1683 FETCH( func, *inst, 0, 0, chan_index );
1684 FETCH( func, *inst, 1, 1, chan_index );
1685 FETCH( func, *inst, 2, 2, chan_index );
1686 emit_sub( func, 1, 2 );
1687 emit_mul( func, 0, 1 );
1688 emit_add( func, 0, 2 );
1689 STORE( func, *inst, 0, 0, chan_index );
1690 }
1691 break;
1692
1693 case TGSI_OPCODE_CND:
1694 return 0;
1695 break;
1696
1697 case TGSI_OPCODE_CND0:
1698 return 0;
1699 break;
1700
1701 case TGSI_OPCODE_DOT2ADD:
1702 /* TGSI_OPCODE_DP2A */
1703 return 0;
1704 break;
1705
1706 case TGSI_OPCODE_INDEX:
1707 return 0;
1708 break;
1709
1710 case TGSI_OPCODE_NEGATE:
1711 return 0;
1712 break;
1713
1714 case TGSI_OPCODE_FRAC:
1715 /* TGSI_OPCODE_FRC */
1716 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1717 FETCH( func, *inst, 0, 0, chan_index );
1718 emit_frc( func, 0, 0 );
1719 STORE( func, *inst, 0, 0, chan_index );
1720 }
1721 break;
1722
1723 case TGSI_OPCODE_CLAMP:
1724 return 0;
1725 break;
1726
1727 case TGSI_OPCODE_FLOOR:
1728 /* TGSI_OPCODE_FLR */
1729 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1730 FETCH( func, *inst, 0, 0, chan_index );
1731 emit_flr( func, 0, 0 );
1732 STORE( func, *inst, 0, 0, chan_index );
1733 }
1734 break;
1735
1736 case TGSI_OPCODE_ROUND:
1737 return 0;
1738 break;
1739
1740 case TGSI_OPCODE_EXPBASE2:
1741 /* TGSI_OPCODE_EX2 */
1742 FETCH( func, *inst, 0, 0, CHAN_X );
1743 emit_ex2( func, 0, 0 );
1744 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1745 STORE( func, *inst, 0, 0, chan_index );
1746 }
1747 break;
1748
1749 case TGSI_OPCODE_LOGBASE2:
1750 /* TGSI_OPCODE_LG2 */
1751 FETCH( func, *inst, 0, 0, CHAN_X );
1752 emit_lg2( func, 0, 0 );
1753 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1754 STORE( func, *inst, 0, 0, chan_index );
1755 }
1756 break;
1757
1758 case TGSI_OPCODE_POWER:
1759 /* TGSI_OPCODE_POW */
1760 FETCH( func, *inst, 0, 0, CHAN_X );
1761 FETCH( func, *inst, 1, 1, CHAN_X );
1762 emit_pow( func, 0, 0, 1 );
1763 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1764 STORE( func, *inst, 0, 0, chan_index );
1765 }
1766 break;
1767
1768 case TGSI_OPCODE_CROSSPRODUCT:
1769 /* TGSI_OPCODE_XPD */
1770 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1771 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
1772 FETCH( func, *inst, 1, 1, CHAN_Z );
1773 FETCH( func, *inst, 3, 0, CHAN_Z );
1774 }
1775 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1776 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1777 FETCH( func, *inst, 0, 0, CHAN_Y );
1778 FETCH( func, *inst, 4, 1, CHAN_Y );
1779 }
1780 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
1781 emit_MOV( func, 2, 0 );
1782 emit_mul( func, 2, 1 );
1783 emit_MOV( func, 5, 3 );
1784 emit_mul( func, 5, 4 );
1785 emit_sub( func, 2, 5 );
1786 STORE( func, *inst, 2, 0, CHAN_X );
1787 }
1788 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1789 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1790 FETCH( func, *inst, 2, 1, CHAN_X );
1791 FETCH( func, *inst, 5, 0, CHAN_X );
1792 }
1793 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
1794 emit_mul( func, 3, 2 );
1795 emit_mul( func, 1, 5 );
1796 emit_sub( func, 3, 1 );
1797 STORE( func, *inst, 3, 0, CHAN_Y );
1798 }
1799 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
1800 emit_mul( func, 5, 4 );
1801 emit_mul( func, 0, 2 );
1802 emit_sub( func, 5, 0 );
1803 STORE( func, *inst, 5, 0, CHAN_Z );
1804 }
1805 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
1806 emit_tempf(
1807 func,
1808 0,
1809 TEMP_ONE_I,
1810 TEMP_ONE_C );
1811 STORE( func, *inst, 0, 0, CHAN_W );
1812 }
1813 break;
1814
1815 case TGSI_OPCODE_MULTIPLYMATRIX:
1816 return 0;
1817 break;
1818
1819 case TGSI_OPCODE_ABS:
1820 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1821 FETCH( func, *inst, 0, 0, chan_index );
 1822          emit_abs( func, 0 );
1823
1824 STORE( func, *inst, 0, 0, chan_index );
1825 }
1826 break;
1827
1828 case TGSI_OPCODE_RCC:
1829 return 0;
1830 break;
1831
1832 case TGSI_OPCODE_DPH:
1833 FETCH( func, *inst, 0, 0, CHAN_X );
1834 FETCH( func, *inst, 1, 1, CHAN_X );
1835 emit_mul( func, 0, 1 );
1836 FETCH( func, *inst, 1, 0, CHAN_Y );
1837 FETCH( func, *inst, 2, 1, CHAN_Y );
1838 emit_mul( func, 1, 2 );
1839 emit_add( func, 0, 1 );
1840 FETCH( func, *inst, 1, 0, CHAN_Z );
1841 FETCH( func, *inst, 2, 1, CHAN_Z );
1842 emit_mul( func, 1, 2 );
1843 emit_add( func, 0, 1 );
1844 FETCH( func, *inst, 1, 1, CHAN_W );
1845 emit_add( func, 0, 1 );
1846 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1847 STORE( func, *inst, 0, 0, chan_index );
1848 }
1849 break;
1850
1851 case TGSI_OPCODE_COS:
1852 FETCH( func, *inst, 0, 0, CHAN_X );
1853 emit_cos( func, 0, 0 );
1854 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1855 STORE( func, *inst, 0, 0, chan_index );
1856 }
1857 break;
1858
1859 case TGSI_OPCODE_DDX:
1860 return 0;
1861 break;
1862
1863 case TGSI_OPCODE_DDY:
1864 return 0;
1865 break;
1866
1867 case TGSI_OPCODE_KILP:
1868 /* predicated kill */
1869 emit_kilp( func );
1870 return 0; /* XXX fix me */
1871 break;
1872
1873 case TGSI_OPCODE_KIL:
1874 /* conditional kill */
1875 emit_kil( func, &inst->FullSrcRegisters[0] );
1876 break;
1877
1878 case TGSI_OPCODE_PK2H:
1879 return 0;
1880 break;
1881
1882 case TGSI_OPCODE_PK2US:
1883 return 0;
1884 break;
1885
1886 case TGSI_OPCODE_PK4B:
1887 return 0;
1888 break;
1889
1890 case TGSI_OPCODE_PK4UB:
1891 return 0;
1892 break;
1893
1894 case TGSI_OPCODE_RFL:
1895 return 0;
1896 break;
1897
1898 case TGSI_OPCODE_SEQ:
1899 return 0;
1900 break;
1901
1902 case TGSI_OPCODE_SFL:
1903 return 0;
1904 break;
1905
1906 case TGSI_OPCODE_SGT:
1907 return 0;
1908 break;
1909
1910 case TGSI_OPCODE_SIN:
1911 FETCH( func, *inst, 0, 0, CHAN_X );
1912 emit_sin( func, 0, 0 );
1913 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1914 STORE( func, *inst, 0, 0, chan_index );
1915 }
1916 break;
1917
1918 case TGSI_OPCODE_SLE:
1919 return 0;
1920 break;
1921
1922 case TGSI_OPCODE_SNE:
1923 return 0;
1924 break;
1925
1926 case TGSI_OPCODE_STR:
1927 return 0;
1928 break;
1929
1930 case TGSI_OPCODE_TEX:
1931 if (0) {
1932 /* Disable dummy texture code:
1933 */
1934 emit_tempf(
1935 func,
1936 0,
1937 TEMP_ONE_I,
1938 TEMP_ONE_C );
1939 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1940 STORE( func, *inst, 0, 0, chan_index );
1941 }
1942 }
1943 else {
1944 return 0;
1945 }
1946 break;
1947
1948 case TGSI_OPCODE_TXD:
1949 return 0;
1950 break;
1951
1952 case TGSI_OPCODE_UP2H:
1953 return 0;
1954 break;
1955
1956 case TGSI_OPCODE_UP2US:
1957 return 0;
1958 break;
1959
1960 case TGSI_OPCODE_UP4B:
1961 return 0;
1962 break;
1963
1964 case TGSI_OPCODE_UP4UB:
1965 return 0;
1966 break;
1967
1968 case TGSI_OPCODE_X2D:
1969 return 0;
1970 break;
1971
1972 case TGSI_OPCODE_ARA:
1973 return 0;
1974 break;
1975
1976 case TGSI_OPCODE_ARR:
1977 return 0;
1978 break;
1979
1980 case TGSI_OPCODE_BRA:
1981 return 0;
1982 break;
1983
1984 case TGSI_OPCODE_CAL:
1985 return 0;
1986 break;
1987
1988 case TGSI_OPCODE_RET:
1989 emit_ret( func );
1990 break;
1991
1992 case TGSI_OPCODE_END:
1993 break;
1994
1995 case TGSI_OPCODE_SSG:
1996 return 0;
1997 break;
1998
1999 case TGSI_OPCODE_CMP:
2000 emit_cmp (func, inst);
2001 break;
2002
2003 case TGSI_OPCODE_SCS:
2004 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
2005 FETCH( func, *inst, 0, 0, CHAN_X );
2006 emit_cos( func, 0, 0 );
2007 STORE( func, *inst, 0, 0, CHAN_X );
2008 }
2009 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2010 FETCH( func, *inst, 0, 0, CHAN_X );
2011 emit_sin( func, 0, 0 );
2012 STORE( func, *inst, 0, 0, CHAN_Y );
2013 }
2014 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2015 emit_tempf(
2016 func,
2017 0,
2018 TGSI_EXEC_TEMP_00000000_I,
2019 TGSI_EXEC_TEMP_00000000_C );
2020 STORE( func, *inst, 0, 0, CHAN_Z );
2021 }
2022 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2023 emit_tempf(
2024 func,
2025 0,
2026 TEMP_ONE_I,
2027 TEMP_ONE_C );
2028 STORE( func, *inst, 0, 0, CHAN_W );
2029 }
2030 break;
2031
2032 case TGSI_OPCODE_TXB:
2033 return 0;
2034 break;
2035
2036 case TGSI_OPCODE_NRM:
2037 return 0;
2038 break;
2039
2040 case TGSI_OPCODE_DIV:
2041 return 0;
2042 break;
2043
2044 case TGSI_OPCODE_DP2:
2045 return 0;
2046 break;
2047
2048 case TGSI_OPCODE_TXL:
2049 return 0;
2050 break;
2051
2052 case TGSI_OPCODE_BRK:
2053 return 0;
2054 break;
2055
2056 case TGSI_OPCODE_IF:
2057 return 0;
2058 break;
2059
2060 case TGSI_OPCODE_LOOP:
2061 return 0;
2062 break;
2063
2064 case TGSI_OPCODE_REP:
2065 return 0;
2066 break;
2067
2068 case TGSI_OPCODE_ELSE:
2069 return 0;
2070 break;
2071
2072 case TGSI_OPCODE_ENDIF:
2073 return 0;
2074 break;
2075
2076 case TGSI_OPCODE_ENDLOOP:
2077 return 0;
2078 break;
2079
2080 case TGSI_OPCODE_ENDREP:
2081 return 0;
2082 break;
2083
2084 case TGSI_OPCODE_PUSHA:
2085 return 0;
2086 break;
2087
2088 case TGSI_OPCODE_POPA:
2089 return 0;
2090 break;
2091
2092 case TGSI_OPCODE_CEIL:
2093 return 0;
2094 break;
2095
2096 case TGSI_OPCODE_I2F:
2097 return 0;
2098 break;
2099
2100 case TGSI_OPCODE_NOT:
2101 return 0;
2102 break;
2103
2104 case TGSI_OPCODE_TRUNC:
2105 return 0;
2106 break;
2107
2108 case TGSI_OPCODE_SHL:
2109 return 0;
2110 break;
2111
2112 case TGSI_OPCODE_SHR:
2113 return 0;
2114 break;
2115
2116 case TGSI_OPCODE_AND:
2117 return 0;
2118 break;
2119
2120 case TGSI_OPCODE_OR:
2121 return 0;
2122 break;
2123
2124 case TGSI_OPCODE_MOD:
2125 return 0;
2126 break;
2127
2128 case TGSI_OPCODE_XOR:
2129 return 0;
2130 break;
2131
2132 case TGSI_OPCODE_SAD:
2133 return 0;
2134 break;
2135
2136 case TGSI_OPCODE_TXF:
2137 return 0;
2138 break;
2139
2140 case TGSI_OPCODE_TXQ:
2141 return 0;
2142 break;
2143
2144 case TGSI_OPCODE_CONT:
2145 return 0;
2146 break;
2147
2148 case TGSI_OPCODE_EMIT:
2149 return 0;
2150 break;
2151
2152 case TGSI_OPCODE_ENDPRIM:
2153 return 0;
2154 break;
2155
2156 default:
2157 return 0;
2158 }
2159
2160 return 1;
2161 }
2162
2163 static void
2164 emit_declaration(
2165 struct x86_function *func,
2166 struct tgsi_full_declaration *decl )
2167 {
2168 if( decl->Declaration.File == TGSI_FILE_INPUT ) {
2169 unsigned first, last, mask;
2170 unsigned i, j;
2171
2172 first = decl->DeclarationRange.First;
2173 last = decl->DeclarationRange.Last;
2174 mask = decl->Declaration.UsageMask;
2175
2176 for( i = first; i <= last; i++ ) {
2177 for( j = 0; j < NUM_CHANNELS; j++ ) {
2178 if( mask & (1 << j) ) {
2179 switch( decl->Declaration.Interpolate ) {
2180 case TGSI_INTERPOLATE_CONSTANT:
2181 emit_coef_a0( func, 0, i, j );
2182 emit_inputs( func, 0, i, j );
2183 break;
2184
2185 case TGSI_INTERPOLATE_LINEAR:
2186 emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
2187 emit_coef_dadx( func, 1, i, j );
2188 emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
2189 emit_coef_dady( func, 3, i, j );
2190 emit_mul( func, 0, 1 ); /* x * dadx */
2191 emit_coef_a0( func, 4, i, j );
2192 emit_mul( func, 2, 3 ); /* y * dady */
2193 emit_add( func, 0, 4 ); /* x * dadx + a0 */
2194 emit_add( func, 0, 2 ); /* x * dadx + y * dady + a0 */
2195 emit_inputs( func, 0, i, j );
2196 break;
2197
2198 case TGSI_INTERPOLATE_PERSPECTIVE:
2199 emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
2200 emit_coef_dadx( func, 1, i, j );
2201 emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
2202 emit_coef_dady( func, 3, i, j );
2203 emit_mul( func, 0, 1 ); /* x * dadx */
2204 emit_tempf( func, 4, 0, TGSI_SWIZZLE_W );
2205 emit_coef_a0( func, 5, i, j );
2206 emit_rcp( func, 4, 4 ); /* 1.0 / w */
2207 emit_mul( func, 2, 3 ); /* y * dady */
2208 emit_add( func, 0, 5 ); /* x * dadx + a0 */
2209 emit_add( func, 0, 2 ); /* x * dadx + y * dady + a0 */
2210 emit_mul( func, 0, 4 ); /* (x * dadx + y * dady + a0) / w */
2211 emit_inputs( func, 0, i, j );
2212 break;
2213
2214 default:
2215 assert( 0 );
2216 break;
2217 }
2218 }
2219 }
2220 }
2221 }
2222 }
2223
2224 static void aos_to_soa( struct x86_function *func,
2225 uint arg_aos,
2226 uint arg_soa,
2227 uint arg_num,
2228 uint arg_stride )
2229 {
2230 struct x86_reg soa_input = x86_make_reg( file_REG32, reg_AX );
2231 struct x86_reg aos_input = x86_make_reg( file_REG32, reg_BX );
2232 struct x86_reg num_inputs = x86_make_reg( file_REG32, reg_CX );
2233 struct x86_reg stride = x86_make_reg( file_REG32, reg_DX );
2234 int inner_loop;
2235
2236
2237 /* Save EBX */
2238 x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
2239
2240 x86_mov( func, aos_input, x86_fn_arg( func, arg_aos ) );
2241 x86_mov( func, soa_input, x86_fn_arg( func, arg_soa ) );
2242 x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) );
2243 x86_mov( func, stride, x86_fn_arg( func, arg_stride ) );
2244
2245 /* do */
2246 inner_loop = x86_get_label( func );
2247 {
2248 x86_push( func, aos_input );
2249 sse_movlps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
2250 sse_movlps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
2251 x86_add( func, aos_input, stride );
2252 sse_movhps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
2253 sse_movhps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
2254 x86_add( func, aos_input, stride );
2255 sse_movlps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
2256 sse_movlps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
2257 x86_add( func, aos_input, stride );
2258 sse_movhps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
2259 sse_movhps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
2260 x86_pop( func, aos_input );
2261
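      /* 4x4 transpose: xmm0/xmm3 hold the {x,y}/{z,w} pairs of vertices 0-1
       * and xmm1/xmm4 those of vertices 2-3. shufps with 0x88 keeps the even
       * lanes (x or z) and 0xdd the odd lanes (y or w), producing one
       * register per component.
       */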
2262 sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
2263 sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
2264 sse_shufps( func, make_xmm( 0 ), make_xmm( 1 ), 0x88 );
2265 sse_shufps( func, make_xmm( 2 ), make_xmm( 1 ), 0xdd );
2266 sse_shufps( func, make_xmm( 3 ), make_xmm( 4 ), 0x88 );
2267 sse_shufps( func, make_xmm( 5 ), make_xmm( 4 ), 0xdd );
2268
2269 sse_movups( func, x86_make_disp( soa_input, 0 ), make_xmm( 0 ) );
2270 sse_movups( func, x86_make_disp( soa_input, 16 ), make_xmm( 2 ) );
2271 sse_movups( func, x86_make_disp( soa_input, 32 ), make_xmm( 3 ) );
2272 sse_movups( func, x86_make_disp( soa_input, 48 ), make_xmm( 5 ) );
2273
2274 /* Advance to next input */
2275 x86_lea( func, aos_input, x86_make_disp(aos_input, 16) );
2276 x86_lea( func, soa_input, x86_make_disp(soa_input, 64) );
2277 }
2278 /* while --num_inputs */
2279 x86_dec( func, num_inputs );
2280 x86_jcc( func, cc_NE, inner_loop );
2281
2282 /* Restore EBX */
2283 x86_pop( func, aos_input );
2284 }
2285
2286 static void soa_to_aos( struct x86_function *func, uint aos, uint soa, uint num, uint stride )
2287 {
2288 struct x86_reg soa_output;
2289 struct x86_reg aos_output;
2290 struct x86_reg num_outputs;
2291 struct x86_reg temp;
2292 int inner_loop;
2293
2294 soa_output = x86_make_reg( file_REG32, reg_AX );
2295 aos_output = x86_make_reg( file_REG32, reg_BX );
2296 num_outputs = x86_make_reg( file_REG32, reg_CX );
2297 temp = x86_make_reg( file_REG32, reg_DX );
2298
2299 /* Save EBX */
2300 x86_push( func, aos_output );
2301
2302 x86_mov( func, soa_output, x86_fn_arg( func, soa ) );
2303 x86_mov( func, aos_output, x86_fn_arg( func, aos ) );
2304 x86_mov( func, num_outputs, x86_fn_arg( func, num ) );
2305
2306 /* do */
2307 inner_loop = x86_get_label( func );
2308 {
2309 sse_movups( func, make_xmm( 0 ), x86_make_disp( soa_output, 0 ) );
2310 sse_movups( func, make_xmm( 1 ), x86_make_disp( soa_output, 16 ) );
2311 sse_movups( func, make_xmm( 3 ), x86_make_disp( soa_output, 32 ) );
2312 sse_movups( func, make_xmm( 4 ), x86_make_disp( soa_output, 48 ) );
2313
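      /* Inverse transpose: unpcklps/unpckhps interleave the x/y and z/w
       * planes back into per-vertex {x,y} and {z,w} pairs, which are then
       * scattered with movlps/movhps at the caller-supplied stride.
       */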
2314 sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
2315 sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
2316 sse_unpcklps( func, make_xmm( 0 ), make_xmm( 1 ) );
2317 sse_unpckhps( func, make_xmm( 2 ), make_xmm( 1 ) );
2318 sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) );
2319 sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) );
2320
2321 x86_mov( func, temp, x86_fn_arg( func, stride ) );
2322 x86_push( func, aos_output );
2323 sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
2324 sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
2325 x86_add( func, aos_output, temp );
2326 sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
2327 sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
2328 x86_add( func, aos_output, temp );
2329 sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
2330 sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
2331 x86_add( func, aos_output, temp );
2332 sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
2333 sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
2334 x86_pop( func, aos_output );
2335
2336 /* Advance to next output */
2337 x86_lea( func, aos_output, x86_make_disp(aos_output, 16) );
2338 x86_lea( func, soa_output, x86_make_disp(soa_output, 64) );
2339 }
2340 /* while --num_outputs */
2341 x86_dec( func, num_outputs );
2342 x86_jcc( func, cc_NE, inner_loop );
2343
2344 /* Restore EBX */
2345 x86_pop( func, aos_output );
2346 }
2347
2348 /**
2349 * Translate a TGSI vertex/fragment shader to SSE2 code.
2350 * Slightly different things are done for vertex vs. fragment shaders.
2351 *
2352 * Note that fragment shaders are responsible for interpolating shader
2353 * inputs. Because on x86 we have only 4 GP registers, and here we
2354 * have 5 shader arguments (input, output, const, temp and coef), the
2355 * code is split into two phases -- DECLARATION and INSTRUCTION phase.
 2356  * The GP register holding the output argument is aliased with the coeff
2357 * argument, as outputs are not needed in the DECLARATION phase.
2358 *
2359 * \param tokens the TGSI input shader
2360 * \param func the output SSE code/function
2361 * \param immediates buffer to place immediates, later passed to SSE func
 2362  * \return 1 for success, 0 if translation failed
2363 */
2364 unsigned
2365 tgsi_emit_sse2(
2366 const struct tgsi_token *tokens,
2367 struct x86_function *func,
2368 float (*immediates)[4],
2369 boolean do_swizzles )
2370 {
2371 struct tgsi_parse_context parse;
2372 boolean instruction_phase = FALSE;
2373 unsigned ok = 1;
2374 uint num_immediates = 0;
2375
2376 util_init_math();
2377
2378 func->csr = func->store;
2379
2380 tgsi_parse_init( &parse, tokens );
2381
2382 /* Can't just use EDI, EBX without save/restoring them:
2383 */
2384 x86_push(
2385 func,
2386 get_immediate_base() );
2387
2388 x86_push(
2389 func,
2390 get_temp_base() );
2391
2392
2393 /*
2394 * Different function args for vertex/fragment shaders:
2395 */
2396 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
2397 /* DECLARATION phase, do not load output argument. */
2398 x86_mov(
2399 func,
2400 get_input_base(),
2401 x86_fn_arg( func, 1 ) );
2402 /* skipping outputs argument here */
2403 x86_mov(
2404 func,
2405 get_const_base(),
2406 x86_fn_arg( func, 3 ) );
2407 x86_mov(
2408 func,
2409 get_temp_base(),
2410 x86_fn_arg( func, 4 ) );
2411 x86_mov(
2412 func,
2413 get_coef_base(),
2414 x86_fn_arg( func, 5 ) );
2415 x86_mov(
2416 func,
2417 get_immediate_base(),
2418 x86_fn_arg( func, 6 ) );
2419 }
2420 else {
2421 assert(parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX);
2422
2423 if (do_swizzles)
2424 aos_to_soa( func,
2425 6, /* aos_input */
2426 1, /* machine->input */
2427 7, /* num_inputs */
2428 8 ); /* input_stride */
2429
2430 x86_mov(
2431 func,
2432 get_input_base(),
2433 x86_fn_arg( func, 1 ) );
2434 x86_mov(
2435 func,
2436 get_output_base(),
2437 x86_fn_arg( func, 2 ) );
2438 x86_mov(
2439 func,
2440 get_const_base(),
2441 x86_fn_arg( func, 3 ) );
2442 x86_mov(
2443 func,
2444 get_temp_base(),
2445 x86_fn_arg( func, 4 ) );
2446 x86_mov(
2447 func,
2448 get_immediate_base(),
2449 x86_fn_arg( func, 5 ) );
2450 }
2451
2452 while( !tgsi_parse_end_of_tokens( &parse ) && ok ) {
2453 tgsi_parse_token( &parse );
2454
2455 switch( parse.FullToken.Token.Type ) {
2456 case TGSI_TOKEN_TYPE_DECLARATION:
2457 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
2458 emit_declaration(
2459 func,
2460 &parse.FullToken.FullDeclaration );
2461 }
2462 break;
2463
2464 case TGSI_TOKEN_TYPE_INSTRUCTION:
2465 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
2466 if( !instruction_phase ) {
2467 /* INSTRUCTION phase, overwrite coeff with output. */
2468 instruction_phase = TRUE;
2469 x86_mov(
2470 func,
2471 get_output_base(),
2472 x86_fn_arg( func, 2 ) );
2473 }
2474 }
2475
2476 ok = emit_instruction(
2477 func,
2478 &parse.FullToken.FullInstruction );
2479
2480 if (!ok) {
2481 debug_printf("failed to translate tgsi opcode %d to SSE (%s)\n",
2482 parse.FullToken.FullInstruction.Instruction.Opcode,
2483 parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX ?
2484 "vertex shader" : "fragment shader");
2485 }
2486 break;
2487
2488 case TGSI_TOKEN_TYPE_IMMEDIATE:
2489 /* simply copy the immediate values into the next immediates[] slot */
2490 {
2491 const uint size = parse.FullToken.FullImmediate.Immediate.Size - 1;
2492 uint i;
2493 assert(size <= 4);
2494 assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES);
2495 for( i = 0; i < size; i++ ) {
2496 immediates[num_immediates][i] =
2497 parse.FullToken.FullImmediate.u.ImmediateFloat32[i].Float;
2498 }
2499 #if 0
2500 debug_printf("SSE FS immediate[%d] = %f %f %f %f\n",
2501 num_immediates,
2502 immediates[num_immediates][0],
2503 immediates[num_immediates][1],
2504 immediates[num_immediates][2],
2505 immediates[num_immediates][3]);
2506 #endif
2507 num_immediates++;
2508 }
2509 break;
2510
2511 default:
2512 ok = 0;
2513 assert( 0 );
2514 }
2515 }
2516
2517 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
2518 if (do_swizzles)
2519 soa_to_aos( func, 9, 2, 10, 11 );
2520 }
2521
2522 /* Can't just use EBX, EDI without save/restoring them:
2523 */
2524 x86_pop(
2525 func,
2526 get_temp_base() );
2527
2528 x86_pop(
2529 func,
2530 get_immediate_base() );
2531
2532 emit_ret( func );
2533
2534 tgsi_parse_free( &parse );
2535
2536 return ok;
2537 }
2538
2539 #endif /* PIPE_ARCH_X86 */
2540