1 /**************************************************************************
2 *
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28 #include "pipe/p_config.h"
29
30 #if defined(PIPE_ARCH_X86)
31
32 #include "pipe/p_debug.h"
33 #include "pipe/p_shader_tokens.h"
34 #include "util/u_math.h"
35 #if defined(PIPE_ARCH_SSE)
36 #include "util/u_sse.h"
37 #endif
38 #include "tgsi/tgsi_parse.h"
39 #include "tgsi/tgsi_util.h"
40 #include "tgsi_exec.h"
41 #include "tgsi_sse2.h"
42
43 #include "rtasm/rtasm_x86sse.h"
44
45 /* Use the higher-precision (Newton-Raphson refined) 1/sqrt().
46 *
47 * This costs about 100 fps (close to 10%) in gears:
48 */
49 #define HIGH_PRECISION 1
50
51 #define FAST_MATH 1
52
53
54 #define FOR_EACH_CHANNEL( CHAN )\
55 for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)
56
57 #define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
58 ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))
59
60 #define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
61 if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))
62
63 #define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
64 FOR_EACH_CHANNEL( CHAN )\
65 IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )
66
67 #define CHAN_X 0
68 #define CHAN_Y 1
69 #define CHAN_Z 2
70 #define CHAN_W 3
71
72 #define TEMP_ONE_I TGSI_EXEC_TEMP_ONE_I
73 #define TEMP_ONE_C TGSI_EXEC_TEMP_ONE_C
74
75 #define TEMP_R0 TGSI_EXEC_TEMP_R0
76 #define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
77 #define TEMP_EXEC_MASK_I TGSI_EXEC_MASK_I
78 #define TEMP_EXEC_MASK_C TGSI_EXEC_MASK_C
79
80
81 /**
82 * X86 utility functions.
83 */
84
85 static struct x86_reg
86 make_xmm(
87 unsigned xmm )
88 {
89 return x86_make_reg(
90 file_XMM,
91 (enum x86_reg_name) xmm );
92 }
93
94 /**
95 * X86 register mapping helpers.
96 */
97
98 static struct x86_reg
99 get_const_base( void )
100 {
101 return x86_make_reg(
102 file_REG32,
103 reg_CX );
104 }
105
106 static struct x86_reg
107 get_input_base( void )
108 {
109 return x86_make_reg(
110 file_REG32,
111 reg_AX );
112 }
113
114 static struct x86_reg
115 get_output_base( void )
116 {
117 return x86_make_reg(
118 file_REG32,
119 reg_DX );
120 }
121
122 static struct x86_reg
123 get_temp_base( void )
124 {
125 return x86_make_reg(
126 file_REG32,
127 reg_BX );
128 }
129
130 static struct x86_reg
131 get_coef_base( void )
132 {
133 return get_output_base();
134 }
135
136 static struct x86_reg
137 get_immediate_base( void )
138 {
139 return x86_make_reg(
140 file_REG32,
141 reg_DI );
142 }
143
144
145 /**
146 * Data access helpers.
147 */
148
149
150 static struct x86_reg
151 get_immediate(
152 unsigned vec,
153 unsigned chan )
154 {
155 return x86_make_disp(
156 get_immediate_base(),
157 (vec * 4 + chan) * 4 );
158 }
159
160 static struct x86_reg
161 get_const(
162 unsigned vec,
163 unsigned chan )
164 {
165 return x86_make_disp(
166 get_const_base(),
167 (vec * 4 + chan) * 4 );
168 }
169
170 static struct x86_reg
171 get_input(
172 unsigned vec,
173 unsigned chan )
174 {
175 return x86_make_disp(
176 get_input_base(),
177 (vec * 4 + chan) * 16 );
178 }
179
180 static struct x86_reg
181 get_output(
182 unsigned vec,
183 unsigned chan )
184 {
185 return x86_make_disp(
186 get_output_base(),
187 (vec * 4 + chan) * 16 );
188 }
189
190 static struct x86_reg
191 get_temp(
192 unsigned vec,
193 unsigned chan )
194 {
195 return x86_make_disp(
196 get_temp_base(),
197 (vec * 4 + chan) * 16 );
198 }
199
200 static struct x86_reg
201 get_coef(
202 unsigned vec,
203 unsigned chan,
204 unsigned member )
205 {
206 return x86_make_disp(
207 get_coef_base(),
208 ((vec * 3 + member) * 4 + chan) * 4 );
209 }
210
211
212 static void
213 emit_ret(
214 struct x86_function *func )
215 {
216 x86_ret( func );
217 }
218
219
220 /**
221 * Data fetch helpers.
222 */
223
224 /**
225 * Copy a shader constant to xmm register
226 * \param xmm the destination xmm register
227 * \param vec the src const buffer index
228 * \param chan src channel to fetch (X, Y, Z or W)
229 */
230 static void
231 emit_const(
232 struct x86_function *func,
233 uint xmm,
234 int vec,
235 uint chan,
236 uint indirect,
237 uint indirectFile,
238 int indirectIndex )
239 {
240 if (indirect) {
241 /* 'vec' is the offset from the address register's value.
242 * We're loading CONST[ADDR+vec] into an xmm register.
243 */
244 struct x86_reg r0 = get_input_base();
245 struct x86_reg r1 = get_output_base();
246 uint i;
247
248 assert( indirectFile == TGSI_FILE_ADDRESS );
249 assert( indirectIndex == 0 );
250
251 x86_push( func, r0 );
252 x86_push( func, r1 );
253
254 /*
255 * Loop over the four pixels or vertices in the quad.
256 * Get the value of the address (offset) register for pixel/vertex[i],
257 * add it to the src offset and index into the constant buffer.
258 * Note that we're working on SOA data.
259 * If any of the pixel/vertex execution channels are unused their
260 * values will be garbage. It's very important that we don't use
261 * those garbage values as indexes into the constant buffer since
262 * that'll cause segfaults.
263 * The solution is to bitwise-AND the offset with the execution mask
264 * register whose values are either 0 or ~0.
265 * The caller must set up the execution mask register to indicate
266 * which channels are valid/alive before running the shader.
267 * The execution mask will also figure into loops and conditionals
268 * someday.
269 */
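/* Net effect, for each pixel/vertex i:
 *    idx = addr[i] & exec_mask[i];
 *    TEMP_R0.x[i] = CONST[vec + idx].chan;
 */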
270 for (i = 0; i < QUAD_SIZE; i++) {
271 /* r1 = address register[i] */
272 x86_mov( func, r1, x86_make_disp( get_temp( TEMP_ADDR, CHAN_X ), i * 4 ) );
273 /* r0 = execution mask[i] */
274 x86_mov( func, r0, x86_make_disp( get_temp( TEMP_EXEC_MASK_I, TEMP_EXEC_MASK_C ), i * 4 ) );
275 /* r1 = r1 & r0 */
276 x86_and( func, r1, r0 );
277 /* r0 = 'vec', the offset */
278 x86_lea( func, r0, get_const( vec, chan ) );
279
280 /* Quick hack to multiply r1 by 16 -- need to add SHL to rtasm.
281 */
282 x86_add( func, r1, r1 );
283 x86_add( func, r1, r1 );
284 x86_add( func, r1, r1 );
285 x86_add( func, r1, r1 );
286
287 x86_add( func, r0, r1 ); /* r0 = r0 + r1 */
288 x86_mov( func, r1, x86_deref( r0 ) );
289 x86_mov( func, x86_make_disp( get_temp( TEMP_R0, CHAN_X ), i * 4 ), r1 );
290 }
291
292 x86_pop( func, r1 );
293 x86_pop( func, r0 );
294
295 sse_movaps(
296 func,
297 make_xmm( xmm ),
298 get_temp( TEMP_R0, CHAN_X ) );
299 }
300 else {
301 /* 'vec' is the index into the src register file, such as TEMP[vec] */
302 assert( vec >= 0 );
303
304 sse_movss(
305 func,
306 make_xmm( xmm ),
307 get_const( vec, chan ) );
308 sse_shufps(
309 func,
310 make_xmm( xmm ),
311 make_xmm( xmm ),
312 SHUF( 0, 0, 0, 0 ) );
313 }
314 }
315
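/**
 * Copy a shader immediate to xmm register
 * \param xmm the destination xmm register
 * \param vec the src immediate index
 * \param chan src channel to fetch (X, Y, Z or W)
 */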
316 static void
317 emit_immediate(
318 struct x86_function *func,
319 unsigned xmm,
320 unsigned vec,
321 unsigned chan )
322 {
323 sse_movss(
324 func,
325 make_xmm( xmm ),
326 get_immediate( vec, chan ) );
327 sse_shufps(
328 func,
329 make_xmm( xmm ),
330 make_xmm( xmm ),
331 SHUF( 0, 0, 0, 0 ) );
332 }
333
334
335 /**
336 * Copy a shader input to xmm register
337 * \param xmm the destination xmm register
338 * \param vec the src input attrib
339 * \param chan src channel to fetch (X, Y, Z or W)
340 */
341 static void
342 emit_inputf(
343 struct x86_function *func,
344 unsigned xmm,
345 unsigned vec,
346 unsigned chan )
347 {
348 sse_movups(
349 func,
350 make_xmm( xmm ),
351 get_input( vec, chan ) );
352 }
353
354 /**
355 * Store an xmm register to a shader output
356 * \param xmm the source xmm register
357 * \param vec the dest output attrib
358 * \param chan dest channel to store (X, Y, Z or W)
359 */
360 static void
361 emit_output(
362 struct x86_function *func,
363 unsigned xmm,
364 unsigned vec,
365 unsigned chan )
366 {
367 sse_movups(
368 func,
369 get_output( vec, chan ),
370 make_xmm( xmm ) );
371 }
372
373 /**
374 * Copy a shader temporary to xmm register
375 * \param xmm the destination xmm register
376 * \param vec the src temp register
377 * \param chan src channel to fetch (X, Y, Z or W)
378 */
379 static void
380 emit_tempf(
381 struct x86_function *func,
382 unsigned xmm,
383 unsigned vec,
384 unsigned chan )
385 {
386 sse_movaps(
387 func,
388 make_xmm( xmm ),
389 get_temp( vec, chan ) );
390 }
391
392 /**
393 * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
394 * \param xmm the destination xmm register
395 * \param vec the src input/attribute coefficient index
396 * \param chan src channel to fetch (X, Y, Z or W)
397 * \param member 0=a0, 1=dadx, 2=dady
398 */
399 static void
400 emit_coef(
401 struct x86_function *func,
402 unsigned xmm,
403 unsigned vec,
404 unsigned chan,
405 unsigned member )
406 {
407 sse_movss(
408 func,
409 make_xmm( xmm ),
410 get_coef( vec, chan, member ) );
411 sse_shufps(
412 func,
413 make_xmm( xmm ),
414 make_xmm( xmm ),
415 SHUF( 0, 0, 0, 0 ) );
416 }
417
418 /**
419 * Data store helpers.
420 */
421
422 static void
423 emit_inputs(
424 struct x86_function *func,
425 unsigned xmm,
426 unsigned vec,
427 unsigned chan )
428 {
429 sse_movups(
430 func,
431 get_input( vec, chan ),
432 make_xmm( xmm ) );
433 }
434
435 static void
436 emit_temps(
437 struct x86_function *func,
438 unsigned xmm,
439 unsigned vec,
440 unsigned chan )
441 {
442 sse_movaps(
443 func,
444 get_temp( vec, chan ),
445 make_xmm( xmm ) );
446 }
447
448 static void
449 emit_addrs(
450 struct x86_function *func,
451 unsigned xmm,
452 unsigned vec,
453 unsigned chan )
454 {
455 assert( vec == 0 );
456
457 emit_temps(
458 func,
459 xmm,
460 vec + TGSI_EXEC_TEMP_ADDR,
461 chan );
462 }
463
464 /**
465 * Coefficient fetch helpers.
466 */
467
468 static void
469 emit_coef_a0(
470 struct x86_function *func,
471 unsigned xmm,
472 unsigned vec,
473 unsigned chan )
474 {
475 emit_coef(
476 func,
477 xmm,
478 vec,
479 chan,
480 0 );
481 }
482
483 static void
484 emit_coef_dadx(
485 struct x86_function *func,
486 unsigned xmm,
487 unsigned vec,
488 unsigned chan )
489 {
490 emit_coef(
491 func,
492 xmm,
493 vec,
494 chan,
495 1 );
496 }
497
498 static void
499 emit_coef_dady(
500 struct x86_function *func,
501 unsigned xmm,
502 unsigned vec,
503 unsigned chan )
504 {
505 emit_coef(
506 func,
507 xmm,
508 vec,
509 chan,
510 2 );
511 }
512
513 /**
514 * Function call helpers.
515 */
516
517 /**
518 * NOTE: With gcc, if the called function uses SSE intrinsics, it must be
519 * defined with __attribute__((force_align_arg_pointer)), as we do not
520 * guarantee that the stack pointer is 16-byte aligned, as the callee expects.
521 */
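/*
 * Call a C helper that operates on a quad of floats, e.g. cos4f().
 * The destination xmm register is spilled to TEMP_R0 and its address is
 * passed as the single argument.  'xmm_save' is the number of live xmm
 * registers (xmm0..xmm_save-1); all of them except xmm_dst are preserved
 * on the stack across the call, along with EAX, ECX and EDX.
 */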
522 static void
523 emit_func_call_dst(
524 struct x86_function *func,
525 unsigned xmm_save,
526 unsigned xmm_dst,
527 void (PIPE_CDECL *code)() )
528 {
529 struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
530 unsigned i, n;
531 unsigned xmm_mask;
532
533 /* Bitmask of the xmm registers to save */
534 xmm_mask = (1 << xmm_save) - 1;
535 xmm_mask &= ~(1 << xmm_dst);
536
537 sse_movaps(
538 func,
539 get_temp( TEMP_R0, 0 ),
540 make_xmm( xmm_dst ) );
541
542 x86_push(
543 func,
544 x86_make_reg( file_REG32, reg_AX) );
545 x86_push(
546 func,
547 x86_make_reg( file_REG32, reg_CX) );
548 x86_push(
549 func,
550 x86_make_reg( file_REG32, reg_DX) );
551
552 for(i = 0, n = 0; i < 8; ++i)
553 if(xmm_mask & (1 << i))
554 ++n;
555
556 x86_sub_imm(
557 func,
558 x86_make_reg( file_REG32, reg_SP ),
559 n*16);
560
561 for(i = 0, n = 0; i < 8; ++i)
562 if(xmm_mask & (1 << i)) {
563 sse_movups(
564 func,
565 x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ),
566 make_xmm( i ) );
567 ++n;
568 }
569
570 x86_lea(
571 func,
572 ecx,
573 get_temp( TEMP_R0, 0 ) );
574
575 x86_push( func, ecx );
576 x86_mov_reg_imm( func, ecx, (unsigned long) code );
577 x86_call( func, ecx );
578 x86_pop(func, ecx );
579
580 for(i = 0, n = 0; i < 8; ++i)
581 if(xmm_mask & (1 << i)) {
582 sse_movups(
583 func,
584 make_xmm( i ),
585 x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ) );
586 ++n;
587 }
588
589 x86_add_imm(
590 func,
591 x86_make_reg( file_REG32, reg_SP ),
592 n*16);
593
594 /* Restore the GP registers in reverse order.
595 */
596 x86_pop(
597 func,
598 x86_make_reg( file_REG32, reg_DX) );
599 x86_pop(
600 func,
601 x86_make_reg( file_REG32, reg_CX) );
602 x86_pop(
603 func,
604 x86_make_reg( file_REG32, reg_AX) );
605
606 sse_movaps(
607 func,
608 make_xmm( xmm_dst ),
609 get_temp( TEMP_R0, 0 ) );
610 }
611
612 static void
613 emit_func_call_dst_src(
614 struct x86_function *func,
615 unsigned xmm_save,
616 unsigned xmm_dst,
617 unsigned xmm_src,
618 void (PIPE_CDECL *code)() )
619 {
620 sse_movaps(
621 func,
622 get_temp( TEMP_R0, 1 ),
623 make_xmm( xmm_src ) );
624
625 emit_func_call_dst(
626 func,
627 xmm_save,
628 xmm_dst,
629 code );
630 }
631
632
633 #if defined(PIPE_ARCH_SSE)
634
635 /*
636 * Fast SSE2 implementation of special math functions.
637 */
638
639 #define POLY0(x, c0) _mm_set1_ps(c0)
640 #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
641 #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
642 #define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
643 #define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
644 #define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
645
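/* POLYn(x, c0, ..., cn) evaluates the degree-n polynomial
 * c0 + c1*x + ... + cn*x^n over all four floats of an SSE register
 * using Horner's rule, i.e. n mulps/addps pairs.
 */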
646 #define EXP_POLY_DEGREE 3
647 #define LOG_POLY_DEGREE 5
648
649 /**
650 * See http://www.devmaster.net/forums/showthread.php?p=43580
651 */
652 static INLINE __m128
653 exp2f4(__m128 x)
654 {
655 __m128i ipart;
656 __m128 fpart, expipart, expfpart;
657
658 x = _mm_min_ps(x, _mm_set1_ps( 129.00000f));
659 x = _mm_max_ps(x, _mm_set1_ps(-126.99999f));
660
661 /* ipart = int(x - 0.5) */
662 ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f)));
663
664 /* fpart = x - ipart */
665 fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart));
666
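/* (ipart + 127) << 23 places the biased exponent in bits 30:23 of an
 * IEEE float, i.e. it constructs 2^ipart directly. */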
667 /* expipart = (float) (1 << ipart) */
668 expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23));
669
670 /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
671 #if EXP_POLY_DEGREE == 5
672 expfpart = POLY5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f);
673 #elif EXP_POLY_DEGREE == 4
674 expfpart = POLY4(fpart, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 5.2011464e-2f, 1.3534167e-2f);
675 #elif EXP_POLY_DEGREE == 3
676 expfpart = POLY3(fpart, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 7.8024521e-2f);
677 #elif EXP_POLY_DEGREE == 2
678 expfpart = POLY2(fpart, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f);
679 #else
680 #error
681 #endif
682
683 return _mm_mul_ps(expipart, expfpart);
684 }
685
686
687 /**
688 * See http://www.devmaster.net/forums/showthread.php?p=43580
689 */
690 static INLINE __m128
691 log2f4(__m128 x)
692 {
693 __m128i expmask = _mm_set1_epi32(0x7f800000);
694 __m128i mantmask = _mm_set1_epi32(0x007fffff);
695 __m128 one = _mm_set1_ps(1.0f);
696
697 __m128i i = _mm_castps_si128(x);
698
699 /* exp = (float) exponent(x) */
700 __m128 exp = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, expmask), 23), _mm_set1_epi32(127)));
701
702 /* mant = (float) mantissa(x) */
703 __m128 mant = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, mantmask)), one);
704
705 __m128 logmant;
706
707 /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
708 * These coefficients can be generated with
709 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
710 */
711 #if LOG_POLY_DEGREE == 6
712 logmant = POLY5(mant, 3.11578814719469302614f, -3.32419399085241980044f, 2.59883907202499966007f, -1.23152682416275988241f, 0.318212422185251071475f, -0.0344359067839062357313f);
713 #elif LOG_POLY_DEGREE == 5
714 logmant = POLY4(mant, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
715 #elif LOG_POLY_DEGREE == 4
716 logmant = POLY3(mant, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
717 #elif LOG_POLY_DEGREE == 3
718 logmant = POLY2(mant, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
719 #else
720 #error
721 #endif
722
723 /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0 */
724 logmant = _mm_mul_ps(logmant, _mm_sub_ps(mant, one));
725
726 return _mm_add_ps(logmant, exp);
727 }
728
729
730 static INLINE __m128
731 powf4(__m128 x, __m128 y)
732 {
733 return exp2f4(_mm_mul_ps(log2f4(x), y));
734 }
735
736 #endif /* PIPE_ARCH_SSE */
737
738
739
740 /**
741 * Low-level instruction translators.
742 */
743
744 static void
745 emit_abs(
746 struct x86_function *func,
747 unsigned xmm )
748 {
749 sse_andps(
750 func,
751 make_xmm( xmm ),
752 get_temp(
753 TGSI_EXEC_TEMP_7FFFFFFF_I,
754 TGSI_EXEC_TEMP_7FFFFFFF_C ) );
755 }
756
757 static void
758 emit_add(
759 struct x86_function *func,
760 unsigned xmm_dst,
761 unsigned xmm_src )
762 {
763 sse_addps(
764 func,
765 make_xmm( xmm_dst ),
766 make_xmm( xmm_src ) );
767 }
768
769 static void PIPE_CDECL
770 cos4f(
771 float *store )
772 {
773 store[0] = cosf( store[0] );
774 store[1] = cosf( store[1] );
775 store[2] = cosf( store[2] );
776 store[3] = cosf( store[3] );
777 }
778
779 static void
780 emit_cos(
781 struct x86_function *func,
782 unsigned xmm_save,
783 unsigned xmm_dst )
784 {
785 emit_func_call_dst(
786 func,
787 xmm_save,
788 xmm_dst,
789 cos4f );
790 }
791
792 static void PIPE_CDECL
793 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
794 __attribute__((force_align_arg_pointer))
795 #endif
796 ex24f(
797 float *store )
798 {
799 #if defined(PIPE_ARCH_SSE)
800 _mm_store_ps(&store[0], exp2f4( _mm_load_ps(&store[0]) ));
801 #else
802 store[0] = util_fast_exp2( store[0] );
803 store[1] = util_fast_exp2( store[1] );
804 store[2] = util_fast_exp2( store[2] );
805 store[3] = util_fast_exp2( store[3] );
806 #endif
807 }
808
809 static void
810 emit_ex2(
811 struct x86_function *func,
812 unsigned xmm_save,
813 unsigned xmm_dst )
814 {
815 emit_func_call_dst(
816 func,
817 xmm_save,
818 xmm_dst,
819 ex24f );
820 }
821
822 static void
823 emit_f2it(
824 struct x86_function *func,
825 unsigned xmm )
826 {
827 sse2_cvttps2dq(
828 func,
829 make_xmm( xmm ),
830 make_xmm( xmm ) );
831 }
832
833 static void
834 emit_i2f(
835 struct x86_function *func,
836 unsigned xmm )
837 {
838 sse2_cvtdq2ps(
839 func,
840 make_xmm( xmm ),
841 make_xmm( xmm ) );
842 }
843
844 static void PIPE_CDECL
845 flr4f(
846 float *store )
847 {
848 store[0] = floorf( store[0] );
849 store[1] = floorf( store[1] );
850 store[2] = floorf( store[2] );
851 store[3] = floorf( store[3] );
852 }
853
854 static void
855 emit_flr(
856 struct x86_function *func,
857 unsigned xmm_save,
858 unsigned xmm_dst )
859 {
860 emit_func_call_dst(
861 func,
862 xmm_save,
863 xmm_dst,
864 flr4f );
865 }
866
867 static void PIPE_CDECL
868 frc4f(
869 float *store )
870 {
871 store[0] -= floorf( store[0] );
872 store[1] -= floorf( store[1] );
873 store[2] -= floorf( store[2] );
874 store[3] -= floorf( store[3] );
875 }
876
877 static void
878 emit_frc(
879 struct x86_function *func,
880 unsigned xmm_save,
881 unsigned xmm_dst )
882 {
883 emit_func_call_dst(
884 func,
885 xmm_save,
886 xmm_dst,
887 frc4f );
888 }
889
890 static void PIPE_CDECL
891 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
892 __attribute__((force_align_arg_pointer))
893 #endif
894 lg24f(
895 float *store )
896 {
897 #if defined(PIPE_ARCH_SSE)
898 _mm_store_ps(&store[0], log2f4( _mm_load_ps(&store[0]) ));
899 #else
900 store[0] = util_fast_log2( store[0] );
901 store[1] = util_fast_log2( store[1] );
902 store[2] = util_fast_log2( store[2] );
903 store[3] = util_fast_log2( store[3] );
904 #endif
905 }
906
907 static void
908 emit_lg2(
909 struct x86_function *func,
910 unsigned xmm_save,
911 unsigned xmm_dst )
912 {
913 emit_func_call_dst(
914 func,
915 xmm_save,
916 xmm_dst,
917 lg24f );
918 }
919
920 static void
921 emit_MOV(
922 struct x86_function *func,
923 unsigned xmm_dst,
924 unsigned xmm_src )
925 {
926 sse_movups(
927 func,
928 make_xmm( xmm_dst ),
929 make_xmm( xmm_src ) );
930 }
931
932 static void
933 emit_mul (struct x86_function *func,
934 unsigned xmm_dst,
935 unsigned xmm_src)
936 {
937 sse_mulps(
938 func,
939 make_xmm( xmm_dst ),
940 make_xmm( xmm_src ) );
941 }
942
943 static void
944 emit_neg(
945 struct x86_function *func,
946 unsigned xmm )
947 {
948 sse_xorps(
949 func,
950 make_xmm( xmm ),
951 get_temp(
952 TGSI_EXEC_TEMP_80000000_I,
953 TGSI_EXEC_TEMP_80000000_C ) );
954 }
955
956 static void PIPE_CDECL
957 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
958 __attribute__((force_align_arg_pointer))
959 #endif
960 pow4f(
961 float *store )
962 {
963 #if defined(PIPE_ARCH_SSE)
964 _mm_store_ps(&store[0], powf4( _mm_load_ps(&store[0]), _mm_load_ps(&store[4]) ));
965 #else
966 store[0] = util_fast_pow( store[0], store[4] );
967 store[1] = util_fast_pow( store[1], store[5] );
968 store[2] = util_fast_pow( store[2], store[6] );
969 store[3] = util_fast_pow( store[3], store[7] );
970 #endif
971 }
972
973 static void
974 emit_pow(
975 struct x86_function *func,
976 unsigned xmm_save,
977 unsigned xmm_dst,
978 unsigned xmm_src )
979 {
980 emit_func_call_dst_src(
981 func,
982 xmm_save,
983 xmm_dst,
984 xmm_src,
985 pow4f );
986 }
987
988 static void
989 emit_rcp (
990 struct x86_function *func,
991 unsigned xmm_dst,
992 unsigned xmm_src )
993 {
994 /* On Intel CPUs at least, this is only accurate to 12 bits -- not
995 * good enough. Need to either emit a proper divide or use the
996 * iterative technique described below in emit_rsqrt().
997 */
998 sse2_rcpps(
999 func,
1000 make_xmm( xmm_dst ),
1001 make_xmm( xmm_src ) );
1002 }
1003
1004 static void
1005 emit_rsqrt(
1006 struct x86_function *func,
1007 unsigned xmm_dst,
1008 unsigned xmm_src )
1009 {
1010 #if HIGH_PRECISION
1011 /* Although rsqrtps() and rcpps() are low precision on some/all SSE
1012 * implementations, their precision can be improved at fairly low
1013 * cost using a Newton-Raphson step, as below:
1014 *
1015 * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
1016 * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
1017 *
1018 * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
1019 */
1020 {
1021 struct x86_reg dst = make_xmm( xmm_dst );
1022 struct x86_reg src = make_xmm( xmm_src );
1023 struct x86_reg tmp0 = make_xmm( 2 );
1024 struct x86_reg tmp1 = make_xmm( 3 );
1025
1026 assert( xmm_dst != xmm_src );
1027 assert( xmm_dst != 2 && xmm_dst != 3 );
1028 assert( xmm_src != 2 && xmm_src != 3 );
1029
1030 sse_movaps( func, dst, get_temp( TGSI_EXEC_TEMP_HALF_I, TGSI_EXEC_TEMP_HALF_C ) );    /* dst = 0.5 */
1031 sse_movaps( func, tmp0, get_temp( TGSI_EXEC_TEMP_THREE_I, TGSI_EXEC_TEMP_THREE_C ) ); /* tmp0 = 3.0 */
1032 sse_rsqrtps( func, tmp1, src );  /* tmp1 = rsqrtps(a) */
1033 sse_mulps( func, src, tmp1 );    /* src = a * tmp1 */
1034 sse_mulps( func, dst, tmp1 );    /* dst = 0.5 * tmp1 */
1035 sse_mulps( func, src, tmp1 );    /* src = a * tmp1 * tmp1 */
1036 sse_subps( func, tmp0, src );    /* tmp0 = 3.0 - a * tmp1 * tmp1 */
1037 sse_mulps( func, dst, tmp0 );    /* dst = 0.5 * tmp1 * (3.0 - a * tmp1 * tmp1) */
1038 }
1039 #else
1040 /* On Intel CPUs at least, this is only accurate to 12 bits -- not
1041 * good enough.
1042 */
1043 sse_rsqrtps(
1044 func,
1045 make_xmm( xmm_dst ),
1046 make_xmm( xmm_src ) );
1047 #endif
1048 }
1049
1050 static void
1051 emit_setsign(
1052 struct x86_function *func,
1053 unsigned xmm )
1054 {
1055 sse_orps(
1056 func,
1057 make_xmm( xmm ),
1058 get_temp(
1059 TGSI_EXEC_TEMP_80000000_I,
1060 TGSI_EXEC_TEMP_80000000_C ) );
1061 }
1062
1063 static void PIPE_CDECL
1064 sin4f(
1065 float *store )
1066 {
1067 store[0] = sinf( store[0] );
1068 store[1] = sinf( store[1] );
1069 store[2] = sinf( store[2] );
1070 store[3] = sinf( store[3] );
1071 }
1072
1073 static void
1074 emit_sin (struct x86_function *func,
1075 unsigned xmm_save,
1076 unsigned xmm_dst)
1077 {
1078 emit_func_call_dst(
1079 func,
1080 xmm_save,
1081 xmm_dst,
1082 sin4f );
1083 }
1084
1085 static void
1086 emit_sub(
1087 struct x86_function *func,
1088 unsigned xmm_dst,
1089 unsigned xmm_src )
1090 {
1091 sse_subps(
1092 func,
1093 make_xmm( xmm_dst ),
1094 make_xmm( xmm_src ) );
1095 }
1096
1097 /**
1098 * Register fetch.
1099 */
1100
1101 static void
1102 emit_fetch(
1103 struct x86_function *func,
1104 unsigned xmm,
1105 const struct tgsi_full_src_register *reg,
1106 const unsigned chan_index )
1107 {
1108 unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index );
1109
1110 switch (swizzle) {
1111 case TGSI_EXTSWIZZLE_X:
1112 case TGSI_EXTSWIZZLE_Y:
1113 case TGSI_EXTSWIZZLE_Z:
1114 case TGSI_EXTSWIZZLE_W:
1115 switch (reg->SrcRegister.File) {
1116 case TGSI_FILE_CONSTANT:
1117 emit_const(
1118 func,
1119 xmm,
1120 reg->SrcRegister.Index,
1121 swizzle,
1122 reg->SrcRegister.Indirect,
1123 reg->SrcRegisterInd.File,
1124 reg->SrcRegisterInd.Index );
1125 break;
1126
1127 case TGSI_FILE_IMMEDIATE:
1128 emit_immediate(
1129 func,
1130 xmm,
1131 reg->SrcRegister.Index,
1132 swizzle );
1133 break;
1134
1135 case TGSI_FILE_INPUT:
1136 emit_inputf(
1137 func,
1138 xmm,
1139 reg->SrcRegister.Index,
1140 swizzle );
1141 break;
1142
1143 case TGSI_FILE_TEMPORARY:
1144 emit_tempf(
1145 func,
1146 xmm,
1147 reg->SrcRegister.Index,
1148 swizzle );
1149 break;
1150
1151 default:
1152 assert( 0 );
1153 }
1154 break;
1155
1156 case TGSI_EXTSWIZZLE_ZERO:
1157 emit_tempf(
1158 func,
1159 xmm,
1160 TGSI_EXEC_TEMP_00000000_I,
1161 TGSI_EXEC_TEMP_00000000_C );
1162 break;
1163
1164 case TGSI_EXTSWIZZLE_ONE:
1165 emit_tempf(
1166 func,
1167 xmm,
1168 TEMP_ONE_I,
1169 TEMP_ONE_C );
1170 break;
1171
1172 default:
1173 assert( 0 );
1174 }
1175
1176 switch( tgsi_util_get_full_src_register_sign_mode( reg, chan_index ) ) {
1177 case TGSI_UTIL_SIGN_CLEAR:
1178 emit_abs( func, xmm );
1179 break;
1180
1181 case TGSI_UTIL_SIGN_SET:
1182 emit_setsign( func, xmm );
1183 break;
1184
1185 case TGSI_UTIL_SIGN_TOGGLE:
1186 emit_neg( func, xmm );
1187 break;
1188
1189 case TGSI_UTIL_SIGN_KEEP:
1190 break;
1191 }
1192 }
1193
1194 #define FETCH( FUNC, INST, XMM, INDEX, CHAN )\
1195 emit_fetch( FUNC, XMM, &(INST).FullSrcRegisters[INDEX], CHAN )
1196
1197 /**
1198 * Register store.
1199 */
1200
1201 static void
1202 emit_store(
1203 struct x86_function *func,
1204 unsigned xmm,
1205 const struct tgsi_full_dst_register *reg,
1206 const struct tgsi_full_instruction *inst,
1207 unsigned chan_index )
1208 {
1209 switch( reg->DstRegister.File ) {
1210 case TGSI_FILE_OUTPUT:
1211 emit_output(
1212 func,
1213 xmm,
1214 reg->DstRegister.Index,
1215 chan_index );
1216 break;
1217
1218 case TGSI_FILE_TEMPORARY:
1219 emit_temps(
1220 func,
1221 xmm,
1222 reg->DstRegister.Index,
1223 chan_index );
1224 break;
1225
1226 case TGSI_FILE_ADDRESS:
1227 emit_addrs(
1228 func,
1229 xmm,
1230 reg->DstRegister.Index,
1231 chan_index );
1232 break;
1233
1234 default:
1235 assert( 0 );
1236 }
1237
1238 switch( inst->Instruction.Saturate ) {
1239 case TGSI_SAT_NONE:
1240 break;
1241
1242 case TGSI_SAT_ZERO_ONE:
1243 /* XXX: TGSI_SAT_ZERO_ONE saturation not implemented */
1244 break;
1245
1246 case TGSI_SAT_MINUS_PLUS_ONE:
1247 assert( 0 );
1248 break;
1249 }
1250 }
1251
1252 #define STORE( FUNC, INST, XMM, INDEX, CHAN )\
1253 emit_store( FUNC, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN )
1254
1255 /**
1256 * High-level instruction translators.
1257 */
1258
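/**
 * Emit code for TGSI_OPCODE_KIL: test each unique source component
 * against zero and OR the resulting mask bits into the kill mask.
 */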
1259 static void
1260 emit_kil(
1261 struct x86_function *func,
1262 const struct tgsi_full_src_register *reg )
1263 {
1264 unsigned uniquemask;
1265 unsigned registers[4];
1266 unsigned nextregister = 0;
1267 unsigned firstchan = ~0;
1268 unsigned chan_index;
1269
1270 /* This mask stores component bits that have already been tested. Note that
1271 * we test whether the value is less than zero, so the constant ZERO and ONE
1272 * swizzles need not be tested. */
1273 uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE);
1274
1275 FOR_EACH_CHANNEL( chan_index ) {
1276 unsigned swizzle;
1277
1278 /* unswizzle channel */
1279 swizzle = tgsi_util_get_full_src_register_extswizzle(
1280 reg,
1281 chan_index );
1282
1283 /* check if the component has not been already tested */
1284 if( !(uniquemask & (1 << swizzle)) ) {
1285 uniquemask |= 1 << swizzle;
1286
1287 /* allocate register */
1288 registers[chan_index] = nextregister;
1289 emit_fetch(
1290 func,
1291 nextregister,
1292 reg,
1293 chan_index );
1294 nextregister++;
1295
1296 /* mark the first channel used */
1297 if( firstchan == ~0 ) {
1298 firstchan = chan_index;
1299 }
1300 }
1301 }
1302
1303 x86_push(
1304 func,
1305 x86_make_reg( file_REG32, reg_AX ) );
1306 x86_push(
1307 func,
1308 x86_make_reg( file_REG32, reg_DX ) );
1309
1310 FOR_EACH_CHANNEL( chan_index ) {
1311 if( uniquemask & (1 << chan_index) ) {
1312 sse_cmpps(
1313 func,
1314 make_xmm( registers[chan_index] ),
1315 get_temp(
1316 TGSI_EXEC_TEMP_00000000_I,
1317 TGSI_EXEC_TEMP_00000000_C ),
1318 cc_LessThan );
1319
1320 if( chan_index == firstchan ) {
1321 sse_pmovmskb(
1322 func,
1323 x86_make_reg( file_REG32, reg_AX ),
1324 make_xmm( registers[chan_index] ) );
1325 }
1326 else {
1327 sse_pmovmskb(
1328 func,
1329 x86_make_reg( file_REG32, reg_DX ),
1330 make_xmm( registers[chan_index] ) );
1331 x86_or(
1332 func,
1333 x86_make_reg( file_REG32, reg_AX ),
1334 x86_make_reg( file_REG32, reg_DX ) );
1335 }
1336 }
1337 }
1338
1339 x86_or(
1340 func,
1341 get_temp(
1342 TGSI_EXEC_TEMP_KILMASK_I,
1343 TGSI_EXEC_TEMP_KILMASK_C ),
1344 x86_make_reg( file_REG32, reg_AX ) );
1345
1346 x86_pop(
1347 func,
1348 x86_make_reg( file_REG32, reg_DX ) );
1349 x86_pop(
1350 func,
1351 x86_make_reg( file_REG32, reg_AX ) );
1352 }
1353
1354
1355 static void
1356 emit_kilp(
1357 struct x86_function *func )
1358 {
1359 /* XXX todo / fix me */
1360 }
1361
1362
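/**
 * Emit a per-channel comparison: dst = (src0 cc src1) ? 1.0 : 0.0,
 * implemented as cmpps followed by andps against the constant 1.0.
 */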
1363 static void
1364 emit_setcc(
1365 struct x86_function *func,
1366 struct tgsi_full_instruction *inst,
1367 enum sse_cc cc )
1368 {
1369 unsigned chan_index;
1370
1371 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1372 FETCH( func, *inst, 0, 0, chan_index );
1373 FETCH( func, *inst, 1, 1, chan_index );
1374 sse_cmpps(
1375 func,
1376 make_xmm( 0 ),
1377 make_xmm( 1 ),
1378 cc );
1379 sse_andps(
1380 func,
1381 make_xmm( 0 ),
1382 get_temp(
1383 TEMP_ONE_I,
1384 TEMP_ONE_C ) );
1385 STORE( func, *inst, 0, 0, chan_index );
1386 }
1387 }
1388
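/**
 * Emit TGSI_OPCODE_CMP: dst = (src0 < 0.0) ? src1 : src2, built from a
 * cmpps mask and the andps/andnps/orps select sequence.
 */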
1389 static void
1390 emit_cmp(
1391 struct x86_function *func,
1392 struct tgsi_full_instruction *inst )
1393 {
1394 unsigned chan_index;
1395
1396 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1397 FETCH( func, *inst, 0, 0, chan_index );
1398 FETCH( func, *inst, 1, 1, chan_index );
1399 FETCH( func, *inst, 2, 2, chan_index );
1400 sse_cmpps(
1401 func,
1402 make_xmm( 0 ),
1403 get_temp(
1404 TGSI_EXEC_TEMP_00000000_I,
1405 TGSI_EXEC_TEMP_00000000_C ),
1406 cc_LessThan );
1407 sse_andps(
1408 func,
1409 make_xmm( 1 ),
1410 make_xmm( 0 ) );
1411 sse_andnps(
1412 func,
1413 make_xmm( 0 ),
1414 make_xmm( 2 ) );
1415 sse_orps(
1416 func,
1417 make_xmm( 0 ),
1418 make_xmm( 1 ) );
1419 STORE( func, *inst, 0, 0, chan_index );
1420 }
1421 }
1422
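/**
 * Translate one TGSI instruction to x86/SSE code.
 * \return 1 on success, 0 if the opcode is not (yet) supported.
 */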
1423 static int
1424 emit_instruction(
1425 struct x86_function *func,
1426 struct tgsi_full_instruction *inst )
1427 {
1428 unsigned chan_index;
1429
1430 switch (inst->Instruction.Opcode) {
1431 case TGSI_OPCODE_ARL:
1432 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1433 FETCH( func, *inst, 0, 0, chan_index );
1434 emit_f2it( func, 0 );
1435 STORE( func, *inst, 0, 0, chan_index );
1436 }
1437 break;
1438
1439 case TGSI_OPCODE_MOV:
1440 case TGSI_OPCODE_SWZ:
1441 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1442 FETCH( func, *inst, 0, 0, chan_index );
1443 STORE( func, *inst, 0, 0, chan_index );
1444 }
1445 break;
1446
1447 case TGSI_OPCODE_LIT:
1448 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1449 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1450 emit_tempf(
1451 func,
1452 0,
1453 TEMP_ONE_I,
1454 TEMP_ONE_C);
1455 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
1456 STORE( func, *inst, 0, 0, CHAN_X );
1457 }
1458 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1459 STORE( func, *inst, 0, 0, CHAN_W );
1460 }
1461 }
1462 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1463 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1464 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
1465 FETCH( func, *inst, 0, 0, CHAN_X );
1466 sse_maxps(
1467 func,
1468 make_xmm( 0 ),
1469 get_temp(
1470 TGSI_EXEC_TEMP_00000000_I,
1471 TGSI_EXEC_TEMP_00000000_C ) );
1472 STORE( func, *inst, 0, 0, CHAN_Y );
1473 }
1474 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1475 /* XMM[1] = SrcReg[0].yyyy */
1476 FETCH( func, *inst, 1, 0, CHAN_Y );
1477 /* XMM[1] = max(XMM[1], 0) */
1478 sse_maxps(
1479 func,
1480 make_xmm( 1 ),
1481 get_temp(
1482 TGSI_EXEC_TEMP_00000000_I,
1483 TGSI_EXEC_TEMP_00000000_C ) );
1484 /* XMM[2] = SrcReg[0].wwww */
1485 FETCH( func, *inst, 2, 0, CHAN_W );
1486 /* XMM[2] = min(XMM[2], 128.0) */
1487 sse_minps(
1488 func,
1489 make_xmm( 2 ),
1490 get_temp(
1491 TGSI_EXEC_TEMP_128_I,
1492 TGSI_EXEC_TEMP_128_C ) );
1493 /* XMM[2] = max(XMM[2], -128.0) */
1494 sse_maxps(
1495 func,
1496 make_xmm( 2 ),
1497 get_temp(
1498 TGSI_EXEC_TEMP_MINUS_128_I,
1499 TGSI_EXEC_TEMP_MINUS_128_C ) );
1500 emit_pow( func, 3, 1, 2 );
1501 FETCH( func, *inst, 0, 0, CHAN_X );
1502 sse_xorps(
1503 func,
1504 make_xmm( 2 ),
1505 make_xmm( 2 ) );
1506 sse_cmpps(
1507 func,
1508 make_xmm( 2 ),
1509 make_xmm( 0 ),
1510 cc_LessThanEqual );
1511 sse_andps(
1512 func,
1513 make_xmm( 2 ),
1514 make_xmm( 1 ) );
1515 STORE( func, *inst, 2, 0, CHAN_Z );
1516 }
1517 }
1518 break;
1519
1520 case TGSI_OPCODE_RCP:
1521 /* TGSI_OPCODE_RECIP */
1522 FETCH( func, *inst, 0, 0, CHAN_X );
1523 emit_rcp( func, 0, 0 );
1524 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1525 STORE( func, *inst, 0, 0, chan_index );
1526 }
1527 break;
1528
1529 case TGSI_OPCODE_RSQ:
1530 /* TGSI_OPCODE_RECIPSQRT */
1531 FETCH( func, *inst, 0, 0, CHAN_X );
1532 emit_rsqrt( func, 1, 0 );
1533 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1534 STORE( func, *inst, 1, 0, chan_index );
1535 }
1536 break;
1537
1538 case TGSI_OPCODE_EXP:
1539 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1540 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1541 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1542 FETCH( func, *inst, 0, 0, CHAN_X );
1543 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1544 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1545 emit_MOV( func, 1, 0 );
1546 emit_flr( func, 2, 1 );
1547 /* dst.x = ex2(floor(src.x)) */
1548 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
1549 emit_MOV( func, 2, 1 );
1550 emit_ex2( func, 3, 2 );
1551 STORE( func, *inst, 2, 0, CHAN_X );
1552 }
1553 /* dst.y = src.x - floor(src.x) */
1554 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1555 emit_MOV( func, 2, 0 );
1556 emit_sub( func, 2, 1 );
1557 STORE( func, *inst, 2, 0, CHAN_Y );
1558 }
1559 }
1560 /* dst.z = ex2(src.x) */
1561 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1562 emit_ex2( func, 3, 0 );
1563 STORE( func, *inst, 0, 0, CHAN_Z );
1564 }
1565 }
1566 /* dst.w = 1.0 */
1567 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
1568 emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
1569 STORE( func, *inst, 0, 0, CHAN_W );
1570 }
1571 break;
1572
1573 case TGSI_OPCODE_LOG:
1574 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1575 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1576 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1577 FETCH( func, *inst, 0, 0, CHAN_X );
1578 emit_abs( func, 0 );
1579 emit_MOV( func, 1, 0 );
1580 emit_lg2( func, 2, 1 );
1581 /* dst.z = lg2(abs(src.x)) */
1582 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1583 STORE( func, *inst, 1, 0, CHAN_Z );
1584 }
1585 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1586 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1587 emit_flr( func, 2, 1 );
1588 /* dst.x = floor(lg2(abs(src.x))) */
1589 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
1590 STORE( func, *inst, 1, 0, CHAN_X );
1591 }
1592 /* dst.y = abs(src.x)/ex2(floor(lg2(abs(src.x)))) */
1593 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1594 emit_ex2( func, 2, 1 );
1595 emit_rcp( func, 1, 1 );
1596 emit_mul( func, 0, 1 );
1597 STORE( func, *inst, 0, 0, CHAN_Y );
1598 }
1599 }
1600 }
1601 /* dst.w = 1.0 */
1602 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
1603 emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
1604 STORE( func, *inst, 0, 0, CHAN_W );
1605 }
1606 break;
1607
1608 case TGSI_OPCODE_MUL:
1609 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1610 FETCH( func, *inst, 0, 0, chan_index );
1611 FETCH( func, *inst, 1, 1, chan_index );
1612 emit_mul( func, 0, 1 );
1613 STORE( func, *inst, 0, 0, chan_index );
1614 }
1615 break;
1616
1617 case TGSI_OPCODE_ADD:
1618 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1619 FETCH( func, *inst, 0, 0, chan_index );
1620 FETCH( func, *inst, 1, 1, chan_index );
1621 emit_add( func, 0, 1 );
1622 STORE( func, *inst, 0, 0, chan_index );
1623 }
1624 break;
1625
1626 case TGSI_OPCODE_DP3:
1627 /* TGSI_OPCODE_DOT3 */
1628 FETCH( func, *inst, 0, 0, CHAN_X );
1629 FETCH( func, *inst, 1, 1, CHAN_X );
1630 emit_mul( func, 0, 1 );
1631 FETCH( func, *inst, 1, 0, CHAN_Y );
1632 FETCH( func, *inst, 2, 1, CHAN_Y );
1633 emit_mul( func, 1, 2 );
1634 emit_add( func, 0, 1 );
1635 FETCH( func, *inst, 1, 0, CHAN_Z );
1636 FETCH( func, *inst, 2, 1, CHAN_Z );
1637 emit_mul( func, 1, 2 );
1638 emit_add( func, 0, 1 );
1639 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1640 STORE( func, *inst, 0, 0, chan_index );
1641 }
1642 break;
1643
1644 case TGSI_OPCODE_DP4:
1645 /* TGSI_OPCODE_DOT4 */
1646 FETCH( func, *inst, 0, 0, CHAN_X );
1647 FETCH( func, *inst, 1, 1, CHAN_X );
1648 emit_mul( func, 0, 1 );
1649 FETCH( func, *inst, 1, 0, CHAN_Y );
1650 FETCH( func, *inst, 2, 1, CHAN_Y );
1651 emit_mul( func, 1, 2 );
1652 emit_add( func, 0, 1 );
1653 FETCH( func, *inst, 1, 0, CHAN_Z );
1654 FETCH( func, *inst, 2, 1, CHAN_Z );
1655 emit_mul(func, 1, 2 );
1656 emit_add(func, 0, 1 );
1657 FETCH( func, *inst, 1, 0, CHAN_W );
1658 FETCH( func, *inst, 2, 1, CHAN_W );
1659 emit_mul( func, 1, 2 );
1660 emit_add( func, 0, 1 );
1661 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1662 STORE( func, *inst, 0, 0, chan_index );
1663 }
1664 break;
1665
1666 case TGSI_OPCODE_DST:
1667 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
1668 emit_tempf(
1669 func,
1670 0,
1671 TEMP_ONE_I,
1672 TEMP_ONE_C );
1673 STORE( func, *inst, 0, 0, CHAN_X );
1674 }
1675 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
1676 FETCH( func, *inst, 0, 0, CHAN_Y );
1677 FETCH( func, *inst, 1, 1, CHAN_Y );
1678 emit_mul( func, 0, 1 );
1679 STORE( func, *inst, 0, 0, CHAN_Y );
1680 }
1681 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
1682 FETCH( func, *inst, 0, 0, CHAN_Z );
1683 STORE( func, *inst, 0, 0, CHAN_Z );
1684 }
1685 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
1686 FETCH( func, *inst, 0, 1, CHAN_W );
1687 STORE( func, *inst, 0, 0, CHAN_W );
1688 }
1689 break;
1690
1691 case TGSI_OPCODE_MIN:
1692 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1693 FETCH( func, *inst, 0, 0, chan_index );
1694 FETCH( func, *inst, 1, 1, chan_index );
1695 sse_minps(
1696 func,
1697 make_xmm( 0 ),
1698 make_xmm( 1 ) );
1699 STORE( func, *inst, 0, 0, chan_index );
1700 }
1701 break;
1702
1703 case TGSI_OPCODE_MAX:
1704 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1705 FETCH( func, *inst, 0, 0, chan_index );
1706 FETCH( func, *inst, 1, 1, chan_index );
1707 sse_maxps(
1708 func,
1709 make_xmm( 0 ),
1710 make_xmm( 1 ) );
1711 STORE( func, *inst, 0, 0, chan_index );
1712 }
1713 break;
1714
1715 case TGSI_OPCODE_SLT:
1716 /* TGSI_OPCODE_SETLT */
1717 emit_setcc( func, inst, cc_LessThan );
1718 break;
1719
1720 case TGSI_OPCODE_SGE:
1721 /* TGSI_OPCODE_SETGE */
1722 emit_setcc( func, inst, cc_NotLessThan );
1723 break;
1724
1725 case TGSI_OPCODE_MAD:
1726 /* TGSI_OPCODE_MADD */
1727 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1728 FETCH( func, *inst, 0, 0, chan_index );
1729 FETCH( func, *inst, 1, 1, chan_index );
1730 FETCH( func, *inst, 2, 2, chan_index );
1731 emit_mul( func, 0, 1 );
1732 emit_add( func, 0, 2 );
1733 STORE( func, *inst, 0, 0, chan_index );
1734 }
1735 break;
1736
1737 case TGSI_OPCODE_SUB:
1738 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1739 FETCH( func, *inst, 0, 0, chan_index );
1740 FETCH( func, *inst, 1, 1, chan_index );
1741 emit_sub( func, 0, 1 );
1742 STORE( func, *inst, 0, 0, chan_index );
1743 }
1744 break;
1745
1746 case TGSI_OPCODE_LERP:
1747 /* TGSI_OPCODE_LRP */
1748 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1749 FETCH( func, *inst, 0, 0, chan_index );
1750 FETCH( func, *inst, 1, 1, chan_index );
1751 FETCH( func, *inst, 2, 2, chan_index );
1752 emit_sub( func, 1, 2 );
1753 emit_mul( func, 0, 1 );
1754 emit_add( func, 0, 2 );
1755 STORE( func, *inst, 0, 0, chan_index );
1756 }
1757 break;
1758
1759 case TGSI_OPCODE_CND:
1760 return 0;
1761 break;
1762
1763 case TGSI_OPCODE_CND0:
1764 return 0;
1765 break;
1766
1767 case TGSI_OPCODE_DOT2ADD:
1768 /* TGSI_OPCODE_DP2A */
1769 FETCH( func, *inst, 0, 0, CHAN_X ); /* xmm0 = src[0].x */
1770 FETCH( func, *inst, 1, 1, CHAN_X ); /* xmm1 = src[1].x */
1771 emit_mul( func, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
1772 FETCH( func, *inst, 1, 0, CHAN_Y ); /* xmm1 = src[0].y */
1773 FETCH( func, *inst, 2, 1, CHAN_Y ); /* xmm2 = src[1].y */
1774 emit_mul( func, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
1775 emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
1776 FETCH( func, *inst, 1, 2, CHAN_X ); /* xmm1 = src[2].x */
1777 emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
1778 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1779 STORE( func, *inst, 0, 0, chan_index ); /* dest[ch] = xmm0 */
1780 }
1781 break;
1782
1783 case TGSI_OPCODE_INDEX:
1784 return 0;
1785 break;
1786
1787 case TGSI_OPCODE_NEGATE:
1788 return 0;
1789 break;
1790
1791 case TGSI_OPCODE_FRAC:
1792 /* TGSI_OPCODE_FRC */
1793 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1794 FETCH( func, *inst, 0, 0, chan_index );
1795 emit_frc( func, 0, 0 );
1796 STORE( func, *inst, 0, 0, chan_index );
1797 }
1798 break;
1799
1800 case TGSI_OPCODE_CLAMP:
1801 return 0;
1802 break;
1803
1804 case TGSI_OPCODE_FLOOR:
1805 /* TGSI_OPCODE_FLR */
1806 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1807 FETCH( func, *inst, 0, 0, chan_index );
1808 emit_flr( func, 0, 0 );
1809 STORE( func, *inst, 0, 0, chan_index );
1810 }
1811 break;
1812
1813 case TGSI_OPCODE_ROUND:
1814 return 0;
1815 break;
1816
1817 case TGSI_OPCODE_EXPBASE2:
1818 /* TGSI_OPCODE_EX2 */
1819 FETCH( func, *inst, 0, 0, CHAN_X );
1820 emit_ex2( func, 0, 0 );
1821 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1822 STORE( func, *inst, 0, 0, chan_index );
1823 }
1824 break;
1825
1826 case TGSI_OPCODE_LOGBASE2:
1827 /* TGSI_OPCODE_LG2 */
1828 FETCH( func, *inst, 0, 0, CHAN_X );
1829 emit_lg2( func, 0, 0 );
1830 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1831 STORE( func, *inst, 0, 0, chan_index );
1832 }
1833 break;
1834
1835 case TGSI_OPCODE_POWER:
1836 /* TGSI_OPCODE_POW */
1837 FETCH( func, *inst, 0, 0, CHAN_X );
1838 FETCH( func, *inst, 1, 1, CHAN_X );
1839 emit_pow( func, 0, 0, 1 );
1840 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1841 STORE( func, *inst, 0, 0, chan_index );
1842 }
1843 break;
1844
1845 case TGSI_OPCODE_CROSSPRODUCT:
1846 /* TGSI_OPCODE_XPD */
1847 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1848 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
1849 FETCH( func, *inst, 1, 1, CHAN_Z );
1850 FETCH( func, *inst, 3, 0, CHAN_Z );
1851 }
1852 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1853 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1854 FETCH( func, *inst, 0, 0, CHAN_Y );
1855 FETCH( func, *inst, 4, 1, CHAN_Y );
1856 }
1857 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
1858 emit_MOV( func, 2, 0 );
1859 emit_mul( func, 2, 1 );
1860 emit_MOV( func, 5, 3 );
1861 emit_mul( func, 5, 4 );
1862 emit_sub( func, 2, 5 );
1863 STORE( func, *inst, 2, 0, CHAN_X );
1864 }
1865 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1866 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1867 FETCH( func, *inst, 2, 1, CHAN_X );
1868 FETCH( func, *inst, 5, 0, CHAN_X );
1869 }
1870 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
1871 emit_mul( func, 3, 2 );
1872 emit_mul( func, 1, 5 );
1873 emit_sub( func, 3, 1 );
1874 STORE( func, *inst, 3, 0, CHAN_Y );
1875 }
1876 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
1877 emit_mul( func, 5, 4 );
1878 emit_mul( func, 0, 2 );
1879 emit_sub( func, 5, 0 );
1880 STORE( func, *inst, 5, 0, CHAN_Z );
1881 }
1882 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
1883 emit_tempf(
1884 func,
1885 0,
1886 TEMP_ONE_I,
1887 TEMP_ONE_C );
1888 STORE( func, *inst, 0, 0, CHAN_W );
1889 }
1890 break;
1891
1892 case TGSI_OPCODE_MULTIPLYMATRIX:
1893 return 0;
1894 break;
1895
1896 case TGSI_OPCODE_ABS:
1897 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1898 FETCH( func, *inst, 0, 0, chan_index );
1899 emit_abs( func, 0 );
1900
1901 STORE( func, *inst, 0, 0, chan_index );
1902 }
1903 break;
1904
1905 case TGSI_OPCODE_RCC:
1906 return 0;
1907 break;
1908
1909 case TGSI_OPCODE_DPH:
1910 FETCH( func, *inst, 0, 0, CHAN_X );
1911 FETCH( func, *inst, 1, 1, CHAN_X );
1912 emit_mul( func, 0, 1 );
1913 FETCH( func, *inst, 1, 0, CHAN_Y );
1914 FETCH( func, *inst, 2, 1, CHAN_Y );
1915 emit_mul( func, 1, 2 );
1916 emit_add( func, 0, 1 );
1917 FETCH( func, *inst, 1, 0, CHAN_Z );
1918 FETCH( func, *inst, 2, 1, CHAN_Z );
1919 emit_mul( func, 1, 2 );
1920 emit_add( func, 0, 1 );
1921 FETCH( func, *inst, 1, 1, CHAN_W );
1922 emit_add( func, 0, 1 );
1923 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1924 STORE( func, *inst, 0, 0, chan_index );
1925 }
1926 break;
1927
1928 case TGSI_OPCODE_COS:
1929 FETCH( func, *inst, 0, 0, CHAN_X );
1930 emit_cos( func, 0, 0 );
1931 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1932 STORE( func, *inst, 0, 0, chan_index );
1933 }
1934 break;
1935
1936 case TGSI_OPCODE_DDX:
1937 return 0;
1938 break;
1939
1940 case TGSI_OPCODE_DDY:
1941 return 0;
1942 break;
1943
1944 case TGSI_OPCODE_KILP:
1945 /* predicated kill */
1946 emit_kilp( func );
1947 return 0; /* XXX fix me */
1948 break;
1949
1950 case TGSI_OPCODE_KIL:
1951 /* conditional kill */
1952 emit_kil( func, &inst->FullSrcRegisters[0] );
1953 break;
1954
1955 case TGSI_OPCODE_PK2H:
1956 return 0;
1957 break;
1958
1959 case TGSI_OPCODE_PK2US:
1960 return 0;
1961 break;
1962
1963 case TGSI_OPCODE_PK4B:
1964 return 0;
1965 break;
1966
1967 case TGSI_OPCODE_PK4UB:
1968 return 0;
1969 break;
1970
1971 case TGSI_OPCODE_RFL:
1972 return 0;
1973 break;
1974
1975 case TGSI_OPCODE_SEQ:
1976 return 0;
1977 break;
1978
1979 case TGSI_OPCODE_SFL:
1980 return 0;
1981 break;
1982
1983 case TGSI_OPCODE_SGT:
1984 return 0;
1985 break;
1986
1987 case TGSI_OPCODE_SIN:
1988 FETCH( func, *inst, 0, 0, CHAN_X );
1989 emit_sin( func, 0, 0 );
1990 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1991 STORE( func, *inst, 0, 0, chan_index );
1992 }
1993 break;
1994
1995 case TGSI_OPCODE_SLE:
1996 return 0;
1997 break;
1998
1999 case TGSI_OPCODE_SNE:
2000 return 0;
2001 break;
2002
2003 case TGSI_OPCODE_STR:
2004 return 0;
2005 break;
2006
2007 case TGSI_OPCODE_TEX:
2008 if (0) {
2009 /* Disable dummy texture code:
2010 */
2011 emit_tempf(
2012 func,
2013 0,
2014 TEMP_ONE_I,
2015 TEMP_ONE_C );
2016 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2017 STORE( func, *inst, 0, 0, chan_index );
2018 }
2019 }
2020 else {
2021 return 0;
2022 }
2023 break;
2024
2025 case TGSI_OPCODE_TXD:
2026 return 0;
2027 break;
2028
2029 case TGSI_OPCODE_UP2H:
2030 return 0;
2031 break;
2032
2033 case TGSI_OPCODE_UP2US:
2034 return 0;
2035 break;
2036
2037 case TGSI_OPCODE_UP4B:
2038 return 0;
2039 break;
2040
2041 case TGSI_OPCODE_UP4UB:
2042 return 0;
2043 break;
2044
2045 case TGSI_OPCODE_X2D:
2046 return 0;
2047 break;
2048
2049 case TGSI_OPCODE_ARA:
2050 return 0;
2051 break;
2052
2053 case TGSI_OPCODE_ARR:
2054 return 0;
2055 break;
2056
2057 case TGSI_OPCODE_BRA:
2058 return 0;
2059 break;
2060
2061 case TGSI_OPCODE_CAL:
2062 return 0;
2063 break;
2064
2065 case TGSI_OPCODE_RET:
2066 emit_ret( func );
2067 break;
2068
2069 case TGSI_OPCODE_END:
2070 break;
2071
2072 case TGSI_OPCODE_SSG:
2073 return 0;
2074 break;
2075
2076 case TGSI_OPCODE_CMP:
2077 emit_cmp (func, inst);
2078 break;
2079
2080 case TGSI_OPCODE_SCS:
2081 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
2082 FETCH( func, *inst, 0, 0, CHAN_X );
2083 emit_cos( func, 0, 0 );
2084 STORE( func, *inst, 0, 0, CHAN_X );
2085 }
2086 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2087 FETCH( func, *inst, 0, 0, CHAN_X );
2088 emit_sin( func, 0, 0 );
2089 STORE( func, *inst, 0, 0, CHAN_Y );
2090 }
2091 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2092 emit_tempf(
2093 func,
2094 0,
2095 TGSI_EXEC_TEMP_00000000_I,
2096 TGSI_EXEC_TEMP_00000000_C );
2097 STORE( func, *inst, 0, 0, CHAN_Z );
2098 }
2099 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2100 emit_tempf(
2101 func,
2102 0,
2103 TEMP_ONE_I,
2104 TEMP_ONE_C );
2105 STORE( func, *inst, 0, 0, CHAN_W );
2106 }
2107 break;
2108
2109 case TGSI_OPCODE_TXB:
2110 return 0;
2111 break;
2112
2113 case TGSI_OPCODE_NRM:
2114 /* fall-through */
2115 case TGSI_OPCODE_NRM4:
2116 /* 3 or 4-component normalization */
2117 {
2118 uint dims = (inst->Instruction.Opcode == TGSI_OPCODE_NRM) ? 3 : 4;
2119 /* note: cannot use xmm regs 2/3 here (see emit_rsqrt() above) */
2120 FETCH( func, *inst, 4, 0, CHAN_X ); /* xmm4 = src[0].x */
2121 FETCH( func, *inst, 5, 0, CHAN_Y ); /* xmm5 = src[0].y */
2122 FETCH( func, *inst, 6, 0, CHAN_Z ); /* xmm6 = src[0].z */
2123 if (dims == 4) {
2124 FETCH( func, *inst, 7, 0, CHAN_W ); /* xmm7 = src[0].w */
2125 }
2126 emit_MOV( func, 0, 4 );    /* xmm0 = xmm4 */
2127 emit_mul( func, 0, 4 );    /* xmm0 *= xmm4 */
2128 emit_MOV( func, 1, 5 );    /* xmm1 = xmm5 */
2129 emit_mul( func, 1, 5 );    /* xmm1 *= xmm5 */
2130 emit_add( func, 0, 1 );    /* xmm0 += xmm1 */
2131 emit_MOV( func, 1, 6 );    /* xmm1 = xmm6 */
2132 emit_mul( func, 1, 6 );    /* xmm1 *= xmm6 */
2133 emit_add( func, 0, 1 );    /* xmm0 += xmm1 */
2134 if (dims == 4) {
2135 emit_MOV( func, 1, 7 );    /* xmm1 = xmm7 */
2136 emit_mul( func, 1, 7 );    /* xmm1 *= xmm7 */
2137 emit_add( func, 0, 1 );    /* xmm0 += xmm1 */
2138 }
2139 emit_rsqrt( func, 1, 0 ); /* xmm1 = 1/sqrt(xmm0) */
2140 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2141 if (chan_index < dims) {
2142 emit_mul( func, 4+chan_index, 1); /* xmm[4+ch] *= xmm1 */
2143 STORE( func, *inst, 4+chan_index, 0, chan_index );
2144 }
2145 }
2146 }
2147 break;
2148
2149 case TGSI_OPCODE_DIV:
2150 return 0;
2151 break;
2152
2153 case TGSI_OPCODE_DP2:
2154 FETCH( func, *inst, 0, 0, CHAN_X ); /* xmm0 = src[0].x */
2155 FETCH( func, *inst, 1, 1, CHAN_X ); /* xmm1 = src[1].x */
2156 emit_mul( func, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
2157 FETCH( func, *inst, 1, 0, CHAN_Y ); /* xmm1 = src[0].y */
2158 FETCH( func, *inst, 2, 1, CHAN_Y ); /* xmm2 = src[1].y */
2159 emit_mul( func, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
2160 emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2161 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2162 STORE( func, *inst, 0, 0, chan_index ); /* dest[ch] = xmm0 */
2163 }
2164 break;
2165
2166 case TGSI_OPCODE_TXL:
2167 return 0;
2168 break;
2169
2170 case TGSI_OPCODE_BRK:
2171 return 0;
2172 break;
2173
2174 case TGSI_OPCODE_IF:
2175 return 0;
2176 break;
2177
2178 case TGSI_OPCODE_LOOP:
2179 return 0;
2180 break;
2181
2182 case TGSI_OPCODE_REP:
2183 return 0;
2184 break;
2185
2186 case TGSI_OPCODE_ELSE:
2187 return 0;
2188 break;
2189
2190 case TGSI_OPCODE_ENDIF:
2191 return 0;
2192 break;
2193
2194 case TGSI_OPCODE_ENDLOOP:
2195 return 0;
2196 break;
2197
2198 case TGSI_OPCODE_ENDREP:
2199 return 0;
2200 break;
2201
2202 case TGSI_OPCODE_PUSHA:
2203 return 0;
2204 break;
2205
2206 case TGSI_OPCODE_POPA:
2207 return 0;
2208 break;
2209
2210 case TGSI_OPCODE_CEIL:
2211 return 0;
2212 break;
2213
2214 case TGSI_OPCODE_I2F:
2215 return 0;
2216 break;
2217
2218 case TGSI_OPCODE_NOT:
2219 return 0;
2220 break;
2221
2222 case TGSI_OPCODE_TRUNC:
2223 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2224 FETCH( func, *inst, 0, 0, chan_index );
2225 emit_f2it( func, 0 );
2226 emit_i2f( func, 0 );
2227 STORE( func, *inst, 0, 0, chan_index );
2228 }
2229 break;
2230
2231 case TGSI_OPCODE_SHL:
2232 return 0;
2233 break;
2234
2235 case TGSI_OPCODE_SHR:
2236 return 0;
2237 break;
2238
2239 case TGSI_OPCODE_AND:
2240 return 0;
2241 break;
2242
2243 case TGSI_OPCODE_OR:
2244 return 0;
2245 break;
2246
2247 case TGSI_OPCODE_MOD:
2248 return 0;
2249 break;
2250
2251 case TGSI_OPCODE_XOR:
2252 return 0;
2253 break;
2254
2255 case TGSI_OPCODE_SAD:
2256 return 0;
2257 break;
2258
2259 case TGSI_OPCODE_TXF:
2260 return 0;
2261 break;
2262
2263 case TGSI_OPCODE_TXQ:
2264 return 0;
2265 break;
2266
2267 case TGSI_OPCODE_CONT:
2268 return 0;
2269 break;
2270
2271 case TGSI_OPCODE_EMIT:
2272 return 0;
2273 break;
2274
2275 case TGSI_OPCODE_ENDPRIM:
2276 return 0;
2277 break;
2278
2279 default:
2280 return 0;
2281 }
2282
2283 return 1;
2284 }
2285
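/**
 * Emit interpolation code for a fragment shader input declaration,
 * handling constant, linear and perspective interpolation from the
 * a0/dadx/dady coefficients.
 */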
2286 static void
2287 emit_declaration(
2288 struct x86_function *func,
2289 struct tgsi_full_declaration *decl )
2290 {
2291 if( decl->Declaration.File == TGSI_FILE_INPUT ) {
2292 unsigned first, last, mask;
2293 unsigned i, j;
2294
2295 first = decl->DeclarationRange.First;
2296 last = decl->DeclarationRange.Last;
2297 mask = decl->Declaration.UsageMask;
2298
2299 for( i = first; i <= last; i++ ) {
2300 for( j = 0; j < NUM_CHANNELS; j++ ) {
2301 if( mask & (1 << j) ) {
2302 switch( decl->Declaration.Interpolate ) {
2303 case TGSI_INTERPOLATE_CONSTANT:
2304 emit_coef_a0( func, 0, i, j );
2305 emit_inputs( func, 0, i, j );
2306 break;
2307
2308 case TGSI_INTERPOLATE_LINEAR:
2309 emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
2310 emit_coef_dadx( func, 1, i, j );
2311 emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
2312 emit_coef_dady( func, 3, i, j );
2313 emit_mul( func, 0, 1 ); /* x * dadx */
2314 emit_coef_a0( func, 4, i, j );
2315 emit_mul( func, 2, 3 ); /* y * dady */
2316 emit_add( func, 0, 4 ); /* x * dadx + a0 */
2317 emit_add( func, 0, 2 ); /* x * dadx + y * dady + a0 */
2318 emit_inputs( func, 0, i, j );
2319 break;
2320
2321 case TGSI_INTERPOLATE_PERSPECTIVE:
2322 emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
2323 emit_coef_dadx( func, 1, i, j );
2324 emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
2325 emit_coef_dady( func, 3, i, j );
2326 emit_mul( func, 0, 1 ); /* x * dadx */
2327 emit_tempf( func, 4, 0, TGSI_SWIZZLE_W );
2328 emit_coef_a0( func, 5, i, j );
2329 emit_rcp( func, 4, 4 ); /* 1.0 / w */
2330 emit_mul( func, 2, 3 ); /* y * dady */
2331 emit_add( func, 0, 5 ); /* x * dadx + a0 */
2332 emit_add( func, 0, 2 ); /* x * dadx + y * dady + a0 */
2333 emit_mul( func, 0, 4 ); /* (x * dadx + y * dady + a0) / w */
2334 emit_inputs( func, 0, i, j );
2335 break;
2336
2337 default:
2338 assert( 0 );
2339 break;
2340 }
2341 }
2342 }
2343 }
2344 }
2345 }
2346
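/**
 * Emit code to convert four vertices of AOS input data to the SOA layout
 * the generated shader expects: a 4x4 float transpose built from
 * movlps/movhps loads and shufps.
 */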
static void aos_to_soa( struct x86_function *func,
                        uint arg_aos,
                        uint arg_soa,
                        uint arg_num,
                        uint arg_stride )
{
   struct x86_reg soa_input = x86_make_reg( file_REG32, reg_AX );
   struct x86_reg aos_input = x86_make_reg( file_REG32, reg_BX );
   struct x86_reg num_inputs = x86_make_reg( file_REG32, reg_CX );
   struct x86_reg stride = x86_make_reg( file_REG32, reg_DX );
   int inner_loop;

   /* Save EBX */
   x86_push( func, x86_make_reg( file_REG32, reg_BX ) );

   x86_mov( func, aos_input, x86_fn_arg( func, arg_aos ) );
   x86_mov( func, soa_input, x86_fn_arg( func, arg_soa ) );
   x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) );
   x86_mov( func, stride, x86_fn_arg( func, arg_stride ) );

   /* do */
   inner_loop = x86_get_label( func );
   {
      /* Gather the xy/zw halves of four vertices:
       * xmm0 = x0 y0 x1 y1,  xmm3 = z0 w0 z1 w1
       * xmm1 = x2 y2 x3 y3,  xmm4 = z2 w2 z3 w3
       */
      x86_push( func, aos_input );
      sse_movlps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
      sse_movlps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
      x86_add( func, aos_input, stride );
      sse_movhps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
      sse_movhps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
      x86_add( func, aos_input, stride );
      sse_movlps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
      sse_movlps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
      x86_add( func, aos_input, stride );
      sse_movhps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
      sse_movhps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
      x86_pop( func, aos_input );

      /* Transpose into one vector per channel; shuffle mask 0x88 selects
       * the even lanes of each operand, 0xdd the odd lanes:
       */
      sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
      sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
      sse_shufps( func, make_xmm( 0 ), make_xmm( 1 ), 0x88 );   /* x0 x1 x2 x3 */
      sse_shufps( func, make_xmm( 2 ), make_xmm( 1 ), 0xdd );   /* y0 y1 y2 y3 */
      sse_shufps( func, make_xmm( 3 ), make_xmm( 4 ), 0x88 );   /* z0 z1 z2 z3 */
      sse_shufps( func, make_xmm( 5 ), make_xmm( 4 ), 0xdd );   /* w0 w1 w2 w3 */

      sse_movups( func, x86_make_disp( soa_input, 0 ), make_xmm( 0 ) );
      sse_movups( func, x86_make_disp( soa_input, 16 ), make_xmm( 2 ) );
      sse_movups( func, x86_make_disp( soa_input, 32 ), make_xmm( 3 ) );
      sse_movups( func, x86_make_disp( soa_input, 48 ), make_xmm( 5 ) );

      /* Advance to next input */
      x86_lea( func, aos_input, x86_make_disp(aos_input, 16) );
      x86_lea( func, soa_input, x86_make_disp(soa_input, 64) );
   }
   /* while --num_inputs */
   x86_dec( func, num_inputs );
   x86_jcc( func, cc_NE, inner_loop );

   /* Restore EBX */
   x86_pop( func, aos_input );
}
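
/* Scalar equivalent of the transpose aos_to_soa() emits, assuming four
 * float[4] attributes per vertex group and the SoA layout of one 16-byte
 * vector per channel per attribute.  Kept for illustration only:
 */
#if 0
static void
aos_to_soa_ref( const char *aos, float *soa,
                uint num_inputs, uint stride )
{
   uint attrib, chan, vert;

   for (attrib = 0; attrib < num_inputs; attrib++) {
      for (chan = 0; chan < 4; chan++) {
         for (vert = 0; vert < 4; vert++) {
            const float *v = (const float *) (aos + vert * stride);
            soa[attrib * 16 + chan * 4 + vert] = v[attrib * 4 + chan];
         }
      }
   }
}
#endif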
2408
static void soa_to_aos( struct x86_function *func, uint aos, uint soa, uint num, uint stride )
{
   struct x86_reg soa_output;
   struct x86_reg aos_output;
   struct x86_reg num_outputs;
   struct x86_reg temp;
   int inner_loop;

   soa_output = x86_make_reg( file_REG32, reg_AX );
   aos_output = x86_make_reg( file_REG32, reg_BX );
   num_outputs = x86_make_reg( file_REG32, reg_CX );
   temp = x86_make_reg( file_REG32, reg_DX );

   /* Save EBX */
   x86_push( func, aos_output );

   x86_mov( func, soa_output, x86_fn_arg( func, soa ) );
   x86_mov( func, aos_output, x86_fn_arg( func, aos ) );
   x86_mov( func, num_outputs, x86_fn_arg( func, num ) );

   /* do */
   inner_loop = x86_get_label( func );
   {
      /* Load one vector per channel:
       * xmm0 = x0 x1 x2 x3,  xmm1 = y0 y1 y2 y3
       * xmm3 = z0 z1 z2 z3,  xmm4 = w0 w1 w2 w3
       */
      sse_movups( func, make_xmm( 0 ), x86_make_disp( soa_output, 0 ) );
      sse_movups( func, make_xmm( 1 ), x86_make_disp( soa_output, 16 ) );
      sse_movups( func, make_xmm( 3 ), x86_make_disp( soa_output, 32 ) );
      sse_movups( func, make_xmm( 4 ), x86_make_disp( soa_output, 48 ) );

      /* Transpose back to per-vertex xy/zw halves: */
      sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
      sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
      sse_unpcklps( func, make_xmm( 0 ), make_xmm( 1 ) );   /* x0 y0 x1 y1 */
      sse_unpckhps( func, make_xmm( 2 ), make_xmm( 1 ) );   /* x2 y2 x3 y3 */
      sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) );   /* z0 w0 z1 w1 */
      sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) );   /* z2 w2 z3 w3 */

      /* Scatter the four vertices, one xyzw at a time: */
      x86_mov( func, temp, x86_fn_arg( func, stride ) );
      x86_push( func, aos_output );
      sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
      sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
      x86_add( func, aos_output, temp );
      sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
      sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
      x86_add( func, aos_output, temp );
      sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
      sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
      x86_add( func, aos_output, temp );
      sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
      sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
      x86_pop( func, aos_output );

      /* Advance to next output */
      x86_lea( func, aos_output, x86_make_disp(aos_output, 16) );
      x86_lea( func, soa_output, x86_make_disp(soa_output, 64) );
   }
   /* while --num_outputs */
   x86_dec( func, num_outputs );
   x86_jcc( func, cc_NE, inner_loop );

   /* Restore EBX */
   x86_pop( func, aos_output );
}
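
/* And the inverse reference for soa_to_aos(), again illustration only: */
#if 0
static void
soa_to_aos_ref( char *aos, const float *soa,
                uint num_outputs, uint stride )
{
   uint attrib, chan, vert;

   for (attrib = 0; attrib < num_outputs; attrib++) {
      for (chan = 0; chan < 4; chan++) {
         for (vert = 0; vert < 4; vert++) {
            float *v = (float *) (aos + vert * stride);
            v[attrib * 4 + chan] = soa[attrib * 16 + chan * 4 + vert];
         }
      }
   }
}
#endif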
2470
/**
 * Translate a TGSI vertex/fragment shader to SSE2 code.
 * Slightly different things are done for vertex vs. fragment shaders.
 *
 * Note that fragment shaders are responsible for interpolating shader
 * inputs.  Because on x86 we have only 4 GP registers but 5 shader
 * arguments here (input, output, const, temp and coef), the code is split
 * into two phases -- a DECLARATION and an INSTRUCTION phase.  The GP
 * register holding the output argument is aliased with the coef argument,
 * as outputs are not needed in the DECLARATION phase.
 *
 * \param tokens  the TGSI input shader
 * \param func  the output SSE code/function
 * \param immediates  buffer to place immediates, later passed to SSE func
 * \param do_swizzles  for vertex shaders, convert inputs from AoS to SoA
 *                     before execution and outputs back to AoS afterwards
 * \return  1 for success, 0 if translation failed
 */
unsigned
tgsi_emit_sse2(
   const struct tgsi_token *tokens,
   struct x86_function *func,
   float (*immediates)[4],
   boolean do_swizzles )
{
   struct tgsi_parse_context parse;
   boolean instruction_phase = FALSE;
   unsigned ok = 1;
   uint num_immediates = 0;

   util_init_math();

   func->csr = func->store;

   tgsi_parse_init( &parse, tokens );

   /* Can't just use EDI, EBX without save/restoring them:
    */
   x86_push(
      func,
      get_immediate_base() );

   x86_push(
      func,
      get_temp_base() );

   /*
    * Different function args for vertex/fragment shaders:
    */
   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
      /* DECLARATION phase, do not load output argument. */
      x86_mov(
         func,
         get_input_base(),
         x86_fn_arg( func, 1 ) );
      /* skipping outputs argument here */
      x86_mov(
         func,
         get_const_base(),
         x86_fn_arg( func, 3 ) );
      x86_mov(
         func,
         get_temp_base(),
         x86_fn_arg( func, 4 ) );
      x86_mov(
         func,
         get_coef_base(),
         x86_fn_arg( func, 5 ) );
      x86_mov(
         func,
         get_immediate_base(),
         x86_fn_arg( func, 6 ) );
   }
   else {
      assert(parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX);

      if (do_swizzles)
         aos_to_soa( func,
                     6,     /* aos_input */
                     1,     /* machine->input */
                     7,     /* num_inputs */
                     8 );   /* input_stride */

      x86_mov(
         func,
         get_input_base(),
         x86_fn_arg( func, 1 ) );
      x86_mov(
         func,
         get_output_base(),
         x86_fn_arg( func, 2 ) );
      x86_mov(
         func,
         get_const_base(),
         x86_fn_arg( func, 3 ) );
      x86_mov(
         func,
         get_temp_base(),
         x86_fn_arg( func, 4 ) );
      x86_mov(
         func,
         get_immediate_base(),
         x86_fn_arg( func, 5 ) );
   }

   while( !tgsi_parse_end_of_tokens( &parse ) && ok ) {
      tgsi_parse_token( &parse );

      switch( parse.FullToken.Token.Type ) {
      case TGSI_TOKEN_TYPE_DECLARATION:
         if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
            emit_declaration(
               func,
               &parse.FullToken.FullDeclaration );
         }
         break;

      case TGSI_TOKEN_TYPE_INSTRUCTION:
         if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
            if( !instruction_phase ) {
               /* INSTRUCTION phase, overwrite coef with output. */
               instruction_phase = TRUE;
               x86_mov(
                  func,
                  get_output_base(),
                  x86_fn_arg( func, 2 ) );
            }
         }

         ok = emit_instruction(
            func,
            &parse.FullToken.FullInstruction );

         if (!ok) {
            debug_printf("failed to translate tgsi opcode %d to SSE (%s)\n",
                         parse.FullToken.FullInstruction.Instruction.Opcode,
                         parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX ?
                         "vertex shader" : "fragment shader");
         }
         break;

      case TGSI_TOKEN_TYPE_IMMEDIATE:
         /* simply copy the immediate values into the next immediates[] slot */
         {
            const uint size = parse.FullToken.FullImmediate.Immediate.Size - 1;
            uint i;
            assert(size <= 4);
            assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES);
            for( i = 0; i < size; i++ ) {
               immediates[num_immediates][i] =
                  parse.FullToken.FullImmediate.u.ImmediateFloat32[i].Float;
            }
#if 0
            debug_printf("SSE FS immediate[%d] = %f %f %f %f\n",
                         num_immediates,
                         immediates[num_immediates][0],
                         immediates[num_immediates][1],
                         immediates[num_immediates][2],
                         immediates[num_immediates][3]);
#endif
            num_immediates++;
         }
         break;

      default:
         ok = 0;
         assert( 0 );
      }
   }

   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
      if (do_swizzles)
         soa_to_aos( func,
                     9,     /* aos_output */
                     2,     /* machine->output */
                     10,    /* num_outputs */
                     11 );  /* output_stride */
   }

   /* Can't just use EBX, EDI without save/restoring them:
    */
   x86_pop(
      func,
      get_temp_base() );

   x86_pop(
      func,
      get_immediate_base() );

   emit_ret( func );

   tgsi_parse_free( &parse );

   return ok;
}
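
/* A minimal usage sketch, kept under #if 0.  The typedef below is an
 * assumption spelling out the fragment-shader argument order loaded in
 * tgsi_emit_sse2() above (input, output, const, temp, coef, immediates);
 * the real typedefs live in the drivers that use this module.
 */
#if 0
typedef void (XSTDCALL *codegen_function)(
   const struct tgsi_exec_vector *input,
   struct tgsi_exec_vector *output,
   float (*constant)[4],
   struct tgsi_exec_vector *temporary,
   const struct tgsi_interp_coef *coef,
   float (*immediates)[4] );

static void
example_translate( const struct tgsi_token *tokens )
{
   struct x86_function func;
   float immediates[TGSI_EXEC_NUM_IMMEDIATES][4];

   x86_init_func( &func );
   if (tgsi_emit_sse2( tokens, &func, immediates, FALSE )) {
      codegen_function fn = (codegen_function) x86_get_func( &func );
      /* ... invoke fn() per quad with the machine's vectors ... */
      (void) fn;
   }
   x86_release_func( &func );
}
#endif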

#endif /* PIPE_ARCH_X86 */