Merge branch 'mesa_7_5_branch' into mesa_7_6_branch
[mesa.git] / src / gallium / auxiliary / tgsi / tgsi_sse2.c
1 /**************************************************************************
2 *
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28 #include "pipe/p_config.h"
29
30 #if defined(PIPE_ARCH_X86)
31
32 #include "util/u_debug.h"
33 #include "pipe/p_shader_tokens.h"
34 #include "util/u_math.h"
35 #include "util/u_memory.h"
36 #if defined(PIPE_ARCH_SSE)
37 #include "util/u_sse.h"
38 #endif
39 #include "tgsi/tgsi_info.h"
40 #include "tgsi/tgsi_parse.h"
41 #include "tgsi/tgsi_util.h"
42 #include "tgsi_exec.h"
43 #include "tgsi_sse2.h"
44
45 #include "rtasm/rtasm_x86sse.h"
46
/* for 1/sqrt()
 *
 * This costs about 100fps (close to 10%) in gears:
 */
#define HIGH_PRECISION 1

/* NOTE(review): FAST_MATH is not referenced in this part of the file;
 * presumably it gates approximate math paths elsewhere -- confirm.
 */
#define FAST_MATH 1


/* Iterate CHAN over the four vector components (X, Y, Z, W). */
#define FOR_EACH_CHANNEL( CHAN )\
   for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)

/* Test one bit of the writemask of the instruction's first dst register. */
#define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
   ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))

#define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
   if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))

/* Iterate only over the channels enabled in dst0's writemask. */
#define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
   FOR_EACH_CHANNEL( CHAN )\
   IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )

/* Component indices within a vector. */
#define CHAN_X 0
#define CHAN_Y 1
#define CHAN_Z 2
#define CHAN_W 3

/* Shorthand for well-known constant/scratch slots in the exec machine. */
#define TEMP_ONE_I TGSI_EXEC_TEMP_ONE_I
#define TEMP_ONE_C TGSI_EXEC_TEMP_ONE_C

#define TEMP_R0 TGSI_EXEC_TEMP_R0
#define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
#define TEMP_EXEC_MASK_I TGSI_EXEC_MASK_I
#define TEMP_EXEC_MASK_C TGSI_EXEC_MASK_C
82
83 /**
84 * X86 utility functions.
85 */
86
87 static struct x86_reg
88 make_xmm(
89 unsigned xmm )
90 {
91 return x86_make_reg(
92 file_XMM,
93 (enum x86_reg_name) xmm );
94 }
95
96 /**
97 * X86 register mapping helpers.
98 */
99
100 static struct x86_reg
101 get_const_base( void )
102 {
103 return x86_make_reg(
104 file_REG32,
105 reg_AX );
106 }
107
108 static struct x86_reg
109 get_machine_base( void )
110 {
111 return x86_make_reg(
112 file_REG32,
113 reg_CX );
114 }
115
116 static struct x86_reg
117 get_input_base( void )
118 {
119 return x86_make_disp(
120 get_machine_base(),
121 Offset(struct tgsi_exec_machine, Inputs) );
122 }
123
124 static struct x86_reg
125 get_output_base( void )
126 {
127 return x86_make_disp(
128 get_machine_base(),
129 Offset(struct tgsi_exec_machine, Outputs) );
130 }
131
132 static struct x86_reg
133 get_temp_base( void )
134 {
135 return x86_make_disp(
136 get_machine_base(),
137 Offset(struct tgsi_exec_machine, Temps) );
138 }
139
140 static struct x86_reg
141 get_coef_base( void )
142 {
143 return x86_make_reg(
144 file_REG32,
145 reg_BX );
146 }
147
148 static struct x86_reg
149 get_sampler_base( void )
150 {
151 return x86_make_reg(
152 file_REG32,
153 reg_DI );
154 }
155
156 static struct x86_reg
157 get_immediate_base( void )
158 {
159 return x86_make_reg(
160 file_REG32,
161 reg_DX );
162 }
163
164
165 /**
166 * Data access helpers.
167 */
168
169
170 static struct x86_reg
171 get_immediate(
172 unsigned vec,
173 unsigned chan )
174 {
175 return x86_make_disp(
176 get_immediate_base(),
177 (vec * 4 + chan) * 4 );
178 }
179
180 static struct x86_reg
181 get_const(
182 unsigned vec,
183 unsigned chan )
184 {
185 return x86_make_disp(
186 get_const_base(),
187 (vec * 4 + chan) * 4 );
188 }
189
190 static struct x86_reg
191 get_sampler_ptr(
192 unsigned unit )
193 {
194 return x86_make_disp(
195 get_sampler_base(),
196 unit * sizeof( struct tgsi_sampler * ) );
197 }
198
199 static struct x86_reg
200 get_input(
201 unsigned vec,
202 unsigned chan )
203 {
204 return x86_make_disp(
205 get_input_base(),
206 (vec * 4 + chan) * 16 );
207 }
208
209 static struct x86_reg
210 get_output(
211 unsigned vec,
212 unsigned chan )
213 {
214 return x86_make_disp(
215 get_output_base(),
216 (vec * 4 + chan) * 16 );
217 }
218
219 static struct x86_reg
220 get_temp(
221 unsigned vec,
222 unsigned chan )
223 {
224 return x86_make_disp(
225 get_temp_base(),
226 (vec * 4 + chan) * 16 );
227 }
228
229 static struct x86_reg
230 get_coef(
231 unsigned vec,
232 unsigned chan,
233 unsigned member )
234 {
235 return x86_make_disp(
236 get_coef_base(),
237 ((vec * 3 + member) * 4 + chan) * 4 );
238 }
239
240
/* Emit a function return instruction. */
static void
emit_ret( struct x86_function *func )
{
   x86_ret( func );
}
247
248
249 /**
250 * Data fetch helpers.
251 */
252
/**
 * Copy a shader constant to an xmm register, splatted across all four
 * SoA lanes.
 * \param xmm the destination xmm register
 * \param vec the src const buffer index, or the offset from the address
 *            register's value when indirect addressing is used
 * \param chan src channel to fetch (X, Y, Z or W)
 * \param indirect non-zero => fetch CONST[ADDR+vec]
 * \param indirectFile file of the indirect register (ADDRESS only)
 * \param indirectIndex index of the indirect register (0 only)
 */
static void
emit_const(
   struct x86_function *func,
   uint xmm,
   int vec,
   uint chan,
   uint indirect,
   uint indirectFile,
   int indirectIndex )
{
   if (indirect) {
      /* 'vec' is the offset from the address register's value.
       * We're loading CONST[ADDR+vec] into an xmm register.
       */
      struct x86_reg r0 = get_immediate_base();
      struct x86_reg r1 = get_coef_base();
      uint i;

      /* Only ADDR[0]-relative indirection is supported. */
      assert( indirectFile == TGSI_FILE_ADDRESS );
      assert( indirectIndex == 0 );
      assert( r0.mod == mod_REG );
      assert( r1.mod == mod_REG );

      /* Save the two GP registers we borrow as scratch. */
      x86_push( func, r0 );
      x86_push( func, r1 );

      /*
       * Loop over the four pixels or vertices in the quad.
       * Get the value of the address (offset) register for pixel/vertex[i],
       * add it to the src offset and index into the constant buffer.
       * Note that we're working on SOA data.
       * If any of the pixel/vertex execution channels are unused their
       * values will be garbage.  It's very important that we don't use
       * those garbage values as indexes into the constant buffer since
       * that'll cause segfaults.
       * The solution is to bitwise-AND the offset with the execution mask
       * register whose values are either 0 or ~0.
       * The caller must setup the execution mask register to indicate
       * which channels are valid/alive before running the shader.
       * The execution mask will also figure into loops and conditionals
       * someday.
       */
      for (i = 0; i < QUAD_SIZE; i++) {
         /* r1 = address register[i] */
         x86_mov( func, r1, x86_make_disp( get_temp( TEMP_ADDR, CHAN_X ), i * 4 ) );
         /* r0 = execution mask[i] */
         x86_mov( func, r0, x86_make_disp( get_temp( TEMP_EXEC_MASK_I, TEMP_EXEC_MASK_C ), i * 4 ) );
         /* r1 = r1 & r0 */
         x86_and( func, r1, r0 );
         /* r0 = 'vec', the offset */
         x86_lea( func, r0, get_const( vec, chan ) );

         /* Quick hack to multiply r1 by 16 -- need to add SHL to rtasm.
          */
         x86_add( func, r1, r1 );
         x86_add( func, r1, r1 );
         x86_add( func, r1, r1 );
         x86_add( func, r1, r1 );

         x86_add( func, r0, r1 );    /* r0 = r0 + r1 */
         x86_mov( func, r1, x86_deref( r0 ) );
         /* Stage the fetched scalar into lane i of the TEMP_R0 scratch. */
         x86_mov( func, x86_make_disp( get_temp( TEMP_R0, CHAN_X ), i * 4 ), r1 );
      }

      x86_pop( func, r1 );
      x86_pop( func, r0 );

      /* Load all four gathered floats into the destination xmm. */
      sse_movaps(
         func,
         make_xmm( xmm ),
         get_temp( TEMP_R0, CHAN_X ) );
   }
   else {
      /* 'vec' is the index into the src register file, such as TEMP[vec] */
      assert( vec >= 0 );

      /* Load the scalar, then broadcast it to all four lanes. */
      sse_movss(
         func,
         make_xmm( xmm ),
         get_const( vec, chan ) );
      sse_shufps(
         func,
         make_xmm( xmm ),
         make_xmm( xmm ),
         SHUF( 0, 0, 0, 0 ) );
   }
}
346
347 static void
348 emit_immediate(
349 struct x86_function *func,
350 unsigned xmm,
351 unsigned vec,
352 unsigned chan )
353 {
354 sse_movss(
355 func,
356 make_xmm( xmm ),
357 get_immediate( vec, chan ) );
358 sse_shufps(
359 func,
360 make_xmm( xmm ),
361 make_xmm( xmm ),
362 SHUF( 0, 0, 0, 0 ) );
363 }
364
365
/**
 * Copy a shader input to an xmm register.
 * \param xmm the destination xmm register
 * \param vec the src input attrib
 * \param chan src channel to fetch (X, Y, Z or W)
 */
static void
emit_inputf(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   /* Unaligned load: input storage is not guaranteed 16-byte aligned. */
   sse_movups( func, make_xmm( xmm ), get_input( vec, chan ) );
}
384
/**
 * Store an xmm register to a shader output.
 * \param xmm the source xmm register
 * \param vec the dest output attrib
 * \param chan dest channel to store (X, Y, Z or W)
 */
static void
emit_output(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   /* Unaligned store: output storage is not guaranteed 16-byte aligned. */
   sse_movups( func, get_output( vec, chan ), make_xmm( xmm ) );
}
403
/**
 * Copy a shader temporary to an xmm register.
 * \param xmm the destination xmm register
 * \param vec the src temp register
 * \param chan src channel to fetch (X, Y, Z or W)
 */
static void
emit_tempf(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   /* Temps are 16-byte aligned, so an aligned load is safe. */
   sse_movaps( func, make_xmm( xmm ), get_temp( vec, chan ) );
}
422
423 /**
424 * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
425 * \param xmm the destination xmm register
426 * \param vec the src input/attribute coefficient index
427 * \param chan src channel to fetch (X, Y, Z or W)
428 * \param member 0=a0, 1=dadx, 2=dady
429 */
430 static void
431 emit_coef(
432 struct x86_function *func,
433 unsigned xmm,
434 unsigned vec,
435 unsigned chan,
436 unsigned member )
437 {
438 sse_movss(
439 func,
440 make_xmm( xmm ),
441 get_coef( vec, chan, member ) );
442 sse_shufps(
443 func,
444 make_xmm( xmm ),
445 make_xmm( xmm ),
446 SHUF( 0, 0, 0, 0 ) );
447 }
448
449 /**
450 * Data store helpers.
451 */
452
453 static void
454 emit_inputs(
455 struct x86_function *func,
456 unsigned xmm,
457 unsigned vec,
458 unsigned chan )
459 {
460 sse_movups(
461 func,
462 get_input( vec, chan ),
463 make_xmm( xmm ) );
464 }
465
466 static void
467 emit_temps(
468 struct x86_function *func,
469 unsigned xmm,
470 unsigned vec,
471 unsigned chan )
472 {
473 sse_movaps(
474 func,
475 get_temp( vec, chan ),
476 make_xmm( xmm ) );
477 }
478
479 static void
480 emit_addrs(
481 struct x86_function *func,
482 unsigned xmm,
483 unsigned vec,
484 unsigned chan )
485 {
486 assert( vec == 0 );
487
488 emit_temps(
489 func,
490 xmm,
491 vec + TGSI_EXEC_TEMP_ADDR,
492 chan );
493 }
494
495 /**
496 * Coefficent fetch helpers.
497 */
498
/* Fetch the a0 (constant term) interpolation coefficient. */
static void
emit_coef_a0(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   emit_coef( func, xmm, vec, chan, 0 );
}

/* Fetch the dadx (x derivative) interpolation coefficient. */
static void
emit_coef_dadx(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   emit_coef( func, xmm, vec, chan, 1 );
}

/* Fetch the dady (y derivative) interpolation coefficient. */
static void
emit_coef_dady(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   emit_coef( func, xmm, vec, chan, 2 );
}
543
544 /**
545 * Function call helpers.
546 */
547
/**
 * Emit a call out to a C function, preserving the caller-saved GP
 * registers (eax/ecx/edx) and a chosen set of xmm registers across
 * the call.
 *
 * NOTE: In gcc, if the destination uses the SSE intrinsics, then it must be
 * defined with __attribute__((force_align_arg_pointer)), as we do not guarantee
 * that the stack pointer is 16 byte aligned, as expected.
 *
 * \param xmm_save_mask bitmask of xmm registers to save/restore
 * \param arg array of argument memory locations; a pointer to each is pushed
 * \param nr_args number of entries in 'arg'
 * \param code address of the cdecl function to call
 *
 * NOTE(review): arguments are pushed in forward order; cdecl expects the
 * last argument pushed first.  All visible callers pass nr_args == 1, so
 * this doesn't matter today -- confirm before passing more arguments.
 */
static void
emit_func_call(
   struct x86_function *func,
   unsigned xmm_save_mask,
   const struct x86_reg *arg,
   unsigned nr_args,
   void (PIPE_CDECL *code)() )
{
   struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
   unsigned i, n;

   /* Save caller-saved GP registers. */
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_AX) );
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_CX) );
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_DX) );

   /* Store XMM regs to the stack
    */
   /* First count how many so we can reserve stack space in one go. */
   for(i = 0, n = 0; i < 8; ++i)
      if(xmm_save_mask & (1 << i))
         ++n;

   x86_sub_imm(
      func,
      x86_make_reg( file_REG32, reg_SP ),
      n*16);

   for(i = 0, n = 0; i < 8; ++i)
      if(xmm_save_mask & (1 << i)) {
         sse_movups(
            func,
            x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ),
            make_xmm( i ) );
         ++n;
      }

   for (i = 0; i < nr_args; i++) {
      /* Load the address of the buffer we use for passing arguments and
       * receiving results:
       */
      x86_lea(
         func,
         ecx,
         arg[i] );

      /* Push actual function arguments (currently just the pointer to
       * the buffer above), and call the function:
       */
      x86_push( func, ecx );
   }

   x86_mov_reg_imm( func, ecx, (unsigned long) code );
   x86_call( func, ecx );

   /* Pop the arguments (or just add an immediate to esp)
    */
   for (i = 0; i < nr_args; i++) {
      x86_pop(func, ecx );
   }

   /* Pop the saved XMM regs:
    */
   for(i = 0, n = 0; i < 8; ++i)
      if(xmm_save_mask & (1 << i)) {
         sse_movups(
            func,
            make_xmm( i ),
            x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ) );
         ++n;
      }

   x86_add_imm(
      func,
      x86_make_reg( file_REG32, reg_SP ),
      n*16);

   /* Restore GP registers in a reverse order.
    */
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_DX) );
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_CX) );
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_AX) );
}
646
647 static void
648 emit_func_call_dst_src1(
649 struct x86_function *func,
650 unsigned xmm_save,
651 unsigned xmm_dst,
652 unsigned xmm_src0,
653 void (PIPE_CDECL *code)() )
654 {
655 struct x86_reg store = get_temp( TEMP_R0, 0 );
656 unsigned xmm_mask = ((1 << xmm_save) - 1) & ~(1 << xmm_dst);
657
658 /* Store our input parameters (in xmm regs) to the buffer we use
659 * for passing arguments. We will pass a pointer to this buffer as
660 * the actual function argument.
661 */
662 sse_movaps(
663 func,
664 store,
665 make_xmm( xmm_src0 ) );
666
667 emit_func_call( func,
668 xmm_mask,
669 &store,
670 1,
671 code );
672
673 sse_movaps(
674 func,
675 make_xmm( xmm_dst ),
676 store );
677 }
678
679
680 static void
681 emit_func_call_dst_src2(
682 struct x86_function *func,
683 unsigned xmm_save,
684 unsigned xmm_dst,
685 unsigned xmm_src0,
686 unsigned xmm_src1,
687 void (PIPE_CDECL *code)() )
688 {
689 struct x86_reg store = get_temp( TEMP_R0, 0 );
690 unsigned xmm_mask = ((1 << xmm_save) - 1) & ~(1 << xmm_dst);
691
692 /* Store two inputs to parameter buffer.
693 */
694 sse_movaps(
695 func,
696 store,
697 make_xmm( xmm_src0 ) );
698
699 sse_movaps(
700 func,
701 x86_make_disp( store, 4 * sizeof(float) ),
702 make_xmm( xmm_src1 ) );
703
704
705 /* Emit the call
706 */
707 emit_func_call( func,
708 xmm_mask,
709 &store,
710 1,
711 code );
712
713 /* Retrieve the results:
714 */
715 sse_movaps(
716 func,
717 make_xmm( xmm_dst ),
718 store );
719 }
720
721
722
723
724
725 #if defined(PIPE_ARCH_SSE)
726
/*
 * Fast SSE2 implementation of special math functions.
 */

/* Horner-scheme polynomial evaluation, degree 0 through 5:
 * POLYn(x, c0..cn) = c0 + c1*x + ... + cn*x^n, four lanes at a time.
 */
#define POLY0(x, c0) _mm_set1_ps(c0)
#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))

/* Speed/accuracy trade-off for the exp2/log2 approximations below. */
#define EXP_POLY_DEGREE 3
#define LOG_POLY_DEGREE 5
740
/**
 * Vectorized approximation of 2^x for four floats.
 * See http://www.devmaster.net/forums/showthread.php?p=43580
 */
static INLINE __m128
exp2f4(__m128 x)
{
   __m128i ipart;
   __m128 fpart, expipart, expfpart;

   /* Clamp x so the biased-exponent construction below cannot overflow
    * or underflow the float exponent field.
    */
   x = _mm_min_ps(x, _mm_set1_ps( 129.00000f));
   x = _mm_max_ps(x, _mm_set1_ps(-126.99999f));

   /* ipart = int(x - 0.5) */
   ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f)));

   /* fpart = x - ipart */
   fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart));

   /* expipart = (float) (1 << ipart), built by placing the biased
    * exponent (ipart + 127) directly into the exponent bit field.
    */
   expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23));

   /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
#if EXP_POLY_DEGREE == 5
   expfpart = POLY5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f);
#elif EXP_POLY_DEGREE == 4
   expfpart = POLY4(fpart, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 5.2011464e-2f, 1.3534167e-2f);
#elif EXP_POLY_DEGREE == 3
   expfpart = POLY3(fpart, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 7.8024521e-2f);
#elif EXP_POLY_DEGREE == 2
   expfpart = POLY2(fpart, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f);
#else
#error
#endif

   /* 2^x = 2^ipart * 2^fpart */
   return _mm_mul_ps(expipart, expfpart);
}
777
778
/**
 * Vectorized approximation of log2(x) for four floats.
 * See http://www.devmaster.net/forums/showthread.php?p=43580
 */
static INLINE __m128
log2f4(__m128 x)
{
   __m128i expmask = _mm_set1_epi32(0x7f800000);   /* float exponent bits */
   __m128i mantmask = _mm_set1_epi32(0x007fffff);  /* float mantissa bits */
   __m128 one = _mm_set1_ps(1.0f);

   __m128i i = _mm_castps_si128(x);

   /* exp = (float) exponent(x) */
   __m128 exp = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, expmask), 23), _mm_set1_epi32(127)));

   /* mant = (float) mantissa(x), normalized into [1, 2[ by OR-ing in
    * the exponent bits of 1.0f.
    */
   __m128 mant = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, mantmask)), one);

   __m128 logmant;

   /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
    * These coefficients can be generate with
    * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
    */
#if LOG_POLY_DEGREE == 6
   logmant = POLY5(mant, 3.11578814719469302614f, -3.32419399085241980044f, 2.59883907202499966007f, -1.23152682416275988241f, 0.318212422185251071475f, -0.0344359067839062357313f);
#elif LOG_POLY_DEGREE == 5
   logmant = POLY4(mant, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
   logmant = POLY3(mant, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
   logmant = POLY2(mant, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
#else
#error
#endif

   /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
   logmant = _mm_mul_ps(logmant, _mm_sub_ps(mant, one));

   /* log2(x) = log2(mant) + exponent */
   return _mm_add_ps(logmant, exp);
}
820
821
/**
 * Vectorized x^y computed as 2^(y * log2(x)).
 * Inherits the accuracy limits of log2f4() and exp2f4().
 */
static INLINE __m128
powf4(__m128 x, __m128 y)
{
   return exp2f4(_mm_mul_ps(log2f4(x), y));
}
827
828 #endif /* PIPE_ARCH_SSE */
829
830
831
832 /**
833 * Low-level instruction translators.
834 */
835
836 static void
837 emit_abs(
838 struct x86_function *func,
839 unsigned xmm )
840 {
841 sse_andps(
842 func,
843 make_xmm( xmm ),
844 get_temp(
845 TGSI_EXEC_TEMP_7FFFFFFF_I,
846 TGSI_EXEC_TEMP_7FFFFFFF_C ) );
847 }
848
/* dst += src, four floats at a time. */
static void
emit_add(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   sse_addps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
860
861 static void PIPE_CDECL
862 cos4f(
863 float *store )
864 {
865 store[0] = cosf( store[0] );
866 store[1] = cosf( store[1] );
867 store[2] = cosf( store[2] );
868 store[3] = cosf( store[3] );
869 }
870
871 static void
872 emit_cos(
873 struct x86_function *func,
874 unsigned xmm_save,
875 unsigned xmm_dst )
876 {
877 emit_func_call_dst_src1(
878 func,
879 xmm_save,
880 xmm_dst,
881 xmm_dst,
882 cos4f );
883 }
884
/* Out-of-line helper: 2^x for each element of a float[4] in place.
 * Uses the SSE polynomial approximation when available, otherwise the
 * scalar util_fast_exp2() fallback.
 */
static void PIPE_CDECL
#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
__attribute__((force_align_arg_pointer))
#endif
ex24f(
   float *store )
{
#if defined(PIPE_ARCH_SSE)
   _mm_store_ps(&store[0], exp2f4( _mm_load_ps(&store[0]) ));
#else
   store[0] = util_fast_exp2( store[0] );
   store[1] = util_fast_exp2( store[1] );
   store[2] = util_fast_exp2( store[2] );
   store[3] = util_fast_exp2( store[3] );
#endif
}
901
902 static void
903 emit_ex2(
904 struct x86_function *func,
905 unsigned xmm_save,
906 unsigned xmm_dst )
907 {
908 emit_func_call_dst_src1(
909 func,
910 xmm_save,
911 xmm_dst,
912 xmm_dst,
913 ex24f );
914 }
915
/* Convert four floats to four ints, truncating toward zero. */
static void
emit_f2it(
   struct x86_function *func,
   unsigned xmm )
{
   sse2_cvttps2dq( func, make_xmm( xmm ), make_xmm( xmm ) );
}
926
/* Convert four ints to four floats. */
static void
emit_i2f(
   struct x86_function *func,
   unsigned xmm )
{
   sse2_cvtdq2ps( func, make_xmm( xmm ), make_xmm( xmm ) );
}
937
938 static void PIPE_CDECL
939 flr4f(
940 float *store )
941 {
942 store[0] = floorf( store[0] );
943 store[1] = floorf( store[1] );
944 store[2] = floorf( store[2] );
945 store[3] = floorf( store[3] );
946 }
947
948 static void
949 emit_flr(
950 struct x86_function *func,
951 unsigned xmm_save,
952 unsigned xmm_dst )
953 {
954 emit_func_call_dst_src1(
955 func,
956 xmm_save,
957 xmm_dst,
958 xmm_dst,
959 flr4f );
960 }
961
962 static void PIPE_CDECL
963 frc4f(
964 float *store )
965 {
966 store[0] -= floorf( store[0] );
967 store[1] -= floorf( store[1] );
968 store[2] -= floorf( store[2] );
969 store[3] -= floorf( store[3] );
970 }
971
972 static void
973 emit_frc(
974 struct x86_function *func,
975 unsigned xmm_save,
976 unsigned xmm_dst )
977 {
978 emit_func_call_dst_src1(
979 func,
980 xmm_save,
981 xmm_dst,
982 xmm_dst,
983 frc4f );
984 }
985
/* Out-of-line helper: log2 of each element of a float[4] in place.
 * Uses the SSE polynomial approximation when available, otherwise the
 * scalar util_fast_log2() fallback.
 */
static void PIPE_CDECL
#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
__attribute__((force_align_arg_pointer))
#endif
lg24f(
   float *store )
{
#if defined(PIPE_ARCH_SSE)
   _mm_store_ps(&store[0], log2f4( _mm_load_ps(&store[0]) ));
#else
   store[0] = util_fast_log2( store[0] );
   store[1] = util_fast_log2( store[1] );
   store[2] = util_fast_log2( store[2] );
   store[3] = util_fast_log2( store[3] );
#endif
}
1002
1003 static void
1004 emit_lg2(
1005 struct x86_function *func,
1006 unsigned xmm_save,
1007 unsigned xmm_dst )
1008 {
1009 emit_func_call_dst_src1(
1010 func,
1011 xmm_save,
1012 xmm_dst,
1013 xmm_dst,
1014 lg24f );
1015 }
1016
/* Register-to-register move of four floats. */
static void
emit_MOV(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   sse_movups( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
1028
/* dst *= src, four floats at a time. */
static void
emit_mul (struct x86_function *func,
          unsigned xmm_dst,
          unsigned xmm_src)
{
   sse_mulps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
1039
1040 static void
1041 emit_neg(
1042 struct x86_function *func,
1043 unsigned xmm )
1044 {
1045 sse_xorps(
1046 func,
1047 make_xmm( xmm ),
1048 get_temp(
1049 TGSI_EXEC_TEMP_80000000_I,
1050 TGSI_EXEC_TEMP_80000000_C ) );
1051 }
1052
/* Out-of-line helper: store[i] = store[i] ^ store[i+4] for i in 0..3.
 * The base values live in store[0..3], the exponents in store[4..7];
 * results overwrite the bases.  Uses the SSE approximation when
 * available, otherwise scalar util_fast_pow().
 */
static void PIPE_CDECL
#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
__attribute__((force_align_arg_pointer))
#endif
pow4f(
   float *store )
{
#if defined(PIPE_ARCH_SSE)
   _mm_store_ps(&store[0], powf4( _mm_load_ps(&store[0]), _mm_load_ps(&store[4]) ));
#else
   store[0] = util_fast_pow( store[0], store[4] );
   store[1] = util_fast_pow( store[1], store[5] );
   store[2] = util_fast_pow( store[2], store[6] );
   store[3] = util_fast_pow( store[3], store[7] );
#endif
}
1069
1070 static void
1071 emit_pow(
1072 struct x86_function *func,
1073 unsigned xmm_save,
1074 unsigned xmm_dst,
1075 unsigned xmm_src0,
1076 unsigned xmm_src1 )
1077 {
1078 emit_func_call_dst_src2(
1079 func,
1080 xmm_save,
1081 xmm_dst,
1082 xmm_src0,
1083 xmm_src1,
1084 pow4f );
1085 }
1086
/* Approximate reciprocal: dst = ~1/src. */
static void
emit_rcp (
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   /* On Intel CPUs at least, this is only accurate to 12 bits -- not
    * good enough.  Need to either emit a proper divide or use the
    * iterative technique described below in emit_rsqrt().
    */
   sse2_rcpps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
1102
1103 static void PIPE_CDECL
1104 rnd4f(
1105 float *store )
1106 {
1107 store[0] = floorf( store[0] + 0.5f );
1108 store[1] = floorf( store[1] + 0.5f );
1109 store[2] = floorf( store[2] + 0.5f );
1110 store[3] = floorf( store[3] + 0.5f );
1111 }
1112
1113 static void
1114 emit_rnd(
1115 struct x86_function *func,
1116 unsigned xmm_save,
1117 unsigned xmm_dst )
1118 {
1119 emit_func_call_dst_src1(
1120 func,
1121 xmm_save,
1122 xmm_dst,
1123 xmm_dst,
1124 rnd4f );
1125 }
1126
/**
 * Emit code computing 1/sqrt(src) into dst.
 *
 * NOTE(review): in the HIGH_PRECISION path, xmm_src and xmm2/xmm3 are
 * clobbered as scratch -- callers must not expect them preserved.
 */
static void
emit_rsqrt(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
#if HIGH_PRECISION
   /* Although rsqrtps() and rcpps() are low precision on some/all SSE
    * implementations, it is possible to improve its precision at
    * fairly low cost, using a newton/raphson step, as below:
    *
    * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
    * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
    *
    * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
    */
   {
      struct x86_reg dst = make_xmm( xmm_dst );
      struct x86_reg src = make_xmm( xmm_src );
      struct x86_reg tmp0 = make_xmm( 2 );
      struct x86_reg tmp1 = make_xmm( 3 );

      /* xmm2/xmm3 are hardwired scratch; refuse to overlap them. */
      assert( xmm_dst != xmm_src );
      assert( xmm_dst != 2 && xmm_dst != 3 );
      assert( xmm_src != 2 && xmm_src != 3 );

      sse_movaps( func, dst, get_temp( TGSI_EXEC_TEMP_HALF_I, TGSI_EXEC_TEMP_HALF_C ) );    /* dst = 0.5 */
      sse_movaps( func, tmp0, get_temp( TGSI_EXEC_TEMP_THREE_I, TGSI_EXEC_TEMP_THREE_C ) ); /* tmp0 = 3.0 */
      sse_rsqrtps( func, tmp1, src );   /* tmp1 = r ~= rsqrt(a) */
      sse_mulps( func, src, tmp1 );     /* src = a * r */
      sse_mulps( func, dst, tmp1 );     /* dst = 0.5 * r */
      sse_mulps( func, src, tmp1 );     /* src = a * r * r */
      sse_subps( func, tmp0, src );     /* tmp0 = 3 - a*r*r */
      sse_mulps( func, dst, tmp0 );     /* dst = 0.5 * r * (3 - a*r*r) */
   }
#else
   /* On Intel CPUs at least, this is only accurate to 12 bits -- not
    * good enough.
    */
   sse_rsqrtps(
      func,
      make_xmm( xmm_dst ),
      make_xmm( xmm_src ) );
#endif
}
1172
1173 static void
1174 emit_setsign(
1175 struct x86_function *func,
1176 unsigned xmm )
1177 {
1178 sse_orps(
1179 func,
1180 make_xmm( xmm ),
1181 get_temp(
1182 TGSI_EXEC_TEMP_80000000_I,
1183 TGSI_EXEC_TEMP_80000000_C ) );
1184 }
1185
1186 static void PIPE_CDECL
1187 sgn4f(
1188 float *store )
1189 {
1190 store[0] = store[0] < 0.0f ? -1.0f : store[0] > 0.0f ? 1.0f : 0.0f;
1191 store[1] = store[1] < 0.0f ? -1.0f : store[1] > 0.0f ? 1.0f : 0.0f;
1192 store[2] = store[2] < 0.0f ? -1.0f : store[2] > 0.0f ? 1.0f : 0.0f;
1193 store[3] = store[3] < 0.0f ? -1.0f : store[3] > 0.0f ? 1.0f : 0.0f;
1194 }
1195
1196 static void
1197 emit_sgn(
1198 struct x86_function *func,
1199 unsigned xmm_save,
1200 unsigned xmm_dst )
1201 {
1202 emit_func_call_dst_src1(
1203 func,
1204 xmm_save,
1205 xmm_dst,
1206 xmm_dst,
1207 sgn4f );
1208 }
1209
1210 static void PIPE_CDECL
1211 sin4f(
1212 float *store )
1213 {
1214 store[0] = sinf( store[0] );
1215 store[1] = sinf( store[1] );
1216 store[2] = sinf( store[2] );
1217 store[3] = sinf( store[3] );
1218 }
1219
1220 static void
1221 emit_sin (struct x86_function *func,
1222 unsigned xmm_save,
1223 unsigned xmm_dst)
1224 {
1225 emit_func_call_dst_src1(
1226 func,
1227 xmm_save,
1228 xmm_dst,
1229 xmm_dst,
1230 sin4f );
1231 }
1232
/* dst -= src, four floats at a time. */
static void
emit_sub(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   sse_subps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
1244
1245
1246
1247
1248
1249
1250
1251 /**
1252 * Register fetch.
1253 */
1254
/**
 * Emit code fetching one channel of a source operand into an xmm
 * register, applying the operand's extended swizzle and sign mode.
 *
 * \param xmm destination xmm register
 * \param reg full source register description
 * \param chan_index which dst channel is being computed (selects swizzle)
 */
static void
emit_fetch(
   struct x86_function *func,
   unsigned xmm,
   const struct tgsi_full_src_register *reg,
   const unsigned chan_index )
{
   unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index );

   switch (swizzle) {
   case TGSI_EXTSWIZZLE_X:
   case TGSI_EXTSWIZZLE_Y:
   case TGSI_EXTSWIZZLE_Z:
   case TGSI_EXTSWIZZLE_W:
      /* Plain component select: dispatch on the source register file. */
      switch (reg->SrcRegister.File) {
      case TGSI_FILE_CONSTANT:
         emit_const(
            func,
            xmm,
            reg->SrcRegister.Index,
            swizzle,
            reg->SrcRegister.Indirect,
            reg->SrcRegisterInd.File,
            reg->SrcRegisterInd.Index );
         break;

      case TGSI_FILE_IMMEDIATE:
         emit_immediate(
            func,
            xmm,
            reg->SrcRegister.Index,
            swizzle );
         break;

      case TGSI_FILE_INPUT:
         emit_inputf(
            func,
            xmm,
            reg->SrcRegister.Index,
            swizzle );
         break;

      case TGSI_FILE_TEMPORARY:
         emit_tempf(
            func,
            xmm,
            reg->SrcRegister.Index,
            swizzle );
         break;

      default:
         assert( 0 );
      }
      break;

   case TGSI_EXTSWIZZLE_ZERO:
      /* Constant 0.0 lives in a reserved temp slot. */
      emit_tempf(
         func,
         xmm,
         TGSI_EXEC_TEMP_00000000_I,
         TGSI_EXEC_TEMP_00000000_C );
      break;

   case TGSI_EXTSWIZZLE_ONE:
      /* Constant 1.0 lives in a reserved temp slot. */
      emit_tempf(
         func,
         xmm,
         TEMP_ONE_I,
         TEMP_ONE_C );
      break;

   default:
      assert( 0 );
   }

   /* Apply the operand's sign mode on top of the fetched value. */
   switch( tgsi_util_get_full_src_register_sign_mode( reg, chan_index ) ) {
   case TGSI_UTIL_SIGN_CLEAR:
      emit_abs( func, xmm );
      break;

   case TGSI_UTIL_SIGN_SET:
      emit_setsign( func, xmm );
      break;

   case TGSI_UTIL_SIGN_TOGGLE:
      emit_neg( func, xmm );
      break;

   case TGSI_UTIL_SIGN_KEEP:
      break;
   }
}
1347
/* Fetch src operand INDEX's channel CHAN into xmm register XMM. */
#define FETCH( FUNC, INST, XMM, INDEX, CHAN )\
   emit_fetch( FUNC, XMM, &(INST).FullSrcRegisters[INDEX], CHAN )
1350
1351 /**
1352 * Register store.
1353 */
1354
/**
 * Emit code storing an xmm register to one channel of the instruction's
 * destination operand.
 *
 * NOTE(review): saturation is not implemented here -- TGSI_SAT_ZERO_ONE
 * is silently accepted (the assert is commented out) and
 * TGSI_SAT_MINUS_PLUS_ONE asserts.  Results of saturating instructions
 * are therefore unclamped.
 *
 * \param xmm source xmm register
 * \param reg full destination register description
 * \param inst the instruction (consulted for the saturate mode)
 * \param chan_index dest channel to store (X, Y, Z or W)
 */
static void
emit_store(
   struct x86_function *func,
   unsigned xmm,
   const struct tgsi_full_dst_register *reg,
   const struct tgsi_full_instruction *inst,
   unsigned chan_index )
{
   switch( reg->DstRegister.File ) {
   case TGSI_FILE_OUTPUT:
      emit_output(
         func,
         xmm,
         reg->DstRegister.Index,
         chan_index );
      break;

   case TGSI_FILE_TEMPORARY:
      emit_temps(
         func,
         xmm,
         reg->DstRegister.Index,
         chan_index );
      break;

   case TGSI_FILE_ADDRESS:
      emit_addrs(
         func,
         xmm,
         reg->DstRegister.Index,
         chan_index );
      break;

   default:
      assert( 0 );
   }

   switch( inst->Instruction.Saturate ) {
   case TGSI_SAT_NONE:
      break;

   case TGSI_SAT_ZERO_ONE:
      /* assert( 0 ); */
      break;

   case TGSI_SAT_MINUS_PLUS_ONE:
      assert( 0 );
      break;
   }
}
1405
/* Store xmm register XMM to dst operand INDEX's channel CHAN. */
#define STORE( FUNC, INST, XMM, INDEX, CHAN )\
   emit_store( FUNC, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN )
1408
1409
/**
 * Out-of-line texture sampling callback, invoked from generated code.
 * 'store' is the shared parameter buffer: on entry it holds the s/t/p
 * texcoords for the quad (store[0..3], store[4..7], store[8..11]) and
 * the lod bias in store[12..15]; on exit it holds the sampled RGBA,
 * one channel per 4-float group (SoA).
 *
 * NOTE(review): the lod bias is currently ignored -- 0.0f is passed to
 * get_samples() instead of store[12].
 */
static void PIPE_CDECL
fetch_texel( struct tgsi_sampler **sampler,
             float *store )
{
#if 0
   uint j;

   debug_printf("%s sampler: %p (%p) store: %p\n",
                __FUNCTION__,
                sampler, *sampler,
                store );

   debug_printf("lodbias %f\n", store[12]);

   for (j = 0; j < 4; j++)
      debug_printf("sample %d texcoord %f %f\n",
                   j,
                   store[0+j],
                   store[4+j]);
#endif

   {
      float rgba[NUM_CHANNELS][QUAD_SIZE];
      (*sampler)->get_samples(*sampler,
                              &store[0],
                              &store[4],
                              &store[8],
                              0.0f, /*store[12],  lodbias */
                              rgba);

      /* Copy the sampled quad back into the shared buffer. */
      memcpy( store, rgba, 16 * sizeof(float));
   }

#if 0
   for (j = 0; j < 4; j++)
      debug_printf("sample %d result %f %f %f %f\n",
                   j,
                   store[0+j],
                   store[4+j],
                   store[8+j],
                   store[12+j]);
#endif
}
1453
1454 /**
1455 * High-level instruction translators.
1456 */
1457
/**
 * Emit code for a TEX/TXB/TXL/TXP instruction.
 *
 * Texcoords (and a lod bias value) are staged into the TEMP_R0 scratch
 * area, fetch_texel() is invoked via emit_func_call, and the results
 * are copied from TEMP_R0 into the enabled destination channels.
 *
 * \param lodbias    fetch src[0].w as a per-quad bias (TXB/TXL path);
 *                   otherwise 0.0 is stored.
 * \param projected  multiply the texcoords by 1/src[0].w (TXP path).
 */
static void
emit_tex( struct x86_function *func,
          const struct tgsi_full_instruction *inst,
          boolean lodbias,
          boolean projected)
{
   const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index;
   struct x86_reg args[2];
   unsigned count;
   unsigned i;

   /* Number of texcoord components to fetch for this target. */
   switch (inst->InstructionExtTexture.Texture) {
   case TGSI_TEXTURE_1D:
      count = 1;
      break;
   case TGSI_TEXTURE_2D:
   case TGSI_TEXTURE_RECT:
      count = 2;
      break;
   case TGSI_TEXTURE_SHADOW1D:
   case TGSI_TEXTURE_SHADOW2D:
   case TGSI_TEXTURE_SHADOWRECT:
   case TGSI_TEXTURE_3D:
   case TGSI_TEXTURE_CUBE:
      count = 3;
      break;
   default:
      assert(0);
      return;
   }

   if (lodbias) {
      /* xmm3 = src[0].w (bias) */
      FETCH( func, *inst, 3, 0, 3 );
   }
   else {
      /* xmm3 = 0.0 */
      emit_tempf(
         func,
         3,
         TGSI_EXEC_TEMP_00000000_I,
         TGSI_EXEC_TEMP_00000000_C );

   }

   /* store lodbias whether enabled or not -- fetch_texel currently
    * respects it always.
    */
   sse_movaps( func,
               get_temp( TEMP_R0, 3 ),
               make_xmm( 3 ) );


   if (projected) {
      /* xmm3 = 1 / src[0].w */
      FETCH( func, *inst, 3, 0, 3 );

      emit_rcp( func, 3, 3 );
   }

   for (i = 0; i < count; i++) {
      FETCH( func, *inst, i, 0, i );

      if (projected) {
         /* texcoord *= 1/w */
         sse_mulps(
            func,
            make_xmm( i ),
            make_xmm( 3 ) );
      }

      /* Store in the argument buffer:
       */
      sse_movaps(
         func,
         get_temp( TEMP_R0, i ),
         make_xmm( i ) );
   }

   args[0] = get_temp( TEMP_R0, 0 );
   args[1] = get_sampler_ptr( unit );


   emit_func_call( func,
                   0,
                   args,
                   Elements(args),
                   fetch_texel );

   /* If all four channels are enabled, could use a pointer to
    * dst[0].x instead of TEMP_R0 for store?
    */
   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, i ) {

      sse_movaps(
         func,
         make_xmm( 0 ),
         get_temp( TEMP_R0, i ) );

      STORE( func, *inst, 0, 0, i );
   }
}
1556
1557
/**
 * Emit code for the KIL (conditional kill) instruction.
 *
 * Each distinct source component (after swizzling; ZERO/ONE swizzles
 * are pre-marked tested since they can never be negative) is compared
 * against 0.0 with cc_LessThan.  The per-quad comparison masks are
 * collected via movmskps into EAX (ORed through EDX for subsequent
 * components) and finally ORed into the machine's KILMASK temp.
 * EAX/EDX are saved and restored around the sequence.
 */
static void
emit_kil(
   struct x86_function *func,
   const struct tgsi_full_src_register *reg )
{
   unsigned uniquemask;
   unsigned unique_count = 0;
   unsigned chan_index;
   unsigned i;

   /* This mask stores component bits that were already tested. Note that
    * we test if the value is less than zero, so 1.0 and 0.0 need not to be
    * tested. */
   uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE);

   FOR_EACH_CHANNEL( chan_index ) {
      unsigned swizzle;

      /* unswizzle channel */
      swizzle = tgsi_util_get_full_src_register_extswizzle(
         reg,
         chan_index );

      /* check if the component has not been already tested */
      if( !(uniquemask & (1 << swizzle)) ) {
         uniquemask |= 1 << swizzle;

         /* allocate register */
         emit_fetch(
            func,
            unique_count++,
            reg,
            chan_index );
      }
   }

   /* Preserve EAX/EDX, which are used as scratch below. */
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_AX ) );
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_DX ) );

   for (i = 0 ; i < unique_count; i++ ) {
      struct x86_reg dataXMM = make_xmm(i);

      /* dataXMM = all-ones per element where value < 0.0 */
      sse_cmpps(
         func,
         dataXMM,
         get_temp(
            TGSI_EXEC_TEMP_00000000_I,
            TGSI_EXEC_TEMP_00000000_C ),
         cc_LessThan );

      if( i == 0 ) {
         sse_movmskps(
            func,
            x86_make_reg( file_REG32, reg_AX ),
            dataXMM );
      }
      else {
         /* accumulate further component masks into EAX */
         sse_movmskps(
            func,
            x86_make_reg( file_REG32, reg_DX ),
            dataXMM );
         x86_or(
            func,
            x86_make_reg( file_REG32, reg_AX ),
            x86_make_reg( file_REG32, reg_DX ) );
      }
   }

   /* OR the accumulated mask into the machine's kill mask. */
   x86_or(
      func,
      get_temp(
         TGSI_EXEC_TEMP_KILMASK_I,
         TGSI_EXEC_TEMP_KILMASK_C ),
      x86_make_reg( file_REG32, reg_AX ) );

   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_DX ) );
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_AX ) );
}
1644
1645
/**
 * Emit code for the KILP (predicated kill) instruction.
 * Not implemented: generates no code (the caller returns 0 so the
 * shader falls back to the interpreter).
 */
static void
emit_kilp(
   struct x86_function *func )
{
   /* XXX todo / fix me */
}
1652
1653
1654 static void
1655 emit_setcc(
1656 struct x86_function *func,
1657 struct tgsi_full_instruction *inst,
1658 enum sse_cc cc )
1659 {
1660 unsigned chan_index;
1661
1662 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1663 FETCH( func, *inst, 0, 0, chan_index );
1664 FETCH( func, *inst, 1, 1, chan_index );
1665 sse_cmpps(
1666 func,
1667 make_xmm( 0 ),
1668 make_xmm( 1 ),
1669 cc );
1670 sse_andps(
1671 func,
1672 make_xmm( 0 ),
1673 get_temp(
1674 TEMP_ONE_I,
1675 TEMP_ONE_C ) );
1676 STORE( func, *inst, 0, 0, chan_index );
1677 }
1678 }
1679
1680 static void
1681 emit_cmp(
1682 struct x86_function *func,
1683 struct tgsi_full_instruction *inst )
1684 {
1685 unsigned chan_index;
1686
1687 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1688 FETCH( func, *inst, 0, 0, chan_index );
1689 FETCH( func, *inst, 1, 1, chan_index );
1690 FETCH( func, *inst, 2, 2, chan_index );
1691 sse_cmpps(
1692 func,
1693 make_xmm( 0 ),
1694 get_temp(
1695 TGSI_EXEC_TEMP_00000000_I,
1696 TGSI_EXEC_TEMP_00000000_C ),
1697 cc_LessThan );
1698 sse_andps(
1699 func,
1700 make_xmm( 1 ),
1701 make_xmm( 0 ) );
1702 sse_andnps(
1703 func,
1704 make_xmm( 0 ),
1705 make_xmm( 2 ) );
1706 sse_orps(
1707 func,
1708 make_xmm( 0 ),
1709 make_xmm( 1 ) );
1710 STORE( func, *inst, 0, 0, chan_index );
1711 }
1712 }
1713
1714
1715 /**
1716 * Check if inst src/dest regs use indirect addressing into temporary
1717 * register file.
1718 */
1719 static boolean
1720 indirect_temp_reference(const struct tgsi_full_instruction *inst)
1721 {
1722 uint i;
1723 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1724 const struct tgsi_full_src_register *reg = &inst->FullSrcRegisters[i];
1725 if (reg->SrcRegister.File == TGSI_FILE_TEMPORARY &&
1726 reg->SrcRegister.Indirect)
1727 return TRUE;
1728 }
1729 for (i = 0; i < inst->Instruction.NumDstRegs; i++) {
1730 const struct tgsi_full_dst_register *reg = &inst->FullDstRegisters[i];
1731 if (reg->DstRegister.File == TGSI_FILE_TEMPORARY &&
1732 reg->DstRegister.Indirect)
1733 return TRUE;
1734 }
1735 return FALSE;
1736 }
1737
1738
1739 static int
1740 emit_instruction(
1741 struct x86_function *func,
1742 struct tgsi_full_instruction *inst )
1743 {
1744 unsigned chan_index;
1745
1746 /* we can't handle indirect addressing into temp register file yet */
1747 if (indirect_temp_reference(inst))
1748 return FALSE;
1749
1750 /* we don't handle saturation/clamping yet */
1751 if (inst->Instruction.Saturate != TGSI_SAT_NONE)
1752 return FALSE;
1753
1754 /* need to use extra temps to fix SOA dependencies : */
1755 if (tgsi_check_soa_dependencies(inst))
1756 return FALSE;
1757
1758 switch (inst->Instruction.Opcode) {
1759 case TGSI_OPCODE_ARL:
1760 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1761 FETCH( func, *inst, 0, 0, chan_index );
1762 emit_flr(func, 0, 0);
1763 emit_f2it( func, 0 );
1764 STORE( func, *inst, 0, 0, chan_index );
1765 }
1766 break;
1767
1768 case TGSI_OPCODE_MOV:
1769 case TGSI_OPCODE_SWZ:
1770 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1771 FETCH( func, *inst, 0, 0, chan_index );
1772 STORE( func, *inst, 0, 0, chan_index );
1773 }
1774 break;
1775
1776 case TGSI_OPCODE_LIT:
1777 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1778 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1779 emit_tempf(
1780 func,
1781 0,
1782 TEMP_ONE_I,
1783 TEMP_ONE_C);
1784 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
1785 STORE( func, *inst, 0, 0, CHAN_X );
1786 }
1787 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1788 STORE( func, *inst, 0, 0, CHAN_W );
1789 }
1790 }
1791 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1792 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1793 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
1794 FETCH( func, *inst, 0, 0, CHAN_X );
1795 sse_maxps(
1796 func,
1797 make_xmm( 0 ),
1798 get_temp(
1799 TGSI_EXEC_TEMP_00000000_I,
1800 TGSI_EXEC_TEMP_00000000_C ) );
1801 STORE( func, *inst, 0, 0, CHAN_Y );
1802 }
1803 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1804 /* XMM[1] = SrcReg[0].yyyy */
1805 FETCH( func, *inst, 1, 0, CHAN_Y );
1806 /* XMM[1] = max(XMM[1], 0) */
1807 sse_maxps(
1808 func,
1809 make_xmm( 1 ),
1810 get_temp(
1811 TGSI_EXEC_TEMP_00000000_I,
1812 TGSI_EXEC_TEMP_00000000_C ) );
1813 /* XMM[2] = SrcReg[0].wwww */
1814 FETCH( func, *inst, 2, 0, CHAN_W );
1815 /* XMM[2] = min(XMM[2], 128.0) */
1816 sse_minps(
1817 func,
1818 make_xmm( 2 ),
1819 get_temp(
1820 TGSI_EXEC_TEMP_128_I,
1821 TGSI_EXEC_TEMP_128_C ) );
1822 /* XMM[2] = max(XMM[2], -128.0) */
1823 sse_maxps(
1824 func,
1825 make_xmm( 2 ),
1826 get_temp(
1827 TGSI_EXEC_TEMP_MINUS_128_I,
1828 TGSI_EXEC_TEMP_MINUS_128_C ) );
1829 emit_pow( func, 3, 1, 1, 2 );
1830 FETCH( func, *inst, 0, 0, CHAN_X );
1831 sse_xorps(
1832 func,
1833 make_xmm( 2 ),
1834 make_xmm( 2 ) );
1835 sse_cmpps(
1836 func,
1837 make_xmm( 2 ),
1838 make_xmm( 0 ),
1839 cc_LessThan );
1840 sse_andps(
1841 func,
1842 make_xmm( 2 ),
1843 make_xmm( 1 ) );
1844 STORE( func, *inst, 2, 0, CHAN_Z );
1845 }
1846 }
1847 break;
1848
1849 case TGSI_OPCODE_RCP:
1850 /* TGSI_OPCODE_RECIP */
1851 FETCH( func, *inst, 0, 0, CHAN_X );
1852 emit_rcp( func, 0, 0 );
1853 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1854 STORE( func, *inst, 0, 0, chan_index );
1855 }
1856 break;
1857
1858 case TGSI_OPCODE_RSQ:
1859 /* TGSI_OPCODE_RECIPSQRT */
1860 FETCH( func, *inst, 0, 0, CHAN_X );
1861 emit_abs( func, 0 );
1862 emit_rsqrt( func, 1, 0 );
1863 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1864 STORE( func, *inst, 1, 0, chan_index );
1865 }
1866 break;
1867
1868 case TGSI_OPCODE_EXP:
1869 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1870 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1871 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1872 FETCH( func, *inst, 0, 0, CHAN_X );
1873 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1874 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1875 emit_MOV( func, 1, 0 );
1876 emit_flr( func, 2, 1 );
1877 /* dst.x = ex2(floor(src.x)) */
1878 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
1879 emit_MOV( func, 2, 1 );
1880 emit_ex2( func, 3, 2 );
1881 STORE( func, *inst, 2, 0, CHAN_X );
1882 }
1883 /* dst.y = src.x - floor(src.x) */
1884 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1885 emit_MOV( func, 2, 0 );
1886 emit_sub( func, 2, 1 );
1887 STORE( func, *inst, 2, 0, CHAN_Y );
1888 }
1889 }
1890 /* dst.z = ex2(src.x) */
1891 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1892 emit_ex2( func, 3, 0 );
1893 STORE( func, *inst, 0, 0, CHAN_Z );
1894 }
1895 }
1896 /* dst.w = 1.0 */
1897 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
1898 emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
1899 STORE( func, *inst, 0, 0, CHAN_W );
1900 }
1901 break;
1902
1903 case TGSI_OPCODE_LOG:
1904 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1905 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1906 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1907 FETCH( func, *inst, 0, 0, CHAN_X );
1908 emit_abs( func, 0 );
1909 emit_MOV( func, 1, 0 );
1910 emit_lg2( func, 2, 1 );
1911 /* dst.z = lg2(abs(src.x)) */
1912 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1913 STORE( func, *inst, 1, 0, CHAN_Z );
1914 }
1915 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1916 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1917 emit_flr( func, 2, 1 );
1918 /* dst.x = floor(lg2(abs(src.x))) */
1919 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
1920 STORE( func, *inst, 1, 0, CHAN_X );
1921 }
1922 /* dst.x = abs(src)/ex2(floor(lg2(abs(src.x)))) */
1923 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1924 emit_ex2( func, 2, 1 );
1925 emit_rcp( func, 1, 1 );
1926 emit_mul( func, 0, 1 );
1927 STORE( func, *inst, 0, 0, CHAN_Y );
1928 }
1929 }
1930 }
1931 /* dst.w = 1.0 */
1932 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
1933 emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
1934 STORE( func, *inst, 0, 0, CHAN_W );
1935 }
1936 break;
1937
1938 case TGSI_OPCODE_MUL:
1939 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1940 FETCH( func, *inst, 0, 0, chan_index );
1941 FETCH( func, *inst, 1, 1, chan_index );
1942 emit_mul( func, 0, 1 );
1943 STORE( func, *inst, 0, 0, chan_index );
1944 }
1945 break;
1946
1947 case TGSI_OPCODE_ADD:
1948 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1949 FETCH( func, *inst, 0, 0, chan_index );
1950 FETCH( func, *inst, 1, 1, chan_index );
1951 emit_add( func, 0, 1 );
1952 STORE( func, *inst, 0, 0, chan_index );
1953 }
1954 break;
1955
1956 case TGSI_OPCODE_DP3:
1957 /* TGSI_OPCODE_DOT3 */
1958 FETCH( func, *inst, 0, 0, CHAN_X );
1959 FETCH( func, *inst, 1, 1, CHAN_X );
1960 emit_mul( func, 0, 1 );
1961 FETCH( func, *inst, 1, 0, CHAN_Y );
1962 FETCH( func, *inst, 2, 1, CHAN_Y );
1963 emit_mul( func, 1, 2 );
1964 emit_add( func, 0, 1 );
1965 FETCH( func, *inst, 1, 0, CHAN_Z );
1966 FETCH( func, *inst, 2, 1, CHAN_Z );
1967 emit_mul( func, 1, 2 );
1968 emit_add( func, 0, 1 );
1969 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1970 STORE( func, *inst, 0, 0, chan_index );
1971 }
1972 break;
1973
1974 case TGSI_OPCODE_DP4:
1975 /* TGSI_OPCODE_DOT4 */
1976 FETCH( func, *inst, 0, 0, CHAN_X );
1977 FETCH( func, *inst, 1, 1, CHAN_X );
1978 emit_mul( func, 0, 1 );
1979 FETCH( func, *inst, 1, 0, CHAN_Y );
1980 FETCH( func, *inst, 2, 1, CHAN_Y );
1981 emit_mul( func, 1, 2 );
1982 emit_add( func, 0, 1 );
1983 FETCH( func, *inst, 1, 0, CHAN_Z );
1984 FETCH( func, *inst, 2, 1, CHAN_Z );
1985 emit_mul(func, 1, 2 );
1986 emit_add(func, 0, 1 );
1987 FETCH( func, *inst, 1, 0, CHAN_W );
1988 FETCH( func, *inst, 2, 1, CHAN_W );
1989 emit_mul( func, 1, 2 );
1990 emit_add( func, 0, 1 );
1991 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1992 STORE( func, *inst, 0, 0, chan_index );
1993 }
1994 break;
1995
1996 case TGSI_OPCODE_DST:
1997 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
1998 emit_tempf(
1999 func,
2000 0,
2001 TEMP_ONE_I,
2002 TEMP_ONE_C );
2003 STORE( func, *inst, 0, 0, CHAN_X );
2004 }
2005 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2006 FETCH( func, *inst, 0, 0, CHAN_Y );
2007 FETCH( func, *inst, 1, 1, CHAN_Y );
2008 emit_mul( func, 0, 1 );
2009 STORE( func, *inst, 0, 0, CHAN_Y );
2010 }
2011 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2012 FETCH( func, *inst, 0, 0, CHAN_Z );
2013 STORE( func, *inst, 0, 0, CHAN_Z );
2014 }
2015 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2016 FETCH( func, *inst, 0, 1, CHAN_W );
2017 STORE( func, *inst, 0, 0, CHAN_W );
2018 }
2019 break;
2020
2021 case TGSI_OPCODE_MIN:
2022 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2023 FETCH( func, *inst, 0, 0, chan_index );
2024 FETCH( func, *inst, 1, 1, chan_index );
2025 sse_minps(
2026 func,
2027 make_xmm( 0 ),
2028 make_xmm( 1 ) );
2029 STORE( func, *inst, 0, 0, chan_index );
2030 }
2031 break;
2032
2033 case TGSI_OPCODE_MAX:
2034 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2035 FETCH( func, *inst, 0, 0, chan_index );
2036 FETCH( func, *inst, 1, 1, chan_index );
2037 sse_maxps(
2038 func,
2039 make_xmm( 0 ),
2040 make_xmm( 1 ) );
2041 STORE( func, *inst, 0, 0, chan_index );
2042 }
2043 break;
2044
2045 case TGSI_OPCODE_SLT:
2046 /* TGSI_OPCODE_SETLT */
2047 emit_setcc( func, inst, cc_LessThan );
2048 break;
2049
2050 case TGSI_OPCODE_SGE:
2051 /* TGSI_OPCODE_SETGE */
2052 emit_setcc( func, inst, cc_NotLessThan );
2053 break;
2054
2055 case TGSI_OPCODE_MAD:
2056 /* TGSI_OPCODE_MADD */
2057 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2058 FETCH( func, *inst, 0, 0, chan_index );
2059 FETCH( func, *inst, 1, 1, chan_index );
2060 FETCH( func, *inst, 2, 2, chan_index );
2061 emit_mul( func, 0, 1 );
2062 emit_add( func, 0, 2 );
2063 STORE( func, *inst, 0, 0, chan_index );
2064 }
2065 break;
2066
2067 case TGSI_OPCODE_SUB:
2068 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2069 FETCH( func, *inst, 0, 0, chan_index );
2070 FETCH( func, *inst, 1, 1, chan_index );
2071 emit_sub( func, 0, 1 );
2072 STORE( func, *inst, 0, 0, chan_index );
2073 }
2074 break;
2075
2076 case TGSI_OPCODE_LRP:
2077 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2078 FETCH( func, *inst, 0, 0, chan_index );
2079 FETCH( func, *inst, 1, 1, chan_index );
2080 FETCH( func, *inst, 2, 2, chan_index );
2081 emit_sub( func, 1, 2 );
2082 emit_mul( func, 0, 1 );
2083 emit_add( func, 0, 2 );
2084 STORE( func, *inst, 0, 0, chan_index );
2085 }
2086 break;
2087
2088 case TGSI_OPCODE_CND:
2089 return 0;
2090 break;
2091
2092 case TGSI_OPCODE_DP2A:
2093 FETCH( func, *inst, 0, 0, CHAN_X ); /* xmm0 = src[0].x */
2094 FETCH( func, *inst, 1, 1, CHAN_X ); /* xmm1 = src[1].x */
2095 emit_mul( func, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
2096 FETCH( func, *inst, 1, 0, CHAN_Y ); /* xmm1 = src[0].y */
2097 FETCH( func, *inst, 2, 1, CHAN_Y ); /* xmm2 = src[1].y */
2098 emit_mul( func, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
2099 emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2100 FETCH( func, *inst, 1, 2, CHAN_X ); /* xmm1 = src[2].x */
2101 emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2102 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2103 STORE( func, *inst, 0, 0, chan_index ); /* dest[ch] = xmm0 */
2104 }
2105 break;
2106
2107 case TGSI_OPCODE_FRC:
2108 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2109 FETCH( func, *inst, 0, 0, chan_index );
2110 emit_frc( func, 0, 0 );
2111 STORE( func, *inst, 0, 0, chan_index );
2112 }
2113 break;
2114
2115 case TGSI_OPCODE_CLAMP:
2116 return 0;
2117 break;
2118
2119 case TGSI_OPCODE_FLR:
2120 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2121 FETCH( func, *inst, 0, 0, chan_index );
2122 emit_flr( func, 0, 0 );
2123 STORE( func, *inst, 0, 0, chan_index );
2124 }
2125 break;
2126
2127 case TGSI_OPCODE_ROUND:
2128 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2129 FETCH( func, *inst, 0, 0, chan_index );
2130 emit_rnd( func, 0, 0 );
2131 STORE( func, *inst, 0, 0, chan_index );
2132 }
2133 break;
2134
2135 case TGSI_OPCODE_EX2:
2136 FETCH( func, *inst, 0, 0, CHAN_X );
2137 emit_ex2( func, 0, 0 );
2138 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2139 STORE( func, *inst, 0, 0, chan_index );
2140 }
2141 break;
2142
2143 case TGSI_OPCODE_LG2:
2144 FETCH( func, *inst, 0, 0, CHAN_X );
2145 emit_lg2( func, 0, 0 );
2146 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2147 STORE( func, *inst, 0, 0, chan_index );
2148 }
2149 break;
2150
2151 case TGSI_OPCODE_POW:
2152 FETCH( func, *inst, 0, 0, CHAN_X );
2153 FETCH( func, *inst, 1, 1, CHAN_X );
2154 emit_pow( func, 0, 0, 0, 1 );
2155 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2156 STORE( func, *inst, 0, 0, chan_index );
2157 }
2158 break;
2159
2160 case TGSI_OPCODE_XPD:
2161 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
2162 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
2163 FETCH( func, *inst, 1, 1, CHAN_Z );
2164 FETCH( func, *inst, 3, 0, CHAN_Z );
2165 }
2166 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
2167 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
2168 FETCH( func, *inst, 0, 0, CHAN_Y );
2169 FETCH( func, *inst, 4, 1, CHAN_Y );
2170 }
2171 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
2172 emit_MOV( func, 2, 0 );
2173 emit_mul( func, 2, 1 );
2174 emit_MOV( func, 5, 3 );
2175 emit_mul( func, 5, 4 );
2176 emit_sub( func, 2, 5 );
2177 STORE( func, *inst, 2, 0, CHAN_X );
2178 }
2179 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
2180 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
2181 FETCH( func, *inst, 2, 1, CHAN_X );
2182 FETCH( func, *inst, 5, 0, CHAN_X );
2183 }
2184 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2185 emit_mul( func, 3, 2 );
2186 emit_mul( func, 1, 5 );
2187 emit_sub( func, 3, 1 );
2188 STORE( func, *inst, 3, 0, CHAN_Y );
2189 }
2190 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2191 emit_mul( func, 5, 4 );
2192 emit_mul( func, 0, 2 );
2193 emit_sub( func, 5, 0 );
2194 STORE( func, *inst, 5, 0, CHAN_Z );
2195 }
2196 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2197 emit_tempf(
2198 func,
2199 0,
2200 TEMP_ONE_I,
2201 TEMP_ONE_C );
2202 STORE( func, *inst, 0, 0, CHAN_W );
2203 }
2204 break;
2205
2206 case TGSI_OPCODE_ABS:
2207 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2208 FETCH( func, *inst, 0, 0, chan_index );
2209 emit_abs( func, 0) ;
2210
2211 STORE( func, *inst, 0, 0, chan_index );
2212 }
2213 break;
2214
2215 case TGSI_OPCODE_RCC:
2216 return 0;
2217 break;
2218
2219 case TGSI_OPCODE_DPH:
2220 FETCH( func, *inst, 0, 0, CHAN_X );
2221 FETCH( func, *inst, 1, 1, CHAN_X );
2222 emit_mul( func, 0, 1 );
2223 FETCH( func, *inst, 1, 0, CHAN_Y );
2224 FETCH( func, *inst, 2, 1, CHAN_Y );
2225 emit_mul( func, 1, 2 );
2226 emit_add( func, 0, 1 );
2227 FETCH( func, *inst, 1, 0, CHAN_Z );
2228 FETCH( func, *inst, 2, 1, CHAN_Z );
2229 emit_mul( func, 1, 2 );
2230 emit_add( func, 0, 1 );
2231 FETCH( func, *inst, 1, 1, CHAN_W );
2232 emit_add( func, 0, 1 );
2233 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2234 STORE( func, *inst, 0, 0, chan_index );
2235 }
2236 break;
2237
2238 case TGSI_OPCODE_COS:
2239 FETCH( func, *inst, 0, 0, CHAN_X );
2240 emit_cos( func, 0, 0 );
2241 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2242 STORE( func, *inst, 0, 0, chan_index );
2243 }
2244 break;
2245
2246 case TGSI_OPCODE_DDX:
2247 return 0;
2248 break;
2249
2250 case TGSI_OPCODE_DDY:
2251 return 0;
2252 break;
2253
2254 case TGSI_OPCODE_KILP:
2255 /* predicated kill */
2256 emit_kilp( func );
2257 return 0; /* XXX fix me */
2258 break;
2259
2260 case TGSI_OPCODE_KIL:
2261 /* conditional kill */
2262 emit_kil( func, &inst->FullSrcRegisters[0] );
2263 break;
2264
2265 case TGSI_OPCODE_PK2H:
2266 return 0;
2267 break;
2268
2269 case TGSI_OPCODE_PK2US:
2270 return 0;
2271 break;
2272
2273 case TGSI_OPCODE_PK4B:
2274 return 0;
2275 break;
2276
2277 case TGSI_OPCODE_PK4UB:
2278 return 0;
2279 break;
2280
2281 case TGSI_OPCODE_RFL:
2282 return 0;
2283 break;
2284
2285 case TGSI_OPCODE_SEQ:
2286 return 0;
2287 break;
2288
2289 case TGSI_OPCODE_SFL:
2290 return 0;
2291 break;
2292
2293 case TGSI_OPCODE_SGT:
2294 return 0;
2295 break;
2296
2297 case TGSI_OPCODE_SIN:
2298 FETCH( func, *inst, 0, 0, CHAN_X );
2299 emit_sin( func, 0, 0 );
2300 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2301 STORE( func, *inst, 0, 0, chan_index );
2302 }
2303 break;
2304
2305 case TGSI_OPCODE_SLE:
2306 return 0;
2307 break;
2308
2309 case TGSI_OPCODE_SNE:
2310 return 0;
2311 break;
2312
2313 case TGSI_OPCODE_STR:
2314 return 0;
2315 break;
2316
2317 case TGSI_OPCODE_TEX:
2318 emit_tex( func, inst, FALSE, FALSE );
2319 break;
2320
2321 case TGSI_OPCODE_TXD:
2322 return 0;
2323 break;
2324
2325 case TGSI_OPCODE_UP2H:
2326 return 0;
2327 break;
2328
2329 case TGSI_OPCODE_UP2US:
2330 return 0;
2331 break;
2332
2333 case TGSI_OPCODE_UP4B:
2334 return 0;
2335 break;
2336
2337 case TGSI_OPCODE_UP4UB:
2338 return 0;
2339 break;
2340
2341 case TGSI_OPCODE_X2D:
2342 return 0;
2343 break;
2344
2345 case TGSI_OPCODE_ARA:
2346 return 0;
2347 break;
2348
2349 case TGSI_OPCODE_ARR:
2350 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2351 FETCH( func, *inst, 0, 0, chan_index );
2352 emit_rnd( func, 0, 0 );
2353 emit_f2it( func, 0 );
2354 STORE( func, *inst, 0, 0, chan_index );
2355 }
2356 break;
2357
2358 case TGSI_OPCODE_BRA:
2359 return 0;
2360 break;
2361
2362 case TGSI_OPCODE_CAL:
2363 return 0;
2364 break;
2365
2366 case TGSI_OPCODE_RET:
2367 emit_ret( func );
2368 break;
2369
2370 case TGSI_OPCODE_END:
2371 break;
2372
2373 case TGSI_OPCODE_SSG:
2374 /* TGSI_OPCODE_SGN */
2375 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2376 FETCH( func, *inst, 0, 0, chan_index );
2377 emit_sgn( func, 0, 0 );
2378 STORE( func, *inst, 0, 0, chan_index );
2379 }
2380 break;
2381
2382 case TGSI_OPCODE_CMP:
2383 emit_cmp (func, inst);
2384 break;
2385
2386 case TGSI_OPCODE_SCS:
2387 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
2388 FETCH( func, *inst, 0, 0, CHAN_X );
2389 emit_cos( func, 0, 0 );
2390 STORE( func, *inst, 0, 0, CHAN_X );
2391 }
2392 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2393 FETCH( func, *inst, 0, 0, CHAN_X );
2394 emit_sin( func, 0, 0 );
2395 STORE( func, *inst, 0, 0, CHAN_Y );
2396 }
2397 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2398 emit_tempf(
2399 func,
2400 0,
2401 TGSI_EXEC_TEMP_00000000_I,
2402 TGSI_EXEC_TEMP_00000000_C );
2403 STORE( func, *inst, 0, 0, CHAN_Z );
2404 }
2405 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2406 emit_tempf(
2407 func,
2408 0,
2409 TEMP_ONE_I,
2410 TEMP_ONE_C );
2411 STORE( func, *inst, 0, 0, CHAN_W );
2412 }
2413 break;
2414
2415 case TGSI_OPCODE_TXB:
2416 emit_tex( func, inst, TRUE, FALSE );
2417 break;
2418
2419 case TGSI_OPCODE_NRM:
2420 /* fall-through */
2421 case TGSI_OPCODE_NRM4:
2422 /* 3 or 4-component normalization */
2423 {
2424 uint dims = (inst->Instruction.Opcode == TGSI_OPCODE_NRM) ? 3 : 4;
2425
2426 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) ||
2427 IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y) ||
2428 IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z) ||
2429 (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 4)) {
2430
2431 /* NOTE: Cannot use xmm regs 2/3 here (see emit_rsqrt() above). */
2432
2433 /* xmm4 = src.x */
2434 /* xmm0 = src.x * src.x */
2435 FETCH(func, *inst, 0, 0, CHAN_X);
2436 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
2437 emit_MOV(func, 4, 0);
2438 }
2439 emit_mul(func, 0, 0);
2440
2441 /* xmm5 = src.y */
2442 /* xmm0 = xmm0 + src.y * src.y */
2443 FETCH(func, *inst, 1, 0, CHAN_Y);
2444 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2445 emit_MOV(func, 5, 1);
2446 }
2447 emit_mul(func, 1, 1);
2448 emit_add(func, 0, 1);
2449
2450 /* xmm6 = src.z */
2451 /* xmm0 = xmm0 + src.z * src.z */
2452 FETCH(func, *inst, 1, 0, CHAN_Z);
2453 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2454 emit_MOV(func, 6, 1);
2455 }
2456 emit_mul(func, 1, 1);
2457 emit_add(func, 0, 1);
2458
2459 if (dims == 4) {
2460 /* xmm7 = src.w */
2461 /* xmm0 = xmm0 + src.w * src.w */
2462 FETCH(func, *inst, 1, 0, CHAN_W);
2463 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W)) {
2464 emit_MOV(func, 7, 1);
2465 }
2466 emit_mul(func, 1, 1);
2467 emit_add(func, 0, 1);
2468 }
2469
2470 /* xmm1 = 1 / sqrt(xmm0) */
2471 emit_rsqrt(func, 1, 0);
2472
2473 /* dst.x = xmm1 * src.x */
2474 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
2475 emit_mul(func, 4, 1);
2476 STORE(func, *inst, 4, 0, CHAN_X);
2477 }
2478
2479 /* dst.y = xmm1 * src.y */
2480 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2481 emit_mul(func, 5, 1);
2482 STORE(func, *inst, 5, 0, CHAN_Y);
2483 }
2484
2485 /* dst.z = xmm1 * src.z */
2486 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2487 emit_mul(func, 6, 1);
2488 STORE(func, *inst, 6, 0, CHAN_Z);
2489 }
2490
2491 /* dst.w = xmm1 * src.w */
2492 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) && dims == 4) {
2493 emit_mul(func, 7, 1);
2494 STORE(func, *inst, 7, 0, CHAN_W);
2495 }
2496 }
2497
2498 /* dst0.w = 1.0 */
2499 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 3) {
2500 emit_tempf(func, 0, TEMP_ONE_I, TEMP_ONE_C);
2501 STORE(func, *inst, 0, 0, CHAN_W);
2502 }
2503 }
2504 break;
2505
2506 case TGSI_OPCODE_DIV:
2507 return 0;
2508 break;
2509
2510 case TGSI_OPCODE_DP2:
2511 FETCH( func, *inst, 0, 0, CHAN_X ); /* xmm0 = src[0].x */
2512 FETCH( func, *inst, 1, 1, CHAN_X ); /* xmm1 = src[1].x */
2513 emit_mul( func, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
2514 FETCH( func, *inst, 1, 0, CHAN_Y ); /* xmm1 = src[0].y */
2515 FETCH( func, *inst, 2, 1, CHAN_Y ); /* xmm2 = src[1].y */
2516 emit_mul( func, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
2517 emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2518 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2519 STORE( func, *inst, 0, 0, chan_index ); /* dest[ch] = xmm0 */
2520 }
2521 break;
2522
2523 case TGSI_OPCODE_TXL:
2524 emit_tex( func, inst, TRUE, FALSE );
2525 break;
2526
2527 case TGSI_OPCODE_TXP:
2528 emit_tex( func, inst, FALSE, TRUE );
2529 break;
2530
2531 case TGSI_OPCODE_BRK:
2532 return 0;
2533 break;
2534
2535 case TGSI_OPCODE_IF:
2536 return 0;
2537 break;
2538
2539 case TGSI_OPCODE_BGNFOR:
2540 return 0;
2541 break;
2542
2543 case TGSI_OPCODE_REP:
2544 return 0;
2545 break;
2546
2547 case TGSI_OPCODE_ELSE:
2548 return 0;
2549 break;
2550
2551 case TGSI_OPCODE_ENDIF:
2552 return 0;
2553 break;
2554
2555 case TGSI_OPCODE_ENDFOR:
2556 return 0;
2557 break;
2558
2559 case TGSI_OPCODE_ENDREP:
2560 return 0;
2561 break;
2562
2563 case TGSI_OPCODE_PUSHA:
2564 return 0;
2565 break;
2566
2567 case TGSI_OPCODE_POPA:
2568 return 0;
2569 break;
2570
2571 case TGSI_OPCODE_CEIL:
2572 return 0;
2573 break;
2574
2575 case TGSI_OPCODE_I2F:
2576 return 0;
2577 break;
2578
2579 case TGSI_OPCODE_NOT:
2580 return 0;
2581 break;
2582
2583 case TGSI_OPCODE_TRUNC:
2584 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2585 FETCH( func, *inst, 0, 0, chan_index );
2586 emit_f2it( func, 0 );
2587 emit_i2f( func, 0 );
2588 STORE( func, *inst, 0, 0, chan_index );
2589 }
2590 break;
2591
2592 case TGSI_OPCODE_SHL:
2593 return 0;
2594 break;
2595
2596 case TGSI_OPCODE_SHR:
2597 return 0;
2598 break;
2599
2600 case TGSI_OPCODE_AND:
2601 return 0;
2602 break;
2603
2604 case TGSI_OPCODE_OR:
2605 return 0;
2606 break;
2607
2608 case TGSI_OPCODE_MOD:
2609 return 0;
2610 break;
2611
2612 case TGSI_OPCODE_XOR:
2613 return 0;
2614 break;
2615
2616 case TGSI_OPCODE_SAD:
2617 return 0;
2618 break;
2619
2620 case TGSI_OPCODE_TXF:
2621 return 0;
2622 break;
2623
2624 case TGSI_OPCODE_TXQ:
2625 return 0;
2626 break;
2627
2628 case TGSI_OPCODE_CONT:
2629 return 0;
2630 break;
2631
2632 case TGSI_OPCODE_EMIT:
2633 return 0;
2634 break;
2635
2636 case TGSI_OPCODE_ENDPRIM:
2637 return 0;
2638 break;
2639
2640 default:
2641 return 0;
2642 }
2643
2644 return 1;
2645 }
2646
/**
 * Emit code to compute interpolated values for one TGSI input
 * declaration (declarations for other register files are ignored).
 *
 * For each declared register/channel, depending on the interpolation
 * mode: copies the a0 coefficient (CONSTANT), evaluates
 * a0 + x*dadx + y*dady (LINEAR), or that expression divided by w
 * (PERSPECTIVE), reading x/y/w from machine temp 0.
 */
static void
emit_declaration(
   struct x86_function *func,
   struct tgsi_full_declaration *decl )
{
   if( decl->Declaration.File == TGSI_FILE_INPUT ) {
      unsigned first, last, mask;
      unsigned i, j;

      first = decl->DeclarationRange.First;
      last = decl->DeclarationRange.Last;
      mask = decl->Declaration.UsageMask;

      for( i = first; i <= last; i++ ) {
         for( j = 0; j < NUM_CHANNELS; j++ ) {
            if( mask & (1 << j) ) {
               switch( decl->Declaration.Interpolate ) {
               case TGSI_INTERPOLATE_CONSTANT:
                  emit_coef_a0( func, 0, i, j );
                  emit_inputs( func, 0, i, j );
                  break;

               case TGSI_INTERPOLATE_LINEAR:
                  emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
                  emit_coef_dadx( func, 1, i, j );
                  emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
                  emit_coef_dady( func, 3, i, j );
                  emit_mul( func, 0, 1 );    /* x * dadx */
                  emit_coef_a0( func, 4, i, j );
                  emit_mul( func, 2, 3 );    /* y * dady */
                  emit_add( func, 0, 4 );    /* x * dadx + a0 */
                  emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
                  emit_inputs( func, 0, i, j );
                  break;

               case TGSI_INTERPOLATE_PERSPECTIVE:
                  emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
                  emit_coef_dadx( func, 1, i, j );
                  emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
                  emit_coef_dady( func, 3, i, j );
                  emit_mul( func, 0, 1 );    /* x * dadx */
                  emit_tempf( func, 4, 0, TGSI_SWIZZLE_W );
                  emit_coef_a0( func, 5, i, j );
                  emit_rcp( func, 4, 4 );    /* 1.0 / w */
                  emit_mul( func, 2, 3 );    /* y * dady */
                  emit_add( func, 0, 5 );    /* x * dadx + a0 */
                  emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
                  emit_mul( func, 0, 4 );    /* (x * dadx + y * dady + a0) / w */
                  emit_inputs( func, 0, i, j );
                  break;

               default:
                  assert( 0 );
                  break;
               }
            }
         }
      }
   }
}
2707
2708 static void aos_to_soa( struct x86_function *func,
2709 uint arg_aos,
2710 uint arg_machine,
2711 uint arg_num,
2712 uint arg_stride )
2713 {
2714 struct x86_reg soa_input = x86_make_reg( file_REG32, reg_AX );
2715 struct x86_reg aos_input = x86_make_reg( file_REG32, reg_BX );
2716 struct x86_reg num_inputs = x86_make_reg( file_REG32, reg_CX );
2717 struct x86_reg stride = x86_make_reg( file_REG32, reg_DX );
2718 int inner_loop;
2719
2720
2721 /* Save EBX */
2722 x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
2723
2724 x86_mov( func, aos_input, x86_fn_arg( func, arg_aos ) );
2725 x86_mov( func, soa_input, x86_fn_arg( func, arg_machine ) );
2726 x86_lea( func, soa_input,
2727 x86_make_disp( soa_input,
2728 Offset(struct tgsi_exec_machine, Inputs) ) );
2729 x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) );
2730 x86_mov( func, stride, x86_fn_arg( func, arg_stride ) );
2731
2732 /* do */
2733 inner_loop = x86_get_label( func );
2734 {
2735 x86_push( func, aos_input );
2736 sse_movlps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
2737 sse_movlps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
2738 x86_add( func, aos_input, stride );
2739 sse_movhps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
2740 sse_movhps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
2741 x86_add( func, aos_input, stride );
2742 sse_movlps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
2743 sse_movlps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
2744 x86_add( func, aos_input, stride );
2745 sse_movhps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
2746 sse_movhps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
2747 x86_pop( func, aos_input );
2748
2749 sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
2750 sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
2751 sse_shufps( func, make_xmm( 0 ), make_xmm( 1 ), 0x88 );
2752 sse_shufps( func, make_xmm( 2 ), make_xmm( 1 ), 0xdd );
2753 sse_shufps( func, make_xmm( 3 ), make_xmm( 4 ), 0x88 );
2754 sse_shufps( func, make_xmm( 5 ), make_xmm( 4 ), 0xdd );
2755
2756 sse_movups( func, x86_make_disp( soa_input, 0 ), make_xmm( 0 ) );
2757 sse_movups( func, x86_make_disp( soa_input, 16 ), make_xmm( 2 ) );
2758 sse_movups( func, x86_make_disp( soa_input, 32 ), make_xmm( 3 ) );
2759 sse_movups( func, x86_make_disp( soa_input, 48 ), make_xmm( 5 ) );
2760
2761 /* Advance to next input */
2762 x86_lea( func, aos_input, x86_make_disp(aos_input, 16) );
2763 x86_lea( func, soa_input, x86_make_disp(soa_input, 64) );
2764 }
2765 /* while --num_inputs */
2766 x86_dec( func, num_inputs );
2767 x86_jcc( func, cc_NE, inner_loop );
2768
2769 /* Restore EBX */
2770 x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
2771 }
2772
2773 static void soa_to_aos( struct x86_function *func,
2774 uint arg_aos,
2775 uint arg_machine,
2776 uint arg_num,
2777 uint arg_stride )
2778 {
2779 struct x86_reg soa_output = x86_make_reg( file_REG32, reg_AX );
2780 struct x86_reg aos_output = x86_make_reg( file_REG32, reg_BX );
2781 struct x86_reg num_outputs = x86_make_reg( file_REG32, reg_CX );
2782 struct x86_reg temp = x86_make_reg( file_REG32, reg_DX );
2783 int inner_loop;
2784
2785 /* Save EBX */
2786 x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
2787
2788 x86_mov( func, aos_output, x86_fn_arg( func, arg_aos ) );
2789 x86_mov( func, soa_output, x86_fn_arg( func, arg_machine ) );
2790 x86_lea( func, soa_output,
2791 x86_make_disp( soa_output,
2792 Offset(struct tgsi_exec_machine, Outputs) ) );
2793 x86_mov( func, num_outputs, x86_fn_arg( func, arg_num ) );
2794
2795 /* do */
2796 inner_loop = x86_get_label( func );
2797 {
2798 sse_movups( func, make_xmm( 0 ), x86_make_disp( soa_output, 0 ) );
2799 sse_movups( func, make_xmm( 1 ), x86_make_disp( soa_output, 16 ) );
2800 sse_movups( func, make_xmm( 3 ), x86_make_disp( soa_output, 32 ) );
2801 sse_movups( func, make_xmm( 4 ), x86_make_disp( soa_output, 48 ) );
2802
2803 sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
2804 sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
2805 sse_unpcklps( func, make_xmm( 0 ), make_xmm( 1 ) );
2806 sse_unpckhps( func, make_xmm( 2 ), make_xmm( 1 ) );
2807 sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) );
2808 sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) );
2809
2810 x86_mov( func, temp, x86_fn_arg( func, arg_stride ) );
2811 x86_push( func, aos_output );
2812 sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
2813 sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
2814 x86_add( func, aos_output, temp );
2815 sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
2816 sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
2817 x86_add( func, aos_output, temp );
2818 sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
2819 sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
2820 x86_add( func, aos_output, temp );
2821 sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
2822 sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
2823 x86_pop( func, aos_output );
2824
2825 /* Advance to next output */
2826 x86_lea( func, aos_output, x86_make_disp(aos_output, 16) );
2827 x86_lea( func, soa_output, x86_make_disp(soa_output, 64) );
2828 }
2829 /* while --num_outputs */
2830 x86_dec( func, num_outputs );
2831 x86_jcc( func, cc_NE, inner_loop );
2832
2833 /* Restore EBX */
2834 x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
2835 }
2836
2837 /**
2838 * Translate a TGSI vertex/fragment shader to SSE2 code.
2839 * Slightly different things are done for vertex vs. fragment shaders.
2840 *
2841 * \param tokens the TGSI input shader
2842 * \param func the output SSE code/function
2843 * \param immediates buffer to place immediates, later passed to SSE func
2844 * \param return 1 for success, 0 if translation failed
2845 */
2846 unsigned
2847 tgsi_emit_sse2(
2848 const struct tgsi_token *tokens,
2849 struct x86_function *func,
2850 float (*immediates)[4],
2851 boolean do_swizzles )
2852 {
2853 struct tgsi_parse_context parse;
2854 unsigned ok = 1;
2855 uint num_immediates = 0;
2856
2857 util_init_math();
2858
2859 func->csr = func->store;
2860
2861 tgsi_parse_init( &parse, tokens );
2862
2863 /* Can't just use EDI, EBX without save/restoring them:
2864 */
2865 x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
2866 x86_push( func, x86_make_reg( file_REG32, reg_DI ) );
2867
2868 /*
2869 * Different function args for vertex/fragment shaders:
2870 */
2871 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
2872 if (do_swizzles)
2873 aos_to_soa( func,
2874 4, /* aos_input */
2875 1, /* machine */
2876 5, /* num_inputs */
2877 6 ); /* input_stride */
2878 }
2879
2880 x86_mov(
2881 func,
2882 get_machine_base(),
2883 x86_fn_arg( func, 1 ) );
2884 x86_mov(
2885 func,
2886 get_const_base(),
2887 x86_fn_arg( func, 2 ) );
2888 x86_mov(
2889 func,
2890 get_immediate_base(),
2891 x86_fn_arg( func, 3 ) );
2892
2893 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
2894 x86_mov(
2895 func,
2896 get_coef_base(),
2897 x86_fn_arg( func, 4 ) );
2898 }
2899
2900 x86_mov(
2901 func,
2902 get_sampler_base(),
2903 x86_make_disp( get_machine_base(),
2904 Offset( struct tgsi_exec_machine, Samplers ) ) );
2905
2906
2907 while( !tgsi_parse_end_of_tokens( &parse ) && ok ) {
2908 tgsi_parse_token( &parse );
2909
2910 switch( parse.FullToken.Token.Type ) {
2911 case TGSI_TOKEN_TYPE_DECLARATION:
2912 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
2913 emit_declaration(
2914 func,
2915 &parse.FullToken.FullDeclaration );
2916 }
2917 break;
2918
2919 case TGSI_TOKEN_TYPE_INSTRUCTION:
2920 ok = emit_instruction(
2921 func,
2922 &parse.FullToken.FullInstruction );
2923
2924 if (!ok) {
2925 uint opcode = parse.FullToken.FullInstruction.Instruction.Opcode;
2926 debug_printf("failed to translate tgsi opcode %d (%s) to SSE (%s)\n",
2927 opcode,
2928 tgsi_get_opcode_name(opcode),
2929 parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX ?
2930 "vertex shader" : "fragment shader");
2931 }
2932 break;
2933
2934 case TGSI_TOKEN_TYPE_IMMEDIATE:
2935 /* simply copy the immediate values into the next immediates[] slot */
2936 {
2937 const uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
2938 uint i;
2939 assert(size <= 4);
2940 assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES);
2941 for( i = 0; i < size; i++ ) {
2942 immediates[num_immediates][i] =
2943 parse.FullToken.FullImmediate.u[i].Float;
2944 }
2945 #if 0
2946 debug_printf("SSE FS immediate[%d] = %f %f %f %f\n",
2947 num_immediates,
2948 immediates[num_immediates][0],
2949 immediates[num_immediates][1],
2950 immediates[num_immediates][2],
2951 immediates[num_immediates][3]);
2952 #endif
2953 num_immediates++;
2954 }
2955 break;
2956
2957 default:
2958 ok = 0;
2959 assert( 0 );
2960 }
2961 }
2962
2963 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
2964 if (do_swizzles)
2965 soa_to_aos( func,
2966 7, /* aos_output */
2967 1, /* machine */
2968 8, /* num_outputs */
2969 9 ); /* output_stride */
2970 }
2971
2972 /* Can't just use EBX, EDI without save/restoring them:
2973 */
2974 x86_pop( func, x86_make_reg( file_REG32, reg_DI ) );
2975 x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
2976
2977 emit_ret( func );
2978
2979 tgsi_parse_free( &parse );
2980
2981 return ok;
2982 }
2983
2984 #endif /* PIPE_ARCH_X86 */
2985