Merge branch 'mesa_7_6_branch'
[mesa.git] / src / gallium / auxiliary / tgsi / tgsi_sse2.c
1 /**************************************************************************
2 *
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28 #include "pipe/p_config.h"
29
30 #if defined(PIPE_ARCH_X86)
31
32 #include "util/u_debug.h"
33 #include "pipe/p_shader_tokens.h"
34 #include "util/u_math.h"
35 #include "util/u_memory.h"
36 #if defined(PIPE_ARCH_SSE)
37 #include "util/u_sse.h"
38 #endif
39 #include "tgsi/tgsi_info.h"
40 #include "tgsi/tgsi_parse.h"
41 #include "tgsi/tgsi_util.h"
42 #include "tgsi/tgsi_dump.h"
43 #include "tgsi/tgsi_exec.h"
44 #include "tgsi/tgsi_sse2.h"
45
46 #include "rtasm/rtasm_x86sse.h"
47
48 /* for 1/sqrt()
49 *
50 * This costs about 100fps (close to 10%) in gears:
51 */
52 #define HIGH_PRECISION 1
53
54 #define FAST_MATH 1
55
56
57 #define FOR_EACH_CHANNEL( CHAN )\
58 for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)
59
60 #define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
61 ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))
62
63 #define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
64 if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))
65
66 #define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
67 FOR_EACH_CHANNEL( CHAN )\
68 IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )
69
70 #define CHAN_X 0
71 #define CHAN_Y 1
72 #define CHAN_Z 2
73 #define CHAN_W 3
74
75 #define TEMP_ONE_I TGSI_EXEC_TEMP_ONE_I
76 #define TEMP_ONE_C TGSI_EXEC_TEMP_ONE_C
77
78 #define TEMP_R0 TGSI_EXEC_TEMP_R0
79 #define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
80 #define TEMP_EXEC_MASK_I TGSI_EXEC_MASK_I
81 #define TEMP_EXEC_MASK_C TGSI_EXEC_MASK_C
82
83
84 /**
85 * X86 utility functions.
86 */
87
88 static struct x86_reg
89 make_xmm(
90 unsigned xmm )
91 {
92 return x86_make_reg(
93 file_XMM,
94 (enum x86_reg_name) xmm );
95 }
96
97 /**
98 * X86 register mapping helpers.
99 */
100
101 static struct x86_reg
102 get_const_base( void )
103 {
104 return x86_make_reg(
105 file_REG32,
106 reg_AX );
107 }
108
109 static struct x86_reg
110 get_machine_base( void )
111 {
112 return x86_make_reg(
113 file_REG32,
114 reg_CX );
115 }
116
117 static struct x86_reg
118 get_input_base( void )
119 {
120 return x86_make_disp(
121 get_machine_base(),
122 Offset(struct tgsi_exec_machine, Inputs) );
123 }
124
125 static struct x86_reg
126 get_output_base( void )
127 {
128 return x86_make_disp(
129 get_machine_base(),
130 Offset(struct tgsi_exec_machine, Outputs) );
131 }
132
133 static struct x86_reg
134 get_temp_base( void )
135 {
136 return x86_make_disp(
137 get_machine_base(),
138 Offset(struct tgsi_exec_machine, Temps) );
139 }
140
141 static struct x86_reg
142 get_coef_base( void )
143 {
144 return x86_make_reg(
145 file_REG32,
146 reg_BX );
147 }
148
149 static struct x86_reg
150 get_sampler_base( void )
151 {
152 return x86_make_reg(
153 file_REG32,
154 reg_DI );
155 }
156
157 static struct x86_reg
158 get_immediate_base( void )
159 {
160 return x86_make_reg(
161 file_REG32,
162 reg_DX );
163 }
164
165
166 /**
167 * Data access helpers.
168 */
169
170
171 static struct x86_reg
172 get_immediate(
173 unsigned vec,
174 unsigned chan )
175 {
176 return x86_make_disp(
177 get_immediate_base(),
178 (vec * 4 + chan) * 4 );
179 }
180
181 static struct x86_reg
182 get_const(
183 unsigned vec,
184 unsigned chan )
185 {
186 return x86_make_disp(
187 get_const_base(),
188 (vec * 4 + chan) * 4 );
189 }
190
191 static struct x86_reg
192 get_sampler_ptr(
193 unsigned unit )
194 {
195 return x86_make_disp(
196 get_sampler_base(),
197 unit * sizeof( struct tgsi_sampler * ) );
198 }
199
200 static struct x86_reg
201 get_input(
202 unsigned vec,
203 unsigned chan )
204 {
205 return x86_make_disp(
206 get_input_base(),
207 (vec * 4 + chan) * 16 );
208 }
209
210 static struct x86_reg
211 get_output(
212 unsigned vec,
213 unsigned chan )
214 {
215 return x86_make_disp(
216 get_output_base(),
217 (vec * 4 + chan) * 16 );
218 }
219
220 static struct x86_reg
221 get_temp(
222 unsigned vec,
223 unsigned chan )
224 {
225 return x86_make_disp(
226 get_temp_base(),
227 (vec * 4 + chan) * 16 );
228 }
229
230 static struct x86_reg
231 get_coef(
232 unsigned vec,
233 unsigned chan,
234 unsigned member )
235 {
236 return x86_make_disp(
237 get_coef_base(),
238 ((vec * 3 + member) * 4 + chan) * 4 );
239 }
240
241
/** Emit a return instruction, ending the generated function. */
static void
emit_ret( struct x86_function *func )
{
   x86_ret( func );
}
248
249
250 /**
251 * Data fetch helpers.
252 */
253
254 /**
255 * Copy a shader constant to xmm register
256 * \param xmm the destination xmm register
257 * \param vec the src const buffer index
258 * \param chan src channel to fetch (X, Y, Z or W)
259 */
static void
emit_const(
   struct x86_function *func,
   uint xmm,
   int vec,
   uint chan,
   uint indirect,
   uint indirectFile,
   int indirectIndex )
{
   if (indirect) {
      /* 'vec' is the offset from the address register's value.
       * We're loading CONST[ADDR+vec] into an xmm register.
       */
      struct x86_reg r0 = get_immediate_base();
      struct x86_reg r1 = get_coef_base();
      uint i;

      /* Only ADDR[0] indirection is supported by this code path. */
      assert( indirectFile == TGSI_FILE_ADDRESS );
      assert( indirectIndex == 0 );
      assert( r0.mod == mod_REG );
      assert( r1.mod == mod_REG );

      /* r0/r1 double as scratch GP registers here; preserve their
       * normal roles (immediate base / coef base) across the loop.
       */
      x86_push( func, r0 );
      x86_push( func, r1 );

      /*
       * Loop over the four pixels or vertices in the quad.
       * Get the value of the address (offset) register for pixel/vertex[i],
       * add it to the src offset and index into the constant buffer.
       * Note that we're working on SOA data.
       * If any of the pixel/vertex execution channels are unused their
       * values will be garbage.  It's very important that we don't use
       * those garbage values as indexes into the constant buffer since
       * that'll cause segfaults.
       * The solution is to bitwise-AND the offset with the execution mask
       * register whose values are either 0 or ~0.
       * The caller must setup the execution mask register to indicate
       * which channels are valid/alive before running the shader.
       * The execution mask will also figure into loops and conditionals
       * someday.
       */
      for (i = 0; i < QUAD_SIZE; i++) {
         /* r1 = address register[i] */
         x86_mov( func, r1, x86_make_disp( get_temp( TEMP_ADDR, CHAN_X ), i * 4 ) );
         /* r0 = execution mask[i] */
         x86_mov( func, r0, x86_make_disp( get_temp( TEMP_EXEC_MASK_I, TEMP_EXEC_MASK_C ), i * 4 ) );
         /* r1 = r1 & r0 (zero the index for dead channels) */
         x86_and( func, r1, r0 );
         /* r0 = 'vec', the offset */
         x86_lea( func, r0, get_const( vec, chan ) );

         /* Quick hack to multiply r1 by 16 -- need to add SHL to rtasm.
          * (each constant vector is 16 bytes)
          */
         x86_add( func, r1, r1 );
         x86_add( func, r1, r1 );
         x86_add( func, r1, r1 );
         x86_add( func, r1, r1 );

         x86_add( func, r0, r1 );    /* r0 = r0 + r1 */
         /* load the selected constant and stash it in TEMP_R0[i] */
         x86_mov( func, r1, x86_deref( r0 ) );
         x86_mov( func, x86_make_disp( get_temp( TEMP_R0, CHAN_X ), i * 4 ), r1 );
      }

      x86_pop( func, r1 );
      x86_pop( func, r0 );

      /* Finally move the four gathered scalars into the xmm register. */
      sse_movaps(
         func,
         make_xmm( xmm ),
         get_temp( TEMP_R0, CHAN_X ) );
   }
   else {
      /* 'vec' is the index into the src register file, such as TEMP[vec] */
      assert( vec >= 0 );

      /* Load the scalar, then broadcast it to all four SOA lanes. */
      sse_movss(
         func,
         make_xmm( xmm ),
         get_const( vec, chan ) );
      sse_shufps(
         func,
         make_xmm( xmm ),
         make_xmm( xmm ),
         SHUF( 0, 0, 0, 0 ) );
   }
}
347
348 static void
349 emit_immediate(
350 struct x86_function *func,
351 unsigned xmm,
352 unsigned vec,
353 unsigned chan )
354 {
355 sse_movss(
356 func,
357 make_xmm( xmm ),
358 get_immediate( vec, chan ) );
359 sse_shufps(
360 func,
361 make_xmm( xmm ),
362 make_xmm( xmm ),
363 SHUF( 0, 0, 0, 0 ) );
364 }
365
366
367 /**
368 * Copy a shader input to xmm register
369 * \param xmm the destination xmm register
370 * \param vec the src input attrib
371 * \param chan src channel to fetch (X, Y, Z or W)
372 */
static void
emit_inputf(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   /* whole SOA quad in one unaligned load */
   sse_movups( func, make_xmm( xmm ), get_input( vec, chan ) );
}
385
386 /**
387 * Store an xmm register to a shader output
388 * \param xmm the source xmm register
389 * \param vec the dest output attrib
390 * \param chan src dest channel to store (X, Y, Z or W)
391 */
static void
emit_output(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   /* store the whole SOA quad (unaligned) */
   sse_movups( func, get_output( vec, chan ), make_xmm( xmm ) );
}
404
405 /**
406 * Copy a shader temporary to xmm register
407 * \param xmm the destination xmm register
408 * \param vec the src temp register
409 * \param chan src channel to fetch (X, Y, Z or W)
410 */
static void
emit_tempf(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   /* temps are 16-byte aligned, so an aligned load is safe */
   sse_movaps( func, make_xmm( xmm ), get_temp( vec, chan ) );
}
423
424 /**
425 * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
426 * \param xmm the destination xmm register
427 * \param vec the src input/attribute coefficient index
428 * \param chan src channel to fetch (X, Y, Z or W)
429 * \param member 0=a0, 1=dadx, 2=dady
430 */
431 static void
432 emit_coef(
433 struct x86_function *func,
434 unsigned xmm,
435 unsigned vec,
436 unsigned chan,
437 unsigned member )
438 {
439 sse_movss(
440 func,
441 make_xmm( xmm ),
442 get_coef( vec, chan, member ) );
443 sse_shufps(
444 func,
445 make_xmm( xmm ),
446 make_xmm( xmm ),
447 SHUF( 0, 0, 0, 0 ) );
448 }
449
450 /**
451 * Data store helpers.
452 */
453
/** Store xmm to input attrib 'vec'/'chan' (unaligned). */
static void
emit_inputs(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   sse_movups( func, get_input( vec, chan ), make_xmm( xmm ) );
}
466
/** Store xmm to temp register 'vec'/'chan' (aligned). */
static void
emit_temps(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   sse_movaps( func, get_temp( vec, chan ), make_xmm( xmm ) );
}
479
480 static void
481 emit_addrs(
482 struct x86_function *func,
483 unsigned xmm,
484 unsigned vec,
485 unsigned chan )
486 {
487 assert( vec == 0 );
488
489 emit_temps(
490 func,
491 xmm,
492 vec + TGSI_EXEC_TEMP_ADDR,
493 chan );
494 }
495
496 /**
 * Coefficient fetch helpers.
498 */
499
/** Fetch the a0 (constant) interpolation coefficient. */
static void
emit_coef_a0(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   emit_coef( func, xmm, vec, chan, 0 );
}
514
/** Fetch the dadx (x-derivative) interpolation coefficient. */
static void
emit_coef_dadx(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   emit_coef( func, xmm, vec, chan, 1 );
}
529
/** Fetch the dady (y-derivative) interpolation coefficient. */
static void
emit_coef_dady(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   emit_coef( func, xmm, vec, chan, 2 );
}
544
545 /**
546 * Function call helpers.
547 */
548
549 /**
550 * NOTE: In gcc, if the destination uses the SSE intrinsics, then it must be
551 * defined with __attribute__((force_align_arg_pointer)), as we do not guarantee
552 * that the stack pointer is 16 byte aligned, as expected.
553 */
/**
 * Emit a call from generated code to the C function 'code'.
 *
 * Saves EAX/ECX/EDX and the XMM registers named in 'xmm_save_mask'
 * (bit i => xmm i) around the call, pushes the addresses in 'arg'
 * as cdecl arguments, then restores everything.
 */
static void
emit_func_call(
   struct x86_function *func,
   unsigned xmm_save_mask,
   const struct x86_reg *arg,
   unsigned nr_args,
   void (PIPE_CDECL *code)() )
{
   struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
   unsigned i, n;

   /* Save caller-saved GP registers. */
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_AX) );
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_CX) );
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_DX) );

   /* Store XMM regs to the stack
    */
   /* First count how many XMM slots we need... */
   for(i = 0, n = 0; i < 8; ++i)
      if(xmm_save_mask & (1 << i))
         ++n;

   /* ...reserve stack space for them in one adjustment... */
   x86_sub_imm(
      func,
      x86_make_reg( file_REG32, reg_SP ),
      n*16);

   /* ...then spill each saved register to its slot. */
   for(i = 0, n = 0; i < 8; ++i)
      if(xmm_save_mask & (1 << i)) {
         sse_movups(
            func,
            x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ),
            make_xmm( i ) );
         ++n;
      }

   for (i = 0; i < nr_args; i++) {
      /* Load the address of the buffer we use for passing arguments and
       * receiving results:
       */
      x86_lea(
         func,
         ecx,
         arg[i] );

      /* Push actual function arguments (currently just the pointer to
       * the buffer above), and call the function:
       */
      x86_push( func, ecx );
   }

   /* Indirect call through ECX (cdecl: callee may clobber ECX). */
   x86_mov_reg_imm( func, ecx, (unsigned long) code );
   x86_call( func, ecx );

   /* Pop the arguments (or just add an immediate to esp)
    */
   for (i = 0; i < nr_args; i++) {
      x86_pop(func, ecx );
   }

   /* Pop the saved XMM regs:
    */
   for(i = 0, n = 0; i < 8; ++i)
      if(xmm_save_mask & (1 << i)) {
         sse_movups(
            func,
            make_xmm( i ),
            x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ) );
         ++n;
      }

   /* Release the XMM spill area (n now equals the saved-register count). */
   x86_add_imm(
      func,
      x86_make_reg( file_REG32, reg_SP ),
      n*16);

   /* Restore GP registers in a reverse order.
    */
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_DX) );
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_CX) );
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_AX) );
}
647
648 static void
649 emit_func_call_dst_src1(
650 struct x86_function *func,
651 unsigned xmm_save,
652 unsigned xmm_dst,
653 unsigned xmm_src0,
654 void (PIPE_CDECL *code)() )
655 {
656 struct x86_reg store = get_temp( TEMP_R0, 0 );
657 unsigned xmm_mask = ((1 << xmm_save) - 1) & ~(1 << xmm_dst);
658
659 /* Store our input parameters (in xmm regs) to the buffer we use
660 * for passing arguments. We will pass a pointer to this buffer as
661 * the actual function argument.
662 */
663 sse_movaps(
664 func,
665 store,
666 make_xmm( xmm_src0 ) );
667
668 emit_func_call( func,
669 xmm_mask,
670 &store,
671 1,
672 code );
673
674 sse_movaps(
675 func,
676 make_xmm( xmm_dst ),
677 store );
678 }
679
680
681 static void
682 emit_func_call_dst_src2(
683 struct x86_function *func,
684 unsigned xmm_save,
685 unsigned xmm_dst,
686 unsigned xmm_src0,
687 unsigned xmm_src1,
688 void (PIPE_CDECL *code)() )
689 {
690 struct x86_reg store = get_temp( TEMP_R0, 0 );
691 unsigned xmm_mask = ((1 << xmm_save) - 1) & ~(1 << xmm_dst);
692
693 /* Store two inputs to parameter buffer.
694 */
695 sse_movaps(
696 func,
697 store,
698 make_xmm( xmm_src0 ) );
699
700 sse_movaps(
701 func,
702 x86_make_disp( store, 4 * sizeof(float) ),
703 make_xmm( xmm_src1 ) );
704
705
706 /* Emit the call
707 */
708 emit_func_call( func,
709 xmm_mask,
710 &store,
711 1,
712 code );
713
714 /* Retrieve the results:
715 */
716 sse_movaps(
717 func,
718 make_xmm( xmm_dst ),
719 store );
720 }
721
722
723
724
725
726 #if defined(PIPE_ARCH_SSE)
727
728 /*
729 * Fast SSE2 implementation of special math functions.
730 */
731
732 #define POLY0(x, c0) _mm_set1_ps(c0)
733 #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
734 #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
735 #define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
736 #define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
737 #define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
738
739 #define EXP_POLY_DEGREE 3
740 #define LOG_POLY_DEGREE 5
741
742 /**
743 * See http://www.devmaster.net/forums/showthread.php?p=43580
744 */
/**
 * Vectorized 2^x approximation for four floats.
 * Splits x into integer and fractional parts; the integer part becomes
 * the float exponent directly, the fraction is handled by a minimax
 * polynomial of degree EXP_POLY_DEGREE.
 * See http://www.devmaster.net/forums/showthread.php?p=43580
 */
static INLINE __m128
exp2f4(__m128 x)
{
   __m128i ipart;
   __m128 fpart, expipart, expfpart;

   /* clamp to the exactly-representable exponent range */
   x = _mm_min_ps(x, _mm_set1_ps( 129.00000f));
   x = _mm_max_ps(x, _mm_set1_ps(-126.99999f));

   /* ipart = int(x - 0.5) */
   ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f)));

   /* fpart = x - ipart */
   fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart));

   /* expipart = (float) (1 << ipart) -- built by biasing the exponent field */
   expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23));

   /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
#if EXP_POLY_DEGREE == 5
   expfpart = POLY5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f);
#elif EXP_POLY_DEGREE == 4
   expfpart = POLY4(fpart, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 5.2011464e-2f, 1.3534167e-2f);
#elif EXP_POLY_DEGREE == 3
   expfpart = POLY3(fpart, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 7.8024521e-2f);
#elif EXP_POLY_DEGREE == 2
   expfpart = POLY2(fpart, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f);
#else
#error
#endif

   return _mm_mul_ps(expipart, expfpart);
}
778
779
780 /**
781 * See http://www.devmaster.net/forums/showthread.php?p=43580
782 */
/**
 * Vectorized log2(x) approximation for four floats.
 * Decomposes x into exponent and mantissa via its IEEE-754 bit pattern,
 * then evaluates a minimax polynomial on the mantissa.
 * See http://www.devmaster.net/forums/showthread.php?p=43580
 */
static INLINE __m128
log2f4(__m128 x)
{
   __m128i expmask = _mm_set1_epi32(0x7f800000);
   __m128i mantmask = _mm_set1_epi32(0x007fffff);
   __m128 one = _mm_set1_ps(1.0f);

   __m128i i = _mm_castps_si128(x);

   /* exp = (float) exponent(x) */
   __m128 exp = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, expmask), 23), _mm_set1_epi32(127)));

   /* mant = (float) mantissa(x) -- normalized into [1, 2[ by OR-ing in 1.0 */
   __m128 mant = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, mantmask)), one);

   __m128 logmant;

   /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
    * These coefficients can be generate with
    * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
    */
#if LOG_POLY_DEGREE == 6
   logmant = POLY5(mant, 3.11578814719469302614f, -3.32419399085241980044f, 2.59883907202499966007f, -1.23152682416275988241f, 0.318212422185251071475f, -0.0344359067839062357313f);
#elif LOG_POLY_DEGREE == 5
   logmant = POLY4(mant, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
   logmant = POLY3(mant, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
   logmant = POLY2(mant, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
#else
#error
#endif

   /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
   logmant = _mm_mul_ps(logmant, _mm_sub_ps(mant, one));

   return _mm_add_ps(logmant, exp);
}
821
822
823 static INLINE __m128
824 powf4(__m128 x, __m128 y)
825 {
826 return exp2f4(_mm_mul_ps(log2f4(x), y));
827 }
828
829 #endif /* PIPE_ARCH_SSE */
830
831
832
833 /**
834 * Low-level instruction translators.
835 */
836
837 static void
838 emit_abs(
839 struct x86_function *func,
840 unsigned xmm )
841 {
842 sse_andps(
843 func,
844 make_xmm( xmm ),
845 get_temp(
846 TGSI_EXEC_TEMP_7FFFFFFF_I,
847 TGSI_EXEC_TEMP_7FFFFFFF_C ) );
848 }
849
/** dst += src, componentwise. */
static void
emit_add(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   sse_addps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
861
862 static void PIPE_CDECL
863 cos4f(
864 float *store )
865 {
866 store[0] = cosf( store[0] );
867 store[1] = cosf( store[1] );
868 store[2] = cosf( store[2] );
869 store[3] = cosf( store[3] );
870 }
871
872 static void
873 emit_cos(
874 struct x86_function *func,
875 unsigned xmm_save,
876 unsigned xmm_dst )
877 {
878 emit_func_call_dst_src1(
879 func,
880 xmm_save,
881 xmm_dst,
882 xmm_dst,
883 cos4f );
884 }
885
/** In-place approximate 2^x of the four floats in 'store'. */
static void PIPE_CDECL
#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
__attribute__((force_align_arg_pointer))
#endif
ex24f(
   float *store )
{
#if defined(PIPE_ARCH_SSE)
   _mm_store_ps(&store[0], exp2f4( _mm_load_ps(&store[0]) ));
#else
   store[0] = util_fast_exp2( store[0] );
   store[1] = util_fast_exp2( store[1] );
   store[2] = util_fast_exp2( store[2] );
   store[3] = util_fast_exp2( store[3] );
#endif
}
902
903 static void
904 emit_ex2(
905 struct x86_function *func,
906 unsigned xmm_save,
907 unsigned xmm_dst )
908 {
909 emit_func_call_dst_src1(
910 func,
911 xmm_save,
912 xmm_dst,
913 xmm_dst,
914 ex24f );
915 }
916
917 static void
918 emit_f2it(
919 struct x86_function *func,
920 unsigned xmm )
921 {
922 sse2_cvttps2dq(
923 func,
924 make_xmm( xmm ),
925 make_xmm( xmm ) );
926 }
927
928 static void
929 emit_i2f(
930 struct x86_function *func,
931 unsigned xmm )
932 {
933 sse2_cvtdq2ps(
934 func,
935 make_xmm( xmm ),
936 make_xmm( xmm ) );
937 }
938
939 static void PIPE_CDECL
940 flr4f(
941 float *store )
942 {
943 store[0] = floorf( store[0] );
944 store[1] = floorf( store[1] );
945 store[2] = floorf( store[2] );
946 store[3] = floorf( store[3] );
947 }
948
949 static void
950 emit_flr(
951 struct x86_function *func,
952 unsigned xmm_save,
953 unsigned xmm_dst )
954 {
955 emit_func_call_dst_src1(
956 func,
957 xmm_save,
958 xmm_dst,
959 xmm_dst,
960 flr4f );
961 }
962
963 static void PIPE_CDECL
964 frc4f(
965 float *store )
966 {
967 store[0] -= floorf( store[0] );
968 store[1] -= floorf( store[1] );
969 store[2] -= floorf( store[2] );
970 store[3] -= floorf( store[3] );
971 }
972
973 static void
974 emit_frc(
975 struct x86_function *func,
976 unsigned xmm_save,
977 unsigned xmm_dst )
978 {
979 emit_func_call_dst_src1(
980 func,
981 xmm_save,
982 xmm_dst,
983 xmm_dst,
984 frc4f );
985 }
986
/** In-place approximate log2(x) of the four floats in 'store'. */
static void PIPE_CDECL
#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
__attribute__((force_align_arg_pointer))
#endif
lg24f(
   float *store )
{
#if defined(PIPE_ARCH_SSE)
   _mm_store_ps(&store[0], log2f4( _mm_load_ps(&store[0]) ));
#else
   store[0] = util_fast_log2( store[0] );
   store[1] = util_fast_log2( store[1] );
   store[2] = util_fast_log2( store[2] );
   store[3] = util_fast_log2( store[3] );
#endif
}
1003
1004 static void
1005 emit_lg2(
1006 struct x86_function *func,
1007 unsigned xmm_save,
1008 unsigned xmm_dst )
1009 {
1010 emit_func_call_dst_src1(
1011 func,
1012 xmm_save,
1013 xmm_dst,
1014 xmm_dst,
1015 lg24f );
1016 }
1017
/** Register-to-register copy of a full quad. */
static void
emit_MOV(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   sse_movups( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
1029
/** dst *= src, componentwise. */
static void
emit_mul(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   sse_mulps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
1040
1041 static void
1042 emit_neg(
1043 struct x86_function *func,
1044 unsigned xmm )
1045 {
1046 sse_xorps(
1047 func,
1048 make_xmm( xmm ),
1049 get_temp(
1050 TGSI_EXEC_TEMP_80000000_I,
1051 TGSI_EXEC_TEMP_80000000_C ) );
1052 }
1053
/**
 * In-place approximate pow: store[i] = store[i] ** store[i+4] for i in 0..3.
 * The two operands arrive packed back-to-back in the argument buffer.
 */
static void PIPE_CDECL
#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
__attribute__((force_align_arg_pointer))
#endif
pow4f(
   float *store )
{
#if defined(PIPE_ARCH_SSE)
   _mm_store_ps(&store[0], powf4( _mm_load_ps(&store[0]), _mm_load_ps(&store[4]) ));
#else
   store[0] = util_fast_pow( store[0], store[4] );
   store[1] = util_fast_pow( store[1], store[5] );
   store[2] = util_fast_pow( store[2], store[6] );
   store[3] = util_fast_pow( store[3], store[7] );
#endif
}
1070
1071 static void
1072 emit_pow(
1073 struct x86_function *func,
1074 unsigned xmm_save,
1075 unsigned xmm_dst,
1076 unsigned xmm_src0,
1077 unsigned xmm_src1 )
1078 {
1079 emit_func_call_dst_src2(
1080 func,
1081 xmm_save,
1082 xmm_dst,
1083 xmm_src0,
1084 xmm_src1,
1085 pow4f );
1086 }
1087
/** dst = approximate 1/src via rcpps. */
static void
emit_rcp (
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   /* On Intel CPUs at least, rcpps is only accurate to 12 bits -- not
    * good enough.  Need to either emit a proper divide or use the
    * iterative technique described below in emit_rsqrt().
    */
   sse2_rcpps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
1103
1104 static void PIPE_CDECL
1105 rnd4f(
1106 float *store )
1107 {
1108 store[0] = floorf( store[0] + 0.5f );
1109 store[1] = floorf( store[1] + 0.5f );
1110 store[2] = floorf( store[2] + 0.5f );
1111 store[3] = floorf( store[3] + 0.5f );
1112 }
1113
1114 static void
1115 emit_rnd(
1116 struct x86_function *func,
1117 unsigned xmm_save,
1118 unsigned xmm_dst )
1119 {
1120 emit_func_call_dst_src1(
1121 func,
1122 xmm_save,
1123 xmm_dst,
1124 xmm_dst,
1125 rnd4f );
1126 }
1127
/**
 * dst = 1/sqrt(src).
 * NOTE: with HIGH_PRECISION this clobbers xmm_src and scratch regs xmm2/xmm3.
 */
static void
emit_rsqrt(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
#if HIGH_PRECISION
   /* Although rsqrtps() and rcpps() are low precision on some/all SSE
    * implementations, it is possible to improve its precision at
    * fairly low cost, using a newton/raphson step, as below:
    *
    * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
    * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
    *
    * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
    */
   {
      struct x86_reg dst = make_xmm( xmm_dst );
      struct x86_reg src = make_xmm( xmm_src );
      struct x86_reg tmp0 = make_xmm( 2 );
      struct x86_reg tmp1 = make_xmm( 3 );

      /* xmm2/xmm3 are used as scratch, so neither operand may live there */
      assert( xmm_dst != xmm_src );
      assert( xmm_dst != 2 && xmm_dst != 3 );
      assert( xmm_src != 2 && xmm_src != 3 );

      sse_movaps( func, dst, get_temp( TGSI_EXEC_TEMP_HALF_I, TGSI_EXEC_TEMP_HALF_C ) );   /* dst  = 0.5           */
      sse_movaps( func, tmp0, get_temp( TGSI_EXEC_TEMP_THREE_I, TGSI_EXEC_TEMP_THREE_C ) ); /* tmp0 = 3.0           */
      sse_rsqrtps( func, tmp1, src );  /* tmp1 = r ~ rsqrt(a)  */
      sse_mulps( func, src, tmp1 );    /* src  = a*r  (src clobbered from here on) */
      sse_mulps( func, dst, tmp1 );    /* dst  = 0.5*r         */
      sse_mulps( func, src, tmp1 );    /* src  = a*r*r         */
      sse_subps( func, tmp0, src );    /* tmp0 = 3 - a*r*r     */
      sse_mulps( func, dst, tmp0 );    /* dst  = 0.5*r*(3 - a*r*r) */
   }
#else
   /* On Intel CPUs at least, this is only accurate to 12 bits -- not
    * good enough.
    */
   sse_rsqrtps(
      func,
      make_xmm( xmm_dst ),
      make_xmm( xmm_src ) );
#endif
}
1173
1174 static void
1175 emit_setsign(
1176 struct x86_function *func,
1177 unsigned xmm )
1178 {
1179 sse_orps(
1180 func,
1181 make_xmm( xmm ),
1182 get_temp(
1183 TGSI_EXEC_TEMP_80000000_I,
1184 TGSI_EXEC_TEMP_80000000_C ) );
1185 }
1186
1187 static void PIPE_CDECL
1188 sgn4f(
1189 float *store )
1190 {
1191 store[0] = store[0] < 0.0f ? -1.0f : store[0] > 0.0f ? 1.0f : 0.0f;
1192 store[1] = store[1] < 0.0f ? -1.0f : store[1] > 0.0f ? 1.0f : 0.0f;
1193 store[2] = store[2] < 0.0f ? -1.0f : store[2] > 0.0f ? 1.0f : 0.0f;
1194 store[3] = store[3] < 0.0f ? -1.0f : store[3] > 0.0f ? 1.0f : 0.0f;
1195 }
1196
1197 static void
1198 emit_sgn(
1199 struct x86_function *func,
1200 unsigned xmm_save,
1201 unsigned xmm_dst )
1202 {
1203 emit_func_call_dst_src1(
1204 func,
1205 xmm_save,
1206 xmm_dst,
1207 xmm_dst,
1208 sgn4f );
1209 }
1210
1211 static void PIPE_CDECL
1212 sin4f(
1213 float *store )
1214 {
1215 store[0] = sinf( store[0] );
1216 store[1] = sinf( store[1] );
1217 store[2] = sinf( store[2] );
1218 store[3] = sinf( store[3] );
1219 }
1220
1221 static void
1222 emit_sin (struct x86_function *func,
1223 unsigned xmm_save,
1224 unsigned xmm_dst)
1225 {
1226 emit_func_call_dst_src1(
1227 func,
1228 xmm_save,
1229 xmm_dst,
1230 xmm_dst,
1231 sin4f );
1232 }
1233
/** dst -= src, componentwise. */
static void
emit_sub(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   sse_subps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
1245
1246
1247
1248
1249
1250
1251
1252 /**
1253 * Register fetch.
1254 */
1255
/**
 * Emit code to load one channel of a source operand into an xmm register,
 * honoring the operand's extended swizzle (including the ZERO/ONE
 * pseudo-channels) and its sign mode (abs / set-sign / negate).
 */
static void
emit_fetch(
   struct x86_function *func,
   unsigned xmm,
   const struct tgsi_full_src_register *reg,
   const unsigned chan_index )
{
   unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index );

   switch (swizzle) {
   case TGSI_EXTSWIZZLE_X:
   case TGSI_EXTSWIZZLE_Y:
   case TGSI_EXTSWIZZLE_Z:
   case TGSI_EXTSWIZZLE_W:
      /* A real channel: dispatch on the source register file. */
      switch (reg->SrcRegister.File) {
      case TGSI_FILE_CONSTANT:
         emit_const(
            func,
            xmm,
            reg->SrcRegister.Index,
            swizzle,
            reg->SrcRegister.Indirect,
            reg->SrcRegisterInd.File,
            reg->SrcRegisterInd.Index );
         break;

      case TGSI_FILE_IMMEDIATE:
         emit_immediate(
            func,
            xmm,
            reg->SrcRegister.Index,
            swizzle );
         break;

      case TGSI_FILE_INPUT:
         emit_inputf(
            func,
            xmm,
            reg->SrcRegister.Index,
            swizzle );
         break;

      case TGSI_FILE_TEMPORARY:
         emit_tempf(
            func,
            xmm,
            reg->SrcRegister.Index,
            swizzle );
         break;

      default:
         assert( 0 );
      }
      break;

   case TGSI_EXTSWIZZLE_ZERO:
      /* Constant 0.0 lives in a reserved temp slot. */
      emit_tempf(
         func,
         xmm,
         TGSI_EXEC_TEMP_00000000_I,
         TGSI_EXEC_TEMP_00000000_C );
      break;

   case TGSI_EXTSWIZZLE_ONE:
      /* Constant 1.0 lives in a reserved temp slot. */
      emit_tempf(
         func,
         xmm,
         TEMP_ONE_I,
         TEMP_ONE_C );
      break;

   default:
      assert( 0 );
   }

   /* Apply the operand's sign mode on top of the fetched value. */
   switch( tgsi_util_get_full_src_register_sign_mode( reg, chan_index ) ) {
   case TGSI_UTIL_SIGN_CLEAR:
      emit_abs( func, xmm );
      break;

   case TGSI_UTIL_SIGN_SET:
      emit_setsign( func, xmm );
      break;

   case TGSI_UTIL_SIGN_TOGGLE:
      emit_neg( func, xmm );
      break;

   case TGSI_UTIL_SIGN_KEEP:
      break;
   }
}
1348
1349 #define FETCH( FUNC, INST, XMM, INDEX, CHAN )\
1350 emit_fetch( FUNC, XMM, &(INST).FullSrcRegisters[INDEX], CHAN )
1351
1352 /**
1353 * Register store.
1354 */
1355
/**
 * Emit code to store an xmm register into one channel of a destination
 * operand, applying the instruction's saturation mode first.
 * Note: TGSI_SAT_MINUS_PLUS_ONE is not implemented (asserts).
 */
static void
emit_store(
   struct x86_function *func,
   unsigned xmm,
   const struct tgsi_full_dst_register *reg,
   const struct tgsi_full_instruction *inst,
   unsigned chan_index )
{
   switch( inst->Instruction.Saturate ) {
   case TGSI_SAT_NONE:
      break;

   case TGSI_SAT_ZERO_ONE:
      /* clamp to [0, 1]: max with 0.0 then min with 1.0 */
      sse_maxps(
         func,
         make_xmm( xmm ),
         get_temp(
            TGSI_EXEC_TEMP_00000000_I,
            TGSI_EXEC_TEMP_00000000_C ) );

      sse_minps(
         func,
         make_xmm( xmm ),
         get_temp(
            TGSI_EXEC_TEMP_ONE_I,
            TGSI_EXEC_TEMP_ONE_C ) );
      break;

   case TGSI_SAT_MINUS_PLUS_ONE:
      assert( 0 );
      break;
   }


   /* Dispatch on the destination register file. */
   switch( reg->DstRegister.File ) {
   case TGSI_FILE_OUTPUT:
      emit_output(
         func,
         xmm,
         reg->DstRegister.Index,
         chan_index );
      break;

   case TGSI_FILE_TEMPORARY:
      emit_temps(
         func,
         xmm,
         reg->DstRegister.Index,
         chan_index );
      break;

   case TGSI_FILE_ADDRESS:
      emit_addrs(
         func,
         xmm,
         reg->DstRegister.Index,
         chan_index );
      break;

   default:
      assert( 0 );
   }
}
1419
/* Convenience wrapper: store xmm register XMM to channel CHAN of
 * destination operand INDEX of instruction INST (applies saturation).
 */
#define STORE( FUNC, INST, XMM, INDEX, CHAN )\
   emit_store( FUNC, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN )
1422
1423
/**
 * Trampoline called from the generated code (via emit_func_call) to
 * sample a texture.
 *
 * \param sampler  pointer to the sampler pointer for the texture unit
 * \param store    points at the TEMP_R0 scratch rows: on entry rows
 *                 0/4/8 hold the s/t/p texcoords for a quad and row 12
 *                 holds the lodbias (currently ignored -- 0.0 is passed
 *                 to get_samples below); on exit the first 16 floats
 *                 are overwritten with the RGBA results, one row per
 *                 color channel.
 */
static void PIPE_CDECL
fetch_texel( struct tgsi_sampler **sampler,
             float *store )
{
#if 0
   /* debug dump of incoming arguments (disabled) */
   uint j;

   debug_printf("%s sampler: %p (%p) store: %p\n",
                __FUNCTION__,
                sampler, *sampler,
                store );

   debug_printf("lodbias %f\n", store[12]);

   for (j = 0; j < 4; j++)
      debug_printf("sample %d texcoord %f %f\n",
                   j,
                   store[0+j],
                   store[4+j]);
#endif

   {
      float rgba[NUM_CHANNELS][QUAD_SIZE];
      (*sampler)->get_samples(*sampler,
                              &store[0],
                              &store[4],
                              &store[8],
                              0.0f, /*store[12],  lodbias */
                              rgba);

      /* overwrite the texcoord buffer with the sampled colors */
      memcpy( store, rgba, 16 * sizeof(float));
   }

#if 0
   /* debug dump of results (disabled) */
   for (j = 0; j < 4; j++)
      debug_printf("sample %d result %f %f %f %f\n",
                   j,
                   store[0+j],
                   store[4+j],
                   store[8+j],
                   store[12+j]);
#endif
}
1467
1468 /**
1469 * High-level instruction translators.
1470 */
1471
/**
 * Emit code for a TGSI texture-sampling instruction (TEX/TXB/TXL/TXP).
 *
 * \param lodbias    fetch src[0].w and stage it as the bias/lod value
 *                   (TXB/TXL paths); otherwise 0.0 is staged
 * \param projected  divide the texcoords by src[0].w first (TXP)
 *
 * The texcoords (and the bias) are staged in the TEMP_R0 scratch rows,
 * fetch_texel() is invoked through emit_func_call(), and the results
 * are copied from TEMP_R0 to the enabled destination channels.
 */
static void
emit_tex( struct x86_function *func,
          const struct tgsi_full_instruction *inst,
          boolean lodbias,
          boolean projected)
{
   const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index;
   struct x86_reg args[2];
   unsigned count;
   unsigned i;

   /* number of texcoord components required by the texture target */
   switch (inst->InstructionExtTexture.Texture) {
   case TGSI_TEXTURE_1D:
      count = 1;
      break;
   case TGSI_TEXTURE_2D:
   case TGSI_TEXTURE_RECT:
      count = 2;
      break;
   case TGSI_TEXTURE_SHADOW1D:
   case TGSI_TEXTURE_SHADOW2D:
   case TGSI_TEXTURE_SHADOWRECT:
   case TGSI_TEXTURE_3D:
   case TGSI_TEXTURE_CUBE:
      count = 3;
      break;
   default:
      assert(0);
      return;
   }

   /* xmm3 = lodbias (src[0].w) or 0.0 */
   if (lodbias) {
      FETCH( func, *inst, 3, 0, 3 );
   }
   else {
      emit_tempf(
         func,
         3,
         TGSI_EXEC_TEMP_00000000_I,
         TGSI_EXEC_TEMP_00000000_C );

   }

   /* store lodbias whether enabled or not -- fetch_texel currently
    * respects it always.
    * (NOTE(review): fetch_texel above actually hard-wires 0.0 -- the
    * staged value is written but not consumed; confirm before relying
    * on bias.)
    */
   sse_movaps( func,
               get_temp( TEMP_R0, 3 ),
               make_xmm( 3 ) );


   if (projected) {
      /* xmm3 = 1 / src[0].w -- safe to clobber xmm3 since the bias was
       * already saved to TEMP_R0 row 3 above.
       */
      FETCH( func, *inst, 3, 0, 3 );

      emit_rcp( func, 3, 3 );
   }

   for (i = 0; i < count; i++) {
      FETCH( func, *inst, i, 0, i );

      if (projected) {
         /* texcoord *= 1/w */
         sse_mulps(
            func,
            make_xmm( i ),
            make_xmm( 3 ) );
      }

      /* Store in the argument buffer:
       */
      sse_movaps(
         func,
         get_temp( TEMP_R0, i ),
         make_xmm( i ) );
   }

   args[0] = get_temp( TEMP_R0, 0 );
   args[1] = get_sampler_ptr( unit );


   emit_func_call( func,
                   0,
                   args,
                   Elements(args),
                   fetch_texel );

   /* If all four channels are enabled, could use a pointer to
    * dst[0].x instead of TEMP_R0 for store?
    */
   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, i ) {

      sse_movaps(
         func,
         make_xmm( 0 ),
         get_temp( TEMP_R0, i ) );

      STORE( func, *inst, 0, 0, i );
   }
}
1570
1571
/**
 * Emit code for TGSI_OPCODE_KIL: kill the quad's fragments where any
 * tested source component is < 0, by OR-ing the comparison sign mask
 * into the KILMASK temp.
 */
static void
emit_kil(
   struct x86_function *func,
   const struct tgsi_full_src_register *reg )
{
   unsigned uniquemask;
   unsigned unique_count = 0;
   unsigned chan_index;
   unsigned i;

   /* This mask stores component bits that were already tested. Note that
    * we test if the value is less than zero, so 1.0 and 0.0 need not to be
    * tested. */
   uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE);

   FOR_EACH_CHANNEL( chan_index ) {
      unsigned swizzle;

      /* unswizzle channel */
      swizzle = tgsi_util_get_full_src_register_extswizzle(
         reg,
         chan_index );

      /* check if the component has not been already tested */
      if( !(uniquemask & (1 << swizzle)) ) {
         uniquemask |= 1 << swizzle;

         /* allocate register: each unique component lands in its own
          * xmm register (xmm0, xmm1, ...) */
         emit_fetch(
            func,
            unique_count++,
            reg,
            chan_index );
      }
   }

   /* EAX/EDX are used as scratch below; preserve the caller's values */
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_AX ) );
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_DX ) );

   for (i = 0 ; i < unique_count; i++ ) {
      struct x86_reg dataXMM = make_xmm(i);

      /* xmm = per-component all-ones where value < 0, zero elsewhere */
      sse_cmpps(
         func,
         dataXMM,
         get_temp(
            TGSI_EXEC_TEMP_00000000_I,
            TGSI_EXEC_TEMP_00000000_C ),
         cc_LessThan );

      /* accumulate the 4-bit sign masks into EAX (first via movmskps,
       * subsequent ones through EDX then OR-ed in) */
      if( i == 0 ) {
         sse_movmskps(
            func,
            x86_make_reg( file_REG32, reg_AX ),
            dataXMM );
      }
      else {
         sse_movmskps(
            func,
            x86_make_reg( file_REG32, reg_DX ),
            dataXMM );
         x86_or(
            func,
            x86_make_reg( file_REG32, reg_AX ),
            x86_make_reg( file_REG32, reg_DX ) );
      }
   }

   /* KILMASK |= accumulated kill mask */
   x86_or(
      func,
      get_temp(
         TGSI_EXEC_TEMP_KILMASK_I,
         TGSI_EXEC_TEMP_KILMASK_C ),
      x86_make_reg( file_REG32, reg_AX ) );

   /* restore scratch registers */
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_DX ) );
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_AX ) );
}
1658
1659
/**
 * Emit code for TGSI_OPCODE_KILP (predicated kill).
 * Not implemented -- the caller returns 0 for KILP so execution falls
 * back to the interpreter path.
 */
static void
emit_kilp(
   struct x86_function *func )
{
   /* XXX todo / fix me */
}
1666
1667
1668 static void
1669 emit_setcc(
1670 struct x86_function *func,
1671 struct tgsi_full_instruction *inst,
1672 enum sse_cc cc )
1673 {
1674 unsigned chan_index;
1675
1676 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1677 FETCH( func, *inst, 0, 0, chan_index );
1678 FETCH( func, *inst, 1, 1, chan_index );
1679 sse_cmpps(
1680 func,
1681 make_xmm( 0 ),
1682 make_xmm( 1 ),
1683 cc );
1684 sse_andps(
1685 func,
1686 make_xmm( 0 ),
1687 get_temp(
1688 TEMP_ONE_I,
1689 TEMP_ONE_C ) );
1690 STORE( func, *inst, 0, 0, chan_index );
1691 }
1692 }
1693
1694 static void
1695 emit_cmp(
1696 struct x86_function *func,
1697 struct tgsi_full_instruction *inst )
1698 {
1699 unsigned chan_index;
1700
1701 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1702 FETCH( func, *inst, 0, 0, chan_index );
1703 FETCH( func, *inst, 1, 1, chan_index );
1704 FETCH( func, *inst, 2, 2, chan_index );
1705 sse_cmpps(
1706 func,
1707 make_xmm( 0 ),
1708 get_temp(
1709 TGSI_EXEC_TEMP_00000000_I,
1710 TGSI_EXEC_TEMP_00000000_C ),
1711 cc_LessThan );
1712 sse_andps(
1713 func,
1714 make_xmm( 1 ),
1715 make_xmm( 0 ) );
1716 sse_andnps(
1717 func,
1718 make_xmm( 0 ),
1719 make_xmm( 2 ) );
1720 sse_orps(
1721 func,
1722 make_xmm( 0 ),
1723 make_xmm( 1 ) );
1724 STORE( func, *inst, 0, 0, chan_index );
1725 }
1726 }
1727
1728
1729 /**
1730 * Check if inst src/dest regs use indirect addressing into temporary
1731 * register file.
1732 */
1733 static boolean
1734 indirect_temp_reference(const struct tgsi_full_instruction *inst)
1735 {
1736 uint i;
1737 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1738 const struct tgsi_full_src_register *reg = &inst->FullSrcRegisters[i];
1739 if (reg->SrcRegister.File == TGSI_FILE_TEMPORARY &&
1740 reg->SrcRegister.Indirect)
1741 return TRUE;
1742 }
1743 for (i = 0; i < inst->Instruction.NumDstRegs; i++) {
1744 const struct tgsi_full_dst_register *reg = &inst->FullDstRegisters[i];
1745 if (reg->DstRegister.File == TGSI_FILE_TEMPORARY &&
1746 reg->DstRegister.Indirect)
1747 return TRUE;
1748 }
1749 return FALSE;
1750 }
1751
1752
1753 static int
1754 emit_instruction(
1755 struct x86_function *func,
1756 struct tgsi_full_instruction *inst )
1757 {
1758 unsigned chan_index;
1759
1760 /* we can't handle indirect addressing into temp register file yet */
1761 if (indirect_temp_reference(inst))
1762 return FALSE;
1763
1764 switch (inst->Instruction.Opcode) {
1765 case TGSI_OPCODE_ARL:
1766 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1767 FETCH( func, *inst, 0, 0, chan_index );
1768 emit_flr(func, 0, 0);
1769 emit_f2it( func, 0 );
1770 STORE( func, *inst, 0, 0, chan_index );
1771 }
1772 break;
1773
1774 case TGSI_OPCODE_MOV:
1775 case TGSI_OPCODE_SWZ:
1776 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1777 FETCH( func, *inst, 4 + chan_index, 0, chan_index );
1778 }
1779 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1780 STORE( func, *inst, 4 + chan_index, 0, chan_index );
1781 }
1782 break;
1783
1784 case TGSI_OPCODE_LIT:
1785 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1786 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1787 emit_tempf(
1788 func,
1789 0,
1790 TEMP_ONE_I,
1791 TEMP_ONE_C);
1792 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
1793 STORE( func, *inst, 0, 0, CHAN_X );
1794 }
1795 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1796 STORE( func, *inst, 0, 0, CHAN_W );
1797 }
1798 }
1799 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1800 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1801 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
1802 FETCH( func, *inst, 0, 0, CHAN_X );
1803 sse_maxps(
1804 func,
1805 make_xmm( 0 ),
1806 get_temp(
1807 TGSI_EXEC_TEMP_00000000_I,
1808 TGSI_EXEC_TEMP_00000000_C ) );
1809 STORE( func, *inst, 0, 0, CHAN_Y );
1810 }
1811 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1812 /* XMM[1] = SrcReg[0].yyyy */
1813 FETCH( func, *inst, 1, 0, CHAN_Y );
1814 /* XMM[1] = max(XMM[1], 0) */
1815 sse_maxps(
1816 func,
1817 make_xmm( 1 ),
1818 get_temp(
1819 TGSI_EXEC_TEMP_00000000_I,
1820 TGSI_EXEC_TEMP_00000000_C ) );
1821 /* XMM[2] = SrcReg[0].wwww */
1822 FETCH( func, *inst, 2, 0, CHAN_W );
1823 /* XMM[2] = min(XMM[2], 128.0) */
1824 sse_minps(
1825 func,
1826 make_xmm( 2 ),
1827 get_temp(
1828 TGSI_EXEC_TEMP_128_I,
1829 TGSI_EXEC_TEMP_128_C ) );
1830 /* XMM[2] = max(XMM[2], -128.0) */
1831 sse_maxps(
1832 func,
1833 make_xmm( 2 ),
1834 get_temp(
1835 TGSI_EXEC_TEMP_MINUS_128_I,
1836 TGSI_EXEC_TEMP_MINUS_128_C ) );
1837 emit_pow( func, 3, 1, 1, 2 );
1838 FETCH( func, *inst, 0, 0, CHAN_X );
1839 sse_xorps(
1840 func,
1841 make_xmm( 2 ),
1842 make_xmm( 2 ) );
1843 sse_cmpps(
1844 func,
1845 make_xmm( 2 ),
1846 make_xmm( 0 ),
1847 cc_LessThan );
1848 sse_andps(
1849 func,
1850 make_xmm( 2 ),
1851 make_xmm( 1 ) );
1852 STORE( func, *inst, 2, 0, CHAN_Z );
1853 }
1854 }
1855 break;
1856
1857 case TGSI_OPCODE_RCP:
1858 /* TGSI_OPCODE_RECIP */
1859 FETCH( func, *inst, 0, 0, CHAN_X );
1860 emit_rcp( func, 0, 0 );
1861 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1862 STORE( func, *inst, 0, 0, chan_index );
1863 }
1864 break;
1865
1866 case TGSI_OPCODE_RSQ:
1867 /* TGSI_OPCODE_RECIPSQRT */
1868 FETCH( func, *inst, 0, 0, CHAN_X );
1869 emit_abs( func, 0 );
1870 emit_rsqrt( func, 1, 0 );
1871 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1872 STORE( func, *inst, 1, 0, chan_index );
1873 }
1874 break;
1875
1876 case TGSI_OPCODE_EXP:
1877 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1878 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1879 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1880 FETCH( func, *inst, 0, 0, CHAN_X );
1881 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1882 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1883 emit_MOV( func, 1, 0 );
1884 emit_flr( func, 2, 1 );
1885 /* dst.x = ex2(floor(src.x)) */
1886 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
1887 emit_MOV( func, 2, 1 );
1888 emit_ex2( func, 3, 2 );
1889 STORE( func, *inst, 2, 0, CHAN_X );
1890 }
1891 /* dst.y = src.x - floor(src.x) */
1892 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1893 emit_MOV( func, 2, 0 );
1894 emit_sub( func, 2, 1 );
1895 STORE( func, *inst, 2, 0, CHAN_Y );
1896 }
1897 }
1898 /* dst.z = ex2(src.x) */
1899 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1900 emit_ex2( func, 3, 0 );
1901 STORE( func, *inst, 0, 0, CHAN_Z );
1902 }
1903 }
1904 /* dst.w = 1.0 */
1905 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
1906 emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
1907 STORE( func, *inst, 0, 0, CHAN_W );
1908 }
1909 break;
1910
1911 case TGSI_OPCODE_LOG:
1912 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1913 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1914 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1915 FETCH( func, *inst, 0, 0, CHAN_X );
1916 emit_abs( func, 0 );
1917 emit_MOV( func, 1, 0 );
1918 emit_lg2( func, 2, 1 );
1919 /* dst.z = lg2(abs(src.x)) */
1920 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1921 STORE( func, *inst, 1, 0, CHAN_Z );
1922 }
1923 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1924 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1925 emit_flr( func, 2, 1 );
1926 /* dst.x = floor(lg2(abs(src.x))) */
1927 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
1928 STORE( func, *inst, 1, 0, CHAN_X );
1929 }
1930 /* dst.x = abs(src)/ex2(floor(lg2(abs(src.x)))) */
1931 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1932 emit_ex2( func, 2, 1 );
1933 emit_rcp( func, 1, 1 );
1934 emit_mul( func, 0, 1 );
1935 STORE( func, *inst, 0, 0, CHAN_Y );
1936 }
1937 }
1938 }
1939 /* dst.w = 1.0 */
1940 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
1941 emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
1942 STORE( func, *inst, 0, 0, CHAN_W );
1943 }
1944 break;
1945
1946 case TGSI_OPCODE_MUL:
1947 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1948 FETCH( func, *inst, 0, 0, chan_index );
1949 FETCH( func, *inst, 1, 1, chan_index );
1950 emit_mul( func, 0, 1 );
1951 STORE( func, *inst, 0, 0, chan_index );
1952 }
1953 break;
1954
1955 case TGSI_OPCODE_ADD:
1956 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1957 FETCH( func, *inst, 0, 0, chan_index );
1958 FETCH( func, *inst, 1, 1, chan_index );
1959 emit_add( func, 0, 1 );
1960 STORE( func, *inst, 0, 0, chan_index );
1961 }
1962 break;
1963
1964 case TGSI_OPCODE_DP3:
1965 /* TGSI_OPCODE_DOT3 */
1966 FETCH( func, *inst, 0, 0, CHAN_X );
1967 FETCH( func, *inst, 1, 1, CHAN_X );
1968 emit_mul( func, 0, 1 );
1969 FETCH( func, *inst, 1, 0, CHAN_Y );
1970 FETCH( func, *inst, 2, 1, CHAN_Y );
1971 emit_mul( func, 1, 2 );
1972 emit_add( func, 0, 1 );
1973 FETCH( func, *inst, 1, 0, CHAN_Z );
1974 FETCH( func, *inst, 2, 1, CHAN_Z );
1975 emit_mul( func, 1, 2 );
1976 emit_add( func, 0, 1 );
1977 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1978 STORE( func, *inst, 0, 0, chan_index );
1979 }
1980 break;
1981
1982 case TGSI_OPCODE_DP4:
1983 /* TGSI_OPCODE_DOT4 */
1984 FETCH( func, *inst, 0, 0, CHAN_X );
1985 FETCH( func, *inst, 1, 1, CHAN_X );
1986 emit_mul( func, 0, 1 );
1987 FETCH( func, *inst, 1, 0, CHAN_Y );
1988 FETCH( func, *inst, 2, 1, CHAN_Y );
1989 emit_mul( func, 1, 2 );
1990 emit_add( func, 0, 1 );
1991 FETCH( func, *inst, 1, 0, CHAN_Z );
1992 FETCH( func, *inst, 2, 1, CHAN_Z );
1993 emit_mul(func, 1, 2 );
1994 emit_add(func, 0, 1 );
1995 FETCH( func, *inst, 1, 0, CHAN_W );
1996 FETCH( func, *inst, 2, 1, CHAN_W );
1997 emit_mul( func, 1, 2 );
1998 emit_add( func, 0, 1 );
1999 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2000 STORE( func, *inst, 0, 0, chan_index );
2001 }
2002 break;
2003
2004 case TGSI_OPCODE_DST:
2005 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
2006 emit_tempf(
2007 func,
2008 0,
2009 TEMP_ONE_I,
2010 TEMP_ONE_C );
2011 STORE( func, *inst, 0, 0, CHAN_X );
2012 }
2013 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2014 FETCH( func, *inst, 0, 0, CHAN_Y );
2015 FETCH( func, *inst, 1, 1, CHAN_Y );
2016 emit_mul( func, 0, 1 );
2017 STORE( func, *inst, 0, 0, CHAN_Y );
2018 }
2019 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2020 FETCH( func, *inst, 0, 0, CHAN_Z );
2021 STORE( func, *inst, 0, 0, CHAN_Z );
2022 }
2023 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2024 FETCH( func, *inst, 0, 1, CHAN_W );
2025 STORE( func, *inst, 0, 0, CHAN_W );
2026 }
2027 break;
2028
2029 case TGSI_OPCODE_MIN:
2030 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2031 FETCH( func, *inst, 0, 0, chan_index );
2032 FETCH( func, *inst, 1, 1, chan_index );
2033 sse_minps(
2034 func,
2035 make_xmm( 0 ),
2036 make_xmm( 1 ) );
2037 STORE( func, *inst, 0, 0, chan_index );
2038 }
2039 break;
2040
2041 case TGSI_OPCODE_MAX:
2042 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2043 FETCH( func, *inst, 0, 0, chan_index );
2044 FETCH( func, *inst, 1, 1, chan_index );
2045 sse_maxps(
2046 func,
2047 make_xmm( 0 ),
2048 make_xmm( 1 ) );
2049 STORE( func, *inst, 0, 0, chan_index );
2050 }
2051 break;
2052
2053 case TGSI_OPCODE_SLT:
2054 /* TGSI_OPCODE_SETLT */
2055 emit_setcc( func, inst, cc_LessThan );
2056 break;
2057
2058 case TGSI_OPCODE_SGE:
2059 /* TGSI_OPCODE_SETGE */
2060 emit_setcc( func, inst, cc_NotLessThan );
2061 break;
2062
2063 case TGSI_OPCODE_MAD:
2064 /* TGSI_OPCODE_MADD */
2065 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2066 FETCH( func, *inst, 0, 0, chan_index );
2067 FETCH( func, *inst, 1, 1, chan_index );
2068 FETCH( func, *inst, 2, 2, chan_index );
2069 emit_mul( func, 0, 1 );
2070 emit_add( func, 0, 2 );
2071 STORE( func, *inst, 0, 0, chan_index );
2072 }
2073 break;
2074
2075 case TGSI_OPCODE_SUB:
2076 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2077 FETCH( func, *inst, 0, 0, chan_index );
2078 FETCH( func, *inst, 1, 1, chan_index );
2079 emit_sub( func, 0, 1 );
2080 STORE( func, *inst, 0, 0, chan_index );
2081 }
2082 break;
2083
2084 case TGSI_OPCODE_LRP:
2085 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2086 FETCH( func, *inst, 0, 0, chan_index );
2087 FETCH( func, *inst, 1, 1, chan_index );
2088 FETCH( func, *inst, 2, 2, chan_index );
2089 emit_sub( func, 1, 2 );
2090 emit_mul( func, 0, 1 );
2091 emit_add( func, 0, 2 );
2092 STORE( func, *inst, 0, 0, chan_index );
2093 }
2094 break;
2095
2096 case TGSI_OPCODE_CND:
2097 return 0;
2098 break;
2099
2100 case TGSI_OPCODE_DP2A:
2101 FETCH( func, *inst, 0, 0, CHAN_X ); /* xmm0 = src[0].x */
2102 FETCH( func, *inst, 1, 1, CHAN_X ); /* xmm1 = src[1].x */
2103 emit_mul( func, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
2104 FETCH( func, *inst, 1, 0, CHAN_Y ); /* xmm1 = src[0].y */
2105 FETCH( func, *inst, 2, 1, CHAN_Y ); /* xmm2 = src[1].y */
2106 emit_mul( func, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
2107 emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2108 FETCH( func, *inst, 1, 2, CHAN_X ); /* xmm1 = src[2].x */
2109 emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2110 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2111 STORE( func, *inst, 0, 0, chan_index ); /* dest[ch] = xmm0 */
2112 }
2113 break;
2114
2115 case TGSI_OPCODE_FRC:
2116 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2117 FETCH( func, *inst, 0, 0, chan_index );
2118 emit_frc( func, 0, 0 );
2119 STORE( func, *inst, 0, 0, chan_index );
2120 }
2121 break;
2122
2123 case TGSI_OPCODE_CLAMP:
2124 return 0;
2125 break;
2126
2127 case TGSI_OPCODE_FLR:
2128 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2129 FETCH( func, *inst, 0, 0, chan_index );
2130 emit_flr( func, 0, 0 );
2131 STORE( func, *inst, 0, 0, chan_index );
2132 }
2133 break;
2134
2135 case TGSI_OPCODE_ROUND:
2136 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2137 FETCH( func, *inst, 0, 0, chan_index );
2138 emit_rnd( func, 0, 0 );
2139 STORE( func, *inst, 0, 0, chan_index );
2140 }
2141 break;
2142
2143 case TGSI_OPCODE_EX2:
2144 FETCH( func, *inst, 0, 0, CHAN_X );
2145 emit_ex2( func, 0, 0 );
2146 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2147 STORE( func, *inst, 0, 0, chan_index );
2148 }
2149 break;
2150
2151 case TGSI_OPCODE_LG2:
2152 FETCH( func, *inst, 0, 0, CHAN_X );
2153 emit_lg2( func, 0, 0 );
2154 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2155 STORE( func, *inst, 0, 0, chan_index );
2156 }
2157 break;
2158
2159 case TGSI_OPCODE_POW:
2160 FETCH( func, *inst, 0, 0, CHAN_X );
2161 FETCH( func, *inst, 1, 1, CHAN_X );
2162 emit_pow( func, 0, 0, 0, 1 );
2163 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2164 STORE( func, *inst, 0, 0, chan_index );
2165 }
2166 break;
2167
2168 case TGSI_OPCODE_XPD:
2169 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
2170 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
2171 FETCH( func, *inst, 1, 1, CHAN_Z );
2172 FETCH( func, *inst, 3, 0, CHAN_Z );
2173 }
2174 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
2175 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
2176 FETCH( func, *inst, 0, 0, CHAN_Y );
2177 FETCH( func, *inst, 4, 1, CHAN_Y );
2178 }
2179 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
2180 emit_MOV( func, 2, 0 );
2181 emit_mul( func, 2, 1 );
2182 emit_MOV( func, 5, 3 );
2183 emit_mul( func, 5, 4 );
2184 emit_sub( func, 2, 5 );
2185 STORE( func, *inst, 2, 0, CHAN_X );
2186 }
2187 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
2188 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
2189 FETCH( func, *inst, 2, 1, CHAN_X );
2190 FETCH( func, *inst, 5, 0, CHAN_X );
2191 }
2192 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2193 emit_mul( func, 3, 2 );
2194 emit_mul( func, 1, 5 );
2195 emit_sub( func, 3, 1 );
2196 STORE( func, *inst, 3, 0, CHAN_Y );
2197 }
2198 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2199 emit_mul( func, 5, 4 );
2200 emit_mul( func, 0, 2 );
2201 emit_sub( func, 5, 0 );
2202 STORE( func, *inst, 5, 0, CHAN_Z );
2203 }
2204 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2205 emit_tempf(
2206 func,
2207 0,
2208 TEMP_ONE_I,
2209 TEMP_ONE_C );
2210 STORE( func, *inst, 0, 0, CHAN_W );
2211 }
2212 break;
2213
2214 case TGSI_OPCODE_ABS:
2215 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2216 FETCH( func, *inst, 0, 0, chan_index );
2217 emit_abs( func, 0) ;
2218
2219 STORE( func, *inst, 0, 0, chan_index );
2220 }
2221 break;
2222
2223 case TGSI_OPCODE_RCC:
2224 return 0;
2225 break;
2226
2227 case TGSI_OPCODE_DPH:
2228 FETCH( func, *inst, 0, 0, CHAN_X );
2229 FETCH( func, *inst, 1, 1, CHAN_X );
2230 emit_mul( func, 0, 1 );
2231 FETCH( func, *inst, 1, 0, CHAN_Y );
2232 FETCH( func, *inst, 2, 1, CHAN_Y );
2233 emit_mul( func, 1, 2 );
2234 emit_add( func, 0, 1 );
2235 FETCH( func, *inst, 1, 0, CHAN_Z );
2236 FETCH( func, *inst, 2, 1, CHAN_Z );
2237 emit_mul( func, 1, 2 );
2238 emit_add( func, 0, 1 );
2239 FETCH( func, *inst, 1, 1, CHAN_W );
2240 emit_add( func, 0, 1 );
2241 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2242 STORE( func, *inst, 0, 0, chan_index );
2243 }
2244 break;
2245
2246 case TGSI_OPCODE_COS:
2247 FETCH( func, *inst, 0, 0, CHAN_X );
2248 emit_cos( func, 0, 0 );
2249 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2250 STORE( func, *inst, 0, 0, chan_index );
2251 }
2252 break;
2253
2254 case TGSI_OPCODE_DDX:
2255 return 0;
2256 break;
2257
2258 case TGSI_OPCODE_DDY:
2259 return 0;
2260 break;
2261
2262 case TGSI_OPCODE_KILP:
2263 /* predicated kill */
2264 emit_kilp( func );
2265 return 0; /* XXX fix me */
2266 break;
2267
2268 case TGSI_OPCODE_KIL:
2269 /* conditional kill */
2270 emit_kil( func, &inst->FullSrcRegisters[0] );
2271 break;
2272
2273 case TGSI_OPCODE_PK2H:
2274 return 0;
2275 break;
2276
2277 case TGSI_OPCODE_PK2US:
2278 return 0;
2279 break;
2280
2281 case TGSI_OPCODE_PK4B:
2282 return 0;
2283 break;
2284
2285 case TGSI_OPCODE_PK4UB:
2286 return 0;
2287 break;
2288
2289 case TGSI_OPCODE_RFL:
2290 return 0;
2291 break;
2292
2293 case TGSI_OPCODE_SEQ:
2294 return 0;
2295 break;
2296
2297 case TGSI_OPCODE_SFL:
2298 return 0;
2299 break;
2300
2301 case TGSI_OPCODE_SGT:
2302 return 0;
2303 break;
2304
2305 case TGSI_OPCODE_SIN:
2306 FETCH( func, *inst, 0, 0, CHAN_X );
2307 emit_sin( func, 0, 0 );
2308 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2309 STORE( func, *inst, 0, 0, chan_index );
2310 }
2311 break;
2312
2313 case TGSI_OPCODE_SLE:
2314 return 0;
2315 break;
2316
2317 case TGSI_OPCODE_SNE:
2318 return 0;
2319 break;
2320
2321 case TGSI_OPCODE_STR:
2322 return 0;
2323 break;
2324
2325 case TGSI_OPCODE_TEX:
2326 emit_tex( func, inst, FALSE, FALSE );
2327 break;
2328
2329 case TGSI_OPCODE_TXD:
2330 return 0;
2331 break;
2332
2333 case TGSI_OPCODE_UP2H:
2334 return 0;
2335 break;
2336
2337 case TGSI_OPCODE_UP2US:
2338 return 0;
2339 break;
2340
2341 case TGSI_OPCODE_UP4B:
2342 return 0;
2343 break;
2344
2345 case TGSI_OPCODE_UP4UB:
2346 return 0;
2347 break;
2348
2349 case TGSI_OPCODE_X2D:
2350 return 0;
2351 break;
2352
2353 case TGSI_OPCODE_ARA:
2354 return 0;
2355 break;
2356
2357 case TGSI_OPCODE_ARR:
2358 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2359 FETCH( func, *inst, 0, 0, chan_index );
2360 emit_rnd( func, 0, 0 );
2361 emit_f2it( func, 0 );
2362 STORE( func, *inst, 0, 0, chan_index );
2363 }
2364 break;
2365
2366 case TGSI_OPCODE_BRA:
2367 return 0;
2368 break;
2369
2370 case TGSI_OPCODE_CAL:
2371 return 0;
2372 break;
2373
2374 case TGSI_OPCODE_RET:
2375 emit_ret( func );
2376 break;
2377
2378 case TGSI_OPCODE_END:
2379 break;
2380
2381 case TGSI_OPCODE_SSG:
2382 /* TGSI_OPCODE_SGN */
2383 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2384 FETCH( func, *inst, 0, 0, chan_index );
2385 emit_sgn( func, 0, 0 );
2386 STORE( func, *inst, 0, 0, chan_index );
2387 }
2388 break;
2389
2390 case TGSI_OPCODE_CMP:
2391 emit_cmp (func, inst);
2392 break;
2393
2394 case TGSI_OPCODE_SCS:
2395 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
2396 FETCH( func, *inst, 0, 0, CHAN_X );
2397 emit_cos( func, 0, 0 );
2398 STORE( func, *inst, 0, 0, CHAN_X );
2399 }
2400 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2401 FETCH( func, *inst, 0, 0, CHAN_X );
2402 emit_sin( func, 0, 0 );
2403 STORE( func, *inst, 0, 0, CHAN_Y );
2404 }
2405 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2406 emit_tempf(
2407 func,
2408 0,
2409 TGSI_EXEC_TEMP_00000000_I,
2410 TGSI_EXEC_TEMP_00000000_C );
2411 STORE( func, *inst, 0, 0, CHAN_Z );
2412 }
2413 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2414 emit_tempf(
2415 func,
2416 0,
2417 TEMP_ONE_I,
2418 TEMP_ONE_C );
2419 STORE( func, *inst, 0, 0, CHAN_W );
2420 }
2421 break;
2422
2423 case TGSI_OPCODE_TXB:
2424 emit_tex( func, inst, TRUE, FALSE );
2425 break;
2426
2427 case TGSI_OPCODE_NRM:
2428 /* fall-through */
2429 case TGSI_OPCODE_NRM4:
2430 /* 3 or 4-component normalization */
2431 {
2432 uint dims = (inst->Instruction.Opcode == TGSI_OPCODE_NRM) ? 3 : 4;
2433
2434 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) ||
2435 IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y) ||
2436 IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z) ||
2437 (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 4)) {
2438
2439 /* NOTE: Cannot use xmm regs 2/3 here (see emit_rsqrt() above). */
2440
2441 /* xmm4 = src.x */
2442 /* xmm0 = src.x * src.x */
2443 FETCH(func, *inst, 0, 0, CHAN_X);
2444 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
2445 emit_MOV(func, 4, 0);
2446 }
2447 emit_mul(func, 0, 0);
2448
2449 /* xmm5 = src.y */
2450 /* xmm0 = xmm0 + src.y * src.y */
2451 FETCH(func, *inst, 1, 0, CHAN_Y);
2452 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2453 emit_MOV(func, 5, 1);
2454 }
2455 emit_mul(func, 1, 1);
2456 emit_add(func, 0, 1);
2457
2458 /* xmm6 = src.z */
2459 /* xmm0 = xmm0 + src.z * src.z */
2460 FETCH(func, *inst, 1, 0, CHAN_Z);
2461 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2462 emit_MOV(func, 6, 1);
2463 }
2464 emit_mul(func, 1, 1);
2465 emit_add(func, 0, 1);
2466
2467 if (dims == 4) {
2468 /* xmm7 = src.w */
2469 /* xmm0 = xmm0 + src.w * src.w */
2470 FETCH(func, *inst, 1, 0, CHAN_W);
2471 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W)) {
2472 emit_MOV(func, 7, 1);
2473 }
2474 emit_mul(func, 1, 1);
2475 emit_add(func, 0, 1);
2476 }
2477
2478 /* xmm1 = 1 / sqrt(xmm0) */
2479 emit_rsqrt(func, 1, 0);
2480
2481 /* dst.x = xmm1 * src.x */
2482 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
2483 emit_mul(func, 4, 1);
2484 STORE(func, *inst, 4, 0, CHAN_X);
2485 }
2486
2487 /* dst.y = xmm1 * src.y */
2488 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2489 emit_mul(func, 5, 1);
2490 STORE(func, *inst, 5, 0, CHAN_Y);
2491 }
2492
2493 /* dst.z = xmm1 * src.z */
2494 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2495 emit_mul(func, 6, 1);
2496 STORE(func, *inst, 6, 0, CHAN_Z);
2497 }
2498
2499 /* dst.w = xmm1 * src.w */
2500 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) && dims == 4) {
2501 emit_mul(func, 7, 1);
2502 STORE(func, *inst, 7, 0, CHAN_W);
2503 }
2504 }
2505
2506 /* dst0.w = 1.0 */
2507 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 3) {
2508 emit_tempf(func, 0, TEMP_ONE_I, TEMP_ONE_C);
2509 STORE(func, *inst, 0, 0, CHAN_W);
2510 }
2511 }
2512 break;
2513
2514 case TGSI_OPCODE_DIV:
2515 return 0;
2516 break;
2517
2518 case TGSI_OPCODE_DP2:
2519 FETCH( func, *inst, 0, 0, CHAN_X ); /* xmm0 = src[0].x */
2520 FETCH( func, *inst, 1, 1, CHAN_X ); /* xmm1 = src[1].x */
2521 emit_mul( func, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
2522 FETCH( func, *inst, 1, 0, CHAN_Y ); /* xmm1 = src[0].y */
2523 FETCH( func, *inst, 2, 1, CHAN_Y ); /* xmm2 = src[1].y */
2524 emit_mul( func, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
2525 emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2526 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2527 STORE( func, *inst, 0, 0, chan_index ); /* dest[ch] = xmm0 */
2528 }
2529 break;
2530
2531 case TGSI_OPCODE_TXL:
2532 emit_tex( func, inst, TRUE, FALSE );
2533 break;
2534
2535 case TGSI_OPCODE_TXP:
2536 emit_tex( func, inst, FALSE, TRUE );
2537 break;
2538
2539 case TGSI_OPCODE_BRK:
2540 return 0;
2541 break;
2542
2543 case TGSI_OPCODE_IF:
2544 return 0;
2545 break;
2546
2547 case TGSI_OPCODE_BGNFOR:
2548 return 0;
2549 break;
2550
2551 case TGSI_OPCODE_REP:
2552 return 0;
2553 break;
2554
2555 case TGSI_OPCODE_ELSE:
2556 return 0;
2557 break;
2558
2559 case TGSI_OPCODE_ENDIF:
2560 return 0;
2561 break;
2562
2563 case TGSI_OPCODE_ENDFOR:
2564 return 0;
2565 break;
2566
2567 case TGSI_OPCODE_ENDREP:
2568 return 0;
2569 break;
2570
2571 case TGSI_OPCODE_PUSHA:
2572 return 0;
2573 break;
2574
2575 case TGSI_OPCODE_POPA:
2576 return 0;
2577 break;
2578
2579 case TGSI_OPCODE_CEIL:
2580 return 0;
2581 break;
2582
2583 case TGSI_OPCODE_I2F:
2584 return 0;
2585 break;
2586
2587 case TGSI_OPCODE_NOT:
2588 return 0;
2589 break;
2590
2591 case TGSI_OPCODE_TRUNC:
2592 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2593 FETCH( func, *inst, 0, 0, chan_index );
2594 emit_f2it( func, 0 );
2595 emit_i2f( func, 0 );
2596 STORE( func, *inst, 0, 0, chan_index );
2597 }
2598 break;
2599
2600 case TGSI_OPCODE_SHL:
2601 return 0;
2602 break;
2603
2604 case TGSI_OPCODE_SHR:
2605 return 0;
2606 break;
2607
2608 case TGSI_OPCODE_AND:
2609 return 0;
2610 break;
2611
2612 case TGSI_OPCODE_OR:
2613 return 0;
2614 break;
2615
2616 case TGSI_OPCODE_MOD:
2617 return 0;
2618 break;
2619
2620 case TGSI_OPCODE_XOR:
2621 return 0;
2622 break;
2623
2624 case TGSI_OPCODE_SAD:
2625 return 0;
2626 break;
2627
2628 case TGSI_OPCODE_TXF:
2629 return 0;
2630 break;
2631
2632 case TGSI_OPCODE_TXQ:
2633 return 0;
2634 break;
2635
2636 case TGSI_OPCODE_CONT:
2637 return 0;
2638 break;
2639
2640 case TGSI_OPCODE_EMIT:
2641 return 0;
2642 break;
2643
2644 case TGSI_OPCODE_ENDPRIM:
2645 return 0;
2646 break;
2647
2648 default:
2649 return 0;
2650 }
2651
2652 return 1;
2653 }
2654
/**
 * Emit code for a TGSI declaration.  Only input declarations generate
 * code: for each declared input channel, interpolate the attribute
 * from the coefficients (a0, dadx, dady) into the machine's Inputs
 * array, using the declaration's interpolation mode.
 *
 * Register usage per channel: xmm0 holds the accumulating result;
 * xmm1-5 hold the x/y positions and coefficient values as noted in
 * the inline comments.
 */
static void
emit_declaration(
   struct x86_function *func,
   struct tgsi_full_declaration *decl )
{
   if( decl->Declaration.File == TGSI_FILE_INPUT ) {
      unsigned first, last, mask;
      unsigned i, j;

      first = decl->DeclarationRange.First;
      last = decl->DeclarationRange.Last;
      mask = decl->Declaration.UsageMask;

      for( i = first; i <= last; i++ ) {
         for( j = 0; j < NUM_CHANNELS; j++ ) {
            if( mask & (1 << j) ) {
               switch( decl->Declaration.Interpolate ) {
               case TGSI_INTERPOLATE_CONSTANT:
                  /* input = a0 */
                  emit_coef_a0( func, 0, i, j );
                  emit_inputs( func, 0, i, j );
                  break;

               case TGSI_INTERPOLATE_LINEAR:
                  /* input = x * dadx + y * dady + a0 */
                  emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
                  emit_coef_dadx( func, 1, i, j );
                  emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
                  emit_coef_dady( func, 3, i, j );
                  emit_mul( func, 0, 1 );    /* x * dadx */
                  emit_coef_a0( func, 4, i, j );
                  emit_mul( func, 2, 3 );    /* y * dady */
                  emit_add( func, 0, 4 );    /* x * dadx + a0 */
                  emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
                  emit_inputs( func, 0, i, j );
                  break;

               case TGSI_INTERPOLATE_PERSPECTIVE:
                  /* input = (x * dadx + y * dady + a0) / w */
                  emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
                  emit_coef_dadx( func, 1, i, j );
                  emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
                  emit_coef_dady( func, 3, i, j );
                  emit_mul( func, 0, 1 );    /* x * dadx */
                  emit_tempf( func, 4, 0, TGSI_SWIZZLE_W );
                  emit_coef_a0( func, 5, i, j );
                  emit_rcp( func, 4, 4 );    /* 1.0 / w */
                  emit_mul( func, 2, 3 );    /* y * dady */
                  emit_add( func, 0, 5 );    /* x * dadx + a0 */
                  emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
                  emit_mul( func, 0, 4 );    /* (x * dadx + y * dady + a0) / w */
                  emit_inputs( func, 0, i, j );
                  break;

               default:
                  /* unknown interpolation mode */
                  assert( 0 );
                  break;
               }
            }
         }
      }
   }
}
2715
/**
 * Emit code that converts vertex data from AOS layout (one packed xyzw
 * vector per vertex) into the SOA layout of tgsi_exec_machine::Inputs
 * (xxxx / yyyy / zzzz / wwww, four vertices at a time).
 *
 * The generated loop transposes a 4x4 matrix of floats per input slot
 * using movlps/movhps gathers followed by shufps shuffles.
 *
 * \param func         output buffer for the generated x86 code
 * \param arg_aos      function-argument index of the AOS source pointer
 * \param arg_machine  function-argument index of the tgsi_exec_machine
 * \param arg_num      function-argument index of the input count
 * \param arg_stride   function-argument index of the per-vertex byte stride
 */
static void aos_to_soa( struct x86_function *func,
                        uint arg_aos,
                        uint arg_machine,
                        uint arg_num,
                        uint arg_stride )
{
   struct x86_reg soa_input = x86_make_reg( file_REG32, reg_AX );
   struct x86_reg aos_input = x86_make_reg( file_REG32, reg_BX );
   struct x86_reg num_inputs = x86_make_reg( file_REG32, reg_CX );
   struct x86_reg stride = x86_make_reg( file_REG32, reg_DX );
   int inner_loop;


   /* Save EBX */
   x86_push( func, x86_make_reg( file_REG32, reg_BX ) );

   /* Load the four loop parameters from the function's arguments;
    * soa_input is advanced to point at machine->Inputs.
    */
   x86_mov( func, aos_input, x86_fn_arg( func, arg_aos ) );
   x86_mov( func, soa_input, x86_fn_arg( func, arg_machine ) );
   x86_lea( func, soa_input,
            x86_make_disp( soa_input,
                           Offset(struct tgsi_exec_machine, Inputs) ) );
   x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) );
   x86_mov( func, stride, x86_fn_arg( func, arg_stride ) );

   /* do */
   inner_loop = x86_get_label( func );
   {
      /* Gather the xy / zw halves of four consecutive vertices
       * (aos_input is saved/restored so the stride walk below doesn't
       * disturb the per-slot base pointer).
       */
      x86_push( func, aos_input );
      sse_movlps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
      sse_movlps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
      x86_add( func, aos_input, stride );
      sse_movhps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
      sse_movhps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
      x86_add( func, aos_input, stride );
      sse_movlps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
      sse_movlps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
      x86_add( func, aos_input, stride );
      sse_movhps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
      sse_movhps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
      x86_pop( func, aos_input );

      /* Shuffle into xxxx (xmm0), yyyy (xmm2), zzzz (xmm3), wwww (xmm5). */
      sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
      sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
      sse_shufps( func, make_xmm( 0 ), make_xmm( 1 ), 0x88 );
      sse_shufps( func, make_xmm( 2 ), make_xmm( 1 ), 0xdd );
      sse_shufps( func, make_xmm( 3 ), make_xmm( 4 ), 0x88 );
      sse_shufps( func, make_xmm( 5 ), make_xmm( 4 ), 0xdd );

      /* Store the four SOA channel vectors. */
      sse_movups( func, x86_make_disp( soa_input, 0 ), make_xmm( 0 ) );
      sse_movups( func, x86_make_disp( soa_input, 16 ), make_xmm( 2 ) );
      sse_movups( func, x86_make_disp( soa_input, 32 ), make_xmm( 3 ) );
      sse_movups( func, x86_make_disp( soa_input, 48 ), make_xmm( 5 ) );

      /* Advance to next input */
      x86_lea( func, aos_input, x86_make_disp(aos_input, 16) );
      x86_lea( func, soa_input, x86_make_disp(soa_input, 64) );
   }
   /* while --num_inputs */
   x86_dec( func, num_inputs );
   x86_jcc( func, cc_NE, inner_loop );

   /* Restore EBX */
   x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
}
2780
/**
 * Emit code that converts vertex results from the SOA layout of
 * tgsi_exec_machine::Outputs back to AOS layout (one packed xyzw vector
 * per vertex) — the inverse of aos_to_soa().
 *
 * The generated loop transposes four channel vectors per output slot
 * with unpcklps/unpckhps and scatters them with movlps/movhps.
 *
 * \param func         output buffer for the generated x86 code
 * \param arg_aos      function-argument index of the AOS destination pointer
 * \param arg_machine  function-argument index of the tgsi_exec_machine
 * \param arg_num      function-argument index of the output count
 * \param arg_stride   function-argument index of the per-vertex byte stride
 */
static void soa_to_aos( struct x86_function *func,
                        uint arg_aos,
                        uint arg_machine,
                        uint arg_num,
                        uint arg_stride )
{
   struct x86_reg soa_output = x86_make_reg( file_REG32, reg_AX );
   struct x86_reg aos_output = x86_make_reg( file_REG32, reg_BX );
   struct x86_reg num_outputs = x86_make_reg( file_REG32, reg_CX );
   struct x86_reg temp = x86_make_reg( file_REG32, reg_DX );
   int inner_loop;

   /* Save EBX */
   x86_push( func, x86_make_reg( file_REG32, reg_BX ) );

   /* Load loop parameters; soa_output is advanced to machine->Outputs.
    * Note the stride is reloaded into EDX inside the loop instead of
    * being kept in a register here.
    */
   x86_mov( func, aos_output, x86_fn_arg( func, arg_aos ) );
   x86_mov( func, soa_output, x86_fn_arg( func, arg_machine ) );
   x86_lea( func, soa_output,
            x86_make_disp( soa_output,
                           Offset(struct tgsi_exec_machine, Outputs) ) );
   x86_mov( func, num_outputs, x86_fn_arg( func, arg_num ) );

   /* do */
   inner_loop = x86_get_label( func );
   {
      /* Load the xxxx / yyyy / zzzz / wwww channel vectors. */
      sse_movups( func, make_xmm( 0 ), x86_make_disp( soa_output, 0 ) );
      sse_movups( func, make_xmm( 1 ), x86_make_disp( soa_output, 16 ) );
      sse_movups( func, make_xmm( 3 ), x86_make_disp( soa_output, 32 ) );
      sse_movups( func, make_xmm( 4 ), x86_make_disp( soa_output, 48 ) );

      /* Interleave into xy pairs (xmm0/xmm2) and zw pairs (xmm3/xmm5). */
      sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
      sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
      sse_unpcklps( func, make_xmm( 0 ), make_xmm( 1 ) );
      sse_unpckhps( func, make_xmm( 2 ), make_xmm( 1 ) );
      sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) );
      sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) );

      /* Scatter one xyzw vector to each of four vertices, walking by
       * stride; aos_output is saved/restored around the walk.
       */
      x86_mov( func, temp, x86_fn_arg( func, arg_stride ) );
      x86_push( func, aos_output );
      sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
      sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
      x86_add( func, aos_output, temp );
      sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
      sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
      x86_add( func, aos_output, temp );
      sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
      sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
      x86_add( func, aos_output, temp );
      sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
      sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
      x86_pop( func, aos_output );

      /* Advance to next output */
      x86_lea( func, aos_output, x86_make_disp(aos_output, 16) );
      x86_lea( func, soa_output, x86_make_disp(soa_output, 64) );
   }
   /* while --num_outputs */
   x86_dec( func, num_outputs );
   x86_jcc( func, cc_NE, inner_loop );

   /* Restore EBX */
   x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
}
2844
2845 /**
2846 * Translate a TGSI vertex/fragment shader to SSE2 code.
2847 * Slightly different things are done for vertex vs. fragment shaders.
2848 *
2849 * \param tokens the TGSI input shader
2850 * \param func the output SSE code/function
2851 * \param immediates buffer to place immediates, later passed to SSE func
 * \return 1 for success, 0 if translation failed
2853 */
unsigned
tgsi_emit_sse2(
   const struct tgsi_token *tokens,
   struct x86_function *func,
   float (*immediates)[4],
   boolean do_swizzles )
{
   struct tgsi_parse_context parse;
   unsigned ok = 1;
   uint num_immediates = 0;

   util_init_math();

   /* Reset the code-emission cursor to the start of the buffer. */
   func->csr = func->store;

   tgsi_parse_init( &parse, tokens );

   /* Can't just use EDI, EBX without save/restoring them:
    */
   x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
   x86_push( func, x86_make_reg( file_REG32, reg_DI ) );

   /*
    * Different function args for vertex/fragment shaders:
    * (the numbers below are 1-based argument indices of the generated
    * function; they must match the caller's calling convention).
    */
   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
      if (do_swizzles)
         aos_to_soa( func,
                     4,         /* aos_input */
                     1,         /* machine */
                     5,         /* num_inputs */
                     6 );       /* input_stride */
   }

   /* Cache the machine, constant and immediate base pointers in
    * registers for the duration of the generated function.
    */
   x86_mov(
      func,
      get_machine_base(),
      x86_fn_arg( func, 1 ) );
   x86_mov(
      func,
      get_const_base(),
      x86_fn_arg( func, 2 ) );
   x86_mov(
      func,
      get_immediate_base(),
      x86_fn_arg( func, 3 ) );

   /* Fragment shaders additionally receive interpolation coefficients. */
   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
      x86_mov(
	 func,
	 get_coef_base(),
	 x86_fn_arg( func, 4 ) );
   }

   x86_mov(
      func,
      get_sampler_base(),
      x86_make_disp( get_machine_base(),
                     Offset( struct tgsi_exec_machine, Samplers ) ) );


   /* Translate token by token; stop as soon as one fails. */
   while( !tgsi_parse_end_of_tokens( &parse ) && ok ) {
      tgsi_parse_token( &parse );

      switch( parse.FullToken.Token.Type ) {
      case TGSI_TOKEN_TYPE_DECLARATION:
         if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
            emit_declaration(
               func,
               &parse.FullToken.FullDeclaration );
         }
         break;

      case TGSI_TOKEN_TYPE_INSTRUCTION:
         ok = emit_instruction(
            func,
            &parse.FullToken.FullInstruction );

         if (!ok) {
            uint opcode = parse.FullToken.FullInstruction.Instruction.Opcode;
            debug_printf("failed to translate tgsi opcode %d (%s) to SSE (%s)\n",
                         opcode,
                         tgsi_get_opcode_name(opcode),
                         parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX ?
                         "vertex shader" : "fragment shader");
         }

         if (tgsi_check_soa_dependencies(&parse.FullToken.FullInstruction)) {
            uint opcode = parse.FullToken.FullInstruction.Instruction.Opcode;

            /* XXX: we only handle src/dst aliasing in a few opcodes
             * currently.  Need to use an additional temporary to hold
             * the result in the cases where the code is too opaque to
             * fix.
             */
            if (opcode != TGSI_OPCODE_MOV &&
                opcode != TGSI_OPCODE_SWZ) {
               debug_printf("Warning: src/dst aliasing in instruction"
                            " is not handled:\n");
               tgsi_dump_instruction(&parse.FullToken.FullInstruction, 1);
            }
         }
         break;

      case TGSI_TOKEN_TYPE_IMMEDIATE:
         /* simply copy the immediate values into the next immediates[] slot */
         {
            const uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
            uint i;
            assert(size <= 4);
            assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES);
            for( i = 0; i < size; i++ ) {
               immediates[num_immediates][i] =
		  parse.FullToken.FullImmediate.u[i].Float;
            }
#if 0
            debug_printf("SSE FS immediate[%d] = %f %f %f %f\n",
                         num_immediates,
                         immediates[num_immediates][0],
                         immediates[num_immediates][1],
                         immediates[num_immediates][2],
                         immediates[num_immediates][3]);
#endif
            num_immediates++;
         }
         break;

      default:
         ok = 0;
         assert( 0 );
      }
   }

   /* Vertex shaders convert results back to AOS layout; argument
    * indices again must match the caller's convention.
    */
   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
      if (do_swizzles)
         soa_to_aos( func,
		     7, 	/* aos_output */
		     1, 	/* machine */
		     8, 	/* num_outputs */
		     9 );	/* output_stride */
   }

   /* Can't just use EBX, EDI without save/restoring them:
    */
   x86_pop( func, x86_make_reg( file_REG32, reg_DI ) );
   x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );

   emit_ret( func );

   tgsi_parse_free( &parse );

   return ok;
}
3007
3008 #endif /* PIPE_ARCH_X86 */
3009