tgsi: simplify and fix sse KIL implementation
[mesa.git] / src / gallium / auxiliary / tgsi / tgsi_sse2.c
1 /**************************************************************************
2 *
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28 #include "pipe/p_config.h"
29
30 #if defined(PIPE_ARCH_X86)
31
32 #include "util/u_debug.h"
33 #include "pipe/p_shader_tokens.h"
34 #include "util/u_math.h"
35 #include "util/u_memory.h"
36 #if defined(PIPE_ARCH_SSE)
37 #include "util/u_sse.h"
38 #endif
39 #include "tgsi/tgsi_parse.h"
40 #include "tgsi/tgsi_util.h"
41 #include "tgsi_exec.h"
42 #include "tgsi_sse2.h"
43
44 #include "rtasm/rtasm_x86sse.h"
45
46 /* for 1/sqrt()
47 *
48 * This costs about 100fps (close to 10%) in gears:
49 */
50 #define HIGH_PRECISION 1
51
52 #define FAST_MATH 1
53
54
55 #define FOR_EACH_CHANNEL( CHAN )\
56 for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)
57
58 #define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
59 ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))
60
61 #define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
62 if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))
63
64 #define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
65 FOR_EACH_CHANNEL( CHAN )\
66 IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )
67
68 #define CHAN_X 0
69 #define CHAN_Y 1
70 #define CHAN_Z 2
71 #define CHAN_W 3
72
73 #define TEMP_ONE_I TGSI_EXEC_TEMP_ONE_I
74 #define TEMP_ONE_C TGSI_EXEC_TEMP_ONE_C
75
76 #define TEMP_R0 TGSI_EXEC_TEMP_R0
77 #define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
78 #define TEMP_EXEC_MASK_I TGSI_EXEC_MASK_I
79 #define TEMP_EXEC_MASK_C TGSI_EXEC_MASK_C
80
81
82 /**
83 * X86 utility functions.
84 */
85
86 static struct x86_reg
87 make_xmm(
88 unsigned xmm )
89 {
90 return x86_make_reg(
91 file_XMM,
92 (enum x86_reg_name) xmm );
93 }
94
95 /**
96 * X86 register mapping helpers.
97 */
98
99 static struct x86_reg
100 get_const_base( void )
101 {
102 return x86_make_reg(
103 file_REG32,
104 reg_AX );
105 }
106
107 static struct x86_reg
108 get_machine_base( void )
109 {
110 return x86_make_reg(
111 file_REG32,
112 reg_CX );
113 }
114
115 static struct x86_reg
116 get_input_base( void )
117 {
118 return x86_make_disp(
119 get_machine_base(),
120 Offset(struct tgsi_exec_machine, Inputs) );
121 }
122
123 static struct x86_reg
124 get_output_base( void )
125 {
126 return x86_make_disp(
127 get_machine_base(),
128 Offset(struct tgsi_exec_machine, Outputs) );
129 }
130
131 static struct x86_reg
132 get_temp_base( void )
133 {
134 return x86_make_disp(
135 get_machine_base(),
136 Offset(struct tgsi_exec_machine, Temps) );
137 }
138
139 static struct x86_reg
140 get_coef_base( void )
141 {
142 return x86_make_reg(
143 file_REG32,
144 reg_BX );
145 }
146
147 static struct x86_reg
148 get_sampler_base( void )
149 {
150 return x86_make_reg(
151 file_REG32,
152 reg_DI );
153 }
154
155 static struct x86_reg
156 get_immediate_base( void )
157 {
158 return x86_make_reg(
159 file_REG32,
160 reg_DX );
161 }
162
163
164 /**
165 * Data access helpers.
166 */
167
168
169 static struct x86_reg
170 get_immediate(
171 unsigned vec,
172 unsigned chan )
173 {
174 return x86_make_disp(
175 get_immediate_base(),
176 (vec * 4 + chan) * 4 );
177 }
178
179 static struct x86_reg
180 get_const(
181 unsigned vec,
182 unsigned chan )
183 {
184 return x86_make_disp(
185 get_const_base(),
186 (vec * 4 + chan) * 4 );
187 }
188
189 static struct x86_reg
190 get_sampler_ptr(
191 unsigned unit )
192 {
193 return x86_make_disp(
194 get_sampler_base(),
195 unit * sizeof( struct tgsi_sampler * ) );
196 }
197
198 static struct x86_reg
199 get_input(
200 unsigned vec,
201 unsigned chan )
202 {
203 return x86_make_disp(
204 get_input_base(),
205 (vec * 4 + chan) * 16 );
206 }
207
208 static struct x86_reg
209 get_output(
210 unsigned vec,
211 unsigned chan )
212 {
213 return x86_make_disp(
214 get_output_base(),
215 (vec * 4 + chan) * 16 );
216 }
217
218 static struct x86_reg
219 get_temp(
220 unsigned vec,
221 unsigned chan )
222 {
223 return x86_make_disp(
224 get_temp_base(),
225 (vec * 4 + chan) * 16 );
226 }
227
228 static struct x86_reg
229 get_coef(
230 unsigned vec,
231 unsigned chan,
232 unsigned member )
233 {
234 return x86_make_disp(
235 get_coef_base(),
236 ((vec * 3 + member) * 4 + chan) * 4 );
237 }
238
239
/** Emit a function return instruction. */
static void
emit_ret(
   struct x86_function *func )
{
   x86_ret( func );
}
246
247
248 /**
249 * Data fetch helpers.
250 */
251
/**
 * Copy a shader constant to an xmm register.
 * \param xmm the destination xmm register
 * \param vec the src const buffer index
 * \param chan src channel to fetch (X, Y, Z or W)
 * \param indirect non-zero when addressing CONST[ADDR+vec]
 * \param indirectFile register file of the indirect index (must be ADDRESS)
 * \param indirectIndex register index of the indirect index (must be 0)
 */
static void
emit_const(
   struct x86_function *func,
   uint xmm,
   int vec,
   uint chan,
   uint indirect,
   uint indirectFile,
   int indirectIndex )
{
   if (indirect) {
      /* 'vec' is the offset from the address register's value.
       * We're loading CONST[ADDR+vec] into an xmm register.
       */
      struct x86_reg r0 = get_input_base();
      struct x86_reg r1 = get_output_base();
      uint i;

      assert( indirectFile == TGSI_FILE_ADDRESS );
      assert( indirectIndex == 0 );

      /* r0/r1 (eax-relative operands) are repurposed as scratch GP regs
       * here, so preserve their original contents around the loop.
       */
      x86_push( func, r0 );
      x86_push( func, r1 );

      /*
       * Loop over the four pixels or vertices in the quad.
       * Get the value of the address (offset) register for pixel/vertex[i],
       * add it to the src offset and index into the constant buffer.
       * Note that we're working on SOA data.
       * If any of the pixel/vertex execution channels are unused their
       * values will be garbage.  It's very important that we don't use
       * those garbage values as indexes into the constant buffer since
       * that'll cause segfaults.
       * The solution is to bitwise-AND the offset with the execution mask
       * register whose values are either 0 or ~0.
       * The caller must setup the execution mask register to indicate
       * which channels are valid/alive before running the shader.
       * The execution mask will also figure into loops and conditionals
       * someday.
       */
      for (i = 0; i < QUAD_SIZE; i++) {
         /* r1 = address register[i] */
         x86_mov( func, r1, x86_make_disp( get_temp( TEMP_ADDR, CHAN_X ), i * 4 ) );
         /* r0 = execution mask[i] */
         x86_mov( func, r0, x86_make_disp( get_temp( TEMP_EXEC_MASK_I, TEMP_EXEC_MASK_C ), i * 4 ) );
         /* r1 = r1 & r0 */
         x86_and( func, r1, r0 );
         /* r0 = 'vec', the offset */
         x86_lea( func, r0, get_const( vec, chan ) );

         /* Quick hack to multiply r1 by 16 -- need to add SHL to rtasm.
          */
         x86_add( func, r1, r1 );
         x86_add( func, r1, r1 );
         x86_add( func, r1, r1 );
         x86_add( func, r1, r1 );

         x86_add( func, r0, r1 );  /* r0 = r0 + r1 */
         /* load the constant and stash it in TEMP_R0[i] */
         x86_mov( func, r1, x86_deref( r0 ) );
         x86_mov( func, x86_make_disp( get_temp( TEMP_R0, CHAN_X ), i * 4 ), r1 );
      }

      x86_pop( func, r1 );
      x86_pop( func, r0 );

      /* finally, load the four gathered values into the xmm register */
      sse_movaps(
         func,
         make_xmm( xmm ),
         get_temp( TEMP_R0, CHAN_X ) );
   }
   else {
      /* 'vec' is the index into the src register file, such as TEMP[vec] */
      assert( vec >= 0 );

      /* scalar load, then broadcast to all four lanes */
      sse_movss(
         func,
         make_xmm( xmm ),
         get_const( vec, chan ) );
      sse_shufps(
         func,
         make_xmm( xmm ),
         make_xmm( xmm ),
         SHUF( 0, 0, 0, 0 ) );
   }
}
343
344 static void
345 emit_immediate(
346 struct x86_function *func,
347 unsigned xmm,
348 unsigned vec,
349 unsigned chan )
350 {
351 sse_movss(
352 func,
353 make_xmm( xmm ),
354 get_immediate( vec, chan ) );
355 sse_shufps(
356 func,
357 make_xmm( xmm ),
358 make_xmm( xmm ),
359 SHUF( 0, 0, 0, 0 ) );
360 }
361
362
/**
 * Copy a shader input to an xmm register.
 * \param xmm the destination xmm register
 * \param vec the src input attrib
 * \param chan src channel to fetch (X, Y, Z or W)
 */
static void
emit_inputf(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   sse_movups( func, make_xmm( xmm ), get_input( vec, chan ) );
}
381
/**
 * Store an xmm register to a shader output.
 * \param xmm the source xmm register
 * \param vec the dest output attrib
 * \param chan dest channel to store (X, Y, Z or W)
 */
static void
emit_output(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   sse_movups( func, get_output( vec, chan ), make_xmm( xmm ) );
}
400
/**
 * Copy a shader temporary to an xmm register.
 * \param xmm the destination xmm register
 * \param vec the src temp register
 * \param chan src channel to fetch (X, Y, Z or W)
 */
static void
emit_tempf(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   sse_movaps( func, make_xmm( xmm ), get_temp( vec, chan ) );
}
419
420 /**
421 * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
422 * \param xmm the destination xmm register
423 * \param vec the src input/attribute coefficient index
424 * \param chan src channel to fetch (X, Y, Z or W)
425 * \param member 0=a0, 1=dadx, 2=dady
426 */
427 static void
428 emit_coef(
429 struct x86_function *func,
430 unsigned xmm,
431 unsigned vec,
432 unsigned chan,
433 unsigned member )
434 {
435 sse_movss(
436 func,
437 make_xmm( xmm ),
438 get_coef( vec, chan, member ) );
439 sse_shufps(
440 func,
441 make_xmm( xmm ),
442 make_xmm( xmm ),
443 SHUF( 0, 0, 0, 0 ) );
444 }
445
446 /**
447 * Data store helpers.
448 */
449
/** Store an xmm register back to a shader input slot (unaligned). */
static void
emit_inputs(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   sse_movups( func, get_input( vec, chan ), make_xmm( xmm ) );
}
462
/** Store an xmm register to a shader temporary (aligned). */
static void
emit_temps(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   sse_movaps( func, get_temp( vec, chan ), make_xmm( xmm ) );
}
475
476 static void
477 emit_addrs(
478 struct x86_function *func,
479 unsigned xmm,
480 unsigned vec,
481 unsigned chan )
482 {
483 assert( vec == 0 );
484
485 emit_temps(
486 func,
487 xmm,
488 vec + TGSI_EXEC_TEMP_ADDR,
489 chan );
490 }
491
492 /**
493 * Coefficent fetch helpers.
494 */
495
/** Fetch the a0 (constant term) coefficient for an input attribute. */
static void
emit_coef_a0(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   emit_coef( func, xmm, vec, chan, 0 );
}
510
/** Fetch the dadx (x-derivative) coefficient for an input attribute. */
static void
emit_coef_dadx(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   emit_coef( func, xmm, vec, chan, 1 );
}
525
/** Fetch the dady (y-derivative) coefficient for an input attribute. */
static void
emit_coef_dady(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   emit_coef( func, xmm, vec, chan, 2 );
}
540
541 /**
542 * Function call helpers.
543 */
544
/**
 * Emit a call to a cdecl C helper function, preserving machine state.
 *
 * NOTE: In gcc, if the destination uses the SSE intrinsics, then it must be
 * defined with __attribute__((force_align_arg_pointer)), as we do not guarantee
 * that the stack pointer is 16 byte aligned, as expected.
 *
 * \param xmm_save_mask bitmask of xmm registers to save/restore around the call
 * \param arg array of memory operands whose addresses are pushed as arguments
 * \param nr_args number of entries in \p arg
 * \param code address of the function to call
 */
static void
emit_func_call(
   struct x86_function *func,
   unsigned xmm_save_mask,
   const struct x86_reg *arg,
   unsigned nr_args,
   void (PIPE_CDECL *code)() )
{
   struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
   unsigned i, n;

   /* Save the caller-saved GP registers (eax/ecx/edx) that the callee
    * or this stub may clobber.
    */
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_AX) );
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_CX) );
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_DX) );

   /* Store XMM regs to the stack
    */
   /* first pass: count how many regs are selected by the mask */
   for(i = 0, n = 0; i < 8; ++i)
      if(xmm_save_mask & (1 << i))
         ++n;

   x86_sub_imm(
      func,
      x86_make_reg( file_REG32, reg_SP ),
      n*16);

   /* second pass: spill each selected reg to its stack slot */
   for(i = 0, n = 0; i < 8; ++i)
      if(xmm_save_mask & (1 << i)) {
         sse_movups(
            func,
            x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ),
            make_xmm( i ) );
         ++n;
      }

   for (i = 0; i < nr_args; i++) {
      /* Load the address of the buffer we use for passing arguments and
       * receiving results:
       */
      x86_lea(
         func,
         ecx,
         arg[i] );

      /* Push actual function arguments (currently just the pointer to
       * the buffer above), and call the function:
       */
      x86_push( func, ecx );
   }

   x86_mov_reg_imm( func, ecx, (unsigned long) code );
   x86_call( func, ecx );

   /* Pop the arguments (or just add an immediate to esp)
    */
   for (i = 0; i < nr_args; i++) {
      x86_pop(func, ecx );
   }

   /* Pop the saved XMM regs:
    */
   for(i = 0, n = 0; i < 8; ++i)
      if(xmm_save_mask & (1 << i)) {
         sse_movups(
            func,
            make_xmm( i ),
            x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ) );
         ++n;
      }

   /* n again equals the count of saved xmm regs, so this exactly undoes
    * the x86_sub_imm() above.
    */
   x86_add_imm(
      func,
      x86_make_reg( file_REG32, reg_SP ),
      n*16);

   /* Restore GP registers in a reverse order.
    */
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_DX) );
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_CX) );
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_AX) );
}
643
644 static void
645 emit_func_call_dst_src1(
646 struct x86_function *func,
647 unsigned xmm_save,
648 unsigned xmm_dst,
649 unsigned xmm_src0,
650 void (PIPE_CDECL *code)() )
651 {
652 struct x86_reg store = get_temp( TEMP_R0, 0 );
653 unsigned xmm_mask = ((1 << xmm_save) - 1) & ~(1 << xmm_dst);
654
655 /* Store our input parameters (in xmm regs) to the buffer we use
656 * for passing arguments. We will pass a pointer to this buffer as
657 * the actual function argument.
658 */
659 sse_movaps(
660 func,
661 store,
662 make_xmm( xmm_src0 ) );
663
664 emit_func_call( func,
665 xmm_mask,
666 &store,
667 1,
668 code );
669
670 sse_movaps(
671 func,
672 make_xmm( xmm_dst ),
673 store );
674 }
675
676
677 static void
678 emit_func_call_dst_src2(
679 struct x86_function *func,
680 unsigned xmm_save,
681 unsigned xmm_dst,
682 unsigned xmm_src0,
683 unsigned xmm_src1,
684 void (PIPE_CDECL *code)() )
685 {
686 struct x86_reg store = get_temp( TEMP_R0, 0 );
687 unsigned xmm_mask = ((1 << xmm_save) - 1) & ~(1 << xmm_dst);
688
689 /* Store two inputs to parameter buffer.
690 */
691 sse_movaps(
692 func,
693 store,
694 make_xmm( xmm_src0 ) );
695
696 sse_movaps(
697 func,
698 x86_make_disp( store, 4 * sizeof(float) ),
699 make_xmm( xmm_src1 ) );
700
701
702 /* Emit the call
703 */
704 emit_func_call( func,
705 xmm_mask,
706 &store,
707 1,
708 code );
709
710 /* Retrieve the results:
711 */
712 sse_movaps(
713 func,
714 make_xmm( xmm_dst ),
715 store );
716 }
717
718
719
720
721
722 #if defined(PIPE_ARCH_SSE)
723
724 /*
725 * Fast SSE2 implementation of special math functions.
726 */
727
728 #define POLY0(x, c0) _mm_set1_ps(c0)
729 #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
730 #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
731 #define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
732 #define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
733 #define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
734
735 #define EXP_POLY_DEGREE 3
736 #define LOG_POLY_DEGREE 5
737
/**
 * Fast vectorized approximation of 2^x for four floats.
 *
 * See http://www.devmaster.net/forums/showthread.php?p=43580
 */
static INLINE __m128
exp2f4(__m128 x)
{
   __m128i ipart;
   __m128 fpart, expipart, expfpart;

   /* clamp x so the biased-exponent construction below stays in range */
   x = _mm_min_ps(x, _mm_set1_ps( 129.00000f));
   x = _mm_max_ps(x, _mm_set1_ps(-126.99999f));

   /* ipart = int(x - 0.5) */
   ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f)));

   /* fpart = x - ipart */
   fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart));

   /* expipart = (float) (1 << ipart)
    * -- built by placing (ipart + 127) directly into the float exponent field
    */
   expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23));

   /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
#if EXP_POLY_DEGREE == 5
   expfpart = POLY5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f);
#elif EXP_POLY_DEGREE == 4
   expfpart = POLY4(fpart, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 5.2011464e-2f, 1.3534167e-2f);
#elif EXP_POLY_DEGREE == 3
   expfpart = POLY3(fpart, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 7.8024521e-2f);
#elif EXP_POLY_DEGREE == 2
   expfpart = POLY2(fpart, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f);
#else
#error
#endif

   /* 2^x = 2^ipart * 2^fpart */
   return _mm_mul_ps(expipart, expfpart);
}
774
775
/**
 * Fast vectorized approximation of log2(x) for four floats.
 *
 * See http://www.devmaster.net/forums/showthread.php?p=43580
 */
static INLINE __m128
log2f4(__m128 x)
{
   __m128i expmask = _mm_set1_epi32(0x7f800000);   /* IEEE-754 exponent bits */
   __m128i mantmask = _mm_set1_epi32(0x007fffff);  /* IEEE-754 mantissa bits */
   __m128 one = _mm_set1_ps(1.0f);

   __m128i i = _mm_castps_si128(x);

   /* exp = (float) exponent(x) */
   __m128 exp = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, expmask), 23), _mm_set1_epi32(127)));

   /* mant = (float) mantissa(x), normalized into [1, 2[ by OR-ing in 1.0 */
   __m128 mant = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, mantmask)), one);

   __m128 logmant;

   /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
    * These coefficients can be generate with
    * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
    */
#if LOG_POLY_DEGREE == 6
   logmant = POLY5(mant, 3.11578814719469302614f, -3.32419399085241980044f, 2.59883907202499966007f, -1.23152682416275988241f, 0.318212422185251071475f, -0.0344359067839062357313f);
#elif LOG_POLY_DEGREE == 5
   logmant = POLY4(mant, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
   logmant = POLY3(mant, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
   logmant = POLY2(mant, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
#else
#error
#endif

   /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
   logmant = _mm_mul_ps(logmant, _mm_sub_ps(mant, one));

   /* log2(x) = exponent + log2(mantissa) */
   return _mm_add_ps(logmant, exp);
}
817
818
819 static INLINE __m128
820 powf4(__m128 x, __m128 y)
821 {
822 return exp2f4(_mm_mul_ps(log2f4(x), y));
823 }
824
825 #endif /* PIPE_ARCH_SSE */
826
827
828
829 /**
830 * Low-level instruction translators.
831 */
832
833 static void
834 emit_abs(
835 struct x86_function *func,
836 unsigned xmm )
837 {
838 sse_andps(
839 func,
840 make_xmm( xmm ),
841 get_temp(
842 TGSI_EXEC_TEMP_7FFFFFFF_I,
843 TGSI_EXEC_TEMP_7FFFFFFF_C ) );
844 }
845
/** xmm_dst += xmm_src (packed float add). */
static void
emit_add(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   sse_addps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
857
858 static void PIPE_CDECL
859 cos4f(
860 float *store )
861 {
862 store[0] = cosf( store[0] );
863 store[1] = cosf( store[1] );
864 store[2] = cosf( store[2] );
865 store[3] = cosf( store[3] );
866 }
867
868 static void
869 emit_cos(
870 struct x86_function *func,
871 unsigned xmm_save,
872 unsigned xmm_dst )
873 {
874 emit_func_call_dst_src1(
875 func,
876 xmm_save,
877 xmm_dst,
878 xmm_dst,
879 cos4f );
880 }
881
/**
 * Helper called from generated code: store[i] = 2^store[i] for i = 0..3.
 * Uses the vectorized exp2f4() when SSE is available, otherwise a scalar
 * approximation per element.
 */
static void PIPE_CDECL
#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
__attribute__((force_align_arg_pointer))
#endif
ex24f(
   float *store )
{
#if defined(PIPE_ARCH_SSE)
   _mm_store_ps(&store[0], exp2f4( _mm_load_ps(&store[0]) ));
#else
   store[0] = util_fast_exp2( store[0] );
   store[1] = util_fast_exp2( store[1] );
   store[2] = util_fast_exp2( store[2] );
   store[3] = util_fast_exp2( store[3] );
#endif
}
898
899 static void
900 emit_ex2(
901 struct x86_function *func,
902 unsigned xmm_save,
903 unsigned xmm_dst )
904 {
905 emit_func_call_dst_src1(
906 func,
907 xmm_save,
908 xmm_dst,
909 xmm_dst,
910 ex24f );
911 }
912
/** Convert 4 floats to 4 ints with truncation (CVTTPS2DQ), in place. */
static void
emit_f2it(
   struct x86_function *func,
   unsigned xmm )
{
   sse2_cvttps2dq( func, make_xmm( xmm ), make_xmm( xmm ) );
}
923
/** Convert 4 ints to 4 floats (CVTDQ2PS), in place. */
static void
emit_i2f(
   struct x86_function *func,
   unsigned xmm )
{
   sse2_cvtdq2ps( func, make_xmm( xmm ), make_xmm( xmm ) );
}
934
935 static void PIPE_CDECL
936 flr4f(
937 float *store )
938 {
939 store[0] = floorf( store[0] );
940 store[1] = floorf( store[1] );
941 store[2] = floorf( store[2] );
942 store[3] = floorf( store[3] );
943 }
944
945 static void
946 emit_flr(
947 struct x86_function *func,
948 unsigned xmm_save,
949 unsigned xmm_dst )
950 {
951 emit_func_call_dst_src1(
952 func,
953 xmm_save,
954 xmm_dst,
955 xmm_dst,
956 flr4f );
957 }
958
959 static void PIPE_CDECL
960 frc4f(
961 float *store )
962 {
963 store[0] -= floorf( store[0] );
964 store[1] -= floorf( store[1] );
965 store[2] -= floorf( store[2] );
966 store[3] -= floorf( store[3] );
967 }
968
969 static void
970 emit_frc(
971 struct x86_function *func,
972 unsigned xmm_save,
973 unsigned xmm_dst )
974 {
975 emit_func_call_dst_src1(
976 func,
977 xmm_save,
978 xmm_dst,
979 xmm_dst,
980 frc4f );
981 }
982
/**
 * Helper called from generated code: store[i] = log2(store[i]) for i = 0..3.
 * Uses the vectorized log2f4() when SSE is available, otherwise a scalar
 * approximation per element.
 */
static void PIPE_CDECL
#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
__attribute__((force_align_arg_pointer))
#endif
lg24f(
   float *store )
{
#if defined(PIPE_ARCH_SSE)
   _mm_store_ps(&store[0], log2f4( _mm_load_ps(&store[0]) ));
#else
   store[0] = util_fast_log2( store[0] );
   store[1] = util_fast_log2( store[1] );
   store[2] = util_fast_log2( store[2] );
   store[3] = util_fast_log2( store[3] );
#endif
}
999
1000 static void
1001 emit_lg2(
1002 struct x86_function *func,
1003 unsigned xmm_save,
1004 unsigned xmm_dst )
1005 {
1006 emit_func_call_dst_src1(
1007 func,
1008 xmm_save,
1009 xmm_dst,
1010 xmm_dst,
1011 lg24f );
1012 }
1013
/** Copy one xmm register to another (unaligned packed move). */
static void
emit_MOV(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   sse_movups( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
1025
/** xmm_dst *= xmm_src (packed float multiply). */
static void
emit_mul (struct x86_function *func,
          unsigned xmm_dst,
          unsigned xmm_src)
{
   sse_mulps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
1036
1037 static void
1038 emit_neg(
1039 struct x86_function *func,
1040 unsigned xmm )
1041 {
1042 sse_xorps(
1043 func,
1044 make_xmm( xmm ),
1045 get_temp(
1046 TGSI_EXEC_TEMP_80000000_I,
1047 TGSI_EXEC_TEMP_80000000_C ) );
1048 }
1049
/**
 * Helper called from generated code: store[i] = store[i] ^ store[i+4]
 * for i = 0..3 (bases in the first vector, exponents in the second,
 * results written over the bases).
 */
static void PIPE_CDECL
#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
__attribute__((force_align_arg_pointer))
#endif
pow4f(
   float *store )
{
#if defined(PIPE_ARCH_SSE)
   _mm_store_ps(&store[0], powf4( _mm_load_ps(&store[0]), _mm_load_ps(&store[4]) ));
#else
   store[0] = util_fast_pow( store[0], store[4] );
   store[1] = util_fast_pow( store[1], store[5] );
   store[2] = util_fast_pow( store[2], store[6] );
   store[3] = util_fast_pow( store[3], store[7] );
#endif
}
1066
1067 static void
1068 emit_pow(
1069 struct x86_function *func,
1070 unsigned xmm_save,
1071 unsigned xmm_dst,
1072 unsigned xmm_src0,
1073 unsigned xmm_src1 )
1074 {
1075 emit_func_call_dst_src2(
1076 func,
1077 xmm_save,
1078 xmm_dst,
1079 xmm_src0,
1080 xmm_src1,
1081 pow4f );
1082 }
1083
/**
 * xmm_dst = approximate reciprocal of xmm_src (RCPPS).
 *
 * On Intel CPUs at least, this is only accurate to 12 bits -- not
 * good enough.  Need to either emit a proper divide or use the
 * iterative technique described below in emit_rsqrt().
 */
static void
emit_rcp (
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   sse2_rcpps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
1099
1100 static void PIPE_CDECL
1101 rnd4f(
1102 float *store )
1103 {
1104 store[0] = floorf( store[0] + 0.5f );
1105 store[1] = floorf( store[1] + 0.5f );
1106 store[2] = floorf( store[2] + 0.5f );
1107 store[3] = floorf( store[3] + 0.5f );
1108 }
1109
1110 static void
1111 emit_rnd(
1112 struct x86_function *func,
1113 unsigned xmm_save,
1114 unsigned xmm_dst )
1115 {
1116 emit_func_call_dst_src1(
1117 func,
1118 xmm_save,
1119 xmm_dst,
1120 xmm_dst,
1121 rnd4f );
1122 }
1123
/**
 * xmm_dst = 1/sqrt(xmm_src).
 *
 * NOTE(review): the HIGH_PRECISION path clobbers xmm_src and uses xmm2/xmm3
 * as scratch -- see the asserts below; callers must respect that.
 */
static void
emit_rsqrt(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
#if HIGH_PRECISION
   /* Although rsqrtps() and rcpps() are low precision on some/all SSE
    * implementations, it is possible to improve its precision at
    * fairly low cost, using a newton/raphson step, as below:
    *
    * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
    * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
    *
    * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
    */
   {
      struct x86_reg dst = make_xmm( xmm_dst );
      struct x86_reg src = make_xmm( xmm_src );
      struct x86_reg tmp0 = make_xmm( 2 );
      struct x86_reg tmp1 = make_xmm( 3 );

      assert( xmm_dst != xmm_src );
      assert( xmm_dst != 2 && xmm_dst != 3 );
      assert( xmm_src != 2 && xmm_src != 3 );

      sse_movaps(  func, dst,  get_temp( TGSI_EXEC_TEMP_HALF_I, TGSI_EXEC_TEMP_HALF_C ) );   /* dst = 0.5 */
      sse_movaps(  func, tmp0, get_temp( TGSI_EXEC_TEMP_THREE_I, TGSI_EXEC_TEMP_THREE_C ) ); /* tmp0 = 3.0 */
      sse_rsqrtps( func, tmp1, src  );      /* tmp1 = rsqrtps(a) */
      sse_mulps(   func, src,  tmp1 );      /* src = a * r */
      sse_mulps(   func, dst,  tmp1 );      /* dst = 0.5 * r */
      sse_mulps(   func, src,  tmp1 );      /* src = a * r * r */
      sse_subps(   func, tmp0, src  );      /* tmp0 = 3.0 - a*r*r */
      sse_mulps(   func, dst,  tmp0 );      /* dst = 0.5 * r * (3.0 - a*r*r) */
   }
#else
   /* On Intel CPUs at least, this is only accurate to 12 bits -- not
    * good enough.
    */
   sse_rsqrtps(
      func,
      make_xmm( xmm_dst ),
      make_xmm( xmm_src ) );
#endif
}
1169
1170 static void
1171 emit_setsign(
1172 struct x86_function *func,
1173 unsigned xmm )
1174 {
1175 sse_orps(
1176 func,
1177 make_xmm( xmm ),
1178 get_temp(
1179 TGSI_EXEC_TEMP_80000000_I,
1180 TGSI_EXEC_TEMP_80000000_C ) );
1181 }
1182
1183 static void PIPE_CDECL
1184 sgn4f(
1185 float *store )
1186 {
1187 store[0] = store[0] < 0.0f ? -1.0f : store[0] > 0.0f ? 1.0f : 0.0f;
1188 store[1] = store[1] < 0.0f ? -1.0f : store[1] > 0.0f ? 1.0f : 0.0f;
1189 store[2] = store[2] < 0.0f ? -1.0f : store[2] > 0.0f ? 1.0f : 0.0f;
1190 store[3] = store[3] < 0.0f ? -1.0f : store[3] > 0.0f ? 1.0f : 0.0f;
1191 }
1192
1193 static void
1194 emit_sgn(
1195 struct x86_function *func,
1196 unsigned xmm_save,
1197 unsigned xmm_dst )
1198 {
1199 emit_func_call_dst_src1(
1200 func,
1201 xmm_save,
1202 xmm_dst,
1203 xmm_dst,
1204 sgn4f );
1205 }
1206
1207 static void PIPE_CDECL
1208 sin4f(
1209 float *store )
1210 {
1211 store[0] = sinf( store[0] );
1212 store[1] = sinf( store[1] );
1213 store[2] = sinf( store[2] );
1214 store[3] = sinf( store[3] );
1215 }
1216
1217 static void
1218 emit_sin (struct x86_function *func,
1219 unsigned xmm_save,
1220 unsigned xmm_dst)
1221 {
1222 emit_func_call_dst_src1(
1223 func,
1224 xmm_save,
1225 xmm_dst,
1226 xmm_dst,
1227 sin4f );
1228 }
1229
/** xmm_dst -= xmm_src (packed float subtract). */
static void
emit_sub(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   sse_subps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
1241
1242
1243
1244
1245
1246
1247
/**
 * Register fetch: load one channel of a TGSI source operand into an xmm
 * register, honouring the operand's extended swizzle and sign mode.
 *
 * \param xmm the destination xmm register
 * \param reg the full TGSI source register descriptor
 * \param chan_index the destination channel being generated
 */
static void
emit_fetch(
   struct x86_function *func,
   unsigned xmm,
   const struct tgsi_full_src_register *reg,
   const unsigned chan_index )
{
   unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index );

   switch (swizzle) {
   case TGSI_EXTSWIZZLE_X:
   case TGSI_EXTSWIZZLE_Y:
   case TGSI_EXTSWIZZLE_Z:
   case TGSI_EXTSWIZZLE_W:
      /* plain component select: dispatch on the source register file */
      switch (reg->SrcRegister.File) {
      case TGSI_FILE_CONSTANT:
         emit_const(
            func,
            xmm,
            reg->SrcRegister.Index,
            swizzle,
            reg->SrcRegister.Indirect,
            reg->SrcRegisterInd.File,
            reg->SrcRegisterInd.Index );
         break;

      case TGSI_FILE_IMMEDIATE:
         emit_immediate(
            func,
            xmm,
            reg->SrcRegister.Index,
            swizzle );
         break;

      case TGSI_FILE_INPUT:
         emit_inputf(
            func,
            xmm,
            reg->SrcRegister.Index,
            swizzle );
         break;

      case TGSI_FILE_TEMPORARY:
         emit_tempf(
            func,
            xmm,
            reg->SrcRegister.Index,
            swizzle );
         break;

      default:
         assert( 0 );
      }
      break;

   case TGSI_EXTSWIZZLE_ZERO:
      /* constant 0.0 comes from a pre-initialized machine temp */
      emit_tempf(
         func,
         xmm,
         TGSI_EXEC_TEMP_00000000_I,
         TGSI_EXEC_TEMP_00000000_C );
      break;

   case TGSI_EXTSWIZZLE_ONE:
      /* constant 1.0 likewise */
      emit_tempf(
         func,
         xmm,
         TEMP_ONE_I,
         TEMP_ONE_C );
      break;

   default:
      assert( 0 );
   }

   /* apply the operand's sign mode to the fetched value */
   switch( tgsi_util_get_full_src_register_sign_mode( reg, chan_index ) ) {
   case TGSI_UTIL_SIGN_CLEAR:
      emit_abs( func, xmm );
      break;

   case TGSI_UTIL_SIGN_SET:
      emit_setsign( func, xmm );
      break;

   case TGSI_UTIL_SIGN_TOGGLE:
      emit_neg( func, xmm );
      break;

   case TGSI_UTIL_SIGN_KEEP:
      break;
   }
}
1344
1345 #define FETCH( FUNC, INST, XMM, INDEX, CHAN )\
1346 emit_fetch( FUNC, XMM, &(INST).FullSrcRegisters[INDEX], CHAN )
1347
/**
 * Register store: write an xmm register to one channel of a TGSI
 * destination operand, dispatching on the destination register file.
 *
 * NOTE: saturation is not implemented -- TGSI_SAT_ZERO_ONE is silently
 * ignored (commented-out assert below) and TGSI_SAT_MINUS_PLUS_ONE asserts.
 */
static void
emit_store(
   struct x86_function *func,
   unsigned xmm,
   const struct tgsi_full_dst_register *reg,
   const struct tgsi_full_instruction *inst,
   unsigned chan_index )
{
   switch( reg->DstRegister.File ) {
   case TGSI_FILE_OUTPUT:
      emit_output(
         func,
         xmm,
         reg->DstRegister.Index,
         chan_index );
      break;

   case TGSI_FILE_TEMPORARY:
      emit_temps(
         func,
         xmm,
         reg->DstRegister.Index,
         chan_index );
      break;

   case TGSI_FILE_ADDRESS:
      emit_addrs(
         func,
         xmm,
         reg->DstRegister.Index,
         chan_index );
      break;

   default:
      assert( 0 );
   }

   switch( inst->Instruction.Saturate ) {
   case TGSI_SAT_NONE:
      break;

   case TGSI_SAT_ZERO_ONE:
      /* assert( 0 ); */
      break;

   case TGSI_SAT_MINUS_PLUS_ONE:
      assert( 0 );
      break;
   }
}
1402
1403 #define STORE( FUNC, INST, XMM, INDEX, CHAN )\
1404 emit_store( FUNC, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN )
1405
1406
/**
 * Called via emit_func_call() from generated code to sample a texture.
 *
 * On entry store[0..3], store[4..7], store[8..11] hold the s, t, p
 * texcoords for the four pixels of the quad (SOA layout) and store[12]
 * holds the lodbias; on exit the sampled RGBA results overwrite
 * store[0..15].
 *
 * NOTE: lodbias is currently forced to 0.0 -- see the get_samples() call.
 */
static void PIPE_CDECL
fetch_texel( struct tgsi_sampler **sampler,
             float *store )
{
#if 0
   uint j;

   debug_printf("%s sampler: %p (%p) store: %p\n",
                __FUNCTION__,
                sampler, *sampler,
                store );

   debug_printf("lodbias %f\n", store[12]);

   for (j = 0; j < 4; j++)
      debug_printf("sample %d texcoord %f %f\n",
                   j,
                   store[0+j],
                   store[4+j]);
#endif

   {
      float rgba[NUM_CHANNELS][QUAD_SIZE];
      (*sampler)->get_samples(*sampler,
                              &store[0],
                              &store[4],
                              &store[8],
                              0.0f, /*store[12],  lodbias */
                              rgba);

      /* write the SOA results back over the argument buffer */
      memcpy( store, rgba, 16 * sizeof(float));
   }

#if 0
   for (j = 0; j < 4; j++)
      debug_printf("sample %d result %f %f %f %f\n",
                   j,
                   store[0+j],
                   store[4+j],
                   store[8+j],
                   store[12+j]);
#endif
}
1450
1451 /**
1452 * High-level instruction translators.
1453 */
1454
/**
 * Emit code for a texture sampling instruction (TEX/TXB/TXL/TXP).
 *
 * Texcoords (and lodbias) are marshalled through the TEMP_R0 scratch
 * area, then fetch_texel() is called out-of-line to do the actual
 * sampling; results are copied back to the destination channels.
 *
 * \param lodbias    fetch src0.w as a lod bias (TXB/TXL path)
 * \param projected  divide texcoords by src0.w (TXP path)
 */
static void
emit_tex( struct x86_function *func,
          const struct tgsi_full_instruction *inst,
          boolean lodbias,
          boolean projected)
{
   const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index;
   struct x86_reg args[2];
   unsigned count;
   unsigned i;

   /* Number of texcoord components needed, by texture target. */
   switch (inst->InstructionExtTexture.Texture) {
   case TGSI_TEXTURE_1D:
   case TGSI_TEXTURE_SHADOW1D:
      count = 1;
      break;
   case TGSI_TEXTURE_2D:
   case TGSI_TEXTURE_RECT:
   case TGSI_TEXTURE_SHADOW2D:
   case TGSI_TEXTURE_SHADOWRECT:
      count = 2;
      break;
   case TGSI_TEXTURE_3D:
   case TGSI_TEXTURE_CUBE:
      count = 3;
      break;
   default:
      assert(0);
      return;
   }

   /* xmm3 = lodbias (src0.w) if requested, else 0.0 */
   if (lodbias) {
      FETCH( func, *inst, 3, 0, 3 );
   }
   else {
      emit_tempf(
         func,
         3,
         TGSI_EXEC_TEMP_00000000_I,
         TGSI_EXEC_TEMP_00000000_C );

   }

   /* store lodbias whether enabled or not -- fetch_texel currently
    * respects it always.
    * NOTE(review): fetch_texel() above actually passes 0.0 to
    * get_samples(), so the stored value appears unused -- confirm.
    */
   sse_movaps( func,
               get_temp( TEMP_R0, 3 ),
               make_xmm( 3 ) );


   if (projected) {
      /* xmm3 = 1 / src0.w, for perspective division of the coords */
      FETCH( func, *inst, 3, 0, 3 );

      emit_rcp( func, 3, 3 );
   }

   /* Fetch each coord (scaled by 1/w if projecting) into the arg buffer. */
   for (i = 0; i < count; i++) {
      FETCH( func, *inst, i, 0, i );

      if (projected) {
         sse_mulps(
            func,
            make_xmm( i ),
            make_xmm( 3 ) );
      }

      /* Store in the argument buffer:
       */
      sse_movaps(
         func,
         get_temp( TEMP_R0, i ),
         make_xmm( i ) );
   }

   /* Call fetch_texel(sampler_ptr, TEMP_R0) out-of-line. */
   args[0] = get_temp( TEMP_R0, 0 );
   args[1] = get_sampler_ptr( unit );


   emit_func_call( func,
                   0,
                   args,
                   Elements(args),
                   fetch_texel );

   /* If all four channels are enabled, could use a pointer to
    * dst[0].x instead of TEMP_R0 for store?
    */
   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, i ) {

      /* Copy sampled channel i back from the scratch area. */
      sse_movaps(
         func,
         make_xmm( 0 ),
         get_temp( TEMP_R0, i ) );

      STORE( func, *inst, 0, 0, i );
   }
}
1553
1554
/**
 * Emit code for TGSI_OPCODE_KIL: kill fragments whose selected source
 * components are negative.
 *
 * Each *distinct* swizzled component is fetched once (into xmm0..xmmN,
 * in order), compared against zero, and the per-pixel masks are OR'ed
 * together in EAX before being OR'ed into the machine's kill mask.
 */
static void
emit_kil(
   struct x86_function *func,
   const struct tgsi_full_src_register *reg )
{
   unsigned uniquemask;
   unsigned unique_count = 0;
   unsigned chan_index;
   unsigned i;

   /* This mask stores component bits that were already tested. Note that
    * we test if the value is less than zero, so 1.0 and 0.0 need not to be
    * tested. */
   uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE);

   FOR_EACH_CHANNEL( chan_index ) {
      unsigned swizzle;

      /* unswizzle channel */
      swizzle = tgsi_util_get_full_src_register_extswizzle(
         reg,
         chan_index );

      /* check if the component has not been already tested */
      if( !(uniquemask & (1 << swizzle)) ) {
         uniquemask |= 1 << swizzle;

         /* allocate register */
         emit_fetch(
            func,
            unique_count++,
            reg,
            chan_index );
      }
   }

   /* EAX/EDX are used as scratch below; preserve their values. */
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_AX ) );
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_DX ) );

   for (i = 0 ; i < unique_count; i++ ) {
      struct x86_reg dataXMM = make_xmm(i);

      /* xmm[i] = all-ones lanes where the component is < 0 */
      sse_cmpps(
         func,
         dataXMM,
         get_temp(
            TGSI_EXEC_TEMP_00000000_I,
            TGSI_EXEC_TEMP_00000000_C ),
         cc_LessThan );

      /* Accumulate the 4-bit per-pixel sign mask into EAX. */
      if( i == 0 ) {
         sse_movmskps(
            func,
            x86_make_reg( file_REG32, reg_AX ),
            dataXMM );
      }
      else {
         sse_movmskps(
            func,
            x86_make_reg( file_REG32, reg_DX ),
            dataXMM );
         x86_or(
            func,
            x86_make_reg( file_REG32, reg_AX ),
            x86_make_reg( file_REG32, reg_DX ) );
      }
   }

   /* Merge the accumulated mask into the machine's kill mask. */
   x86_or(
      func,
      get_temp(
         TGSI_EXEC_TEMP_KILMASK_I,
         TGSI_EXEC_TEMP_KILMASK_C ),
      x86_make_reg( file_REG32, reg_AX ) );

   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_DX ) );
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_AX ) );
}
1641
1642
/**
 * Emit code for TGSI_OPCODE_KILP (unconditional/predicated kill).
 * Not implemented: emits nothing; the caller returns 0 so the shader
 * falls back to the interpreter.
 */
static void
emit_kilp(
   struct x86_function *func )
{
   /* XXX todo / fix me */
}
1649
1650
1651 static void
1652 emit_setcc(
1653 struct x86_function *func,
1654 struct tgsi_full_instruction *inst,
1655 enum sse_cc cc )
1656 {
1657 unsigned chan_index;
1658
1659 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1660 FETCH( func, *inst, 0, 0, chan_index );
1661 FETCH( func, *inst, 1, 1, chan_index );
1662 sse_cmpps(
1663 func,
1664 make_xmm( 0 ),
1665 make_xmm( 1 ),
1666 cc );
1667 sse_andps(
1668 func,
1669 make_xmm( 0 ),
1670 get_temp(
1671 TEMP_ONE_I,
1672 TEMP_ONE_C ) );
1673 STORE( func, *inst, 0, 0, chan_index );
1674 }
1675 }
1676
1677 static void
1678 emit_cmp(
1679 struct x86_function *func,
1680 struct tgsi_full_instruction *inst )
1681 {
1682 unsigned chan_index;
1683
1684 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1685 FETCH( func, *inst, 0, 0, chan_index );
1686 FETCH( func, *inst, 1, 1, chan_index );
1687 FETCH( func, *inst, 2, 2, chan_index );
1688 sse_cmpps(
1689 func,
1690 make_xmm( 0 ),
1691 get_temp(
1692 TGSI_EXEC_TEMP_00000000_I,
1693 TGSI_EXEC_TEMP_00000000_C ),
1694 cc_LessThan );
1695 sse_andps(
1696 func,
1697 make_xmm( 1 ),
1698 make_xmm( 0 ) );
1699 sse_andnps(
1700 func,
1701 make_xmm( 0 ),
1702 make_xmm( 2 ) );
1703 sse_orps(
1704 func,
1705 make_xmm( 0 ),
1706 make_xmm( 1 ) );
1707 STORE( func, *inst, 0, 0, chan_index );
1708 }
1709 }
1710
1711
1712 /**
1713 * Check if inst src/dest regs use indirect addressing into temporary
1714 * register file.
1715 */
1716 static boolean
1717 indirect_temp_reference(const struct tgsi_full_instruction *inst)
1718 {
1719 uint i;
1720 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1721 const struct tgsi_full_src_register *reg = &inst->FullSrcRegisters[i];
1722 if (reg->SrcRegister.File == TGSI_FILE_TEMPORARY &&
1723 reg->SrcRegister.Indirect)
1724 return TRUE;
1725 }
1726 for (i = 0; i < inst->Instruction.NumDstRegs; i++) {
1727 const struct tgsi_full_dst_register *reg = &inst->FullDstRegisters[i];
1728 if (reg->DstRegister.File == TGSI_FILE_TEMPORARY &&
1729 reg->DstRegister.Indirect)
1730 return TRUE;
1731 }
1732 return FALSE;
1733 }
1734
1735
1736 static int
1737 emit_instruction(
1738 struct x86_function *func,
1739 struct tgsi_full_instruction *inst )
1740 {
1741 unsigned chan_index;
1742
1743 /* we can't handle indirect addressing into temp register file yet */
1744 if (indirect_temp_reference(inst))
1745 return FALSE;
1746
1747 switch (inst->Instruction.Opcode) {
1748 case TGSI_OPCODE_ARL:
1749 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1750 FETCH( func, *inst, 0, 0, chan_index );
1751 emit_flr(func, 0, 0);
1752 emit_f2it( func, 0 );
1753 STORE( func, *inst, 0, 0, chan_index );
1754 }
1755 break;
1756
1757 case TGSI_OPCODE_MOV:
1758 case TGSI_OPCODE_SWZ:
1759 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1760 FETCH( func, *inst, 0, 0, chan_index );
1761 STORE( func, *inst, 0, 0, chan_index );
1762 }
1763 break;
1764
1765 case TGSI_OPCODE_LIT:
1766 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1767 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1768 emit_tempf(
1769 func,
1770 0,
1771 TEMP_ONE_I,
1772 TEMP_ONE_C);
1773 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
1774 STORE( func, *inst, 0, 0, CHAN_X );
1775 }
1776 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1777 STORE( func, *inst, 0, 0, CHAN_W );
1778 }
1779 }
1780 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1781 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1782 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
1783 FETCH( func, *inst, 0, 0, CHAN_X );
1784 sse_maxps(
1785 func,
1786 make_xmm( 0 ),
1787 get_temp(
1788 TGSI_EXEC_TEMP_00000000_I,
1789 TGSI_EXEC_TEMP_00000000_C ) );
1790 STORE( func, *inst, 0, 0, CHAN_Y );
1791 }
1792 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1793 /* XMM[1] = SrcReg[0].yyyy */
1794 FETCH( func, *inst, 1, 0, CHAN_Y );
1795 /* XMM[1] = max(XMM[1], 0) */
1796 sse_maxps(
1797 func,
1798 make_xmm( 1 ),
1799 get_temp(
1800 TGSI_EXEC_TEMP_00000000_I,
1801 TGSI_EXEC_TEMP_00000000_C ) );
1802 /* XMM[2] = SrcReg[0].wwww */
1803 FETCH( func, *inst, 2, 0, CHAN_W );
1804 /* XMM[2] = min(XMM[2], 128.0) */
1805 sse_minps(
1806 func,
1807 make_xmm( 2 ),
1808 get_temp(
1809 TGSI_EXEC_TEMP_128_I,
1810 TGSI_EXEC_TEMP_128_C ) );
1811 /* XMM[2] = max(XMM[2], -128.0) */
1812 sse_maxps(
1813 func,
1814 make_xmm( 2 ),
1815 get_temp(
1816 TGSI_EXEC_TEMP_MINUS_128_I,
1817 TGSI_EXEC_TEMP_MINUS_128_C ) );
1818 emit_pow( func, 3, 1, 1, 2 );
1819 FETCH( func, *inst, 0, 0, CHAN_X );
1820 sse_xorps(
1821 func,
1822 make_xmm( 2 ),
1823 make_xmm( 2 ) );
1824 sse_cmpps(
1825 func,
1826 make_xmm( 2 ),
1827 make_xmm( 0 ),
1828 cc_LessThan );
1829 sse_andps(
1830 func,
1831 make_xmm( 2 ),
1832 make_xmm( 1 ) );
1833 STORE( func, *inst, 2, 0, CHAN_Z );
1834 }
1835 }
1836 break;
1837
1838 case TGSI_OPCODE_RCP:
1839 /* TGSI_OPCODE_RECIP */
1840 FETCH( func, *inst, 0, 0, CHAN_X );
1841 emit_rcp( func, 0, 0 );
1842 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1843 STORE( func, *inst, 0, 0, chan_index );
1844 }
1845 break;
1846
1847 case TGSI_OPCODE_RSQ:
1848 /* TGSI_OPCODE_RECIPSQRT */
1849 FETCH( func, *inst, 0, 0, CHAN_X );
1850 emit_abs( func, 0 );
1851 emit_rsqrt( func, 1, 0 );
1852 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1853 STORE( func, *inst, 1, 0, chan_index );
1854 }
1855 break;
1856
1857 case TGSI_OPCODE_EXP:
1858 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1859 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1860 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1861 FETCH( func, *inst, 0, 0, CHAN_X );
1862 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1863 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1864 emit_MOV( func, 1, 0 );
1865 emit_flr( func, 2, 1 );
1866 /* dst.x = ex2(floor(src.x)) */
1867 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
1868 emit_MOV( func, 2, 1 );
1869 emit_ex2( func, 3, 2 );
1870 STORE( func, *inst, 2, 0, CHAN_X );
1871 }
1872 /* dst.y = src.x - floor(src.x) */
1873 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1874 emit_MOV( func, 2, 0 );
1875 emit_sub( func, 2, 1 );
1876 STORE( func, *inst, 2, 0, CHAN_Y );
1877 }
1878 }
1879 /* dst.z = ex2(src.x) */
1880 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1881 emit_ex2( func, 3, 0 );
1882 STORE( func, *inst, 0, 0, CHAN_Z );
1883 }
1884 }
1885 /* dst.w = 1.0 */
1886 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
1887 emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
1888 STORE( func, *inst, 0, 0, CHAN_W );
1889 }
1890 break;
1891
1892 case TGSI_OPCODE_LOG:
1893 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1894 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1895 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1896 FETCH( func, *inst, 0, 0, CHAN_X );
1897 emit_abs( func, 0 );
1898 emit_MOV( func, 1, 0 );
1899 emit_lg2( func, 2, 1 );
1900 /* dst.z = lg2(abs(src.x)) */
1901 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1902 STORE( func, *inst, 1, 0, CHAN_Z );
1903 }
1904 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1905 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1906 emit_flr( func, 2, 1 );
1907 /* dst.x = floor(lg2(abs(src.x))) */
1908 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
1909 STORE( func, *inst, 1, 0, CHAN_X );
1910 }
1911 /* dst.x = abs(src)/ex2(floor(lg2(abs(src.x)))) */
1912 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1913 emit_ex2( func, 2, 1 );
1914 emit_rcp( func, 1, 1 );
1915 emit_mul( func, 0, 1 );
1916 STORE( func, *inst, 0, 0, CHAN_Y );
1917 }
1918 }
1919 }
1920 /* dst.w = 1.0 */
1921 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
1922 emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
1923 STORE( func, *inst, 0, 0, CHAN_W );
1924 }
1925 break;
1926
1927 case TGSI_OPCODE_MUL:
1928 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1929 FETCH( func, *inst, 0, 0, chan_index );
1930 FETCH( func, *inst, 1, 1, chan_index );
1931 emit_mul( func, 0, 1 );
1932 STORE( func, *inst, 0, 0, chan_index );
1933 }
1934 break;
1935
1936 case TGSI_OPCODE_ADD:
1937 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1938 FETCH( func, *inst, 0, 0, chan_index );
1939 FETCH( func, *inst, 1, 1, chan_index );
1940 emit_add( func, 0, 1 );
1941 STORE( func, *inst, 0, 0, chan_index );
1942 }
1943 break;
1944
1945 case TGSI_OPCODE_DP3:
1946 /* TGSI_OPCODE_DOT3 */
1947 FETCH( func, *inst, 0, 0, CHAN_X );
1948 FETCH( func, *inst, 1, 1, CHAN_X );
1949 emit_mul( func, 0, 1 );
1950 FETCH( func, *inst, 1, 0, CHAN_Y );
1951 FETCH( func, *inst, 2, 1, CHAN_Y );
1952 emit_mul( func, 1, 2 );
1953 emit_add( func, 0, 1 );
1954 FETCH( func, *inst, 1, 0, CHAN_Z );
1955 FETCH( func, *inst, 2, 1, CHAN_Z );
1956 emit_mul( func, 1, 2 );
1957 emit_add( func, 0, 1 );
1958 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1959 STORE( func, *inst, 0, 0, chan_index );
1960 }
1961 break;
1962
1963 case TGSI_OPCODE_DP4:
1964 /* TGSI_OPCODE_DOT4 */
1965 FETCH( func, *inst, 0, 0, CHAN_X );
1966 FETCH( func, *inst, 1, 1, CHAN_X );
1967 emit_mul( func, 0, 1 );
1968 FETCH( func, *inst, 1, 0, CHAN_Y );
1969 FETCH( func, *inst, 2, 1, CHAN_Y );
1970 emit_mul( func, 1, 2 );
1971 emit_add( func, 0, 1 );
1972 FETCH( func, *inst, 1, 0, CHAN_Z );
1973 FETCH( func, *inst, 2, 1, CHAN_Z );
1974 emit_mul(func, 1, 2 );
1975 emit_add(func, 0, 1 );
1976 FETCH( func, *inst, 1, 0, CHAN_W );
1977 FETCH( func, *inst, 2, 1, CHAN_W );
1978 emit_mul( func, 1, 2 );
1979 emit_add( func, 0, 1 );
1980 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1981 STORE( func, *inst, 0, 0, chan_index );
1982 }
1983 break;
1984
1985 case TGSI_OPCODE_DST:
1986 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
1987 emit_tempf(
1988 func,
1989 0,
1990 TEMP_ONE_I,
1991 TEMP_ONE_C );
1992 STORE( func, *inst, 0, 0, CHAN_X );
1993 }
1994 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
1995 FETCH( func, *inst, 0, 0, CHAN_Y );
1996 FETCH( func, *inst, 1, 1, CHAN_Y );
1997 emit_mul( func, 0, 1 );
1998 STORE( func, *inst, 0, 0, CHAN_Y );
1999 }
2000 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2001 FETCH( func, *inst, 0, 0, CHAN_Z );
2002 STORE( func, *inst, 0, 0, CHAN_Z );
2003 }
2004 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2005 FETCH( func, *inst, 0, 1, CHAN_W );
2006 STORE( func, *inst, 0, 0, CHAN_W );
2007 }
2008 break;
2009
2010 case TGSI_OPCODE_MIN:
2011 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2012 FETCH( func, *inst, 0, 0, chan_index );
2013 FETCH( func, *inst, 1, 1, chan_index );
2014 sse_minps(
2015 func,
2016 make_xmm( 0 ),
2017 make_xmm( 1 ) );
2018 STORE( func, *inst, 0, 0, chan_index );
2019 }
2020 break;
2021
2022 case TGSI_OPCODE_MAX:
2023 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2024 FETCH( func, *inst, 0, 0, chan_index );
2025 FETCH( func, *inst, 1, 1, chan_index );
2026 sse_maxps(
2027 func,
2028 make_xmm( 0 ),
2029 make_xmm( 1 ) );
2030 STORE( func, *inst, 0, 0, chan_index );
2031 }
2032 break;
2033
2034 case TGSI_OPCODE_SLT:
2035 /* TGSI_OPCODE_SETLT */
2036 emit_setcc( func, inst, cc_LessThan );
2037 break;
2038
2039 case TGSI_OPCODE_SGE:
2040 /* TGSI_OPCODE_SETGE */
2041 emit_setcc( func, inst, cc_NotLessThan );
2042 break;
2043
2044 case TGSI_OPCODE_MAD:
2045 /* TGSI_OPCODE_MADD */
2046 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2047 FETCH( func, *inst, 0, 0, chan_index );
2048 FETCH( func, *inst, 1, 1, chan_index );
2049 FETCH( func, *inst, 2, 2, chan_index );
2050 emit_mul( func, 0, 1 );
2051 emit_add( func, 0, 2 );
2052 STORE( func, *inst, 0, 0, chan_index );
2053 }
2054 break;
2055
2056 case TGSI_OPCODE_SUB:
2057 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2058 FETCH( func, *inst, 0, 0, chan_index );
2059 FETCH( func, *inst, 1, 1, chan_index );
2060 emit_sub( func, 0, 1 );
2061 STORE( func, *inst, 0, 0, chan_index );
2062 }
2063 break;
2064
2065 case TGSI_OPCODE_LERP:
2066 /* TGSI_OPCODE_LRP */
2067 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2068 FETCH( func, *inst, 0, 0, chan_index );
2069 FETCH( func, *inst, 1, 1, chan_index );
2070 FETCH( func, *inst, 2, 2, chan_index );
2071 emit_sub( func, 1, 2 );
2072 emit_mul( func, 0, 1 );
2073 emit_add( func, 0, 2 );
2074 STORE( func, *inst, 0, 0, chan_index );
2075 }
2076 break;
2077
2078 case TGSI_OPCODE_CND:
2079 return 0;
2080 break;
2081
2082 case TGSI_OPCODE_CND0:
2083 return 0;
2084 break;
2085
2086 case TGSI_OPCODE_DOT2ADD:
2087 /* TGSI_OPCODE_DP2A */
2088 FETCH( func, *inst, 0, 0, CHAN_X ); /* xmm0 = src[0].x */
2089 FETCH( func, *inst, 1, 1, CHAN_X ); /* xmm1 = src[1].x */
2090 emit_mul( func, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
2091 FETCH( func, *inst, 1, 0, CHAN_Y ); /* xmm1 = src[0].y */
2092 FETCH( func, *inst, 2, 1, CHAN_Y ); /* xmm2 = src[1].y */
2093 emit_mul( func, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
2094 emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2095 FETCH( func, *inst, 1, 2, CHAN_X ); /* xmm1 = src[2].x */
2096 emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2097 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2098 STORE( func, *inst, 0, 0, chan_index ); /* dest[ch] = xmm0 */
2099 }
2100 break;
2101
2102 case TGSI_OPCODE_INDEX:
2103 return 0;
2104 break;
2105
2106 case TGSI_OPCODE_NEGATE:
2107 return 0;
2108 break;
2109
2110 case TGSI_OPCODE_FRAC:
2111 /* TGSI_OPCODE_FRC */
2112 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2113 FETCH( func, *inst, 0, 0, chan_index );
2114 emit_frc( func, 0, 0 );
2115 STORE( func, *inst, 0, 0, chan_index );
2116 }
2117 break;
2118
2119 case TGSI_OPCODE_CLAMP:
2120 return 0;
2121 break;
2122
2123 case TGSI_OPCODE_FLOOR:
2124 /* TGSI_OPCODE_FLR */
2125 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2126 FETCH( func, *inst, 0, 0, chan_index );
2127 emit_flr( func, 0, 0 );
2128 STORE( func, *inst, 0, 0, chan_index );
2129 }
2130 break;
2131
2132 case TGSI_OPCODE_ROUND:
2133 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2134 FETCH( func, *inst, 0, 0, chan_index );
2135 emit_rnd( func, 0, 0 );
2136 STORE( func, *inst, 0, 0, chan_index );
2137 }
2138 break;
2139
2140 case TGSI_OPCODE_EXPBASE2:
2141 /* TGSI_OPCODE_EX2 */
2142 FETCH( func, *inst, 0, 0, CHAN_X );
2143 emit_ex2( func, 0, 0 );
2144 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2145 STORE( func, *inst, 0, 0, chan_index );
2146 }
2147 break;
2148
2149 case TGSI_OPCODE_LOGBASE2:
2150 /* TGSI_OPCODE_LG2 */
2151 FETCH( func, *inst, 0, 0, CHAN_X );
2152 emit_lg2( func, 0, 0 );
2153 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2154 STORE( func, *inst, 0, 0, chan_index );
2155 }
2156 break;
2157
2158 case TGSI_OPCODE_POWER:
2159 /* TGSI_OPCODE_POW */
2160 FETCH( func, *inst, 0, 0, CHAN_X );
2161 FETCH( func, *inst, 1, 1, CHAN_X );
2162 emit_pow( func, 0, 0, 0, 1 );
2163 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2164 STORE( func, *inst, 0, 0, chan_index );
2165 }
2166 break;
2167
2168 case TGSI_OPCODE_CROSSPRODUCT:
2169 /* TGSI_OPCODE_XPD */
2170 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
2171 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
2172 FETCH( func, *inst, 1, 1, CHAN_Z );
2173 FETCH( func, *inst, 3, 0, CHAN_Z );
2174 }
2175 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
2176 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
2177 FETCH( func, *inst, 0, 0, CHAN_Y );
2178 FETCH( func, *inst, 4, 1, CHAN_Y );
2179 }
2180 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
2181 emit_MOV( func, 2, 0 );
2182 emit_mul( func, 2, 1 );
2183 emit_MOV( func, 5, 3 );
2184 emit_mul( func, 5, 4 );
2185 emit_sub( func, 2, 5 );
2186 STORE( func, *inst, 2, 0, CHAN_X );
2187 }
2188 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
2189 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
2190 FETCH( func, *inst, 2, 1, CHAN_X );
2191 FETCH( func, *inst, 5, 0, CHAN_X );
2192 }
2193 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2194 emit_mul( func, 3, 2 );
2195 emit_mul( func, 1, 5 );
2196 emit_sub( func, 3, 1 );
2197 STORE( func, *inst, 3, 0, CHAN_Y );
2198 }
2199 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2200 emit_mul( func, 5, 4 );
2201 emit_mul( func, 0, 2 );
2202 emit_sub( func, 5, 0 );
2203 STORE( func, *inst, 5, 0, CHAN_Z );
2204 }
2205 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2206 emit_tempf(
2207 func,
2208 0,
2209 TEMP_ONE_I,
2210 TEMP_ONE_C );
2211 STORE( func, *inst, 0, 0, CHAN_W );
2212 }
2213 break;
2214
2215 case TGSI_OPCODE_MULTIPLYMATRIX:
2216 return 0;
2217 break;
2218
2219 case TGSI_OPCODE_ABS:
2220 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2221 FETCH( func, *inst, 0, 0, chan_index );
2222 emit_abs( func, 0) ;
2223
2224 STORE( func, *inst, 0, 0, chan_index );
2225 }
2226 break;
2227
2228 case TGSI_OPCODE_RCC:
2229 return 0;
2230 break;
2231
2232 case TGSI_OPCODE_DPH:
2233 FETCH( func, *inst, 0, 0, CHAN_X );
2234 FETCH( func, *inst, 1, 1, CHAN_X );
2235 emit_mul( func, 0, 1 );
2236 FETCH( func, *inst, 1, 0, CHAN_Y );
2237 FETCH( func, *inst, 2, 1, CHAN_Y );
2238 emit_mul( func, 1, 2 );
2239 emit_add( func, 0, 1 );
2240 FETCH( func, *inst, 1, 0, CHAN_Z );
2241 FETCH( func, *inst, 2, 1, CHAN_Z );
2242 emit_mul( func, 1, 2 );
2243 emit_add( func, 0, 1 );
2244 FETCH( func, *inst, 1, 1, CHAN_W );
2245 emit_add( func, 0, 1 );
2246 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2247 STORE( func, *inst, 0, 0, chan_index );
2248 }
2249 break;
2250
2251 case TGSI_OPCODE_COS:
2252 FETCH( func, *inst, 0, 0, CHAN_X );
2253 emit_cos( func, 0, 0 );
2254 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2255 STORE( func, *inst, 0, 0, chan_index );
2256 }
2257 break;
2258
2259 case TGSI_OPCODE_DDX:
2260 return 0;
2261 break;
2262
2263 case TGSI_OPCODE_DDY:
2264 return 0;
2265 break;
2266
2267 case TGSI_OPCODE_KILP:
2268 /* predicated kill */
2269 emit_kilp( func );
2270 return 0; /* XXX fix me */
2271 break;
2272
2273 case TGSI_OPCODE_KIL:
2274 /* conditional kill */
2275 emit_kil( func, &inst->FullSrcRegisters[0] );
2276 break;
2277
2278 case TGSI_OPCODE_PK2H:
2279 return 0;
2280 break;
2281
2282 case TGSI_OPCODE_PK2US:
2283 return 0;
2284 break;
2285
2286 case TGSI_OPCODE_PK4B:
2287 return 0;
2288 break;
2289
2290 case TGSI_OPCODE_PK4UB:
2291 return 0;
2292 break;
2293
2294 case TGSI_OPCODE_RFL:
2295 return 0;
2296 break;
2297
2298 case TGSI_OPCODE_SEQ:
2299 return 0;
2300 break;
2301
2302 case TGSI_OPCODE_SFL:
2303 return 0;
2304 break;
2305
2306 case TGSI_OPCODE_SGT:
2307 return 0;
2308 break;
2309
2310 case TGSI_OPCODE_SIN:
2311 FETCH( func, *inst, 0, 0, CHAN_X );
2312 emit_sin( func, 0, 0 );
2313 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2314 STORE( func, *inst, 0, 0, chan_index );
2315 }
2316 break;
2317
2318 case TGSI_OPCODE_SLE:
2319 return 0;
2320 break;
2321
2322 case TGSI_OPCODE_SNE:
2323 return 0;
2324 break;
2325
2326 case TGSI_OPCODE_STR:
2327 return 0;
2328 break;
2329
2330 case TGSI_OPCODE_TEX:
2331 emit_tex( func, inst, FALSE, FALSE );
2332 break;
2333
2334 case TGSI_OPCODE_TXD:
2335 return 0;
2336 break;
2337
2338 case TGSI_OPCODE_UP2H:
2339 return 0;
2340 break;
2341
2342 case TGSI_OPCODE_UP2US:
2343 return 0;
2344 break;
2345
2346 case TGSI_OPCODE_UP4B:
2347 return 0;
2348 break;
2349
2350 case TGSI_OPCODE_UP4UB:
2351 return 0;
2352 break;
2353
2354 case TGSI_OPCODE_X2D:
2355 return 0;
2356 break;
2357
2358 case TGSI_OPCODE_ARA:
2359 return 0;
2360 break;
2361
2362 case TGSI_OPCODE_ARR:
2363 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2364 FETCH( func, *inst, 0, 0, chan_index );
2365 emit_rnd( func, 0, 0 );
2366 emit_f2it( func, 0 );
2367 STORE( func, *inst, 0, 0, chan_index );
2368 }
2369 break;
2370
2371 case TGSI_OPCODE_BRA:
2372 return 0;
2373 break;
2374
2375 case TGSI_OPCODE_CAL:
2376 return 0;
2377 break;
2378
2379 case TGSI_OPCODE_RET:
2380 emit_ret( func );
2381 break;
2382
2383 case TGSI_OPCODE_END:
2384 break;
2385
2386 case TGSI_OPCODE_SSG:
2387 /* TGSI_OPCODE_SGN */
2388 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2389 FETCH( func, *inst, 0, 0, chan_index );
2390 emit_sgn( func, 0, 0 );
2391 STORE( func, *inst, 0, 0, chan_index );
2392 }
2393 break;
2394
2395 case TGSI_OPCODE_CMP:
2396 emit_cmp (func, inst);
2397 break;
2398
2399 case TGSI_OPCODE_SCS:
2400 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
2401 FETCH( func, *inst, 0, 0, CHAN_X );
2402 emit_cos( func, 0, 0 );
2403 STORE( func, *inst, 0, 0, CHAN_X );
2404 }
2405 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2406 FETCH( func, *inst, 0, 0, CHAN_X );
2407 emit_sin( func, 0, 0 );
2408 STORE( func, *inst, 0, 0, CHAN_Y );
2409 }
2410 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2411 emit_tempf(
2412 func,
2413 0,
2414 TGSI_EXEC_TEMP_00000000_I,
2415 TGSI_EXEC_TEMP_00000000_C );
2416 STORE( func, *inst, 0, 0, CHAN_Z );
2417 }
2418 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2419 emit_tempf(
2420 func,
2421 0,
2422 TEMP_ONE_I,
2423 TEMP_ONE_C );
2424 STORE( func, *inst, 0, 0, CHAN_W );
2425 }
2426 break;
2427
2428 case TGSI_OPCODE_TXB:
2429 emit_tex( func, inst, TRUE, FALSE );
2430 break;
2431
2432 case TGSI_OPCODE_NRM:
2433 /* fall-through */
2434 case TGSI_OPCODE_NRM4:
2435 /* 3 or 4-component normalization */
2436 {
2437 uint dims = (inst->Instruction.Opcode == TGSI_OPCODE_NRM) ? 3 : 4;
2438
2439 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) ||
2440 IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y) ||
2441 IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z) ||
2442 (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 4)) {
2443
2444 /* NOTE: Cannot use xmm regs 2/3 here (see emit_rsqrt() above). */
2445
2446 /* xmm4 = src.x */
2447 /* xmm0 = src.x * src.x */
2448 FETCH(func, *inst, 0, 0, CHAN_X);
2449 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
2450 emit_MOV(func, 4, 0);
2451 }
2452 emit_mul(func, 0, 0);
2453
2454 /* xmm5 = src.y */
2455 /* xmm0 = xmm0 + src.y * src.y */
2456 FETCH(func, *inst, 1, 0, CHAN_Y);
2457 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2458 emit_MOV(func, 5, 1);
2459 }
2460 emit_mul(func, 1, 1);
2461 emit_add(func, 0, 1);
2462
2463 /* xmm6 = src.z */
2464 /* xmm0 = xmm0 + src.z * src.z */
2465 FETCH(func, *inst, 1, 0, CHAN_Z);
2466 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2467 emit_MOV(func, 6, 1);
2468 }
2469 emit_mul(func, 1, 1);
2470 emit_add(func, 0, 1);
2471
2472 if (dims == 4) {
2473 /* xmm7 = src.w */
2474 /* xmm0 = xmm0 + src.w * src.w */
2475 FETCH(func, *inst, 1, 0, CHAN_W);
2476 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W)) {
2477 emit_MOV(func, 7, 1);
2478 }
2479 emit_mul(func, 1, 1);
2480 emit_add(func, 0, 1);
2481 }
2482
2483 /* xmm1 = 1 / sqrt(xmm0) */
2484 emit_rsqrt(func, 1, 0);
2485
2486 /* dst.x = xmm1 * src.x */
2487 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
2488 emit_mul(func, 4, 1);
2489 STORE(func, *inst, 4, 0, CHAN_X);
2490 }
2491
2492 /* dst.y = xmm1 * src.y */
2493 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2494 emit_mul(func, 5, 1);
2495 STORE(func, *inst, 5, 0, CHAN_Y);
2496 }
2497
2498 /* dst.z = xmm1 * src.z */
2499 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2500 emit_mul(func, 6, 1);
2501 STORE(func, *inst, 6, 0, CHAN_Z);
2502 }
2503
2504 /* dst.w = xmm1 * src.w */
2505 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) && dims == 4) {
2506 emit_mul(func, 7, 1);
2507 STORE(func, *inst, 7, 0, CHAN_W);
2508 }
2509 }
2510
2511 /* dst0.w = 1.0 */
2512 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 3) {
2513 emit_tempf(func, 0, TEMP_ONE_I, TEMP_ONE_C);
2514 STORE(func, *inst, 0, 0, CHAN_W);
2515 }
2516 }
2517 break;
2518
2519 case TGSI_OPCODE_DIV:
2520 return 0;
2521 break;
2522
2523 case TGSI_OPCODE_DP2:
2524 FETCH( func, *inst, 0, 0, CHAN_X ); /* xmm0 = src[0].x */
2525 FETCH( func, *inst, 1, 1, CHAN_X ); /* xmm1 = src[1].x */
2526 emit_mul( func, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
2527 FETCH( func, *inst, 1, 0, CHAN_Y ); /* xmm1 = src[0].y */
2528 FETCH( func, *inst, 2, 1, CHAN_Y ); /* xmm2 = src[1].y */
2529 emit_mul( func, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
2530 emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2531 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2532 STORE( func, *inst, 0, 0, chan_index ); /* dest[ch] = xmm0 */
2533 }
2534 break;
2535
2536 case TGSI_OPCODE_TXL:
2537 emit_tex( func, inst, TRUE, FALSE );
2538 break;
2539
2540 case TGSI_OPCODE_TXP:
2541 emit_tex( func, inst, FALSE, TRUE );
2542 break;
2543
2544 case TGSI_OPCODE_BRK:
2545 return 0;
2546 break;
2547
2548 case TGSI_OPCODE_IF:
2549 return 0;
2550 break;
2551
2552 case TGSI_OPCODE_LOOP:
2553 return 0;
2554 break;
2555
2556 case TGSI_OPCODE_REP:
2557 return 0;
2558 break;
2559
2560 case TGSI_OPCODE_ELSE:
2561 return 0;
2562 break;
2563
2564 case TGSI_OPCODE_ENDIF:
2565 return 0;
2566 break;
2567
2568 case TGSI_OPCODE_ENDLOOP:
2569 return 0;
2570 break;
2571
2572 case TGSI_OPCODE_ENDREP:
2573 return 0;
2574 break;
2575
2576 case TGSI_OPCODE_PUSHA:
2577 return 0;
2578 break;
2579
2580 case TGSI_OPCODE_POPA:
2581 return 0;
2582 break;
2583
2584 case TGSI_OPCODE_CEIL:
2585 return 0;
2586 break;
2587
2588 case TGSI_OPCODE_I2F:
2589 return 0;
2590 break;
2591
2592 case TGSI_OPCODE_NOT:
2593 return 0;
2594 break;
2595
2596 case TGSI_OPCODE_TRUNC:
2597 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2598 FETCH( func, *inst, 0, 0, chan_index );
2599 emit_f2it( func, 0 );
2600 emit_i2f( func, 0 );
2601 STORE( func, *inst, 0, 0, chan_index );
2602 }
2603 break;
2604
2605 case TGSI_OPCODE_SHL:
2606 return 0;
2607 break;
2608
2609 case TGSI_OPCODE_SHR:
2610 return 0;
2611 break;
2612
2613 case TGSI_OPCODE_AND:
2614 return 0;
2615 break;
2616
2617 case TGSI_OPCODE_OR:
2618 return 0;
2619 break;
2620
2621 case TGSI_OPCODE_MOD:
2622 return 0;
2623 break;
2624
2625 case TGSI_OPCODE_XOR:
2626 return 0;
2627 break;
2628
2629 case TGSI_OPCODE_SAD:
2630 return 0;
2631 break;
2632
2633 case TGSI_OPCODE_TXF:
2634 return 0;
2635 break;
2636
2637 case TGSI_OPCODE_TXQ:
2638 return 0;
2639 break;
2640
2641 case TGSI_OPCODE_CONT:
2642 return 0;
2643 break;
2644
2645 case TGSI_OPCODE_EMIT:
2646 return 0;
2647 break;
2648
2649 case TGSI_OPCODE_ENDPRIM:
2650 return 0;
2651 break;
2652
2653 default:
2654 return 0;
2655 }
2656
2657 return 1;
2658 }
2659
/**
 * Emit code for a TGSI declaration.
 *
 * Only TGSI_FILE_INPUT declarations generate code here: for each
 * declared input channel, interpolation code (constant, linear, or
 * perspective) is emitted that evaluates the plane equation
 * a0 + x*dadx + y*dady (divided by w for perspective) and writes the
 * result into the machine's input array.
 */
static void
emit_declaration(
   struct x86_function *func,
   struct tgsi_full_declaration *decl )
{
   if( decl->Declaration.File == TGSI_FILE_INPUT ) {
      unsigned first, last, mask;
      unsigned i, j;

      first = decl->DeclarationRange.First;
      last = decl->DeclarationRange.Last;
      mask = decl->Declaration.UsageMask;

      for( i = first; i <= last; i++ ) {
         for( j = 0; j < NUM_CHANNELS; j++ ) {
            if( mask & (1 << j) ) {
               switch( decl->Declaration.Interpolate ) {
               case TGSI_INTERPOLATE_CONSTANT:
                  /* input = a0 */
                  emit_coef_a0( func, 0, i, j );
                  emit_inputs( func, 0, i, j );
                  break;

               case TGSI_INTERPOLATE_LINEAR:
                  /* input = a0 + x*dadx + y*dady
                   * NOTE(review): machine temp 0 appears to hold the
                   * fragment position (x in .x, y in .y, w in .w) --
                   * confirm against the machine setup code. */
                  emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
                  emit_coef_dadx( func, 1, i, j );
                  emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
                  emit_coef_dady( func, 3, i, j );
                  emit_mul( func, 0, 1 );    /* x * dadx */
                  emit_coef_a0( func, 4, i, j );
                  emit_mul( func, 2, 3 );    /* y * dady */
                  emit_add( func, 0, 4 );    /* x * dadx + a0 */
                  emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
                  emit_inputs( func, 0, i, j );
                  break;

               case TGSI_INTERPOLATE_PERSPECTIVE:
                  /* input = (a0 + x*dadx + y*dady) / w */
                  emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
                  emit_coef_dadx( func, 1, i, j );
                  emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
                  emit_coef_dady( func, 3, i, j );
                  emit_mul( func, 0, 1 );    /* x * dadx */
                  emit_tempf( func, 4, 0, TGSI_SWIZZLE_W );
                  emit_coef_a0( func, 5, i, j );
                  emit_rcp( func, 4, 4 );    /* 1.0 / w */
                  emit_mul( func, 2, 3 );    /* y * dady */
                  emit_add( func, 0, 5 );    /* x * dadx + a0 */
                  emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
                  emit_mul( func, 0, 4 );    /* (x * dadx + y * dady + a0) / w */
                  emit_inputs( func, 0, i, j );
                  break;

               default:
                  assert( 0 );
                  break;
               }
            }
         }
      }
   }
}
2720
2721 static void aos_to_soa( struct x86_function *func,
2722 uint arg_aos,
2723 uint arg_machine,
2724 uint arg_num,
2725 uint arg_stride )
2726 {
2727 struct x86_reg soa_input = x86_make_reg( file_REG32, reg_AX );
2728 struct x86_reg aos_input = x86_make_reg( file_REG32, reg_BX );
2729 struct x86_reg num_inputs = x86_make_reg( file_REG32, reg_CX );
2730 struct x86_reg stride = x86_make_reg( file_REG32, reg_DX );
2731 int inner_loop;
2732
2733
2734 /* Save EBX */
2735 x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
2736
2737 x86_mov( func, aos_input, x86_fn_arg( func, arg_aos ) );
2738 x86_mov( func, soa_input, x86_fn_arg( func, arg_machine ) );
2739 x86_lea( func, soa_input,
2740 x86_make_disp( soa_input,
2741 Offset(struct tgsi_exec_machine, Inputs) ) );
2742 x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) );
2743 x86_mov( func, stride, x86_fn_arg( func, arg_stride ) );
2744
2745 /* do */
2746 inner_loop = x86_get_label( func );
2747 {
2748 x86_push( func, aos_input );
2749 sse_movlps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
2750 sse_movlps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
2751 x86_add( func, aos_input, stride );
2752 sse_movhps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
2753 sse_movhps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
2754 x86_add( func, aos_input, stride );
2755 sse_movlps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
2756 sse_movlps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
2757 x86_add( func, aos_input, stride );
2758 sse_movhps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
2759 sse_movhps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
2760 x86_pop( func, aos_input );
2761
2762 sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
2763 sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
2764 sse_shufps( func, make_xmm( 0 ), make_xmm( 1 ), 0x88 );
2765 sse_shufps( func, make_xmm( 2 ), make_xmm( 1 ), 0xdd );
2766 sse_shufps( func, make_xmm( 3 ), make_xmm( 4 ), 0x88 );
2767 sse_shufps( func, make_xmm( 5 ), make_xmm( 4 ), 0xdd );
2768
2769 sse_movups( func, x86_make_disp( soa_input, 0 ), make_xmm( 0 ) );
2770 sse_movups( func, x86_make_disp( soa_input, 16 ), make_xmm( 2 ) );
2771 sse_movups( func, x86_make_disp( soa_input, 32 ), make_xmm( 3 ) );
2772 sse_movups( func, x86_make_disp( soa_input, 48 ), make_xmm( 5 ) );
2773
2774 /* Advance to next input */
2775 x86_lea( func, aos_input, x86_make_disp(aos_input, 16) );
2776 x86_lea( func, soa_input, x86_make_disp(soa_input, 64) );
2777 }
2778 /* while --num_inputs */
2779 x86_dec( func, num_inputs );
2780 x86_jcc( func, cc_NE, inner_loop );
2781
2782 /* Restore EBX */
2783 x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
2784 }
2785
2786 static void soa_to_aos( struct x86_function *func,
2787 uint arg_aos,
2788 uint arg_machine,
2789 uint arg_num,
2790 uint arg_stride )
2791 {
2792 struct x86_reg soa_output = x86_make_reg( file_REG32, reg_AX );
2793 struct x86_reg aos_output = x86_make_reg( file_REG32, reg_BX );
2794 struct x86_reg num_outputs = x86_make_reg( file_REG32, reg_CX );
2795 struct x86_reg temp = x86_make_reg( file_REG32, reg_DX );
2796 int inner_loop;
2797
2798 /* Save EBX */
2799 x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
2800
2801 x86_mov( func, aos_output, x86_fn_arg( func, arg_aos ) );
2802 x86_mov( func, soa_output, x86_fn_arg( func, arg_machine ) );
2803 x86_lea( func, soa_output,
2804 x86_make_disp( soa_output,
2805 Offset(struct tgsi_exec_machine, Outputs) ) );
2806 x86_mov( func, num_outputs, x86_fn_arg( func, arg_num ) );
2807
2808 /* do */
2809 inner_loop = x86_get_label( func );
2810 {
2811 sse_movups( func, make_xmm( 0 ), x86_make_disp( soa_output, 0 ) );
2812 sse_movups( func, make_xmm( 1 ), x86_make_disp( soa_output, 16 ) );
2813 sse_movups( func, make_xmm( 3 ), x86_make_disp( soa_output, 32 ) );
2814 sse_movups( func, make_xmm( 4 ), x86_make_disp( soa_output, 48 ) );
2815
2816 sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
2817 sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
2818 sse_unpcklps( func, make_xmm( 0 ), make_xmm( 1 ) );
2819 sse_unpckhps( func, make_xmm( 2 ), make_xmm( 1 ) );
2820 sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) );
2821 sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) );
2822
2823 x86_mov( func, temp, x86_fn_arg( func, arg_stride ) );
2824 x86_push( func, aos_output );
2825 sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
2826 sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
2827 x86_add( func, aos_output, temp );
2828 sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
2829 sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
2830 x86_add( func, aos_output, temp );
2831 sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
2832 sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
2833 x86_add( func, aos_output, temp );
2834 sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
2835 sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
2836 x86_pop( func, aos_output );
2837
2838 /* Advance to next output */
2839 x86_lea( func, aos_output, x86_make_disp(aos_output, 16) );
2840 x86_lea( func, soa_output, x86_make_disp(soa_output, 64) );
2841 }
2842 /* while --num_outputs */
2843 x86_dec( func, num_outputs );
2844 x86_jcc( func, cc_NE, inner_loop );
2845
2846 /* Restore EBX */
2847 x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
2848 }
2849
2850 /**
2851 * Translate a TGSI vertex/fragment shader to SSE2 code.
2852 * Slightly different things are done for vertex vs. fragment shaders.
2853 *
2854 * \param tokens the TGSI input shader
2855 * \param func the output SSE code/function
2856 * \param immediates buffer to place immediates, later passed to SSE func
 * \return 1 for success, 0 if translation failed
2858 */
unsigned
tgsi_emit_sse2(
   const struct tgsi_token *tokens,
   struct x86_function *func,
   float (*immediates)[4],
   boolean do_swizzles )
{
   struct tgsi_parse_context parse;
   unsigned ok = 1;          /* cleared when an opcode fails to translate */
   uint num_immediates = 0;  /* next free slot in the immediates[] buffer */

   util_init_math();

   /* Rewind the code pointer: emit from the start of func's buffer. */
   func->csr = func->store;

   tgsi_parse_init( &parse, tokens );

   /* Can't just use EDI, EBX without save/restoring them:
    */
   x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
   x86_push( func, x86_make_reg( file_REG32, reg_DI ) );

   /*
    * Different function args for vertex/fragment shaders:
    * - both: arg1 = machine, arg2 = constants, arg3 = immediates
    * - fragment: arg4 = interpolation coefficients
    * - vertex: args 4-6 = input AOS buffer/count/stride,
    *           args 7-9 = output AOS buffer/count/stride
    */
   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
      /* Transpose the caller's AOS vertex data into machine->Inputs
       * before the shader body runs (skipped when the caller already
       * provides SOA data).
       */
      if (do_swizzles)
         aos_to_soa( func,
                     4,         /* aos_input */
                     1,         /* machine */
                     5,         /* num_inputs */
                     6 );       /* input_stride */
   }

   /* Cache the base pointers used by the emitted code in registers. */
   x86_mov(
      func,
      get_machine_base(),
      x86_fn_arg( func, 1 ) );
   x86_mov(
      func,
      get_const_base(),
      x86_fn_arg( func, 2 ) );
   x86_mov(
      func,
      get_immediate_base(),
      x86_fn_arg( func, 3 ) );

   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
      x86_mov(
         func,
         get_coef_base(),
         x86_fn_arg( func, 4 ) );

      /* Samplers live inside the machine struct; derive their base
       * from the machine pointer rather than a separate argument.
       */
      x86_mov(
         func,
         get_sampler_base(),
         x86_make_disp( get_machine_base(),
                        Offset( struct tgsi_exec_machine, Samplers ) ) );
   }


   /* Translate token by token; stop early on the first failure. */
   while( !tgsi_parse_end_of_tokens( &parse ) && ok ) {
      tgsi_parse_token( &parse );

      switch( parse.FullToken.Token.Type ) {
      case TGSI_TOKEN_TYPE_DECLARATION:
         /* Only fragment-shader input declarations generate code
          * (the interpolation setup); vertex declarations are no-ops.
          */
         if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
            emit_declaration(
               func,
               &parse.FullToken.FullDeclaration );
         }
         break;

      case TGSI_TOKEN_TYPE_INSTRUCTION:
         ok = emit_instruction(
            func,
            &parse.FullToken.FullInstruction );

         if (!ok) {
            debug_printf("failed to translate tgsi opcode %d to SSE (%s)\n",
                         parse.FullToken.FullInstruction.Instruction.Opcode,
                         parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX ?
                         "vertex shader" : "fragment shader");
         }
         break;

      case TGSI_TOKEN_TYPE_IMMEDIATE:
         /* simply copy the immediate values into the next immediates[] slot
          * (the emitted code reaches them through the arg-3 pointer)
          */
         {
            const uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
            uint i;
            assert(size <= 4);
            assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES);
            for( i = 0; i < size; i++ ) {
               immediates[num_immediates][i] =
                  parse.FullToken.FullImmediate.u.ImmediateFloat32[i].Float;
            }
#if 0
            debug_printf("SSE FS immediate[%d] = %f %f %f %f\n",
                         num_immediates,
                         immediates[num_immediates][0],
                         immediates[num_immediates][1],
                         immediates[num_immediates][2],
                         immediates[num_immediates][3]);
#endif
            num_immediates++;
         }
         break;

      default:
         /* Unknown token type: fail the translation. */
         ok = 0;
         assert( 0 );
      }
   }

   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
      /* Transpose machine->Outputs back into the caller's AOS buffer. */
      if (do_swizzles)
         soa_to_aos( func,
                     7,         /* aos_output */
                     1,         /* machine */
                     8,         /* num_outputs */
                     9 );       /* output_stride */
   }

   /* Can't just use EBX, EDI without save/restoring them:
    */
   x86_pop( func, x86_make_reg( file_REG32, reg_DI ) );
   x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );

   emit_ret( func );

   tgsi_parse_free( &parse );

   return ok;
}
2994
2995 #endif /* PIPE_ARCH_X86 */
2996