tgsi: Disable SSE2 code generation.
[mesa.git] / src / gallium / auxiliary / tgsi / tgsi_sse2.c
1 /**************************************************************************
2 *
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 * Copyright 2009-2010 VMware, Inc. All rights Reserved.
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the
9 * "Software"), to deal in the Software without restriction, including
10 * without limitation the rights to use, copy, modify, merge, publish,
11 * distribute, sub license, and/or sell copies of the Software, and to
12 * permit persons to whom the Software is furnished to do so, subject to
13 * the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the
16 * next paragraph) shall be included in all copies or substantial portions
17 * of the Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
22 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
23 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 *
27 **************************************************************************/
28
29 #include "pipe/p_config.h"
30
31 #include "tgsi/tgsi_sse2.h"
32
33 #if defined(PIPE_ARCH_X86) && 0 /* See FIXME notes below */
34
35 #include "util/u_debug.h"
36 #include "pipe/p_shader_tokens.h"
37 #include "util/u_math.h"
38 #include "util/u_memory.h"
39 #if defined(PIPE_ARCH_SSE)
40 #include "util/u_sse.h"
41 #endif
42 #include "tgsi/tgsi_info.h"
43 #include "tgsi/tgsi_parse.h"
44 #include "tgsi/tgsi_util.h"
45 #include "tgsi/tgsi_dump.h"
46 #include "tgsi/tgsi_exec.h"
47
48 #include "rtasm/rtasm_x86sse.h"
49
50 /* for 1/sqrt()
51 *
52 * This costs about 100fps (close to 10%) in gears:
53 */
54 #define HIGH_PRECISION 1
55
56 #define FAST_MATH 1
57
58
59 #define FOR_EACH_CHANNEL( CHAN )\
60 for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)
61
62 #define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
63 ((INST).Dst[0].Register.WriteMask & (1 << (CHAN)))
64
65 #define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
66 if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))
67
68 #define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
69 FOR_EACH_CHANNEL( CHAN )\
70 IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )
71
72 #define CHAN_X 0
73 #define CHAN_Y 1
74 #define CHAN_Z 2
75 #define CHAN_W 3
76
77 #define TEMP_ONE_I TGSI_EXEC_TEMP_ONE_I
78 #define TEMP_ONE_C TGSI_EXEC_TEMP_ONE_C
79
80 #define TEMP_R0 TGSI_EXEC_TEMP_R0
81 #define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
82 #define TEMP_EXEC_MASK_I TGSI_EXEC_MASK_I
83 #define TEMP_EXEC_MASK_C TGSI_EXEC_MASK_C
84
85
86 /**
87 * X86 utility functions.
88 */
89
90 static struct x86_reg
91 make_xmm(
92 unsigned xmm )
93 {
94 return x86_make_reg(
95 file_XMM,
96 (enum x86_reg_name) xmm );
97 }
98
99 /**
100 * X86 register mapping helpers.
101 */
102
103 static struct x86_reg
104 get_const_base( void )
105 {
106 return x86_make_reg(
107 file_REG32,
108 reg_AX );
109 }
110
111 static struct x86_reg
112 get_machine_base( void )
113 {
114 return x86_make_reg(
115 file_REG32,
116 reg_CX );
117 }
118
119 static struct x86_reg
120 get_input_base( void )
121 {
122 /* FIXME: tgsi_exec_machine::Inputs is a pointer now! */
123 return x86_make_disp(
124 get_machine_base(),
125 Offset(struct tgsi_exec_machine, Inputs) );
126 }
127
128 static struct x86_reg
129 get_output_base( void )
130 {
131 /* FIXME: tgsi_exec_machine::Ouputs is a pointer now! */
132 return x86_make_disp(
133 get_machine_base(),
134 Offset(struct tgsi_exec_machine, Outputs) );
135 }
136
137 static struct x86_reg
138 get_temp_base( void )
139 {
140 return x86_make_disp(
141 get_machine_base(),
142 Offset(struct tgsi_exec_machine, Temps) );
143 }
144
145 static struct x86_reg
146 get_coef_base( void )
147 {
148 return x86_make_reg(
149 file_REG32,
150 reg_BX );
151 }
152
153 static struct x86_reg
154 get_sampler_base( void )
155 {
156 return x86_make_reg(
157 file_REG32,
158 reg_DI );
159 }
160
161 static struct x86_reg
162 get_immediate_base( void )
163 {
164 return x86_make_reg(
165 file_REG32,
166 reg_DX );
167 }
168
169 static struct x86_reg
170 get_system_value_base( void )
171 {
172 return x86_make_disp(
173 get_machine_base(),
174 Offset(struct tgsi_exec_machine, SystemValue) );
175 }
176
177
178 /**
179 * Data access helpers.
180 */
181
182
183 static struct x86_reg
184 get_immediate(
185 unsigned vec,
186 unsigned chan )
187 {
188 return x86_make_disp(
189 get_immediate_base(),
190 (vec * 4 + chan) * 4 );
191 }
192
193 static struct x86_reg
194 get_const(
195 unsigned vec,
196 unsigned chan )
197 {
198 return x86_make_disp(
199 get_const_base(),
200 (vec * 4 + chan) * 4 );
201 }
202
203 static struct x86_reg
204 get_sampler_ptr(
205 unsigned unit )
206 {
207 return x86_make_disp(
208 get_sampler_base(),
209 unit * sizeof( struct tgsi_sampler * ) );
210 }
211
212 static struct x86_reg
213 get_input(
214 unsigned vec,
215 unsigned chan )
216 {
217 return x86_make_disp(
218 get_input_base(),
219 (vec * 4 + chan) * 16 );
220 }
221
222 static struct x86_reg
223 get_output(
224 unsigned vec,
225 unsigned chan )
226 {
227 return x86_make_disp(
228 get_output_base(),
229 (vec * 4 + chan) * 16 );
230 }
231
232 static struct x86_reg
233 get_temp(
234 unsigned vec,
235 unsigned chan )
236 {
237 return x86_make_disp(
238 get_temp_base(),
239 (vec * 4 + chan) * 16 );
240 }
241
242 static struct x86_reg
243 get_system_value(
244 unsigned vec,
245 unsigned chan )
246 {
247 return x86_make_disp(
248 get_system_value_base(), /* base */
249 (vec * 4 + chan) * 4 ); /* byte offset from base */
250 }
251
252 static struct x86_reg
253 get_coef(
254 unsigned vec,
255 unsigned chan,
256 unsigned member )
257 {
258 return x86_make_disp(
259 get_coef_base(),
260 ((vec * 3 + member) * 4 + chan) * 4 );
261 }
262
263
/**
 * Emit a function-return instruction.
 */
static void
emit_ret( struct x86_function *func )
{
   x86_ret( func );
}
270
271
272 /**
273 * Data fetch helpers.
274 */
275
/**
 * Copy a shader constant to xmm register.
 * \param xmm the destination xmm register
 * \param vec the src const buffer index
 * \param chan src channel to fetch (X, Y, Z or W)
 * \param indirect non-zero if addressing CONST[ADDR+vec] rather than CONST[vec]
 * \param indirectFile register file of the indirect index (only ADDRESS supported)
 * \param indirectIndex register index in indirectFile (only 0 supported)
 */
static void
emit_const(
   struct x86_function *func,
   uint xmm,
   int vec,
   uint chan,
   uint indirect,
   uint indirectFile,
   int indirectIndex )
{
   if (indirect) {
      /* 'vec' is the offset from the address register's value.
       * We're loading CONST[ADDR+vec] into an xmm register.
       */
      struct x86_reg r0 = get_immediate_base();
      struct x86_reg r1 = get_coef_base();
      uint i;

      assert( indirectFile == TGSI_FILE_ADDRESS );
      assert( indirectIndex == 0 );
      /* Both scratch regs must be plain registers (no displacement). */
      assert( r0.mod == mod_REG );
      assert( r1.mod == mod_REG );

      /* Save the two GP registers we borrow as scratch. */
      x86_push( func, r0 );
      x86_push( func, r1 );

      /*
       * Loop over the four pixels or vertices in the quad.
       * Get the value of the address (offset) register for pixel/vertex[i],
       * add it to the src offset and index into the constant buffer.
       * Note that we're working on SOA data.
       * If any of the pixel/vertex execution channels are unused their
       * values will be garbage.  It's very important that we don't use
       * those garbage values as indexes into the constant buffer since
       * that'll cause segfaults.
       * The solution is to bitwise-AND the offset with the execution mask
       * register whose values are either 0 or ~0.
       * The caller must setup the execution mask register to indicate
       * which channels are valid/alive before running the shader.
       * The execution mask will also figure into loops and conditionals
       * someday.
       */
      for (i = 0; i < QUAD_SIZE; i++) {
         /* r1 = address register[i] */
         x86_mov( func, r1, x86_make_disp( get_temp( TEMP_ADDR, CHAN_X ), i * 4 ) );
         /* r0 = execution mask[i] */
         x86_mov( func, r0, x86_make_disp( get_temp( TEMP_EXEC_MASK_I, TEMP_EXEC_MASK_C ), i * 4 ) );
         /* r1 = r1 & r0 — zero the index for dead channels */
         x86_and( func, r1, r0 );
         /* r0 = address of CONST[vec].chan, the base offset */
         x86_lea( func, r0, get_const( vec, chan ) );

         /* Quick hack to multiply r1 by 16 -- need to add SHL to rtasm.
          * (Four self-adds double r1 four times: r1 *= 16.)
          */
         x86_add( func, r1, r1 );
         x86_add( func, r1, r1 );
         x86_add( func, r1, r1 );
         x86_add( func, r1, r1 );

         x86_add( func, r0, r1 );    /* r0 = r0 + r1 */
         /* Gather the constant and stage it in TEMP_R0 lane i. */
         x86_mov( func, r1, x86_deref( r0 ) );
         x86_mov( func, x86_make_disp( get_temp( TEMP_R0, CHAN_X ), i * 4 ), r1 );
      }

      /* Restore scratch GP registers (reverse order of the pushes). */
      x86_pop( func, r1 );
      x86_pop( func, r0 );

      /* Load the four gathered floats from the staging temp into xmm. */
      sse_movaps(
         func,
         make_xmm( xmm ),
         get_temp( TEMP_R0, CHAN_X ) );
   }
   else {
      /* 'vec' is the index into the src register file, such as TEMP[vec] */
      assert( vec >= 0 );

      /* Load one scalar float and broadcast it to all four lanes. */
      sse_movss(
         func,
         make_xmm( xmm ),
         get_const( vec, chan ) );
      sse_shufps(
         func,
         make_xmm( xmm ),
         make_xmm( xmm ),
         SHUF( 0, 0, 0, 0 ) );
   }
}
369
370 static void
371 emit_immediate(
372 struct x86_function *func,
373 unsigned xmm,
374 unsigned vec,
375 unsigned chan )
376 {
377 sse_movss(
378 func,
379 make_xmm( xmm ),
380 get_immediate( vec, chan ) );
381 sse_shufps(
382 func,
383 make_xmm( xmm ),
384 make_xmm( xmm ),
385 SHUF( 0, 0, 0, 0 ) );
386 }
387
388
/**
 * Copy a shader input quad to an xmm register.
 * \param xmm the destination xmm register
 * \param vec the src input attrib
 * \param chan src channel to fetch (X, Y, Z or W)
 */
static void
emit_inputf(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   sse_movups( func, make_xmm( xmm ), get_input( vec, chan ) );
}
407
/**
 * Store an xmm register to a shader output quad.
 * \param xmm the source xmm register
 * \param vec the dest output attrib
 * \param chan dest channel to store (X, Y, Z or W)
 */
static void
emit_output(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   sse_movups( func, get_output( vec, chan ), make_xmm( xmm ) );
}
426
/**
 * Copy a shader temporary quad to an xmm register (aligned load).
 * \param xmm the destination xmm register
 * \param vec the src temp register
 * \param chan src channel to fetch (X, Y, Z or W)
 */
static void
emit_tempf(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   sse_movaps( func, make_xmm( xmm ), get_temp( vec, chan ) );
}
445
446 /**
447 * Copy a system value to xmm register
448 * \param xmm the destination xmm register
449 * \param vec the source system value register
450 * \param chan src channel to fetch (X, Y, Z or W)
451 */
452 static void
453 emit_system_value(
454 struct x86_function *func,
455 unsigned xmm,
456 unsigned vec,
457 unsigned chan )
458 {
459 sse_movss(
460 func,
461 make_xmm( xmm ),
462 get_system_value( vec, chan ) );
463 sse_shufps(
464 func,
465 make_xmm( xmm ),
466 make_xmm( xmm ),
467 SHUF( 0, 0, 0, 0 ) );
468 }
469
470 /**
471 * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
472 * \param xmm the destination xmm register
473 * \param vec the src input/attribute coefficient index
474 * \param chan src channel to fetch (X, Y, Z or W)
475 * \param member 0=a0, 1=dadx, 2=dady
476 */
477 static void
478 emit_coef(
479 struct x86_function *func,
480 unsigned xmm,
481 unsigned vec,
482 unsigned chan,
483 unsigned member )
484 {
485 sse_movss(
486 func,
487 make_xmm( xmm ),
488 get_coef( vec, chan, member ) );
489 sse_shufps(
490 func,
491 make_xmm( xmm ),
492 make_xmm( xmm ),
493 SHUF( 0, 0, 0, 0 ) );
494 }
495
496 /**
497 * Data store helpers.
498 */
499
/**
 * Store an xmm register back to a shader input quad (unaligned store).
 */
static void
emit_inputs(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   sse_movups( func, get_input( vec, chan ), make_xmm( xmm ) );
}
512
/**
 * Store an xmm register to a shader temporary quad (aligned store).
 */
static void
emit_temps(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   sse_movaps( func, get_temp( vec, chan ), make_xmm( xmm ) );
}
525
526 static void
527 emit_addrs(
528 struct x86_function *func,
529 unsigned xmm,
530 unsigned vec,
531 unsigned chan )
532 {
533 assert( vec == 0 );
534
535 emit_temps(
536 func,
537 xmm,
538 vec + TGSI_EXEC_TEMP_ADDR,
539 chan );
540 }
541
542 /**
543 * Coefficent fetch helpers.
544 */
545
/**
 * Fetch the a0 (constant term) coefficient for an input attrib channel.
 */
static void
emit_coef_a0(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   emit_coef( func, xmm, vec, chan, 0 );
}
560
/**
 * Fetch the dadx (x derivative) coefficient for an input attrib channel.
 */
static void
emit_coef_dadx(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   emit_coef( func, xmm, vec, chan, 1 );
}
575
/**
 * Fetch the dady (y derivative) coefficient for an input attrib channel.
 */
static void
emit_coef_dady(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   emit_coef( func, xmm, vec, chan, 2 );
}
590
591 /**
592 * Function call helpers.
593 */
594
/**
 * Emit a call from generated code to the C function 'code', preserving
 * caller-saved GP registers and the requested xmm registers around it.
 *
 * \param xmm_save_mask bitmask of xmm registers to save/restore on the stack
 * \param arg array of memory operands; the address of each is pushed as a
 *            pointer argument to the callee (cdecl convention)
 * \param nr_args number of entries in 'arg'
 * \param code the C function to call
 *
 * NOTE: In gcc, if the destination uses the SSE intrinsics, then it must be
 * defined with __attribute__((force_align_arg_pointer)), as we do not guarantee
 * that the stack pointer is 16 byte aligned, as expected.
 */
static void
emit_func_call(
   struct x86_function *func,
   unsigned xmm_save_mask,
   const struct x86_reg *arg,
   unsigned nr_args,
   void (PIPE_CDECL *code)() )
{
   struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
   unsigned i, n;

   /* Save the caller-saved GP registers (eax, ecx, edx). */
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_AX) );
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_CX) );
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_DX) );

   /* Store XMM regs to the stack
    */
   /* First count how many regs are requested, so we can reserve
    * stack space for all of them in one adjustment.
    */
   for(i = 0, n = 0; i < 8; ++i)
      if(xmm_save_mask & (1 << i))
         ++n;

   x86_sub_imm(
      func,
      x86_make_reg( file_REG32, reg_SP ),
      n*16);

   /* Spill each requested xmm reg to its 16-byte stack slot. */
   for(i = 0, n = 0; i < 8; ++i)
      if(xmm_save_mask & (1 << i)) {
         sse_movups(
            func,
            x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ),
            make_xmm( i ) );
         ++n;
      }

   for (i = 0; i < nr_args; i++) {
      /* Load the address of the buffer we use for passing arguments and
       * receiving results:
       */
      x86_lea(
         func,
         ecx,
         arg[i] );

      /* Push actual function arguments (currently just the pointer to
       * the buffer above), and call the function:
       */
      x86_push( func, ecx );
   }

   x86_mov_reg_imm( func, ecx, (unsigned long) code );
   x86_call( func, ecx );

   /* Pop the arguments (or just add an immediate to esp)
    */
   for (i = 0; i < nr_args; i++) {
      x86_pop(func, ecx );
   }

   /* Pop the saved XMM regs:
    */
   for(i = 0, n = 0; i < 8; ++i)
      if(xmm_save_mask & (1 << i)) {
         sse_movups(
            func,
            make_xmm( i ),
            x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ) );
         ++n;
      }

   /* Release the stack space reserved above (n holds the final count). */
   x86_add_imm(
      func,
      x86_make_reg( file_REG32, reg_SP ),
      n*16);

   /* Restore GP registers in a reverse order.
    */
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_DX) );
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_CX) );
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_AX) );
}
693
694 static void
695 emit_func_call_dst_src1(
696 struct x86_function *func,
697 unsigned xmm_save,
698 unsigned xmm_dst,
699 unsigned xmm_src0,
700 void (PIPE_CDECL *code)() )
701 {
702 struct x86_reg store = get_temp( TEMP_R0, 0 );
703 unsigned xmm_mask = ((1 << xmm_save) - 1) & ~(1 << xmm_dst);
704
705 /* Store our input parameters (in xmm regs) to the buffer we use
706 * for passing arguments. We will pass a pointer to this buffer as
707 * the actual function argument.
708 */
709 sse_movaps(
710 func,
711 store,
712 make_xmm( xmm_src0 ) );
713
714 emit_func_call( func,
715 xmm_mask,
716 &store,
717 1,
718 code );
719
720 sse_movaps(
721 func,
722 make_xmm( xmm_dst ),
723 store );
724 }
725
726
727 static void
728 emit_func_call_dst_src2(
729 struct x86_function *func,
730 unsigned xmm_save,
731 unsigned xmm_dst,
732 unsigned xmm_src0,
733 unsigned xmm_src1,
734 void (PIPE_CDECL *code)() )
735 {
736 struct x86_reg store = get_temp( TEMP_R0, 0 );
737 unsigned xmm_mask = ((1 << xmm_save) - 1) & ~(1 << xmm_dst);
738
739 /* Store two inputs to parameter buffer.
740 */
741 sse_movaps(
742 func,
743 store,
744 make_xmm( xmm_src0 ) );
745
746 sse_movaps(
747 func,
748 x86_make_disp( store, 4 * sizeof(float) ),
749 make_xmm( xmm_src1 ) );
750
751
752 /* Emit the call
753 */
754 emit_func_call( func,
755 xmm_mask,
756 &store,
757 1,
758 code );
759
760 /* Retrieve the results:
761 */
762 sse_movaps(
763 func,
764 make_xmm( xmm_dst ),
765 store );
766 }
767
768
769
770
771
772 #if defined(PIPE_ARCH_SSE)
773
774 /*
775 * Fast SSE2 implementation of special math functions.
776 */
777
778 #define POLY0(x, c0) _mm_set1_ps(c0)
779 #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
780 #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
781 #define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
782 #define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
783 #define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
784
785 #define EXP_POLY_DEGREE 3
786 #define LOG_POLY_DEGREE 5
787
/**
 * Vectorized 2^x approximation for four floats.
 *
 * Splits x into integer and fractional parts; the integer part becomes the
 * float exponent via bit manipulation, the fractional part is evaluated with
 * a minimax polynomial of degree EXP_POLY_DEGREE.
 *
 * See http://www.devmaster.net/forums/showthread.php?p=43580
 */
static INLINE __m128
exp2f4(__m128 x)
{
   __m128i ipart;
   __m128 fpart, expipart, expfpart;

   /* Clamp input to the representable exponent range of a float. */
   x = _mm_min_ps(x, _mm_set1_ps( 129.00000f));
   x = _mm_max_ps(x, _mm_set1_ps(-126.99999f));

   /* ipart = int(x - 0.5) */
   ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f)));

   /* fpart = x - ipart */
   fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart));

   /* expipart = (float) (1 << ipart)
    * (bias the exponent by 127 and shift it into the float exponent field)
    */
   expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23));

   /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
#if EXP_POLY_DEGREE == 5
   expfpart = POLY5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f);
#elif EXP_POLY_DEGREE == 4
   expfpart = POLY4(fpart, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 5.2011464e-2f, 1.3534167e-2f);
#elif EXP_POLY_DEGREE == 3
   expfpart = POLY3(fpart, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 7.8024521e-2f);
#elif EXP_POLY_DEGREE == 2
   expfpart = POLY2(fpart, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f);
#else
#error
#endif

   /* 2^x = 2^ipart * 2^fpart */
   return _mm_mul_ps(expipart, expfpart);
}
824
825
/**
 * Vectorized log2(x) approximation for four floats.
 *
 * Extracts the float exponent directly and evaluates a minimax polynomial
 * (degree LOG_POLY_DEGREE) on the mantissa.
 *
 * See http://www.devmaster.net/forums/showthread.php?p=43580
 */
static INLINE __m128
log2f4(__m128 x)
{
   __m128i expmask = _mm_set1_epi32(0x7f800000);   /* float exponent bits */
   __m128i mantmask = _mm_set1_epi32(0x007fffff);  /* float mantissa bits */
   __m128 one = _mm_set1_ps(1.0f);

   __m128i i = _mm_castps_si128(x);

   /* exp = (float) exponent(x) */
   __m128 exp = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, expmask), 23), _mm_set1_epi32(127)));

   /* mant = (float) mantissa(x)  — OR with 1.0 rebuilds a float in [1, 2[ */
   __m128 mant = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, mantmask)), one);

   __m128 logmant;

   /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
    * These coefficients can be generate with
    * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
    */
#if LOG_POLY_DEGREE == 6
   logmant = POLY5(mant, 3.11578814719469302614f, -3.32419399085241980044f, 2.59883907202499966007f, -1.23152682416275988241f, 0.318212422185251071475f, -0.0344359067839062357313f);
#elif LOG_POLY_DEGREE == 5
   logmant = POLY4(mant, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
   logmant = POLY3(mant, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
   logmant = POLY2(mant, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
#else
#error
#endif

   /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
   logmant = _mm_mul_ps(logmant, _mm_sub_ps(mant, one));

   /* log2(x) = exponent + log2(mantissa) */
   return _mm_add_ps(logmant, exp);
}
867
868
869 static INLINE __m128
870 powf4(__m128 x, __m128 y)
871 {
872 return exp2f4(_mm_mul_ps(log2f4(x), y));
873 }
874
875 #endif /* PIPE_ARCH_SSE */
876
877
878
879 /**
880 * Low-level instruction translators.
881 */
882
883 static void
884 emit_abs(
885 struct x86_function *func,
886 unsigned xmm )
887 {
888 sse_andps(
889 func,
890 make_xmm( xmm ),
891 get_temp(
892 TGSI_EXEC_TEMP_7FFFFFFF_I,
893 TGSI_EXEC_TEMP_7FFFFFFF_C ) );
894 }
895
/**
 * xmm_dst += xmm_src (packed single-precision add).
 */
static void
emit_add(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   sse_addps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
907
908 static void PIPE_CDECL
909 cos4f(
910 float *store )
911 {
912 store[0] = cosf( store[0] );
913 store[1] = cosf( store[1] );
914 store[2] = cosf( store[2] );
915 store[3] = cosf( store[3] );
916 }
917
918 static void
919 emit_cos(
920 struct x86_function *func,
921 unsigned xmm_save,
922 unsigned xmm_dst )
923 {
924 emit_func_call_dst_src1(
925 func,
926 xmm_save,
927 xmm_dst,
928 xmm_dst,
929 cos4f );
930 }
931
932 static void PIPE_CDECL
933 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
934 __attribute__((force_align_arg_pointer))
935 #endif
936 ex24f(
937 float *store )
938 {
939 #if defined(PIPE_ARCH_SSE)
940 _mm_store_ps(&store[0], exp2f4( _mm_load_ps(&store[0]) ));
941 #else
942 store[0] = util_fast_exp2( store[0] );
943 store[1] = util_fast_exp2( store[1] );
944 store[2] = util_fast_exp2( store[2] );
945 store[3] = util_fast_exp2( store[3] );
946 #endif
947 }
948
949 static void
950 emit_ex2(
951 struct x86_function *func,
952 unsigned xmm_save,
953 unsigned xmm_dst )
954 {
955 emit_func_call_dst_src1(
956 func,
957 xmm_save,
958 xmm_dst,
959 xmm_dst,
960 ex24f );
961 }
962
/**
 * Convert four floats in xmm to ints, truncating toward zero.
 */
static void
emit_f2it(
   struct x86_function *func,
   unsigned xmm )
{
   sse2_cvttps2dq( func, make_xmm( xmm ), make_xmm( xmm ) );
}
973
/**
 * Convert four ints in xmm to floats.
 */
static void
emit_i2f(
   struct x86_function *func,
   unsigned xmm )
{
   sse2_cvtdq2ps( func, make_xmm( xmm ), make_xmm( xmm ) );
}
984
985 static void PIPE_CDECL
986 flr4f(
987 float *store )
988 {
989 store[0] = floorf( store[0] );
990 store[1] = floorf( store[1] );
991 store[2] = floorf( store[2] );
992 store[3] = floorf( store[3] );
993 }
994
995 static void
996 emit_flr(
997 struct x86_function *func,
998 unsigned xmm_save,
999 unsigned xmm_dst )
1000 {
1001 emit_func_call_dst_src1(
1002 func,
1003 xmm_save,
1004 xmm_dst,
1005 xmm_dst,
1006 flr4f );
1007 }
1008
1009 static void PIPE_CDECL
1010 frc4f(
1011 float *store )
1012 {
1013 store[0] -= floorf( store[0] );
1014 store[1] -= floorf( store[1] );
1015 store[2] -= floorf( store[2] );
1016 store[3] -= floorf( store[3] );
1017 }
1018
1019 static void
1020 emit_frc(
1021 struct x86_function *func,
1022 unsigned xmm_save,
1023 unsigned xmm_dst )
1024 {
1025 emit_func_call_dst_src1(
1026 func,
1027 xmm_save,
1028 xmm_dst,
1029 xmm_dst,
1030 frc4f );
1031 }
1032
1033 static void PIPE_CDECL
1034 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
1035 __attribute__((force_align_arg_pointer))
1036 #endif
1037 lg24f(
1038 float *store )
1039 {
1040 #if defined(PIPE_ARCH_SSE)
1041 _mm_store_ps(&store[0], log2f4( _mm_load_ps(&store[0]) ));
1042 #else
1043 store[0] = util_fast_log2( store[0] );
1044 store[1] = util_fast_log2( store[1] );
1045 store[2] = util_fast_log2( store[2] );
1046 store[3] = util_fast_log2( store[3] );
1047 #endif
1048 }
1049
1050 static void
1051 emit_lg2(
1052 struct x86_function *func,
1053 unsigned xmm_save,
1054 unsigned xmm_dst )
1055 {
1056 emit_func_call_dst_src1(
1057 func,
1058 xmm_save,
1059 xmm_dst,
1060 xmm_dst,
1061 lg24f );
1062 }
1063
/**
 * xmm_dst = xmm_src (unaligned packed move).
 */
static void
emit_MOV(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   sse_movups( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
1075
/**
 * xmm_dst *= xmm_src (packed single-precision multiply).
 */
static void
emit_mul (struct x86_function *func,
          unsigned xmm_dst,
          unsigned xmm_src)
{
   sse_mulps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
1086
1087 static void
1088 emit_neg(
1089 struct x86_function *func,
1090 unsigned xmm )
1091 {
1092 sse_xorps(
1093 func,
1094 make_xmm( xmm ),
1095 get_temp(
1096 TGSI_EXEC_TEMP_80000000_I,
1097 TGSI_EXEC_TEMP_80000000_C ) );
1098 }
1099
1100 static void PIPE_CDECL
1101 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
1102 __attribute__((force_align_arg_pointer))
1103 #endif
1104 pow4f(
1105 float *store )
1106 {
1107 #if defined(PIPE_ARCH_SSE)
1108 _mm_store_ps(&store[0], powf4( _mm_load_ps(&store[0]), _mm_load_ps(&store[4]) ));
1109 #else
1110 store[0] = util_fast_pow( store[0], store[4] );
1111 store[1] = util_fast_pow( store[1], store[5] );
1112 store[2] = util_fast_pow( store[2], store[6] );
1113 store[3] = util_fast_pow( store[3], store[7] );
1114 #endif
1115 }
1116
1117 static void
1118 emit_pow(
1119 struct x86_function *func,
1120 unsigned xmm_save,
1121 unsigned xmm_dst,
1122 unsigned xmm_src0,
1123 unsigned xmm_src1 )
1124 {
1125 emit_func_call_dst_src2(
1126 func,
1127 xmm_save,
1128 xmm_dst,
1129 xmm_src0,
1130 xmm_src1,
1131 pow4f );
1132 }
1133
/**
 * xmm_dst = approximate reciprocal of xmm_src.
 *
 * On Intel CPUs at least, this is only accurate to 12 bits -- not
 * good enough.  Need to either emit a proper divide or use the
 * iterative technique described below in emit_rsqrt().
 */
static void
emit_rcp (
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   sse2_rcpps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
1149
1150 static void PIPE_CDECL
1151 rnd4f(
1152 float *store )
1153 {
1154 store[0] = floorf( store[0] + 0.5f );
1155 store[1] = floorf( store[1] + 0.5f );
1156 store[2] = floorf( store[2] + 0.5f );
1157 store[3] = floorf( store[3] + 0.5f );
1158 }
1159
1160 static void
1161 emit_rnd(
1162 struct x86_function *func,
1163 unsigned xmm_save,
1164 unsigned xmm_dst )
1165 {
1166 emit_func_call_dst_src1(
1167 func,
1168 xmm_save,
1169 xmm_dst,
1170 xmm_dst,
1171 rnd4f );
1172 }
1173
/**
 * xmm_dst = 1/sqrt(xmm_src), optionally refined with one Newton-Raphson step.
 * NOTE: in the HIGH_PRECISION path xmm_src is clobbered, and xmm2/xmm3 are
 * used as scratch — callers must not keep live values there.
 */
static void
emit_rsqrt(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
#if HIGH_PRECISION
   /* Although rsqrtps() and rcpps() are low precision on some/all SSE
    * implementations, it is possible to improve its precision at
    * fairly low cost, using a newton/raphson step, as below:
    *
    * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
    * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
    *
    * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
    */
   {
      struct x86_reg dst = make_xmm( xmm_dst );
      struct x86_reg src = make_xmm( xmm_src );
      struct x86_reg tmp0 = make_xmm( 2 );
      struct x86_reg tmp1 = make_xmm( 3 );

      /* The sequence below requires four distinct registers. */
      assert( xmm_dst != xmm_src );
      assert( xmm_dst != 2 && xmm_dst != 3 );
      assert( xmm_src != 2 && xmm_src != 3 );

      sse_movaps(  func, dst,  get_temp( TGSI_EXEC_TEMP_HALF_I, TGSI_EXEC_TEMP_HALF_C ) );   /* dst = 0.5 */
      sse_movaps(  func, tmp0, get_temp( TGSI_EXEC_TEMP_THREE_I, TGSI_EXEC_TEMP_THREE_C ) ); /* tmp0 = 3.0 */
      sse_rsqrtps( func, tmp1, src );   /* tmp1 = approx rsqrt(a) */
      sse_mulps(   func, src,  tmp1 );  /* src = a * rsqrt(a) */
      sse_mulps(   func, dst,  tmp1 );  /* dst = 0.5 * rsqrt(a) */
      sse_mulps(   func, src,  tmp1 );  /* src = a * rsqrt(a) * rsqrt(a) */
      sse_subps(   func, tmp0, src );   /* tmp0 = 3.0 - src */
      sse_mulps(   func, dst,  tmp0 );  /* dst = 0.5 * rsqrt(a) * tmp0 */
   }
#else
   /* On Intel CPUs at least, this is only accurate to 12 bits -- not
    * good enough.
    */
   sse_rsqrtps(
      func,
      make_xmm( xmm_dst ),
      make_xmm( xmm_src ) );
#endif
}
1219
1220 static void
1221 emit_setsign(
1222 struct x86_function *func,
1223 unsigned xmm )
1224 {
1225 sse_orps(
1226 func,
1227 make_xmm( xmm ),
1228 get_temp(
1229 TGSI_EXEC_TEMP_80000000_I,
1230 TGSI_EXEC_TEMP_80000000_C ) );
1231 }
1232
1233 static void PIPE_CDECL
1234 sgn4f(
1235 float *store )
1236 {
1237 store[0] = store[0] < 0.0f ? -1.0f : store[0] > 0.0f ? 1.0f : 0.0f;
1238 store[1] = store[1] < 0.0f ? -1.0f : store[1] > 0.0f ? 1.0f : 0.0f;
1239 store[2] = store[2] < 0.0f ? -1.0f : store[2] > 0.0f ? 1.0f : 0.0f;
1240 store[3] = store[3] < 0.0f ? -1.0f : store[3] > 0.0f ? 1.0f : 0.0f;
1241 }
1242
1243 static void
1244 emit_sgn(
1245 struct x86_function *func,
1246 unsigned xmm_save,
1247 unsigned xmm_dst )
1248 {
1249 emit_func_call_dst_src1(
1250 func,
1251 xmm_save,
1252 xmm_dst,
1253 xmm_dst,
1254 sgn4f );
1255 }
1256
1257 static void PIPE_CDECL
1258 sin4f(
1259 float *store )
1260 {
1261 store[0] = sinf( store[0] );
1262 store[1] = sinf( store[1] );
1263 store[2] = sinf( store[2] );
1264 store[3] = sinf( store[3] );
1265 }
1266
1267 static void
1268 emit_sin (struct x86_function *func,
1269 unsigned xmm_save,
1270 unsigned xmm_dst)
1271 {
1272 emit_func_call_dst_src1(
1273 func,
1274 xmm_save,
1275 xmm_dst,
1276 xmm_dst,
1277 sin4f );
1278 }
1279
/**
 * xmm_dst -= xmm_src (packed single-precision subtract).
 */
static void
emit_sub(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   sse_subps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
1291
1292 /**
1293 * Register fetch.
1294 */
1295 static void
1296 emit_fetch(
1297 struct x86_function *func,
1298 unsigned xmm,
1299 const struct tgsi_full_src_register *reg,
1300 const unsigned chan_index )
1301 {
1302 unsigned swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
1303
1304 switch (swizzle) {
1305 case TGSI_SWIZZLE_X:
1306 case TGSI_SWIZZLE_Y:
1307 case TGSI_SWIZZLE_Z:
1308 case TGSI_SWIZZLE_W:
1309 switch (reg->Register.File) {
1310 case TGSI_FILE_CONSTANT:
1311 emit_const(
1312 func,
1313 xmm,
1314 reg->Register.Index,
1315 swizzle,
1316 reg->Register.Indirect,
1317 reg->Indirect.File,
1318 reg->Indirect.Index );
1319 break;
1320
1321 case TGSI_FILE_IMMEDIATE:
1322 emit_immediate(
1323 func,
1324 xmm,
1325 reg->Register.Index,
1326 swizzle );
1327 break;
1328
1329 case TGSI_FILE_SYSTEM_VALUE:
1330 emit_system_value(
1331 func,
1332 xmm,
1333 reg->Register.Index,
1334 swizzle );
1335 break;
1336
1337 case TGSI_FILE_INPUT:
1338 emit_inputf(
1339 func,
1340 xmm,
1341 reg->Register.Index,
1342 swizzle );
1343 break;
1344
1345 case TGSI_FILE_TEMPORARY:
1346 emit_tempf(
1347 func,
1348 xmm,
1349 reg->Register.Index,
1350 swizzle );
1351 break;
1352
1353 default:
1354 assert( 0 );
1355 }
1356 break;
1357
1358 default:
1359 assert( 0 );
1360 }
1361
1362 switch( tgsi_util_get_full_src_register_sign_mode( reg, chan_index ) ) {
1363 case TGSI_UTIL_SIGN_CLEAR:
1364 emit_abs( func, xmm );
1365 break;
1366
1367 case TGSI_UTIL_SIGN_SET:
1368 emit_setsign( func, xmm );
1369 break;
1370
1371 case TGSI_UTIL_SIGN_TOGGLE:
1372 emit_neg( func, xmm );
1373 break;
1374
1375 case TGSI_UTIL_SIGN_KEEP:
1376 break;
1377 }
1378 }
1379
1380 #define FETCH( FUNC, INST, XMM, INDEX, CHAN )\
1381 emit_fetch( FUNC, XMM, &(INST).Src[INDEX], CHAN )
1382
1383 /**
1384 * Register store.
1385 */
1386 static void
1387 emit_store(
1388 struct x86_function *func,
1389 unsigned xmm,
1390 const struct tgsi_full_dst_register *reg,
1391 const struct tgsi_full_instruction *inst,
1392 unsigned chan_index )
1393 {
1394 switch( inst->Instruction.Saturate ) {
1395 case TGSI_SAT_NONE:
1396 break;
1397
1398 case TGSI_SAT_ZERO_ONE:
1399 sse_maxps(
1400 func,
1401 make_xmm( xmm ),
1402 get_temp(
1403 TGSI_EXEC_TEMP_00000000_I,
1404 TGSI_EXEC_TEMP_00000000_C ) );
1405
1406 sse_minps(
1407 func,
1408 make_xmm( xmm ),
1409 get_temp(
1410 TGSI_EXEC_TEMP_ONE_I,
1411 TGSI_EXEC_TEMP_ONE_C ) );
1412 break;
1413
1414 case TGSI_SAT_MINUS_PLUS_ONE:
1415 assert( 0 );
1416 break;
1417 }
1418
1419
1420 switch( reg->Register.File ) {
1421 case TGSI_FILE_OUTPUT:
1422 emit_output(
1423 func,
1424 xmm,
1425 reg->Register.Index,
1426 chan_index );
1427 break;
1428
1429 case TGSI_FILE_TEMPORARY:
1430 emit_temps(
1431 func,
1432 xmm,
1433 reg->Register.Index,
1434 chan_index );
1435 break;
1436
1437 case TGSI_FILE_ADDRESS:
1438 emit_addrs(
1439 func,
1440 xmm,
1441 reg->Register.Index,
1442 chan_index );
1443 break;
1444
1445 default:
1446 assert( 0 );
1447 }
1448 }
1449
/* Store xmm register XMM into channel CHAN of destination operand INDEX of INST. */
#define STORE( FUNC, INST, XMM, INDEX, CHAN )\
   emit_store( FUNC, XMM, &(INST).Dst[INDEX], &(INST), CHAN )
1452
1453
1454 static void PIPE_CDECL
1455 fetch_texel( struct tgsi_sampler **sampler,
1456 float *store )
1457 {
1458 #if 0
1459 uint j;
1460
1461 debug_printf("%s sampler: %p (%p) store: %p\n",
1462 __FUNCTION__,
1463 sampler, *sampler,
1464 store );
1465
1466 for (j = 0; j < 4; j++)
1467 debug_printf("sample %d texcoord %f %f %f lodbias %f\n",
1468 j,
1469 store[0+j],
1470 store[4+j],
1471 store[8 + j],
1472 store[12 + j]);
1473 #endif
1474
1475 {
1476 float rgba[NUM_CHANNELS][QUAD_SIZE];
1477 (*sampler)->get_samples(*sampler,
1478 &store[0], /* s */
1479 &store[4], /* t */
1480 &store[8], /* r */
1481 &store[12], /* lodbias */
1482 tgsi_sampler_lod_bias,
1483 rgba); /* results */
1484
1485 memcpy( store, rgba, 16 * sizeof(float));
1486 }
1487
1488 #if 0
1489 for (j = 0; j < 4; j++)
1490 debug_printf("sample %d result %f %f %f %f\n",
1491 j,
1492 store[0+j],
1493 store[4+j],
1494 store[8+j],
1495 store[12+j]);
1496 #endif
1497 }
1498
/**
 * High-level instruction translators.
 */

/**
 * Emit code for a TEX/TXB/TXP instruction: gather the texture coordinates
 * (and lod bias) into the TEMP_R0 argument buffer, emit a call to
 * fetch_texel(), then copy the results to the destination register.
 *
 * lodbias:   TRUE for TXB -- fetch an explicit lod bias from src0.w.
 * projected: TRUE for TXP -- divide the coordinates by src0.w.
 */
static void
emit_tex( struct x86_function *func,
          const struct tgsi_full_instruction *inst,
          boolean lodbias,
          boolean projected)
{
   const uint unit = inst->Src[1].Register.Index;
   struct x86_reg args[2];
   unsigned count;   /* number of texcoord components to pass */
   unsigned i;

   assert(inst->Instruction.Texture);
   switch (inst->Texture.Texture) {
   case TGSI_TEXTURE_1D:
      count = 1;
      break;
   case TGSI_TEXTURE_2D:
   case TGSI_TEXTURE_RECT:
   case TGSI_TEXTURE_1D_ARRAY:
      count = 2;
      break;
   case TGSI_TEXTURE_SHADOW1D:
   case TGSI_TEXTURE_SHADOW2D:
   case TGSI_TEXTURE_SHADOWRECT:
   case TGSI_TEXTURE_3D:
   case TGSI_TEXTURE_CUBE:
   case TGSI_TEXTURE_2D_ARRAY:
      count = 3;
      break;
   default:
      assert(0);
      return;
   }

   /* xmm3 = lod bias (src0.w) for TXB, else 0.0 */
   if (lodbias) {
      FETCH( func, *inst, 3, 0, 3 );
   }
   else {
      emit_tempf(
         func,
         3,
         TGSI_EXEC_TEMP_00000000_I,
         TGSI_EXEC_TEMP_00000000_C );

   }

   /* store lodbias whether enabled or not -- fetch_texel currently
    * respects it always.
    */
   sse_movaps( func,
               get_temp( TEMP_R0, 3 ),
               make_xmm( 3 ) );

   /* xmm3 = 1 / src0.w, used to project the coordinates below */
   if (projected) {
      FETCH( func, *inst, 3, 0, 3 );

      emit_rcp( func, 3, 3 );
   }

   /* Fetch each coordinate, optionally project it, and stage it in
    * the TEMP_R0 argument buffer read by fetch_texel().
    */
   for (i = 0; i < count; i++) {
      FETCH( func, *inst, i, 0, i );

      if (projected) {
         sse_mulps(
            func,
            make_xmm( i ),
            make_xmm( 3 ) );
      }

      /* Store in the argument buffer:
       */
      sse_movaps(
         func,
         get_temp( TEMP_R0, i ),
         make_xmm( i ) );
   }

   args[0] = get_temp( TEMP_R0, 0 );
   args[1] = get_sampler_ptr( unit );

   emit_func_call( func,
                   0,
                   args,
                   Elements(args),
                   fetch_texel );

   /* Copy the results from TEMP_R0 to the destination register.
    * If all four channels are enabled, could use a pointer to
    * dst[0].x instead of TEMP_R0 for store?
    */
   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, i ) {

      sse_movaps(
         func,
         make_xmm( 0 ),
         get_temp( TEMP_R0, i ) );

      STORE( func, *inst, 0, 0, i );
   }
}
1601
1602
/**
 * Emit code for a conditional KIL: accumulate a 4-bit mask of pixels
 * whose fetched value is < 0 in any referenced component, and OR it
 * into the persistent kill mask temp.
 */
static void
emit_kil(
   struct x86_function *func,
   const struct tgsi_full_src_register *reg )
{
   unsigned uniquemask;
   unsigned unique_count = 0;
   unsigned chan_index;
   unsigned i;

   /* This mask stores component bits that were already tested. Note that
    * we test if the value is less than zero, so 1.0 and 0.0 need not to be
    * tested.
    */
   uniquemask = 0;

   FOR_EACH_CHANNEL( chan_index ) {
      unsigned swizzle;

      /* unswizzle channel */
      swizzle = tgsi_util_get_full_src_register_swizzle(
         reg,
         chan_index );

      /* check if the component has not been already tested */
      if( !(uniquemask & (1 << swizzle)) ) {
         uniquemask |= 1 << swizzle;

         /* allocate register -- each unique component goes into its own
          * xmm register, numbered 0..unique_count-1.
          */
         emit_fetch(
            func,
            unique_count++,
            reg,
            chan_index );
      }
   }

   /* Preserve EAX/EDX around the scratch computation below. */
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_AX ) );
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_DX ) );

   for (i = 0 ; i < unique_count; i++ ) {
      struct x86_reg dataXMM = make_xmm(i);

      /* dataXMM = all-ones in each lane where value < 0 */
      sse_cmpps(
         func,
         dataXMM,
         get_temp(
            TGSI_EXEC_TEMP_00000000_I,
            TGSI_EXEC_TEMP_00000000_C ),
         cc_LessThan );

      /* movmskps gathers the four lane sign bits into a 4-bit integer;
       * accumulate the per-component masks in EAX (via EDX as scratch).
       */
      if( i == 0 ) {
         sse_movmskps(
            func,
            x86_make_reg( file_REG32, reg_AX ),
            dataXMM );
      }
      else {
         sse_movmskps(
            func,
            x86_make_reg( file_REG32, reg_DX ),
            dataXMM );
         x86_or(
            func,
            x86_make_reg( file_REG32, reg_AX ),
            x86_make_reg( file_REG32, reg_DX ) );
      }
   }

   /* OR the accumulated mask into the persistent kill mask. */
   x86_or(
      func,
      get_temp(
         TGSI_EXEC_TEMP_KILMASK_I,
         TGSI_EXEC_TEMP_KILMASK_C ),
      x86_make_reg( file_REG32, reg_AX ) );

   /* Restore the saved registers (reverse order of the pushes). */
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_DX ) );
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_AX ) );
}
1690
1691
/**
 * Unconditional predicated kill (KILP) -- not implemented; the KILP
 * case in emit_instruction() returns 0 after calling this, so such
 * shaders fall back to the interpreter.
 */
static void
emit_kilp(
   struct x86_function *func )
{
   /* XXX todo / fix me */
}
1698
1699
1700 static void
1701 emit_setcc(
1702 struct x86_function *func,
1703 struct tgsi_full_instruction *inst,
1704 enum sse_cc cc )
1705 {
1706 unsigned chan_index;
1707
1708 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1709 FETCH( func, *inst, 0, 0, chan_index );
1710 FETCH( func, *inst, 1, 1, chan_index );
1711 sse_cmpps(
1712 func,
1713 make_xmm( 0 ),
1714 make_xmm( 1 ),
1715 cc );
1716 sse_andps(
1717 func,
1718 make_xmm( 0 ),
1719 get_temp(
1720 TEMP_ONE_I,
1721 TEMP_ONE_C ) );
1722 STORE( func, *inst, 0, 0, chan_index );
1723 }
1724 }
1725
/**
 * Emit code for CMP: per channel, dst = (src0 < 0.0) ? src1 : src2,
 * implemented branchlessly with a compare mask and and/andn/or select.
 */
static void
emit_cmp(
   struct x86_function *func,
   struct tgsi_full_instruction *inst )
{
   unsigned chan_index;

   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
      FETCH( func, *inst, 0, 0, chan_index );
      FETCH( func, *inst, 1, 1, chan_index );
      FETCH( func, *inst, 2, 2, chan_index );
      /* xmm0 = mask: all-ones where src0 < 0 */
      sse_cmpps(
         func,
         make_xmm( 0 ),
         get_temp(
            TGSI_EXEC_TEMP_00000000_I,
            TGSI_EXEC_TEMP_00000000_C ),
         cc_LessThan );
      /* xmm1 = src1 & mask */
      sse_andps(
         func,
         make_xmm( 1 ),
         make_xmm( 0 ) );
      /* xmm0 = ~mask & src2 */
      sse_andnps(
         func,
         make_xmm( 0 ),
         make_xmm( 2 ) );
      /* xmm0 = selected result */
      sse_orps(
         func,
         make_xmm( 0 ),
         make_xmm( 1 ) );
      STORE( func, *inst, 0, 0, chan_index );
   }
}
1759
1760
1761 /**
1762 * Check if inst src/dest regs use indirect addressing into temporary,
1763 * input or output register files.
1764 */
1765 static boolean
1766 indirect_reg_reference(const struct tgsi_full_instruction *inst)
1767 {
1768 uint i;
1769 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1770 const struct tgsi_full_src_register *reg = &inst->Src[i];
1771 if ((reg->Register.File == TGSI_FILE_TEMPORARY ||
1772 reg->Register.File == TGSI_FILE_INPUT ||
1773 reg->Register.File == TGSI_FILE_OUTPUT) &&
1774 reg->Register.Indirect)
1775 return TRUE;
1776 }
1777 for (i = 0; i < inst->Instruction.NumDstRegs; i++) {
1778 const struct tgsi_full_dst_register *reg = &inst->Dst[i];
1779 if ((reg->Register.File == TGSI_FILE_TEMPORARY ||
1780 reg->Register.File == TGSI_FILE_INPUT ||
1781 reg->Register.File == TGSI_FILE_OUTPUT) &&
1782 reg->Register.Indirect)
1783 return TRUE;
1784 }
1785 return FALSE;
1786 }
1787
1788
1789 static int
1790 emit_instruction(
1791 struct x86_function *func,
1792 struct tgsi_full_instruction *inst )
1793 {
1794 unsigned chan_index;
1795
1796 /* we can't handle indirect addressing into temp register file yet */
1797 if (indirect_reg_reference(inst))
1798 return FALSE;
1799
1800 switch (inst->Instruction.Opcode) {
1801 case TGSI_OPCODE_ARL:
1802 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1803 FETCH( func, *inst, 0, 0, chan_index );
1804 emit_flr(func, 0, 0);
1805 emit_f2it( func, 0 );
1806 STORE( func, *inst, 0, 0, chan_index );
1807 }
1808 break;
1809
1810 case TGSI_OPCODE_MOV:
1811 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1812 FETCH( func, *inst, 4 + chan_index, 0, chan_index );
1813 }
1814 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1815 STORE( func, *inst, 4 + chan_index, 0, chan_index );
1816 }
1817 break;
1818
1819 case TGSI_OPCODE_LIT:
1820 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1821 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1822 emit_tempf(
1823 func,
1824 0,
1825 TEMP_ONE_I,
1826 TEMP_ONE_C);
1827 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
1828 STORE( func, *inst, 0, 0, CHAN_X );
1829 }
1830 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1831 STORE( func, *inst, 0, 0, CHAN_W );
1832 }
1833 }
1834 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1835 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1836 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
1837 FETCH( func, *inst, 0, 0, CHAN_X );
1838 sse_maxps(
1839 func,
1840 make_xmm( 0 ),
1841 get_temp(
1842 TGSI_EXEC_TEMP_00000000_I,
1843 TGSI_EXEC_TEMP_00000000_C ) );
1844 STORE( func, *inst, 0, 0, CHAN_Y );
1845 }
1846 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1847 /* XMM[1] = SrcReg[0].yyyy */
1848 FETCH( func, *inst, 1, 0, CHAN_Y );
1849 /* XMM[1] = max(XMM[1], 0) */
1850 sse_maxps(
1851 func,
1852 make_xmm( 1 ),
1853 get_temp(
1854 TGSI_EXEC_TEMP_00000000_I,
1855 TGSI_EXEC_TEMP_00000000_C ) );
1856 /* XMM[2] = SrcReg[0].wwww */
1857 FETCH( func, *inst, 2, 0, CHAN_W );
1858 /* XMM[2] = min(XMM[2], 128.0) */
1859 sse_minps(
1860 func,
1861 make_xmm( 2 ),
1862 get_temp(
1863 TGSI_EXEC_TEMP_128_I,
1864 TGSI_EXEC_TEMP_128_C ) );
1865 /* XMM[2] = max(XMM[2], -128.0) */
1866 sse_maxps(
1867 func,
1868 make_xmm( 2 ),
1869 get_temp(
1870 TGSI_EXEC_TEMP_MINUS_128_I,
1871 TGSI_EXEC_TEMP_MINUS_128_C ) );
1872 emit_pow( func, 3, 1, 1, 2 );
1873 FETCH( func, *inst, 0, 0, CHAN_X );
1874 sse_xorps(
1875 func,
1876 make_xmm( 2 ),
1877 make_xmm( 2 ) );
1878 sse_cmpps(
1879 func,
1880 make_xmm( 2 ),
1881 make_xmm( 0 ),
1882 cc_LessThan );
1883 sse_andps(
1884 func,
1885 make_xmm( 2 ),
1886 make_xmm( 1 ) );
1887 STORE( func, *inst, 2, 0, CHAN_Z );
1888 }
1889 }
1890 break;
1891
1892 case TGSI_OPCODE_RCP:
1893 FETCH( func, *inst, 0, 0, CHAN_X );
1894 emit_rcp( func, 0, 0 );
1895 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1896 STORE( func, *inst, 0, 0, chan_index );
1897 }
1898 break;
1899
1900 case TGSI_OPCODE_RSQ:
1901 FETCH( func, *inst, 0, 0, CHAN_X );
1902 emit_abs( func, 0 );
1903 emit_rsqrt( func, 1, 0 );
1904 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1905 STORE( func, *inst, 1, 0, chan_index );
1906 }
1907 break;
1908
1909 case TGSI_OPCODE_EXP:
1910 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1911 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1912 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1913 FETCH( func, *inst, 0, 0, CHAN_X );
1914 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1915 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1916 emit_MOV( func, 1, 0 );
1917 emit_flr( func, 2, 1 );
1918 /* dst.x = ex2(floor(src.x)) */
1919 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
1920 emit_MOV( func, 2, 1 );
1921 emit_ex2( func, 3, 2 );
1922 STORE( func, *inst, 2, 0, CHAN_X );
1923 }
1924 /* dst.y = src.x - floor(src.x) */
1925 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1926 emit_MOV( func, 2, 0 );
1927 emit_sub( func, 2, 1 );
1928 STORE( func, *inst, 2, 0, CHAN_Y );
1929 }
1930 }
1931 /* dst.z = ex2(src.x) */
1932 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1933 emit_ex2( func, 3, 0 );
1934 STORE( func, *inst, 0, 0, CHAN_Z );
1935 }
1936 }
1937 /* dst.w = 1.0 */
1938 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
1939 emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
1940 STORE( func, *inst, 0, 0, CHAN_W );
1941 }
1942 break;
1943
1944 case TGSI_OPCODE_LOG:
1945 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1946 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1947 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1948 FETCH( func, *inst, 0, 0, CHAN_X );
1949 emit_abs( func, 0 );
1950 emit_MOV( func, 1, 0 );
1951 emit_lg2( func, 2, 1 );
1952 /* dst.z = lg2(abs(src.x)) */
1953 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1954 STORE( func, *inst, 1, 0, CHAN_Z );
1955 }
1956 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1957 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1958 emit_flr( func, 2, 1 );
1959 /* dst.x = floor(lg2(abs(src.x))) */
1960 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
1961 STORE( func, *inst, 1, 0, CHAN_X );
1962 }
1963 /* dst.x = abs(src)/ex2(floor(lg2(abs(src.x)))) */
1964 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1965 emit_ex2( func, 2, 1 );
1966 emit_rcp( func, 1, 1 );
1967 emit_mul( func, 0, 1 );
1968 STORE( func, *inst, 0, 0, CHAN_Y );
1969 }
1970 }
1971 }
1972 /* dst.w = 1.0 */
1973 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
1974 emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
1975 STORE( func, *inst, 0, 0, CHAN_W );
1976 }
1977 break;
1978
1979 case TGSI_OPCODE_MUL:
1980 /* do all fetches and adds, storing results in temp regs */
1981 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1982 int r = chan_index + 1;
1983 FETCH( func, *inst, 0, 0, chan_index ); /* load xmm[0] */
1984 FETCH( func, *inst, r, 1, chan_index ); /* load xmm[r] */
1985 emit_mul( func, r, 0 ); /* xmm[r] = xmm[r] * xmm[0] */
1986 }
1987 /* do all stores of the temp regs */
1988 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1989 int r = chan_index + 1;
1990 STORE( func, *inst, r, 0, chan_index ); /* store xmm[r] */
1991 }
1992 break;
1993
1994 case TGSI_OPCODE_ADD:
1995 /* do all fetches and adds, storing results in temp regs */
1996 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1997 int r = chan_index + 1;
1998 FETCH( func, *inst, 0, 0, chan_index ); /* load xmm[0] */
1999 FETCH( func, *inst, r, 1, chan_index ); /* load xmm[r] */
2000 emit_add( func, r, 0 ); /* xmm[r] = xmm[r] + xmm[0] */
2001 }
2002 /* do all stores of the temp regs */
2003 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2004 int r = chan_index + 1;
2005 STORE( func, *inst, r, 0, chan_index ); /* store xmm[r] */
2006 }
2007 break;
2008
2009 case TGSI_OPCODE_DP3:
2010 FETCH( func, *inst, 0, 0, CHAN_X );
2011 FETCH( func, *inst, 1, 1, CHAN_X );
2012 emit_mul( func, 0, 1 );
2013 FETCH( func, *inst, 1, 0, CHAN_Y );
2014 FETCH( func, *inst, 2, 1, CHAN_Y );
2015 emit_mul( func, 1, 2 );
2016 emit_add( func, 0, 1 );
2017 FETCH( func, *inst, 1, 0, CHAN_Z );
2018 FETCH( func, *inst, 2, 1, CHAN_Z );
2019 emit_mul( func, 1, 2 );
2020 emit_add( func, 0, 1 );
2021 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2022 STORE( func, *inst, 0, 0, chan_index );
2023 }
2024 break;
2025
2026 case TGSI_OPCODE_DP4:
2027 FETCH( func, *inst, 0, 0, CHAN_X );
2028 FETCH( func, *inst, 1, 1, CHAN_X );
2029 emit_mul( func, 0, 1 );
2030 FETCH( func, *inst, 1, 0, CHAN_Y );
2031 FETCH( func, *inst, 2, 1, CHAN_Y );
2032 emit_mul( func, 1, 2 );
2033 emit_add( func, 0, 1 );
2034 FETCH( func, *inst, 1, 0, CHAN_Z );
2035 FETCH( func, *inst, 2, 1, CHAN_Z );
2036 emit_mul(func, 1, 2 );
2037 emit_add(func, 0, 1 );
2038 FETCH( func, *inst, 1, 0, CHAN_W );
2039 FETCH( func, *inst, 2, 1, CHAN_W );
2040 emit_mul( func, 1, 2 );
2041 emit_add( func, 0, 1 );
2042 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2043 STORE( func, *inst, 0, 0, chan_index );
2044 }
2045 break;
2046
2047 case TGSI_OPCODE_DST:
2048 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
2049 emit_tempf(
2050 func,
2051 0,
2052 TEMP_ONE_I,
2053 TEMP_ONE_C );
2054 STORE( func, *inst, 0, 0, CHAN_X );
2055 }
2056 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2057 FETCH( func, *inst, 0, 0, CHAN_Y );
2058 FETCH( func, *inst, 1, 1, CHAN_Y );
2059 emit_mul( func, 0, 1 );
2060 STORE( func, *inst, 0, 0, CHAN_Y );
2061 }
2062 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2063 FETCH( func, *inst, 0, 0, CHAN_Z );
2064 STORE( func, *inst, 0, 0, CHAN_Z );
2065 }
2066 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2067 FETCH( func, *inst, 0, 1, CHAN_W );
2068 STORE( func, *inst, 0, 0, CHAN_W );
2069 }
2070 break;
2071
2072 case TGSI_OPCODE_MIN:
2073 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2074 FETCH( func, *inst, 0, 0, chan_index );
2075 FETCH( func, *inst, 1, 1, chan_index );
2076 sse_minps(
2077 func,
2078 make_xmm( 0 ),
2079 make_xmm( 1 ) );
2080 STORE( func, *inst, 0, 0, chan_index );
2081 }
2082 break;
2083
2084 case TGSI_OPCODE_MAX:
2085 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2086 FETCH( func, *inst, 0, 0, chan_index );
2087 FETCH( func, *inst, 1, 1, chan_index );
2088 sse_maxps(
2089 func,
2090 make_xmm( 0 ),
2091 make_xmm( 1 ) );
2092 STORE( func, *inst, 0, 0, chan_index );
2093 }
2094 break;
2095
2096 case TGSI_OPCODE_SLT:
2097 emit_setcc( func, inst, cc_LessThan );
2098 break;
2099
2100 case TGSI_OPCODE_SGE:
2101 emit_setcc( func, inst, cc_NotLessThan );
2102 break;
2103
2104 case TGSI_OPCODE_MAD:
2105 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2106 FETCH( func, *inst, 0, 0, chan_index );
2107 FETCH( func, *inst, 1, 1, chan_index );
2108 FETCH( func, *inst, 2, 2, chan_index );
2109 emit_mul( func, 0, 1 );
2110 emit_add( func, 0, 2 );
2111 STORE( func, *inst, 0, 0, chan_index );
2112 }
2113 break;
2114
2115 case TGSI_OPCODE_SUB:
2116 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2117 FETCH( func, *inst, 0, 0, chan_index );
2118 FETCH( func, *inst, 1, 1, chan_index );
2119 emit_sub( func, 0, 1 );
2120 STORE( func, *inst, 0, 0, chan_index );
2121 }
2122 break;
2123
2124 case TGSI_OPCODE_LRP:
2125 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2126 FETCH( func, *inst, 0, 0, chan_index );
2127 FETCH( func, *inst, 1, 1, chan_index );
2128 FETCH( func, *inst, 2, 2, chan_index );
2129 emit_sub( func, 1, 2 );
2130 emit_mul( func, 0, 1 );
2131 emit_add( func, 0, 2 );
2132 STORE( func, *inst, 0, 0, chan_index );
2133 }
2134 break;
2135
2136 case TGSI_OPCODE_CND:
2137 return 0;
2138 break;
2139
2140 case TGSI_OPCODE_DP2A:
2141 FETCH( func, *inst, 0, 0, CHAN_X ); /* xmm0 = src[0].x */
2142 FETCH( func, *inst, 1, 1, CHAN_X ); /* xmm1 = src[1].x */
2143 emit_mul( func, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
2144 FETCH( func, *inst, 1, 0, CHAN_Y ); /* xmm1 = src[0].y */
2145 FETCH( func, *inst, 2, 1, CHAN_Y ); /* xmm2 = src[1].y */
2146 emit_mul( func, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
2147 emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2148 FETCH( func, *inst, 1, 2, CHAN_X ); /* xmm1 = src[2].x */
2149 emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2150 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2151 STORE( func, *inst, 0, 0, chan_index ); /* dest[ch] = xmm0 */
2152 }
2153 break;
2154
2155 case TGSI_OPCODE_FRC:
2156 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2157 FETCH( func, *inst, 0, 0, chan_index );
2158 emit_frc( func, 0, 0 );
2159 STORE( func, *inst, 0, 0, chan_index );
2160 }
2161 break;
2162
2163 case TGSI_OPCODE_CLAMP:
2164 return 0;
2165 break;
2166
2167 case TGSI_OPCODE_FLR:
2168 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2169 FETCH( func, *inst, 0, 0, chan_index );
2170 emit_flr( func, 0, 0 );
2171 STORE( func, *inst, 0, 0, chan_index );
2172 }
2173 break;
2174
2175 case TGSI_OPCODE_ROUND:
2176 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2177 FETCH( func, *inst, 0, 0, chan_index );
2178 emit_rnd( func, 0, 0 );
2179 STORE( func, *inst, 0, 0, chan_index );
2180 }
2181 break;
2182
2183 case TGSI_OPCODE_EX2:
2184 FETCH( func, *inst, 0, 0, CHAN_X );
2185 emit_ex2( func, 0, 0 );
2186 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2187 STORE( func, *inst, 0, 0, chan_index );
2188 }
2189 break;
2190
2191 case TGSI_OPCODE_LG2:
2192 FETCH( func, *inst, 0, 0, CHAN_X );
2193 emit_lg2( func, 0, 0 );
2194 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2195 STORE( func, *inst, 0, 0, chan_index );
2196 }
2197 break;
2198
2199 case TGSI_OPCODE_POW:
2200 FETCH( func, *inst, 0, 0, CHAN_X );
2201 FETCH( func, *inst, 1, 1, CHAN_X );
2202 emit_pow( func, 0, 0, 0, 1 );
2203 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2204 STORE( func, *inst, 0, 0, chan_index );
2205 }
2206 break;
2207
2208 case TGSI_OPCODE_XPD:
2209 /* Note: we do all stores after all operands have been fetched
2210 * to avoid src/dst register aliasing issues for an instruction
2211 * such as: XPD TEMP[2].xyz, TEMP[0], TEMP[2];
2212 */
2213 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
2214 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
2215 FETCH( func, *inst, 1, 1, CHAN_Z ); /* xmm[1] = src[1].z */
2216 FETCH( func, *inst, 3, 0, CHAN_Z ); /* xmm[3] = src[0].z */
2217 }
2218 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
2219 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
2220 FETCH( func, *inst, 0, 0, CHAN_Y ); /* xmm[0] = src[0].y */
2221 FETCH( func, *inst, 4, 1, CHAN_Y ); /* xmm[4] = src[1].y */
2222 }
2223 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
2224 emit_MOV( func, 7, 0 ); /* xmm[7] = xmm[0] */
2225 emit_mul( func, 7, 1 ); /* xmm[7] = xmm[2] * xmm[1] */
2226 emit_MOV( func, 5, 3 ); /* xmm[5] = xmm[3] */
2227 emit_mul( func, 5, 4 ); /* xmm[5] = xmm[5] * xmm[4] */
2228 emit_sub( func, 7, 5 ); /* xmm[7] = xmm[2] - xmm[5] */
2229 /* store xmm[7] in dst.x below */
2230 }
2231 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
2232 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
2233 FETCH( func, *inst, 2, 1, CHAN_X ); /* xmm[2] = src[1].x */
2234 FETCH( func, *inst, 5, 0, CHAN_X ); /* xmm[5] = src[0].x */
2235 }
2236 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2237 emit_mul( func, 3, 2 ); /* xmm[3] = xmm[3] * xmm[2] */
2238 emit_mul( func, 1, 5 ); /* xmm[1] = xmm[1] * xmm[5] */
2239 emit_sub( func, 3, 1 ); /* xmm[3] = xmm[3] - xmm[1] */
2240 /* store xmm[3] in dst.y below */
2241 }
2242 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2243 emit_mul( func, 5, 4 ); /* xmm[5] = xmm[5] * xmm[4] */
2244 emit_mul( func, 0, 2 ); /* xmm[0] = xmm[0] * xmm[2] */
2245 emit_sub( func, 5, 0 ); /* xmm[5] = xmm[5] - xmm[0] */
2246 STORE( func, *inst, 5, 0, CHAN_Z ); /* dst.z = xmm[5] */
2247 }
2248 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
2249 STORE( func, *inst, 7, 0, CHAN_X ); /* dst.x = xmm[7] */
2250 }
2251 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2252 STORE( func, *inst, 3, 0, CHAN_Y ); /* dst.y = xmm[3] */
2253 }
2254 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2255 emit_tempf(
2256 func,
2257 0,
2258 TEMP_ONE_I,
2259 TEMP_ONE_C );
2260 STORE( func, *inst, 0, 0, CHAN_W );
2261 }
2262 break;
2263
2264 case TGSI_OPCODE_ABS:
2265 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2266 FETCH( func, *inst, 0, 0, chan_index );
2267 emit_abs( func, 0) ;
2268
2269 STORE( func, *inst, 0, 0, chan_index );
2270 }
2271 break;
2272
2273 case TGSI_OPCODE_RCC:
2274 return 0;
2275 break;
2276
2277 case TGSI_OPCODE_DPH:
2278 FETCH( func, *inst, 0, 0, CHAN_X );
2279 FETCH( func, *inst, 1, 1, CHAN_X );
2280 emit_mul( func, 0, 1 );
2281 FETCH( func, *inst, 1, 0, CHAN_Y );
2282 FETCH( func, *inst, 2, 1, CHAN_Y );
2283 emit_mul( func, 1, 2 );
2284 emit_add( func, 0, 1 );
2285 FETCH( func, *inst, 1, 0, CHAN_Z );
2286 FETCH( func, *inst, 2, 1, CHAN_Z );
2287 emit_mul( func, 1, 2 );
2288 emit_add( func, 0, 1 );
2289 FETCH( func, *inst, 1, 1, CHAN_W );
2290 emit_add( func, 0, 1 );
2291 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2292 STORE( func, *inst, 0, 0, chan_index );
2293 }
2294 break;
2295
2296 case TGSI_OPCODE_COS:
2297 FETCH( func, *inst, 0, 0, CHAN_X );
2298 emit_cos( func, 0, 0 );
2299 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2300 STORE( func, *inst, 0, 0, chan_index );
2301 }
2302 break;
2303
2304 case TGSI_OPCODE_DDX:
2305 return 0;
2306 break;
2307
2308 case TGSI_OPCODE_DDY:
2309 return 0;
2310 break;
2311
2312 case TGSI_OPCODE_KILP:
2313 /* predicated kill */
2314 emit_kilp( func );
2315 return 0; /* XXX fix me */
2316 break;
2317
2318 case TGSI_OPCODE_KIL:
2319 /* conditional kill */
2320 emit_kil( func, &inst->Src[0] );
2321 break;
2322
2323 case TGSI_OPCODE_PK2H:
2324 return 0;
2325 break;
2326
2327 case TGSI_OPCODE_PK2US:
2328 return 0;
2329 break;
2330
2331 case TGSI_OPCODE_PK4B:
2332 return 0;
2333 break;
2334
2335 case TGSI_OPCODE_PK4UB:
2336 return 0;
2337 break;
2338
2339 case TGSI_OPCODE_RFL:
2340 return 0;
2341 break;
2342
2343 case TGSI_OPCODE_SEQ:
2344 emit_setcc( func, inst, cc_Equal );
2345 break;
2346
2347 case TGSI_OPCODE_SFL:
2348 return 0;
2349 break;
2350
2351 case TGSI_OPCODE_SGT:
2352 emit_setcc( func, inst, cc_NotLessThanEqual );
2353 break;
2354
2355 case TGSI_OPCODE_SIN:
2356 FETCH( func, *inst, 0, 0, CHAN_X );
2357 emit_sin( func, 0, 0 );
2358 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2359 STORE( func, *inst, 0, 0, chan_index );
2360 }
2361 break;
2362
2363 case TGSI_OPCODE_SLE:
2364 emit_setcc( func, inst, cc_LessThanEqual );
2365 break;
2366
2367 case TGSI_OPCODE_SNE:
2368 emit_setcc( func, inst, cc_NotEqual );
2369 break;
2370
2371 case TGSI_OPCODE_STR:
2372 return 0;
2373 break;
2374
2375 case TGSI_OPCODE_TEX:
2376 emit_tex( func, inst, FALSE, FALSE );
2377 break;
2378
2379 case TGSI_OPCODE_TXD:
2380 return 0;
2381 break;
2382
2383 case TGSI_OPCODE_UP2H:
2384 return 0;
2385 break;
2386
2387 case TGSI_OPCODE_UP2US:
2388 return 0;
2389 break;
2390
2391 case TGSI_OPCODE_UP4B:
2392 return 0;
2393 break;
2394
2395 case TGSI_OPCODE_UP4UB:
2396 return 0;
2397 break;
2398
2399 case TGSI_OPCODE_X2D:
2400 return 0;
2401 break;
2402
2403 case TGSI_OPCODE_ARA:
2404 return 0;
2405 break;
2406
2407 case TGSI_OPCODE_ARR:
2408 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2409 FETCH( func, *inst, 0, 0, chan_index );
2410 emit_rnd( func, 0, 0 );
2411 emit_f2it( func, 0 );
2412 STORE( func, *inst, 0, 0, chan_index );
2413 }
2414 break;
2415
2416 case TGSI_OPCODE_BRA:
2417 return 0;
2418 break;
2419
2420 case TGSI_OPCODE_CAL:
2421 return 0;
2422 break;
2423
2424 case TGSI_OPCODE_RET:
2425 emit_ret( func );
2426 break;
2427
2428 case TGSI_OPCODE_END:
2429 break;
2430
2431 case TGSI_OPCODE_SSG:
2432 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2433 FETCH( func, *inst, 0, 0, chan_index );
2434 emit_sgn( func, 0, 0 );
2435 STORE( func, *inst, 0, 0, chan_index );
2436 }
2437 break;
2438
2439 case TGSI_OPCODE_CMP:
2440 emit_cmp (func, inst);
2441 break;
2442
2443 case TGSI_OPCODE_SCS:
2444 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
2445 FETCH( func, *inst, 0, 0, CHAN_X );
2446 emit_cos( func, 0, 0 );
2447 STORE( func, *inst, 0, 0, CHAN_X );
2448 }
2449 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2450 FETCH( func, *inst, 0, 0, CHAN_X );
2451 emit_sin( func, 0, 0 );
2452 STORE( func, *inst, 0, 0, CHAN_Y );
2453 }
2454 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2455 emit_tempf(
2456 func,
2457 0,
2458 TGSI_EXEC_TEMP_00000000_I,
2459 TGSI_EXEC_TEMP_00000000_C );
2460 STORE( func, *inst, 0, 0, CHAN_Z );
2461 }
2462 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2463 emit_tempf(
2464 func,
2465 0,
2466 TEMP_ONE_I,
2467 TEMP_ONE_C );
2468 STORE( func, *inst, 0, 0, CHAN_W );
2469 }
2470 break;
2471
2472 case TGSI_OPCODE_TXB:
2473 emit_tex( func, inst, TRUE, FALSE );
2474 break;
2475
2476 case TGSI_OPCODE_NRM:
2477 /* fall-through */
2478 case TGSI_OPCODE_NRM4:
2479 /* 3 or 4-component normalization */
2480 {
2481 uint dims = (inst->Instruction.Opcode == TGSI_OPCODE_NRM) ? 3 : 4;
2482
2483 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) ||
2484 IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y) ||
2485 IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z) ||
2486 (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 4)) {
2487
2488 /* NOTE: Cannot use xmm regs 2/3 here (see emit_rsqrt() above). */
2489
2490 /* xmm4 = src.x */
2491 /* xmm0 = src.x * src.x */
2492 FETCH(func, *inst, 0, 0, CHAN_X);
2493 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
2494 emit_MOV(func, 4, 0);
2495 }
2496 emit_mul(func, 0, 0);
2497
2498 /* xmm5 = src.y */
2499 /* xmm0 = xmm0 + src.y * src.y */
2500 FETCH(func, *inst, 1, 0, CHAN_Y);
2501 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2502 emit_MOV(func, 5, 1);
2503 }
2504 emit_mul(func, 1, 1);
2505 emit_add(func, 0, 1);
2506
2507 /* xmm6 = src.z */
2508 /* xmm0 = xmm0 + src.z * src.z */
2509 FETCH(func, *inst, 1, 0, CHAN_Z);
2510 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2511 emit_MOV(func, 6, 1);
2512 }
2513 emit_mul(func, 1, 1);
2514 emit_add(func, 0, 1);
2515
2516 if (dims == 4) {
2517 /* xmm7 = src.w */
2518 /* xmm0 = xmm0 + src.w * src.w */
2519 FETCH(func, *inst, 1, 0, CHAN_W);
2520 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W)) {
2521 emit_MOV(func, 7, 1);
2522 }
2523 emit_mul(func, 1, 1);
2524 emit_add(func, 0, 1);
2525 }
2526
2527 /* xmm1 = 1 / sqrt(xmm0) */
2528 emit_rsqrt(func, 1, 0);
2529
2530 /* dst.x = xmm1 * src.x */
2531 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
2532 emit_mul(func, 4, 1);
2533 STORE(func, *inst, 4, 0, CHAN_X);
2534 }
2535
2536 /* dst.y = xmm1 * src.y */
2537 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2538 emit_mul(func, 5, 1);
2539 STORE(func, *inst, 5, 0, CHAN_Y);
2540 }
2541
2542 /* dst.z = xmm1 * src.z */
2543 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2544 emit_mul(func, 6, 1);
2545 STORE(func, *inst, 6, 0, CHAN_Z);
2546 }
2547
2548 /* dst.w = xmm1 * src.w */
2549 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) && dims == 4) {
2550 emit_mul(func, 7, 1);
2551 STORE(func, *inst, 7, 0, CHAN_W);
2552 }
2553 }
2554
2555 /* dst0.w = 1.0 */
2556 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 3) {
2557 emit_tempf(func, 0, TEMP_ONE_I, TEMP_ONE_C);
2558 STORE(func, *inst, 0, 0, CHAN_W);
2559 }
2560 }
2561 break;
2562
2563 case TGSI_OPCODE_DIV:
2564 return 0;
2565 break;
2566
2567 case TGSI_OPCODE_DP2:
2568 FETCH( func, *inst, 0, 0, CHAN_X ); /* xmm0 = src[0].x */
2569 FETCH( func, *inst, 1, 1, CHAN_X ); /* xmm1 = src[1].x */
2570 emit_mul( func, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
2571 FETCH( func, *inst, 1, 0, CHAN_Y ); /* xmm1 = src[0].y */
2572 FETCH( func, *inst, 2, 1, CHAN_Y ); /* xmm2 = src[1].y */
2573 emit_mul( func, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
2574 emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2575 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2576 STORE( func, *inst, 0, 0, chan_index ); /* dest[ch] = xmm0 */
2577 }
2578 break;
2579
2580 case TGSI_OPCODE_TXL:
2581 return 0;
2582 break;
2583
2584 case TGSI_OPCODE_TXP:
2585 emit_tex( func, inst, FALSE, TRUE );
2586 break;
2587
2588 case TGSI_OPCODE_BRK:
2589 return 0;
2590 break;
2591
2592 case TGSI_OPCODE_IF:
2593 return 0;
2594 break;
2595
2596 case TGSI_OPCODE_ELSE:
2597 return 0;
2598 break;
2599
2600 case TGSI_OPCODE_ENDIF:
2601 return 0;
2602 break;
2603
2604 case TGSI_OPCODE_PUSHA:
2605 return 0;
2606 break;
2607
2608 case TGSI_OPCODE_POPA:
2609 return 0;
2610 break;
2611
2612 case TGSI_OPCODE_CEIL:
2613 return 0;
2614 break;
2615
2616 case TGSI_OPCODE_I2F:
2617 return 0;
2618 break;
2619
2620 case TGSI_OPCODE_NOT:
2621 return 0;
2622 break;
2623
2624 case TGSI_OPCODE_TRUNC:
2625 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2626 FETCH( func, *inst, 0, 0, chan_index );
2627 emit_f2it( func, 0 );
2628 emit_i2f( func, 0 );
2629 STORE( func, *inst, 0, 0, chan_index );
2630 }
2631 break;
2632
2633 case TGSI_OPCODE_SHL:
2634 return 0;
2635 break;
2636
2637 case TGSI_OPCODE_ISHR:
2638 return 0;
2639 break;
2640
2641 case TGSI_OPCODE_AND:
2642 return 0;
2643 break;
2644
2645 case TGSI_OPCODE_OR:
2646 return 0;
2647 break;
2648
2649 case TGSI_OPCODE_MOD:
2650 return 0;
2651 break;
2652
2653 case TGSI_OPCODE_XOR:
2654 return 0;
2655 break;
2656
2657 case TGSI_OPCODE_SAD:
2658 return 0;
2659 break;
2660
2661 case TGSI_OPCODE_TXF:
2662 return 0;
2663 break;
2664
2665 case TGSI_OPCODE_TXQ:
2666 return 0;
2667 break;
2668
2669 case TGSI_OPCODE_CONT:
2670 return 0;
2671 break;
2672
2673 case TGSI_OPCODE_EMIT:
2674 return 0;
2675 break;
2676
2677 case TGSI_OPCODE_ENDPRIM:
2678 return 0;
2679 break;
2680
2681 default:
2682 return 0;
2683 }
2684
2685 return 1;
2686 }
2687
2688 static void
2689 emit_declaration(
2690 struct x86_function *func,
2691 struct tgsi_full_declaration *decl )
2692 {
2693 if( decl->Declaration.File == TGSI_FILE_INPUT ) {
2694 unsigned first, last, mask;
2695 unsigned i, j;
2696
2697 first = decl->Range.First;
2698 last = decl->Range.Last;
2699 mask = decl->Declaration.UsageMask;
2700
2701 for( i = first; i <= last; i++ ) {
2702 for( j = 0; j < NUM_CHANNELS; j++ ) {
2703 if( mask & (1 << j) ) {
2704 switch( decl->Declaration.Interpolate ) {
2705 case TGSI_INTERPOLATE_CONSTANT:
2706 emit_coef_a0( func, 0, i, j );
2707 emit_inputs( func, 0, i, j );
2708 break;
2709
2710 case TGSI_INTERPOLATE_LINEAR:
2711 emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
2712 emit_coef_dadx( func, 1, i, j );
2713 emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
2714 emit_coef_dady( func, 3, i, j );
2715 emit_mul( func, 0, 1 ); /* x * dadx */
2716 emit_coef_a0( func, 4, i, j );
2717 emit_mul( func, 2, 3 ); /* y * dady */
2718 emit_add( func, 0, 4 ); /* x * dadx + a0 */
2719 emit_add( func, 0, 2 ); /* x * dadx + y * dady + a0 */
2720 emit_inputs( func, 0, i, j );
2721 break;
2722
2723 case TGSI_INTERPOLATE_PERSPECTIVE:
2724 emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
2725 emit_coef_dadx( func, 1, i, j );
2726 emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
2727 emit_coef_dady( func, 3, i, j );
2728 emit_mul( func, 0, 1 ); /* x * dadx */
2729 emit_tempf( func, 4, 0, TGSI_SWIZZLE_W );
2730 emit_coef_a0( func, 5, i, j );
2731 emit_rcp( func, 4, 4 ); /* 1.0 / w */
2732 emit_mul( func, 2, 3 ); /* y * dady */
2733 emit_add( func, 0, 5 ); /* x * dadx + a0 */
2734 emit_add( func, 0, 2 ); /* x * dadx + y * dady + a0 */
2735 emit_mul( func, 0, 4 ); /* (x * dadx + y * dady + a0) / w */
2736 emit_inputs( func, 0, i, j );
2737 break;
2738
2739 default:
2740 assert( 0 );
2741 break;
2742 }
2743 }
2744 }
2745 }
2746 }
2747 }
2748
2749 static void aos_to_soa( struct x86_function *func,
2750 uint arg_aos,
2751 uint arg_machine,
2752 uint arg_num,
2753 uint arg_stride )
2754 {
2755 struct x86_reg soa_input = x86_make_reg( file_REG32, reg_AX );
2756 struct x86_reg aos_input = x86_make_reg( file_REG32, reg_BX );
2757 struct x86_reg num_inputs = x86_make_reg( file_REG32, reg_CX );
2758 struct x86_reg stride = x86_make_reg( file_REG32, reg_DX );
2759 int loop_top, loop_exit_fixup;
2760
2761 /* Save EBX */
2762 x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
2763
2764 x86_mov( func, aos_input, x86_fn_arg( func, arg_aos ) );
2765 x86_mov( func, soa_input, x86_fn_arg( func, arg_machine ) );
2766 /* FIXME: tgsi_exec_machine::Inputs is a pointer now! */
2767 x86_lea( func, soa_input,
2768 x86_make_disp( soa_input,
2769 Offset(struct tgsi_exec_machine, Inputs) ) );
2770 x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) );
2771 x86_mov( func, stride, x86_fn_arg( func, arg_stride ) );
2772
2773 /* while (num_inputs != 0) */
2774 loop_top = x86_get_label( func );
2775 x86_cmp_imm( func, num_inputs, 0 );
2776 loop_exit_fixup = x86_jcc_forward( func, cc_E );
2777
2778 {
2779 x86_push( func, aos_input );
2780 sse_movlps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
2781 sse_movlps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
2782 x86_add( func, aos_input, stride );
2783 sse_movhps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
2784 sse_movhps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
2785 x86_add( func, aos_input, stride );
2786 sse_movlps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
2787 sse_movlps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
2788 x86_add( func, aos_input, stride );
2789 sse_movhps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
2790 sse_movhps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
2791 x86_pop( func, aos_input );
2792
2793 sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
2794 sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
2795 sse_shufps( func, make_xmm( 0 ), make_xmm( 1 ), 0x88 );
2796 sse_shufps( func, make_xmm( 2 ), make_xmm( 1 ), 0xdd );
2797 sse_shufps( func, make_xmm( 3 ), make_xmm( 4 ), 0x88 );
2798 sse_shufps( func, make_xmm( 5 ), make_xmm( 4 ), 0xdd );
2799
2800 sse_movups( func, x86_make_disp( soa_input, 0 ), make_xmm( 0 ) );
2801 sse_movups( func, x86_make_disp( soa_input, 16 ), make_xmm( 2 ) );
2802 sse_movups( func, x86_make_disp( soa_input, 32 ), make_xmm( 3 ) );
2803 sse_movups( func, x86_make_disp( soa_input, 48 ), make_xmm( 5 ) );
2804
2805 /* Advance to next input */
2806 x86_lea( func, aos_input, x86_make_disp(aos_input, 16) );
2807 x86_lea( func, soa_input, x86_make_disp(soa_input, 64) );
2808 }
2809 /* --num_inputs */
2810 x86_dec( func, num_inputs );
2811 x86_jmp( func, loop_top );
2812 x86_fixup_fwd_jump( func, loop_exit_fixup );
2813
2814 /* Restore EBX */
2815 x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
2816 }
2817
2818 static void soa_to_aos( struct x86_function *func,
2819 uint arg_aos,
2820 uint arg_machine,
2821 uint arg_num,
2822 uint arg_stride )
2823 {
2824 struct x86_reg soa_output = x86_make_reg( file_REG32, reg_AX );
2825 struct x86_reg aos_output = x86_make_reg( file_REG32, reg_BX );
2826 struct x86_reg num_outputs = x86_make_reg( file_REG32, reg_CX );
2827 struct x86_reg temp = x86_make_reg( file_REG32, reg_DX );
2828 int inner_loop;
2829
2830 /* Save EBX */
2831 x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
2832
2833 x86_mov( func, aos_output, x86_fn_arg( func, arg_aos ) );
2834 x86_mov( func, soa_output, x86_fn_arg( func, arg_machine ) );
2835 /* FIXME: tgsi_exec_machine::Ouputs is a pointer now! */
2836 x86_lea( func, soa_output,
2837 x86_make_disp( soa_output,
2838 Offset(struct tgsi_exec_machine, Outputs) ) );
2839 x86_mov( func, num_outputs, x86_fn_arg( func, arg_num ) );
2840
2841 /* do */
2842 inner_loop = x86_get_label( func );
2843 {
2844 sse_movups( func, make_xmm( 0 ), x86_make_disp( soa_output, 0 ) );
2845 sse_movups( func, make_xmm( 1 ), x86_make_disp( soa_output, 16 ) );
2846 sse_movups( func, make_xmm( 3 ), x86_make_disp( soa_output, 32 ) );
2847 sse_movups( func, make_xmm( 4 ), x86_make_disp( soa_output, 48 ) );
2848
2849 sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
2850 sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
2851 sse_unpcklps( func, make_xmm( 0 ), make_xmm( 1 ) );
2852 sse_unpckhps( func, make_xmm( 2 ), make_xmm( 1 ) );
2853 sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) );
2854 sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) );
2855
2856 x86_mov( func, temp, x86_fn_arg( func, arg_stride ) );
2857 x86_push( func, aos_output );
2858 sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
2859 sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
2860 x86_add( func, aos_output, temp );
2861 sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
2862 sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
2863 x86_add( func, aos_output, temp );
2864 sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
2865 sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
2866 x86_add( func, aos_output, temp );
2867 sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
2868 sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
2869 x86_pop( func, aos_output );
2870
2871 /* Advance to next output */
2872 x86_lea( func, aos_output, x86_make_disp(aos_output, 16) );
2873 x86_lea( func, soa_output, x86_make_disp(soa_output, 64) );
2874 }
2875 /* while --num_outputs */
2876 x86_dec( func, num_outputs );
2877 x86_jcc( func, cc_NE, inner_loop );
2878
2879 /* Restore EBX */
2880 x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
2881 }
2882
2883
2884 /**
2885 * Check if the instructions dst register is the same as any src
2886 * register and warn if there's a posible SOA dependency.
2887 */
2888 static boolean
2889 check_soa_dependencies(const struct tgsi_full_instruction *inst)
2890 {
2891 uint opcode = inst->Instruction.Opcode;
2892
2893 /* XXX: we only handle src/dst aliasing in a few opcodes currently.
2894 * Need to use an additional temporay to hold the result in the
2895 * cases where the code is too opaque to fix.
2896 */
2897
2898 switch (opcode) {
2899 case TGSI_OPCODE_ADD:
2900 case TGSI_OPCODE_MOV:
2901 case TGSI_OPCODE_MUL:
2902 case TGSI_OPCODE_RCP:
2903 case TGSI_OPCODE_RSQ:
2904 case TGSI_OPCODE_EXP:
2905 case TGSI_OPCODE_LOG:
2906 case TGSI_OPCODE_DP3:
2907 case TGSI_OPCODE_DP4:
2908 case TGSI_OPCODE_DP2A:
2909 case TGSI_OPCODE_EX2:
2910 case TGSI_OPCODE_LG2:
2911 case TGSI_OPCODE_POW:
2912 case TGSI_OPCODE_XPD:
2913 case TGSI_OPCODE_DPH:
2914 case TGSI_OPCODE_COS:
2915 case TGSI_OPCODE_SIN:
2916 case TGSI_OPCODE_TEX:
2917 case TGSI_OPCODE_TXB:
2918 case TGSI_OPCODE_TXP:
2919 case TGSI_OPCODE_NRM:
2920 case TGSI_OPCODE_NRM4:
2921 case TGSI_OPCODE_DP2:
2922 /* OK - these opcodes correctly handle SOA dependencies */
2923 return TRUE;
2924 default:
2925 if (!tgsi_check_soa_dependencies(inst))
2926 return TRUE;
2927
2928 debug_printf("Warning: src/dst aliasing in instruction"
2929 " is not handled:\n");
2930 debug_printf("Warning: ");
2931 tgsi_dump_instruction(inst, 1);
2932
2933 return FALSE;
2934 }
2935 }
2936
2937
2938 /**
2939 * Translate a TGSI vertex/fragment shader to SSE2 code.
2940 * Slightly different things are done for vertex vs. fragment shaders.
2941 *
2942 * \param tokens the TGSI input shader
2943 * \param func the output SSE code/function
2944 * \param immediates buffer to place immediates, later passed to SSE func
2945 * \param return 1 for success, 0 if translation failed
2946 */
2947 unsigned
2948 tgsi_emit_sse2(
2949 const struct tgsi_token *tokens,
2950 struct x86_function *func,
2951 float (*immediates)[4],
2952 boolean do_swizzles )
2953 {
2954 struct tgsi_parse_context parse;
2955 unsigned ok = 1;
2956 uint num_immediates = 0;
2957
2958 util_init_math();
2959
2960 func->csr = func->store;
2961
2962 tgsi_parse_init( &parse, tokens );
2963
2964 /* Can't just use EDI, EBX without save/restoring them:
2965 */
2966 x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
2967 x86_push( func, x86_make_reg( file_REG32, reg_DI ) );
2968
2969 /*
2970 * Different function args for vertex/fragment shaders:
2971 */
2972 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
2973 if (do_swizzles)
2974 aos_to_soa( func,
2975 4, /* aos_input */
2976 1, /* machine */
2977 5, /* num_inputs */
2978 6 ); /* input_stride */
2979 }
2980
2981 x86_mov(
2982 func,
2983 get_machine_base(),
2984 x86_fn_arg( func, 1 ) );
2985 x86_mov(
2986 func,
2987 get_const_base(),
2988 x86_fn_arg( func, 2 ) );
2989 x86_mov(
2990 func,
2991 get_immediate_base(),
2992 x86_fn_arg( func, 3 ) );
2993
2994 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
2995 x86_mov(
2996 func,
2997 get_coef_base(),
2998 x86_fn_arg( func, 4 ) );
2999 }
3000
3001 x86_mov(
3002 func,
3003 get_sampler_base(),
3004 x86_make_disp( get_machine_base(),
3005 Offset( struct tgsi_exec_machine, Samplers ) ) );
3006
3007 while( !tgsi_parse_end_of_tokens( &parse ) && ok ) {
3008 tgsi_parse_token( &parse );
3009
3010 switch( parse.FullToken.Token.Type ) {
3011 case TGSI_TOKEN_TYPE_DECLARATION:
3012 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
3013 emit_declaration(
3014 func,
3015 &parse.FullToken.FullDeclaration );
3016 }
3017 break;
3018
3019 case TGSI_TOKEN_TYPE_INSTRUCTION:
3020 ok = emit_instruction(
3021 func,
3022 &parse.FullToken.FullInstruction );
3023
3024 if (!ok) {
3025 uint opcode = parse.FullToken.FullInstruction.Instruction.Opcode;
3026 uint proc = parse.FullHeader.Processor.Processor;
3027 debug_printf("failed to translate tgsi opcode %d (%s) to SSE (%s)\n",
3028 opcode,
3029 tgsi_get_opcode_name(opcode),
3030 tgsi_get_processor_name(proc));
3031 }
3032
3033 if (ok)
3034 ok = check_soa_dependencies(&parse.FullToken.FullInstruction);
3035 break;
3036
3037 case TGSI_TOKEN_TYPE_IMMEDIATE:
3038 /* simply copy the immediate values into the next immediates[] slot */
3039 {
3040 const uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
3041 uint i;
3042 assert(size <= 4);
3043 assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES);
3044 for( i = 0; i < size; i++ ) {
3045 immediates[num_immediates][i] =
3046 parse.FullToken.FullImmediate.u[i].Float;
3047 }
3048 #if 0
3049 debug_printf("SSE FS immediate[%d] = %f %f %f %f\n",
3050 num_immediates,
3051 immediates[num_immediates][0],
3052 immediates[num_immediates][1],
3053 immediates[num_immediates][2],
3054 immediates[num_immediates][3]);
3055 #endif
3056 num_immediates++;
3057 }
3058 break;
3059 case TGSI_TOKEN_TYPE_PROPERTY:
3060 /* we just ignore them for now */
3061 break;
3062
3063 default:
3064 ok = 0;
3065 assert( 0 );
3066 }
3067 }
3068
3069 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
3070 if (do_swizzles)
3071 soa_to_aos( func,
3072 7, /* aos_output */
3073 1, /* machine */
3074 8, /* num_outputs */
3075 9 ); /* output_stride */
3076 }
3077
3078 /* Can't just use EBX, EDI without save/restoring them:
3079 */
3080 x86_pop( func, x86_make_reg( file_REG32, reg_DI ) );
3081 x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
3082
3083 emit_ret( func );
3084
3085 tgsi_parse_free( &parse );
3086
3087 return ok;
3088 }
3089
#else /* !PIPE_ARCH_X86 (or SSE2 codegen disabled above) */
3091
3092 unsigned
3093 tgsi_emit_sse2(
3094 const struct tgsi_token *tokens,
3095 struct x86_function *func,
3096 float (*immediates)[4],
3097 boolean do_swizzles )
3098 {
3099 return 0;
3100 }
3101
#endif /* !PIPE_ARCH_X86 (or SSE2 codegen disabled above) */