92ba8b8f033225fa008c9c3b558acc4aec4bd894
[mesa.git] / src / gallium / auxiliary / tgsi / tgsi_sse2.c
1 /**************************************************************************
2 *
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 * Copyright 2009-2010 VMware, Inc. All rights Reserved.
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the
9 * "Software"), to deal in the Software without restriction, including
10 * without limitation the rights to use, copy, modify, merge, publish,
11 * distribute, sub license, and/or sell copies of the Software, and to
12 * permit persons to whom the Software is furnished to do so, subject to
13 * the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the
16 * next paragraph) shall be included in all copies or substantial portions
17 * of the Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
22 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
23 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 *
27 **************************************************************************/
28
29 #include "pipe/p_config.h"
30
31 #if defined(PIPE_ARCH_X86)
32
33 #include "util/u_debug.h"
34 #include "pipe/p_shader_tokens.h"
35 #include "util/u_math.h"
36 #include "util/u_memory.h"
37 #if defined(PIPE_ARCH_SSE)
38 #include "util/u_sse.h"
39 #endif
40 #include "tgsi/tgsi_info.h"
41 #include "tgsi/tgsi_parse.h"
42 #include "tgsi/tgsi_util.h"
43 #include "tgsi/tgsi_dump.h"
44 #include "tgsi/tgsi_exec.h"
45 #include "tgsi/tgsi_sse2.h"
46
47 #include "rtasm/rtasm_x86sse.h"
48
49 /* for 1/sqrt()
50 *
51 * This costs about 100fps (close to 10%) in gears:
52 */
53 #define HIGH_PRECISION 1
54
55 #define FAST_MATH 1
56
57
58 #define FOR_EACH_CHANNEL( CHAN )\
59 for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)
60
61 #define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
62 ((INST).Dst[0].Register.WriteMask & (1 << (CHAN)))
63
64 #define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
65 if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))
66
67 #define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
68 FOR_EACH_CHANNEL( CHAN )\
69 IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )
70
71 #define CHAN_X 0
72 #define CHAN_Y 1
73 #define CHAN_Z 2
74 #define CHAN_W 3
75
76 #define TEMP_ONE_I TGSI_EXEC_TEMP_ONE_I
77 #define TEMP_ONE_C TGSI_EXEC_TEMP_ONE_C
78
79 #define TEMP_R0 TGSI_EXEC_TEMP_R0
80 #define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
81 #define TEMP_EXEC_MASK_I TGSI_EXEC_MASK_I
82 #define TEMP_EXEC_MASK_C TGSI_EXEC_MASK_C
83
84
85 /**
86 * X86 utility functions.
87 */
88
89 static struct x86_reg
90 make_xmm(
91 unsigned xmm )
92 {
93 return x86_make_reg(
94 file_XMM,
95 (enum x86_reg_name) xmm );
96 }
97
98 /**
99 * X86 register mapping helpers.
100 */
101
102 static struct x86_reg
103 get_const_base( void )
104 {
105 return x86_make_reg(
106 file_REG32,
107 reg_AX );
108 }
109
110 static struct x86_reg
111 get_machine_base( void )
112 {
113 return x86_make_reg(
114 file_REG32,
115 reg_CX );
116 }
117
118 static struct x86_reg
119 get_input_base( void )
120 {
121 return x86_make_disp(
122 get_machine_base(),
123 Offset(struct tgsi_exec_machine, Inputs) );
124 }
125
126 static struct x86_reg
127 get_output_base( void )
128 {
129 return x86_make_disp(
130 get_machine_base(),
131 Offset(struct tgsi_exec_machine, Outputs) );
132 }
133
134 static struct x86_reg
135 get_temp_base( void )
136 {
137 return x86_make_disp(
138 get_machine_base(),
139 Offset(struct tgsi_exec_machine, Temps) );
140 }
141
142 static struct x86_reg
143 get_coef_base( void )
144 {
145 return x86_make_reg(
146 file_REG32,
147 reg_BX );
148 }
149
150 static struct x86_reg
151 get_sampler_base( void )
152 {
153 return x86_make_reg(
154 file_REG32,
155 reg_DI );
156 }
157
158 static struct x86_reg
159 get_immediate_base( void )
160 {
161 return x86_make_reg(
162 file_REG32,
163 reg_DX );
164 }
165
166 static struct x86_reg
167 get_system_value_base( void )
168 {
169 return x86_make_disp(
170 get_machine_base(),
171 Offset(struct tgsi_exec_machine, SystemValue) );
172 }
173
174
175 /**
176 * Data access helpers.
177 */
178
179
180 static struct x86_reg
181 get_immediate(
182 unsigned vec,
183 unsigned chan )
184 {
185 return x86_make_disp(
186 get_immediate_base(),
187 (vec * 4 + chan) * 4 );
188 }
189
190 static struct x86_reg
191 get_const(
192 unsigned vec,
193 unsigned chan )
194 {
195 return x86_make_disp(
196 get_const_base(),
197 (vec * 4 + chan) * 4 );
198 }
199
200 static struct x86_reg
201 get_sampler_ptr(
202 unsigned unit )
203 {
204 return x86_make_disp(
205 get_sampler_base(),
206 unit * sizeof( struct tgsi_sampler * ) );
207 }
208
209 static struct x86_reg
210 get_input(
211 unsigned vec,
212 unsigned chan )
213 {
214 return x86_make_disp(
215 get_input_base(),
216 (vec * 4 + chan) * 16 );
217 }
218
219 static struct x86_reg
220 get_output(
221 unsigned vec,
222 unsigned chan )
223 {
224 return x86_make_disp(
225 get_output_base(),
226 (vec * 4 + chan) * 16 );
227 }
228
229 static struct x86_reg
230 get_temp(
231 unsigned vec,
232 unsigned chan )
233 {
234 return x86_make_disp(
235 get_temp_base(),
236 (vec * 4 + chan) * 16 );
237 }
238
239 static struct x86_reg
240 get_system_value(
241 unsigned vec,
242 unsigned chan )
243 {
244 return x86_make_disp(
245 get_system_value_base(), /* base */
246 (vec * 4 + chan) * 4 ); /* byte offset from base */
247 }
248
249 static struct x86_reg
250 get_coef(
251 unsigned vec,
252 unsigned chan,
253 unsigned member )
254 {
255 return x86_make_disp(
256 get_coef_base(),
257 ((vec * 3 + member) * 4 + chan) * 4 );
258 }
259
260
/** Emit a function return instruction. */
static void
emit_ret( struct x86_function *func )
{
   x86_ret( func );
}
267
268
269 /**
270 * Data fetch helpers.
271 */
272
273 /**
274 * Copy a shader constant to xmm register
275 * \param xmm the destination xmm register
276 * \param vec the src const buffer index
277 * \param chan src channel to fetch (X, Y, Z or W)
278 */
279 static void
280 emit_const(
281 struct x86_function *func,
282 uint xmm,
283 int vec,
284 uint chan,
285 uint indirect,
286 uint indirectFile,
287 int indirectIndex )
288 {
289 if (indirect) {
290 /* 'vec' is the offset from the address register's value.
291 * We're loading CONST[ADDR+vec] into an xmm register.
292 */
293 struct x86_reg r0 = get_immediate_base();
294 struct x86_reg r1 = get_coef_base();
295 uint i;
296
297 assert( indirectFile == TGSI_FILE_ADDRESS );
298 assert( indirectIndex == 0 );
299 assert( r0.mod == mod_REG );
300 assert( r1.mod == mod_REG );
301
302 x86_push( func, r0 );
303 x86_push( func, r1 );
304
305 /*
306 * Loop over the four pixels or vertices in the quad.
307 * Get the value of the address (offset) register for pixel/vertex[i],
308 * add it to the src offset and index into the constant buffer.
309 * Note that we're working on SOA data.
310 * If any of the pixel/vertex execution channels are unused their
311 * values will be garbage. It's very important that we don't use
312 * those garbage values as indexes into the constant buffer since
313 * that'll cause segfaults.
314 * The solution is to bitwise-AND the offset with the execution mask
315 * register whose values are either 0 or ~0.
316 * The caller must setup the execution mask register to indicate
317 * which channels are valid/alive before running the shader.
318 * The execution mask will also figure into loops and conditionals
319 * someday.
320 */
321 for (i = 0; i < QUAD_SIZE; i++) {
322 /* r1 = address register[i] */
323 x86_mov( func, r1, x86_make_disp( get_temp( TEMP_ADDR, CHAN_X ), i * 4 ) );
324 /* r0 = execution mask[i] */
325 x86_mov( func, r0, x86_make_disp( get_temp( TEMP_EXEC_MASK_I, TEMP_EXEC_MASK_C ), i * 4 ) );
326 /* r1 = r1 & r0 */
327 x86_and( func, r1, r0 );
328 /* r0 = 'vec', the offset */
329 x86_lea( func, r0, get_const( vec, chan ) );
330
331 /* Quick hack to multiply r1 by 16 -- need to add SHL to rtasm.
332 */
333 x86_add( func, r1, r1 );
334 x86_add( func, r1, r1 );
335 x86_add( func, r1, r1 );
336 x86_add( func, r1, r1 );
337
338 x86_add( func, r0, r1 ); /* r0 = r0 + r1 */
339 x86_mov( func, r1, x86_deref( r0 ) );
340 x86_mov( func, x86_make_disp( get_temp( TEMP_R0, CHAN_X ), i * 4 ), r1 );
341 }
342
343 x86_pop( func, r1 );
344 x86_pop( func, r0 );
345
346 sse_movaps(
347 func,
348 make_xmm( xmm ),
349 get_temp( TEMP_R0, CHAN_X ) );
350 }
351 else {
352 /* 'vec' is the index into the src register file, such as TEMP[vec] */
353 assert( vec >= 0 );
354
355 sse_movss(
356 func,
357 make_xmm( xmm ),
358 get_const( vec, chan ) );
359 sse_shufps(
360 func,
361 make_xmm( xmm ),
362 make_xmm( xmm ),
363 SHUF( 0, 0, 0, 0 ) );
364 }
365 }
366
367 static void
368 emit_immediate(
369 struct x86_function *func,
370 unsigned xmm,
371 unsigned vec,
372 unsigned chan )
373 {
374 sse_movss(
375 func,
376 make_xmm( xmm ),
377 get_immediate( vec, chan ) );
378 sse_shufps(
379 func,
380 make_xmm( xmm ),
381 make_xmm( xmm ),
382 SHUF( 0, 0, 0, 0 ) );
383 }
384
385
386 /**
387 * Copy a shader input to xmm register
388 * \param xmm the destination xmm register
389 * \param vec the src input attrib
390 * \param chan src channel to fetch (X, Y, Z or W)
391 */
/**
 * Load a shader input channel (SOA vector) into an xmm register.
 * \param xmm   destination xmm register
 * \param vec   source input attrib
 * \param chan  channel to fetch (X, Y, Z or W)
 */
static void
emit_inputf(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   /* movups: input area alignment is not guaranteed */
   sse_movups( func, make_xmm( xmm ), get_input( vec, chan ) );
}

/**
 * Store an xmm register to a shader output channel.
 * \param xmm   source xmm register
 * \param vec   destination output attrib
 * \param chan  channel to store (X, Y, Z or W)
 */
static void
emit_output(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   sse_movups( func, get_output( vec, chan ), make_xmm( xmm ) );
}

/**
 * Load a shader temporary channel (SOA vector, 16-byte aligned) into an
 * xmm register.
 * \param xmm   destination xmm register
 * \param vec   source temp register
 * \param chan  channel to fetch (X, Y, Z or W)
 */
static void
emit_tempf(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   sse_movaps( func, make_xmm( xmm ), get_temp( vec, chan ) );
}
442
443 /**
444 * Copy a system value to xmm register
445 * \param xmm the destination xmm register
446 * \param vec the source system value register
447 * \param chan src channel to fetch (X, Y, Z or W)
448 */
449 static void
450 emit_system_value(
451 struct x86_function *func,
452 unsigned xmm,
453 unsigned vec,
454 unsigned chan )
455 {
456 sse_movss(
457 func,
458 make_xmm( xmm ),
459 get_system_value( vec, chan ) );
460 sse_shufps(
461 func,
462 make_xmm( xmm ),
463 make_xmm( xmm ),
464 SHUF( 0, 0, 0, 0 ) );
465 }
466
467 /**
468 * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
469 * \param xmm the destination xmm register
470 * \param vec the src input/attribute coefficient index
471 * \param chan src channel to fetch (X, Y, Z or W)
472 * \param member 0=a0, 1=dadx, 2=dady
473 */
474 static void
475 emit_coef(
476 struct x86_function *func,
477 unsigned xmm,
478 unsigned vec,
479 unsigned chan,
480 unsigned member )
481 {
482 sse_movss(
483 func,
484 make_xmm( xmm ),
485 get_coef( vec, chan, member ) );
486 sse_shufps(
487 func,
488 make_xmm( xmm ),
489 make_xmm( xmm ),
490 SHUF( 0, 0, 0, 0 ) );
491 }
492
493 /**
494 * Data store helpers.
495 */
496
497 static void
498 emit_inputs(
499 struct x86_function *func,
500 unsigned xmm,
501 unsigned vec,
502 unsigned chan )
503 {
504 sse_movups(
505 func,
506 get_input( vec, chan ),
507 make_xmm( xmm ) );
508 }
509
510 static void
511 emit_temps(
512 struct x86_function *func,
513 unsigned xmm,
514 unsigned vec,
515 unsigned chan )
516 {
517 sse_movaps(
518 func,
519 get_temp( vec, chan ),
520 make_xmm( xmm ) );
521 }
522
523 static void
524 emit_addrs(
525 struct x86_function *func,
526 unsigned xmm,
527 unsigned vec,
528 unsigned chan )
529 {
530 assert( vec == 0 );
531
532 emit_temps(
533 func,
534 xmm,
535 vec + TGSI_EXEC_TEMP_ADDR,
536 chan );
537 }
538
539 /**
540 * Coefficent fetch helpers.
541 */
542
/** Fetch the a0 (constant) interpolation coefficient. */
static void
emit_coef_a0(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   emit_coef( func, xmm, vec, chan, 0 );
}

/** Fetch the dadx (x-derivative) interpolation coefficient. */
static void
emit_coef_dadx(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   emit_coef( func, xmm, vec, chan, 1 );
}

/** Fetch the dady (y-derivative) interpolation coefficient. */
static void
emit_coef_dady(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   emit_coef( func, xmm, vec, chan, 2 );
}
587
588 /**
589 * Function call helpers.
590 */
591
592 /**
593 * NOTE: In gcc, if the destination uses the SSE intrinsics, then it must be
594 * defined with __attribute__((force_align_arg_pointer)), as we do not guarantee
595 * that the stack pointer is 16 byte aligned, as expected.
596 */
597 static void
598 emit_func_call(
599 struct x86_function *func,
600 unsigned xmm_save_mask,
601 const struct x86_reg *arg,
602 unsigned nr_args,
603 void (PIPE_CDECL *code)() )
604 {
605 struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
606 unsigned i, n;
607
608 x86_push(
609 func,
610 x86_make_reg( file_REG32, reg_AX) );
611 x86_push(
612 func,
613 x86_make_reg( file_REG32, reg_CX) );
614 x86_push(
615 func,
616 x86_make_reg( file_REG32, reg_DX) );
617
618 /* Store XMM regs to the stack
619 */
620 for(i = 0, n = 0; i < 8; ++i)
621 if(xmm_save_mask & (1 << i))
622 ++n;
623
624 x86_sub_imm(
625 func,
626 x86_make_reg( file_REG32, reg_SP ),
627 n*16);
628
629 for(i = 0, n = 0; i < 8; ++i)
630 if(xmm_save_mask & (1 << i)) {
631 sse_movups(
632 func,
633 x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ),
634 make_xmm( i ) );
635 ++n;
636 }
637
638 for (i = 0; i < nr_args; i++) {
639 /* Load the address of the buffer we use for passing arguments and
640 * receiving results:
641 */
642 x86_lea(
643 func,
644 ecx,
645 arg[i] );
646
647 /* Push actual function arguments (currently just the pointer to
648 * the buffer above), and call the function:
649 */
650 x86_push( func, ecx );
651 }
652
653 x86_mov_reg_imm( func, ecx, (unsigned long) code );
654 x86_call( func, ecx );
655
656 /* Pop the arguments (or just add an immediate to esp)
657 */
658 for (i = 0; i < nr_args; i++) {
659 x86_pop(func, ecx );
660 }
661
662 /* Pop the saved XMM regs:
663 */
664 for(i = 0, n = 0; i < 8; ++i)
665 if(xmm_save_mask & (1 << i)) {
666 sse_movups(
667 func,
668 make_xmm( i ),
669 x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ) );
670 ++n;
671 }
672
673 x86_add_imm(
674 func,
675 x86_make_reg( file_REG32, reg_SP ),
676 n*16);
677
678 /* Restore GP registers in a reverse order.
679 */
680 x86_pop(
681 func,
682 x86_make_reg( file_REG32, reg_DX) );
683 x86_pop(
684 func,
685 x86_make_reg( file_REG32, reg_CX) );
686 x86_pop(
687 func,
688 x86_make_reg( file_REG32, reg_AX) );
689 }
690
691 static void
692 emit_func_call_dst_src1(
693 struct x86_function *func,
694 unsigned xmm_save,
695 unsigned xmm_dst,
696 unsigned xmm_src0,
697 void (PIPE_CDECL *code)() )
698 {
699 struct x86_reg store = get_temp( TEMP_R0, 0 );
700 unsigned xmm_mask = ((1 << xmm_save) - 1) & ~(1 << xmm_dst);
701
702 /* Store our input parameters (in xmm regs) to the buffer we use
703 * for passing arguments. We will pass a pointer to this buffer as
704 * the actual function argument.
705 */
706 sse_movaps(
707 func,
708 store,
709 make_xmm( xmm_src0 ) );
710
711 emit_func_call( func,
712 xmm_mask,
713 &store,
714 1,
715 code );
716
717 sse_movaps(
718 func,
719 make_xmm( xmm_dst ),
720 store );
721 }
722
723
724 static void
725 emit_func_call_dst_src2(
726 struct x86_function *func,
727 unsigned xmm_save,
728 unsigned xmm_dst,
729 unsigned xmm_src0,
730 unsigned xmm_src1,
731 void (PIPE_CDECL *code)() )
732 {
733 struct x86_reg store = get_temp( TEMP_R0, 0 );
734 unsigned xmm_mask = ((1 << xmm_save) - 1) & ~(1 << xmm_dst);
735
736 /* Store two inputs to parameter buffer.
737 */
738 sse_movaps(
739 func,
740 store,
741 make_xmm( xmm_src0 ) );
742
743 sse_movaps(
744 func,
745 x86_make_disp( store, 4 * sizeof(float) ),
746 make_xmm( xmm_src1 ) );
747
748
749 /* Emit the call
750 */
751 emit_func_call( func,
752 xmm_mask,
753 &store,
754 1,
755 code );
756
757 /* Retrieve the results:
758 */
759 sse_movaps(
760 func,
761 make_xmm( xmm_dst ),
762 store );
763 }
764
765
766
767
768
769 #if defined(PIPE_ARCH_SSE)
770
771 /*
772 * Fast SSE2 implementation of special math functions.
773 */
774
775 #define POLY0(x, c0) _mm_set1_ps(c0)
776 #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
777 #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
778 #define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
779 #define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
780 #define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
781
782 #define EXP_POLY_DEGREE 3
783 #define LOG_POLY_DEGREE 5
784
785 /**
786 * See http://www.devmaster.net/forums/showthread.php?p=43580
787 */
788 static INLINE __m128
789 exp2f4(__m128 x)
790 {
791 __m128i ipart;
792 __m128 fpart, expipart, expfpart;
793
794 x = _mm_min_ps(x, _mm_set1_ps( 129.00000f));
795 x = _mm_max_ps(x, _mm_set1_ps(-126.99999f));
796
797 /* ipart = int(x - 0.5) */
798 ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f)));
799
800 /* fpart = x - ipart */
801 fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart));
802
803 /* expipart = (float) (1 << ipart) */
804 expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23));
805
806 /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
807 #if EXP_POLY_DEGREE == 5
808 expfpart = POLY5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f);
809 #elif EXP_POLY_DEGREE == 4
810 expfpart = POLY4(fpart, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 5.2011464e-2f, 1.3534167e-2f);
811 #elif EXP_POLY_DEGREE == 3
812 expfpart = POLY3(fpart, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 7.8024521e-2f);
813 #elif EXP_POLY_DEGREE == 2
814 expfpart = POLY2(fpart, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f);
815 #else
816 #error
817 #endif
818
819 return _mm_mul_ps(expipart, expfpart);
820 }
821
822
823 /**
824 * See http://www.devmaster.net/forums/showthread.php?p=43580
825 */
826 static INLINE __m128
827 log2f4(__m128 x)
828 {
829 __m128i expmask = _mm_set1_epi32(0x7f800000);
830 __m128i mantmask = _mm_set1_epi32(0x007fffff);
831 __m128 one = _mm_set1_ps(1.0f);
832
833 __m128i i = _mm_castps_si128(x);
834
835 /* exp = (float) exponent(x) */
836 __m128 exp = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, expmask), 23), _mm_set1_epi32(127)));
837
838 /* mant = (float) mantissa(x) */
839 __m128 mant = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, mantmask)), one);
840
841 __m128 logmant;
842
843 /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
844 * These coefficients can be generate with
845 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
846 */
847 #if LOG_POLY_DEGREE == 6
848 logmant = POLY5(mant, 3.11578814719469302614f, -3.32419399085241980044f, 2.59883907202499966007f, -1.23152682416275988241f, 0.318212422185251071475f, -0.0344359067839062357313f);
849 #elif LOG_POLY_DEGREE == 5
850 logmant = POLY4(mant, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
851 #elif LOG_POLY_DEGREE == 4
852 logmant = POLY3(mant, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
853 #elif LOG_POLY_DEGREE == 3
854 logmant = POLY2(mant, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
855 #else
856 #error
857 #endif
858
859 /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
860 logmant = _mm_mul_ps(logmant, _mm_sub_ps(mant, one));
861
862 return _mm_add_ps(logmant, exp);
863 }
864
865
866 static INLINE __m128
867 powf4(__m128 x, __m128 y)
868 {
869 return exp2f4(_mm_mul_ps(log2f4(x), y));
870 }
871
872 #endif /* PIPE_ARCH_SSE */
873
874
875
876 /**
877 * Low-level instruction translators.
878 */
879
880 static void
881 emit_abs(
882 struct x86_function *func,
883 unsigned xmm )
884 {
885 sse_andps(
886 func,
887 make_xmm( xmm ),
888 get_temp(
889 TGSI_EXEC_TEMP_7FFFFFFF_I,
890 TGSI_EXEC_TEMP_7FFFFFFF_C ) );
891 }
892
893 static void
894 emit_add(
895 struct x86_function *func,
896 unsigned xmm_dst,
897 unsigned xmm_src )
898 {
899 sse_addps(
900 func,
901 make_xmm( xmm_dst ),
902 make_xmm( xmm_src ) );
903 }
904
905 static void PIPE_CDECL
906 cos4f(
907 float *store )
908 {
909 store[0] = cosf( store[0] );
910 store[1] = cosf( store[1] );
911 store[2] = cosf( store[2] );
912 store[3] = cosf( store[3] );
913 }
914
915 static void
916 emit_cos(
917 struct x86_function *func,
918 unsigned xmm_save,
919 unsigned xmm_dst )
920 {
921 emit_func_call_dst_src1(
922 func,
923 xmm_save,
924 xmm_dst,
925 xmm_dst,
926 cos4f );
927 }
928
929 static void PIPE_CDECL
930 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
931 __attribute__((force_align_arg_pointer))
932 #endif
933 ex24f(
934 float *store )
935 {
936 #if defined(PIPE_ARCH_SSE)
937 _mm_store_ps(&store[0], exp2f4( _mm_load_ps(&store[0]) ));
938 #else
939 store[0] = util_fast_exp2( store[0] );
940 store[1] = util_fast_exp2( store[1] );
941 store[2] = util_fast_exp2( store[2] );
942 store[3] = util_fast_exp2( store[3] );
943 #endif
944 }
945
946 static void
947 emit_ex2(
948 struct x86_function *func,
949 unsigned xmm_save,
950 unsigned xmm_dst )
951 {
952 emit_func_call_dst_src1(
953 func,
954 xmm_save,
955 xmm_dst,
956 xmm_dst,
957 ex24f );
958 }
959
/** xmm = (int)xmm, packed, truncating toward zero (cvttps2dq). */
static void
emit_f2it(
   struct x86_function *func,
   unsigned xmm )
{
   sse2_cvttps2dq( func, make_xmm( xmm ), make_xmm( xmm ) );
}

/** xmm = (float)xmm, packed int-to-float (cvtdq2ps). */
static void
emit_i2f(
   struct x86_function *func,
   unsigned xmm )
{
   sse2_cvtdq2ps( func, make_xmm( xmm ), make_xmm( xmm ) );
}
981
982 static void PIPE_CDECL
983 flr4f(
984 float *store )
985 {
986 store[0] = floorf( store[0] );
987 store[1] = floorf( store[1] );
988 store[2] = floorf( store[2] );
989 store[3] = floorf( store[3] );
990 }
991
992 static void
993 emit_flr(
994 struct x86_function *func,
995 unsigned xmm_save,
996 unsigned xmm_dst )
997 {
998 emit_func_call_dst_src1(
999 func,
1000 xmm_save,
1001 xmm_dst,
1002 xmm_dst,
1003 flr4f );
1004 }
1005
1006 static void PIPE_CDECL
1007 frc4f(
1008 float *store )
1009 {
1010 store[0] -= floorf( store[0] );
1011 store[1] -= floorf( store[1] );
1012 store[2] -= floorf( store[2] );
1013 store[3] -= floorf( store[3] );
1014 }
1015
1016 static void
1017 emit_frc(
1018 struct x86_function *func,
1019 unsigned xmm_save,
1020 unsigned xmm_dst )
1021 {
1022 emit_func_call_dst_src1(
1023 func,
1024 xmm_save,
1025 xmm_dst,
1026 xmm_dst,
1027 frc4f );
1028 }
1029
1030 static void PIPE_CDECL
1031 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
1032 __attribute__((force_align_arg_pointer))
1033 #endif
1034 lg24f(
1035 float *store )
1036 {
1037 #if defined(PIPE_ARCH_SSE)
1038 _mm_store_ps(&store[0], log2f4( _mm_load_ps(&store[0]) ));
1039 #else
1040 store[0] = util_fast_log2( store[0] );
1041 store[1] = util_fast_log2( store[1] );
1042 store[2] = util_fast_log2( store[2] );
1043 store[3] = util_fast_log2( store[3] );
1044 #endif
1045 }
1046
1047 static void
1048 emit_lg2(
1049 struct x86_function *func,
1050 unsigned xmm_save,
1051 unsigned xmm_dst )
1052 {
1053 emit_func_call_dst_src1(
1054 func,
1055 xmm_save,
1056 xmm_dst,
1057 xmm_dst,
1058 lg24f );
1059 }
1060
1061 static void
1062 emit_MOV(
1063 struct x86_function *func,
1064 unsigned xmm_dst,
1065 unsigned xmm_src )
1066 {
1067 sse_movups(
1068 func,
1069 make_xmm( xmm_dst ),
1070 make_xmm( xmm_src ) );
1071 }
1072
1073 static void
1074 emit_mul (struct x86_function *func,
1075 unsigned xmm_dst,
1076 unsigned xmm_src)
1077 {
1078 sse_mulps(
1079 func,
1080 make_xmm( xmm_dst ),
1081 make_xmm( xmm_src ) );
1082 }
1083
1084 static void
1085 emit_neg(
1086 struct x86_function *func,
1087 unsigned xmm )
1088 {
1089 sse_xorps(
1090 func,
1091 make_xmm( xmm ),
1092 get_temp(
1093 TGSI_EXEC_TEMP_80000000_I,
1094 TGSI_EXEC_TEMP_80000000_C ) );
1095 }
1096
1097 static void PIPE_CDECL
1098 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
1099 __attribute__((force_align_arg_pointer))
1100 #endif
1101 pow4f(
1102 float *store )
1103 {
1104 #if defined(PIPE_ARCH_SSE)
1105 _mm_store_ps(&store[0], powf4( _mm_load_ps(&store[0]), _mm_load_ps(&store[4]) ));
1106 #else
1107 store[0] = util_fast_pow( store[0], store[4] );
1108 store[1] = util_fast_pow( store[1], store[5] );
1109 store[2] = util_fast_pow( store[2], store[6] );
1110 store[3] = util_fast_pow( store[3], store[7] );
1111 #endif
1112 }
1113
1114 static void
1115 emit_pow(
1116 struct x86_function *func,
1117 unsigned xmm_save,
1118 unsigned xmm_dst,
1119 unsigned xmm_src0,
1120 unsigned xmm_src1 )
1121 {
1122 emit_func_call_dst_src2(
1123 func,
1124 xmm_save,
1125 xmm_dst,
1126 xmm_src0,
1127 xmm_src1,
1128 pow4f );
1129 }
1130
1131 static void
1132 emit_rcp (
1133 struct x86_function *func,
1134 unsigned xmm_dst,
1135 unsigned xmm_src )
1136 {
1137 /* On Intel CPUs at least, this is only accurate to 12 bits -- not
1138 * good enough. Need to either emit a proper divide or use the
1139 * iterative technique described below in emit_rsqrt().
1140 */
1141 sse2_rcpps(
1142 func,
1143 make_xmm( xmm_dst ),
1144 make_xmm( xmm_src ) );
1145 }
1146
1147 static void PIPE_CDECL
1148 rnd4f(
1149 float *store )
1150 {
1151 store[0] = floorf( store[0] + 0.5f );
1152 store[1] = floorf( store[1] + 0.5f );
1153 store[2] = floorf( store[2] + 0.5f );
1154 store[3] = floorf( store[3] + 0.5f );
1155 }
1156
1157 static void
1158 emit_rnd(
1159 struct x86_function *func,
1160 unsigned xmm_save,
1161 unsigned xmm_dst )
1162 {
1163 emit_func_call_dst_src1(
1164 func,
1165 xmm_save,
1166 xmm_dst,
1167 xmm_dst,
1168 rnd4f );
1169 }
1170
/** xmm_dst = 1/sqrt(xmm_src), packed.
 *
 * With HIGH_PRECISION, the ~12-bit rsqrtps estimate is refined by one
 * Newton/Raphson step:
 *
 *   x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a)) * rsqrtps(a)]
 *
 * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
 * Clobbers xmm2 and xmm3; xmm_src is destroyed.
 */
static void
emit_rsqrt(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
#if HIGH_PRECISION
   {
      struct x86_reg dst = make_xmm( xmm_dst );
      struct x86_reg src = make_xmm( xmm_src );
      struct x86_reg half_three = make_xmm( 2 );  /* holds 3.0 */
      struct x86_reg estimate   = make_xmm( 3 );  /* rsqrtps(a) */

      assert( xmm_dst != xmm_src );
      assert( xmm_dst != 2 && xmm_dst != 3 );
      assert( xmm_src != 2 && xmm_src != 3 );

      sse_movaps( func, dst, get_temp( TGSI_EXEC_TEMP_HALF_I, TGSI_EXEC_TEMP_HALF_C ) );
      sse_movaps( func, half_three, get_temp( TGSI_EXEC_TEMP_THREE_I, TGSI_EXEC_TEMP_THREE_C ) );
      sse_rsqrtps( func, estimate, src );          /* est = rsqrt(a) */
      sse_mulps( func, src, estimate );            /* src = a*est */
      sse_mulps( func, dst, estimate );            /* dst = 0.5*est */
      sse_mulps( func, src, estimate );            /* src = a*est*est */
      sse_subps( func, half_three, src );          /* t = 3 - a*est*est */
      sse_mulps( func, dst, half_three );          /* dst = 0.5*est*t */
   }
#else
   /* On Intel CPUs at least, this is only accurate to 12 bits -- not
    * good enough.
    */
   sse_rsqrtps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
#endif
}
1216
1217 static void
1218 emit_setsign(
1219 struct x86_function *func,
1220 unsigned xmm )
1221 {
1222 sse_orps(
1223 func,
1224 make_xmm( xmm ),
1225 get_temp(
1226 TGSI_EXEC_TEMP_80000000_I,
1227 TGSI_EXEC_TEMP_80000000_C ) );
1228 }
1229
1230 static void PIPE_CDECL
1231 sgn4f(
1232 float *store )
1233 {
1234 store[0] = store[0] < 0.0f ? -1.0f : store[0] > 0.0f ? 1.0f : 0.0f;
1235 store[1] = store[1] < 0.0f ? -1.0f : store[1] > 0.0f ? 1.0f : 0.0f;
1236 store[2] = store[2] < 0.0f ? -1.0f : store[2] > 0.0f ? 1.0f : 0.0f;
1237 store[3] = store[3] < 0.0f ? -1.0f : store[3] > 0.0f ? 1.0f : 0.0f;
1238 }
1239
1240 static void
1241 emit_sgn(
1242 struct x86_function *func,
1243 unsigned xmm_save,
1244 unsigned xmm_dst )
1245 {
1246 emit_func_call_dst_src1(
1247 func,
1248 xmm_save,
1249 xmm_dst,
1250 xmm_dst,
1251 sgn4f );
1252 }
1253
1254 static void PIPE_CDECL
1255 sin4f(
1256 float *store )
1257 {
1258 store[0] = sinf( store[0] );
1259 store[1] = sinf( store[1] );
1260 store[2] = sinf( store[2] );
1261 store[3] = sinf( store[3] );
1262 }
1263
1264 static void
1265 emit_sin (struct x86_function *func,
1266 unsigned xmm_save,
1267 unsigned xmm_dst)
1268 {
1269 emit_func_call_dst_src1(
1270 func,
1271 xmm_save,
1272 xmm_dst,
1273 xmm_dst,
1274 sin4f );
1275 }
1276
1277 static void
1278 emit_sub(
1279 struct x86_function *func,
1280 unsigned xmm_dst,
1281 unsigned xmm_src )
1282 {
1283 sse_subps(
1284 func,
1285 make_xmm( xmm_dst ),
1286 make_xmm( xmm_src ) );
1287 }
1288
1289 /**
1290 * Register fetch.
1291 */
1292 static void
1293 emit_fetch(
1294 struct x86_function *func,
1295 unsigned xmm,
1296 const struct tgsi_full_src_register *reg,
1297 const unsigned chan_index )
1298 {
1299 unsigned swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
1300
1301 switch (swizzle) {
1302 case TGSI_SWIZZLE_X:
1303 case TGSI_SWIZZLE_Y:
1304 case TGSI_SWIZZLE_Z:
1305 case TGSI_SWIZZLE_W:
1306 switch (reg->Register.File) {
1307 case TGSI_FILE_CONSTANT:
1308 emit_const(
1309 func,
1310 xmm,
1311 reg->Register.Index,
1312 swizzle,
1313 reg->Register.Indirect,
1314 reg->Indirect.File,
1315 reg->Indirect.Index );
1316 break;
1317
1318 case TGSI_FILE_IMMEDIATE:
1319 emit_immediate(
1320 func,
1321 xmm,
1322 reg->Register.Index,
1323 swizzle );
1324 break;
1325
1326 case TGSI_FILE_SYSTEM_VALUE:
1327 emit_system_value(
1328 func,
1329 xmm,
1330 reg->Register.Index,
1331 swizzle );
1332 break;
1333
1334 case TGSI_FILE_INPUT:
1335 emit_inputf(
1336 func,
1337 xmm,
1338 reg->Register.Index,
1339 swizzle );
1340 break;
1341
1342 case TGSI_FILE_TEMPORARY:
1343 emit_tempf(
1344 func,
1345 xmm,
1346 reg->Register.Index,
1347 swizzle );
1348 break;
1349
1350 default:
1351 assert( 0 );
1352 }
1353 break;
1354
1355 default:
1356 assert( 0 );
1357 }
1358
1359 switch( tgsi_util_get_full_src_register_sign_mode( reg, chan_index ) ) {
1360 case TGSI_UTIL_SIGN_CLEAR:
1361 emit_abs( func, xmm );
1362 break;
1363
1364 case TGSI_UTIL_SIGN_SET:
1365 emit_setsign( func, xmm );
1366 break;
1367
1368 case TGSI_UTIL_SIGN_TOGGLE:
1369 emit_neg( func, xmm );
1370 break;
1371
1372 case TGSI_UTIL_SIGN_KEEP:
1373 break;
1374 }
1375 }
1376
1377 #define FETCH( FUNC, INST, XMM, INDEX, CHAN )\
1378 emit_fetch( FUNC, XMM, &(INST).Src[INDEX], CHAN )
1379
1380 /**
1381 * Register store.
1382 */
1383 static void
1384 emit_store(
1385 struct x86_function *func,
1386 unsigned xmm,
1387 const struct tgsi_full_dst_register *reg,
1388 const struct tgsi_full_instruction *inst,
1389 unsigned chan_index )
1390 {
1391 switch( inst->Instruction.Saturate ) {
1392 case TGSI_SAT_NONE:
1393 break;
1394
1395 case TGSI_SAT_ZERO_ONE:
1396 sse_maxps(
1397 func,
1398 make_xmm( xmm ),
1399 get_temp(
1400 TGSI_EXEC_TEMP_00000000_I,
1401 TGSI_EXEC_TEMP_00000000_C ) );
1402
1403 sse_minps(
1404 func,
1405 make_xmm( xmm ),
1406 get_temp(
1407 TGSI_EXEC_TEMP_ONE_I,
1408 TGSI_EXEC_TEMP_ONE_C ) );
1409 break;
1410
1411 case TGSI_SAT_MINUS_PLUS_ONE:
1412 assert( 0 );
1413 break;
1414 }
1415
1416
1417 switch( reg->Register.File ) {
1418 case TGSI_FILE_OUTPUT:
1419 emit_output(
1420 func,
1421 xmm,
1422 reg->Register.Index,
1423 chan_index );
1424 break;
1425
1426 case TGSI_FILE_TEMPORARY:
1427 emit_temps(
1428 func,
1429 xmm,
1430 reg->Register.Index,
1431 chan_index );
1432 break;
1433
1434 case TGSI_FILE_ADDRESS:
1435 emit_addrs(
1436 func,
1437 xmm,
1438 reg->Register.Index,
1439 chan_index );
1440 break;
1441
1442 default:
1443 assert( 0 );
1444 }
1445 }
1446
/* Store xmm register XMM into channel CHAN of dst operand INDEX of INST,
 * applying the instruction's saturate mode (see emit_store).
 */
#define STORE( FUNC, INST, XMM, INDEX, CHAN )\
   emit_store( FUNC, XMM, &(INST).Dst[INDEX], &(INST), CHAN )
1449
1450
1451 static void PIPE_CDECL
1452 fetch_texel( struct tgsi_sampler **sampler,
1453 float *store )
1454 {
1455 #if 0
1456 uint j;
1457
1458 debug_printf("%s sampler: %p (%p) store: %p\n",
1459 __FUNCTION__,
1460 sampler, *sampler,
1461 store );
1462
1463 for (j = 0; j < 4; j++)
1464 debug_printf("sample %d texcoord %f %f %f lodbias %f\n",
1465 j,
1466 store[0+j],
1467 store[4+j],
1468 store[8 + j],
1469 store[12 + j]);
1470 #endif
1471
1472 {
1473 float rgba[NUM_CHANNELS][QUAD_SIZE];
1474 (*sampler)->get_samples(*sampler,
1475 &store[0], /* s */
1476 &store[4], /* t */
1477 &store[8], /* r */
1478 &store[12], /* lodbias */
1479 tgsi_sampler_lod_bias,
1480 rgba); /* results */
1481
1482 memcpy( store, rgba, 16 * sizeof(float));
1483 }
1484
1485 #if 0
1486 for (j = 0; j < 4; j++)
1487 debug_printf("sample %d result %f %f %f %f\n",
1488 j,
1489 store[0+j],
1490 store[4+j],
1491 store[8+j],
1492 store[12+j]);
1493 #endif
1494 }
1495
1496 /**
1497 * High-level instruction translators.
1498 */
1499 static void
1500 emit_tex( struct x86_function *func,
1501 const struct tgsi_full_instruction *inst,
1502 boolean lodbias,
1503 boolean projected)
1504 {
1505 const uint unit = inst->Src[1].Register.Index;
1506 struct x86_reg args[2];
1507 unsigned count;
1508 unsigned i;
1509
1510 assert(inst->Instruction.Texture);
1511 switch (inst->Texture.Texture) {
1512 case TGSI_TEXTURE_1D:
1513 count = 1;
1514 break;
1515 case TGSI_TEXTURE_2D:
1516 case TGSI_TEXTURE_RECT:
1517 case TGSI_TEXTURE_1D_ARRAY:
1518 count = 2;
1519 break;
1520 case TGSI_TEXTURE_SHADOW1D:
1521 case TGSI_TEXTURE_SHADOW2D:
1522 case TGSI_TEXTURE_SHADOWRECT:
1523 case TGSI_TEXTURE_3D:
1524 case TGSI_TEXTURE_CUBE:
1525 case TGSI_TEXTURE_2D_ARRAY:
1526 count = 3;
1527 break;
1528 default:
1529 assert(0);
1530 return;
1531 }
1532
1533 if (lodbias) {
1534 FETCH( func, *inst, 3, 0, 3 );
1535 }
1536 else {
1537 emit_tempf(
1538 func,
1539 3,
1540 TGSI_EXEC_TEMP_00000000_I,
1541 TGSI_EXEC_TEMP_00000000_C );
1542
1543 }
1544
1545 /* store lodbias whether enabled or not -- fetch_texel currently
1546 * respects it always.
1547 */
1548 sse_movaps( func,
1549 get_temp( TEMP_R0, 3 ),
1550 make_xmm( 3 ) );
1551
1552 if (projected) {
1553 FETCH( func, *inst, 3, 0, 3 );
1554
1555 emit_rcp( func, 3, 3 );
1556 }
1557
1558 for (i = 0; i < count; i++) {
1559 FETCH( func, *inst, i, 0, i );
1560
1561 if (projected) {
1562 sse_mulps(
1563 func,
1564 make_xmm( i ),
1565 make_xmm( 3 ) );
1566 }
1567
1568 /* Store in the argument buffer:
1569 */
1570 sse_movaps(
1571 func,
1572 get_temp( TEMP_R0, i ),
1573 make_xmm( i ) );
1574 }
1575
1576 args[0] = get_temp( TEMP_R0, 0 );
1577 args[1] = get_sampler_ptr( unit );
1578
1579 emit_func_call( func,
1580 0,
1581 args,
1582 Elements(args),
1583 fetch_texel );
1584
1585 /* If all four channels are enabled, could use a pointer to
1586 * dst[0].x instead of TEMP_R0 for store?
1587 */
1588 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, i ) {
1589
1590 sse_movaps(
1591 func,
1592 make_xmm( 0 ),
1593 get_temp( TEMP_R0, i ) );
1594
1595 STORE( func, *inst, 0, 0, i );
1596 }
1597 }
1598
1599
/**
 * Emit code for the KIL (conditional kill) instruction.
 *
 * For each distinct (post-swizzle) component of the source register, test
 * whether it is less than zero; the per-quad comparison masks are OR'ed
 * together and OR'ed into the execution-kill mask
 * (TGSI_EXEC_TEMP_KILMASK).  EAX/EDX are used as scratch and are
 * saved/restored around the computation.
 */
static void
emit_kil(
   struct x86_function *func,
   const struct tgsi_full_src_register *reg )
{
   unsigned uniquemask;
   unsigned unique_count = 0;
   unsigned chan_index;
   unsigned i;

   /* This mask stores component bits that were already tested. Note that
    * we test if the value is less than zero, so 1.0 and 0.0 need not to be
    * tested.
    */
   uniquemask = 0;

   FOR_EACH_CHANNEL( chan_index ) {
      unsigned swizzle;

      /* unswizzle channel */
      swizzle = tgsi_util_get_full_src_register_swizzle(
         reg,
         chan_index );

      /* check if the component has not been already tested */
      if( !(uniquemask & (1 << swizzle)) ) {
         uniquemask |= 1 << swizzle;

         /* allocate register */
         emit_fetch(
            func,
            unique_count++,
            reg,
            chan_index );
      }
   }

   /* preserve EAX/EDX, which we use as scratch below */
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_AX ) );
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_DX ) );

   /* EAX accumulates the per-quad "component < 0" bit masks */
   for (i = 0 ; i < unique_count; i++ ) {
      struct x86_reg dataXMM = make_xmm(i);

      /* xmm[i] = all-ones lanes where the component is < 0 */
      sse_cmpps(
         func,
         dataXMM,
         get_temp(
            TGSI_EXEC_TEMP_00000000_I,
            TGSI_EXEC_TEMP_00000000_C ),
         cc_LessThan );

      if( i == 0 ) {
         /* first component: EAX = movmskps(xmm[0]) */
         sse_movmskps(
            func,
            x86_make_reg( file_REG32, reg_AX ),
            dataXMM );
      }
      else {
         /* subsequent components: EAX |= movmskps(xmm[i]) via EDX */
         sse_movmskps(
            func,
            x86_make_reg( file_REG32, reg_DX ),
            dataXMM );
         x86_or(
            func,
            x86_make_reg( file_REG32, reg_AX ),
            x86_make_reg( file_REG32, reg_DX ) );
      }
   }

   /* merge the accumulated mask into the kill mask */
   x86_or(
      func,
      get_temp(
         TGSI_EXEC_TEMP_KILMASK_I,
         TGSI_EXEC_TEMP_KILMASK_C ),
      x86_make_reg( file_REG32, reg_AX ) );

   /* restore EAX/EDX (reverse order of the pushes above) */
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_DX ) );
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_AX ) );
}
1687
1688
/**
 * Emit code for the KILP (predicated kill) instruction.
 *
 * Not implemented: emits nothing.  The caller (emit_instruction) returns
 * 0 for KILP so the interpreter fallback handles it.
 */
static void
emit_kilp(
   struct x86_function *func )
{
   /* XXX todo / fix me */
}
1695
1696
1697 static void
1698 emit_setcc(
1699 struct x86_function *func,
1700 struct tgsi_full_instruction *inst,
1701 enum sse_cc cc )
1702 {
1703 unsigned chan_index;
1704
1705 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1706 FETCH( func, *inst, 0, 0, chan_index );
1707 FETCH( func, *inst, 1, 1, chan_index );
1708 sse_cmpps(
1709 func,
1710 make_xmm( 0 ),
1711 make_xmm( 1 ),
1712 cc );
1713 sse_andps(
1714 func,
1715 make_xmm( 0 ),
1716 get_temp(
1717 TEMP_ONE_I,
1718 TEMP_ONE_C ) );
1719 STORE( func, *inst, 0, 0, chan_index );
1720 }
1721 }
1722
1723 static void
1724 emit_cmp(
1725 struct x86_function *func,
1726 struct tgsi_full_instruction *inst )
1727 {
1728 unsigned chan_index;
1729
1730 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1731 FETCH( func, *inst, 0, 0, chan_index );
1732 FETCH( func, *inst, 1, 1, chan_index );
1733 FETCH( func, *inst, 2, 2, chan_index );
1734 sse_cmpps(
1735 func,
1736 make_xmm( 0 ),
1737 get_temp(
1738 TGSI_EXEC_TEMP_00000000_I,
1739 TGSI_EXEC_TEMP_00000000_C ),
1740 cc_LessThan );
1741 sse_andps(
1742 func,
1743 make_xmm( 1 ),
1744 make_xmm( 0 ) );
1745 sse_andnps(
1746 func,
1747 make_xmm( 0 ),
1748 make_xmm( 2 ) );
1749 sse_orps(
1750 func,
1751 make_xmm( 0 ),
1752 make_xmm( 1 ) );
1753 STORE( func, *inst, 0, 0, chan_index );
1754 }
1755 }
1756
1757
1758 /**
1759 * Check if inst src/dest regs use indirect addressing into temporary,
1760 * input or output register files.
1761 */
1762 static boolean
1763 indirect_reg_reference(const struct tgsi_full_instruction *inst)
1764 {
1765 uint i;
1766 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1767 const struct tgsi_full_src_register *reg = &inst->Src[i];
1768 if ((reg->Register.File == TGSI_FILE_TEMPORARY ||
1769 reg->Register.File == TGSI_FILE_INPUT ||
1770 reg->Register.File == TGSI_FILE_OUTPUT) &&
1771 reg->Register.Indirect)
1772 return TRUE;
1773 }
1774 for (i = 0; i < inst->Instruction.NumDstRegs; i++) {
1775 const struct tgsi_full_dst_register *reg = &inst->Dst[i];
1776 if ((reg->Register.File == TGSI_FILE_TEMPORARY ||
1777 reg->Register.File == TGSI_FILE_INPUT ||
1778 reg->Register.File == TGSI_FILE_OUTPUT) &&
1779 reg->Register.Indirect)
1780 return TRUE;
1781 }
1782 return FALSE;
1783 }
1784
1785
1786 static int
1787 emit_instruction(
1788 struct x86_function *func,
1789 struct tgsi_full_instruction *inst )
1790 {
1791 unsigned chan_index;
1792
1793 /* we can't handle indirect addressing into temp register file yet */
1794 if (indirect_reg_reference(inst))
1795 return FALSE;
1796
1797 switch (inst->Instruction.Opcode) {
1798 case TGSI_OPCODE_ARL:
1799 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1800 FETCH( func, *inst, 0, 0, chan_index );
1801 emit_flr(func, 0, 0);
1802 emit_f2it( func, 0 );
1803 STORE( func, *inst, 0, 0, chan_index );
1804 }
1805 break;
1806
1807 case TGSI_OPCODE_MOV:
1808 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1809 FETCH( func, *inst, 4 + chan_index, 0, chan_index );
1810 }
1811 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1812 STORE( func, *inst, 4 + chan_index, 0, chan_index );
1813 }
1814 break;
1815
1816 case TGSI_OPCODE_LIT:
1817 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1818 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1819 emit_tempf(
1820 func,
1821 0,
1822 TEMP_ONE_I,
1823 TEMP_ONE_C);
1824 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
1825 STORE( func, *inst, 0, 0, CHAN_X );
1826 }
1827 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1828 STORE( func, *inst, 0, 0, CHAN_W );
1829 }
1830 }
1831 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1832 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1833 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
1834 FETCH( func, *inst, 0, 0, CHAN_X );
1835 sse_maxps(
1836 func,
1837 make_xmm( 0 ),
1838 get_temp(
1839 TGSI_EXEC_TEMP_00000000_I,
1840 TGSI_EXEC_TEMP_00000000_C ) );
1841 STORE( func, *inst, 0, 0, CHAN_Y );
1842 }
1843 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1844 /* XMM[1] = SrcReg[0].yyyy */
1845 FETCH( func, *inst, 1, 0, CHAN_Y );
1846 /* XMM[1] = max(XMM[1], 0) */
1847 sse_maxps(
1848 func,
1849 make_xmm( 1 ),
1850 get_temp(
1851 TGSI_EXEC_TEMP_00000000_I,
1852 TGSI_EXEC_TEMP_00000000_C ) );
1853 /* XMM[2] = SrcReg[0].wwww */
1854 FETCH( func, *inst, 2, 0, CHAN_W );
1855 /* XMM[2] = min(XMM[2], 128.0) */
1856 sse_minps(
1857 func,
1858 make_xmm( 2 ),
1859 get_temp(
1860 TGSI_EXEC_TEMP_128_I,
1861 TGSI_EXEC_TEMP_128_C ) );
1862 /* XMM[2] = max(XMM[2], -128.0) */
1863 sse_maxps(
1864 func,
1865 make_xmm( 2 ),
1866 get_temp(
1867 TGSI_EXEC_TEMP_MINUS_128_I,
1868 TGSI_EXEC_TEMP_MINUS_128_C ) );
1869 emit_pow( func, 3, 1, 1, 2 );
1870 FETCH( func, *inst, 0, 0, CHAN_X );
1871 sse_xorps(
1872 func,
1873 make_xmm( 2 ),
1874 make_xmm( 2 ) );
1875 sse_cmpps(
1876 func,
1877 make_xmm( 2 ),
1878 make_xmm( 0 ),
1879 cc_LessThan );
1880 sse_andps(
1881 func,
1882 make_xmm( 2 ),
1883 make_xmm( 1 ) );
1884 STORE( func, *inst, 2, 0, CHAN_Z );
1885 }
1886 }
1887 break;
1888
1889 case TGSI_OPCODE_RCP:
1890 FETCH( func, *inst, 0, 0, CHAN_X );
1891 emit_rcp( func, 0, 0 );
1892 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1893 STORE( func, *inst, 0, 0, chan_index );
1894 }
1895 break;
1896
1897 case TGSI_OPCODE_RSQ:
1898 FETCH( func, *inst, 0, 0, CHAN_X );
1899 emit_abs( func, 0 );
1900 emit_rsqrt( func, 1, 0 );
1901 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1902 STORE( func, *inst, 1, 0, chan_index );
1903 }
1904 break;
1905
1906 case TGSI_OPCODE_EXP:
1907 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1908 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1909 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1910 FETCH( func, *inst, 0, 0, CHAN_X );
1911 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1912 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1913 emit_MOV( func, 1, 0 );
1914 emit_flr( func, 2, 1 );
1915 /* dst.x = ex2(floor(src.x)) */
1916 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
1917 emit_MOV( func, 2, 1 );
1918 emit_ex2( func, 3, 2 );
1919 STORE( func, *inst, 2, 0, CHAN_X );
1920 }
1921 /* dst.y = src.x - floor(src.x) */
1922 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1923 emit_MOV( func, 2, 0 );
1924 emit_sub( func, 2, 1 );
1925 STORE( func, *inst, 2, 0, CHAN_Y );
1926 }
1927 }
1928 /* dst.z = ex2(src.x) */
1929 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1930 emit_ex2( func, 3, 0 );
1931 STORE( func, *inst, 0, 0, CHAN_Z );
1932 }
1933 }
1934 /* dst.w = 1.0 */
1935 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
1936 emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
1937 STORE( func, *inst, 0, 0, CHAN_W );
1938 }
1939 break;
1940
1941 case TGSI_OPCODE_LOG:
1942 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1943 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1944 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1945 FETCH( func, *inst, 0, 0, CHAN_X );
1946 emit_abs( func, 0 );
1947 emit_MOV( func, 1, 0 );
1948 emit_lg2( func, 2, 1 );
1949 /* dst.z = lg2(abs(src.x)) */
1950 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1951 STORE( func, *inst, 1, 0, CHAN_Z );
1952 }
1953 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1954 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1955 emit_flr( func, 2, 1 );
1956 /* dst.x = floor(lg2(abs(src.x))) */
1957 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
1958 STORE( func, *inst, 1, 0, CHAN_X );
1959 }
1960 /* dst.x = abs(src)/ex2(floor(lg2(abs(src.x)))) */
1961 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1962 emit_ex2( func, 2, 1 );
1963 emit_rcp( func, 1, 1 );
1964 emit_mul( func, 0, 1 );
1965 STORE( func, *inst, 0, 0, CHAN_Y );
1966 }
1967 }
1968 }
1969 /* dst.w = 1.0 */
1970 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
1971 emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
1972 STORE( func, *inst, 0, 0, CHAN_W );
1973 }
1974 break;
1975
1976 case TGSI_OPCODE_MUL:
1977 /* do all fetches and adds, storing results in temp regs */
1978 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1979 int r = chan_index + 1;
1980 FETCH( func, *inst, 0, 0, chan_index ); /* load xmm[0] */
1981 FETCH( func, *inst, r, 1, chan_index ); /* load xmm[r] */
1982 emit_mul( func, r, 0 ); /* xmm[r] = xmm[r] * xmm[0] */
1983 }
1984 /* do all stores of the temp regs */
1985 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1986 int r = chan_index + 1;
1987 STORE( func, *inst, r, 0, chan_index ); /* store xmm[r] */
1988 }
1989 break;
1990
1991 case TGSI_OPCODE_ADD:
1992 /* do all fetches and adds, storing results in temp regs */
1993 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1994 int r = chan_index + 1;
1995 FETCH( func, *inst, 0, 0, chan_index ); /* load xmm[0] */
1996 FETCH( func, *inst, r, 1, chan_index ); /* load xmm[r] */
1997 emit_add( func, r, 0 ); /* xmm[r] = xmm[r] + xmm[0] */
1998 }
1999 /* do all stores of the temp regs */
2000 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2001 int r = chan_index + 1;
2002 STORE( func, *inst, r, 0, chan_index ); /* store xmm[r] */
2003 }
2004 break;
2005
2006 case TGSI_OPCODE_DP3:
2007 FETCH( func, *inst, 0, 0, CHAN_X );
2008 FETCH( func, *inst, 1, 1, CHAN_X );
2009 emit_mul( func, 0, 1 );
2010 FETCH( func, *inst, 1, 0, CHAN_Y );
2011 FETCH( func, *inst, 2, 1, CHAN_Y );
2012 emit_mul( func, 1, 2 );
2013 emit_add( func, 0, 1 );
2014 FETCH( func, *inst, 1, 0, CHAN_Z );
2015 FETCH( func, *inst, 2, 1, CHAN_Z );
2016 emit_mul( func, 1, 2 );
2017 emit_add( func, 0, 1 );
2018 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2019 STORE( func, *inst, 0, 0, chan_index );
2020 }
2021 break;
2022
2023 case TGSI_OPCODE_DP4:
2024 FETCH( func, *inst, 0, 0, CHAN_X );
2025 FETCH( func, *inst, 1, 1, CHAN_X );
2026 emit_mul( func, 0, 1 );
2027 FETCH( func, *inst, 1, 0, CHAN_Y );
2028 FETCH( func, *inst, 2, 1, CHAN_Y );
2029 emit_mul( func, 1, 2 );
2030 emit_add( func, 0, 1 );
2031 FETCH( func, *inst, 1, 0, CHAN_Z );
2032 FETCH( func, *inst, 2, 1, CHAN_Z );
2033 emit_mul(func, 1, 2 );
2034 emit_add(func, 0, 1 );
2035 FETCH( func, *inst, 1, 0, CHAN_W );
2036 FETCH( func, *inst, 2, 1, CHAN_W );
2037 emit_mul( func, 1, 2 );
2038 emit_add( func, 0, 1 );
2039 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2040 STORE( func, *inst, 0, 0, chan_index );
2041 }
2042 break;
2043
2044 case TGSI_OPCODE_DST:
2045 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
2046 emit_tempf(
2047 func,
2048 0,
2049 TEMP_ONE_I,
2050 TEMP_ONE_C );
2051 STORE( func, *inst, 0, 0, CHAN_X );
2052 }
2053 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2054 FETCH( func, *inst, 0, 0, CHAN_Y );
2055 FETCH( func, *inst, 1, 1, CHAN_Y );
2056 emit_mul( func, 0, 1 );
2057 STORE( func, *inst, 0, 0, CHAN_Y );
2058 }
2059 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2060 FETCH( func, *inst, 0, 0, CHAN_Z );
2061 STORE( func, *inst, 0, 0, CHAN_Z );
2062 }
2063 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2064 FETCH( func, *inst, 0, 1, CHAN_W );
2065 STORE( func, *inst, 0, 0, CHAN_W );
2066 }
2067 break;
2068
2069 case TGSI_OPCODE_MIN:
2070 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2071 FETCH( func, *inst, 0, 0, chan_index );
2072 FETCH( func, *inst, 1, 1, chan_index );
2073 sse_minps(
2074 func,
2075 make_xmm( 0 ),
2076 make_xmm( 1 ) );
2077 STORE( func, *inst, 0, 0, chan_index );
2078 }
2079 break;
2080
2081 case TGSI_OPCODE_MAX:
2082 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2083 FETCH( func, *inst, 0, 0, chan_index );
2084 FETCH( func, *inst, 1, 1, chan_index );
2085 sse_maxps(
2086 func,
2087 make_xmm( 0 ),
2088 make_xmm( 1 ) );
2089 STORE( func, *inst, 0, 0, chan_index );
2090 }
2091 break;
2092
2093 case TGSI_OPCODE_SLT:
2094 emit_setcc( func, inst, cc_LessThan );
2095 break;
2096
2097 case TGSI_OPCODE_SGE:
2098 emit_setcc( func, inst, cc_NotLessThan );
2099 break;
2100
2101 case TGSI_OPCODE_MAD:
2102 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2103 FETCH( func, *inst, 0, 0, chan_index );
2104 FETCH( func, *inst, 1, 1, chan_index );
2105 FETCH( func, *inst, 2, 2, chan_index );
2106 emit_mul( func, 0, 1 );
2107 emit_add( func, 0, 2 );
2108 STORE( func, *inst, 0, 0, chan_index );
2109 }
2110 break;
2111
2112 case TGSI_OPCODE_SUB:
2113 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2114 FETCH( func, *inst, 0, 0, chan_index );
2115 FETCH( func, *inst, 1, 1, chan_index );
2116 emit_sub( func, 0, 1 );
2117 STORE( func, *inst, 0, 0, chan_index );
2118 }
2119 break;
2120
2121 case TGSI_OPCODE_LRP:
2122 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2123 FETCH( func, *inst, 0, 0, chan_index );
2124 FETCH( func, *inst, 1, 1, chan_index );
2125 FETCH( func, *inst, 2, 2, chan_index );
2126 emit_sub( func, 1, 2 );
2127 emit_mul( func, 0, 1 );
2128 emit_add( func, 0, 2 );
2129 STORE( func, *inst, 0, 0, chan_index );
2130 }
2131 break;
2132
2133 case TGSI_OPCODE_CND:
2134 return 0;
2135 break;
2136
2137 case TGSI_OPCODE_DP2A:
2138 FETCH( func, *inst, 0, 0, CHAN_X ); /* xmm0 = src[0].x */
2139 FETCH( func, *inst, 1, 1, CHAN_X ); /* xmm1 = src[1].x */
2140 emit_mul( func, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
2141 FETCH( func, *inst, 1, 0, CHAN_Y ); /* xmm1 = src[0].y */
2142 FETCH( func, *inst, 2, 1, CHAN_Y ); /* xmm2 = src[1].y */
2143 emit_mul( func, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
2144 emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2145 FETCH( func, *inst, 1, 2, CHAN_X ); /* xmm1 = src[2].x */
2146 emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2147 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2148 STORE( func, *inst, 0, 0, chan_index ); /* dest[ch] = xmm0 */
2149 }
2150 break;
2151
2152 case TGSI_OPCODE_FRC:
2153 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2154 FETCH( func, *inst, 0, 0, chan_index );
2155 emit_frc( func, 0, 0 );
2156 STORE( func, *inst, 0, 0, chan_index );
2157 }
2158 break;
2159
2160 case TGSI_OPCODE_CLAMP:
2161 return 0;
2162 break;
2163
2164 case TGSI_OPCODE_FLR:
2165 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2166 FETCH( func, *inst, 0, 0, chan_index );
2167 emit_flr( func, 0, 0 );
2168 STORE( func, *inst, 0, 0, chan_index );
2169 }
2170 break;
2171
2172 case TGSI_OPCODE_ROUND:
2173 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2174 FETCH( func, *inst, 0, 0, chan_index );
2175 emit_rnd( func, 0, 0 );
2176 STORE( func, *inst, 0, 0, chan_index );
2177 }
2178 break;
2179
2180 case TGSI_OPCODE_EX2:
2181 FETCH( func, *inst, 0, 0, CHAN_X );
2182 emit_ex2( func, 0, 0 );
2183 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2184 STORE( func, *inst, 0, 0, chan_index );
2185 }
2186 break;
2187
2188 case TGSI_OPCODE_LG2:
2189 FETCH( func, *inst, 0, 0, CHAN_X );
2190 emit_lg2( func, 0, 0 );
2191 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2192 STORE( func, *inst, 0, 0, chan_index );
2193 }
2194 break;
2195
2196 case TGSI_OPCODE_POW:
2197 FETCH( func, *inst, 0, 0, CHAN_X );
2198 FETCH( func, *inst, 1, 1, CHAN_X );
2199 emit_pow( func, 0, 0, 0, 1 );
2200 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2201 STORE( func, *inst, 0, 0, chan_index );
2202 }
2203 break;
2204
2205 case TGSI_OPCODE_XPD:
2206 /* Note: we do all stores after all operands have been fetched
2207 * to avoid src/dst register aliasing issues for an instruction
2208 * such as: XPD TEMP[2].xyz, TEMP[0], TEMP[2];
2209 */
2210 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
2211 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
2212 FETCH( func, *inst, 1, 1, CHAN_Z ); /* xmm[1] = src[1].z */
2213 FETCH( func, *inst, 3, 0, CHAN_Z ); /* xmm[3] = src[0].z */
2214 }
2215 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
2216 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
2217 FETCH( func, *inst, 0, 0, CHAN_Y ); /* xmm[0] = src[0].y */
2218 FETCH( func, *inst, 4, 1, CHAN_Y ); /* xmm[4] = src[1].y */
2219 }
2220 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
2221 emit_MOV( func, 7, 0 ); /* xmm[7] = xmm[0] */
2222 emit_mul( func, 7, 1 ); /* xmm[7] = xmm[2] * xmm[1] */
2223 emit_MOV( func, 5, 3 ); /* xmm[5] = xmm[3] */
2224 emit_mul( func, 5, 4 ); /* xmm[5] = xmm[5] * xmm[4] */
2225 emit_sub( func, 7, 5 ); /* xmm[7] = xmm[2] - xmm[5] */
2226 /* store xmm[7] in dst.x below */
2227 }
2228 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
2229 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
2230 FETCH( func, *inst, 2, 1, CHAN_X ); /* xmm[2] = src[1].x */
2231 FETCH( func, *inst, 5, 0, CHAN_X ); /* xmm[5] = src[0].x */
2232 }
2233 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2234 emit_mul( func, 3, 2 ); /* xmm[3] = xmm[3] * xmm[2] */
2235 emit_mul( func, 1, 5 ); /* xmm[1] = xmm[1] * xmm[5] */
2236 emit_sub( func, 3, 1 ); /* xmm[3] = xmm[3] - xmm[1] */
2237 /* store xmm[3] in dst.y below */
2238 }
2239 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2240 emit_mul( func, 5, 4 ); /* xmm[5] = xmm[5] * xmm[4] */
2241 emit_mul( func, 0, 2 ); /* xmm[0] = xmm[0] * xmm[2] */
2242 emit_sub( func, 5, 0 ); /* xmm[5] = xmm[5] - xmm[0] */
2243 STORE( func, *inst, 5, 0, CHAN_Z ); /* dst.z = xmm[5] */
2244 }
2245 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
2246 STORE( func, *inst, 7, 0, CHAN_X ); /* dst.x = xmm[7] */
2247 }
2248 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2249 STORE( func, *inst, 3, 0, CHAN_Y ); /* dst.y = xmm[3] */
2250 }
2251 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2252 emit_tempf(
2253 func,
2254 0,
2255 TEMP_ONE_I,
2256 TEMP_ONE_C );
2257 STORE( func, *inst, 0, 0, CHAN_W );
2258 }
2259 break;
2260
2261 case TGSI_OPCODE_ABS:
2262 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2263 FETCH( func, *inst, 0, 0, chan_index );
2264 emit_abs( func, 0) ;
2265
2266 STORE( func, *inst, 0, 0, chan_index );
2267 }
2268 break;
2269
2270 case TGSI_OPCODE_RCC:
2271 return 0;
2272 break;
2273
2274 case TGSI_OPCODE_DPH:
2275 FETCH( func, *inst, 0, 0, CHAN_X );
2276 FETCH( func, *inst, 1, 1, CHAN_X );
2277 emit_mul( func, 0, 1 );
2278 FETCH( func, *inst, 1, 0, CHAN_Y );
2279 FETCH( func, *inst, 2, 1, CHAN_Y );
2280 emit_mul( func, 1, 2 );
2281 emit_add( func, 0, 1 );
2282 FETCH( func, *inst, 1, 0, CHAN_Z );
2283 FETCH( func, *inst, 2, 1, CHAN_Z );
2284 emit_mul( func, 1, 2 );
2285 emit_add( func, 0, 1 );
2286 FETCH( func, *inst, 1, 1, CHAN_W );
2287 emit_add( func, 0, 1 );
2288 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2289 STORE( func, *inst, 0, 0, chan_index );
2290 }
2291 break;
2292
2293 case TGSI_OPCODE_COS:
2294 FETCH( func, *inst, 0, 0, CHAN_X );
2295 emit_cos( func, 0, 0 );
2296 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2297 STORE( func, *inst, 0, 0, chan_index );
2298 }
2299 break;
2300
2301 case TGSI_OPCODE_DDX:
2302 return 0;
2303 break;
2304
2305 case TGSI_OPCODE_DDY:
2306 return 0;
2307 break;
2308
2309 case TGSI_OPCODE_KILP:
2310 /* predicated kill */
2311 emit_kilp( func );
2312 return 0; /* XXX fix me */
2313 break;
2314
2315 case TGSI_OPCODE_KIL:
2316 /* conditional kill */
2317 emit_kil( func, &inst->Src[0] );
2318 break;
2319
2320 case TGSI_OPCODE_PK2H:
2321 return 0;
2322 break;
2323
2324 case TGSI_OPCODE_PK2US:
2325 return 0;
2326 break;
2327
2328 case TGSI_OPCODE_PK4B:
2329 return 0;
2330 break;
2331
2332 case TGSI_OPCODE_PK4UB:
2333 return 0;
2334 break;
2335
2336 case TGSI_OPCODE_RFL:
2337 return 0;
2338 break;
2339
2340 case TGSI_OPCODE_SEQ:
2341 emit_setcc( func, inst, cc_Equal );
2342 break;
2343
2344 case TGSI_OPCODE_SFL:
2345 return 0;
2346 break;
2347
2348 case TGSI_OPCODE_SGT:
2349 emit_setcc( func, inst, cc_NotLessThanEqual );
2350 break;
2351
2352 case TGSI_OPCODE_SIN:
2353 FETCH( func, *inst, 0, 0, CHAN_X );
2354 emit_sin( func, 0, 0 );
2355 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2356 STORE( func, *inst, 0, 0, chan_index );
2357 }
2358 break;
2359
2360 case TGSI_OPCODE_SLE:
2361 emit_setcc( func, inst, cc_LessThanEqual );
2362 break;
2363
2364 case TGSI_OPCODE_SNE:
2365 emit_setcc( func, inst, cc_NotEqual );
2366 break;
2367
2368 case TGSI_OPCODE_STR:
2369 return 0;
2370 break;
2371
2372 case TGSI_OPCODE_TEX:
2373 emit_tex( func, inst, FALSE, FALSE );
2374 break;
2375
2376 case TGSI_OPCODE_TXD:
2377 return 0;
2378 break;
2379
2380 case TGSI_OPCODE_UP2H:
2381 return 0;
2382 break;
2383
2384 case TGSI_OPCODE_UP2US:
2385 return 0;
2386 break;
2387
2388 case TGSI_OPCODE_UP4B:
2389 return 0;
2390 break;
2391
2392 case TGSI_OPCODE_UP4UB:
2393 return 0;
2394 break;
2395
2396 case TGSI_OPCODE_X2D:
2397 return 0;
2398 break;
2399
2400 case TGSI_OPCODE_ARA:
2401 return 0;
2402 break;
2403
2404 case TGSI_OPCODE_ARR:
2405 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2406 FETCH( func, *inst, 0, 0, chan_index );
2407 emit_rnd( func, 0, 0 );
2408 emit_f2it( func, 0 );
2409 STORE( func, *inst, 0, 0, chan_index );
2410 }
2411 break;
2412
2413 case TGSI_OPCODE_BRA:
2414 return 0;
2415 break;
2416
2417 case TGSI_OPCODE_CAL:
2418 return 0;
2419 break;
2420
2421 case TGSI_OPCODE_RET:
2422 emit_ret( func );
2423 break;
2424
2425 case TGSI_OPCODE_END:
2426 break;
2427
2428 case TGSI_OPCODE_SSG:
2429 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2430 FETCH( func, *inst, 0, 0, chan_index );
2431 emit_sgn( func, 0, 0 );
2432 STORE( func, *inst, 0, 0, chan_index );
2433 }
2434 break;
2435
2436 case TGSI_OPCODE_CMP:
2437 emit_cmp (func, inst);
2438 break;
2439
2440 case TGSI_OPCODE_SCS:
2441 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
2442 FETCH( func, *inst, 0, 0, CHAN_X );
2443 emit_cos( func, 0, 0 );
2444 STORE( func, *inst, 0, 0, CHAN_X );
2445 }
2446 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2447 FETCH( func, *inst, 0, 0, CHAN_X );
2448 emit_sin( func, 0, 0 );
2449 STORE( func, *inst, 0, 0, CHAN_Y );
2450 }
2451 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2452 emit_tempf(
2453 func,
2454 0,
2455 TGSI_EXEC_TEMP_00000000_I,
2456 TGSI_EXEC_TEMP_00000000_C );
2457 STORE( func, *inst, 0, 0, CHAN_Z );
2458 }
2459 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2460 emit_tempf(
2461 func,
2462 0,
2463 TEMP_ONE_I,
2464 TEMP_ONE_C );
2465 STORE( func, *inst, 0, 0, CHAN_W );
2466 }
2467 break;
2468
2469 case TGSI_OPCODE_TXB:
2470 emit_tex( func, inst, TRUE, FALSE );
2471 break;
2472
2473 case TGSI_OPCODE_NRM:
2474 /* fall-through */
2475 case TGSI_OPCODE_NRM4:
2476 /* 3 or 4-component normalization */
2477 {
2478 uint dims = (inst->Instruction.Opcode == TGSI_OPCODE_NRM) ? 3 : 4;
2479
2480 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) ||
2481 IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y) ||
2482 IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z) ||
2483 (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 4)) {
2484
2485 /* NOTE: Cannot use xmm regs 2/3 here (see emit_rsqrt() above). */
2486
2487 /* xmm4 = src.x */
2488 /* xmm0 = src.x * src.x */
2489 FETCH(func, *inst, 0, 0, CHAN_X);
2490 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
2491 emit_MOV(func, 4, 0);
2492 }
2493 emit_mul(func, 0, 0);
2494
2495 /* xmm5 = src.y */
2496 /* xmm0 = xmm0 + src.y * src.y */
2497 FETCH(func, *inst, 1, 0, CHAN_Y);
2498 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2499 emit_MOV(func, 5, 1);
2500 }
2501 emit_mul(func, 1, 1);
2502 emit_add(func, 0, 1);
2503
2504 /* xmm6 = src.z */
2505 /* xmm0 = xmm0 + src.z * src.z */
2506 FETCH(func, *inst, 1, 0, CHAN_Z);
2507 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2508 emit_MOV(func, 6, 1);
2509 }
2510 emit_mul(func, 1, 1);
2511 emit_add(func, 0, 1);
2512
2513 if (dims == 4) {
2514 /* xmm7 = src.w */
2515 /* xmm0 = xmm0 + src.w * src.w */
2516 FETCH(func, *inst, 1, 0, CHAN_W);
2517 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W)) {
2518 emit_MOV(func, 7, 1);
2519 }
2520 emit_mul(func, 1, 1);
2521 emit_add(func, 0, 1);
2522 }
2523
2524 /* xmm1 = 1 / sqrt(xmm0) */
2525 emit_rsqrt(func, 1, 0);
2526
2527 /* dst.x = xmm1 * src.x */
2528 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
2529 emit_mul(func, 4, 1);
2530 STORE(func, *inst, 4, 0, CHAN_X);
2531 }
2532
2533 /* dst.y = xmm1 * src.y */
2534 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2535 emit_mul(func, 5, 1);
2536 STORE(func, *inst, 5, 0, CHAN_Y);
2537 }
2538
2539 /* dst.z = xmm1 * src.z */
2540 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2541 emit_mul(func, 6, 1);
2542 STORE(func, *inst, 6, 0, CHAN_Z);
2543 }
2544
2545 /* dst.w = xmm1 * src.w */
2546 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) && dims == 4) {
2547 emit_mul(func, 7, 1);
2548 STORE(func, *inst, 7, 0, CHAN_W);
2549 }
2550 }
2551
2552 /* dst0.w = 1.0 */
2553 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 3) {
2554 emit_tempf(func, 0, TEMP_ONE_I, TEMP_ONE_C);
2555 STORE(func, *inst, 0, 0, CHAN_W);
2556 }
2557 }
2558 break;
2559
2560 case TGSI_OPCODE_DIV:
2561 return 0;
2562 break;
2563
2564 case TGSI_OPCODE_DP2:
2565 FETCH( func, *inst, 0, 0, CHAN_X ); /* xmm0 = src[0].x */
2566 FETCH( func, *inst, 1, 1, CHAN_X ); /* xmm1 = src[1].x */
2567 emit_mul( func, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
2568 FETCH( func, *inst, 1, 0, CHAN_Y ); /* xmm1 = src[0].y */
2569 FETCH( func, *inst, 2, 1, CHAN_Y ); /* xmm2 = src[1].y */
2570 emit_mul( func, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
2571 emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2572 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2573 STORE( func, *inst, 0, 0, chan_index ); /* dest[ch] = xmm0 */
2574 }
2575 break;
2576
2577 case TGSI_OPCODE_TXL:
2578 return 0;
2579 break;
2580
2581 case TGSI_OPCODE_TXP:
2582 emit_tex( func, inst, FALSE, TRUE );
2583 break;
2584
2585 case TGSI_OPCODE_BRK:
2586 return 0;
2587 break;
2588
2589 case TGSI_OPCODE_IF:
2590 return 0;
2591 break;
2592
2593 case TGSI_OPCODE_ELSE:
2594 return 0;
2595 break;
2596
2597 case TGSI_OPCODE_ENDIF:
2598 return 0;
2599 break;
2600
2601 case TGSI_OPCODE_PUSHA:
2602 return 0;
2603 break;
2604
2605 case TGSI_OPCODE_POPA:
2606 return 0;
2607 break;
2608
2609 case TGSI_OPCODE_CEIL:
2610 return 0;
2611 break;
2612
2613 case TGSI_OPCODE_I2F:
2614 return 0;
2615 break;
2616
2617 case TGSI_OPCODE_NOT:
2618 return 0;
2619 break;
2620
2621 case TGSI_OPCODE_TRUNC:
2622 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2623 FETCH( func, *inst, 0, 0, chan_index );
2624 emit_f2it( func, 0 );
2625 emit_i2f( func, 0 );
2626 STORE( func, *inst, 0, 0, chan_index );
2627 }
2628 break;
2629
2630 case TGSI_OPCODE_SHL:
2631 return 0;
2632 break;
2633
2634 case TGSI_OPCODE_ISHR:
2635 return 0;
2636 break;
2637
2638 case TGSI_OPCODE_AND:
2639 return 0;
2640 break;
2641
2642 case TGSI_OPCODE_OR:
2643 return 0;
2644 break;
2645
2646 case TGSI_OPCODE_MOD:
2647 return 0;
2648 break;
2649
2650 case TGSI_OPCODE_XOR:
2651 return 0;
2652 break;
2653
2654 case TGSI_OPCODE_SAD:
2655 return 0;
2656 break;
2657
2658 case TGSI_OPCODE_TXF:
2659 return 0;
2660 break;
2661
2662 case TGSI_OPCODE_TXQ:
2663 return 0;
2664 break;
2665
2666 case TGSI_OPCODE_CONT:
2667 return 0;
2668 break;
2669
2670 case TGSI_OPCODE_EMIT:
2671 return 0;
2672 break;
2673
2674 case TGSI_OPCODE_ENDPRIM:
2675 return 0;
2676 break;
2677
2678 default:
2679 return 0;
2680 }
2681
2682 return 1;
2683 }
2684
2685 static void
2686 emit_declaration(
2687 struct x86_function *func,
2688 struct tgsi_full_declaration *decl )
2689 {
2690 if( decl->Declaration.File == TGSI_FILE_INPUT ) {
2691 unsigned first, last, mask;
2692 unsigned i, j;
2693
2694 first = decl->Range.First;
2695 last = decl->Range.Last;
2696 mask = decl->Declaration.UsageMask;
2697
2698 for( i = first; i <= last; i++ ) {
2699 for( j = 0; j < NUM_CHANNELS; j++ ) {
2700 if( mask & (1 << j) ) {
2701 switch( decl->Declaration.Interpolate ) {
2702 case TGSI_INTERPOLATE_CONSTANT:
2703 emit_coef_a0( func, 0, i, j );
2704 emit_inputs( func, 0, i, j );
2705 break;
2706
2707 case TGSI_INTERPOLATE_LINEAR:
2708 emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
2709 emit_coef_dadx( func, 1, i, j );
2710 emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
2711 emit_coef_dady( func, 3, i, j );
2712 emit_mul( func, 0, 1 ); /* x * dadx */
2713 emit_coef_a0( func, 4, i, j );
2714 emit_mul( func, 2, 3 ); /* y * dady */
2715 emit_add( func, 0, 4 ); /* x * dadx + a0 */
2716 emit_add( func, 0, 2 ); /* x * dadx + y * dady + a0 */
2717 emit_inputs( func, 0, i, j );
2718 break;
2719
2720 case TGSI_INTERPOLATE_PERSPECTIVE:
2721 emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
2722 emit_coef_dadx( func, 1, i, j );
2723 emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
2724 emit_coef_dady( func, 3, i, j );
2725 emit_mul( func, 0, 1 ); /* x * dadx */
2726 emit_tempf( func, 4, 0, TGSI_SWIZZLE_W );
2727 emit_coef_a0( func, 5, i, j );
2728 emit_rcp( func, 4, 4 ); /* 1.0 / w */
2729 emit_mul( func, 2, 3 ); /* y * dady */
2730 emit_add( func, 0, 5 ); /* x * dadx + a0 */
2731 emit_add( func, 0, 2 ); /* x * dadx + y * dady + a0 */
2732 emit_mul( func, 0, 4 ); /* (x * dadx + y * dady + a0) / w */
2733 emit_inputs( func, 0, i, j );
2734 break;
2735
2736 default:
2737 assert( 0 );
2738 break;
2739 }
2740 }
2741 }
2742 }
2743 }
2744 }
2745
/**
 * Emit x86/SSE code that transposes vertex data from AOS layout
 * (x,y,z,w per vertex, arbitrary byte stride between vertices) into
 * the SOA layout of machine->Inputs (x[4], y[4], z[4], w[4] per
 * attribute), processing 4 vertices per attribute.
 *
 * The generated code reads its pointers/counts from the JIT'd
 * function's own arguments:
 * \param arg_aos      arg index of the AOS input pointer (first vertex)
 * \param arg_machine  arg index of the struct tgsi_exec_machine pointer
 * \param arg_num      arg index of the number of input attributes
 * \param arg_stride   arg index of the byte stride between vertices
 */
static void aos_to_soa( struct x86_function *func,
                        uint arg_aos,
                        uint arg_machine,
                        uint arg_num,
                        uint arg_stride )
{
   struct x86_reg soa_input = x86_make_reg( file_REG32, reg_AX );
   struct x86_reg aos_input = x86_make_reg( file_REG32, reg_BX );
   struct x86_reg num_inputs = x86_make_reg( file_REG32, reg_CX );
   struct x86_reg stride = x86_make_reg( file_REG32, reg_DX );
   int loop_top, loop_exit_fixup;

   /* Save EBX */
   x86_push( func, x86_make_reg( file_REG32, reg_BX ) );

   x86_mov( func, aos_input, x86_fn_arg( func, arg_aos ) );
   x86_mov( func, soa_input, x86_fn_arg( func, arg_machine ) );
   /* soa_input = &machine->Inputs */
   x86_lea( func, soa_input,
            x86_make_disp( soa_input,
                           Offset(struct tgsi_exec_machine, Inputs) ) );
   x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) );
   x86_mov( func, stride, x86_fn_arg( func, arg_stride ) );

   /* while (num_inputs != 0) */
   loop_top = x86_get_label( func );
   x86_cmp_imm( func, num_inputs, 0 );
   loop_exit_fixup = x86_jcc_forward( func, cc_E );

   {
      /* Gather 4 vertices: xmm0/xmm3 take v0 (low halves) and v1 (high
       * halves), xmm1/xmm4 take v2 and v3.  aos_input is saved and
       * restored around the stride walk so it can be advanced by a
       * fixed 16 bytes below.
       */
      x86_push( func, aos_input );
      sse_movlps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );  /* xmm0.lo = v0.xy */
      sse_movlps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );  /* xmm3.lo = v0.zw */
      x86_add( func, aos_input, stride );
      sse_movhps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );  /* xmm0.hi = v1.xy */
      sse_movhps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );  /* xmm3.hi = v1.zw */
      x86_add( func, aos_input, stride );
      sse_movlps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );  /* xmm1.lo = v2.xy */
      sse_movlps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );  /* xmm4.lo = v2.zw */
      x86_add( func, aos_input, stride );
      sse_movhps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );  /* xmm1.hi = v3.xy */
      sse_movhps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );  /* xmm4.hi = v3.zw */
      x86_pop( func, aos_input );

      /* Transpose into channel vectors: shuffle 0x88 picks the even
       * lanes (x or z), 0xdd picks the odd lanes (y or w).
       */
      sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
      sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
      sse_shufps( func, make_xmm( 0 ), make_xmm( 1 ), 0x88 );  /* xmm0 = x0 x1 x2 x3 */
      sse_shufps( func, make_xmm( 2 ), make_xmm( 1 ), 0xdd );  /* xmm2 = y0 y1 y2 y3 */
      sse_shufps( func, make_xmm( 3 ), make_xmm( 4 ), 0x88 );  /* xmm3 = z0 z1 z2 z3 */
      sse_shufps( func, make_xmm( 5 ), make_xmm( 4 ), 0xdd );  /* xmm5 = w0 w1 w2 w3 */

      /* Store one 4-wide vector per channel into machine->Inputs */
      sse_movups( func, x86_make_disp( soa_input, 0 ), make_xmm( 0 ) );
      sse_movups( func, x86_make_disp( soa_input, 16 ), make_xmm( 2 ) );
      sse_movups( func, x86_make_disp( soa_input, 32 ), make_xmm( 3 ) );
      sse_movups( func, x86_make_disp( soa_input, 48 ), make_xmm( 5 ) );

      /* Advance to next input */
      x86_lea( func, aos_input, x86_make_disp(aos_input, 16) );
      x86_lea( func, soa_input, x86_make_disp(soa_input, 64) );
   }
   /* --num_inputs */
   x86_dec( func, num_inputs );
   x86_jmp( func, loop_top );
   x86_fixup_fwd_jump( func, loop_exit_fixup );

   /* Restore EBX */
   x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
}
2813
2814 static void soa_to_aos( struct x86_function *func,
2815 uint arg_aos,
2816 uint arg_machine,
2817 uint arg_num,
2818 uint arg_stride )
2819 {
2820 struct x86_reg soa_output = x86_make_reg( file_REG32, reg_AX );
2821 struct x86_reg aos_output = x86_make_reg( file_REG32, reg_BX );
2822 struct x86_reg num_outputs = x86_make_reg( file_REG32, reg_CX );
2823 struct x86_reg temp = x86_make_reg( file_REG32, reg_DX );
2824 int inner_loop;
2825
2826 /* Save EBX */
2827 x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
2828
2829 x86_mov( func, aos_output, x86_fn_arg( func, arg_aos ) );
2830 x86_mov( func, soa_output, x86_fn_arg( func, arg_machine ) );
2831 x86_lea( func, soa_output,
2832 x86_make_disp( soa_output,
2833 Offset(struct tgsi_exec_machine, Outputs) ) );
2834 x86_mov( func, num_outputs, x86_fn_arg( func, arg_num ) );
2835
2836 /* do */
2837 inner_loop = x86_get_label( func );
2838 {
2839 sse_movups( func, make_xmm( 0 ), x86_make_disp( soa_output, 0 ) );
2840 sse_movups( func, make_xmm( 1 ), x86_make_disp( soa_output, 16 ) );
2841 sse_movups( func, make_xmm( 3 ), x86_make_disp( soa_output, 32 ) );
2842 sse_movups( func, make_xmm( 4 ), x86_make_disp( soa_output, 48 ) );
2843
2844 sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
2845 sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
2846 sse_unpcklps( func, make_xmm( 0 ), make_xmm( 1 ) );
2847 sse_unpckhps( func, make_xmm( 2 ), make_xmm( 1 ) );
2848 sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) );
2849 sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) );
2850
2851 x86_mov( func, temp, x86_fn_arg( func, arg_stride ) );
2852 x86_push( func, aos_output );
2853 sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
2854 sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
2855 x86_add( func, aos_output, temp );
2856 sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
2857 sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
2858 x86_add( func, aos_output, temp );
2859 sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
2860 sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
2861 x86_add( func, aos_output, temp );
2862 sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
2863 sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
2864 x86_pop( func, aos_output );
2865
2866 /* Advance to next output */
2867 x86_lea( func, aos_output, x86_make_disp(aos_output, 16) );
2868 x86_lea( func, soa_output, x86_make_disp(soa_output, 64) );
2869 }
2870 /* while --num_outputs */
2871 x86_dec( func, num_outputs );
2872 x86_jcc( func, cc_NE, inner_loop );
2873
2874 /* Restore EBX */
2875 x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
2876 }
2877
2878
2879 /**
2880 * Check if the instructions dst register is the same as any src
2881 * register and warn if there's a posible SOA dependency.
2882 */
2883 static boolean
2884 check_soa_dependencies(const struct tgsi_full_instruction *inst)
2885 {
2886 uint opcode = inst->Instruction.Opcode;
2887
2888 /* XXX: we only handle src/dst aliasing in a few opcodes currently.
2889 * Need to use an additional temporay to hold the result in the
2890 * cases where the code is too opaque to fix.
2891 */
2892
2893 switch (opcode) {
2894 case TGSI_OPCODE_ADD:
2895 case TGSI_OPCODE_MOV:
2896 case TGSI_OPCODE_MUL:
2897 case TGSI_OPCODE_RCP:
2898 case TGSI_OPCODE_RSQ:
2899 case TGSI_OPCODE_EXP:
2900 case TGSI_OPCODE_LOG:
2901 case TGSI_OPCODE_DP3:
2902 case TGSI_OPCODE_DP4:
2903 case TGSI_OPCODE_DP2A:
2904 case TGSI_OPCODE_EX2:
2905 case TGSI_OPCODE_LG2:
2906 case TGSI_OPCODE_POW:
2907 case TGSI_OPCODE_XPD:
2908 case TGSI_OPCODE_DPH:
2909 case TGSI_OPCODE_COS:
2910 case TGSI_OPCODE_SIN:
2911 case TGSI_OPCODE_TEX:
2912 case TGSI_OPCODE_TXB:
2913 case TGSI_OPCODE_TXP:
2914 case TGSI_OPCODE_NRM:
2915 case TGSI_OPCODE_NRM4:
2916 case TGSI_OPCODE_DP2:
2917 /* OK - these opcodes correctly handle SOA dependencies */
2918 return TRUE;
2919 default:
2920 if (!tgsi_check_soa_dependencies(inst))
2921 return TRUE;
2922
2923 debug_printf("Warning: src/dst aliasing in instruction"
2924 " is not handled:\n");
2925 debug_printf("Warning: ");
2926 tgsi_dump_instruction(inst, 1);
2927
2928 return FALSE;
2929 }
2930 }
2931
2932
2933 /**
2934 * Translate a TGSI vertex/fragment shader to SSE2 code.
2935 * Slightly different things are done for vertex vs. fragment shaders.
2936 *
2937 * \param tokens the TGSI input shader
2938 * \param func the output SSE code/function
2939 * \param immediates buffer to place immediates, later passed to SSE func
2940 * \param return 1 for success, 0 if translation failed
2941 */
2942 unsigned
2943 tgsi_emit_sse2(
2944 const struct tgsi_token *tokens,
2945 struct x86_function *func,
2946 float (*immediates)[4],
2947 boolean do_swizzles )
2948 {
2949 struct tgsi_parse_context parse;
2950 unsigned ok = 1;
2951 uint num_immediates = 0;
2952
2953 util_init_math();
2954
2955 func->csr = func->store;
2956
2957 tgsi_parse_init( &parse, tokens );
2958
2959 /* Can't just use EDI, EBX without save/restoring them:
2960 */
2961 x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
2962 x86_push( func, x86_make_reg( file_REG32, reg_DI ) );
2963
2964 /*
2965 * Different function args for vertex/fragment shaders:
2966 */
2967 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
2968 if (do_swizzles)
2969 aos_to_soa( func,
2970 4, /* aos_input */
2971 1, /* machine */
2972 5, /* num_inputs */
2973 6 ); /* input_stride */
2974 }
2975
2976 x86_mov(
2977 func,
2978 get_machine_base(),
2979 x86_fn_arg( func, 1 ) );
2980 x86_mov(
2981 func,
2982 get_const_base(),
2983 x86_fn_arg( func, 2 ) );
2984 x86_mov(
2985 func,
2986 get_immediate_base(),
2987 x86_fn_arg( func, 3 ) );
2988
2989 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
2990 x86_mov(
2991 func,
2992 get_coef_base(),
2993 x86_fn_arg( func, 4 ) );
2994 }
2995
2996 x86_mov(
2997 func,
2998 get_sampler_base(),
2999 x86_make_disp( get_machine_base(),
3000 Offset( struct tgsi_exec_machine, Samplers ) ) );
3001
3002 while( !tgsi_parse_end_of_tokens( &parse ) && ok ) {
3003 tgsi_parse_token( &parse );
3004
3005 switch( parse.FullToken.Token.Type ) {
3006 case TGSI_TOKEN_TYPE_DECLARATION:
3007 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
3008 emit_declaration(
3009 func,
3010 &parse.FullToken.FullDeclaration );
3011 }
3012 break;
3013
3014 case TGSI_TOKEN_TYPE_INSTRUCTION:
3015 ok = emit_instruction(
3016 func,
3017 &parse.FullToken.FullInstruction );
3018
3019 if (!ok) {
3020 uint opcode = parse.FullToken.FullInstruction.Instruction.Opcode;
3021 uint proc = parse.FullHeader.Processor.Processor;
3022 debug_printf("failed to translate tgsi opcode %d (%s) to SSE (%s)\n",
3023 opcode,
3024 tgsi_get_opcode_name(opcode),
3025 tgsi_get_processor_name(proc));
3026 }
3027
3028 if (ok)
3029 ok = check_soa_dependencies(&parse.FullToken.FullInstruction);
3030 break;
3031
3032 case TGSI_TOKEN_TYPE_IMMEDIATE:
3033 /* simply copy the immediate values into the next immediates[] slot */
3034 {
3035 const uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
3036 uint i;
3037 assert(size <= 4);
3038 assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES);
3039 for( i = 0; i < size; i++ ) {
3040 immediates[num_immediates][i] =
3041 parse.FullToken.FullImmediate.u[i].Float;
3042 }
3043 #if 0
3044 debug_printf("SSE FS immediate[%d] = %f %f %f %f\n",
3045 num_immediates,
3046 immediates[num_immediates][0],
3047 immediates[num_immediates][1],
3048 immediates[num_immediates][2],
3049 immediates[num_immediates][3]);
3050 #endif
3051 num_immediates++;
3052 }
3053 break;
3054 case TGSI_TOKEN_TYPE_PROPERTY:
3055 /* we just ignore them for now */
3056 break;
3057
3058 default:
3059 ok = 0;
3060 assert( 0 );
3061 }
3062 }
3063
3064 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
3065 if (do_swizzles)
3066 soa_to_aos( func,
3067 7, /* aos_output */
3068 1, /* machine */
3069 8, /* num_outputs */
3070 9 ); /* output_stride */
3071 }
3072
3073 /* Can't just use EBX, EDI without save/restoring them:
3074 */
3075 x86_pop( func, x86_make_reg( file_REG32, reg_DI ) );
3076 x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
3077
3078 emit_ret( func );
3079
3080 tgsi_parse_free( &parse );
3081
3082 return ok;
3083 }
3084
3085 #endif /* PIPE_ARCH_X86 */