4c3343d26c31ab92fe67f8eb9d8f124caac08f97
[mesa.git] / src / gallium / auxiliary / tgsi / tgsi_sse2.c
1 /**************************************************************************
2 *
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28 #include "pipe/p_config.h"
29
30 #if defined(PIPE_ARCH_X86)
31
32 #include "util/u_debug.h"
33 #include "pipe/p_shader_tokens.h"
34 #include "util/u_math.h"
35 #include "util/u_memory.h"
36 #if defined(PIPE_ARCH_SSE)
37 #include "util/u_sse.h"
38 #endif
39 #include "tgsi/tgsi_parse.h"
40 #include "tgsi/tgsi_util.h"
41 #include "tgsi_exec.h"
42 #include "tgsi_sse2.h"
43
44 #include "rtasm/rtasm_x86sse.h"
45
46 /* for 1/sqrt()
47 *
48 * This costs about 100fps (close to 10%) in gears:
49 */
/* Use a Newton-Raphson refinement step for RSQ (see emit_rsqrt()). */
#define HIGH_PRECISION 1

#define FAST_MATH 1


/* Iterate CHAN over the four vector channels (x, y, z, w). */
#define FOR_EACH_CHANNEL( CHAN )\
   for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)

/* Test whether dst register 0's writemask enables channel CHAN. */
#define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
   ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))

#define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
   if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))

/* Iterate CHAN over only the channels enabled in dst register 0's writemask. */
#define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
   FOR_EACH_CHANNEL( CHAN )\
      IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )

#define CHAN_X 0
#define CHAN_Y 1
#define CHAN_Z 2
#define CHAN_W 3

/* Short aliases for reserved machine temporaries (declared in tgsi_exec.h). */
#define TEMP_ONE_I TGSI_EXEC_TEMP_ONE_I
#define TEMP_ONE_C TGSI_EXEC_TEMP_ONE_C

#define TEMP_R0 TGSI_EXEC_TEMP_R0
#define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
#define TEMP_EXEC_MASK_I TGSI_EXEC_MASK_I
#define TEMP_EXEC_MASK_C TGSI_EXEC_MASK_C
80
81
82 /**
83 * X86 utility functions.
84 */
85
86 static struct x86_reg
87 make_xmm(
88 unsigned xmm )
89 {
90 return x86_make_reg(
91 file_XMM,
92 (enum x86_reg_name) xmm );
93 }
94
95 /**
96 * X86 register mapping helpers.
97 */
98
99 static struct x86_reg
100 get_const_base( void )
101 {
102 return x86_make_reg(
103 file_REG32,
104 reg_AX );
105 }
106
107 static struct x86_reg
108 get_machine_base( void )
109 {
110 return x86_make_reg(
111 file_REG32,
112 reg_CX );
113 }
114
115 static struct x86_reg
116 get_input_base( void )
117 {
118 return x86_make_disp(
119 get_machine_base(),
120 Offset(struct tgsi_exec_machine, Inputs) );
121 }
122
123 static struct x86_reg
124 get_output_base( void )
125 {
126 return x86_make_disp(
127 get_machine_base(),
128 Offset(struct tgsi_exec_machine, Outputs) );
129 }
130
131 static struct x86_reg
132 get_temp_base( void )
133 {
134 return x86_make_disp(
135 get_machine_base(),
136 Offset(struct tgsi_exec_machine, Temps) );
137 }
138
139 static struct x86_reg
140 get_coef_base( void )
141 {
142 return x86_make_reg(
143 file_REG32,
144 reg_BX );
145 }
146
147 static struct x86_reg
148 get_sampler_base( void )
149 {
150 return x86_make_reg(
151 file_REG32,
152 reg_DI );
153 }
154
155 static struct x86_reg
156 get_immediate_base( void )
157 {
158 return x86_make_reg(
159 file_REG32,
160 reg_DX );
161 }
162
163
164 /**
165 * Data access helpers.
166 */
167
168
169 static struct x86_reg
170 get_immediate(
171 unsigned vec,
172 unsigned chan )
173 {
174 return x86_make_disp(
175 get_immediate_base(),
176 (vec * 4 + chan) * 4 );
177 }
178
179 static struct x86_reg
180 get_const(
181 unsigned vec,
182 unsigned chan )
183 {
184 return x86_make_disp(
185 get_const_base(),
186 (vec * 4 + chan) * 4 );
187 }
188
189 static struct x86_reg
190 get_sampler_ptr(
191 unsigned unit )
192 {
193 return x86_make_disp(
194 get_sampler_base(),
195 unit * sizeof( struct tgsi_sampler * ) );
196 }
197
198 static struct x86_reg
199 get_input(
200 unsigned vec,
201 unsigned chan )
202 {
203 return x86_make_disp(
204 get_input_base(),
205 (vec * 4 + chan) * 16 );
206 }
207
208 static struct x86_reg
209 get_output(
210 unsigned vec,
211 unsigned chan )
212 {
213 return x86_make_disp(
214 get_output_base(),
215 (vec * 4 + chan) * 16 );
216 }
217
218 static struct x86_reg
219 get_temp(
220 unsigned vec,
221 unsigned chan )
222 {
223 return x86_make_disp(
224 get_temp_base(),
225 (vec * 4 + chan) * 16 );
226 }
227
228 static struct x86_reg
229 get_coef(
230 unsigned vec,
231 unsigned chan,
232 unsigned member )
233 {
234 return x86_make_disp(
235 get_coef_base(),
236 ((vec * 3 + member) * 4 + chan) * 4 );
237 }
238
239
/* Emit a return from the generated function. */
static void
emit_ret( struct x86_function *func )
{
   x86_ret( func );
}
246
247
248 /**
249 * Data fetch helpers.
250 */
251
252 /**
253 * Copy a shader constant to xmm register
254 * \param xmm the destination xmm register
255 * \param vec the src const buffer index
256 * \param chan src channel to fetch (X, Y, Z or W)
257 */
static void
emit_const(
   struct x86_function *func,
   uint xmm,
   int vec,
   uint chan,
   uint indirect,       /* non-zero: address via the ADDR register */
   uint indirectFile,   /* must be TGSI_FILE_ADDRESS */
   int indirectIndex )  /* must be 0 */
{
   if (indirect) {
      /* 'vec' is the offset from the address register's value.
       * We're loading CONST[ADDR+vec] into an xmm register.
       */
      struct x86_reg r0 = get_immediate_base();
      struct x86_reg r1 = get_coef_base();
      uint i;

      assert( indirectFile == TGSI_FILE_ADDRESS );
      assert( indirectIndex == 0 );
      assert( r0.mod == mod_REG );
      assert( r1.mod == mod_REG );

      /* Save the two base registers we are about to clobber as scratch. */
      x86_push( func, r0 );
      x86_push( func, r1 );

      /*
       * Loop over the four pixels or vertices in the quad.
       * Get the value of the address (offset) register for pixel/vertex[i],
       * add it to the src offset and index into the constant buffer.
       * Note that we're working on SOA data.
       * If any of the pixel/vertex execution channels are unused their
       * values will be garbage. It's very important that we don't use
       * those garbage values as indexes into the constant buffer since
       * that'll cause segfaults.
       * The solution is to bitwise-AND the offset with the execution mask
       * register whose values are either 0 or ~0.
       * The caller must setup the execution mask register to indicate
       * which channels are valid/alive before running the shader.
       * The execution mask will also figure into loops and conditionals
       * someday.
       */
      for (i = 0; i < QUAD_SIZE; i++) {
         /* r1 = address register[i] */
         x86_mov( func, r1, x86_make_disp( get_temp( TEMP_ADDR, CHAN_X ), i * 4 ) );
         /* r0 = execution mask[i] */
         x86_mov( func, r0, x86_make_disp( get_temp( TEMP_EXEC_MASK_I, TEMP_EXEC_MASK_C ), i * 4 ) );
         /* r1 = r1 & r0 */
         x86_and( func, r1, r0 );
         /* r0 = 'vec', the offset */
         x86_lea( func, r0, get_const( vec, chan ) );

         /* Quick hack to multiply r1 by 16 -- need to add SHL to rtasm.
          */
         x86_add( func, r1, r1 );
         x86_add( func, r1, r1 );
         x86_add( func, r1, r1 );
         x86_add( func, r1, r1 );

         x86_add( func, r0, r1 );  /* r0 = r0 + r1 */
         /* Gather the addressed float into the i'th slot of scratch TEMP_R0. */
         x86_mov( func, r1, x86_deref( r0 ) );
         x86_mov( func, x86_make_disp( get_temp( TEMP_R0, CHAN_X ), i * 4 ), r1 );
      }

      x86_pop( func, r1 );
      x86_pop( func, r0 );

      /* Load the four gathered values as one quad. */
      sse_movaps(
         func,
         make_xmm( xmm ),
         get_temp( TEMP_R0, CHAN_X ) );
   }
   else {
      /* 'vec' is the index into the src register file, such as TEMP[vec] */
      assert( vec >= 0 );

      /* Load the scalar constant and splat it across all four lanes. */
      sse_movss(
         func,
         make_xmm( xmm ),
         get_const( vec, chan ) );
      sse_shufps(
         func,
         make_xmm( xmm ),
         make_xmm( xmm ),
         SHUF( 0, 0, 0, 0 ) );
   }
}
345
346 static void
347 emit_immediate(
348 struct x86_function *func,
349 unsigned xmm,
350 unsigned vec,
351 unsigned chan )
352 {
353 sse_movss(
354 func,
355 make_xmm( xmm ),
356 get_immediate( vec, chan ) );
357 sse_shufps(
358 func,
359 make_xmm( xmm ),
360 make_xmm( xmm ),
361 SHUF( 0, 0, 0, 0 ) );
362 }
363
364
365 /**
366 * Copy a shader input to xmm register
367 * \param xmm the destination xmm register
368 * \param vec the src input attrib
369 * \param chan src channel to fetch (X, Y, Z or W)
370 */
/**
 * Load the SOA quad INPUT[vec].chan into an xmm register.
 */
static void
emit_inputf( struct x86_function *func,
             unsigned xmm,
             unsigned vec,
             unsigned chan )
{
   sse_movups( func, make_xmm( xmm ), get_input( vec, chan ) );
}
383
384 /**
385 * Store an xmm register to a shader output
386 * \param xmm the source xmm register
387 * \param vec the dest output attrib
388 * \param chan src dest channel to store (X, Y, Z or W)
389 */
/**
 * Store an xmm register to the SOA quad OUTPUT[vec].chan.
 */
static void
emit_output( struct x86_function *func,
             unsigned xmm,
             unsigned vec,
             unsigned chan )
{
   sse_movups( func, get_output( vec, chan ), make_xmm( xmm ) );
}
402
403 /**
404 * Copy a shader temporary to xmm register
405 * \param xmm the destination xmm register
406 * \param vec the src temp register
407 * \param chan src channel to fetch (X, Y, Z or W)
408 */
/**
 * Load the SOA quad TEMP[vec].chan into an xmm register (aligned load).
 */
static void
emit_tempf( struct x86_function *func,
            unsigned xmm,
            unsigned vec,
            unsigned chan )
{
   sse_movaps( func, make_xmm( xmm ), get_temp( vec, chan ) );
}
421
422 /**
423 * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
424 * \param xmm the destination xmm register
425 * \param vec the src input/attribute coefficient index
426 * \param chan src channel to fetch (X, Y, Z or W)
427 * \param member 0=a0, 1=dadx, 2=dady
428 */
429 static void
430 emit_coef(
431 struct x86_function *func,
432 unsigned xmm,
433 unsigned vec,
434 unsigned chan,
435 unsigned member )
436 {
437 sse_movss(
438 func,
439 make_xmm( xmm ),
440 get_coef( vec, chan, member ) );
441 sse_shufps(
442 func,
443 make_xmm( xmm ),
444 make_xmm( xmm ),
445 SHUF( 0, 0, 0, 0 ) );
446 }
447
448 /**
449 * Data store helpers.
450 */
451
452 static void
453 emit_inputs(
454 struct x86_function *func,
455 unsigned xmm,
456 unsigned vec,
457 unsigned chan )
458 {
459 sse_movups(
460 func,
461 get_input( vec, chan ),
462 make_xmm( xmm ) );
463 }
464
465 static void
466 emit_temps(
467 struct x86_function *func,
468 unsigned xmm,
469 unsigned vec,
470 unsigned chan )
471 {
472 sse_movaps(
473 func,
474 get_temp( vec, chan ),
475 make_xmm( xmm ) );
476 }
477
478 static void
479 emit_addrs(
480 struct x86_function *func,
481 unsigned xmm,
482 unsigned vec,
483 unsigned chan )
484 {
485 assert( vec == 0 );
486
487 emit_temps(
488 func,
489 xmm,
490 vec + TGSI_EXEC_TEMP_ADDR,
491 chan );
492 }
493
494 /**
495 * Coefficent fetch helpers.
496 */
497
/* Fetch the a0 (constant term) coefficient for an attrib channel. */
static void
emit_coef_a0( struct x86_function *func,
              unsigned xmm,
              unsigned vec,
              unsigned chan )
{
   emit_coef( func, xmm, vec, chan, 0 );
}

/* Fetch the dadx (x derivative) coefficient for an attrib channel. */
static void
emit_coef_dadx( struct x86_function *func,
                unsigned xmm,
                unsigned vec,
                unsigned chan )
{
   emit_coef( func, xmm, vec, chan, 1 );
}

/* Fetch the dady (y derivative) coefficient for an attrib channel. */
static void
emit_coef_dady( struct x86_function *func,
                unsigned xmm,
                unsigned vec,
                unsigned chan )
{
   emit_coef( func, xmm, vec, chan, 2 );
}
542
543 /**
544 * Function call helpers.
545 */
546
547 /**
548 * NOTE: In gcc, if the destination uses the SSE intrinsics, then it must be
549 * defined with __attribute__((force_align_arg_pointer)), as we do not guarantee
550 * that the stack pointer is 16 byte aligned, as expected.
551 */
/**
 * Emit a call from generated code to the C function 'code' (cdecl).
 * Caller-saved GP registers (eax, ecx, edx) and the xmm registers selected
 * by 'xmm_save_mask' are preserved across the call.  The address of each
 * of the 'nr_args' argument buffers is pushed on the stack.
 */
static void
emit_func_call(
   struct x86_function *func,
   unsigned xmm_save_mask,
   const struct x86_reg *arg,
   unsigned nr_args,
   void (PIPE_CDECL *code)() )
{
   struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
   unsigned i, n;

   /* Save caller-saved GP registers. */
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_AX) );
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_CX) );
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_DX) );

   /* Store XMM regs to the stack
    */
   /* First count how many xmm registers need saving... */
   for(i = 0, n = 0; i < 8; ++i)
      if(xmm_save_mask & (1 << i))
         ++n;

   /* ...then reserve 16 bytes of stack per saved register. */
   x86_sub_imm(
      func,
      x86_make_reg( file_REG32, reg_SP ),
      n*16);

   for(i = 0, n = 0; i < 8; ++i)
      if(xmm_save_mask & (1 << i)) {
         sse_movups(
            func,
            x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ),
            make_xmm( i ) );
         ++n;
      }

   for (i = 0; i < nr_args; i++) {
      /* Load the address of the buffer we use for passing arguments and
       * receiving results:
       */
      x86_lea(
         func,
         ecx,
         arg[i] );

      /* Push actual function arguments (currently just the pointer to
       * the buffer above), and call the function:
       */
      x86_push( func, ecx );
   }

   x86_mov_reg_imm( func, ecx, (unsigned long) code );
   x86_call( func, ecx );

   /* Pop the arguments (or just add an immediate to esp)
    */
   for (i = 0; i < nr_args; i++) {
      x86_pop(func, ecx );
   }

   /* Pop the saved XMM regs:
    */
   for(i = 0, n = 0; i < 8; ++i)
      if(xmm_save_mask & (1 << i)) {
         sse_movups(
            func,
            make_xmm( i ),
            x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ) );
         ++n;
      }

   /* Release the stack space reserved above (n*16 bytes). */
   x86_add_imm(
      func,
      x86_make_reg( file_REG32, reg_SP ),
      n*16);

   /* Restore GP registers in a reverse order.
    */
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_DX) );
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_CX) );
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_AX) );
}
645
646 static void
647 emit_func_call_dst_src1(
648 struct x86_function *func,
649 unsigned xmm_save,
650 unsigned xmm_dst,
651 unsigned xmm_src0,
652 void (PIPE_CDECL *code)() )
653 {
654 struct x86_reg store = get_temp( TEMP_R0, 0 );
655 unsigned xmm_mask = ((1 << xmm_save) - 1) & ~(1 << xmm_dst);
656
657 /* Store our input parameters (in xmm regs) to the buffer we use
658 * for passing arguments. We will pass a pointer to this buffer as
659 * the actual function argument.
660 */
661 sse_movaps(
662 func,
663 store,
664 make_xmm( xmm_src0 ) );
665
666 emit_func_call( func,
667 xmm_mask,
668 &store,
669 1,
670 code );
671
672 sse_movaps(
673 func,
674 make_xmm( xmm_dst ),
675 store );
676 }
677
678
679 static void
680 emit_func_call_dst_src2(
681 struct x86_function *func,
682 unsigned xmm_save,
683 unsigned xmm_dst,
684 unsigned xmm_src0,
685 unsigned xmm_src1,
686 void (PIPE_CDECL *code)() )
687 {
688 struct x86_reg store = get_temp( TEMP_R0, 0 );
689 unsigned xmm_mask = ((1 << xmm_save) - 1) & ~(1 << xmm_dst);
690
691 /* Store two inputs to parameter buffer.
692 */
693 sse_movaps(
694 func,
695 store,
696 make_xmm( xmm_src0 ) );
697
698 sse_movaps(
699 func,
700 x86_make_disp( store, 4 * sizeof(float) ),
701 make_xmm( xmm_src1 ) );
702
703
704 /* Emit the call
705 */
706 emit_func_call( func,
707 xmm_mask,
708 &store,
709 1,
710 code );
711
712 /* Retrieve the results:
713 */
714 sse_movaps(
715 func,
716 make_xmm( xmm_dst ),
717 store );
718 }
719
720
721
722
723
724 #if defined(PIPE_ARCH_SSE)
725
726 /*
727 * Fast SSE2 implementation of special math functions.
728 */
729
730 #define POLY0(x, c0) _mm_set1_ps(c0)
731 #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
732 #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
733 #define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
734 #define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
735 #define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
736
737 #define EXP_POLY_DEGREE 3
738 #define LOG_POLY_DEGREE 5
739
740 /**
741 * See http://www.devmaster.net/forums/showthread.php?p=43580
742 */
static INLINE __m128
exp2f4(__m128 x)
{
   __m128i ipart;
   __m128 fpart, expipart, expfpart;

   /* Clamp the input so the constructed exponent below stays in range. */
   x = _mm_min_ps(x, _mm_set1_ps( 129.00000f));
   x = _mm_max_ps(x, _mm_set1_ps(-126.99999f));

   /* ipart = int(x - 0.5) */
   ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f)));

   /* fpart = x - ipart */
   fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart));

   /* expipart = (float) (1 << ipart) */
   /* (built by placing ipart+127 directly into the IEEE exponent field) */
   expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23));

   /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
#if EXP_POLY_DEGREE == 5
   expfpart = POLY5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f);
#elif EXP_POLY_DEGREE == 4
   expfpart = POLY4(fpart, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 5.2011464e-2f, 1.3534167e-2f);
#elif EXP_POLY_DEGREE == 3
   expfpart = POLY3(fpart, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 7.8024521e-2f);
#elif EXP_POLY_DEGREE == 2
   expfpart = POLY2(fpart, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f);
#else
#error
#endif

   /* 2^x = 2^ipart * 2^fpart */
   return _mm_mul_ps(expipart, expfpart);
}
776
777
778 /**
779 * See http://www.devmaster.net/forums/showthread.php?p=43580
780 */
static INLINE __m128
log2f4(__m128 x)
{
   /* IEEE-754 single precision field masks. */
   __m128i expmask = _mm_set1_epi32(0x7f800000);
   __m128i mantmask = _mm_set1_epi32(0x007fffff);
   __m128 one = _mm_set1_ps(1.0f);

   /* Reinterpret the float bits as integers for field extraction. */
   __m128i i = _mm_castps_si128(x);

   /* exp = (float) exponent(x) */
   __m128 exp = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, expmask), 23), _mm_set1_epi32(127)));

   /* mant = (float) mantissa(x) */
   /* (mantissa bits OR'd with 1.0f yield a float in [1, 2[) */
   __m128 mant = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, mantmask)), one);

   __m128 logmant;

   /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
    * These coefficients can be generate with
    * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
    */
#if LOG_POLY_DEGREE == 6
   logmant = POLY5(mant, 3.11578814719469302614f, -3.32419399085241980044f, 2.59883907202499966007f, -1.23152682416275988241f, 0.318212422185251071475f, -0.0344359067839062357313f);
#elif LOG_POLY_DEGREE == 5
   logmant = POLY4(mant, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
   logmant = POLY3(mant, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
   logmant = POLY2(mant, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
#else
#error
#endif

   /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
   logmant = _mm_mul_ps(logmant, _mm_sub_ps(mant, one));

   /* log2(x) = exponent + log2(mantissa) */
   return _mm_add_ps(logmant, exp);
}
819
820
821 static INLINE __m128
822 powf4(__m128 x, __m128 y)
823 {
824 return exp2f4(_mm_mul_ps(log2f4(x), y));
825 }
826
827 #endif /* PIPE_ARCH_SSE */
828
829
830
831 /**
832 * Low-level instruction translators.
833 */
834
835 static void
836 emit_abs(
837 struct x86_function *func,
838 unsigned xmm )
839 {
840 sse_andps(
841 func,
842 make_xmm( xmm ),
843 get_temp(
844 TGSI_EXEC_TEMP_7FFFFFFF_I,
845 TGSI_EXEC_TEMP_7FFFFFFF_C ) );
846 }
847
848 static void
849 emit_add(
850 struct x86_function *func,
851 unsigned xmm_dst,
852 unsigned xmm_src )
853 {
854 sse_addps(
855 func,
856 make_xmm( xmm_dst ),
857 make_xmm( xmm_src ) );
858 }
859
860 static void PIPE_CDECL
861 cos4f(
862 float *store )
863 {
864 store[0] = cosf( store[0] );
865 store[1] = cosf( store[1] );
866 store[2] = cosf( store[2] );
867 store[3] = cosf( store[3] );
868 }
869
870 static void
871 emit_cos(
872 struct x86_function *func,
873 unsigned xmm_save,
874 unsigned xmm_dst )
875 {
876 emit_func_call_dst_src1(
877 func,
878 xmm_save,
879 xmm_dst,
880 xmm_dst,
881 cos4f );
882 }
883
/* C helper called from generated code: 2^x of 4 floats, in place.
 * Uses the SSE polynomial approximation when available, otherwise a
 * scalar fast-math fallback. */
static void PIPE_CDECL
#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
__attribute__((force_align_arg_pointer))
#endif
ex24f(
   float *store )
{
#if defined(PIPE_ARCH_SSE)
   _mm_store_ps(&store[0], exp2f4( _mm_load_ps(&store[0]) ));
#else
   store[0] = util_fast_exp2( store[0] );
   store[1] = util_fast_exp2( store[1] );
   store[2] = util_fast_exp2( store[2] );
   store[3] = util_fast_exp2( store[3] );
#endif
}
900
901 static void
902 emit_ex2(
903 struct x86_function *func,
904 unsigned xmm_save,
905 unsigned xmm_dst )
906 {
907 emit_func_call_dst_src1(
908 func,
909 xmm_save,
910 xmm_dst,
911 xmm_dst,
912 ex24f );
913 }
914
/* Convert 4 packed floats to ints in place (truncation). */
static void
emit_f2it( struct x86_function *func, unsigned xmm )
{
   sse2_cvttps2dq( func, make_xmm( xmm ), make_xmm( xmm ) );
}

/* Convert 4 packed ints to floats in place. */
static void
emit_i2f( struct x86_function *func, unsigned xmm )
{
   sse2_cvtdq2ps( func, make_xmm( xmm ), make_xmm( xmm ) );
}
936
937 static void PIPE_CDECL
938 flr4f(
939 float *store )
940 {
941 store[0] = floorf( store[0] );
942 store[1] = floorf( store[1] );
943 store[2] = floorf( store[2] );
944 store[3] = floorf( store[3] );
945 }
946
947 static void
948 emit_flr(
949 struct x86_function *func,
950 unsigned xmm_save,
951 unsigned xmm_dst )
952 {
953 emit_func_call_dst_src1(
954 func,
955 xmm_save,
956 xmm_dst,
957 xmm_dst,
958 flr4f );
959 }
960
961 static void PIPE_CDECL
962 frc4f(
963 float *store )
964 {
965 store[0] -= floorf( store[0] );
966 store[1] -= floorf( store[1] );
967 store[2] -= floorf( store[2] );
968 store[3] -= floorf( store[3] );
969 }
970
971 static void
972 emit_frc(
973 struct x86_function *func,
974 unsigned xmm_save,
975 unsigned xmm_dst )
976 {
977 emit_func_call_dst_src1(
978 func,
979 xmm_save,
980 xmm_dst,
981 xmm_dst,
982 frc4f );
983 }
984
/* C helper called from generated code: log2 of 4 floats, in place.
 * Uses the SSE polynomial approximation when available, otherwise a
 * scalar fast-math fallback. */
static void PIPE_CDECL
#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
__attribute__((force_align_arg_pointer))
#endif
lg24f(
   float *store )
{
#if defined(PIPE_ARCH_SSE)
   _mm_store_ps(&store[0], log2f4( _mm_load_ps(&store[0]) ));
#else
   store[0] = util_fast_log2( store[0] );
   store[1] = util_fast_log2( store[1] );
   store[2] = util_fast_log2( store[2] );
   store[3] = util_fast_log2( store[3] );
#endif
}
1001
1002 static void
1003 emit_lg2(
1004 struct x86_function *func,
1005 unsigned xmm_save,
1006 unsigned xmm_dst )
1007 {
1008 emit_func_call_dst_src1(
1009 func,
1010 xmm_save,
1011 xmm_dst,
1012 xmm_dst,
1013 lg24f );
1014 }
1015
/* xmm_dst := xmm_src (full 128-bit register copy). */
static void
emit_MOV( struct x86_function *func, unsigned xmm_dst, unsigned xmm_src )
{
   sse_movups( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}

/* xmm_dst := xmm_dst * xmm_src, four floats at once. */
static void
emit_mul( struct x86_function *func, unsigned xmm_dst, unsigned xmm_src )
{
   sse_mulps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
1038
1039 static void
1040 emit_neg(
1041 struct x86_function *func,
1042 unsigned xmm )
1043 {
1044 sse_xorps(
1045 func,
1046 make_xmm( xmm ),
1047 get_temp(
1048 TGSI_EXEC_TEMP_80000000_I,
1049 TGSI_EXEC_TEMP_80000000_C ) );
1050 }
1051
/* C helper called from generated code: store[0..3] = store[0..3] ^ store[4..7].
 * Uses the SSE polynomial approximation when available, otherwise a
 * scalar fast-math fallback. */
static void PIPE_CDECL
#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
__attribute__((force_align_arg_pointer))
#endif
pow4f(
   float *store )
{
#if defined(PIPE_ARCH_SSE)
   _mm_store_ps(&store[0], powf4( _mm_load_ps(&store[0]), _mm_load_ps(&store[4]) ));
#else
   store[0] = util_fast_pow( store[0], store[4] );
   store[1] = util_fast_pow( store[1], store[5] );
   store[2] = util_fast_pow( store[2], store[6] );
   store[3] = util_fast_pow( store[3], store[7] );
#endif
}
1068
1069 static void
1070 emit_pow(
1071 struct x86_function *func,
1072 unsigned xmm_save,
1073 unsigned xmm_dst,
1074 unsigned xmm_src0,
1075 unsigned xmm_src1 )
1076 {
1077 emit_func_call_dst_src2(
1078 func,
1079 xmm_save,
1080 xmm_dst,
1081 xmm_src0,
1082 xmm_src1,
1083 pow4f );
1084 }
1085
/* xmm_dst := approximate 1/xmm_src. */
static void
emit_rcp( struct x86_function *func, unsigned xmm_dst, unsigned xmm_src )
{
   /* On Intel CPUs at least, this is only accurate to 12 bits -- not
    * good enough.  Need to either emit a proper divide or use the
    * iterative technique described below in emit_rsqrt().
    */
   sse2_rcpps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
1101
1102 static void PIPE_CDECL
1103 rnd4f(
1104 float *store )
1105 {
1106 store[0] = floorf( store[0] + 0.5f );
1107 store[1] = floorf( store[1] + 0.5f );
1108 store[2] = floorf( store[2] + 0.5f );
1109 store[3] = floorf( store[3] + 0.5f );
1110 }
1111
1112 static void
1113 emit_rnd(
1114 struct x86_function *func,
1115 unsigned xmm_save,
1116 unsigned xmm_dst )
1117 {
1118 emit_func_call_dst_src1(
1119 func,
1120 xmm_save,
1121 xmm_dst,
1122 xmm_dst,
1123 rnd4f );
1124 }
1125
/* xmm_dst := 1/sqrt(xmm_src).  Clobbers xmm2 and xmm3 in the
 * HIGH_PRECISION path. */
static void
emit_rsqrt(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
#if HIGH_PRECISION
   /* Although rsqrtps() and rcpps() are low precision on some/all SSE
    * implementations, it is possible to improve its precision at
    * fairly low cost, using a newton/raphson step, as below:
    *
    * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
    * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
    *
    * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
    */
   {
      struct x86_reg dst = make_xmm( xmm_dst );
      struct x86_reg src = make_xmm( xmm_src );
      struct x86_reg tmp0 = make_xmm( 2 );
      struct x86_reg tmp1 = make_xmm( 3 );

      /* The fixed scratch registers must not alias the operands. */
      assert( xmm_dst != xmm_src );
      assert( xmm_dst != 2 && xmm_dst != 3 );
      assert( xmm_src != 2 && xmm_src != 3 );

      sse_movaps( func, dst, get_temp( TGSI_EXEC_TEMP_HALF_I, TGSI_EXEC_TEMP_HALF_C ) );   /* dst = 0.5 */
      sse_movaps( func, tmp0, get_temp( TGSI_EXEC_TEMP_THREE_I, TGSI_EXEC_TEMP_THREE_C ) ); /* tmp0 = 3.0 */
      sse_rsqrtps( func, tmp1, src );        /* tmp1 = rsqrtps(a) */
      sse_mulps( func, src, tmp1 );          /* src = a * rsqrtps(a) */
      sse_mulps( func, dst, tmp1 );          /* dst = 0.5 * rsqrtps(a) */
      sse_mulps( func, src, tmp1 );          /* src = a * rsqrtps(a)^2 */
      sse_subps( func, tmp0, src );          /* tmp0 = 3.0 - a * rsqrtps(a)^2 */
      sse_mulps( func, dst, tmp0 );          /* dst = refined estimate */
   }
#else
   /* On Intel CPUs at least, this is only accurate to 12 bits -- not
    * good enough.
    */
   sse_rsqrtps(
      func,
      make_xmm( xmm_dst ),
      make_xmm( xmm_src ) );
#endif
}
1171
1172 static void
1173 emit_setsign(
1174 struct x86_function *func,
1175 unsigned xmm )
1176 {
1177 sse_orps(
1178 func,
1179 make_xmm( xmm ),
1180 get_temp(
1181 TGSI_EXEC_TEMP_80000000_I,
1182 TGSI_EXEC_TEMP_80000000_C ) );
1183 }
1184
1185 static void PIPE_CDECL
1186 sgn4f(
1187 float *store )
1188 {
1189 store[0] = store[0] < 0.0f ? -1.0f : store[0] > 0.0f ? 1.0f : 0.0f;
1190 store[1] = store[1] < 0.0f ? -1.0f : store[1] > 0.0f ? 1.0f : 0.0f;
1191 store[2] = store[2] < 0.0f ? -1.0f : store[2] > 0.0f ? 1.0f : 0.0f;
1192 store[3] = store[3] < 0.0f ? -1.0f : store[3] > 0.0f ? 1.0f : 0.0f;
1193 }
1194
1195 static void
1196 emit_sgn(
1197 struct x86_function *func,
1198 unsigned xmm_save,
1199 unsigned xmm_dst )
1200 {
1201 emit_func_call_dst_src1(
1202 func,
1203 xmm_save,
1204 xmm_dst,
1205 xmm_dst,
1206 sgn4f );
1207 }
1208
1209 static void PIPE_CDECL
1210 sin4f(
1211 float *store )
1212 {
1213 store[0] = sinf( store[0] );
1214 store[1] = sinf( store[1] );
1215 store[2] = sinf( store[2] );
1216 store[3] = sinf( store[3] );
1217 }
1218
1219 static void
1220 emit_sin (struct x86_function *func,
1221 unsigned xmm_save,
1222 unsigned xmm_dst)
1223 {
1224 emit_func_call_dst_src1(
1225 func,
1226 xmm_save,
1227 xmm_dst,
1228 xmm_dst,
1229 sin4f );
1230 }
1231
/* xmm_dst := xmm_dst - xmm_src, four floats at once. */
static void
emit_sub( struct x86_function *func, unsigned xmm_dst, unsigned xmm_src )
{
   sse_subps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
1243
1244
1245
1246
1247
1248
1249
1250 /**
1251 * Register fetch.
1252 */
1253
/**
 * Load one channel of a source operand into an xmm register, honoring
 * the register file, extended swizzle (including ZERO/ONE) and the
 * source sign mode.
 */
static void
emit_fetch(
   struct x86_function *func,
   unsigned xmm,
   const struct tgsi_full_src_register *reg,
   const unsigned chan_index )
{
   unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index );

   switch (swizzle) {
   case TGSI_EXTSWIZZLE_X:
   case TGSI_EXTSWIZZLE_Y:
   case TGSI_EXTSWIZZLE_Z:
   case TGSI_EXTSWIZZLE_W:
      /* Ordinary component select: dispatch on the register file. */
      switch (reg->SrcRegister.File) {
      case TGSI_FILE_CONSTANT:
         emit_const(
            func,
            xmm,
            reg->SrcRegister.Index,
            swizzle,
            reg->SrcRegister.Indirect,
            reg->SrcRegisterInd.File,
            reg->SrcRegisterInd.Index );
         break;

      case TGSI_FILE_IMMEDIATE:
         emit_immediate(
            func,
            xmm,
            reg->SrcRegister.Index,
            swizzle );
         break;

      case TGSI_FILE_INPUT:
         emit_inputf(
            func,
            xmm,
            reg->SrcRegister.Index,
            swizzle );
         break;

      case TGSI_FILE_TEMPORARY:
         emit_tempf(
            func,
            xmm,
            reg->SrcRegister.Index,
            swizzle );
         break;

      default:
         assert( 0 );
      }
      break;

   case TGSI_EXTSWIZZLE_ZERO:
      /* Constant 0.0 lives in a reserved machine temp. */
      emit_tempf(
         func,
         xmm,
         TGSI_EXEC_TEMP_00000000_I,
         TGSI_EXEC_TEMP_00000000_C );
      break;

   case TGSI_EXTSWIZZLE_ONE:
      /* Constant 1.0 lives in a reserved machine temp. */
      emit_tempf(
         func,
         xmm,
         TEMP_ONE_I,
         TEMP_ONE_C );
      break;

   default:
      assert( 0 );
   }

   /* Apply the source modifier, if any. */
   switch( tgsi_util_get_full_src_register_sign_mode( reg, chan_index ) ) {
   case TGSI_UTIL_SIGN_CLEAR:
      emit_abs( func, xmm );
      break;

   case TGSI_UTIL_SIGN_SET:
      emit_setsign( func, xmm );
      break;

   case TGSI_UTIL_SIGN_TOGGLE:
      emit_neg( func, xmm );
      break;

   case TGSI_UTIL_SIGN_KEEP:
      break;
   }
}
1346
/* Fetch source operand INDEX, channel CHAN, of instruction INST into XMM. */
#define FETCH( FUNC, INST, XMM, INDEX, CHAN )\
   emit_fetch( FUNC, XMM, &(INST).FullSrcRegisters[INDEX], CHAN )
1349
1350 /**
1351 * Register store.
1352 */
1353
/**
 * Store an xmm register to one channel of the destination operand.
 * Saturation is not implemented: ZERO_ONE is silently ignored (note the
 * commented-out assert) and MINUS_PLUS_ONE asserts.
 */
static void
emit_store(
   struct x86_function *func,
   unsigned xmm,
   const struct tgsi_full_dst_register *reg,
   const struct tgsi_full_instruction *inst,
   unsigned chan_index )
{
   switch( reg->DstRegister.File ) {
   case TGSI_FILE_OUTPUT:
      emit_output(
         func,
         xmm,
         reg->DstRegister.Index,
         chan_index );
      break;

   case TGSI_FILE_TEMPORARY:
      emit_temps(
         func,
         xmm,
         reg->DstRegister.Index,
         chan_index );
      break;

   case TGSI_FILE_ADDRESS:
      emit_addrs(
         func,
         xmm,
         reg->DstRegister.Index,
         chan_index );
      break;

   default:
      assert( 0 );
   }

   switch( inst->Instruction.Saturate ) {
   case TGSI_SAT_NONE:
      break;

   case TGSI_SAT_ZERO_ONE:
      /* assert( 0 ); */
      break;

   case TGSI_SAT_MINUS_PLUS_ONE:
      assert( 0 );
      break;
   }
}
1404
/* Store XMM to dest operand INDEX, channel CHAN, of instruction INST. */
#define STORE( FUNC, INST, XMM, INDEX, CHAN )\
   emit_store( FUNC, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN )
1407
1408
/* C helper called from generated code to sample a texture.
 * 'store' holds the packed texcoords on entry (s in [0..3], t in [4..7],
 * p in [8..11], lodbias at [12] -- currently forced to 0.0) and receives
 * the 4x4 RGBA result on exit. */
static void PIPE_CDECL
fetch_texel( struct tgsi_sampler **sampler,
             float *store )
{
#if 0
   uint j;

   debug_printf("%s sampler: %p (%p) store: %p\n",
                __FUNCTION__,
                sampler, *sampler,
                store );

   debug_printf("lodbias %f\n", store[12]);

   for (j = 0; j < 4; j++)
      debug_printf("sample %d texcoord %f %f\n",
                   j,
                   store[0+j],
                   store[4+j]);
#endif

   {
      float rgba[NUM_CHANNELS][QUAD_SIZE];
      (*sampler)->get_samples(*sampler,
                              &store[0],
                              &store[4],
                              &store[8],
                              0.0f, /*store[12],  lodbias */
                              rgba);

      /* Copy the sampled colors back over the texcoords. */
      memcpy( store, rgba, 16 * sizeof(float));
   }

#if 0
   for (j = 0; j < 4; j++)
      debug_printf("sample %d result %f %f %f %f\n",
                   j,
                   store[0+j],
                   store[4+j],
                   store[8+j],
                   store[12+j]);
#endif
}
1452
1453 /**
1454 * High-level instruction translators.
1455 */
1456
/**
 * Emit code for a TEX/TXB/TXP-style instruction: gather the texture
 * coordinates (and lod bias) into the TEMP_R0 scratch area, call
 * fetch_texel() through emit_func_call(), then copy the results to the
 * destination channels.
 *
 * \param lodbias    fetch src[0].w as a lod bias value
 * \param projected  divide the coordinates by src[0].w (TXP)
 *
 * Note: callers never pass both lodbias and projected as TRUE, which
 * matters because both paths use xmm3 for src[0].w.
 */
static void
emit_tex( struct x86_function *func,
          const struct tgsi_full_instruction *inst,
          boolean lodbias,
          boolean projected)
{
   const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index;
   struct x86_reg args[2];
   unsigned count;
   unsigned i;

   /* Number of coordinate components to fetch for this target. */
   switch (inst->InstructionExtTexture.Texture) {
   case TGSI_TEXTURE_1D:
   case TGSI_TEXTURE_SHADOW1D:
      count = 1;
      break;
   case TGSI_TEXTURE_2D:
   case TGSI_TEXTURE_RECT:
   case TGSI_TEXTURE_SHADOW2D:
   case TGSI_TEXTURE_SHADOWRECT:
      count = 2;
      break;
   case TGSI_TEXTURE_3D:
   case TGSI_TEXTURE_CUBE:
      count = 3;
      break;
   default:
      assert(0);
      return;
   }

   /* xmm3 = lod bias (src[0].w) or 0.0 */
   if (lodbias) {
      FETCH( func, *inst, 3, 0, 3 );
   }
   else {
      emit_tempf(
         func,
         3,
         TGSI_EXEC_TEMP_00000000_I,
         TGSI_EXEC_TEMP_00000000_C );

   }

   /* Store the lodbias value to TEMP_R0[3] whether enabled or not.
    * NOTE(review): fetch_texel() currently passes a constant 0.0 to
    * get_samples(), so this stored bias is ignored at sample time.
    */
   sse_movaps( func,
               get_temp( TEMP_R0, 3 ),
               make_xmm( 3 ) );


   /* For TXP, xmm3 = 1 / src[0].w, used to project the coords below. */
   if (projected) {
      FETCH( func, *inst, 3, 0, 3 );

      emit_rcp( func, 3, 3 );
   }

   /* Fetch each coordinate component, project if needed, and stage it
    * in the TEMP_R0 argument buffer.
    */
   for (i = 0; i < count; i++) {
      FETCH( func, *inst, i, 0, i );

      if (projected) {
         sse_mulps(
            func,
            make_xmm( i ),
            make_xmm( 3 ) );
      }

      /* Store in the argument buffer:
       */
      sse_movaps(
         func,
         get_temp( TEMP_R0, i ),
         make_xmm( i ) );
   }

   /* Call fetch_texel( &samplers[unit], TEMP_R0 ). */
   args[0] = get_temp( TEMP_R0, 0 );
   args[1] = get_sampler_ptr( unit );


   emit_func_call( func,
                   0,
                   args,
                   Elements(args),
                   fetch_texel );

   /* If all four channels are enabled, could use a pointer to
    * dst[0].x instead of TEMP_R0 for store?
    */
   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, i ) {

      sse_movaps(
         func,
         make_xmm( 0 ),
         get_temp( TEMP_R0, i ) );

      STORE( func, *inst, 0, 0, i );
   }
}
1555
1556
/**
 * Emit code for the KIL (conditional kill) instruction: OR into the
 * exec machine's kill mask a bit for every pixel whose referenced
 * source component is negative.
 *
 * Each distinct swizzled component is fetched once into its own xmm
 * register, compared against zero, and the per-pixel compare masks are
 * accumulated into EAX via movmskps before being OR'd into the kill
 * mask temp.  EAX/EDX are saved and restored around the sequence.
 */
static void
emit_kil(
   struct x86_function *func,
   const struct tgsi_full_src_register *reg )
{
   unsigned uniquemask;
   unsigned unique_count = 0;
   unsigned chan_index;
   unsigned i;

   /* This mask stores component bits that were already tested. Note that
    * we test if the value is less than zero, so 1.0 and 0.0 need not to be
    * tested. */
   uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE);

   FOR_EACH_CHANNEL( chan_index ) {
      unsigned swizzle;

      /* unswizzle channel */
      swizzle = tgsi_util_get_full_src_register_extswizzle(
         reg,
         chan_index );

      /* check if the component has not been already tested */
      if( !(uniquemask & (1 << swizzle)) ) {
         uniquemask |= 1 << swizzle;

         /* allocate register: each unique component gets the next xmm */
         emit_fetch(
            func,
            unique_count++,
            reg,
            chan_index );
      }
   }

   /* Save scratch registers used for mask accumulation. */
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_AX ) );
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_DX ) );

   for (i = 0 ; i < unique_count; i++ ) {
      struct x86_reg dataXMM = make_xmm(i);

      /* xmm = per-pixel all-ones mask where value < 0 */
      sse_cmpps(
         func,
         dataXMM,
         get_temp(
            TGSI_EXEC_TEMP_00000000_I,
            TGSI_EXEC_TEMP_00000000_C ),
         cc_LessThan );

      if( i == 0 ) {
         /* first component: EAX = movmsk bits */
         sse_movmskps(
            func,
            x86_make_reg( file_REG32, reg_AX ),
            dataXMM );
      }
      else {
         /* subsequent components: EAX |= movmsk bits (via EDX) */
         sse_movmskps(
            func,
            x86_make_reg( file_REG32, reg_DX ),
            dataXMM );
         x86_or(
            func,
            x86_make_reg( file_REG32, reg_AX ),
            x86_make_reg( file_REG32, reg_DX ) );
      }
   }

   /* Accumulate the per-pixel kill bits into the machine's kill mask. */
   x86_or(
      func,
      get_temp(
         TGSI_EXEC_TEMP_KILMASK_I,
         TGSI_EXEC_TEMP_KILMASK_C ),
      x86_make_reg( file_REG32, reg_AX ) );

   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_DX ) );
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_AX ) );
}
1643
1644
/**
 * Emit code for KILP (predicated kill).  Not implemented: no code is
 * generated, and the caller returns failure so the shader falls back
 * to the interpreter (see the TGSI_OPCODE_KILP case).
 */
static void
emit_kilp(
   struct x86_function *func )
{
   /* XXX todo / fix me */
}
1651
1652
1653 static void
1654 emit_setcc(
1655 struct x86_function *func,
1656 struct tgsi_full_instruction *inst,
1657 enum sse_cc cc )
1658 {
1659 unsigned chan_index;
1660
1661 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1662 FETCH( func, *inst, 0, 0, chan_index );
1663 FETCH( func, *inst, 1, 1, chan_index );
1664 sse_cmpps(
1665 func,
1666 make_xmm( 0 ),
1667 make_xmm( 1 ),
1668 cc );
1669 sse_andps(
1670 func,
1671 make_xmm( 0 ),
1672 get_temp(
1673 TEMP_ONE_I,
1674 TEMP_ONE_C ) );
1675 STORE( func, *inst, 0, 0, chan_index );
1676 }
1677 }
1678
1679 static void
1680 emit_cmp(
1681 struct x86_function *func,
1682 struct tgsi_full_instruction *inst )
1683 {
1684 unsigned chan_index;
1685
1686 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1687 FETCH( func, *inst, 0, 0, chan_index );
1688 FETCH( func, *inst, 1, 1, chan_index );
1689 FETCH( func, *inst, 2, 2, chan_index );
1690 sse_cmpps(
1691 func,
1692 make_xmm( 0 ),
1693 get_temp(
1694 TGSI_EXEC_TEMP_00000000_I,
1695 TGSI_EXEC_TEMP_00000000_C ),
1696 cc_LessThan );
1697 sse_andps(
1698 func,
1699 make_xmm( 1 ),
1700 make_xmm( 0 ) );
1701 sse_andnps(
1702 func,
1703 make_xmm( 0 ),
1704 make_xmm( 2 ) );
1705 sse_orps(
1706 func,
1707 make_xmm( 0 ),
1708 make_xmm( 1 ) );
1709 STORE( func, *inst, 0, 0, chan_index );
1710 }
1711 }
1712
1713
1714 /**
1715 * Check if inst src/dest regs use indirect addressing into temporary
1716 * register file.
1717 */
1718 static boolean
1719 indirect_temp_reference(const struct tgsi_full_instruction *inst)
1720 {
1721 uint i;
1722 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1723 const struct tgsi_full_src_register *reg = &inst->FullSrcRegisters[i];
1724 if (reg->SrcRegister.File == TGSI_FILE_TEMPORARY &&
1725 reg->SrcRegister.Indirect)
1726 return TRUE;
1727 }
1728 for (i = 0; i < inst->Instruction.NumDstRegs; i++) {
1729 const struct tgsi_full_dst_register *reg = &inst->FullDstRegisters[i];
1730 if (reg->DstRegister.File == TGSI_FILE_TEMPORARY &&
1731 reg->DstRegister.Indirect)
1732 return TRUE;
1733 }
1734 return FALSE;
1735 }
1736
1737
1738 static int
1739 emit_instruction(
1740 struct x86_function *func,
1741 struct tgsi_full_instruction *inst )
1742 {
1743 unsigned chan_index;
1744
1745 /* we can't handle indirect addressing into temp register file yet */
1746 if (indirect_temp_reference(inst))
1747 return FALSE;
1748
1749 switch (inst->Instruction.Opcode) {
1750 case TGSI_OPCODE_ARL:
1751 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1752 FETCH( func, *inst, 0, 0, chan_index );
1753 emit_flr(func, 0, 0);
1754 emit_f2it( func, 0 );
1755 STORE( func, *inst, 0, 0, chan_index );
1756 }
1757 break;
1758
1759 case TGSI_OPCODE_MOV:
1760 case TGSI_OPCODE_SWZ:
1761 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1762 FETCH( func, *inst, 0, 0, chan_index );
1763 STORE( func, *inst, 0, 0, chan_index );
1764 }
1765 break;
1766
1767 case TGSI_OPCODE_LIT:
1768 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1769 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1770 emit_tempf(
1771 func,
1772 0,
1773 TEMP_ONE_I,
1774 TEMP_ONE_C);
1775 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
1776 STORE( func, *inst, 0, 0, CHAN_X );
1777 }
1778 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1779 STORE( func, *inst, 0, 0, CHAN_W );
1780 }
1781 }
1782 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1783 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1784 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
1785 FETCH( func, *inst, 0, 0, CHAN_X );
1786 sse_maxps(
1787 func,
1788 make_xmm( 0 ),
1789 get_temp(
1790 TGSI_EXEC_TEMP_00000000_I,
1791 TGSI_EXEC_TEMP_00000000_C ) );
1792 STORE( func, *inst, 0, 0, CHAN_Y );
1793 }
1794 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1795 /* XMM[1] = SrcReg[0].yyyy */
1796 FETCH( func, *inst, 1, 0, CHAN_Y );
1797 /* XMM[1] = max(XMM[1], 0) */
1798 sse_maxps(
1799 func,
1800 make_xmm( 1 ),
1801 get_temp(
1802 TGSI_EXEC_TEMP_00000000_I,
1803 TGSI_EXEC_TEMP_00000000_C ) );
1804 /* XMM[2] = SrcReg[0].wwww */
1805 FETCH( func, *inst, 2, 0, CHAN_W );
1806 /* XMM[2] = min(XMM[2], 128.0) */
1807 sse_minps(
1808 func,
1809 make_xmm( 2 ),
1810 get_temp(
1811 TGSI_EXEC_TEMP_128_I,
1812 TGSI_EXEC_TEMP_128_C ) );
1813 /* XMM[2] = max(XMM[2], -128.0) */
1814 sse_maxps(
1815 func,
1816 make_xmm( 2 ),
1817 get_temp(
1818 TGSI_EXEC_TEMP_MINUS_128_I,
1819 TGSI_EXEC_TEMP_MINUS_128_C ) );
1820 emit_pow( func, 3, 1, 1, 2 );
1821 FETCH( func, *inst, 0, 0, CHAN_X );
1822 sse_xorps(
1823 func,
1824 make_xmm( 2 ),
1825 make_xmm( 2 ) );
1826 sse_cmpps(
1827 func,
1828 make_xmm( 2 ),
1829 make_xmm( 0 ),
1830 cc_LessThan );
1831 sse_andps(
1832 func,
1833 make_xmm( 2 ),
1834 make_xmm( 1 ) );
1835 STORE( func, *inst, 2, 0, CHAN_Z );
1836 }
1837 }
1838 break;
1839
1840 case TGSI_OPCODE_RCP:
1841 /* TGSI_OPCODE_RECIP */
1842 FETCH( func, *inst, 0, 0, CHAN_X );
1843 emit_rcp( func, 0, 0 );
1844 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1845 STORE( func, *inst, 0, 0, chan_index );
1846 }
1847 break;
1848
1849 case TGSI_OPCODE_RSQ:
1850 /* TGSI_OPCODE_RECIPSQRT */
1851 FETCH( func, *inst, 0, 0, CHAN_X );
1852 emit_abs( func, 0 );
1853 emit_rsqrt( func, 1, 0 );
1854 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1855 STORE( func, *inst, 1, 0, chan_index );
1856 }
1857 break;
1858
1859 case TGSI_OPCODE_EXP:
1860 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1861 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1862 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1863 FETCH( func, *inst, 0, 0, CHAN_X );
1864 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1865 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1866 emit_MOV( func, 1, 0 );
1867 emit_flr( func, 2, 1 );
1868 /* dst.x = ex2(floor(src.x)) */
1869 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
1870 emit_MOV( func, 2, 1 );
1871 emit_ex2( func, 3, 2 );
1872 STORE( func, *inst, 2, 0, CHAN_X );
1873 }
1874 /* dst.y = src.x - floor(src.x) */
1875 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1876 emit_MOV( func, 2, 0 );
1877 emit_sub( func, 2, 1 );
1878 STORE( func, *inst, 2, 0, CHAN_Y );
1879 }
1880 }
1881 /* dst.z = ex2(src.x) */
1882 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1883 emit_ex2( func, 3, 0 );
1884 STORE( func, *inst, 0, 0, CHAN_Z );
1885 }
1886 }
1887 /* dst.w = 1.0 */
1888 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
1889 emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
1890 STORE( func, *inst, 0, 0, CHAN_W );
1891 }
1892 break;
1893
1894 case TGSI_OPCODE_LOG:
1895 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1896 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1897 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1898 FETCH( func, *inst, 0, 0, CHAN_X );
1899 emit_abs( func, 0 );
1900 emit_MOV( func, 1, 0 );
1901 emit_lg2( func, 2, 1 );
1902 /* dst.z = lg2(abs(src.x)) */
1903 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1904 STORE( func, *inst, 1, 0, CHAN_Z );
1905 }
1906 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1907 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1908 emit_flr( func, 2, 1 );
1909 /* dst.x = floor(lg2(abs(src.x))) */
1910 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
1911 STORE( func, *inst, 1, 0, CHAN_X );
1912 }
1913 /* dst.x = abs(src)/ex2(floor(lg2(abs(src.x)))) */
1914 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1915 emit_ex2( func, 2, 1 );
1916 emit_rcp( func, 1, 1 );
1917 emit_mul( func, 0, 1 );
1918 STORE( func, *inst, 0, 0, CHAN_Y );
1919 }
1920 }
1921 }
1922 /* dst.w = 1.0 */
1923 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
1924 emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
1925 STORE( func, *inst, 0, 0, CHAN_W );
1926 }
1927 break;
1928
1929 case TGSI_OPCODE_MUL:
1930 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1931 FETCH( func, *inst, 0, 0, chan_index );
1932 FETCH( func, *inst, 1, 1, chan_index );
1933 emit_mul( func, 0, 1 );
1934 STORE( func, *inst, 0, 0, chan_index );
1935 }
1936 break;
1937
1938 case TGSI_OPCODE_ADD:
1939 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1940 FETCH( func, *inst, 0, 0, chan_index );
1941 FETCH( func, *inst, 1, 1, chan_index );
1942 emit_add( func, 0, 1 );
1943 STORE( func, *inst, 0, 0, chan_index );
1944 }
1945 break;
1946
1947 case TGSI_OPCODE_DP3:
1948 /* TGSI_OPCODE_DOT3 */
1949 FETCH( func, *inst, 0, 0, CHAN_X );
1950 FETCH( func, *inst, 1, 1, CHAN_X );
1951 emit_mul( func, 0, 1 );
1952 FETCH( func, *inst, 1, 0, CHAN_Y );
1953 FETCH( func, *inst, 2, 1, CHAN_Y );
1954 emit_mul( func, 1, 2 );
1955 emit_add( func, 0, 1 );
1956 FETCH( func, *inst, 1, 0, CHAN_Z );
1957 FETCH( func, *inst, 2, 1, CHAN_Z );
1958 emit_mul( func, 1, 2 );
1959 emit_add( func, 0, 1 );
1960 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1961 STORE( func, *inst, 0, 0, chan_index );
1962 }
1963 break;
1964
1965 case TGSI_OPCODE_DP4:
1966 /* TGSI_OPCODE_DOT4 */
1967 FETCH( func, *inst, 0, 0, CHAN_X );
1968 FETCH( func, *inst, 1, 1, CHAN_X );
1969 emit_mul( func, 0, 1 );
1970 FETCH( func, *inst, 1, 0, CHAN_Y );
1971 FETCH( func, *inst, 2, 1, CHAN_Y );
1972 emit_mul( func, 1, 2 );
1973 emit_add( func, 0, 1 );
1974 FETCH( func, *inst, 1, 0, CHAN_Z );
1975 FETCH( func, *inst, 2, 1, CHAN_Z );
1976 emit_mul(func, 1, 2 );
1977 emit_add(func, 0, 1 );
1978 FETCH( func, *inst, 1, 0, CHAN_W );
1979 FETCH( func, *inst, 2, 1, CHAN_W );
1980 emit_mul( func, 1, 2 );
1981 emit_add( func, 0, 1 );
1982 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1983 STORE( func, *inst, 0, 0, chan_index );
1984 }
1985 break;
1986
1987 case TGSI_OPCODE_DST:
1988 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
1989 emit_tempf(
1990 func,
1991 0,
1992 TEMP_ONE_I,
1993 TEMP_ONE_C );
1994 STORE( func, *inst, 0, 0, CHAN_X );
1995 }
1996 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
1997 FETCH( func, *inst, 0, 0, CHAN_Y );
1998 FETCH( func, *inst, 1, 1, CHAN_Y );
1999 emit_mul( func, 0, 1 );
2000 STORE( func, *inst, 0, 0, CHAN_Y );
2001 }
2002 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2003 FETCH( func, *inst, 0, 0, CHAN_Z );
2004 STORE( func, *inst, 0, 0, CHAN_Z );
2005 }
2006 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2007 FETCH( func, *inst, 0, 1, CHAN_W );
2008 STORE( func, *inst, 0, 0, CHAN_W );
2009 }
2010 break;
2011
2012 case TGSI_OPCODE_MIN:
2013 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2014 FETCH( func, *inst, 0, 0, chan_index );
2015 FETCH( func, *inst, 1, 1, chan_index );
2016 sse_minps(
2017 func,
2018 make_xmm( 0 ),
2019 make_xmm( 1 ) );
2020 STORE( func, *inst, 0, 0, chan_index );
2021 }
2022 break;
2023
2024 case TGSI_OPCODE_MAX:
2025 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2026 FETCH( func, *inst, 0, 0, chan_index );
2027 FETCH( func, *inst, 1, 1, chan_index );
2028 sse_maxps(
2029 func,
2030 make_xmm( 0 ),
2031 make_xmm( 1 ) );
2032 STORE( func, *inst, 0, 0, chan_index );
2033 }
2034 break;
2035
2036 case TGSI_OPCODE_SLT:
2037 /* TGSI_OPCODE_SETLT */
2038 emit_setcc( func, inst, cc_LessThan );
2039 break;
2040
2041 case TGSI_OPCODE_SGE:
2042 /* TGSI_OPCODE_SETGE */
2043 emit_setcc( func, inst, cc_NotLessThan );
2044 break;
2045
2046 case TGSI_OPCODE_MAD:
2047 /* TGSI_OPCODE_MADD */
2048 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2049 FETCH( func, *inst, 0, 0, chan_index );
2050 FETCH( func, *inst, 1, 1, chan_index );
2051 FETCH( func, *inst, 2, 2, chan_index );
2052 emit_mul( func, 0, 1 );
2053 emit_add( func, 0, 2 );
2054 STORE( func, *inst, 0, 0, chan_index );
2055 }
2056 break;
2057
2058 case TGSI_OPCODE_SUB:
2059 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2060 FETCH( func, *inst, 0, 0, chan_index );
2061 FETCH( func, *inst, 1, 1, chan_index );
2062 emit_sub( func, 0, 1 );
2063 STORE( func, *inst, 0, 0, chan_index );
2064 }
2065 break;
2066
2067 case TGSI_OPCODE_LERP:
2068 /* TGSI_OPCODE_LRP */
2069 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2070 FETCH( func, *inst, 0, 0, chan_index );
2071 FETCH( func, *inst, 1, 1, chan_index );
2072 FETCH( func, *inst, 2, 2, chan_index );
2073 emit_sub( func, 1, 2 );
2074 emit_mul( func, 0, 1 );
2075 emit_add( func, 0, 2 );
2076 STORE( func, *inst, 0, 0, chan_index );
2077 }
2078 break;
2079
2080 case TGSI_OPCODE_CND:
2081 return 0;
2082 break;
2083
2084 case TGSI_OPCODE_CND0:
2085 return 0;
2086 break;
2087
2088 case TGSI_OPCODE_DOT2ADD:
2089 /* TGSI_OPCODE_DP2A */
2090 FETCH( func, *inst, 0, 0, CHAN_X ); /* xmm0 = src[0].x */
2091 FETCH( func, *inst, 1, 1, CHAN_X ); /* xmm1 = src[1].x */
2092 emit_mul( func, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
2093 FETCH( func, *inst, 1, 0, CHAN_Y ); /* xmm1 = src[0].y */
2094 FETCH( func, *inst, 2, 1, CHAN_Y ); /* xmm2 = src[1].y */
2095 emit_mul( func, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
2096 emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2097 FETCH( func, *inst, 1, 2, CHAN_X ); /* xmm1 = src[2].x */
2098 emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2099 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2100 STORE( func, *inst, 0, 0, chan_index ); /* dest[ch] = xmm0 */
2101 }
2102 break;
2103
2104 case TGSI_OPCODE_INDEX:
2105 return 0;
2106 break;
2107
2108 case TGSI_OPCODE_NEGATE:
2109 return 0;
2110 break;
2111
2112 case TGSI_OPCODE_FRAC:
2113 /* TGSI_OPCODE_FRC */
2114 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2115 FETCH( func, *inst, 0, 0, chan_index );
2116 emit_frc( func, 0, 0 );
2117 STORE( func, *inst, 0, 0, chan_index );
2118 }
2119 break;
2120
2121 case TGSI_OPCODE_CLAMP:
2122 return 0;
2123 break;
2124
2125 case TGSI_OPCODE_FLOOR:
2126 /* TGSI_OPCODE_FLR */
2127 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2128 FETCH( func, *inst, 0, 0, chan_index );
2129 emit_flr( func, 0, 0 );
2130 STORE( func, *inst, 0, 0, chan_index );
2131 }
2132 break;
2133
2134 case TGSI_OPCODE_ROUND:
2135 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2136 FETCH( func, *inst, 0, 0, chan_index );
2137 emit_rnd( func, 0, 0 );
2138 STORE( func, *inst, 0, 0, chan_index );
2139 }
2140 break;
2141
2142 case TGSI_OPCODE_EXPBASE2:
2143 /* TGSI_OPCODE_EX2 */
2144 FETCH( func, *inst, 0, 0, CHAN_X );
2145 emit_ex2( func, 0, 0 );
2146 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2147 STORE( func, *inst, 0, 0, chan_index );
2148 }
2149 break;
2150
2151 case TGSI_OPCODE_LOGBASE2:
2152 /* TGSI_OPCODE_LG2 */
2153 FETCH( func, *inst, 0, 0, CHAN_X );
2154 emit_lg2( func, 0, 0 );
2155 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2156 STORE( func, *inst, 0, 0, chan_index );
2157 }
2158 break;
2159
2160 case TGSI_OPCODE_POWER:
2161 /* TGSI_OPCODE_POW */
2162 FETCH( func, *inst, 0, 0, CHAN_X );
2163 FETCH( func, *inst, 1, 1, CHAN_X );
2164 emit_pow( func, 0, 0, 0, 1 );
2165 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2166 STORE( func, *inst, 0, 0, chan_index );
2167 }
2168 break;
2169
2170 case TGSI_OPCODE_CROSSPRODUCT:
2171 /* TGSI_OPCODE_XPD */
2172 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
2173 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
2174 FETCH( func, *inst, 1, 1, CHAN_Z );
2175 FETCH( func, *inst, 3, 0, CHAN_Z );
2176 }
2177 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
2178 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
2179 FETCH( func, *inst, 0, 0, CHAN_Y );
2180 FETCH( func, *inst, 4, 1, CHAN_Y );
2181 }
2182 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
2183 emit_MOV( func, 2, 0 );
2184 emit_mul( func, 2, 1 );
2185 emit_MOV( func, 5, 3 );
2186 emit_mul( func, 5, 4 );
2187 emit_sub( func, 2, 5 );
2188 STORE( func, *inst, 2, 0, CHAN_X );
2189 }
2190 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
2191 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
2192 FETCH( func, *inst, 2, 1, CHAN_X );
2193 FETCH( func, *inst, 5, 0, CHAN_X );
2194 }
2195 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2196 emit_mul( func, 3, 2 );
2197 emit_mul( func, 1, 5 );
2198 emit_sub( func, 3, 1 );
2199 STORE( func, *inst, 3, 0, CHAN_Y );
2200 }
2201 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2202 emit_mul( func, 5, 4 );
2203 emit_mul( func, 0, 2 );
2204 emit_sub( func, 5, 0 );
2205 STORE( func, *inst, 5, 0, CHAN_Z );
2206 }
2207 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2208 emit_tempf(
2209 func,
2210 0,
2211 TEMP_ONE_I,
2212 TEMP_ONE_C );
2213 STORE( func, *inst, 0, 0, CHAN_W );
2214 }
2215 break;
2216
2217 case TGSI_OPCODE_MULTIPLYMATRIX:
2218 return 0;
2219 break;
2220
2221 case TGSI_OPCODE_ABS:
2222 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2223 FETCH( func, *inst, 0, 0, chan_index );
2224 emit_abs( func, 0) ;
2225
2226 STORE( func, *inst, 0, 0, chan_index );
2227 }
2228 break;
2229
2230 case TGSI_OPCODE_RCC:
2231 return 0;
2232 break;
2233
2234 case TGSI_OPCODE_DPH:
2235 FETCH( func, *inst, 0, 0, CHAN_X );
2236 FETCH( func, *inst, 1, 1, CHAN_X );
2237 emit_mul( func, 0, 1 );
2238 FETCH( func, *inst, 1, 0, CHAN_Y );
2239 FETCH( func, *inst, 2, 1, CHAN_Y );
2240 emit_mul( func, 1, 2 );
2241 emit_add( func, 0, 1 );
2242 FETCH( func, *inst, 1, 0, CHAN_Z );
2243 FETCH( func, *inst, 2, 1, CHAN_Z );
2244 emit_mul( func, 1, 2 );
2245 emit_add( func, 0, 1 );
2246 FETCH( func, *inst, 1, 1, CHAN_W );
2247 emit_add( func, 0, 1 );
2248 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2249 STORE( func, *inst, 0, 0, chan_index );
2250 }
2251 break;
2252
2253 case TGSI_OPCODE_COS:
2254 FETCH( func, *inst, 0, 0, CHAN_X );
2255 emit_cos( func, 0, 0 );
2256 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2257 STORE( func, *inst, 0, 0, chan_index );
2258 }
2259 break;
2260
2261 case TGSI_OPCODE_DDX:
2262 return 0;
2263 break;
2264
2265 case TGSI_OPCODE_DDY:
2266 return 0;
2267 break;
2268
2269 case TGSI_OPCODE_KILP:
2270 /* predicated kill */
2271 emit_kilp( func );
2272 return 0; /* XXX fix me */
2273 break;
2274
2275 case TGSI_OPCODE_KIL:
2276 /* conditional kill */
2277 emit_kil( func, &inst->FullSrcRegisters[0] );
2278 break;
2279
2280 case TGSI_OPCODE_PK2H:
2281 return 0;
2282 break;
2283
2284 case TGSI_OPCODE_PK2US:
2285 return 0;
2286 break;
2287
2288 case TGSI_OPCODE_PK4B:
2289 return 0;
2290 break;
2291
2292 case TGSI_OPCODE_PK4UB:
2293 return 0;
2294 break;
2295
2296 case TGSI_OPCODE_RFL:
2297 return 0;
2298 break;
2299
2300 case TGSI_OPCODE_SEQ:
2301 return 0;
2302 break;
2303
2304 case TGSI_OPCODE_SFL:
2305 return 0;
2306 break;
2307
2308 case TGSI_OPCODE_SGT:
2309 return 0;
2310 break;
2311
2312 case TGSI_OPCODE_SIN:
2313 FETCH( func, *inst, 0, 0, CHAN_X );
2314 emit_sin( func, 0, 0 );
2315 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2316 STORE( func, *inst, 0, 0, chan_index );
2317 }
2318 break;
2319
2320 case TGSI_OPCODE_SLE:
2321 return 0;
2322 break;
2323
2324 case TGSI_OPCODE_SNE:
2325 return 0;
2326 break;
2327
2328 case TGSI_OPCODE_STR:
2329 return 0;
2330 break;
2331
2332 case TGSI_OPCODE_TEX:
2333 emit_tex( func, inst, FALSE, FALSE );
2334 break;
2335
2336 case TGSI_OPCODE_TXD:
2337 return 0;
2338 break;
2339
2340 case TGSI_OPCODE_UP2H:
2341 return 0;
2342 break;
2343
2344 case TGSI_OPCODE_UP2US:
2345 return 0;
2346 break;
2347
2348 case TGSI_OPCODE_UP4B:
2349 return 0;
2350 break;
2351
2352 case TGSI_OPCODE_UP4UB:
2353 return 0;
2354 break;
2355
2356 case TGSI_OPCODE_X2D:
2357 return 0;
2358 break;
2359
2360 case TGSI_OPCODE_ARA:
2361 return 0;
2362 break;
2363
2364 case TGSI_OPCODE_ARR:
2365 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2366 FETCH( func, *inst, 0, 0, chan_index );
2367 emit_rnd( func, 0, 0 );
2368 emit_f2it( func, 0 );
2369 STORE( func, *inst, 0, 0, chan_index );
2370 }
2371 break;
2372
2373 case TGSI_OPCODE_BRA:
2374 return 0;
2375 break;
2376
2377 case TGSI_OPCODE_CAL:
2378 return 0;
2379 break;
2380
2381 case TGSI_OPCODE_RET:
2382 emit_ret( func );
2383 break;
2384
2385 case TGSI_OPCODE_END:
2386 break;
2387
2388 case TGSI_OPCODE_SSG:
2389 /* TGSI_OPCODE_SGN */
2390 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2391 FETCH( func, *inst, 0, 0, chan_index );
2392 emit_sgn( func, 0, 0 );
2393 STORE( func, *inst, 0, 0, chan_index );
2394 }
2395 break;
2396
2397 case TGSI_OPCODE_CMP:
2398 emit_cmp (func, inst);
2399 break;
2400
2401 case TGSI_OPCODE_SCS:
2402 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
2403 FETCH( func, *inst, 0, 0, CHAN_X );
2404 emit_cos( func, 0, 0 );
2405 STORE( func, *inst, 0, 0, CHAN_X );
2406 }
2407 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2408 FETCH( func, *inst, 0, 0, CHAN_X );
2409 emit_sin( func, 0, 0 );
2410 STORE( func, *inst, 0, 0, CHAN_Y );
2411 }
2412 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2413 emit_tempf(
2414 func,
2415 0,
2416 TGSI_EXEC_TEMP_00000000_I,
2417 TGSI_EXEC_TEMP_00000000_C );
2418 STORE( func, *inst, 0, 0, CHAN_Z );
2419 }
2420 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2421 emit_tempf(
2422 func,
2423 0,
2424 TEMP_ONE_I,
2425 TEMP_ONE_C );
2426 STORE( func, *inst, 0, 0, CHAN_W );
2427 }
2428 break;
2429
2430 case TGSI_OPCODE_TXB:
2431 emit_tex( func, inst, TRUE, FALSE );
2432 break;
2433
2434 case TGSI_OPCODE_NRM:
2435 /* fall-through */
2436 case TGSI_OPCODE_NRM4:
2437 /* 3 or 4-component normalization */
2438 {
2439 uint dims = (inst->Instruction.Opcode == TGSI_OPCODE_NRM) ? 3 : 4;
2440
2441 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) ||
2442 IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y) ||
2443 IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z) ||
2444 (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 4)) {
2445
2446 /* NOTE: Cannot use xmm regs 2/3 here (see emit_rsqrt() above). */
2447
2448 /* xmm4 = src.x */
2449 /* xmm0 = src.x * src.x */
2450 FETCH(func, *inst, 0, 0, CHAN_X);
2451 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
2452 emit_MOV(func, 4, 0);
2453 }
2454 emit_mul(func, 0, 0);
2455
2456 /* xmm5 = src.y */
2457 /* xmm0 = xmm0 + src.y * src.y */
2458 FETCH(func, *inst, 1, 0, CHAN_Y);
2459 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2460 emit_MOV(func, 5, 1);
2461 }
2462 emit_mul(func, 1, 1);
2463 emit_add(func, 0, 1);
2464
2465 /* xmm6 = src.z */
2466 /* xmm0 = xmm0 + src.z * src.z */
2467 FETCH(func, *inst, 1, 0, CHAN_Z);
2468 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2469 emit_MOV(func, 6, 1);
2470 }
2471 emit_mul(func, 1, 1);
2472 emit_add(func, 0, 1);
2473
2474 if (dims == 4) {
2475 /* xmm7 = src.w */
2476 /* xmm0 = xmm0 + src.w * src.w */
2477 FETCH(func, *inst, 1, 0, CHAN_W);
2478 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W)) {
2479 emit_MOV(func, 7, 1);
2480 }
2481 emit_mul(func, 1, 1);
2482 emit_add(func, 0, 1);
2483 }
2484
2485 /* xmm1 = 1 / sqrt(xmm0) */
2486 emit_rsqrt(func, 1, 0);
2487
2488 /* dst.x = xmm1 * src.x */
2489 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
2490 emit_mul(func, 4, 1);
2491 STORE(func, *inst, 4, 0, CHAN_X);
2492 }
2493
2494 /* dst.y = xmm1 * src.y */
2495 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2496 emit_mul(func, 5, 1);
2497 STORE(func, *inst, 5, 0, CHAN_Y);
2498 }
2499
2500 /* dst.z = xmm1 * src.z */
2501 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2502 emit_mul(func, 6, 1);
2503 STORE(func, *inst, 6, 0, CHAN_Z);
2504 }
2505
2506 /* dst.w = xmm1 * src.w */
2507 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) && dims == 4) {
2508 emit_mul(func, 7, 1);
2509 STORE(func, *inst, 7, 0, CHAN_W);
2510 }
2511 }
2512
2513 /* dst0.w = 1.0 */
2514 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 3) {
2515 emit_tempf(func, 0, TEMP_ONE_I, TEMP_ONE_C);
2516 STORE(func, *inst, 0, 0, CHAN_W);
2517 }
2518 }
2519 break;
2520
2521 case TGSI_OPCODE_DIV:
2522 return 0;
2523 break;
2524
2525 case TGSI_OPCODE_DP2:
2526 FETCH( func, *inst, 0, 0, CHAN_X ); /* xmm0 = src[0].x */
2527 FETCH( func, *inst, 1, 1, CHAN_X ); /* xmm1 = src[1].x */
2528 emit_mul( func, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
2529 FETCH( func, *inst, 1, 0, CHAN_Y ); /* xmm1 = src[0].y */
2530 FETCH( func, *inst, 2, 1, CHAN_Y ); /* xmm2 = src[1].y */
2531 emit_mul( func, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
2532 emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2533 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2534 STORE( func, *inst, 0, 0, chan_index ); /* dest[ch] = xmm0 */
2535 }
2536 break;
2537
2538 case TGSI_OPCODE_TXL:
2539 emit_tex( func, inst, TRUE, FALSE );
2540 break;
2541
2542 case TGSI_OPCODE_TXP:
2543 emit_tex( func, inst, FALSE, TRUE );
2544 break;
2545
2546 case TGSI_OPCODE_BRK:
2547 return 0;
2548 break;
2549
2550 case TGSI_OPCODE_IF:
2551 return 0;
2552 break;
2553
2554 case TGSI_OPCODE_LOOP:
2555 return 0;
2556 break;
2557
2558 case TGSI_OPCODE_REP:
2559 return 0;
2560 break;
2561
2562 case TGSI_OPCODE_ELSE:
2563 return 0;
2564 break;
2565
2566 case TGSI_OPCODE_ENDIF:
2567 return 0;
2568 break;
2569
2570 case TGSI_OPCODE_ENDLOOP:
2571 return 0;
2572 break;
2573
2574 case TGSI_OPCODE_ENDREP:
2575 return 0;
2576 break;
2577
2578 case TGSI_OPCODE_PUSHA:
2579 return 0;
2580 break;
2581
2582 case TGSI_OPCODE_POPA:
2583 return 0;
2584 break;
2585
2586 case TGSI_OPCODE_CEIL:
2587 return 0;
2588 break;
2589
2590 case TGSI_OPCODE_I2F:
2591 return 0;
2592 break;
2593
2594 case TGSI_OPCODE_NOT:
2595 return 0;
2596 break;
2597
2598 case TGSI_OPCODE_TRUNC:
2599 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2600 FETCH( func, *inst, 0, 0, chan_index );
2601 emit_f2it( func, 0 );
2602 emit_i2f( func, 0 );
2603 STORE( func, *inst, 0, 0, chan_index );
2604 }
2605 break;
2606
2607 case TGSI_OPCODE_SHL:
2608 return 0;
2609 break;
2610
2611 case TGSI_OPCODE_SHR:
2612 return 0;
2613 break;
2614
2615 case TGSI_OPCODE_AND:
2616 return 0;
2617 break;
2618
2619 case TGSI_OPCODE_OR:
2620 return 0;
2621 break;
2622
2623 case TGSI_OPCODE_MOD:
2624 return 0;
2625 break;
2626
2627 case TGSI_OPCODE_XOR:
2628 return 0;
2629 break;
2630
2631 case TGSI_OPCODE_SAD:
2632 return 0;
2633 break;
2634
2635 case TGSI_OPCODE_TXF:
2636 return 0;
2637 break;
2638
2639 case TGSI_OPCODE_TXQ:
2640 return 0;
2641 break;
2642
2643 case TGSI_OPCODE_CONT:
2644 return 0;
2645 break;
2646
2647 case TGSI_OPCODE_EMIT:
2648 return 0;
2649 break;
2650
2651 case TGSI_OPCODE_ENDPRIM:
2652 return 0;
2653 break;
2654
2655 default:
2656 return 0;
2657 }
2658
2659 return 1;
2660 }
2661
/**
 * Emit code for a TGSI declaration.
 *
 * Only TGSI_FILE_INPUT declarations generate code here: for every
 * declared register and every channel in the usage mask, the fragment
 * input value is computed from the plane coefficients (a0, dadx, dady)
 * according to the declared interpolation mode and written to the
 * machine's input array.
 *
 * NOTE(review): the emit_tempf(func, n, 0, TGSI_SWIZZLE_*) calls below
 * read temp register 0, which presumably holds the quad's position
 * (x, y, _, w) -- confirm against the machine setup code.
 */
static void
emit_declaration(
   struct x86_function *func,
   struct tgsi_full_declaration *decl )
{
   if( decl->Declaration.File == TGSI_FILE_INPUT ) {
      unsigned first, last, mask;
      unsigned i, j;

      first = decl->DeclarationRange.First;
      last = decl->DeclarationRange.Last;
      mask = decl->Declaration.UsageMask;

      for( i = first; i <= last; i++ ) {
         for( j = 0; j < NUM_CHANNELS; j++ ) {
            if( mask & (1 << j) ) {
               switch( decl->Declaration.Interpolate ) {
               case TGSI_INTERPOLATE_CONSTANT:
                  /* input = a0 */
                  emit_coef_a0( func, 0, i, j );
                  emit_inputs( func, 0, i, j );
                  break;

               case TGSI_INTERPOLATE_LINEAR:
                  /* input = x * dadx + y * dady + a0 */
                  emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
                  emit_coef_dadx( func, 1, i, j );
                  emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
                  emit_coef_dady( func, 3, i, j );
                  emit_mul( func, 0, 1 );    /* x * dadx */
                  emit_coef_a0( func, 4, i, j );
                  emit_mul( func, 2, 3 );    /* y * dady */
                  emit_add( func, 0, 4 );    /* x * dadx + a0 */
                  emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
                  emit_inputs( func, 0, i, j );
                  break;

               case TGSI_INTERPOLATE_PERSPECTIVE:
                  /* input = (x * dadx + y * dady + a0) / w */
                  emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
                  emit_coef_dadx( func, 1, i, j );
                  emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
                  emit_coef_dady( func, 3, i, j );
                  emit_mul( func, 0, 1 );    /* x * dadx */
                  emit_tempf( func, 4, 0, TGSI_SWIZZLE_W );
                  emit_coef_a0( func, 5, i, j );
                  emit_rcp( func, 4, 4 );    /* 1.0 / w */
                  emit_mul( func, 2, 3 );    /* y * dady */
                  emit_add( func, 0, 5 );    /* x * dadx + a0 */
                  emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
                  emit_mul( func, 0, 4 );    /* (x * dadx + y * dady + a0) / w */
                  emit_inputs( func, 0, i, j );
                  break;

               default:
                  assert( 0 );
                  break;
               }
            }
         }
      }
   }
}
2722
/**
 * Emit x86/SSE code that converts vertex data from AoS layout
 * (x y z w per vertex, vertices separated by \a arg_stride bytes) into
 * the SoA layout of machine->Inputs (xxxx yyyy zzzz wwww), processing
 * four vertices per attribute.
 *
 * Per iteration: movlps/movhps gather the x,y pairs of the four vertices
 * into xmm0/xmm1 and the z,w pairs into xmm3/xmm4; shufps with 0x88/0xdd
 * then separates even/odd components, completing a 4x4 transpose.
 * Stores use movups since the destination is only known to be a struct
 * member, not guaranteed 16-byte aligned.
 *
 * \param func         output buffer for the generated code
 * \param arg_aos      index of the generated function's AoS-pointer argument
 * \param arg_machine  index of the struct tgsi_exec_machine * argument
 * \param arg_num      index of the input-count argument; must be >= 1
 *                     since the generated loop is do/while style
 * \param arg_stride   index of the per-vertex byte-stride argument
 */
static void aos_to_soa( struct x86_function *func,
                        uint arg_aos,
                        uint arg_machine,
                        uint arg_num,
                        uint arg_stride )
{
   struct x86_reg soa_input = x86_make_reg( file_REG32, reg_AX );
   struct x86_reg aos_input = x86_make_reg( file_REG32, reg_BX );
   struct x86_reg num_inputs = x86_make_reg( file_REG32, reg_CX );
   struct x86_reg stride = x86_make_reg( file_REG32, reg_DX );
   int inner_loop;


   /* Save EBX: callee-saved in the x86 ABI but used as scratch below. */
   x86_push( func, x86_make_reg( file_REG32, reg_BX ) );

   x86_mov( func, aos_input, x86_fn_arg( func, arg_aos ) );
   x86_mov( func, soa_input, x86_fn_arg( func, arg_machine ) );
   /* Point soa_input at machine->Inputs. */
   x86_lea( func, soa_input,
            x86_make_disp( soa_input,
                           Offset(struct tgsi_exec_machine, Inputs) ) );
   x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) );
   x86_mov( func, stride, x86_fn_arg( func, arg_stride ) );

   /* do */
   inner_loop = x86_get_label( func );
   {
      /* Save the AoS base pointer; it is advanced by stride while
       * walking the four vertices and restored by the pop below.
       */
      x86_push( func, aos_input );
      /* Vertex 0: x,y -> low xmm0; z,w -> low xmm3. */
      sse_movlps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
      sse_movlps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
      x86_add( func, aos_input, stride );
      /* Vertex 1: x,y -> high xmm0; z,w -> high xmm3. */
      sse_movhps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
      sse_movhps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
      x86_add( func, aos_input, stride );
      /* Vertices 2 and 3 likewise, into xmm1/xmm4. */
      sse_movlps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
      sse_movlps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
      x86_add( func, aos_input, stride );
      sse_movhps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
      sse_movhps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
      x86_pop( func, aos_input );

      /* Finish the transpose:
       * 0x88 picks even elements (x or z), 0xdd picks odd (y or w), so
       * xmm0 = xxxx, xmm2 = yyyy, xmm3 = zzzz, xmm5 = wwww.
       */
      sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
      sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
      sse_shufps( func, make_xmm( 0 ), make_xmm( 1 ), 0x88 );
      sse_shufps( func, make_xmm( 2 ), make_xmm( 1 ), 0xdd );
      sse_shufps( func, make_xmm( 3 ), make_xmm( 4 ), 0x88 );
      sse_shufps( func, make_xmm( 5 ), make_xmm( 4 ), 0xdd );

      sse_movups( func, x86_make_disp( soa_input, 0 ), make_xmm( 0 ) );
      sse_movups( func, x86_make_disp( soa_input, 16 ), make_xmm( 2 ) );
      sse_movups( func, x86_make_disp( soa_input, 32 ), make_xmm( 3 ) );
      sse_movups( func, x86_make_disp( soa_input, 48 ), make_xmm( 5 ) );

      /* Advance to next input: 16 bytes (one vec4) within the vertex on
       * the AoS side, 64 bytes (4 channels x 4 verts) on the SoA side.
       */
      x86_lea( func, aos_input, x86_make_disp(aos_input, 16) );
      x86_lea( func, soa_input, x86_make_disp(soa_input, 64) );
   }
   /* while --num_inputs */
   x86_dec( func, num_inputs );
   x86_jcc( func, cc_NE, inner_loop );

   /* Restore EBX */
   x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
}
2787
/**
 * Emit x86/SSE code performing the inverse of aos_to_soa(): convert the
 * SoA data in machine->Outputs (xxxx yyyy zzzz wwww) back into AoS
 * vertex layout, four vertices per output attribute.
 *
 * Per iteration: unpcklps/unpckhps interleave the x/y and z/w planes so
 * xmm0,xmm2 hold (x y x y) pairs and xmm3,xmm5 hold (z w z w) pairs,
 * then movlps/movhps scatter one vertex's x,y and z,w at a time, with
 * the destination pointer advanced by the vertex stride in between.
 *
 * \param func         output buffer for the generated code
 * \param arg_aos      index of the generated function's AoS-pointer argument
 * \param arg_machine  index of the struct tgsi_exec_machine * argument
 * \param arg_num      index of the output-count argument; must be >= 1
 *                     since the generated loop is do/while style
 * \param arg_stride   index of the per-vertex byte-stride argument
 */
static void soa_to_aos( struct x86_function *func,
                        uint arg_aos,
                        uint arg_machine,
                        uint arg_num,
                        uint arg_stride )
{
   struct x86_reg soa_output = x86_make_reg( file_REG32, reg_AX );
   struct x86_reg aos_output = x86_make_reg( file_REG32, reg_BX );
   struct x86_reg num_outputs = x86_make_reg( file_REG32, reg_CX );
   struct x86_reg temp = x86_make_reg( file_REG32, reg_DX );
   int inner_loop;

   /* Save EBX: callee-saved in the x86 ABI but used as scratch below. */
   x86_push( func, x86_make_reg( file_REG32, reg_BX ) );

   x86_mov( func, aos_output, x86_fn_arg( func, arg_aos ) );
   x86_mov( func, soa_output, x86_fn_arg( func, arg_machine ) );
   /* Point soa_output at machine->Outputs. */
   x86_lea( func, soa_output,
            x86_make_disp( soa_output,
                           Offset(struct tgsi_exec_machine, Outputs) ) );
   x86_mov( func, num_outputs, x86_fn_arg( func, arg_num ) );

   /* do */
   inner_loop = x86_get_label( func );
   {
      /* Load the four component planes of this attribute. */
      sse_movups( func, make_xmm( 0 ), x86_make_disp( soa_output, 0 ) );
      sse_movups( func, make_xmm( 1 ), x86_make_disp( soa_output, 16 ) );
      sse_movups( func, make_xmm( 3 ), x86_make_disp( soa_output, 32 ) );
      sse_movups( func, make_xmm( 4 ), x86_make_disp( soa_output, 48 ) );

      /* Interleave: xmm0 = x0 y0 x1 y1, xmm2 = x2 y2 x3 y3,
       *             xmm3 = z0 w0 z1 w1, xmm5 = z2 w2 z3 w3.
       */
      sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
      sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
      sse_unpcklps( func, make_xmm( 0 ), make_xmm( 1 ) );
      sse_unpckhps( func, make_xmm( 2 ), make_xmm( 1 ) );
      sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) );
      sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) );

      /* DX holds the vertex stride; reload it each iteration since it is
       * also clobbered as scratch elsewhere.
       */
      x86_mov( func, temp, x86_fn_arg( func, arg_stride ) );
      /* Save the AoS base; it is advanced per vertex and restored by
       * the pop below.
       */
      x86_push( func, aos_output );
      /* Vertex 0: x,y then z,w. */
      sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
      sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
      x86_add( func, aos_output, temp );
      /* Vertex 1. */
      sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
      sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
      x86_add( func, aos_output, temp );
      /* Vertex 2. */
      sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
      sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
      x86_add( func, aos_output, temp );
      /* Vertex 3. */
      sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
      sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
      x86_pop( func, aos_output );

      /* Advance to next output: 16 bytes (one vec4) within the vertex on
       * the AoS side, 64 bytes (4 channels x 4 verts) on the SoA side.
       */
      x86_lea( func, aos_output, x86_make_disp(aos_output, 16) );
      x86_lea( func, soa_output, x86_make_disp(soa_output, 64) );
   }
   /* while --num_outputs */
   x86_dec( func, num_outputs );
   x86_jcc( func, cc_NE, inner_loop );

   /* Restore EBX */
   x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
}
2851
2852 /**
2853 * Translate a TGSI vertex/fragment shader to SSE2 code.
2854 * Slightly different things are done for vertex vs. fragment shaders.
2855 *
2856 * \param tokens the TGSI input shader
2857 * \param func the output SSE code/function
2858 * \param immediates buffer to place immediates, later passed to SSE func
2859 * \param return 1 for success, 0 if translation failed
2860 */
2861 unsigned
2862 tgsi_emit_sse2(
2863 const struct tgsi_token *tokens,
2864 struct x86_function *func,
2865 float (*immediates)[4],
2866 boolean do_swizzles )
2867 {
2868 struct tgsi_parse_context parse;
2869 unsigned ok = 1;
2870 uint num_immediates = 0;
2871
2872 util_init_math();
2873
2874 func->csr = func->store;
2875
2876 tgsi_parse_init( &parse, tokens );
2877
2878 /* Can't just use EDI, EBX without save/restoring them:
2879 */
2880 x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
2881 x86_push( func, x86_make_reg( file_REG32, reg_DI ) );
2882
2883 /*
2884 * Different function args for vertex/fragment shaders:
2885 */
2886 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
2887 if (do_swizzles)
2888 aos_to_soa( func,
2889 4, /* aos_input */
2890 1, /* machine */
2891 5, /* num_inputs */
2892 6 ); /* input_stride */
2893 }
2894
2895 x86_mov(
2896 func,
2897 get_machine_base(),
2898 x86_fn_arg( func, 1 ) );
2899 x86_mov(
2900 func,
2901 get_const_base(),
2902 x86_fn_arg( func, 2 ) );
2903 x86_mov(
2904 func,
2905 get_immediate_base(),
2906 x86_fn_arg( func, 3 ) );
2907
2908 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
2909 x86_mov(
2910 func,
2911 get_coef_base(),
2912 x86_fn_arg( func, 4 ) );
2913 }
2914
2915 x86_mov(
2916 func,
2917 get_sampler_base(),
2918 x86_make_disp( get_machine_base(),
2919 Offset( struct tgsi_exec_machine, Samplers ) ) );
2920
2921
2922 while( !tgsi_parse_end_of_tokens( &parse ) && ok ) {
2923 tgsi_parse_token( &parse );
2924
2925 switch( parse.FullToken.Token.Type ) {
2926 case TGSI_TOKEN_TYPE_DECLARATION:
2927 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
2928 emit_declaration(
2929 func,
2930 &parse.FullToken.FullDeclaration );
2931 }
2932 break;
2933
2934 case TGSI_TOKEN_TYPE_INSTRUCTION:
2935 ok = emit_instruction(
2936 func,
2937 &parse.FullToken.FullInstruction );
2938
2939 if (!ok) {
2940 debug_printf("failed to translate tgsi opcode %d to SSE (%s)\n",
2941 parse.FullToken.FullInstruction.Instruction.Opcode,
2942 parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX ?
2943 "vertex shader" : "fragment shader");
2944 }
2945 break;
2946
2947 case TGSI_TOKEN_TYPE_IMMEDIATE:
2948 /* simply copy the immediate values into the next immediates[] slot */
2949 {
2950 const uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
2951 uint i;
2952 assert(size <= 4);
2953 assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES);
2954 for( i = 0; i < size; i++ ) {
2955 immediates[num_immediates][i] =
2956 parse.FullToken.FullImmediate.u.ImmediateFloat32[i].Float;
2957 }
2958 #if 0
2959 debug_printf("SSE FS immediate[%d] = %f %f %f %f\n",
2960 num_immediates,
2961 immediates[num_immediates][0],
2962 immediates[num_immediates][1],
2963 immediates[num_immediates][2],
2964 immediates[num_immediates][3]);
2965 #endif
2966 num_immediates++;
2967 }
2968 break;
2969
2970 default:
2971 ok = 0;
2972 assert( 0 );
2973 }
2974 }
2975
2976 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
2977 if (do_swizzles)
2978 soa_to_aos( func,
2979 7, /* aos_output */
2980 1, /* machine */
2981 8, /* num_outputs */
2982 9 ); /* output_stride */
2983 }
2984
2985 /* Can't just use EBX, EDI without save/restoring them:
2986 */
2987 x86_pop( func, x86_make_reg( file_REG32, reg_DI ) );
2988 x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
2989
2990 emit_ret( func );
2991
2992 tgsi_parse_free( &parse );
2993
2994 return ok;
2995 }
2996
2997 #endif /* PIPE_ARCH_X86 */
2998