/**************************************************************************
 *
 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

#include "pipe/p_config.h"

#if defined(PIPE_ARCH_X86)

#include "util/u_debug.h"
#include "pipe/p_shader_tokens.h"
#include "util/u_math.h"
#include "util/u_memory.h"
#if defined(PIPE_ARCH_SSE)
#include "util/u_sse.h"
#endif
#include "tgsi/tgsi_parse.h"
#include "tgsi/tgsi_util.h"
#include "tgsi_exec.h"
#include "tgsi_sse2.h"

#include "rtasm/rtasm_x86sse.h"

/* for 1/sqrt()
 *
 * This costs about 100fps (close to 10%) in gears:
 */
#define HIGH_PRECISION 1

#define FAST_MATH 1

#define FOR_EACH_CHANNEL( CHAN )\
   for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)

#define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
   ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))

#define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
   if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))

#define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
   FOR_EACH_CHANNEL( CHAN )\
      IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )

#define CHAN_X 0
#define CHAN_Y 1
#define CHAN_Z 2
#define CHAN_W 3

#define TEMP_ONE_I TGSI_EXEC_TEMP_ONE_I
#define TEMP_ONE_C TGSI_EXEC_TEMP_ONE_C

#define TEMP_R0   TGSI_EXEC_TEMP_R0
#define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
#define TEMP_EXEC_MASK_I TGSI_EXEC_MASK_I
#define TEMP_EXEC_MASK_C TGSI_EXEC_MASK_C


/**
 * X86 utility functions.
 */

static struct x86_reg
make_xmm(
   unsigned xmm )
{
   return x86_make_reg(
      file_XMM,
      (enum x86_reg_name) xmm );
}

/**
 * X86 register mapping helpers.
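 *
 * As the helpers below show, the generated code expects the caller to have
 * preloaded: eax = constant buffer, ecx = the tgsi_exec_machine, ebx =
 * interpolation coefficients, edx = immediates, edi = sampler pointers.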
 */

static struct x86_reg
get_const_base( void )
{
   return x86_make_reg(
      file_REG32,
      reg_AX );
}

static struct x86_reg
get_machine_base( void )
{
   return x86_make_reg(
      file_REG32,
      reg_CX );
}

static struct x86_reg
get_input_base( void )
{
   return x86_make_disp(
      get_machine_base(),
      Offset(struct tgsi_exec_machine, Inputs) );
}

static struct x86_reg
get_output_base( void )
{
   return x86_make_disp(
      get_machine_base(),
      Offset(struct tgsi_exec_machine, Outputs) );
}

static struct x86_reg
get_temp_base( void )
{
   return x86_make_disp(
      get_machine_base(),
      Offset(struct tgsi_exec_machine, Temps) );
}

static struct x86_reg
get_coef_base( void )
{
   return x86_make_reg(
      file_REG32,
      reg_BX );
}

static struct x86_reg
get_sampler_base( void )
{
   return x86_make_reg(
      file_REG32,
      reg_DI );
}

static struct x86_reg
get_immediate_base( void )
{
   return x86_make_reg(
      file_REG32,
      reg_DX );
}


/**
 * Data access helpers.
 */

static struct x86_reg
get_immediate(
   unsigned vec,
   unsigned chan )
{
   return x86_make_disp(
      get_immediate_base(),
      (vec * 4 + chan) * 4 );
}

static struct x86_reg
get_const(
   unsigned vec,
   unsigned chan )
{
   return x86_make_disp(
      get_const_base(),
      (vec * 4 + chan) * 4 );
}

static struct x86_reg
get_sampler_ptr(
   unsigned unit )
{
   return x86_make_disp(
      get_sampler_base(),
      unit * sizeof( struct tgsi_sampler * ) );
}

static struct x86_reg
get_input(
   unsigned vec,
   unsigned chan )
{
   return x86_make_disp(
      get_input_base(),
      (vec * 4 + chan) * 16 );
}

static struct x86_reg
get_output(
   unsigned vec,
   unsigned chan )
{
   return x86_make_disp(
      get_output_base(),
      (vec * 4 + chan) * 16 );
}

static struct x86_reg
get_temp(
   unsigned vec,
   unsigned chan )
{
   return x86_make_disp(
      get_temp_base(),
      (vec * 4 + chan) * 16 );
}

static struct x86_reg
get_coef(
   unsigned vec,
   unsigned chan,
   unsigned member )
{
   return x86_make_disp(
      get_coef_base(),
      ((vec * 3 + member) * 4 + chan) * 4 );
}


static void
emit_ret(
   struct x86_function *func )
{
   x86_ret( func );
}


/**
 * Data fetch helpers.
 */

/**
 * Copy a shader constant to xmm register
 * \param xmm the destination xmm register
 * \param vec the src const buffer index
 * \param chan src channel to fetch (X, Y, Z or W)
 */
static void
emit_const(
   struct x86_function *func,
   uint xmm,
   int vec,
   uint chan,
   uint indirect,
   uint indirectFile,
   int indirectIndex )
{
   if (indirect) {
      /* 'vec' is the offset from the address register's value.
       * We're loading CONST[ADDR+vec] into an xmm register.
       */
      struct x86_reg r0 = get_immediate_base();
      struct x86_reg r1 = get_coef_base();
      uint i;

      assert( indirectFile == TGSI_FILE_ADDRESS );
      assert( indirectIndex == 0 );
      assert( r0.mod == mod_REG );
      assert( r1.mod == mod_REG );

      x86_push( func, r0 );
      x86_push( func, r1 );

      /*
       * Loop over the four pixels or vertices in the quad.
       * Get the value of the address (offset) register for pixel/vertex[i],
       * add it to the src offset and index into the constant buffer.
       * Note that we're working on SOA data.
       * If any of the pixel/vertex execution channels are unused their
       * values will be garbage.  It's very important that we don't use
       * those garbage values as indexes into the constant buffer since
       * that'll cause segfaults.
       * The solution is to bitwise-AND the offset with the execution mask
       * register whose values are either 0 or ~0.
       * The caller must setup the execution mask register to indicate
       * which channels are valid/alive before running the shader.
       * The execution mask will also figure into loops and conditionals
       * someday.
       */
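      /* In effect, for each quad element i the loop below computes
       *    temp_R0[i] = CONST[ vec + (addr[i] & exec_mask[i]) ].chan
       * with constants laid out as vec4s, 16 bytes apart.
       */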
      for (i = 0; i < QUAD_SIZE; i++) {
         /* r1 = address register[i] */
         x86_mov( func, r1, x86_make_disp( get_temp( TEMP_ADDR, CHAN_X ), i * 4 ) );
         /* r0 = execution mask[i] */
         x86_mov( func, r0, x86_make_disp( get_temp( TEMP_EXEC_MASK_I, TEMP_EXEC_MASK_C ), i * 4 ) );
         /* r1 = r1 & r0 */
         x86_and( func, r1, r0 );
         /* r0 = 'vec', the offset */
         x86_lea( func, r0, get_const( vec, chan ) );

         /* Quick hack to multiply r1 by 16 -- need to add SHL to rtasm.
          */
         x86_add( func, r1, r1 );
         x86_add( func, r1, r1 );
         x86_add( func, r1, r1 );
         x86_add( func, r1, r1 );

         x86_add( func, r0, r1 ); /* r0 = r0 + r1 */
         x86_mov( func, r1, x86_deref( r0 ) );
         x86_mov( func, x86_make_disp( get_temp( TEMP_R0, CHAN_X ), i * 4 ), r1 );
      }

      x86_pop( func, r1 );
      x86_pop( func, r0 );

      sse_movaps(
         func,
         make_xmm( xmm ),
         get_temp( TEMP_R0, CHAN_X ) );
   }
   else {
      /* 'vec' is the index into the src register file, such as TEMP[vec] */
      assert( vec >= 0 );

      sse_movss(
         func,
         make_xmm( xmm ),
         get_const( vec, chan ) );
      sse_shufps(
         func,
         make_xmm( xmm ),
         make_xmm( xmm ),
         SHUF( 0, 0, 0, 0 ) );
   }
}

static void
emit_immediate(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   sse_movss(
      func,
      make_xmm( xmm ),
      get_immediate( vec, chan ) );
   sse_shufps(
      func,
      make_xmm( xmm ),
      make_xmm( xmm ),
      SHUF( 0, 0, 0, 0 ) );
}


/**
 * Copy a shader input to xmm register
 * \param xmm the destination xmm register
 * \param vec the src input attrib
 * \param chan src channel to fetch (X, Y, Z or W)
 */
static void
emit_inputf(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   sse_movups(
      func,
      make_xmm( xmm ),
      get_input( vec, chan ) );
}

/**
 * Store an xmm register to a shader output
 * \param xmm the source xmm register
 * \param vec the dest output attrib
 * \param chan the dest channel to store (X, Y, Z or W)
 */
static void
emit_output(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   sse_movups(
      func,
      get_output( vec, chan ),
      make_xmm( xmm ) );
}

/**
 * Copy a shader temporary to xmm register
 * \param xmm the destination xmm register
 * \param vec the src temp register
 * \param chan src channel to fetch (X, Y, Z or W)
 */
static void
emit_tempf(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   sse_movaps(
      func,
      make_xmm( xmm ),
      get_temp( vec, chan ) );
}

/**
 * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
 * \param xmm the destination xmm register
 * \param vec the src input/attribute coefficient index
 * \param chan src channel to fetch (X, Y, Z or W)
 * \param member 0=a0, 1=dadx, 2=dady
 */
static void
emit_coef(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan,
   unsigned member )
{
   sse_movss(
      func,
      make_xmm( xmm ),
      get_coef( vec, chan, member ) );
   sse_shufps(
      func,
      make_xmm( xmm ),
      make_xmm( xmm ),
      SHUF( 0, 0, 0, 0 ) );
}

/**
 * Data store helpers.
 */

static void
emit_inputs(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   sse_movups(
      func,
      get_input( vec, chan ),
      make_xmm( xmm ) );
}

static void
emit_temps(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   sse_movaps(
      func,
      get_temp( vec, chan ),
      make_xmm( xmm ) );
}

static void
emit_addrs(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   assert( vec == 0 );

   emit_temps(
      func,
      xmm,
      vec + TGSI_EXEC_TEMP_ADDR,
      chan );
}

/**
 * Coefficient fetch helpers.
 */

static void
emit_coef_a0(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   emit_coef(
      func,
      xmm,
      vec,
      chan,
      0 );
}

static void
emit_coef_dadx(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   emit_coef(
      func,
      xmm,
      vec,
      chan,
      1 );
}

static void
emit_coef_dady(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   emit_coef(
      func,
      xmm,
      vec,
      chan,
      2 );
}

/**
 * Function call helpers.
 */

/**
 * NOTE: In gcc, if the called function uses SSE intrinsics, it must be
 * declared with __attribute__((force_align_arg_pointer)), as we do not
 * guarantee that the stack pointer is 16-byte aligned as it expects.
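 *
 * This helper saves eax/ecx/edx (the cdecl caller-saved GP registers) and
 * the XMM registers named in xmm_save_mask, pushes a pointer to each arg
 * buffer, and calls 'code' through ecx.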
 */
static void
emit_func_call(
   struct x86_function *func,
   unsigned xmm_save_mask,
   const struct x86_reg *arg,
   unsigned nr_args,
   void (PIPE_CDECL *code)() )
{
   struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
   unsigned i, n;

   x86_push(
      func,
      x86_make_reg( file_REG32, reg_AX) );
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_CX) );
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_DX) );

   /* Store XMM regs to the stack
    */
   for(i = 0, n = 0; i < 8; ++i)
      if(xmm_save_mask & (1 << i))
         ++n;

   x86_sub_imm(
      func,
      x86_make_reg( file_REG32, reg_SP ),
      n*16);

   for(i = 0, n = 0; i < 8; ++i)
      if(xmm_save_mask & (1 << i)) {
         sse_movups(
            func,
            x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ),
            make_xmm( i ) );
         ++n;
      }

   for (i = 0; i < nr_args; i++) {
      /* Load the address of the buffer we use for passing arguments and
       * receiving results:
       */
      x86_lea(
         func,
         ecx,
         arg[i] );

      /* Push actual function arguments (currently just the pointer to
       * the buffer above), and call the function:
       */
      x86_push( func, ecx );
   }

   x86_mov_reg_imm( func, ecx, (unsigned long) code );
   x86_call( func, ecx );

   /* Pop the arguments (or just add an immediate to esp)
    */
   for (i = 0; i < nr_args; i++) {
      x86_pop(func, ecx );
   }

   /* Pop the saved XMM regs:
    */
   for(i = 0, n = 0; i < 8; ++i)
      if(xmm_save_mask & (1 << i)) {
         sse_movups(
            func,
            make_xmm( i ),
            x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ) );
         ++n;
      }

   x86_add_imm(
      func,
      x86_make_reg( file_REG32, reg_SP ),
      n*16);

   /* Restore GP registers in reverse order.
    */
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_DX) );
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_CX) );
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_AX) );
}

static void
emit_func_call_dst_src1(
   struct x86_function *func,
   unsigned xmm_save,
   unsigned xmm_dst,
   unsigned xmm_src0,
   void (PIPE_CDECL *code)() )
{
   struct x86_reg store = get_temp( TEMP_R0, 0 );
   unsigned xmm_mask = ((1 << xmm_save) - 1) & ~(1 << xmm_dst);

   /* Store our input parameters (in xmm regs) to the buffer we use
    * for passing arguments.  We will pass a pointer to this buffer as
    * the actual function argument.
    */
   sse_movaps(
      func,
      store,
      make_xmm( xmm_src0 ) );

   emit_func_call( func,
                   xmm_mask,
                   &store,
                   1,
                   code );

   sse_movaps(
      func,
      make_xmm( xmm_dst ),
      store );
}


static void
emit_func_call_dst_src2(
   struct x86_function *func,
   unsigned xmm_save,
   unsigned xmm_dst,
   unsigned xmm_src0,
   unsigned xmm_src1,
   void (PIPE_CDECL *code)() )
{
   struct x86_reg store = get_temp( TEMP_R0, 0 );
   unsigned xmm_mask = ((1 << xmm_save) - 1) & ~(1 << xmm_dst);

   /* Store two inputs to parameter buffer.
    */
   sse_movaps(
      func,
      store,
      make_xmm( xmm_src0 ) );

   sse_movaps(
      func,
      x86_make_disp( store, 4 * sizeof(float) ),
      make_xmm( xmm_src1 ) );


   /* Emit the call
    */
   emit_func_call( func,
                   xmm_mask,
                   &store,
                   1,
                   code );

   /* Retrieve the results:
    */
   sse_movaps(
      func,
      make_xmm( xmm_dst ),
      store );
}




#if defined(PIPE_ARCH_SSE)

/*
 * Fast SSE2 implementation of special math functions.
 */

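/* Each POLYn(x, c0..cn) evaluates the degree-n polynomial
 * c0 + c1*x + ... + cn*x^n with Horner's rule, using only SSE adds and
 * multiplies.
 */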
#define POLY0(x, c0) _mm_set1_ps(c0)
#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))

#define EXP_POLY_DEGREE 3
#define LOG_POLY_DEGREE 5

/**
 * See http://www.devmaster.net/forums/showthread.php?p=43580
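 *
 * Approximates 2^x by splitting x into an integer part (realized exactly
 * via the float exponent field) and a fractional part in [-0.5, 0.5],
 * which is handled by the minimax polynomial selected below.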
 */
static INLINE __m128
exp2f4(__m128 x)
{
   __m128i ipart;
   __m128 fpart, expipart, expfpart;

   x = _mm_min_ps(x, _mm_set1_ps( 129.00000f));
   x = _mm_max_ps(x, _mm_set1_ps(-126.99999f));

   /* ipart = int(x - 0.5) */
   ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f)));

   /* fpart = x - ipart */
   fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart));

   /* expipart = (float) (1 << ipart) */
   expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23));

   /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
#if EXP_POLY_DEGREE == 5
   expfpart = POLY5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f);
#elif EXP_POLY_DEGREE == 4
   expfpart = POLY4(fpart, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 5.2011464e-2f, 1.3534167e-2f);
#elif EXP_POLY_DEGREE == 3
   expfpart = POLY3(fpart, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 7.8024521e-2f);
#elif EXP_POLY_DEGREE == 2
   expfpart = POLY2(fpart, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f);
#else
#error
#endif

   return _mm_mul_ps(expipart, expfpart);
}


/**
 * See http://www.devmaster.net/forums/showthread.php?p=43580
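 *
 * Approximates log2(x) for x > 0 as exponent(x) plus a polynomial in the
 * mantissa; the (mant - 1) factor below forces log2(1) == 0 exactly.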
 */
static INLINE __m128
log2f4(__m128 x)
{
   __m128i expmask = _mm_set1_epi32(0x7f800000);
   __m128i mantmask = _mm_set1_epi32(0x007fffff);
   __m128 one = _mm_set1_ps(1.0f);

   __m128i i = _mm_castps_si128(x);

   /* exp = (float) exponent(x) */
   __m128 exp = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, expmask), 23), _mm_set1_epi32(127)));

   /* mant = (float) mantissa(x) */
   __m128 mant = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, mantmask)), one);

   __m128 logmant;

   /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
    * These coefficients can be generated with
    * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
    */
#if LOG_POLY_DEGREE == 6
   logmant = POLY5(mant, 3.11578814719469302614f, -3.32419399085241980044f, 2.59883907202499966007f, -1.23152682416275988241f, 0.318212422185251071475f, -0.0344359067839062357313f);
#elif LOG_POLY_DEGREE == 5
   logmant = POLY4(mant, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
   logmant = POLY3(mant, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
   logmant = POLY2(mant, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
#else
#error
#endif

   /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0 */
   logmant = _mm_mul_ps(logmant, _mm_sub_ps(mant, one));

   return _mm_add_ps(logmant, exp);
}

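/* x^y computed as 2^(y * log2(x)); like the scalar identity, this is only
 * well-defined for x > 0.
 */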
static INLINE __m128
powf4(__m128 x, __m128 y)
{
   return exp2f4(_mm_mul_ps(log2f4(x), y));
}

#endif /* PIPE_ARCH_SSE */



/**
 * Low-level instruction translators.
 */

static void
emit_abs(
   struct x86_function *func,
   unsigned xmm )
{
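   /* ANDing away the IEEE sign bit yields fabs() per channel. */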
   sse_andps(
      func,
      make_xmm( xmm ),
      get_temp(
         TGSI_EXEC_TEMP_7FFFFFFF_I,
         TGSI_EXEC_TEMP_7FFFFFFF_C ) );
}

static void
emit_add(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   sse_addps(
      func,
      make_xmm( xmm_dst ),
      make_xmm( xmm_src ) );
}

static void PIPE_CDECL
cos4f(
   float *store )
{
   store[0] = cosf( store[0] );
   store[1] = cosf( store[1] );
   store[2] = cosf( store[2] );
   store[3] = cosf( store[3] );
}

static void
emit_cos(
   struct x86_function *func,
   unsigned xmm_save,
   unsigned xmm_dst )
{
   emit_func_call_dst_src1(
      func,
      xmm_save,
      xmm_dst,
      xmm_dst,
      cos4f );
}

static void PIPE_CDECL
#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
__attribute__((force_align_arg_pointer))
#endif
ex24f(
   float *store )
{
#if defined(PIPE_ARCH_SSE)
   _mm_store_ps(&store[0], exp2f4( _mm_load_ps(&store[0]) ));
#else
   store[0] = util_fast_exp2( store[0] );
   store[1] = util_fast_exp2( store[1] );
   store[2] = util_fast_exp2( store[2] );
   store[3] = util_fast_exp2( store[3] );
#endif
}

static void
emit_ex2(
   struct x86_function *func,
   unsigned xmm_save,
   unsigned xmm_dst )
{
   emit_func_call_dst_src1(
      func,
      xmm_save,
      xmm_dst,
      xmm_dst,
      ex24f );
}

static void
emit_f2it(
   struct x86_function *func,
   unsigned xmm )
{
   sse2_cvttps2dq(
      func,
      make_xmm( xmm ),
      make_xmm( xmm ) );
}

static void
emit_i2f(
   struct x86_function *func,
   unsigned xmm )
{
   sse2_cvtdq2ps(
      func,
      make_xmm( xmm ),
      make_xmm( xmm ) );
}

static void PIPE_CDECL
flr4f(
   float *store )
{
   store[0] = floorf( store[0] );
   store[1] = floorf( store[1] );
   store[2] = floorf( store[2] );
   store[3] = floorf( store[3] );
}

static void
emit_flr(
   struct x86_function *func,
   unsigned xmm_save,
   unsigned xmm_dst )
{
   emit_func_call_dst_src1(
      func,
      xmm_save,
      xmm_dst,
      xmm_dst,
      flr4f );
}

static void PIPE_CDECL
frc4f(
   float *store )
{
   store[0] -= floorf( store[0] );
   store[1] -= floorf( store[1] );
   store[2] -= floorf( store[2] );
   store[3] -= floorf( store[3] );
}

static void
emit_frc(
   struct x86_function *func,
   unsigned xmm_save,
   unsigned xmm_dst )
{
   emit_func_call_dst_src1(
      func,
      xmm_save,
      xmm_dst,
      xmm_dst,
      frc4f );
}

static void PIPE_CDECL
#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
__attribute__((force_align_arg_pointer))
#endif
lg24f(
   float *store )
{
#if defined(PIPE_ARCH_SSE)
   _mm_store_ps(&store[0], log2f4( _mm_load_ps(&store[0]) ));
#else
   store[0] = util_fast_log2( store[0] );
   store[1] = util_fast_log2( store[1] );
   store[2] = util_fast_log2( store[2] );
   store[3] = util_fast_log2( store[3] );
#endif
}

static void
emit_lg2(
   struct x86_function *func,
   unsigned xmm_save,
   unsigned xmm_dst )
{
   emit_func_call_dst_src1(
      func,
      xmm_save,
      xmm_dst,
      xmm_dst,
      lg24f );
}

static void
emit_MOV(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   sse_movups(
      func,
      make_xmm( xmm_dst ),
      make_xmm( xmm_src ) );
}

static void
emit_mul( struct x86_function *func,
          unsigned xmm_dst,
          unsigned xmm_src )
{
   sse_mulps(
      func,
      make_xmm( xmm_dst ),
      make_xmm( xmm_src ) );
}

static void
emit_neg(
   struct x86_function *func,
   unsigned xmm )
{
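   /* Flipping the IEEE sign bit with XOR negates each channel. */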
   sse_xorps(
      func,
      make_xmm( xmm ),
      get_temp(
         TGSI_EXEC_TEMP_80000000_I,
         TGSI_EXEC_TEMP_80000000_C ) );
}

static void PIPE_CDECL
#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
__attribute__((force_align_arg_pointer))
#endif
pow4f(
   float *store )
{
#if defined(PIPE_ARCH_SSE)
   _mm_store_ps(&store[0], powf4( _mm_load_ps(&store[0]), _mm_load_ps(&store[4]) ));
#else
   store[0] = util_fast_pow( store[0], store[4] );
   store[1] = util_fast_pow( store[1], store[5] );
   store[2] = util_fast_pow( store[2], store[6] );
   store[3] = util_fast_pow( store[3], store[7] );
#endif
}

static void
emit_pow(
   struct x86_function *func,
   unsigned xmm_save,
   unsigned xmm_dst,
   unsigned xmm_src0,
   unsigned xmm_src1 )
{
   emit_func_call_dst_src2(
      func,
      xmm_save,
      xmm_dst,
      xmm_src0,
      xmm_src1,
      pow4f );
}

static void
emit_rcp (
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   /* On Intel CPUs at least, this is only accurate to 12 bits -- not
    * good enough.  Need to either emit a proper divide or use the
    * iterative technique described below in emit_rsqrt().
    */
   sse2_rcpps(
      func,
      make_xmm( xmm_dst ),
      make_xmm( xmm_src ) );
}

static void PIPE_CDECL
rnd4f(
   float *store )
{
   store[0] = floorf( store[0] + 0.5f );
   store[1] = floorf( store[1] + 0.5f );
   store[2] = floorf( store[2] + 0.5f );
   store[3] = floorf( store[3] + 0.5f );
}

static void
emit_rnd(
   struct x86_function *func,
   unsigned xmm_save,
   unsigned xmm_dst )
{
   emit_func_call_dst_src1(
      func,
      xmm_save,
      xmm_dst,
      xmm_dst,
      rnd4f );
}

static void
emit_rsqrt(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
#if HIGH_PRECISION
   /* Although rsqrtps() and rcpps() are low precision on some/all SSE
    * implementations, it is possible to improve their precision at
    * fairly low cost, using a Newton-Raphson step, as below:
    *
    * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
    * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a)) * rsqrtps(a)]
    *
    * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
    */
   {
      struct x86_reg dst = make_xmm( xmm_dst );
      struct x86_reg src = make_xmm( xmm_src );
      struct x86_reg tmp0 = make_xmm( 2 );
      struct x86_reg tmp1 = make_xmm( 3 );

      assert( xmm_dst != xmm_src );
      assert( xmm_dst != 2 && xmm_dst != 3 );
      assert( xmm_src != 2 && xmm_src != 3 );

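      /* dst = 0.5, tmp0 = 3.0, tmp1 = rsqrtps(src); the sequence below
       * leaves dst = 0.5 * tmp1 * (3.0 - src * tmp1 * tmp1), i.e. the
       * Newton-Raphson refinement above.  Note that xmm_src is clobbered.
       */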
      sse_movaps( func, dst, get_temp( TGSI_EXEC_TEMP_HALF_I, TGSI_EXEC_TEMP_HALF_C ) );
      sse_movaps( func, tmp0, get_temp( TGSI_EXEC_TEMP_THREE_I, TGSI_EXEC_TEMP_THREE_C ) );
      sse_rsqrtps( func, tmp1, src );
      sse_mulps( func, src, tmp1 );
      sse_mulps( func, dst, tmp1 );
      sse_mulps( func, src, tmp1 );
      sse_subps( func, tmp0, src );
      sse_mulps( func, dst, tmp0 );
   }
#else
   /* On Intel CPUs at least, this is only accurate to 12 bits -- not
    * good enough.
    */
   sse_rsqrtps(
      func,
      make_xmm( xmm_dst ),
      make_xmm( xmm_src ) );
#endif
}

static void
emit_setsign(
   struct x86_function *func,
   unsigned xmm )
{
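   /* ORing in the IEEE sign bit forces each channel negative. */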
   sse_orps(
      func,
      make_xmm( xmm ),
      get_temp(
         TGSI_EXEC_TEMP_80000000_I,
         TGSI_EXEC_TEMP_80000000_C ) );
}

static void PIPE_CDECL
sgn4f(
   float *store )
{
   store[0] = store[0] < 0.0f ? -1.0f : store[0] > 0.0f ? 1.0f : 0.0f;
   store[1] = store[1] < 0.0f ? -1.0f : store[1] > 0.0f ? 1.0f : 0.0f;
   store[2] = store[2] < 0.0f ? -1.0f : store[2] > 0.0f ? 1.0f : 0.0f;
   store[3] = store[3] < 0.0f ? -1.0f : store[3] > 0.0f ? 1.0f : 0.0f;
}

static void
emit_sgn(
   struct x86_function *func,
   unsigned xmm_save,
   unsigned xmm_dst )
{
   emit_func_call_dst_src1(
      func,
      xmm_save,
      xmm_dst,
      xmm_dst,
      sgn4f );
}

static void PIPE_CDECL
sin4f(
   float *store )
{
   store[0] = sinf( store[0] );
   store[1] = sinf( store[1] );
   store[2] = sinf( store[2] );
   store[3] = sinf( store[3] );
}

static void
emit_sin( struct x86_function *func,
          unsigned xmm_save,
          unsigned xmm_dst )
{
   emit_func_call_dst_src1(
      func,
      xmm_save,
      xmm_dst,
      xmm_dst,
      sin4f );
}

static void
emit_sub(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   sse_subps(
      func,
      make_xmm( xmm_dst ),
      make_xmm( xmm_src ) );
}




/**
 * Register fetch.
 */

static void
emit_fetch(
   struct x86_function *func,
   unsigned xmm,
   const struct tgsi_full_src_register *reg,
   const unsigned chan_index )
{
   unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index );

   switch (swizzle) {
   case TGSI_EXTSWIZZLE_X:
   case TGSI_EXTSWIZZLE_Y:
   case TGSI_EXTSWIZZLE_Z:
   case TGSI_EXTSWIZZLE_W:
      switch (reg->SrcRegister.File) {
      case TGSI_FILE_CONSTANT:
         emit_const(
            func,
            xmm,
            reg->SrcRegister.Index,
            swizzle,
            reg->SrcRegister.Indirect,
            reg->SrcRegisterInd.File,
            reg->SrcRegisterInd.Index );
         break;

      case TGSI_FILE_IMMEDIATE:
         emit_immediate(
            func,
            xmm,
            reg->SrcRegister.Index,
            swizzle );
         break;

      case TGSI_FILE_INPUT:
         emit_inputf(
            func,
            xmm,
            reg->SrcRegister.Index,
            swizzle );
         break;

      case TGSI_FILE_TEMPORARY:
         emit_tempf(
            func,
            xmm,
            reg->SrcRegister.Index,
            swizzle );
         break;

      default:
         assert( 0 );
      }
      break;

   case TGSI_EXTSWIZZLE_ZERO:
      emit_tempf(
         func,
         xmm,
         TGSI_EXEC_TEMP_00000000_I,
         TGSI_EXEC_TEMP_00000000_C );
      break;

   case TGSI_EXTSWIZZLE_ONE:
      emit_tempf(
         func,
         xmm,
         TEMP_ONE_I,
         TEMP_ONE_C );
      break;

   default:
      assert( 0 );
   }

   switch( tgsi_util_get_full_src_register_sign_mode( reg, chan_index ) ) {
   case TGSI_UTIL_SIGN_CLEAR:
      emit_abs( func, xmm );
      break;

   case TGSI_UTIL_SIGN_SET:
      emit_setsign( func, xmm );
      break;

   case TGSI_UTIL_SIGN_TOGGLE:
      emit_neg( func, xmm );
      break;

   case TGSI_UTIL_SIGN_KEEP:
      break;
   }
}

#define FETCH( FUNC, INST, XMM, INDEX, CHAN )\
   emit_fetch( FUNC, XMM, &(INST).FullSrcRegisters[INDEX], CHAN )

/**
 * Register store.
 */

static void
emit_store(
   struct x86_function *func,
   unsigned xmm,
   const struct tgsi_full_dst_register *reg,
   const struct tgsi_full_instruction *inst,
   unsigned chan_index )
{
   switch( reg->DstRegister.File ) {
   case TGSI_FILE_OUTPUT:
      emit_output(
         func,
         xmm,
         reg->DstRegister.Index,
         chan_index );
      break;

   case TGSI_FILE_TEMPORARY:
      emit_temps(
         func,
         xmm,
         reg->DstRegister.Index,
         chan_index );
      break;

   case TGSI_FILE_ADDRESS:
      emit_addrs(
         func,
         xmm,
         reg->DstRegister.Index,
         chan_index );
      break;

   default:
      assert( 0 );
   }

   switch( inst->Instruction.Saturate ) {
   case TGSI_SAT_NONE:
      break;

   case TGSI_SAT_ZERO_ONE:
      /* assert( 0 ); */
      break;

   case TGSI_SAT_MINUS_PLUS_ONE:
      assert( 0 );
      break;
   }
}

#define STORE( FUNC, INST, XMM, INDEX, CHAN )\
   emit_store( FUNC, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN )


static void PIPE_CDECL
fetch_texel( struct tgsi_sampler **sampler,
             float *store )
{
#if 0
   uint j;

   debug_printf("%s sampler: %p (%p) store: %p\n",
                __FUNCTION__,
                sampler, *sampler,
                store );

   debug_printf("lodbias %f\n", store[12]);

   for (j = 0; j < 4; j++)
      debug_printf("sample %d texcoord %f %f\n",
                   j,
                   store[0+j],
                   store[4+j]);
#endif

   {
      float rgba[NUM_CHANNELS][QUAD_SIZE];
      (*sampler)->get_samples(*sampler,
                              &store[0],
                              &store[4],
                              &store[8],
                              0.0f, /*store[12],  lodbias */
                              rgba);

      memcpy( store, rgba, 16 * sizeof(float));
   }

#if 0
   for (j = 0; j < 4; j++)
      debug_printf("sample %d result %f %f %f %f\n",
                   j,
                   store[0+j],
                   store[4+j],
                   store[8+j],
                   store[12+j]);
#endif
}

/**
 * High-level instruction translators.
 */

static void
emit_tex( struct x86_function *func,
          const struct tgsi_full_instruction *inst,
          boolean lodbias,
          boolean projected)
{
   const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index;
   struct x86_reg args[2];
   unsigned count;
   unsigned i;

   switch (inst->InstructionExtTexture.Texture) {
   case TGSI_TEXTURE_1D:
      count = 1;
      break;
   case TGSI_TEXTURE_2D:
   case TGSI_TEXTURE_RECT:
      count = 2;
      break;
   case TGSI_TEXTURE_SHADOW1D:
   case TGSI_TEXTURE_SHADOW2D:
   case TGSI_TEXTURE_SHADOWRECT:
   case TGSI_TEXTURE_3D:
   case TGSI_TEXTURE_CUBE:
      count = 3;
      break;
   default:
      assert(0);
      return;
   }

   if (lodbias) {
      FETCH( func, *inst, 3, 0, 3 );
   }
   else {
      emit_tempf(
         func,
         3,
         TGSI_EXEC_TEMP_00000000_I,
         TGSI_EXEC_TEMP_00000000_C );

   }

   /* store lodbias whether enabled or not -- fetch_texel currently
    * respects it always.
    */
   sse_movaps( func,
               get_temp( TEMP_R0, 3 ),
               make_xmm( 3 ) );


   if (projected) {
      FETCH( func, *inst, 3, 0, 3 );

      emit_rcp( func, 3, 3 );
   }

   for (i = 0; i < count; i++) {
      FETCH( func, *inst, i, 0, i );

      if (projected) {
         sse_mulps(
            func,
            make_xmm( i ),
            make_xmm( 3 ) );
      }

      /* Store in the argument buffer:
       */
      sse_movaps(
         func,
         get_temp( TEMP_R0, i ),
         make_xmm( i ) );
   }

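   /* TEMP_R0 now holds s in floats [0..3], t in [4..7], r in [8..11] (when
    * present) and lodbias in [12..15] -- the layout fetch_texel() expects.
    */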
   args[0] = get_temp( TEMP_R0, 0 );
   args[1] = get_sampler_ptr( unit );


   emit_func_call( func,
                   0,
                   args,
                   Elements(args),
                   fetch_texel );

   /* If all four channels are enabled, could use a pointer to
    * dst[0].x instead of TEMP_R0 for store?
    */
   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, i ) {

      sse_movaps(
         func,
         make_xmm( 0 ),
         get_temp( TEMP_R0, i ) );

      STORE( func, *inst, 0, 0, i );
   }
}


static void
emit_kil(
   struct x86_function *func,
   const struct tgsi_full_src_register *reg )
{
   unsigned uniquemask;
   unsigned unique_count = 0;
   unsigned chan_index;
   unsigned i;

   /* This mask stores component bits that were already tested.  Note that
    * we test if the value is less than zero, so 1.0 and 0.0 need not be
    * tested.
    */
   uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE);

   FOR_EACH_CHANNEL( chan_index ) {
      unsigned swizzle;

      /* unswizzle channel */
      swizzle = tgsi_util_get_full_src_register_extswizzle(
         reg,
         chan_index );

      /* check if the component has not been already tested */
      if( !(uniquemask & (1 << swizzle)) ) {
         uniquemask |= 1 << swizzle;

         /* allocate register */
         emit_fetch(
            func,
            unique_count++,
            reg,
            chan_index );
      }
   }

   x86_push(
      func,
      x86_make_reg( file_REG32, reg_AX ) );
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_DX ) );

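   /* cmpps(<0) turns each channel into an all-ones/all-zeros mask and
    * movmskps packs the four sign bits, so eax accumulates one kill bit
    * per pixel of the quad.
    */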
   for (i = 0 ; i < unique_count; i++ ) {
      struct x86_reg dataXMM = make_xmm(i);

      sse_cmpps(
         func,
         dataXMM,
         get_temp(
            TGSI_EXEC_TEMP_00000000_I,
            TGSI_EXEC_TEMP_00000000_C ),
         cc_LessThan );

      if( i == 0 ) {
         sse_movmskps(
            func,
            x86_make_reg( file_REG32, reg_AX ),
            dataXMM );
      }
      else {
         sse_movmskps(
            func,
            x86_make_reg( file_REG32, reg_DX ),
            dataXMM );
         x86_or(
            func,
            x86_make_reg( file_REG32, reg_AX ),
            x86_make_reg( file_REG32, reg_DX ) );
      }
   }

   x86_or(
      func,
      get_temp(
         TGSI_EXEC_TEMP_KILMASK_I,
         TGSI_EXEC_TEMP_KILMASK_C ),
      x86_make_reg( file_REG32, reg_AX ) );

   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_DX ) );
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_AX ) );
}


static void
emit_kilp(
   struct x86_function *func )
{
   /* XXX todo / fix me */
}


static void
emit_setcc(
   struct x86_function *func,
   struct tgsi_full_instruction *inst,
   enum sse_cc cc )
{
   unsigned chan_index;

   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
      FETCH( func, *inst, 0, 0, chan_index );
      FETCH( func, *inst, 1, 1, chan_index );
      sse_cmpps(
         func,
         make_xmm( 0 ),
         make_xmm( 1 ),
         cc );
      sse_andps(
         func,
         make_xmm( 0 ),
         get_temp(
            TEMP_ONE_I,
            TEMP_ONE_C ) );
      STORE( func, *inst, 0, 0, chan_index );
   }
}

static void
emit_cmp(
   struct x86_function *func,
   struct tgsi_full_instruction *inst )
{
   unsigned chan_index;

   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
      FETCH( func, *inst, 0, 0, chan_index );
      FETCH( func, *inst, 1, 1, chan_index );
      FETCH( func, *inst, 2, 2, chan_index );
      sse_cmpps(
         func,
         make_xmm( 0 ),
         get_temp(
            TGSI_EXEC_TEMP_00000000_I,
            TGSI_EXEC_TEMP_00000000_C ),
         cc_LessThan );
      sse_andps(
         func,
         make_xmm( 1 ),
         make_xmm( 0 ) );
      sse_andnps(
         func,
         make_xmm( 0 ),
         make_xmm( 2 ) );
      sse_orps(
         func,
         make_xmm( 0 ),
         make_xmm( 1 ) );
      STORE( func, *inst, 0, 0, chan_index );
   }
}


/**
 * Check if inst src/dest regs use indirect addressing into temporary
 * register file.
 */
static boolean
indirect_temp_reference(const struct tgsi_full_instruction *inst)
{
   uint i;
   for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
      const struct tgsi_full_src_register *reg = &inst->FullSrcRegisters[i];
      if (reg->SrcRegister.File == TGSI_FILE_TEMPORARY &&
          reg->SrcRegister.Indirect)
         return TRUE;
   }
   for (i = 0; i < inst->Instruction.NumDstRegs; i++) {
      const struct tgsi_full_dst_register *reg = &inst->FullDstRegisters[i];
      if (reg->DstRegister.File == TGSI_FILE_TEMPORARY &&
          reg->DstRegister.Indirect)
         return TRUE;
   }
   return FALSE;
}


static int
emit_instruction(
   struct x86_function *func,
   struct tgsi_full_instruction *inst )
{
   unsigned chan_index;

   /* we can't handle indirect addressing into temp register file yet */
   if (indirect_temp_reference(inst))
      return FALSE;

   switch (inst->Instruction.Opcode) {
   case TGSI_OPCODE_ARL:
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         FETCH( func, *inst, 0, 0, chan_index );
         emit_flr(func, 0, 0);
         emit_f2it( func, 0 );
         STORE( func, *inst, 0, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_MOV:
   case TGSI_OPCODE_SWZ:
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         FETCH( func, *inst, 0, 0, chan_index );
         STORE( func, *inst, 0, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_LIT:
      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
         emit_tempf(
            func,
            0,
            TEMP_ONE_I,
            TEMP_ONE_C);
         if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
            STORE( func, *inst, 0, 0, CHAN_X );
         }
         if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
            STORE( func, *inst, 0, 0, CHAN_W );
         }
      }
      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
         if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
            FETCH( func, *inst, 0, 0, CHAN_X );
            sse_maxps(
               func,
               make_xmm( 0 ),
               get_temp(
                  TGSI_EXEC_TEMP_00000000_I,
                  TGSI_EXEC_TEMP_00000000_C ) );
            STORE( func, *inst, 0, 0, CHAN_Y );
         }
         if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
            /* XMM[1] = SrcReg[0].yyyy */
            FETCH( func, *inst, 1, 0, CHAN_Y );
            /* XMM[1] = max(XMM[1], 0) */
            sse_maxps(
               func,
               make_xmm( 1 ),
               get_temp(
                  TGSI_EXEC_TEMP_00000000_I,
                  TGSI_EXEC_TEMP_00000000_C ) );
            /* XMM[2] = SrcReg[0].wwww */
            FETCH( func, *inst, 2, 0, CHAN_W );
            /* XMM[2] = min(XMM[2], 128.0) */
            sse_minps(
               func,
               make_xmm( 2 ),
               get_temp(
                  TGSI_EXEC_TEMP_128_I,
                  TGSI_EXEC_TEMP_128_C ) );
            /* XMM[2] = max(XMM[2], -128.0) */
            sse_maxps(
               func,
               make_xmm( 2 ),
               get_temp(
                  TGSI_EXEC_TEMP_MINUS_128_I,
                  TGSI_EXEC_TEMP_MINUS_128_C ) );
            emit_pow( func, 3, 1, 1, 2 );
            FETCH( func, *inst, 0, 0, CHAN_X );
            sse_xorps(
               func,
               make_xmm( 2 ),
               make_xmm( 2 ) );
            sse_cmpps(
               func,
               make_xmm( 2 ),
               make_xmm( 0 ),
               cc_LessThan );
            sse_andps(
               func,
               make_xmm( 2 ),
               make_xmm( 1 ) );
            STORE( func, *inst, 2, 0, CHAN_Z );
         }
      }
      break;

   case TGSI_OPCODE_RCP:
      /* TGSI_OPCODE_RECIP */
      FETCH( func, *inst, 0, 0, CHAN_X );
      emit_rcp( func, 0, 0 );
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         STORE( func, *inst, 0, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_RSQ:
      /* TGSI_OPCODE_RECIPSQRT */
      FETCH( func, *inst, 0, 0, CHAN_X );
      emit_abs( func, 0 );
      emit_rsqrt( func, 1, 0 );
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         STORE( func, *inst, 1, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_EXP:
      if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
         FETCH( func, *inst, 0, 0, CHAN_X );
         if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
             IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
            emit_MOV( func, 1, 0 );
            emit_flr( func, 2, 1 );
            /* dst.x = ex2(floor(src.x)) */
            if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
               emit_MOV( func, 2, 1 );
               emit_ex2( func, 3, 2 );
               STORE( func, *inst, 2, 0, CHAN_X );
            }
            /* dst.y = src.x - floor(src.x) */
            if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
               emit_MOV( func, 2, 0 );
               emit_sub( func, 2, 1 );
               STORE( func, *inst, 2, 0, CHAN_Y );
            }
         }
         /* dst.z = ex2(src.x) */
         if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
            emit_ex2( func, 3, 0 );
            STORE( func, *inst, 0, 0, CHAN_Z );
         }
      }
      /* dst.w = 1.0 */
      if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
         emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
         STORE( func, *inst, 0, 0, CHAN_W );
      }
      break;

   case TGSI_OPCODE_LOG:
      if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
         FETCH( func, *inst, 0, 0, CHAN_X );
         emit_abs( func, 0 );
         emit_MOV( func, 1, 0 );
         emit_lg2( func, 2, 1 );
         /* dst.z = lg2(abs(src.x)) */
         if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
            STORE( func, *inst, 1, 0, CHAN_Z );
         }
         if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
             IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
            emit_flr( func, 2, 1 );
            /* dst.x = floor(lg2(abs(src.x))) */
            if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
               STORE( func, *inst, 1, 0, CHAN_X );
            }
            /* dst.y = abs(src)/ex2(floor(lg2(abs(src.x)))) */
            if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
               emit_ex2( func, 2, 1 );
               emit_rcp( func, 1, 1 );
               emit_mul( func, 0, 1 );
               STORE( func, *inst, 0, 0, CHAN_Y );
            }
         }
      }
      /* dst.w = 1.0 */
      if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
         emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
         STORE( func, *inst, 0, 0, CHAN_W );
      }
      break;

   case TGSI_OPCODE_MUL:
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         FETCH( func, *inst, 0, 0, chan_index );
         FETCH( func, *inst, 1, 1, chan_index );
         emit_mul( func, 0, 1 );
         STORE( func, *inst, 0, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_ADD:
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         FETCH( func, *inst, 0, 0, chan_index );
         FETCH( func, *inst, 1, 1, chan_index );
         emit_add( func, 0, 1 );
         STORE( func, *inst, 0, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_DP3:
      /* TGSI_OPCODE_DOT3 */
      FETCH( func, *inst, 0, 0, CHAN_X );
      FETCH( func, *inst, 1, 1, CHAN_X );
      emit_mul( func, 0, 1 );
      FETCH( func, *inst, 1, 0, CHAN_Y );
      FETCH( func, *inst, 2, 1, CHAN_Y );
      emit_mul( func, 1, 2 );
      emit_add( func, 0, 1 );
      FETCH( func, *inst, 1, 0, CHAN_Z );
      FETCH( func, *inst, 2, 1, CHAN_Z );
      emit_mul( func, 1, 2 );
      emit_add( func, 0, 1 );
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         STORE( func, *inst, 0, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_DP4:
      /* TGSI_OPCODE_DOT4 */
      FETCH( func, *inst, 0, 0, CHAN_X );
      FETCH( func, *inst, 1, 1, CHAN_X );
      emit_mul( func, 0, 1 );
      FETCH( func, *inst, 1, 0, CHAN_Y );
      FETCH( func, *inst, 2, 1, CHAN_Y );
      emit_mul( func, 1, 2 );
      emit_add( func, 0, 1 );
      FETCH( func, *inst, 1, 0, CHAN_Z );
      FETCH( func, *inst, 2, 1, CHAN_Z );
      emit_mul( func, 1, 2 );
      emit_add( func, 0, 1 );
      FETCH( func, *inst, 1, 0, CHAN_W );
      FETCH( func, *inst, 2, 1, CHAN_W );
      emit_mul( func, 1, 2 );
      emit_add( func, 0, 1 );
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         STORE( func, *inst, 0, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_DST:
      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
         emit_tempf(
            func,
            0,
            TEMP_ONE_I,
            TEMP_ONE_C );
         STORE( func, *inst, 0, 0, CHAN_X );
      }
      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
         FETCH( func, *inst, 0, 0, CHAN_Y );
         FETCH( func, *inst, 1, 1, CHAN_Y );
         emit_mul( func, 0, 1 );
         STORE( func, *inst, 0, 0, CHAN_Y );
      }
      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
         FETCH( func, *inst, 0, 0, CHAN_Z );
         STORE( func, *inst, 0, 0, CHAN_Z );
      }
      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
         FETCH( func, *inst, 0, 1, CHAN_W );
         STORE( func, *inst, 0, 0, CHAN_W );
      }
      break;

   case TGSI_OPCODE_MIN:
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         FETCH( func, *inst, 0, 0, chan_index );
         FETCH( func, *inst, 1, 1, chan_index );
         sse_minps(
            func,
            make_xmm( 0 ),
            make_xmm( 1 ) );
         STORE( func, *inst, 0, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_MAX:
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         FETCH( func, *inst, 0, 0, chan_index );
         FETCH( func, *inst, 1, 1, chan_index );
         sse_maxps(
            func,
            make_xmm( 0 ),
            make_xmm( 1 ) );
         STORE( func, *inst, 0, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_SLT:
      /* TGSI_OPCODE_SETLT */
      emit_setcc( func, inst, cc_LessThan );
      break;

   case TGSI_OPCODE_SGE:
      /* TGSI_OPCODE_SETGE */
      emit_setcc( func, inst, cc_NotLessThan );
      break;

   case TGSI_OPCODE_MAD:
      /* TGSI_OPCODE_MADD */
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         FETCH( func, *inst, 0, 0, chan_index );
         FETCH( func, *inst, 1, 1, chan_index );
         FETCH( func, *inst, 2, 2, chan_index );
         emit_mul( func, 0, 1 );
         emit_add( func, 0, 2 );
         STORE( func, *inst, 0, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_SUB:
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         FETCH( func, *inst, 0, 0, chan_index );
         FETCH( func, *inst, 1, 1, chan_index );
         emit_sub( func, 0, 1 );
         STORE( func, *inst, 0, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_LRP:
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         FETCH( func, *inst, 0, 0, chan_index );
         FETCH( func, *inst, 1, 1, chan_index );
         FETCH( func, *inst, 2, 2, chan_index );
         emit_sub( func, 1, 2 );
         emit_mul( func, 0, 1 );
         emit_add( func, 0, 2 );
         STORE( func, *inst, 0, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_CND:
      return 0;
      break;

   case TGSI_OPCODE_CND0:
      return 0;
      break;

   case TGSI_OPCODE_DP2A:
      FETCH( func, *inst, 0, 0, CHAN_X );  /* xmm0 = src[0].x */
      FETCH( func, *inst, 1, 1, CHAN_X );  /* xmm1 = src[1].x */
      emit_mul( func, 0, 1 );              /* xmm0 = xmm0 * xmm1 */
      FETCH( func, *inst, 1, 0, CHAN_Y );  /* xmm1 = src[0].y */
      FETCH( func, *inst, 2, 1, CHAN_Y );  /* xmm2 = src[1].y */
      emit_mul( func, 1, 2 );              /* xmm1 = xmm1 * xmm2 */
      emit_add( func, 0, 1 );              /* xmm0 = xmm0 + xmm1 */
      FETCH( func, *inst, 1, 2, CHAN_X );  /* xmm1 = src[2].x */
      emit_add( func, 0, 1 );              /* xmm0 = xmm0 + xmm1 */
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         STORE( func, *inst, 0, 0, chan_index );  /* dest[ch] = xmm0 */
      }
      break;

   case TGSI_OPCODE_FRC:
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         FETCH( func, *inst, 0, 0, chan_index );
         emit_frc( func, 0, 0 );
         STORE( func, *inst, 0, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_CLAMP:
      return 0;
      break;

   case TGSI_OPCODE_FLR:
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         FETCH( func, *inst, 0, 0, chan_index );
         emit_flr( func, 0, 0 );
         STORE( func, *inst, 0, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_ROUND:
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         FETCH( func, *inst, 0, 0, chan_index );
         emit_rnd( func, 0, 0 );
         STORE( func, *inst, 0, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_EX2:
      FETCH( func, *inst, 0, 0, CHAN_X );
      emit_ex2( func, 0, 0 );
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         STORE( func, *inst, 0, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_LG2:
      FETCH( func, *inst, 0, 0, CHAN_X );
      emit_lg2( func, 0, 0 );
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         STORE( func, *inst, 0, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_POW:
      FETCH( func, *inst, 0, 0, CHAN_X );
      FETCH( func, *inst, 1, 1, CHAN_X );
      emit_pow( func, 0, 0, 0, 1 );
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         STORE( func, *inst, 0, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_XPD:
      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
         FETCH( func, *inst, 1, 1, CHAN_Z );
         FETCH( func, *inst, 3, 0, CHAN_Z );
      }
      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
         FETCH( func, *inst, 0, 0, CHAN_Y );
         FETCH( func, *inst, 4, 1, CHAN_Y );
      }
      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
         emit_MOV( func, 2, 0 );
         emit_mul( func, 2, 1 );
         emit_MOV( func, 5, 3 );
         emit_mul( func, 5, 4 );
         emit_sub( func, 2, 5 );
         STORE( func, *inst, 2, 0, CHAN_X );
      }
      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
         FETCH( func, *inst, 2, 1, CHAN_X );
         FETCH( func, *inst, 5, 0, CHAN_X );
      }
      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
         emit_mul( func, 3, 2 );
         emit_mul( func, 1, 5 );
         emit_sub( func, 3, 1 );
         STORE( func, *inst, 3, 0, CHAN_Y );
      }
      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
         emit_mul( func, 5, 4 );
         emit_mul( func, 0, 2 );
         emit_sub( func, 5, 0 );
         STORE( func, *inst, 5, 0, CHAN_Z );
      }
      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
         emit_tempf(
            func,
            0,
            TEMP_ONE_I,
            TEMP_ONE_C );
         STORE( func, *inst, 0, 0, CHAN_W );
      }
      break;

   case TGSI_OPCODE_ABS:
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         FETCH( func, *inst, 0, 0, chan_index );
         emit_abs( func, 0 );

         STORE( func, *inst, 0, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_RCC:
      return 0;
      break;

   case TGSI_OPCODE_DPH:
      FETCH( func, *inst, 0, 0, CHAN_X );
      FETCH( func, *inst, 1, 1, CHAN_X );
      emit_mul( func, 0, 1 );
      FETCH( func, *inst, 1, 0, CHAN_Y );
      FETCH( func, *inst, 2, 1, CHAN_Y );
      emit_mul( func, 1, 2 );
      emit_add( func, 0, 1 );
      FETCH( func, *inst, 1, 0, CHAN_Z );
      FETCH( func, *inst, 2, 1, CHAN_Z );
      emit_mul( func, 1, 2 );
      emit_add( func, 0, 1 );
      FETCH( func, *inst, 1, 1, CHAN_W );
      emit_add( func, 0, 1 );
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         STORE( func, *inst, 0, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_COS:
      FETCH( func, *inst, 0, 0, CHAN_X );
      emit_cos( func, 0, 0 );
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         STORE( func, *inst, 0, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_DDX:
      return 0;
      break;

   case TGSI_OPCODE_DDY:
      return 0;
      break;

   case TGSI_OPCODE_KILP:
      /* predicated kill */
      emit_kilp( func );
      return 0; /* XXX fix me */
      break;

   case TGSI_OPCODE_KIL:
      /* conditional kill */
      emit_kil( func, &inst->FullSrcRegisters[0] );
      break;

   case TGSI_OPCODE_PK2H:
      return 0;
      break;

   case TGSI_OPCODE_PK2US:
      return 0;
      break;

   case TGSI_OPCODE_PK4B:
      return 0;
      break;

   case TGSI_OPCODE_PK4UB:
      return 0;
      break;

   case TGSI_OPCODE_RFL:
      return 0;
      break;

   case TGSI_OPCODE_SEQ:
      return 0;
      break;

   case TGSI_OPCODE_SFL:
      return 0;
      break;

   case TGSI_OPCODE_SGT:
      return 0;
      break;

   case TGSI_OPCODE_SIN:
      FETCH( func, *inst, 0, 0, CHAN_X );
      emit_sin( func, 0, 0 );
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         STORE( func, *inst, 0, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_SLE:
      return 0;
      break;

   case TGSI_OPCODE_SNE:
      return 0;
      break;

   case TGSI_OPCODE_STR:
      return 0;
      break;

   case TGSI_OPCODE_TEX:
      emit_tex( func, inst, FALSE, FALSE );
      break;

   case TGSI_OPCODE_TXD:
      return 0;
      break;

   case TGSI_OPCODE_UP2H:
      return 0;
      break;

   case TGSI_OPCODE_UP2US:
      return 0;
      break;

   case TGSI_OPCODE_UP4B:
      return 0;
      break;

   case TGSI_OPCODE_UP4UB:
      return 0;
      break;

   case TGSI_OPCODE_X2D:
      return 0;
      break;

   case TGSI_OPCODE_ARA:
      return 0;
      break;

   case TGSI_OPCODE_ARR:
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         FETCH( func, *inst, 0, 0, chan_index );
         emit_rnd( func, 0, 0 );
         emit_f2it( func, 0 );
         STORE( func, *inst, 0, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_BRA:
      return 0;
      break;

   case TGSI_OPCODE_CAL:
      return 0;
      break;

   case TGSI_OPCODE_RET:
      emit_ret( func );
      break;

   case TGSI_OPCODE_END:
      break;

   case TGSI_OPCODE_SSG:
      /* TGSI_OPCODE_SGN */
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         FETCH( func, *inst, 0, 0, chan_index );
         emit_sgn( func, 0, 0 );
         STORE( func, *inst, 0, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_CMP:
      emit_cmp( func, inst );
      break;

   case TGSI_OPCODE_SCS:
      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
         FETCH( func, *inst, 0, 0, CHAN_X );
         emit_cos( func, 0, 0 );
         STORE( func, *inst, 0, 0, CHAN_X );
      }
      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
         FETCH( func, *inst, 0, 0, CHAN_X );
         emit_sin( func, 0, 0 );
         STORE( func, *inst, 0, 0, CHAN_Y );
      }
      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
         emit_tempf(
            func,
            0,
            TGSI_EXEC_TEMP_00000000_I,
            TGSI_EXEC_TEMP_00000000_C );
         STORE( func, *inst, 0, 0, CHAN_Z );
      }
      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
         emit_tempf(
            func,
            0,
            TEMP_ONE_I,
            TEMP_ONE_C );
         STORE( func, *inst, 0, 0, CHAN_W );
      }
      break;

   case TGSI_OPCODE_TXB:
      emit_tex( func, inst, TRUE, FALSE );
      break;

2414 case TGSI_OPCODE_NRM:
2415 /* fall-through */
2416 case TGSI_OPCODE_NRM4:
2417 /* 3 or 4-component normalization */
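      /* The scale factor 1/sqrt(x*x + y*y + z*z [+ w*w]) is computed once
       * in xmm1 and applied to each enabled component; for the 3-component
       * NRM, dst.w is instead set to 1.0 below.
       */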
2418 {
2419 uint dims = (inst->Instruction.Opcode == TGSI_OPCODE_NRM) ? 3 : 4;
2420
2421 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) ||
2422 IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y) ||
2423 IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z) ||
2424 (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 4)) {
2425
2426 /* NOTE: Cannot use xmm regs 2/3 here (see emit_rsqrt() above). */
2427
2428 /* xmm4 = src.x */
2429 /* xmm0 = src.x * src.x */
2430 FETCH(func, *inst, 0, 0, CHAN_X);
2431 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
2432 emit_MOV(func, 4, 0);
2433 }
2434 emit_mul(func, 0, 0);
2435
2436 /* xmm5 = src.y */
2437 /* xmm0 = xmm0 + src.y * src.y */
2438 FETCH(func, *inst, 1, 0, CHAN_Y);
2439 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2440 emit_MOV(func, 5, 1);
2441 }
2442 emit_mul(func, 1, 1);
2443 emit_add(func, 0, 1);
2444
2445 /* xmm6 = src.z */
2446 /* xmm0 = xmm0 + src.z * src.z */
2447 FETCH(func, *inst, 1, 0, CHAN_Z);
2448 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2449 emit_MOV(func, 6, 1);
2450 }
2451 emit_mul(func, 1, 1);
2452 emit_add(func, 0, 1);
2453
2454 if (dims == 4) {
2455 /* xmm7 = src.w */
2456 /* xmm0 = xmm0 + src.w * src.w */
2457 FETCH(func, *inst, 1, 0, CHAN_W);
2458 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W)) {
2459 emit_MOV(func, 7, 1);
2460 }
2461 emit_mul(func, 1, 1);
2462 emit_add(func, 0, 1);
2463 }
2464
2465 /* xmm1 = 1 / sqrt(xmm0) */
2466 emit_rsqrt(func, 1, 0);
2467
2468 /* dst.x = xmm1 * src.x */
2469 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
2470 emit_mul(func, 4, 1);
2471 STORE(func, *inst, 4, 0, CHAN_X);
2472 }
2473
2474 /* dst.y = xmm1 * src.y */
2475 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2476 emit_mul(func, 5, 1);
2477 STORE(func, *inst, 5, 0, CHAN_Y);
2478 }
2479
2480 /* dst.z = xmm1 * src.z */
2481 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2482 emit_mul(func, 6, 1);
2483 STORE(func, *inst, 6, 0, CHAN_Z);
2484 }
2485
2486 /* dst.w = xmm1 * src.w */
2487 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 4) {
2488 emit_mul(func, 7, 1);
2489 STORE(func, *inst, 7, 0, CHAN_W);
2490 }
2491 }
2492
2493 /* dst0.w = 1.0 */
2494 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 3) {
2495 emit_tempf(func, 0, TEMP_ONE_I, TEMP_ONE_C);
2496 STORE(func, *inst, 0, 0, CHAN_W);
2497 }
2498 }
2499 break;
2500
2501 case TGSI_OPCODE_DIV:
2502 return 0;
2503 break;
2504
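   /* DP2: two-component dot product,
    * dst = src0.x * src1.x + src0.y * src1.y,
    * replicated to every enabled channel.
    */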
2505 case TGSI_OPCODE_DP2:
2506 FETCH( func, *inst, 0, 0, CHAN_X ); /* xmm0 = src[0].x */
2507 FETCH( func, *inst, 1, 1, CHAN_X ); /* xmm1 = src[1].x */
2508 emit_mul( func, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
2509 FETCH( func, *inst, 1, 0, CHAN_Y ); /* xmm1 = src[0].y */
2510 FETCH( func, *inst, 2, 1, CHAN_Y ); /* xmm2 = src[1].y */
2511 emit_mul( func, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
2512 emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2513 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2514 STORE( func, *inst, 0, 0, chan_index ); /* dest[ch] = xmm0 */
2515 }
2516 break;
2517
2518 case TGSI_OPCODE_TXL:
2519 emit_tex( func, inst, TRUE, FALSE );
2520 break;
2521
2522 case TGSI_OPCODE_TXP:
2523 emit_tex( func, inst, FALSE, TRUE );
2524 break;
2525
2526 case TGSI_OPCODE_BRK:
2527 return 0;
2528 break;
2529
2530 case TGSI_OPCODE_IF:
2531 return 0;
2532 break;
2533
2534 case TGSI_OPCODE_LOOP:
2535 return 0;
2536 break;
2537
2538 case TGSI_OPCODE_REP:
2539 return 0;
2540 break;
2541
2542 case TGSI_OPCODE_ELSE:
2543 return 0;
2544 break;
2545
2546 case TGSI_OPCODE_ENDIF:
2547 return 0;
2548 break;
2549
2550 case TGSI_OPCODE_ENDLOOP:
2551 return 0;
2552 break;
2553
2554 case TGSI_OPCODE_ENDREP:
2555 return 0;
2556 break;
2557
2558 case TGSI_OPCODE_PUSHA:
2559 return 0;
2560 break;
2561
2562 case TGSI_OPCODE_POPA:
2563 return 0;
2564 break;
2565
2566 case TGSI_OPCODE_CEIL:
2567 return 0;
2568 break;
2569
2570 case TGSI_OPCODE_I2F:
2571 return 0;
2572 break;
2573
2574 case TGSI_OPCODE_NOT:
2575 return 0;
2576 break;
2577
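   /* TRUNC: round toward zero via a float -> int -> float round trip */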
2578 case TGSI_OPCODE_TRUNC:
2579 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2580 FETCH( func, *inst, 0, 0, chan_index );
2581 emit_f2it( func, 0 );
2582 emit_i2f( func, 0 );
2583 STORE( func, *inst, 0, 0, chan_index );
2584 }
2585 break;
2586
2587 case TGSI_OPCODE_SHL:
2588 return 0;
2589 break;
2590
2591 case TGSI_OPCODE_SHR:
2592 return 0;
2593 break;
2594
2595 case TGSI_OPCODE_AND:
2596 return 0;
2597 break;
2598
2599 case TGSI_OPCODE_OR:
2600 return 0;
2601 break;
2602
2603 case TGSI_OPCODE_MOD:
2604 return 0;
2605 break;
2606
2607 case TGSI_OPCODE_XOR:
2608 return 0;
2609 break;
2610
2611 case TGSI_OPCODE_SAD:
2612 return 0;
2613 break;
2614
2615 case TGSI_OPCODE_TXF:
2616 return 0;
2617 break;
2618
2619 case TGSI_OPCODE_TXQ:
2620 return 0;
2621 break;
2622
2623 case TGSI_OPCODE_CONT:
2624 return 0;
2625 break;
2626
2627 case TGSI_OPCODE_EMIT:
2628 return 0;
2629 break;
2630
2631 case TGSI_OPCODE_ENDPRIM:
2632 return 0;
2633 break;
2634
2635 default:
2636 return 0;
2637 }
2638
2639 return 1;
2640 }
2641
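/* Emit per-fragment interpolation of the declared fragment shader inputs
 * from their plane coefficients.  For each enabled channel the value is
 *
 *    constant:     a(x,y) = a0
 *    linear:       a(x,y) = a0 + dadx * x + dady * y
 *    perspective:  a(x,y) = (a0 + dadx * x + dady * y) / w
 *
 * where x, y and w are taken from the first machine temporary via
 * emit_tempf().
 */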
2642 static void
2643 emit_declaration(
2644 struct x86_function *func,
2645 struct tgsi_full_declaration *decl )
2646 {
2647 if( decl->Declaration.File == TGSI_FILE_INPUT ) {
2648 unsigned first, last, mask;
2649 unsigned i, j;
2650
2651 first = decl->DeclarationRange.First;
2652 last = decl->DeclarationRange.Last;
2653 mask = decl->Declaration.UsageMask;
2654
2655 for( i = first; i <= last; i++ ) {
2656 for( j = 0; j < NUM_CHANNELS; j++ ) {
2657 if( mask & (1 << j) ) {
2658 switch( decl->Declaration.Interpolate ) {
2659 case TGSI_INTERPOLATE_CONSTANT:
2660 emit_coef_a0( func, 0, i, j );
2661 emit_inputs( func, 0, i, j );
2662 break;
2663
2664 case TGSI_INTERPOLATE_LINEAR:
2665 emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
2666 emit_coef_dadx( func, 1, i, j );
2667 emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
2668 emit_coef_dady( func, 3, i, j );
2669 emit_mul( func, 0, 1 ); /* x * dadx */
2670 emit_coef_a0( func, 4, i, j );
2671 emit_mul( func, 2, 3 ); /* y * dady */
2672 emit_add( func, 0, 4 ); /* x * dadx + a0 */
2673 emit_add( func, 0, 2 ); /* x * dadx + y * dady + a0 */
2674 emit_inputs( func, 0, i, j );
2675 break;
2676
2677 case TGSI_INTERPOLATE_PERSPECTIVE:
2678 emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
2679 emit_coef_dadx( func, 1, i, j );
2680 emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
2681 emit_coef_dady( func, 3, i, j );
2682 emit_mul( func, 0, 1 ); /* x * dadx */
2683 emit_tempf( func, 4, 0, TGSI_SWIZZLE_W );
2684 emit_coef_a0( func, 5, i, j );
2685 emit_rcp( func, 4, 4 ); /* 1.0 / w */
2686 emit_mul( func, 2, 3 ); /* y * dady */
2687 emit_add( func, 0, 5 ); /* x * dadx + a0 */
2688 emit_add( func, 0, 2 ); /* x * dadx + y * dady + a0 */
2689 emit_mul( func, 0, 4 ); /* (x * dadx + y * dady + a0) / w */
2690 emit_inputs( func, 0, i, j );
2691 break;
2692
2693 default:
2694 assert( 0 );
2695 break;
2696 }
2697 }
2698 }
2699 }
2700 }
2701 }
2702
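/* Convert a run of AOS vertex vectors (one x,y,z,w every arg_stride bytes)
 * into the machine's SOA input layout.  Each iteration transposes a 4x4
 * block for one attribute of four vertices:
 *
 *    in:   x0 y0 z0 w0 | x1 y1 z1 w1 | x2 y2 z2 w2 | x3 y3 z3 w3
 *    out:  x0 x1 x2 x3 | y0 y1 y2 y3 | z0 z1 z2 z3 | w0 w1 w2 w3
 *
 * movlps/movhps gather the x,y and z,w pairs of the four vertices, then
 * shufps with masks 0x88 (even lanes) and 0xdd (odd lanes) separates them
 * into per-component vectors.
 */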
2703 static void aos_to_soa( struct x86_function *func,
2704 uint arg_aos,
2705 uint arg_machine,
2706 uint arg_num,
2707 uint arg_stride )
2708 {
2709 struct x86_reg soa_input = x86_make_reg( file_REG32, reg_AX );
2710 struct x86_reg aos_input = x86_make_reg( file_REG32, reg_BX );
2711 struct x86_reg num_inputs = x86_make_reg( file_REG32, reg_CX );
2712 struct x86_reg stride = x86_make_reg( file_REG32, reg_DX );
2713 int inner_loop;
2714
2715
2716 /* Save EBX */
2717 x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
2718
2719 x86_mov( func, aos_input, x86_fn_arg( func, arg_aos ) );
2720 x86_mov( func, soa_input, x86_fn_arg( func, arg_machine ) );
2721 x86_lea( func, soa_input,
2722 x86_make_disp( soa_input,
2723 Offset(struct tgsi_exec_machine, Inputs) ) );
2724 x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) );
2725 x86_mov( func, stride, x86_fn_arg( func, arg_stride ) );
2726
2727 /* do */
2728 inner_loop = x86_get_label( func );
2729 {
2730 x86_push( func, aos_input );
2731 sse_movlps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
2732 sse_movlps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
2733 x86_add( func, aos_input, stride );
2734 sse_movhps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
2735 sse_movhps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
2736 x86_add( func, aos_input, stride );
2737 sse_movlps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
2738 sse_movlps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
2739 x86_add( func, aos_input, stride );
2740 sse_movhps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
2741 sse_movhps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
2742 x86_pop( func, aos_input );
2743
2744 sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
2745 sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
2746 sse_shufps( func, make_xmm( 0 ), make_xmm( 1 ), 0x88 );
2747 sse_shufps( func, make_xmm( 2 ), make_xmm( 1 ), 0xdd );
2748 sse_shufps( func, make_xmm( 3 ), make_xmm( 4 ), 0x88 );
2749 sse_shufps( func, make_xmm( 5 ), make_xmm( 4 ), 0xdd );
2750
2751 sse_movups( func, x86_make_disp( soa_input, 0 ), make_xmm( 0 ) );
2752 sse_movups( func, x86_make_disp( soa_input, 16 ), make_xmm( 2 ) );
2753 sse_movups( func, x86_make_disp( soa_input, 32 ), make_xmm( 3 ) );
2754 sse_movups( func, x86_make_disp( soa_input, 48 ), make_xmm( 5 ) );
2755
2756 /* Advance to next input */
2757 x86_lea( func, aos_input, x86_make_disp(aos_input, 16) );
2758 x86_lea( func, soa_input, x86_make_disp(soa_input, 64) );
2759 }
2760 /* while --num_inputs */
2761 x86_dec( func, num_inputs );
2762 x86_jcc( func, cc_NE, inner_loop );
2763
2764 /* Restore EBX */
2765 x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
2766 }
2767
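/* Inverse of aos_to_soa(): transpose each group of four SOA output
 * vectors back into four AOS vertices.  unpcklps/unpckhps re-interleave
 * the x/y and z/w components, and movlps/movhps scatter each vertex's
 * x,y,z,w out at arg_stride intervals.
 */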
2768 static void soa_to_aos( struct x86_function *func,
2769 uint arg_aos,
2770 uint arg_machine,
2771 uint arg_num,
2772 uint arg_stride )
2773 {
2774 struct x86_reg soa_output = x86_make_reg( file_REG32, reg_AX );
2775 struct x86_reg aos_output = x86_make_reg( file_REG32, reg_BX );
2776 struct x86_reg num_outputs = x86_make_reg( file_REG32, reg_CX );
2777 struct x86_reg temp = x86_make_reg( file_REG32, reg_DX );
2778 int inner_loop;
2779
2780 /* Save EBX */
2781 x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
2782
2783 x86_mov( func, aos_output, x86_fn_arg( func, arg_aos ) );
2784 x86_mov( func, soa_output, x86_fn_arg( func, arg_machine ) );
2785 x86_lea( func, soa_output,
2786 x86_make_disp( soa_output,
2787 Offset(struct tgsi_exec_machine, Outputs) ) );
2788 x86_mov( func, num_outputs, x86_fn_arg( func, arg_num ) );
2789
2790 /* do */
2791 inner_loop = x86_get_label( func );
2792 {
2793 sse_movups( func, make_xmm( 0 ), x86_make_disp( soa_output, 0 ) );
2794 sse_movups( func, make_xmm( 1 ), x86_make_disp( soa_output, 16 ) );
2795 sse_movups( func, make_xmm( 3 ), x86_make_disp( soa_output, 32 ) );
2796 sse_movups( func, make_xmm( 4 ), x86_make_disp( soa_output, 48 ) );
2797
2798 sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
2799 sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
2800 sse_unpcklps( func, make_xmm( 0 ), make_xmm( 1 ) );
2801 sse_unpckhps( func, make_xmm( 2 ), make_xmm( 1 ) );
2802 sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) );
2803 sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) );
2804
2805 x86_mov( func, temp, x86_fn_arg( func, arg_stride ) );
2806 x86_push( func, aos_output );
2807 sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
2808 sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
2809 x86_add( func, aos_output, temp );
2810 sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
2811 sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
2812 x86_add( func, aos_output, temp );
2813 sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
2814 sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
2815 x86_add( func, aos_output, temp );
2816 sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
2817 sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
2818 x86_pop( func, aos_output );
2819
2820 /* Advance to next output */
2821 x86_lea( func, aos_output, x86_make_disp(aos_output, 16) );
2822 x86_lea( func, soa_output, x86_make_disp(soa_output, 64) );
2823 }
2824 /* while --num_outputs */
2825 x86_dec( func, num_outputs );
2826 x86_jcc( func, cc_NE, inner_loop );
2827
2828 /* Restore EBX */
2829 x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
2830 }
2831
2832 /**
2833 * Translate a TGSI vertex/fragment shader to SSE2 code.
2834 * Slightly different things are done for vertex vs. fragment shaders.
2835 *
2836 * \param tokens the TGSI input shader
2837 * \param func the output SSE code/function
2838 * \param immediates buffer to place immediates, later passed to SSE func
2839 * \param do_swizzles for vertex shaders, whether to convert vertex data
* between AOS and SOA layouts on entry and exit
* \return 1 for success, 0 if translation failed
2840 */
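/* Argument slots of the *generated* function, as referenced through
 * x86_fn_arg() below: 1 = tgsi_exec_machine pointer, 2 = constants,
 * 3 = immediates, 4 = interpolation coefficients (fragment shaders) or
 * AOS vertex input (vertex shaders), 5/6 = vertex input count and stride,
 * 7/8/9 = AOS vertex output pointer, count and stride.
 *
 * A caller might invoke the result roughly like this (hypothetical
 * sketch; the real typedefs live in the draw/softpipe callers):
 *
 *    typedef void (PIPE_CDECL *sse2_fs_func)(
 *       struct tgsi_exec_machine *machine,
 *       const float (*constants)[4],
 *       const float (*immediates)[4],
 *       const struct tgsi_interp_coef *coefs );
 *
 *    if (tgsi_emit_sse2( tokens, &func, immediates, FALSE )) {
 *       sse2_fs_func fs = (sse2_fs_func) x86_get_func( &func );
 *       fs( machine, constants, immediates, coefs );
 *    }
 */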
2841 unsigned
2842 tgsi_emit_sse2(
2843 const struct tgsi_token *tokens,
2844 struct x86_function *func,
2845 float (*immediates)[4],
2846 boolean do_swizzles )
2847 {
2848 struct tgsi_parse_context parse;
2849 unsigned ok = 1;
2850 uint num_immediates = 0;
2851
2852 util_init_math();
2853
2854 func->csr = func->store;
2855
2856 tgsi_parse_init( &parse, tokens );
2857
2858 /* Can't use EDI or EBX without saving and restoring them:
2859 */
2860 x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
2861 x86_push( func, x86_make_reg( file_REG32, reg_DI ) );
2862
2863 /*
2864 * Different function args for vertex/fragment shaders:
2865 */
2866 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
2867 if (do_swizzles)
2868 aos_to_soa( func,
2869 4, /* aos_input */
2870 1, /* machine */
2871 5, /* num_inputs */
2872 6 ); /* input_stride */
2873 }
2874
2875 x86_mov(
2876 func,
2877 get_machine_base(),
2878 x86_fn_arg( func, 1 ) );
2879 x86_mov(
2880 func,
2881 get_const_base(),
2882 x86_fn_arg( func, 2 ) );
2883 x86_mov(
2884 func,
2885 get_immediate_base(),
2886 x86_fn_arg( func, 3 ) );
2887
2888 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
2889 x86_mov(
2890 func,
2891 get_coef_base(),
2892 x86_fn_arg( func, 4 ) );
2893 }
2894
2895 x86_mov(
2896 func,
2897 get_sampler_base(),
2898 x86_make_disp( get_machine_base(),
2899 Offset( struct tgsi_exec_machine, Samplers ) ) );
2900
2901
2902 while( !tgsi_parse_end_of_tokens( &parse ) && ok ) {
2903 tgsi_parse_token( &parse );
2904
2905 switch( parse.FullToken.Token.Type ) {
2906 case TGSI_TOKEN_TYPE_DECLARATION:
2907 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
2908 emit_declaration(
2909 func,
2910 &parse.FullToken.FullDeclaration );
2911 }
2912 break;
2913
2914 case TGSI_TOKEN_TYPE_INSTRUCTION:
2915 ok = emit_instruction(
2916 func,
2917 &parse.FullToken.FullInstruction );
2918
2919 if (!ok) {
2920 debug_printf("failed to translate tgsi opcode %d to SSE (%s)\n",
2921 parse.FullToken.FullInstruction.Instruction.Opcode,
2922 parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX ?
2923 "vertex shader" : "fragment shader");
2924 }
2925 break;
2926
2927 case TGSI_TOKEN_TYPE_IMMEDIATE:
2928 /* simply copy the immediate values into the next immediates[] slot */
2929 {
2930 const uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
2931 uint i;
2932 assert(size <= 4);
2933 assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES);
2934 for( i = 0; i < size; i++ ) {
2935 immediates[num_immediates][i] =
2936 parse.FullToken.FullImmediate.u[i].Float;
2937 }
2938 #if 0
2939 debug_printf("SSE FS immediate[%d] = %f %f %f %f\n",
2940 num_immediates,
2941 immediates[num_immediates][0],
2942 immediates[num_immediates][1],
2943 immediates[num_immediates][2],
2944 immediates[num_immediates][3]);
2945 #endif
2946 num_immediates++;
2947 }
2948 break;
2949
2950 default:
2951 ok = 0;
2952 assert( 0 );
2953 }
2954 }
2955
2956 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
2957 if (do_swizzles)
2958 soa_to_aos( func,
2959 7, /* aos_output */
2960 1, /* machine */
2961 8, /* num_outputs */
2962 9 ); /* output_stride */
2963 }
2964
2965 /* Can't use EBX or EDI without saving and restoring them:
2966 */
2967 x86_pop( func, x86_make_reg( file_REG32, reg_DI ) );
2968 x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
2969
2970 emit_ret( func );
2971
2972 tgsi_parse_free( &parse );
2973
2974 return ok;
2975 }
2976
2977 #endif /* PIPE_ARCH_X86 */
2978