[mesa.git] / src / gallium / auxiliary / tgsi / tgsi_sse2.c
/**************************************************************************
 *
 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
 * All Rights Reserved.
 * Copyright 2009-2010 VMware, Inc. All rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

#include "pipe/p_config.h"

#if defined(PIPE_ARCH_X86)

#include "util/u_debug.h"
#include "pipe/p_shader_tokens.h"
#include "util/u_math.h"
#include "util/u_memory.h"
#if defined(PIPE_ARCH_SSE)
#include "util/u_sse.h"
#endif
#include "tgsi/tgsi_info.h"
#include "tgsi/tgsi_parse.h"
#include "tgsi/tgsi_util.h"
#include "tgsi/tgsi_dump.h"
#include "tgsi/tgsi_exec.h"
#include "tgsi/tgsi_sse2.h"

#include "rtasm/rtasm_x86sse.h"

/* HIGH_PRECISION: use a more accurate 1/sqrt().
 *
 * Enabling this costs about 100 fps (close to 10%) in gears:
 */
#define HIGH_PRECISION 1

#define FAST_MATH 1


#define FOR_EACH_CHANNEL( CHAN )\
   for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)

#define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
   ((INST).Dst[0].Register.WriteMask & (1 << (CHAN)))

#define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
   if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))

#define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
   FOR_EACH_CHANNEL( CHAN )\
      IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )

#define CHAN_X 0
#define CHAN_Y 1
#define CHAN_Z 2
#define CHAN_W 3

#define TEMP_ONE_I TGSI_EXEC_TEMP_ONE_I
#define TEMP_ONE_C TGSI_EXEC_TEMP_ONE_C

#define TEMP_R0 TGSI_EXEC_TEMP_R0
#define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
#define TEMP_EXEC_MASK_I TGSI_EXEC_MASK_I
#define TEMP_EXEC_MASK_C TGSI_EXEC_MASK_C


/**
 * X86 utility functions.
 */

static struct x86_reg
make_xmm(
   unsigned xmm )
{
   return x86_make_reg(
      file_XMM,
      (enum x86_reg_name) xmm );
}

/**
 * X86 register mapping helpers.
 */

static struct x86_reg
get_const_base( void )
{
   return x86_make_reg(
      file_REG32,
      reg_AX );
}

static struct x86_reg
get_machine_base( void )
{
   return x86_make_reg(
      file_REG32,
      reg_CX );
}

static struct x86_reg
get_input_base( void )
{
   return x86_make_disp(
      get_machine_base(),
      Offset(struct tgsi_exec_machine, Inputs) );
}

static struct x86_reg
get_output_base( void )
{
   return x86_make_disp(
      get_machine_base(),
      Offset(struct tgsi_exec_machine, Outputs) );
}

static struct x86_reg
get_temp_base( void )
{
   return x86_make_disp(
      get_machine_base(),
      Offset(struct tgsi_exec_machine, Temps) );
}

static struct x86_reg
get_coef_base( void )
{
   return x86_make_reg(
      file_REG32,
      reg_BX );
}

static struct x86_reg
get_sampler_base( void )
{
   return x86_make_reg(
      file_REG32,
      reg_DI );
}

static struct x86_reg
get_immediate_base( void )
{
   return x86_make_reg(
      file_REG32,
      reg_DX );
}


/**
 * Data access helpers.
 */


static struct x86_reg
get_immediate(
   unsigned vec,
   unsigned chan )
{
   return x86_make_disp(
      get_immediate_base(),
      (vec * 4 + chan) * 4 );
}

static struct x86_reg
get_const(
   unsigned vec,
   unsigned chan )
{
   return x86_make_disp(
      get_const_base(),
      (vec * 4 + chan) * 4 );
}

static struct x86_reg
get_sampler_ptr(
   unsigned unit )
{
   return x86_make_disp(
      get_sampler_base(),
      unit * sizeof( struct tgsi_sampler * ) );
}

static struct x86_reg
get_input(
   unsigned vec,
   unsigned chan )
{
   return x86_make_disp(
      get_input_base(),
      (vec * 4 + chan) * 16 );
}

static struct x86_reg
get_output(
   unsigned vec,
   unsigned chan )
{
   return x86_make_disp(
      get_output_base(),
      (vec * 4 + chan) * 16 );
}

static struct x86_reg
get_temp(
   unsigned vec,
   unsigned chan )
{
   return x86_make_disp(
      get_temp_base(),
      (vec * 4 + chan) * 16 );
}

static struct x86_reg
get_coef(
   unsigned vec,
   unsigned chan,
   unsigned member )
{
   return x86_make_disp(
      get_coef_base(),
      ((vec * 3 + member) * 4 + chan) * 4 );
}
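
/* Note: the displacement above assumes the tgsi_interp_coef layout --
 * for each input, three float[4] members in the order a0, dadx, dady,
 * hence ((vec * 3 + member) * 4 + chan) floats from the coef base.
 */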


static void
emit_ret(
   struct x86_function *func )
{
   x86_ret( func );
}


/**
 * Data fetch helpers.
 */

/**
 * Copy a shader constant to xmm register
 * \param xmm the destination xmm register
 * \param vec the src const buffer index
 * \param chan src channel to fetch (X, Y, Z or W)
 */
static void
emit_const(
   struct x86_function *func,
   uint xmm,
   int vec,
   uint chan,
   uint indirect,
   uint indirectFile,
   int indirectIndex )
{
   if (indirect) {
      /* 'vec' is the offset from the address register's value.
       * We're loading CONST[ADDR+vec] into an xmm register.
       */
      struct x86_reg r0 = get_immediate_base();
      struct x86_reg r1 = get_coef_base();
      uint i;

      assert( indirectFile == TGSI_FILE_ADDRESS );
      assert( indirectIndex == 0 );
      assert( r0.mod == mod_REG );
      assert( r1.mod == mod_REG );

      x86_push( func, r0 );
      x86_push( func, r1 );

      /*
       * Loop over the four pixels or vertices in the quad.
       * Get the value of the address (offset) register for pixel/vertex[i],
       * add it to the src offset and index into the constant buffer.
       * Note that we're working on SOA data.
       * If any of the pixel/vertex execution channels are unused their
       * values will be garbage. It's very important that we don't use
       * those garbage values as indexes into the constant buffer since
       * that'll cause segfaults.
       * The solution is to bitwise-AND the offset with the execution mask
       * register whose values are either 0 or ~0.
       * The caller must setup the execution mask register to indicate
       * which channels are valid/alive before running the shader.
       * The execution mask will also figure into loops and conditionals
       * someday.
       */
      for (i = 0; i < QUAD_SIZE; i++) {
         /* r1 = address register[i] */
         x86_mov( func, r1, x86_make_disp( get_temp( TEMP_ADDR, CHAN_X ), i * 4 ) );
         /* r0 = execution mask[i] */
         x86_mov( func, r0, x86_make_disp( get_temp( TEMP_EXEC_MASK_I, TEMP_EXEC_MASK_C ), i * 4 ) );
         /* r1 = r1 & r0 */
         x86_and( func, r1, r0 );
         /* r0 = 'vec', the offset */
         x86_lea( func, r0, get_const( vec, chan ) );

         /* Quick hack to multiply r1 by 16 -- need to add SHL to rtasm.
          */
         x86_add( func, r1, r1 );
         x86_add( func, r1, r1 );
         x86_add( func, r1, r1 );
         x86_add( func, r1, r1 );

         x86_add( func, r0, r1 );  /* r0 = r0 + r1 */
         x86_mov( func, r1, x86_deref( r0 ) );
         x86_mov( func, x86_make_disp( get_temp( TEMP_R0, CHAN_X ), i * 4 ), r1 );
      }

      x86_pop( func, r1 );
      x86_pop( func, r0 );

      sse_movaps(
         func,
         make_xmm( xmm ),
         get_temp( TEMP_R0, CHAN_X ) );
   }
   else {
      /* 'vec' is the index into the constant buffer, i.e. CONST[vec] */
      assert( vec >= 0 );

      sse_movss(
         func,
         make_xmm( xmm ),
         get_const( vec, chan ) );
      sse_shufps(
         func,
         make_xmm( xmm ),
         make_xmm( xmm ),
         SHUF( 0, 0, 0, 0 ) );
   }
}
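
/* For reference, a scalar sketch of what the indirect path above computes
 * for each quad element. Illustrative only: this helper is not part of
 * the JIT, and the field accesses assume the tgsi_exec.h layout.
 */
#if 0
static void
indirect_const_fetch_example( const struct tgsi_exec_machine *mach,
                              const float *consts,
                              int vec,
                              uint chan )
{
   uint i;
   for (i = 0; i < QUAD_SIZE; i++) {
      int addr = mach->Temps[TEMP_ADDR].xyzw[CHAN_X].i[i];
      int mask = mach->Temps[TEMP_EXEC_MASK_I].xyzw[TEMP_EXEC_MASK_C].i[i];
      /* ANDing with the exec mask keeps dead channels from producing
       * garbage indexes:
       */
      mach->Temps[TEMP_R0].xyzw[CHAN_X].f[i] =
         consts[((addr & mask) + vec) * 4 + chan];
   }
}
#endif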

static void
emit_immediate(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   sse_movss(
      func,
      make_xmm( xmm ),
      get_immediate( vec, chan ) );
   sse_shufps(
      func,
      make_xmm( xmm ),
      make_xmm( xmm ),
      SHUF( 0, 0, 0, 0 ) );
}


/**
 * Copy a shader input to xmm register
 * \param xmm the destination xmm register
 * \param vec the src input attrib
 * \param chan src channel to fetch (X, Y, Z or W)
 */
static void
emit_inputf(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   sse_movups(
      func,
      make_xmm( xmm ),
      get_input( vec, chan ) );
}

/**
 * Store an xmm register to a shader output
 * \param xmm the source xmm register
 * \param vec the dest output attrib
 * \param chan the dest channel to store (X, Y, Z or W)
392 */
393 static void
394 emit_output(
395 struct x86_function *func,
396 unsigned xmm,
397 unsigned vec,
398 unsigned chan )
399 {
400 sse_movups(
401 func,
402 get_output( vec, chan ),
403 make_xmm( xmm ) );
404 }
405
406 /**
407 * Copy a shader temporary to xmm register
408 * \param xmm the destination xmm register
409 * \param vec the src temp register
410 * \param chan src channel to fetch (X, Y, Z or W)
411 */
412 static void
413 emit_tempf(
414 struct x86_function *func,
415 unsigned xmm,
416 unsigned vec,
417 unsigned chan )
418 {
419 sse_movaps(
420 func,
421 make_xmm( xmm ),
422 get_temp( vec, chan ) );
423 }
424
425 /**
426 * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
427 * \param xmm the destination xmm register
428 * \param vec the src input/attribute coefficient index
429 * \param chan src channel to fetch (X, Y, Z or W)
430 * \param member 0=a0, 1=dadx, 2=dady
431 */
432 static void
433 emit_coef(
434 struct x86_function *func,
435 unsigned xmm,
436 unsigned vec,
437 unsigned chan,
438 unsigned member )
439 {
440 sse_movss(
441 func,
442 make_xmm( xmm ),
443 get_coef( vec, chan, member ) );
444 sse_shufps(
445 func,
446 make_xmm( xmm ),
447 make_xmm( xmm ),
448 SHUF( 0, 0, 0, 0 ) );
449 }
450
451 /**
452 * Data store helpers.
453 */
454
455 static void
456 emit_inputs(
457 struct x86_function *func,
458 unsigned xmm,
459 unsigned vec,
460 unsigned chan )
461 {
462 sse_movups(
463 func,
464 get_input( vec, chan ),
465 make_xmm( xmm ) );
466 }
467
468 static void
469 emit_temps(
470 struct x86_function *func,
471 unsigned xmm,
472 unsigned vec,
473 unsigned chan )
474 {
475 sse_movaps(
476 func,
477 get_temp( vec, chan ),
478 make_xmm( xmm ) );
479 }
480
481 static void
482 emit_addrs(
483 struct x86_function *func,
484 unsigned xmm,
485 unsigned vec,
486 unsigned chan )
487 {
488 assert( vec == 0 );
489
490 emit_temps(
491 func,
492 xmm,
493 vec + TGSI_EXEC_TEMP_ADDR,
494 chan );
495 }
496
497 /**
 * Coefficient fetch helpers.
 */

static void
emit_coef_a0(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   emit_coef(
      func,
      xmm,
      vec,
      chan,
      0 );
}

static void
emit_coef_dadx(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   emit_coef(
      func,
      xmm,
      vec,
      chan,
      1 );
}

static void
emit_coef_dady(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   emit_coef(
      func,
      xmm,
      vec,
      chan,
      2 );
}

/**
 * Function call helpers.
 */

/**
 * NOTE: In gcc, if the called function uses SSE intrinsics, then it must be
 * defined with __attribute__((force_align_arg_pointer)), because we do not
 * guarantee that the stack pointer is 16-byte aligned on entry, as would
 * normally be expected.
 */
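/* For example, ex24f() below is declared following this rule:
 *
 *    static void PIPE_CDECL
 *    __attribute__((force_align_arg_pointer))   (gcc + SSE builds only)
 *    ex24f( float *store );
 */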
static void
emit_func_call(
   struct x86_function *func,
   unsigned xmm_save_mask,
   const struct x86_reg *arg,
   unsigned nr_args,
   void (PIPE_CDECL *code)() )
{
   struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
   unsigned i, n;

   x86_push(
      func,
      x86_make_reg( file_REG32, reg_AX) );
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_CX) );
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_DX) );

   /* Store XMM regs to the stack
    */
   for(i = 0, n = 0; i < 8; ++i)
      if(xmm_save_mask & (1 << i))
         ++n;

   x86_sub_imm(
      func,
      x86_make_reg( file_REG32, reg_SP ),
      n*16);

   for(i = 0, n = 0; i < 8; ++i)
      if(xmm_save_mask & (1 << i)) {
         sse_movups(
            func,
            x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ),
            make_xmm( i ) );
         ++n;
      }

   for (i = 0; i < nr_args; i++) {
      /* Load the address of the buffer we use for passing arguments and
       * receiving results:
       */
      x86_lea(
         func,
         ecx,
         arg[i] );

      /* Push actual function arguments (currently just the pointer to
       * the buffer above), and call the function:
       */
      x86_push( func, ecx );
   }

   x86_mov_reg_imm( func, ecx, (unsigned long) code );
   x86_call( func, ecx );

   /* Pop the arguments (or just add an immediate to esp)
    */
   for (i = 0; i < nr_args; i++) {
      x86_pop(func, ecx );
   }

   /* Pop the saved XMM regs:
    */
   for(i = 0, n = 0; i < 8; ++i)
      if(xmm_save_mask & (1 << i)) {
         sse_movups(
            func,
            make_xmm( i ),
            x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ) );
         ++n;
      }

   x86_add_imm(
      func,
      x86_make_reg( file_REG32, reg_SP ),
      n*16);

   /* Restore the GP registers in reverse order.
    */
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_DX) );
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_CX) );
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_AX) );
}
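
/* The code generated above is roughly:
 *
 *    push eax; push ecx; push edx
 *    sub  esp, n*16              ; make room for the saved xmm regs
 *    movups [esp + k*16], xmm_i  ; for each bit set in xmm_save_mask
 *    lea  ecx, [arg_i]           ; for each argument
 *    push ecx
 *    mov  ecx, code
 *    call ecx
 *    pop  ecx                    ; cdecl: the caller pops the arguments
 *    movups xmm_i, [esp + k*16]
 *    add  esp, n*16
 *    pop  edx; pop ecx; pop eax
 */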

static void
emit_func_call_dst_src1(
   struct x86_function *func,
   unsigned xmm_save,
   unsigned xmm_dst,
   unsigned xmm_src0,
   void (PIPE_CDECL *code)() )
{
   struct x86_reg store = get_temp( TEMP_R0, 0 );
   unsigned xmm_mask = ((1 << xmm_save) - 1) & ~(1 << xmm_dst);

   /* Store our input parameters (in xmm regs) to the buffer we use
    * for passing arguments. We will pass a pointer to this buffer as
    * the actual function argument.
    */
   sse_movaps(
      func,
      store,
      make_xmm( xmm_src0 ) );

   emit_func_call( func,
                   xmm_mask,
                   &store,
                   1,
                   code );

   sse_movaps(
      func,
      make_xmm( xmm_dst ),
      store );
}


static void
emit_func_call_dst_src2(
   struct x86_function *func,
   unsigned xmm_save,
   unsigned xmm_dst,
   unsigned xmm_src0,
   unsigned xmm_src1,
   void (PIPE_CDECL *code)() )
{
   struct x86_reg store = get_temp( TEMP_R0, 0 );
   unsigned xmm_mask = ((1 << xmm_save) - 1) & ~(1 << xmm_dst);

   /* Store two inputs to parameter buffer.
    */
   sse_movaps(
      func,
      store,
      make_xmm( xmm_src0 ) );

   sse_movaps(
      func,
      x86_make_disp( store, 4 * sizeof(float) ),
      make_xmm( xmm_src1 ) );


   /* Emit the call
    */
   emit_func_call( func,
                   xmm_mask,
                   &store,
                   1,
                   code );

   /* Retrieve the results:
    */
   sse_movaps(
      func,
      make_xmm( xmm_dst ),
      store );
}





#if defined(PIPE_ARCH_SSE)

/*
 * Fast SSE2 implementation of special math functions.
 */

#define POLY0(x, c0) _mm_set1_ps(c0)
#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
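
/* Each POLYn macro expands to a Horner-scheme evaluation of a degree-n
 * polynomial, e.g. POLY2(x, c0, c1, c2) == c0 + x*(c1 + x*c2).
 */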

#define EXP_POLY_DEGREE 3
#define LOG_POLY_DEGREE 5

/**
 * See http://www.devmaster.net/forums/showthread.php?p=43580
 */
static INLINE __m128
exp2f4(__m128 x)
{
   __m128i ipart;
   __m128 fpart, expipart, expfpart;

   x = _mm_min_ps(x, _mm_set1_ps( 129.00000f));
   x = _mm_max_ps(x, _mm_set1_ps(-126.99999f));

   /* ipart = int(x - 0.5) */
   ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f)));

   /* fpart = x - ipart */
   fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart));

   /* expipart = (float) (1 << ipart) */
   expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23));

   /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
#if EXP_POLY_DEGREE == 5
   expfpart = POLY5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f);
#elif EXP_POLY_DEGREE == 4
   expfpart = POLY4(fpart, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 5.2011464e-2f, 1.3534167e-2f);
#elif EXP_POLY_DEGREE == 3
   expfpart = POLY3(fpart, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 7.8024521e-2f);
#elif EXP_POLY_DEGREE == 2
   expfpart = POLY2(fpart, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f);
#else
#error
#endif

   return _mm_mul_ps(expipart, expfpart);
}


/**
 * See http://www.devmaster.net/forums/showthread.php?p=43580
 */
static INLINE __m128
log2f4(__m128 x)
{
   __m128i expmask = _mm_set1_epi32(0x7f800000);
   __m128i mantmask = _mm_set1_epi32(0x007fffff);
   __m128 one = _mm_set1_ps(1.0f);

   __m128i i = _mm_castps_si128(x);

   /* exp = (float) exponent(x) */
   __m128 exp = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, expmask), 23), _mm_set1_epi32(127)));

   /* mant = (float) mantissa(x) */
   __m128 mant = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, mantmask)), one);

   __m128 logmant;

   /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
    * These coefficients can be generated with
    * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
    */
#if LOG_POLY_DEGREE == 6
   logmant = POLY5(mant, 3.11578814719469302614f, -3.32419399085241980044f, 2.59883907202499966007f, -1.23152682416275988241f, 0.318212422185251071475f, -0.0344359067839062357313f);
#elif LOG_POLY_DEGREE == 5
   logmant = POLY4(mant, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
   logmant = POLY3(mant, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
   logmant = POLY2(mant, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
#else
#error
#endif

   /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0 */
   logmant = _mm_mul_ps(logmant, _mm_sub_ps(mant, one));

   return _mm_add_ps(logmant, exp);
}


static INLINE __m128
powf4(__m128 x, __m128 y)
{
   return exp2f4(_mm_mul_ps(log2f4(x), y));
}
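
/* powf4() relies on the identity x**y == 2**(y * log2(x)), which is only
 * meaningful for x > 0 -- the domain POW is normally specified for.
 */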

#endif /* PIPE_ARCH_SSE */



/**
 * Low-level instruction translators.
 */

static void
emit_abs(
   struct x86_function *func,
   unsigned xmm )
{
   sse_andps(
      func,
      make_xmm( xmm ),
      get_temp(
         TGSI_EXEC_TEMP_7FFFFFFF_I,
         TGSI_EXEC_TEMP_7FFFFFFF_C ) );
}

static void
emit_add(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   sse_addps(
      func,
      make_xmm( xmm_dst ),
      make_xmm( xmm_src ) );
}

static void PIPE_CDECL
cos4f(
   float *store )
{
   store[0] = cosf( store[0] );
   store[1] = cosf( store[1] );
   store[2] = cosf( store[2] );
   store[3] = cosf( store[3] );
}

static void
emit_cos(
   struct x86_function *func,
   unsigned xmm_save,
   unsigned xmm_dst )
{
   emit_func_call_dst_src1(
      func,
      xmm_save,
      xmm_dst,
      xmm_dst,
      cos4f );
}

static void PIPE_CDECL
#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
__attribute__((force_align_arg_pointer))
#endif
ex24f(
   float *store )
{
#if defined(PIPE_ARCH_SSE)
   _mm_store_ps(&store[0], exp2f4( _mm_load_ps(&store[0]) ));
#else
   store[0] = util_fast_exp2( store[0] );
   store[1] = util_fast_exp2( store[1] );
   store[2] = util_fast_exp2( store[2] );
   store[3] = util_fast_exp2( store[3] );
#endif
}
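
/* Note the SSE path above uses aligned loads/stores: 'store' points at
 * TEMP_R0 in the machine's temp file, which is expected to be 16-byte
 * aligned.
 */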

static void
emit_ex2(
   struct x86_function *func,
   unsigned xmm_save,
   unsigned xmm_dst )
{
   emit_func_call_dst_src1(
      func,
      xmm_save,
      xmm_dst,
      xmm_dst,
      ex24f );
}

static void
emit_f2it(
   struct x86_function *func,
   unsigned xmm )
{
   sse2_cvttps2dq(
      func,
      make_xmm( xmm ),
      make_xmm( xmm ) );
}

static void
emit_i2f(
   struct x86_function *func,
   unsigned xmm )
{
   sse2_cvtdq2ps(
      func,
      make_xmm( xmm ),
      make_xmm( xmm ) );
}

static void PIPE_CDECL
flr4f(
   float *store )
{
   store[0] = floorf( store[0] );
   store[1] = floorf( store[1] );
   store[2] = floorf( store[2] );
   store[3] = floorf( store[3] );
}

static void
emit_flr(
   struct x86_function *func,
   unsigned xmm_save,
   unsigned xmm_dst )
{
   emit_func_call_dst_src1(
      func,
      xmm_save,
      xmm_dst,
      xmm_dst,
      flr4f );
}

static void PIPE_CDECL
frc4f(
   float *store )
{
   store[0] -= floorf( store[0] );
   store[1] -= floorf( store[1] );
   store[2] -= floorf( store[2] );
   store[3] -= floorf( store[3] );
}

static void
emit_frc(
   struct x86_function *func,
   unsigned xmm_save,
   unsigned xmm_dst )
{
   emit_func_call_dst_src1(
      func,
      xmm_save,
      xmm_dst,
      xmm_dst,
      frc4f );
}

static void PIPE_CDECL
#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
__attribute__((force_align_arg_pointer))
#endif
lg24f(
   float *store )
{
#if defined(PIPE_ARCH_SSE)
   _mm_store_ps(&store[0], log2f4( _mm_load_ps(&store[0]) ));
#else
   store[0] = util_fast_log2( store[0] );
   store[1] = util_fast_log2( store[1] );
   store[2] = util_fast_log2( store[2] );
   store[3] = util_fast_log2( store[3] );
#endif
}

static void
emit_lg2(
   struct x86_function *func,
   unsigned xmm_save,
   unsigned xmm_dst )
{
   emit_func_call_dst_src1(
      func,
      xmm_save,
      xmm_dst,
      xmm_dst,
      lg24f );
}

static void
emit_MOV(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   sse_movups(
      func,
      make_xmm( xmm_dst ),
      make_xmm( xmm_src ) );
}

static void
emit_mul (struct x86_function *func,
          unsigned xmm_dst,
          unsigned xmm_src)
{
   sse_mulps(
      func,
      make_xmm( xmm_dst ),
      make_xmm( xmm_src ) );
}

static void
emit_neg(
   struct x86_function *func,
   unsigned xmm )
{
   sse_xorps(
      func,
      make_xmm( xmm ),
      get_temp(
         TGSI_EXEC_TEMP_80000000_I,
         TGSI_EXEC_TEMP_80000000_C ) );
}

static void PIPE_CDECL
#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
__attribute__((force_align_arg_pointer))
#endif
pow4f(
   float *store )
{
#if defined(PIPE_ARCH_SSE)
   _mm_store_ps(&store[0], powf4( _mm_load_ps(&store[0]), _mm_load_ps(&store[4]) ));
#else
   store[0] = util_fast_pow( store[0], store[4] );
   store[1] = util_fast_pow( store[1], store[5] );
   store[2] = util_fast_pow( store[2], store[6] );
   store[3] = util_fast_pow( store[3], store[7] );
#endif
}

static void
emit_pow(
   struct x86_function *func,
   unsigned xmm_save,
   unsigned xmm_dst,
   unsigned xmm_src0,
   unsigned xmm_src1 )
{
   emit_func_call_dst_src2(
      func,
      xmm_save,
      xmm_dst,
      xmm_src0,
      xmm_src1,
      pow4f );
}

static void
emit_rcp (
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   /* On Intel CPUs at least, this is only accurate to 12 bits -- not
    * good enough. Need to either emit a proper divide or use the
    * iterative technique described below in emit_rsqrt().
    */
   sse2_rcpps(
      func,
      make_xmm( xmm_dst ),
      make_xmm( xmm_src ) );
}

static void PIPE_CDECL
rnd4f(
   float *store )
{
   store[0] = floorf( store[0] + 0.5f );
   store[1] = floorf( store[1] + 0.5f );
   store[2] = floorf( store[2] + 0.5f );
   store[3] = floorf( store[3] + 0.5f );
}

static void
emit_rnd(
   struct x86_function *func,
   unsigned xmm_save,
   unsigned xmm_dst )
{
   emit_func_call_dst_src1(
      func,
      xmm_save,
      xmm_dst,
      xmm_dst,
      rnd4f );
}

static void
emit_rsqrt(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
#if HIGH_PRECISION
   /* Although rsqrtps() and rcpps() are low precision on some/all SSE
    * implementations, it is possible to improve their precision at
    * fairly low cost, using a Newton-Raphson step, as below:
    *
    * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
    * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a)) * rsqrtps(a)]
    *
    * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
    */
   {
      struct x86_reg dst = make_xmm( xmm_dst );
      struct x86_reg src = make_xmm( xmm_src );
      struct x86_reg tmp0 = make_xmm( 2 );
      struct x86_reg tmp1 = make_xmm( 3 );

      assert( xmm_dst != xmm_src );
      assert( xmm_dst != 2 && xmm_dst != 3 );
      assert( xmm_src != 2 && xmm_src != 3 );

      sse_movaps( func, dst, get_temp( TGSI_EXEC_TEMP_HALF_I, TGSI_EXEC_TEMP_HALF_C ) );
      sse_movaps( func, tmp0, get_temp( TGSI_EXEC_TEMP_THREE_I, TGSI_EXEC_TEMP_THREE_C ) );
      sse_rsqrtps( func, tmp1, src );
      sse_mulps( func, src, tmp1 );
      sse_mulps( func, dst, tmp1 );
      sse_mulps( func, src, tmp1 );
      sse_subps( func, tmp0, src );
      sse_mulps( func, dst, tmp0 );
   }
#else
   /* On Intel CPUs at least, this is only accurate to 12 bits -- not
    * good enough.
    */
   sse_rsqrtps(
      func,
      make_xmm( xmm_dst ),
      make_xmm( xmm_src ) );
#endif
}
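
/* Scalar equivalent of the high-precision path above, for reference:
 *
 *    float r = rsqrt_approx(a);
 *    return 0.5f * r * (3.0f - a * r * r);
 */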

static void
emit_setsign(
   struct x86_function *func,
   unsigned xmm )
{
   sse_orps(
      func,
      make_xmm( xmm ),
      get_temp(
         TGSI_EXEC_TEMP_80000000_I,
         TGSI_EXEC_TEMP_80000000_C ) );
}

static void PIPE_CDECL
sgn4f(
   float *store )
{
   store[0] = store[0] < 0.0f ? -1.0f : store[0] > 0.0f ? 1.0f : 0.0f;
   store[1] = store[1] < 0.0f ? -1.0f : store[1] > 0.0f ? 1.0f : 0.0f;
   store[2] = store[2] < 0.0f ? -1.0f : store[2] > 0.0f ? 1.0f : 0.0f;
   store[3] = store[3] < 0.0f ? -1.0f : store[3] > 0.0f ? 1.0f : 0.0f;
}

static void
emit_sgn(
   struct x86_function *func,
   unsigned xmm_save,
   unsigned xmm_dst )
{
   emit_func_call_dst_src1(
      func,
      xmm_save,
      xmm_dst,
      xmm_dst,
      sgn4f );
}

static void PIPE_CDECL
sin4f(
   float *store )
{
   store[0] = sinf( store[0] );
   store[1] = sinf( store[1] );
   store[2] = sinf( store[2] );
   store[3] = sinf( store[3] );
}

static void
emit_sin (struct x86_function *func,
          unsigned xmm_save,
          unsigned xmm_dst)
{
   emit_func_call_dst_src1(
      func,
      xmm_save,
      xmm_dst,
      xmm_dst,
      sin4f );
}

static void
emit_sub(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   sse_subps(
      func,
      make_xmm( xmm_dst ),
      make_xmm( xmm_src ) );
}







/**
 * Register fetch.
 */

static void
emit_fetch(
   struct x86_function *func,
   unsigned xmm,
   const struct tgsi_full_src_register *reg,
   const unsigned chan_index )
{
   unsigned swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );

   switch (swizzle) {
   case TGSI_SWIZZLE_X:
   case TGSI_SWIZZLE_Y:
   case TGSI_SWIZZLE_Z:
   case TGSI_SWIZZLE_W:
      switch (reg->Register.File) {
      case TGSI_FILE_CONSTANT:
         emit_const(
            func,
            xmm,
            reg->Register.Index,
            swizzle,
            reg->Register.Indirect,
            reg->Indirect.File,
            reg->Indirect.Index );
         break;

      case TGSI_FILE_IMMEDIATE:
         emit_immediate(
            func,
            xmm,
            reg->Register.Index,
            swizzle );
         break;

      case TGSI_FILE_INPUT:
      case TGSI_FILE_SYSTEM_VALUE:
         emit_inputf(
            func,
            xmm,
            reg->Register.Index,
            swizzle );
         break;

      case TGSI_FILE_TEMPORARY:
         emit_tempf(
            func,
            xmm,
            reg->Register.Index,
            swizzle );
         break;

      default:
         assert( 0 );
      }
      break;

   default:
      assert( 0 );
   }

   switch( tgsi_util_get_full_src_register_sign_mode( reg, chan_index ) ) {
   case TGSI_UTIL_SIGN_CLEAR:
      emit_abs( func, xmm );
      break;

   case TGSI_UTIL_SIGN_SET:
      emit_setsign( func, xmm );
      break;

   case TGSI_UTIL_SIGN_TOGGLE:
      emit_neg( func, xmm );
      break;

   case TGSI_UTIL_SIGN_KEEP:
      break;
   }
}

#define FETCH( FUNC, INST, XMM, INDEX, CHAN )\
   emit_fetch( FUNC, XMM, &(INST).Src[INDEX], CHAN )
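
/* E.g. FETCH( func, *inst, 0, 0, CHAN_X ) loads the X channel of src
 * register 0 (after swizzling and any abs/negate modifier) into xmm0.
 */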

/**
 * Register store.
 */

static void
emit_store(
   struct x86_function *func,
   unsigned xmm,
   const struct tgsi_full_dst_register *reg,
   const struct tgsi_full_instruction *inst,
   unsigned chan_index )
{
   switch( inst->Instruction.Saturate ) {
   case TGSI_SAT_NONE:
      break;

   case TGSI_SAT_ZERO_ONE:
      sse_maxps(
         func,
         make_xmm( xmm ),
         get_temp(
            TGSI_EXEC_TEMP_00000000_I,
            TGSI_EXEC_TEMP_00000000_C ) );

      sse_minps(
         func,
         make_xmm( xmm ),
         get_temp(
            TGSI_EXEC_TEMP_ONE_I,
            TGSI_EXEC_TEMP_ONE_C ) );
      break;

   case TGSI_SAT_MINUS_PLUS_ONE:
      assert( 0 );
      break;
   }


   switch( reg->Register.File ) {
   case TGSI_FILE_OUTPUT:
      emit_output(
         func,
         xmm,
         reg->Register.Index,
         chan_index );
      break;

   case TGSI_FILE_TEMPORARY:
      emit_temps(
         func,
         xmm,
         reg->Register.Index,
         chan_index );
      break;

   case TGSI_FILE_ADDRESS:
      emit_addrs(
         func,
         xmm,
         reg->Register.Index,
         chan_index );
      break;

   default:
      assert( 0 );
   }
}

#define STORE( FUNC, INST, XMM, INDEX, CHAN )\
   emit_store( FUNC, XMM, &(INST).Dst[INDEX], &(INST), CHAN )


static void PIPE_CDECL
fetch_texel( struct tgsi_sampler **sampler,
             float *store )
{
#if 0
   uint j;

   debug_printf("%s sampler: %p (%p) store: %p\n",
                __FUNCTION__,
                sampler, *sampler,
                store );

   for (j = 0; j < 4; j++)
      debug_printf("sample %d texcoord %f %f %f lodbias %f\n",
                   j,
                   store[0+j],
                   store[4+j],
                   store[8 + j],
                   store[12 + j]);
#endif

   {
      float rgba[NUM_CHANNELS][QUAD_SIZE];
      (*sampler)->get_samples(*sampler,
                              &store[0],  /* s */
                              &store[4],  /* t */
                              &store[8],  /* r */
                              &store[12], /* lodbias */
                              tgsi_sampler_lod_bias,
                              rgba);      /* results */

      memcpy( store, rgba, 16 * sizeof(float));
   }

#if 0
   for (j = 0; j < 4; j++)
      debug_printf("sample %d result %f %f %f %f\n",
                   j,
                   store[0+j],
                   store[4+j],
                   store[8+j],
                   store[12+j]);
#endif
}
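
/* ABI of the call above: on entry store[0..3] = s, store[4..7] = t,
 * store[8..11] = r and store[12..15] = lodbias, one value per quad
 * element; on exit the same 16 floats hold the RGBA results, SOA order.
 */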

/**
 * High-level instruction translators.
 */

static void
emit_tex( struct x86_function *func,
          const struct tgsi_full_instruction *inst,
          boolean lodbias,
          boolean projected)
{
   const uint unit = inst->Src[1].Register.Index;
   struct x86_reg args[2];
   unsigned count;
   unsigned i;

   assert(inst->Instruction.Texture);
   switch (inst->Texture.Texture) {
   case TGSI_TEXTURE_1D:
      count = 1;
      break;
   case TGSI_TEXTURE_2D:
   case TGSI_TEXTURE_RECT:
      count = 2;
      break;
   case TGSI_TEXTURE_SHADOW1D:
   case TGSI_TEXTURE_SHADOW2D:
   case TGSI_TEXTURE_SHADOWRECT:
   case TGSI_TEXTURE_3D:
   case TGSI_TEXTURE_CUBE:
      count = 3;
      break;
   default:
      assert(0);
      return;
   }

   if (lodbias) {
      FETCH( func, *inst, 3, 0, 3 );
   }
   else {
      emit_tempf(
         func,
         3,
         TGSI_EXEC_TEMP_00000000_I,
         TGSI_EXEC_TEMP_00000000_C );

   }

   /* Store the lodbias whether enabled or not -- fetch_texel currently
    * always respects it.
    */
   sse_movaps( func,
               get_temp( TEMP_R0, 3 ),
               make_xmm( 3 ) );


   if (projected) {
      FETCH( func, *inst, 3, 0, 3 );

      emit_rcp( func, 3, 3 );
   }

   for (i = 0; i < count; i++) {
      FETCH( func, *inst, i, 0, i );

      if (projected) {
         sse_mulps(
            func,
            make_xmm( i ),
            make_xmm( 3 ) );
      }

      /* Store in the argument buffer:
       */
      sse_movaps(
         func,
         get_temp( TEMP_R0, i ),
         make_xmm( i ) );
   }

   args[0] = get_temp( TEMP_R0, 0 );
   args[1] = get_sampler_ptr( unit );


   emit_func_call( func,
                   0,
                   args,
                   Elements(args),
                   fetch_texel );

   /* If all four channels are enabled, could use a pointer to
    * dst[0].x instead of TEMP_R0 for store?
    */
   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, i ) {

      sse_movaps(
         func,
         make_xmm( 0 ),
         get_temp( TEMP_R0, i ) );

      STORE( func, *inst, 0, 0, i );
   }
}


static void
emit_kil(
   struct x86_function *func,
   const struct tgsi_full_src_register *reg )
{
   unsigned uniquemask;
   unsigned unique_count = 0;
   unsigned chan_index;
   unsigned i;

   /* This mask stores component bits that were already tested. Note that
    * we test if the value is less than zero, so 1.0 and 0.0 need not be
    * tested. */
   uniquemask = 0;

   FOR_EACH_CHANNEL( chan_index ) {
      unsigned swizzle;

      /* unswizzle channel */
      swizzle = tgsi_util_get_full_src_register_swizzle(
         reg,
         chan_index );

      /* check if the component has not been already tested */
      if( !(uniquemask & (1 << swizzle)) ) {
         uniquemask |= 1 << swizzle;

         /* allocate register */
         emit_fetch(
            func,
            unique_count++,
            reg,
            chan_index );
      }
   }

   x86_push(
      func,
      x86_make_reg( file_REG32, reg_AX ) );
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_DX ) );

   for (i = 0 ; i < unique_count; i++ ) {
      struct x86_reg dataXMM = make_xmm(i);

      sse_cmpps(
         func,
         dataXMM,
         get_temp(
            TGSI_EXEC_TEMP_00000000_I,
            TGSI_EXEC_TEMP_00000000_C ),
         cc_LessThan );

      if( i == 0 ) {
         sse_movmskps(
            func,
            x86_make_reg( file_REG32, reg_AX ),
            dataXMM );
      }
      else {
         sse_movmskps(
            func,
            x86_make_reg( file_REG32, reg_DX ),
            dataXMM );
         x86_or(
            func,
            x86_make_reg( file_REG32, reg_AX ),
            x86_make_reg( file_REG32, reg_DX ) );
      }
   }

   x86_or(
      func,
      get_temp(
         TGSI_EXEC_TEMP_KILMASK_I,
         TGSI_EXEC_TEMP_KILMASK_C ),
      x86_make_reg( file_REG32, reg_AX ) );

   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_DX ) );
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_AX ) );
}
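
/* E.g. for KIL with a .xxyy swizzle only the X and Y components are
 * fetched and tested, each exactly once, thanks to uniquemask above.
 */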


static void
emit_kilp(
   struct x86_function *func )
{
   /* XXX todo / fix me */
}


static void
emit_setcc(
   struct x86_function *func,
   struct tgsi_full_instruction *inst,
   enum sse_cc cc )
{
   unsigned chan_index;

   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
      FETCH( func, *inst, 0, 0, chan_index );
      FETCH( func, *inst, 1, 1, chan_index );
      sse_cmpps(
         func,
         make_xmm( 0 ),
         make_xmm( 1 ),
         cc );
      sse_andps(
         func,
         make_xmm( 0 ),
         get_temp(
            TEMP_ONE_I,
            TEMP_ONE_C ) );
      STORE( func, *inst, 0, 0, chan_index );
   }
}

static void
emit_cmp(
   struct x86_function *func,
   struct tgsi_full_instruction *inst )
{
   unsigned chan_index;

   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
      FETCH( func, *inst, 0, 0, chan_index );
      FETCH( func, *inst, 1, 1, chan_index );
      FETCH( func, *inst, 2, 2, chan_index );
      sse_cmpps(
         func,
         make_xmm( 0 ),
         get_temp(
            TGSI_EXEC_TEMP_00000000_I,
            TGSI_EXEC_TEMP_00000000_C ),
         cc_LessThan );
      sse_andps(
         func,
         make_xmm( 1 ),
         make_xmm( 0 ) );
      sse_andnps(
         func,
         make_xmm( 0 ),
         make_xmm( 2 ) );
      sse_orps(
         func,
         make_xmm( 0 ),
         make_xmm( 1 ) );
      STORE( func, *inst, 0, 0, chan_index );
   }
}


/**
 * Check if inst src/dest regs use indirect addressing into temporary
 * register file.
 */
static boolean
indirect_temp_reference(const struct tgsi_full_instruction *inst)
{
   uint i;
   for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
      const struct tgsi_full_src_register *reg = &inst->Src[i];
      if (reg->Register.File == TGSI_FILE_TEMPORARY &&
          reg->Register.Indirect)
         return TRUE;
   }
   for (i = 0; i < inst->Instruction.NumDstRegs; i++) {
      const struct tgsi_full_dst_register *reg = &inst->Dst[i];
      if (reg->Register.File == TGSI_FILE_TEMPORARY &&
          reg->Register.Indirect)
         return TRUE;
   }
   return FALSE;
}


static int
emit_instruction(
   struct x86_function *func,
   struct tgsi_full_instruction *inst )
{
   unsigned chan_index;

   /* we can't handle indirect addressing into temp register file yet */
   if (indirect_temp_reference(inst))
      return FALSE;

   switch (inst->Instruction.Opcode) {
   case TGSI_OPCODE_ARL:
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         FETCH( func, *inst, 0, 0, chan_index );
         emit_flr(func, 0, 0);
         emit_f2it( func, 0 );
         STORE( func, *inst, 0, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_MOV:
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         FETCH( func, *inst, 4 + chan_index, 0, chan_index );
      }
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         STORE( func, *inst, 4 + chan_index, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_LIT:
      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
         emit_tempf(
            func,
            0,
            TEMP_ONE_I,
            TEMP_ONE_C);
         if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
            STORE( func, *inst, 0, 0, CHAN_X );
         }
         if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
            STORE( func, *inst, 0, 0, CHAN_W );
         }
      }
      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
         if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
            FETCH( func, *inst, 0, 0, CHAN_X );
            sse_maxps(
               func,
               make_xmm( 0 ),
               get_temp(
                  TGSI_EXEC_TEMP_00000000_I,
                  TGSI_EXEC_TEMP_00000000_C ) );
            STORE( func, *inst, 0, 0, CHAN_Y );
         }
         if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
            /* XMM[1] = SrcReg[0].yyyy */
            FETCH( func, *inst, 1, 0, CHAN_Y );
            /* XMM[1] = max(XMM[1], 0) */
            sse_maxps(
               func,
               make_xmm( 1 ),
               get_temp(
                  TGSI_EXEC_TEMP_00000000_I,
                  TGSI_EXEC_TEMP_00000000_C ) );
            /* XMM[2] = SrcReg[0].wwww */
            FETCH( func, *inst, 2, 0, CHAN_W );
            /* XMM[2] = min(XMM[2], 128.0) */
            sse_minps(
               func,
               make_xmm( 2 ),
               get_temp(
                  TGSI_EXEC_TEMP_128_I,
                  TGSI_EXEC_TEMP_128_C ) );
            /* XMM[2] = max(XMM[2], -128.0) */
            sse_maxps(
               func,
               make_xmm( 2 ),
               get_temp(
                  TGSI_EXEC_TEMP_MINUS_128_I,
                  TGSI_EXEC_TEMP_MINUS_128_C ) );
            emit_pow( func, 3, 1, 1, 2 );
            FETCH( func, *inst, 0, 0, CHAN_X );
            sse_xorps(
               func,
               make_xmm( 2 ),
               make_xmm( 2 ) );
            sse_cmpps(
               func,
               make_xmm( 2 ),
               make_xmm( 0 ),
               cc_LessThan );
            sse_andps(
               func,
               make_xmm( 2 ),
               make_xmm( 1 ) );
            STORE( func, *inst, 2, 0, CHAN_Z );
         }
      }
      break;

   case TGSI_OPCODE_RCP:
      FETCH( func, *inst, 0, 0, CHAN_X );
      emit_rcp( func, 0, 0 );
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         STORE( func, *inst, 0, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_RSQ:
      FETCH( func, *inst, 0, 0, CHAN_X );
      emit_abs( func, 0 );
      emit_rsqrt( func, 1, 0 );
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         STORE( func, *inst, 1, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_EXP:
      if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
         FETCH( func, *inst, 0, 0, CHAN_X );
         if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
             IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
            emit_MOV( func, 1, 0 );
            emit_flr( func, 2, 1 );
            /* dst.x = ex2(floor(src.x)) */
            if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
               emit_MOV( func, 2, 1 );
               emit_ex2( func, 3, 2 );
               STORE( func, *inst, 2, 0, CHAN_X );
            }
            /* dst.y = src.x - floor(src.x) */
            if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
               emit_MOV( func, 2, 0 );
               emit_sub( func, 2, 1 );
               STORE( func, *inst, 2, 0, CHAN_Y );
            }
         }
         /* dst.z = ex2(src.x) */
         if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
            emit_ex2( func, 3, 0 );
            STORE( func, *inst, 0, 0, CHAN_Z );
         }
      }
      /* dst.w = 1.0 */
      if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
         emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
         STORE( func, *inst, 0, 0, CHAN_W );
      }
      break;

   case TGSI_OPCODE_LOG:
      if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
         FETCH( func, *inst, 0, 0, CHAN_X );
         emit_abs( func, 0 );
         emit_MOV( func, 1, 0 );
         emit_lg2( func, 2, 1 );
         /* dst.z = lg2(abs(src.x)) */
         if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
            STORE( func, *inst, 1, 0, CHAN_Z );
         }
         if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
             IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
            emit_flr( func, 2, 1 );
            /* dst.x = floor(lg2(abs(src.x))) */
            if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
               STORE( func, *inst, 1, 0, CHAN_X );
            }
            /* dst.y = abs(src.x)/ex2(floor(lg2(abs(src.x)))) */
            if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
               emit_ex2( func, 2, 1 );
               emit_rcp( func, 1, 1 );
               emit_mul( func, 0, 1 );
               STORE( func, *inst, 0, 0, CHAN_Y );
            }
         }
      }
      /* dst.w = 1.0 */
      if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
         emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
         STORE( func, *inst, 0, 0, CHAN_W );
      }
      break;

   case TGSI_OPCODE_MUL:
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         FETCH( func, *inst, 0, 0, chan_index );
         FETCH( func, *inst, 1, 1, chan_index );
         emit_mul( func, 0, 1 );
         STORE( func, *inst, 0, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_ADD:
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         FETCH( func, *inst, 0, 0, chan_index );
         FETCH( func, *inst, 1, 1, chan_index );
         emit_add( func, 0, 1 );
         STORE( func, *inst, 0, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_DP3:
      FETCH( func, *inst, 0, 0, CHAN_X );
      FETCH( func, *inst, 1, 1, CHAN_X );
      emit_mul( func, 0, 1 );
      FETCH( func, *inst, 1, 0, CHAN_Y );
      FETCH( func, *inst, 2, 1, CHAN_Y );
      emit_mul( func, 1, 2 );
      emit_add( func, 0, 1 );
      FETCH( func, *inst, 1, 0, CHAN_Z );
      FETCH( func, *inst, 2, 1, CHAN_Z );
      emit_mul( func, 1, 2 );
      emit_add( func, 0, 1 );
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         STORE( func, *inst, 0, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_DP4:
      FETCH( func, *inst, 0, 0, CHAN_X );
      FETCH( func, *inst, 1, 1, CHAN_X );
      emit_mul( func, 0, 1 );
      FETCH( func, *inst, 1, 0, CHAN_Y );
      FETCH( func, *inst, 2, 1, CHAN_Y );
      emit_mul( func, 1, 2 );
      emit_add( func, 0, 1 );
      FETCH( func, *inst, 1, 0, CHAN_Z );
      FETCH( func, *inst, 2, 1, CHAN_Z );
      emit_mul(func, 1, 2 );
      emit_add(func, 0, 1 );
      FETCH( func, *inst, 1, 0, CHAN_W );
      FETCH( func, *inst, 2, 1, CHAN_W );
      emit_mul( func, 1, 2 );
      emit_add( func, 0, 1 );
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         STORE( func, *inst, 0, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_DST:
      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
         emit_tempf(
            func,
            0,
            TEMP_ONE_I,
            TEMP_ONE_C );
         STORE( func, *inst, 0, 0, CHAN_X );
      }
      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
         FETCH( func, *inst, 0, 0, CHAN_Y );
         FETCH( func, *inst, 1, 1, CHAN_Y );
         emit_mul( func, 0, 1 );
         STORE( func, *inst, 0, 0, CHAN_Y );
      }
      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
         FETCH( func, *inst, 0, 0, CHAN_Z );
         STORE( func, *inst, 0, 0, CHAN_Z );
      }
      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
         FETCH( func, *inst, 0, 1, CHAN_W );
         STORE( func, *inst, 0, 0, CHAN_W );
      }
      break;

   case TGSI_OPCODE_MIN:
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         FETCH( func, *inst, 0, 0, chan_index );
         FETCH( func, *inst, 1, 1, chan_index );
         sse_minps(
            func,
            make_xmm( 0 ),
            make_xmm( 1 ) );
         STORE( func, *inst, 0, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_MAX:
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         FETCH( func, *inst, 0, 0, chan_index );
         FETCH( func, *inst, 1, 1, chan_index );
         sse_maxps(
            func,
            make_xmm( 0 ),
            make_xmm( 1 ) );
         STORE( func, *inst, 0, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_SLT:
      emit_setcc( func, inst, cc_LessThan );
      break;

   case TGSI_OPCODE_SGE:
      emit_setcc( func, inst, cc_NotLessThan );
      break;

   case TGSI_OPCODE_MAD:
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         FETCH( func, *inst, 0, 0, chan_index );
         FETCH( func, *inst, 1, 1, chan_index );
         FETCH( func, *inst, 2, 2, chan_index );
         emit_mul( func, 0, 1 );
         emit_add( func, 0, 2 );
         STORE( func, *inst, 0, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_SUB:
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         FETCH( func, *inst, 0, 0, chan_index );
         FETCH( func, *inst, 1, 1, chan_index );
         emit_sub( func, 0, 1 );
         STORE( func, *inst, 0, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_LRP:
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         FETCH( func, *inst, 0, 0, chan_index );
         FETCH( func, *inst, 1, 1, chan_index );
         FETCH( func, *inst, 2, 2, chan_index );
         emit_sub( func, 1, 2 );
         emit_mul( func, 0, 1 );
         emit_add( func, 0, 2 );
         STORE( func, *inst, 0, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_CND:
      return 0;
      break;

   case TGSI_OPCODE_DP2A:
      FETCH( func, *inst, 0, 0, CHAN_X );  /* xmm0 = src[0].x */
      FETCH( func, *inst, 1, 1, CHAN_X );  /* xmm1 = src[1].x */
      emit_mul( func, 0, 1 );              /* xmm0 = xmm0 * xmm1 */
      FETCH( func, *inst, 1, 0, CHAN_Y );  /* xmm1 = src[0].y */
      FETCH( func, *inst, 2, 1, CHAN_Y );  /* xmm2 = src[1].y */
      emit_mul( func, 1, 2 );              /* xmm1 = xmm1 * xmm2 */
      emit_add( func, 0, 1 );              /* xmm0 = xmm0 + xmm1 */
      FETCH( func, *inst, 1, 2, CHAN_X );  /* xmm1 = src[2].x */
      emit_add( func, 0, 1 );              /* xmm0 = xmm0 + xmm1 */
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         STORE( func, *inst, 0, 0, chan_index );  /* dest[ch] = xmm0 */
      }
      break;

   case TGSI_OPCODE_FRC:
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         FETCH( func, *inst, 0, 0, chan_index );
         emit_frc( func, 0, 0 );
         STORE( func, *inst, 0, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_CLAMP:
      return 0;
      break;

   case TGSI_OPCODE_FLR:
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         FETCH( func, *inst, 0, 0, chan_index );
         emit_flr( func, 0, 0 );
         STORE( func, *inst, 0, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_ROUND:
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         FETCH( func, *inst, 0, 0, chan_index );
         emit_rnd( func, 0, 0 );
         STORE( func, *inst, 0, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_EX2:
      FETCH( func, *inst, 0, 0, CHAN_X );
      emit_ex2( func, 0, 0 );
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         STORE( func, *inst, 0, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_LG2:
      FETCH( func, *inst, 0, 0, CHAN_X );
      emit_lg2( func, 0, 0 );
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         STORE( func, *inst, 0, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_POW:
      FETCH( func, *inst, 0, 0, CHAN_X );
      FETCH( func, *inst, 1, 1, CHAN_X );
      emit_pow( func, 0, 0, 0, 1 );
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         STORE( func, *inst, 0, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_XPD:
      /* Note: we do all stores after all operands have been fetched
       * to avoid src/dst register aliasing issues for an instruction
       * such as:  XPD TEMP[2].xyz, TEMP[0], TEMP[2];
       */
      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
         FETCH( func, *inst, 1, 1, CHAN_Z );  /* xmm[1] = src[1].z */
         FETCH( func, *inst, 3, 0, CHAN_Z );  /* xmm[3] = src[0].z */
      }
      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
         FETCH( func, *inst, 0, 0, CHAN_Y );  /* xmm[0] = src[0].y */
         FETCH( func, *inst, 4, 1, CHAN_Y );  /* xmm[4] = src[1].y */
      }
      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
         emit_MOV( func, 7, 0 );  /* xmm[7] = xmm[0] */
         emit_mul( func, 7, 1 );  /* xmm[7] = xmm[7] * xmm[1] */
         emit_MOV( func, 5, 3 );  /* xmm[5] = xmm[3] */
         emit_mul( func, 5, 4 );  /* xmm[5] = xmm[5] * xmm[4] */
         emit_sub( func, 7, 5 );  /* xmm[7] = xmm[7] - xmm[5] */
2169 /* store xmm[7] in dst.x below */
2170 }
2171 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
2172 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
2173 FETCH( func, *inst, 2, 1, CHAN_X ); /* xmm[2] = src[1].x */
2174 FETCH( func, *inst, 5, 0, CHAN_X ); /* xmm[5] = src[0].x */
2175 }
2176 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2177 emit_mul( func, 3, 2 ); /* xmm[3] = xmm[3] * xmm[2] */
2178 emit_mul( func, 1, 5 ); /* xmm[1] = xmm[1] * xmm[5] */
2179 emit_sub( func, 3, 1 ); /* xmm[3] = xmm[3] - xmm[1] */
2180 /* store xmm[3] in dst.y below */
2181 }
2182 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2183 emit_mul( func, 5, 4 ); /* xmm[5] = xmm[5] * xmm[4] */
2184 emit_mul( func, 0, 2 ); /* xmm[0] = xmm[0] * xmm[2] */
2185 emit_sub( func, 5, 0 ); /* xmm[5] = xmm[5] - xmm[0] */
2186 STORE( func, *inst, 5, 0, CHAN_Z ); /* dst.z = xmm[5] */
2187 }
2188 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
2189 STORE( func, *inst, 7, 0, CHAN_X ); /* dst.x = xmm[7] */
2190 }
2191 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2192 STORE( func, *inst, 3, 0, CHAN_Y ); /* dst.y = xmm[3] */
2193 }
2194 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2195 emit_tempf(
2196 func,
2197 0,
2198 TEMP_ONE_I,
2199 TEMP_ONE_C );
2200 STORE( func, *inst, 0, 0, CHAN_W );
2201 }
2202 break;
2203
2204 case TGSI_OPCODE_ABS:
2205 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2206 FETCH( func, *inst, 0, 0, chan_index );
2207 emit_abs( func, 0) ;
2208
2209 STORE( func, *inst, 0, 0, chan_index );
2210 }
2211 break;
2212
2213 case TGSI_OPCODE_RCC:
2214 return 0;
2215 break;
2216
2217 case TGSI_OPCODE_DPH:
2218 FETCH( func, *inst, 0, 0, CHAN_X );
2219 FETCH( func, *inst, 1, 1, CHAN_X );
2220 emit_mul( func, 0, 1 );
2221 FETCH( func, *inst, 1, 0, CHAN_Y );
2222 FETCH( func, *inst, 2, 1, CHAN_Y );
2223 emit_mul( func, 1, 2 );
2224 emit_add( func, 0, 1 );
2225 FETCH( func, *inst, 1, 0, CHAN_Z );
2226 FETCH( func, *inst, 2, 1, CHAN_Z );
2227 emit_mul( func, 1, 2 );
2228 emit_add( func, 0, 1 );
2229 FETCH( func, *inst, 1, 1, CHAN_W );
2230 emit_add( func, 0, 1 );
2231 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2232 STORE( func, *inst, 0, 0, chan_index );
2233 }
2234 break;
2235
2236 case TGSI_OPCODE_COS:
2237 FETCH( func, *inst, 0, 0, CHAN_X );
2238 emit_cos( func, 0, 0 );
2239 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2240 STORE( func, *inst, 0, 0, chan_index );
2241 }
2242 break;
2243
2244 case TGSI_OPCODE_DDX:
2245 return 0;
2246 break;
2247
2248 case TGSI_OPCODE_DDY:
2249 return 0;
2250 break;
2251
2252 case TGSI_OPCODE_KILP:
2253 /* predicated kill */
2254 emit_kilp( func );
2255 return 0; /* XXX fix me */
2256 break;
2257
2258 case TGSI_OPCODE_KIL:
2259 /* conditional kill */
2260 emit_kil( func, &inst->Src[0] );
2261 break;
2262
2263 case TGSI_OPCODE_PK2H:
2264 return 0;
2265 break;
2266
2267 case TGSI_OPCODE_PK2US:
2268 return 0;
2269 break;
2270
2271 case TGSI_OPCODE_PK4B:
2272 return 0;
2273 break;
2274
2275 case TGSI_OPCODE_PK4UB:
2276 return 0;
2277 break;
2278
2279 case TGSI_OPCODE_RFL:
2280 return 0;
2281 break;
2282
2283 case TGSI_OPCODE_SEQ:
2284 emit_setcc( func, inst, cc_Equal );
2285 break;
2286
2287 case TGSI_OPCODE_SFL:
2288 return 0;
2289 break;
2290
2291 case TGSI_OPCODE_SGT:
2292 emit_setcc( func, inst, cc_NotLessThanEqual );
2293 break;
2294
2295 case TGSI_OPCODE_SIN:
2296 FETCH( func, *inst, 0, 0, CHAN_X );
2297 emit_sin( func, 0, 0 );
2298 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2299 STORE( func, *inst, 0, 0, chan_index );
2300 }
2301 break;
2302
2303 case TGSI_OPCODE_SLE:
2304 emit_setcc( func, inst, cc_LessThanEqual );
2305 break;
2306
2307 case TGSI_OPCODE_SNE:
2308 emit_setcc( func, inst, cc_NotEqual );
2309 break;
2310
2311 case TGSI_OPCODE_STR:
2312 return 0;
2313 break;
2314
2315 case TGSI_OPCODE_TEX:
2316 emit_tex( func, inst, FALSE, FALSE );
2317 break;
2318
2319 case TGSI_OPCODE_TXD:
2320 return 0;
2321 break;
2322
2323 case TGSI_OPCODE_UP2H:
2324 return 0;
2325 break;
2326
2327 case TGSI_OPCODE_UP2US:
2328 return 0;
2329 break;
2330
2331 case TGSI_OPCODE_UP4B:
2332 return 0;
2333 break;
2334
2335 case TGSI_OPCODE_UP4UB:
2336 return 0;
2337 break;
2338
2339 case TGSI_OPCODE_X2D:
2340 return 0;
2341 break;
2342
2343 case TGSI_OPCODE_ARA:
2344 return 0;
2345 break;
2346
2347 case TGSI_OPCODE_ARR:
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         FETCH( func, *inst, 0, 0, chan_index );
         emit_rnd( func, 0, 0 );
         emit_f2it( func, 0 );
         STORE( func, *inst, 0, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_BRA:
      return 0;
      break;

   case TGSI_OPCODE_CAL:
      return 0;
      break;

   case TGSI_OPCODE_RET:
      emit_ret( func );
      break;

   case TGSI_OPCODE_END:
      break;

   case TGSI_OPCODE_SSG:
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         FETCH( func, *inst, 0, 0, chan_index );
         emit_sgn( func, 0, 0 );
         STORE( func, *inst, 0, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_CMP:
      emit_cmp( func, inst );
      break;

   case TGSI_OPCODE_SCS:
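      /* dst = (cos(src.x), sin(src.x), 0.0, 1.0) */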
      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
         FETCH( func, *inst, 0, 0, CHAN_X );
         emit_cos( func, 0, 0 );
         STORE( func, *inst, 0, 0, CHAN_X );
      }
      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
         FETCH( func, *inst, 0, 0, CHAN_X );
         emit_sin( func, 0, 0 );
         STORE( func, *inst, 0, 0, CHAN_Y );
      }
      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
         emit_tempf(
            func,
            0,
            TGSI_EXEC_TEMP_00000000_I,
            TGSI_EXEC_TEMP_00000000_C );
         STORE( func, *inst, 0, 0, CHAN_Z );
      }
      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
         emit_tempf(
            func,
            0,
            TEMP_ONE_I,
            TEMP_ONE_C );
         STORE( func, *inst, 0, 0, CHAN_W );
      }
      break;

   case TGSI_OPCODE_TXB:
      emit_tex( func, inst, TRUE, FALSE );
      break;

   case TGSI_OPCODE_NRM:
      /* fall-through */
   case TGSI_OPCODE_NRM4:
      /* 3 or 4-component normalization */
      {
         uint dims = (inst->Instruction.Opcode == TGSI_OPCODE_NRM) ? 3 : 4;

         if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) ||
             IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y) ||
             IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z) ||
             (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 4)) {

            /* NOTE: Cannot use xmm regs 2/3 here (see emit_rsqrt() above). */

            /* xmm4 = src.x */
            /* xmm0 = src.x * src.x */
            FETCH(func, *inst, 0, 0, CHAN_X);
            if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
               emit_MOV(func, 4, 0);
            }
            emit_mul(func, 0, 0);

            /* xmm5 = src.y */
            /* xmm0 = xmm0 + src.y * src.y */
            FETCH(func, *inst, 1, 0, CHAN_Y);
            if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
               emit_MOV(func, 5, 1);
            }
            emit_mul(func, 1, 1);
            emit_add(func, 0, 1);

            /* xmm6 = src.z */
            /* xmm0 = xmm0 + src.z * src.z */
            FETCH(func, *inst, 1, 0, CHAN_Z);
            if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
               emit_MOV(func, 6, 1);
            }
            emit_mul(func, 1, 1);
            emit_add(func, 0, 1);

            if (dims == 4) {
               /* xmm7 = src.w */
               /* xmm0 = xmm0 + src.w * src.w */
               FETCH(func, *inst, 1, 0, CHAN_W);
               if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W)) {
                  emit_MOV(func, 7, 1);
               }
               emit_mul(func, 1, 1);
               emit_add(func, 0, 1);
            }

            /* xmm1 = 1 / sqrt(xmm0) */
            emit_rsqrt(func, 1, 0);

            /* dst.x = xmm1 * src.x */
            if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
               emit_mul(func, 4, 1);
               STORE(func, *inst, 4, 0, CHAN_X);
            }

            /* dst.y = xmm1 * src.y */
            if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
               emit_mul(func, 5, 1);
               STORE(func, *inst, 5, 0, CHAN_Y);
            }

            /* dst.z = xmm1 * src.z */
            if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
               emit_mul(func, 6, 1);
               STORE(func, *inst, 6, 0, CHAN_Z);
            }

            /* dst.w = xmm1 * src.w */
            if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 4) {
               emit_mul(func, 7, 1);
               STORE(func, *inst, 7, 0, CHAN_W);
            }
         }

         /* dst0.w = 1.0 */
         if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 3) {
            emit_tempf(func, 0, TEMP_ONE_I, TEMP_ONE_C);
            STORE(func, *inst, 0, 0, CHAN_W);
         }
      }
      break;

   case TGSI_OPCODE_DIV:
      return 0;
      break;

   case TGSI_OPCODE_DP2:
      FETCH( func, *inst, 0, 0, CHAN_X );  /* xmm0 = src[0].x */
      FETCH( func, *inst, 1, 1, CHAN_X );  /* xmm1 = src[1].x */
      emit_mul( func, 0, 1 );              /* xmm0 = xmm0 * xmm1 */
      FETCH( func, *inst, 1, 0, CHAN_Y );  /* xmm1 = src[0].y */
      FETCH( func, *inst, 2, 1, CHAN_Y );  /* xmm2 = src[1].y */
      emit_mul( func, 1, 2 );              /* xmm1 = xmm1 * xmm2 */
      emit_add( func, 0, 1 );              /* xmm0 = xmm0 + xmm1 */
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         STORE( func, *inst, 0, 0, chan_index );  /* dest[ch] = xmm0 */
      }
      break;

   case TGSI_OPCODE_TXL:
      return 0;
      break;

   case TGSI_OPCODE_TXP:
      emit_tex( func, inst, FALSE, TRUE );
      break;

   case TGSI_OPCODE_BRK:
      return 0;
      break;

   case TGSI_OPCODE_IF:
      return 0;
      break;

   case TGSI_OPCODE_BGNFOR:
      return 0;
      break;

   case TGSI_OPCODE_REP:
      return 0;
      break;

   case TGSI_OPCODE_ELSE:
      return 0;
      break;

   case TGSI_OPCODE_ENDIF:
      return 0;
      break;

   case TGSI_OPCODE_ENDFOR:
      return 0;
      break;

   case TGSI_OPCODE_ENDREP:
      return 0;
      break;

   case TGSI_OPCODE_PUSHA:
      return 0;
      break;

   case TGSI_OPCODE_POPA:
      return 0;
      break;

   case TGSI_OPCODE_CEIL:
      return 0;
      break;

   case TGSI_OPCODE_I2F:
      return 0;
      break;

   case TGSI_OPCODE_NOT:
      return 0;
      break;

   case TGSI_OPCODE_TRUNC:
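      /* round toward zero via a truncating float->int->float round trip */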
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         FETCH( func, *inst, 0, 0, chan_index );
         emit_f2it( func, 0 );
         emit_i2f( func, 0 );
         STORE( func, *inst, 0, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_SHL:
      return 0;
      break;

   case TGSI_OPCODE_ISHR:
      return 0;
      break;

   case TGSI_OPCODE_AND:
      return 0;
      break;

   case TGSI_OPCODE_OR:
      return 0;
      break;

   case TGSI_OPCODE_MOD:
      return 0;
      break;

   case TGSI_OPCODE_XOR:
      return 0;
      break;

   case TGSI_OPCODE_SAD:
      return 0;
      break;

   case TGSI_OPCODE_TXF:
      return 0;
      break;

   case TGSI_OPCODE_TXQ:
      return 0;
      break;

   case TGSI_OPCODE_CONT:
      return 0;
      break;

   case TGSI_OPCODE_EMIT:
      return 0;
      break;

   case TGSI_OPCODE_ENDPRIM:
      return 0;
      break;

   default:
      return 0;
   }

   return 1;
}

static void
emit_declaration(
   struct x86_function *func,
   struct tgsi_full_declaration *decl )
{
   if( decl->Declaration.File == TGSI_FILE_INPUT ||
       decl->Declaration.File == TGSI_FILE_SYSTEM_VALUE ) {
      unsigned first, last, mask;
      unsigned i, j;

      first = decl->Range.First;
      last = decl->Range.Last;
      mask = decl->Declaration.UsageMask;

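      /* Interpolate each enabled channel of each input register:
       * constant inputs just copy a0, linear inputs evaluate
       * a0 + x*dadx + y*dady, and perspective inputs divide that
       * result by w.
       */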
      for( i = first; i <= last; i++ ) {
         for( j = 0; j < NUM_CHANNELS; j++ ) {
            if( mask & (1 << j) ) {
               switch( decl->Declaration.Interpolate ) {
               case TGSI_INTERPOLATE_CONSTANT:
                  emit_coef_a0( func, 0, i, j );
                  emit_inputs( func, 0, i, j );
                  break;

               case TGSI_INTERPOLATE_LINEAR:
                  emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
                  emit_coef_dadx( func, 1, i, j );
                  emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
                  emit_coef_dady( func, 3, i, j );
                  emit_mul( func, 0, 1 );    /* x * dadx */
                  emit_coef_a0( func, 4, i, j );
                  emit_mul( func, 2, 3 );    /* y * dady */
                  emit_add( func, 0, 4 );    /* x * dadx + a0 */
                  emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
                  emit_inputs( func, 0, i, j );
                  break;

               case TGSI_INTERPOLATE_PERSPECTIVE:
                  emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
                  emit_coef_dadx( func, 1, i, j );
                  emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
                  emit_coef_dady( func, 3, i, j );
                  emit_mul( func, 0, 1 );    /* x * dadx */
                  emit_tempf( func, 4, 0, TGSI_SWIZZLE_W );
                  emit_coef_a0( func, 5, i, j );
                  emit_rcp( func, 4, 4 );    /* 1.0 / w */
                  emit_mul( func, 2, 3 );    /* y * dady */
                  emit_add( func, 0, 5 );    /* x * dadx + a0 */
                  emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
                  emit_mul( func, 0, 4 );    /* (x * dadx + y * dady + a0) / w */
                  emit_inputs( func, 0, i, j );
                  break;

               default:
                  assert( 0 );
                  break;
               }
            }
         }
      }
   }
}

static void aos_to_soa( struct x86_function *func,
                        uint arg_aos,
                        uint arg_machine,
                        uint arg_num,
                        uint arg_stride )
{
   struct x86_reg soa_input = x86_make_reg( file_REG32, reg_AX );
   struct x86_reg aos_input = x86_make_reg( file_REG32, reg_BX );
   struct x86_reg num_inputs = x86_make_reg( file_REG32, reg_CX );
   struct x86_reg stride = x86_make_reg( file_REG32, reg_DX );
   int inner_loop;

   /* Save EBX */
   x86_push( func, x86_make_reg( file_REG32, reg_BX ) );

   x86_mov( func, aos_input, x86_fn_arg( func, arg_aos ) );
   x86_mov( func, soa_input, x86_fn_arg( func, arg_machine ) );
   x86_lea( func, soa_input,
            x86_make_disp( soa_input,
                           Offset(struct tgsi_exec_machine, Inputs) ) );
   x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) );
   x86_mov( func, stride, x86_fn_arg( func, arg_stride ) );

   /* do */
   inner_loop = x86_get_label( func );
   {
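      /* Transpose four consecutive XYZW vertices into X, Y, Z and W
       * vectors: movlps/movhps gather two vertices per xmm register,
       * then shufps 0x88 keeps the even lanes (x or z) and 0xdd the
       * odd lanes (y or w).
       */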
      x86_push( func, aos_input );
      sse_movlps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
      sse_movlps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
      x86_add( func, aos_input, stride );
      sse_movhps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
      sse_movhps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
      x86_add( func, aos_input, stride );
      sse_movlps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
      sse_movlps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
      x86_add( func, aos_input, stride );
      sse_movhps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
      sse_movhps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
      x86_pop( func, aos_input );

      sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
      sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
      sse_shufps( func, make_xmm( 0 ), make_xmm( 1 ), 0x88 );
      sse_shufps( func, make_xmm( 2 ), make_xmm( 1 ), 0xdd );
      sse_shufps( func, make_xmm( 3 ), make_xmm( 4 ), 0x88 );
      sse_shufps( func, make_xmm( 5 ), make_xmm( 4 ), 0xdd );

      sse_movups( func, x86_make_disp( soa_input, 0 ), make_xmm( 0 ) );
      sse_movups( func, x86_make_disp( soa_input, 16 ), make_xmm( 2 ) );
      sse_movups( func, x86_make_disp( soa_input, 32 ), make_xmm( 3 ) );
      sse_movups( func, x86_make_disp( soa_input, 48 ), make_xmm( 5 ) );

      /* Advance to next input */
      x86_lea( func, aos_input, x86_make_disp(aos_input, 16) );
      x86_lea( func, soa_input, x86_make_disp(soa_input, 64) );
   }
   /* while --num_inputs */
   x86_dec( func, num_inputs );
   x86_jcc( func, cc_NE, inner_loop );

   /* Restore EBX */
   x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
}

static void soa_to_aos( struct x86_function *func,
                        uint arg_aos,
                        uint arg_machine,
                        uint arg_num,
                        uint arg_stride )
{
   struct x86_reg soa_output = x86_make_reg( file_REG32, reg_AX );
   struct x86_reg aos_output = x86_make_reg( file_REG32, reg_BX );
   struct x86_reg num_outputs = x86_make_reg( file_REG32, reg_CX );
   struct x86_reg temp = x86_make_reg( file_REG32, reg_DX );
   int inner_loop;

   /* Save EBX */
   x86_push( func, x86_make_reg( file_REG32, reg_BX ) );

   x86_mov( func, aos_output, x86_fn_arg( func, arg_aos ) );
   x86_mov( func, soa_output, x86_fn_arg( func, arg_machine ) );
   x86_lea( func, soa_output,
            x86_make_disp( soa_output,
                           Offset(struct tgsi_exec_machine, Outputs) ) );
   x86_mov( func, num_outputs, x86_fn_arg( func, arg_num ) );

   /* do */
   inner_loop = x86_get_label( func );
   {
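      /* Transpose back: unpcklps/unpckhps re-interleave the X/Y and
       * Z/W vectors, then movlps/movhps scatter one reassembled XYZW
       * vertex per stride step.
       */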
      sse_movups( func, make_xmm( 0 ), x86_make_disp( soa_output, 0 ) );
      sse_movups( func, make_xmm( 1 ), x86_make_disp( soa_output, 16 ) );
      sse_movups( func, make_xmm( 3 ), x86_make_disp( soa_output, 32 ) );
      sse_movups( func, make_xmm( 4 ), x86_make_disp( soa_output, 48 ) );

      sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
      sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
      sse_unpcklps( func, make_xmm( 0 ), make_xmm( 1 ) );
      sse_unpckhps( func, make_xmm( 2 ), make_xmm( 1 ) );
      sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) );
      sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) );

      x86_mov( func, temp, x86_fn_arg( func, arg_stride ) );
      x86_push( func, aos_output );
      sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
      sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
      x86_add( func, aos_output, temp );
      sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
      sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
      x86_add( func, aos_output, temp );
      sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
      sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
      x86_add( func, aos_output, temp );
      sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
      sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
      x86_pop( func, aos_output );

      /* Advance to next output */
      x86_lea( func, aos_output, x86_make_disp(aos_output, 16) );
      x86_lea( func, soa_output, x86_make_disp(soa_output, 64) );
   }
   /* while --num_outputs */
   x86_dec( func, num_outputs );
   x86_jcc( func, cc_NE, inner_loop );

   /* Restore EBX */
   x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
}

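/* A minimal calling sketch (illustrative only: x86_init_func(),
 * x86_get_func() and x86_release_func() come from rtasm_x86sse.h, but
 * codegen_function is a hypothetical typedef for the generated code's
 * signature -- machine, constants and immediates, plus interpolation
 * coefficients for fragment shaders):
 *
 *    struct x86_function func;
 *    float imms[TGSI_EXEC_NUM_IMMEDIATES][4];
 *
 *    x86_init_func( &func );
 *    if (tgsi_emit_sse2( tokens, &func, imms, TRUE )) {
 *       codegen_function run = (codegen_function) x86_get_func( &func );
 *       run( machine, constants, imms, coefs );
 *    }
 *    x86_release_func( &func );
 */
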
/**
 * Translate a TGSI vertex/fragment shader to SSE2 code.
 * Slightly different things are done for vertex vs. fragment shaders.
 *
 * \param tokens  the TGSI input shader
 * \param func  the output SSE code/function
 * \param immediates  buffer to place immediates, later passed to SSE func
 * \param do_swizzles  for vertex shaders, convert inputs/outputs between
 *                     AoS and SoA layouts around the shader body
 * \return 1 for success, 0 if translation failed
 */
unsigned
tgsi_emit_sse2(
   const struct tgsi_token *tokens,
   struct x86_function *func,
   float (*immediates)[4],
   boolean do_swizzles )
{
   struct tgsi_parse_context parse;
   unsigned ok = 1;
   uint num_immediates = 0;

   util_init_math();

   func->csr = func->store;

   tgsi_parse_init( &parse, tokens );

   /* Can't just use EDI, EBX without save/restoring them:
    */
   x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
   x86_push( func, x86_make_reg( file_REG32, reg_DI ) );

   /*
    * Different function args for vertex/fragment shaders:
    */
   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
      if (do_swizzles)
         aos_to_soa( func,
                     4,   /* aos_input */
                     1,   /* machine */
                     5,   /* num_inputs */
                     6 ); /* input_stride */
   }

   x86_mov(
      func,
      get_machine_base(),
      x86_fn_arg( func, 1 ) );
   x86_mov(
      func,
      get_const_base(),
      x86_fn_arg( func, 2 ) );
   x86_mov(
      func,
      get_immediate_base(),
      x86_fn_arg( func, 3 ) );

   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
      x86_mov(
         func,
         get_coef_base(),
         x86_fn_arg( func, 4 ) );
   }

   x86_mov(
      func,
      get_sampler_base(),
      x86_make_disp( get_machine_base(),
                     Offset( struct tgsi_exec_machine, Samplers ) ) );

   while( !tgsi_parse_end_of_tokens( &parse ) && ok ) {
      tgsi_parse_token( &parse );

      switch( parse.FullToken.Token.Type ) {
      case TGSI_TOKEN_TYPE_DECLARATION:
         if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
            emit_declaration(
               func,
               &parse.FullToken.FullDeclaration );
         }
         break;

      case TGSI_TOKEN_TYPE_INSTRUCTION:
         ok = emit_instruction(
            func,
            &parse.FullToken.FullInstruction );

         if (!ok) {
            uint opcode = parse.FullToken.FullInstruction.Instruction.Opcode;
            debug_printf("failed to translate tgsi opcode %d (%s) to SSE (%s)\n",
                         opcode,
                         tgsi_get_opcode_name(opcode),
                         parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX ?
                         "vertex shader" : "fragment shader");
         }

         if (tgsi_check_soa_dependencies(&parse.FullToken.FullInstruction)) {
            uint opcode = parse.FullToken.FullInstruction.Instruction.Opcode;

            /* XXX: we only handle src/dst aliasing in a few opcodes
             * currently.  Need to use an additional temporary to hold
             * the result in the cases where the code is too opaque to
             * fix.
             */
            if (opcode != TGSI_OPCODE_MOV) {
               debug_printf("Warning: src/dst aliasing in instruction"
                            " is not handled:\n");
               tgsi_dump_instruction(&parse.FullToken.FullInstruction, 1);
            }
         }
         break;

      case TGSI_TOKEN_TYPE_IMMEDIATE:
         /* simply copy the immediate values into the next immediates[] slot */
         {
            const uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
            uint i;
            assert(size <= 4);
            assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES);
            for( i = 0; i < size; i++ ) {
               immediates[num_immediates][i] =
                  parse.FullToken.FullImmediate.u[i].Float;
            }
#if 0
            debug_printf("SSE FS immediate[%d] = %f %f %f %f\n",
                         num_immediates,
                         immediates[num_immediates][0],
                         immediates[num_immediates][1],
                         immediates[num_immediates][2],
                         immediates[num_immediates][3]);
#endif
            num_immediates++;
         }
         break;

      case TGSI_TOKEN_TYPE_PROPERTY:
         /* we just ignore them for now */
         break;

      default:
         ok = 0;
         assert( 0 );
      }
   }

   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
      if (do_swizzles)
         soa_to_aos( func,
                     7,   /* aos_output */
                     1,   /* machine */
                     8,   /* num_outputs */
                     9 ); /* output_stride */
   }

   /* Can't just use EBX, EDI without save/restoring them:
    */
   x86_pop( func, x86_make_reg( file_REG32, reg_DI ) );
   x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );

   emit_ret( func );

   tgsi_parse_free( &parse );

   return ok;
}

#endif /* PIPE_ARCH_X86 */