Merge branch 'glsl2-head' into glsl2
[mesa.git] / src / gallium / auxiliary / tgsi / tgsi_sse2.c
1 /**************************************************************************
2 *
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 * Copyright 2009-2010 VMware, Inc. All rights Reserved.
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the
9 * "Software"), to deal in the Software without restriction, including
10 * without limitation the rights to use, copy, modify, merge, publish,
11 * distribute, sub license, and/or sell copies of the Software, and to
12 * permit persons to whom the Software is furnished to do so, subject to
13 * the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the
16 * next paragraph) shall be included in all copies or substantial portions
17 * of the Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
22 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
23 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 *
27 **************************************************************************/
28
29 #include "pipe/p_config.h"
30
31 #if defined(PIPE_ARCH_X86)
32
33 #include "util/u_debug.h"
34 #include "pipe/p_shader_tokens.h"
35 #include "util/u_math.h"
36 #include "util/u_memory.h"
37 #if defined(PIPE_ARCH_SSE)
38 #include "util/u_sse.h"
39 #endif
40 #include "tgsi/tgsi_info.h"
41 #include "tgsi/tgsi_parse.h"
42 #include "tgsi/tgsi_util.h"
43 #include "tgsi/tgsi_dump.h"
44 #include "tgsi/tgsi_exec.h"
45 #include "tgsi/tgsi_sse2.h"
46
47 #include "rtasm/rtasm_x86sse.h"
48
49 /* for 1/sqrt()
50 *
51 * This costs about 100fps (close to 10%) in gears:
52 */
53 #define HIGH_PRECISION 1
54
55 #define FAST_MATH 1
56
57
/** Iterate CHAN over the four vector components (x, y, z, w). */
#define FOR_EACH_CHANNEL( CHAN )\
   for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)

/** Test whether dst register 0's writemask enables component CHAN. */
#define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
   ((INST).Dst[0].Register.WriteMask & (1 << (CHAN)))

#define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
   if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))

/** Iterate CHAN over only the components enabled in dst 0's writemask. */
#define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
   FOR_EACH_CHANNEL( CHAN )\
   IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )

/* Component indices within a 4-vector. */
#define CHAN_X 0
#define CHAN_Y 1
#define CHAN_Z 2
#define CHAN_W 3

/* Shorthands for well-known tgsi_exec temporary registers. */
#define TEMP_ONE_I   TGSI_EXEC_TEMP_ONE_I
#define TEMP_ONE_C   TGSI_EXEC_TEMP_ONE_C

#define TEMP_R0   TGSI_EXEC_TEMP_R0
#define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
#define TEMP_EXEC_MASK_I TGSI_EXEC_MASK_I
#define TEMP_EXEC_MASK_C TGSI_EXEC_MASK_C
83
84
85 /**
86 * X86 utility functions.
87 */
88
89 static struct x86_reg
90 make_xmm(
91 unsigned xmm )
92 {
93 return x86_make_reg(
94 file_XMM,
95 (enum x86_reg_name) xmm );
96 }
97
98 /**
99 * X86 register mapping helpers.
100 */
101
102 static struct x86_reg
103 get_const_base( void )
104 {
105 return x86_make_reg(
106 file_REG32,
107 reg_AX );
108 }
109
110 static struct x86_reg
111 get_machine_base( void )
112 {
113 return x86_make_reg(
114 file_REG32,
115 reg_CX );
116 }
117
118 static struct x86_reg
119 get_input_base( void )
120 {
121 return x86_make_disp(
122 get_machine_base(),
123 Offset(struct tgsi_exec_machine, Inputs) );
124 }
125
126 static struct x86_reg
127 get_output_base( void )
128 {
129 return x86_make_disp(
130 get_machine_base(),
131 Offset(struct tgsi_exec_machine, Outputs) );
132 }
133
134 static struct x86_reg
135 get_temp_base( void )
136 {
137 return x86_make_disp(
138 get_machine_base(),
139 Offset(struct tgsi_exec_machine, Temps) );
140 }
141
142 static struct x86_reg
143 get_coef_base( void )
144 {
145 return x86_make_reg(
146 file_REG32,
147 reg_BX );
148 }
149
150 static struct x86_reg
151 get_sampler_base( void )
152 {
153 return x86_make_reg(
154 file_REG32,
155 reg_DI );
156 }
157
158 static struct x86_reg
159 get_immediate_base( void )
160 {
161 return x86_make_reg(
162 file_REG32,
163 reg_DX );
164 }
165
166
167 /**
168 * Data access helpers.
169 */
170
171
172 static struct x86_reg
173 get_immediate(
174 unsigned vec,
175 unsigned chan )
176 {
177 return x86_make_disp(
178 get_immediate_base(),
179 (vec * 4 + chan) * 4 );
180 }
181
182 static struct x86_reg
183 get_const(
184 unsigned vec,
185 unsigned chan )
186 {
187 return x86_make_disp(
188 get_const_base(),
189 (vec * 4 + chan) * 4 );
190 }
191
192 static struct x86_reg
193 get_sampler_ptr(
194 unsigned unit )
195 {
196 return x86_make_disp(
197 get_sampler_base(),
198 unit * sizeof( struct tgsi_sampler * ) );
199 }
200
201 static struct x86_reg
202 get_input(
203 unsigned vec,
204 unsigned chan )
205 {
206 return x86_make_disp(
207 get_input_base(),
208 (vec * 4 + chan) * 16 );
209 }
210
211 static struct x86_reg
212 get_output(
213 unsigned vec,
214 unsigned chan )
215 {
216 return x86_make_disp(
217 get_output_base(),
218 (vec * 4 + chan) * 16 );
219 }
220
221 static struct x86_reg
222 get_temp(
223 unsigned vec,
224 unsigned chan )
225 {
226 return x86_make_disp(
227 get_temp_base(),
228 (vec * 4 + chan) * 16 );
229 }
230
231 static struct x86_reg
232 get_coef(
233 unsigned vec,
234 unsigned chan,
235 unsigned member )
236 {
237 return x86_make_disp(
238 get_coef_base(),
239 ((vec * 3 + member) * 4 + chan) * 4 );
240 }
241
242
/** Emit a function-return instruction. */
static void
emit_ret( struct x86_function *func )
{
   x86_ret( func );
}
249
250
251 /**
252 * Data fetch helpers.
253 */
254
/**
 * Copy a shader constant to an xmm register.
 * \param xmm            the destination xmm register
 * \param vec            the src const buffer index (or offset, if indirect)
 * \param chan           src channel to fetch (X, Y, Z or W)
 * \param indirect       non-zero for CONST[ADDR+vec] style addressing
 * \param indirectFile   file of the indirect index (must be TGSI_FILE_ADDRESS)
 * \param indirectIndex  register index of the indirect index (must be 0)
 */
static void
emit_const(
   struct x86_function *func,
   uint xmm,
   int vec,
   uint chan,
   uint indirect,
   uint indirectFile,
   int indirectIndex )
{
   if (indirect) {
      /* 'vec' is the offset from the address register's value.
       * We're loading CONST[ADDR+vec] into an xmm register.
       */
      struct x86_reg r0 = get_immediate_base();
      struct x86_reg r1 = get_coef_base();
      uint i;

      assert( indirectFile == TGSI_FILE_ADDRESS );
      assert( indirectIndex == 0 );
      assert( r0.mod == mod_REG );
      assert( r1.mod == mod_REG );

      /* Preserve the two GP registers we borrow as scratch. */
      x86_push( func, r0 );
      x86_push( func, r1 );

      /*
       * Loop over the four pixels or vertices in the quad.
       * Get the value of the address (offset) register for pixel/vertex[i],
       * add it to the src offset and index into the constant buffer.
       * Note that we're working on SOA data.
       * If any of the pixel/vertex execution channels are unused their
       * values will be garbage.  It's very important that we don't use
       * those garbage values as indexes into the constant buffer since
       * that'll cause segfaults.
       * The solution is to bitwise-AND the offset with the execution mask
       * register whose values are either 0 or ~0.
       * The caller must setup the execution mask register to indicate
       * which channels are valid/alive before running the shader.
       * The execution mask will also figure into loops and conditionals
       * someday.
       */
      for (i = 0; i < QUAD_SIZE; i++) {
         /* r1 = address register[i] */
         x86_mov( func, r1, x86_make_disp( get_temp( TEMP_ADDR, CHAN_X ), i * 4 ) );
         /* r0 = execution mask[i] */
         x86_mov( func, r0, x86_make_disp( get_temp( TEMP_EXEC_MASK_I, TEMP_EXEC_MASK_C ), i * 4 ) );
         /* r1 = r1 & r0  (zero the index for dead channels) */
         x86_and( func, r1, r0 );
         /* r0 = address of CONST['vec'], the base offset */
         x86_lea( func, r0, get_const( vec, chan ) );

         /* Quick hack to multiply r1 by 16 -- need to add SHL to rtasm.
          */
         x86_add( func, r1, r1 );
         x86_add( func, r1, r1 );
         x86_add( func, r1, r1 );
         x86_add( func, r1, r1 );

         /* r0 = r0 + r1, then gather the scalar into the staging temp */
         x86_add( func, r0, r1 );
         x86_mov( func, r1, x86_deref( r0 ) );
         x86_mov( func, x86_make_disp( get_temp( TEMP_R0, CHAN_X ), i * 4 ), r1 );
      }

      x86_pop( func, r1 );
      x86_pop( func, r0 );

      /* Load all four gathered values from the staging temp at once. */
      sse_movaps(
         func,
         make_xmm( xmm ),
         get_temp( TEMP_R0, CHAN_X ) );
   }
   else {
      /* 'vec' is the index into the src register file, such as TEMP[vec] */
      assert( vec >= 0 );

      /* Load the scalar constant and replicate it into all four lanes. */
      sse_movss(
         func,
         make_xmm( xmm ),
         get_const( vec, chan ) );
      sse_shufps(
         func,
         make_xmm( xmm ),
         make_xmm( xmm ),
         SHUF( 0, 0, 0, 0 ) );
   }
}
348
349 static void
350 emit_immediate(
351 struct x86_function *func,
352 unsigned xmm,
353 unsigned vec,
354 unsigned chan )
355 {
356 sse_movss(
357 func,
358 make_xmm( xmm ),
359 get_immediate( vec, chan ) );
360 sse_shufps(
361 func,
362 make_xmm( xmm ),
363 make_xmm( xmm ),
364 SHUF( 0, 0, 0, 0 ) );
365 }
366
367
368 /**
369 * Copy a shader input to xmm register
370 * \param xmm the destination xmm register
371 * \param vec the src input attrib
372 * \param chan src channel to fetch (X, Y, Z or W)
373 */
/**
 * Copy a shader input to an xmm register.
 * \param xmm   the destination xmm register
 * \param vec   the src input attrib
 * \param chan  src channel to fetch (X, Y, Z or W)
 */
static void
emit_inputf(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   /* Unaligned load -- the input array is not assumed 16-byte aligned. */
   sse_movups( func, make_xmm( xmm ), get_input( vec, chan ) );
}
386
387 /**
388 * Store an xmm register to a shader output
389 * \param xmm the source xmm register
390 * \param vec the dest output attrib
391 * \param chan src dest channel to store (X, Y, Z or W)
392 */
/**
 * Store an xmm register to a shader output.
 * \param xmm   the source xmm register
 * \param vec   the dest output attrib
 * \param chan  dest channel to store (X, Y, Z or W)
 */
static void
emit_output(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   /* Unaligned store -- the output array is not assumed 16-byte aligned. */
   sse_movups( func, get_output( vec, chan ), make_xmm( xmm ) );
}
405
406 /**
407 * Copy a shader temporary to xmm register
408 * \param xmm the destination xmm register
409 * \param vec the src temp register
410 * \param chan src channel to fetch (X, Y, Z or W)
411 */
/**
 * Copy a shader temporary to an xmm register.
 * \param xmm   the destination xmm register
 * \param vec   the src temp register
 * \param chan  src channel to fetch (X, Y, Z or W)
 */
static void
emit_tempf(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   /* Aligned load -- the temps live inside the machine struct. */
   sse_movaps( func, make_xmm( xmm ), get_temp( vec, chan ) );
}
424
425 /**
426 * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
427 * \param xmm the destination xmm register
428 * \param vec the src input/attribute coefficient index
429 * \param chan src channel to fetch (X, Y, Z or W)
430 * \param member 0=a0, 1=dadx, 2=dady
431 */
432 static void
433 emit_coef(
434 struct x86_function *func,
435 unsigned xmm,
436 unsigned vec,
437 unsigned chan,
438 unsigned member )
439 {
440 sse_movss(
441 func,
442 make_xmm( xmm ),
443 get_coef( vec, chan, member ) );
444 sse_shufps(
445 func,
446 make_xmm( xmm ),
447 make_xmm( xmm ),
448 SHUF( 0, 0, 0, 0 ) );
449 }
450
451 /**
452 * Data store helpers.
453 */
454
455 static void
456 emit_inputs(
457 struct x86_function *func,
458 unsigned xmm,
459 unsigned vec,
460 unsigned chan )
461 {
462 sse_movups(
463 func,
464 get_input( vec, chan ),
465 make_xmm( xmm ) );
466 }
467
468 static void
469 emit_temps(
470 struct x86_function *func,
471 unsigned xmm,
472 unsigned vec,
473 unsigned chan )
474 {
475 sse_movaps(
476 func,
477 get_temp( vec, chan ),
478 make_xmm( xmm ) );
479 }
480
481 static void
482 emit_addrs(
483 struct x86_function *func,
484 unsigned xmm,
485 unsigned vec,
486 unsigned chan )
487 {
488 assert( vec == 0 );
489
490 emit_temps(
491 func,
492 xmm,
493 vec + TGSI_EXEC_TEMP_ADDR,
494 chan );
495 }
496
497 /**
498 * Coefficent fetch helpers.
499 */
500
/** Fetch the a0 (constant term) coefficient. */
static void
emit_coef_a0(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   emit_coef( func, xmm, vec, chan, 0 );
}

/** Fetch the dadx (x-derivative) coefficient. */
static void
emit_coef_dadx(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   emit_coef( func, xmm, vec, chan, 1 );
}

/** Fetch the dady (y-derivative) coefficient. */
static void
emit_coef_dady(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   emit_coef( func, xmm, vec, chan, 2 );
}
545
546 /**
547 * Function call helpers.
548 */
549
/**
 * Emit code that calls a C function, saving and restoring the GP
 * registers EAX/ECX/EDX and the xmm registers named in xmm_save_mask.
 *
 * NOTE: In gcc, if the destination uses the SSE intrinsics, then it must be
 * defined with __attribute__((force_align_arg_pointer)), as we do not guarantee
 * that the stack pointer is 16 byte aligned, as expected.
 *
 * \param xmm_save_mask  bitmask of xmm registers (0..7) to preserve
 * \param arg            operands whose addresses are pushed as arguments
 * \param nr_args        number of entries in \a arg
 * \param code           address of the cdecl function to call
 */
static void
emit_func_call(
   struct x86_function *func,
   unsigned xmm_save_mask,
   const struct x86_reg *arg,
   unsigned nr_args,
   void (PIPE_CDECL *code)() )
{
   struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
   unsigned i, n;

   /* Save the GP registers holding our base pointers. */
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_AX) );
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_CX) );
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_DX) );

   /* Store XMM regs to the stack
    */
   /* First count how many registers need saving... */
   for(i = 0, n = 0; i < 8; ++i)
      if(xmm_save_mask & (1 << i))
         ++n;

   /* ...then reserve 16 bytes of stack for each one. */
   x86_sub_imm(
      func,
      x86_make_reg( file_REG32, reg_SP ),
      n*16);

   for(i = 0, n = 0; i < 8; ++i)
      if(xmm_save_mask & (1 << i)) {
         sse_movups(
            func,
            x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ),
            make_xmm( i ) );
         ++n;
      }

   for (i = 0; i < nr_args; i++) {
      /* Load the address of the buffer we use for passing arguments and
       * receiving results:
       */
      x86_lea(
         func,
         ecx,
         arg[i] );

      /* Push actual function arguments (currently just the pointer to
       * the buffer above), and call the function:
       */
      x86_push( func, ecx );
   }

   x86_mov_reg_imm( func, ecx, (unsigned long) code );
   x86_call( func, ecx );

   /* Pop the arguments (or just add an immediate to esp)
    */
   for (i = 0; i < nr_args; i++) {
      x86_pop(func, ecx );
   }

   /* Pop the saved XMM regs:
    */
   for(i = 0, n = 0; i < 8; ++i)
      if(xmm_save_mask & (1 << i)) {
         sse_movups(
            func,
            make_xmm( i ),
            x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ) );
         ++n;
      }

   /* Release the stack space reserved for the saved xmm registers. */
   x86_add_imm(
      func,
      x86_make_reg( file_REG32, reg_SP ),
      n*16);

   /* Restore GP registers in a reverse order.
    */
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_DX) );
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_CX) );
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_AX) );
}
648
649 static void
650 emit_func_call_dst_src1(
651 struct x86_function *func,
652 unsigned xmm_save,
653 unsigned xmm_dst,
654 unsigned xmm_src0,
655 void (PIPE_CDECL *code)() )
656 {
657 struct x86_reg store = get_temp( TEMP_R0, 0 );
658 unsigned xmm_mask = ((1 << xmm_save) - 1) & ~(1 << xmm_dst);
659
660 /* Store our input parameters (in xmm regs) to the buffer we use
661 * for passing arguments. We will pass a pointer to this buffer as
662 * the actual function argument.
663 */
664 sse_movaps(
665 func,
666 store,
667 make_xmm( xmm_src0 ) );
668
669 emit_func_call( func,
670 xmm_mask,
671 &store,
672 1,
673 code );
674
675 sse_movaps(
676 func,
677 make_xmm( xmm_dst ),
678 store );
679 }
680
681
682 static void
683 emit_func_call_dst_src2(
684 struct x86_function *func,
685 unsigned xmm_save,
686 unsigned xmm_dst,
687 unsigned xmm_src0,
688 unsigned xmm_src1,
689 void (PIPE_CDECL *code)() )
690 {
691 struct x86_reg store = get_temp( TEMP_R0, 0 );
692 unsigned xmm_mask = ((1 << xmm_save) - 1) & ~(1 << xmm_dst);
693
694 /* Store two inputs to parameter buffer.
695 */
696 sse_movaps(
697 func,
698 store,
699 make_xmm( xmm_src0 ) );
700
701 sse_movaps(
702 func,
703 x86_make_disp( store, 4 * sizeof(float) ),
704 make_xmm( xmm_src1 ) );
705
706
707 /* Emit the call
708 */
709 emit_func_call( func,
710 xmm_mask,
711 &store,
712 1,
713 code );
714
715 /* Retrieve the results:
716 */
717 sse_movaps(
718 func,
719 make_xmm( xmm_dst ),
720 store );
721 }
722
723
724
725
726
727 #if defined(PIPE_ARCH_SSE)
728
729 /*
730 * Fast SSE2 implementation of special math functions.
731 */
732
733 #define POLY0(x, c0) _mm_set1_ps(c0)
734 #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
735 #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
736 #define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
737 #define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
738 #define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
739
740 #define EXP_POLY_DEGREE 3
741 #define LOG_POLY_DEGREE 5
742
/**
 * Compute 2^x for four packed floats via a polynomial approximation.
 * See http://www.devmaster.net/forums/showthread.php?p=43580
 */
static INLINE __m128
exp2f4(__m128 x)
{
   __m128i ipart;
   __m128 fpart, expipart, expfpart;

   /* Clamp the input so the exponent construction below stays valid. */
   x = _mm_min_ps(x, _mm_set1_ps( 129.00000f));
   x = _mm_max_ps(x, _mm_set1_ps(-126.99999f));

   /* ipart = int(x - 0.5) */
   ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f)));

   /* fpart = x - ipart */
   fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart));

   /* expipart = (float) (1 << ipart) -- built by writing (ipart + 127)
    * directly into the IEEE-754 biased exponent field */
   expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23));

   /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
#if EXP_POLY_DEGREE == 5
   expfpart = POLY5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f);
#elif EXP_POLY_DEGREE == 4
   expfpart = POLY4(fpart, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 5.2011464e-2f, 1.3534167e-2f);
#elif EXP_POLY_DEGREE == 3
   expfpart = POLY3(fpart, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 7.8024521e-2f);
#elif EXP_POLY_DEGREE == 2
   expfpart = POLY2(fpart, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f);
#else
#error
#endif

   /* 2^x = 2^ipart * 2^fpart */
   return _mm_mul_ps(expipart, expfpart);
}
779
780
/**
 * Compute log2(x) for four packed floats via a polynomial approximation.
 * See http://www.devmaster.net/forums/showthread.php?p=43580
 */
static INLINE __m128
log2f4(__m128 x)
{
   /* Bit masks for the IEEE-754 exponent and mantissa fields. */
   __m128i expmask = _mm_set1_epi32(0x7f800000);
   __m128i mantmask = _mm_set1_epi32(0x007fffff);
   __m128 one = _mm_set1_ps(1.0f);

   __m128i i = _mm_castps_si128(x);

   /* exp = (float) exponent(x) */
   __m128 exp = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, expmask), 23), _mm_set1_epi32(127)));

   /* mant = (float) mantissa(x), normalized into [1, 2[ by OR-ing in 1.0 */
   __m128 mant = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, mantmask)), one);

   __m128 logmant;

   /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
    * These coefficients can be generate with
    * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
    */
#if LOG_POLY_DEGREE == 6
   logmant = POLY5(mant, 3.11578814719469302614f, -3.32419399085241980044f, 2.59883907202499966007f, -1.23152682416275988241f, 0.318212422185251071475f, -0.0344359067839062357313f);
#elif LOG_POLY_DEGREE == 5
   logmant = POLY4(mant, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
   logmant = POLY3(mant, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
   logmant = POLY2(mant, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
#else
#error
#endif

   /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
   logmant = _mm_mul_ps(logmant, _mm_sub_ps(mant, one));

   /* log2(x) = log2(mant) + exp */
   return _mm_add_ps(logmant, exp);
}
822
823
824 static INLINE __m128
825 powf4(__m128 x, __m128 y)
826 {
827 return exp2f4(_mm_mul_ps(log2f4(x), y));
828 }
829
830 #endif /* PIPE_ARCH_SSE */
831
832
833
834 /**
835 * Low-level instruction translators.
836 */
837
838 static void
839 emit_abs(
840 struct x86_function *func,
841 unsigned xmm )
842 {
843 sse_andps(
844 func,
845 make_xmm( xmm ),
846 get_temp(
847 TGSI_EXEC_TEMP_7FFFFFFF_I,
848 TGSI_EXEC_TEMP_7FFFFFFF_C ) );
849 }
850
/** dst = dst + src (packed float add). */
static void
emit_add(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   sse_addps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
862
863 static void PIPE_CDECL
864 cos4f(
865 float *store )
866 {
867 store[0] = cosf( store[0] );
868 store[1] = cosf( store[1] );
869 store[2] = cosf( store[2] );
870 store[3] = cosf( store[3] );
871 }
872
873 static void
874 emit_cos(
875 struct x86_function *func,
876 unsigned xmm_save,
877 unsigned xmm_dst )
878 {
879 emit_func_call_dst_src1(
880 func,
881 xmm_save,
882 xmm_dst,
883 xmm_dst,
884 cos4f );
885 }
886
/**
 * Run-time helper: 2^x of four packed floats, computed in place.
 * Uses the SSE polynomial approximation when compiled with SSE support,
 * otherwise a scalar fast-math fallback.
 */
static void PIPE_CDECL
#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
__attribute__((force_align_arg_pointer))
#endif
ex24f(
   float *store )
{
#if defined(PIPE_ARCH_SSE)
   _mm_store_ps(&store[0], exp2f4( _mm_load_ps(&store[0]) ));
#else
   store[0] = util_fast_exp2( store[0] );
   store[1] = util_fast_exp2( store[1] );
   store[2] = util_fast_exp2( store[2] );
   store[3] = util_fast_exp2( store[3] );
#endif
}
903
904 static void
905 emit_ex2(
906 struct x86_function *func,
907 unsigned xmm_save,
908 unsigned xmm_dst )
909 {
910 emit_func_call_dst_src1(
911 func,
912 xmm_save,
913 xmm_dst,
914 xmm_dst,
915 ex24f );
916 }
917
/** Convert four floats to ints with truncation, in place. */
static void
emit_f2it(
   struct x86_function *func,
   unsigned xmm )
{
   sse2_cvttps2dq( func, make_xmm( xmm ), make_xmm( xmm ) );
}
928
/** Convert four ints to floats, in place. */
static void
emit_i2f(
   struct x86_function *func,
   unsigned xmm )
{
   sse2_cvtdq2ps( func, make_xmm( xmm ), make_xmm( xmm ) );
}
939
940 static void PIPE_CDECL
941 flr4f(
942 float *store )
943 {
944 store[0] = floorf( store[0] );
945 store[1] = floorf( store[1] );
946 store[2] = floorf( store[2] );
947 store[3] = floorf( store[3] );
948 }
949
950 static void
951 emit_flr(
952 struct x86_function *func,
953 unsigned xmm_save,
954 unsigned xmm_dst )
955 {
956 emit_func_call_dst_src1(
957 func,
958 xmm_save,
959 xmm_dst,
960 xmm_dst,
961 flr4f );
962 }
963
964 static void PIPE_CDECL
965 frc4f(
966 float *store )
967 {
968 store[0] -= floorf( store[0] );
969 store[1] -= floorf( store[1] );
970 store[2] -= floorf( store[2] );
971 store[3] -= floorf( store[3] );
972 }
973
974 static void
975 emit_frc(
976 struct x86_function *func,
977 unsigned xmm_save,
978 unsigned xmm_dst )
979 {
980 emit_func_call_dst_src1(
981 func,
982 xmm_save,
983 xmm_dst,
984 xmm_dst,
985 frc4f );
986 }
987
/**
 * Run-time helper: log2 of four packed floats, computed in place.
 * Uses the SSE polynomial approximation when compiled with SSE support,
 * otherwise a scalar fast-math fallback.
 */
static void PIPE_CDECL
#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
__attribute__((force_align_arg_pointer))
#endif
lg24f(
   float *store )
{
#if defined(PIPE_ARCH_SSE)
   _mm_store_ps(&store[0], log2f4( _mm_load_ps(&store[0]) ));
#else
   store[0] = util_fast_log2( store[0] );
   store[1] = util_fast_log2( store[1] );
   store[2] = util_fast_log2( store[2] );
   store[3] = util_fast_log2( store[3] );
#endif
}
1004
1005 static void
1006 emit_lg2(
1007 struct x86_function *func,
1008 unsigned xmm_save,
1009 unsigned xmm_dst )
1010 {
1011 emit_func_call_dst_src1(
1012 func,
1013 xmm_save,
1014 xmm_dst,
1015 xmm_dst,
1016 lg24f );
1017 }
1018
/** dst = src (unaligned packed move between xmm registers). */
static void
emit_MOV(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   sse_movups( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
1030
/** dst = dst * src (packed float multiply). */
static void
emit_mul(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   sse_mulps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
1041
1042 static void
1043 emit_neg(
1044 struct x86_function *func,
1045 unsigned xmm )
1046 {
1047 sse_xorps(
1048 func,
1049 make_xmm( xmm ),
1050 get_temp(
1051 TGSI_EXEC_TEMP_80000000_I,
1052 TGSI_EXEC_TEMP_80000000_C ) );
1053 }
1054
/**
 * Run-time helper: store[i] = store[i] ^ store[i+4] for four packed
 * floats; the result overwrites the first four elements.
 * Uses the SSE approximation when compiled with SSE support, otherwise
 * a scalar fast-math fallback.
 */
static void PIPE_CDECL
#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
__attribute__((force_align_arg_pointer))
#endif
pow4f(
   float *store )
{
#if defined(PIPE_ARCH_SSE)
   _mm_store_ps(&store[0], powf4( _mm_load_ps(&store[0]), _mm_load_ps(&store[4]) ));
#else
   store[0] = util_fast_pow( store[0], store[4] );
   store[1] = util_fast_pow( store[1], store[5] );
   store[2] = util_fast_pow( store[2], store[6] );
   store[3] = util_fast_pow( store[3], store[7] );
#endif
}
1071
1072 static void
1073 emit_pow(
1074 struct x86_function *func,
1075 unsigned xmm_save,
1076 unsigned xmm_dst,
1077 unsigned xmm_src0,
1078 unsigned xmm_src1 )
1079 {
1080 emit_func_call_dst_src2(
1081 func,
1082 xmm_save,
1083 xmm_dst,
1084 xmm_src0,
1085 xmm_src1,
1086 pow4f );
1087 }
1088
/** dst = approximate 1/src (packed float reciprocal). */
static void
emit_rcp(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   /* On Intel CPUs at least, this is only accurate to 12 bits -- not
    * good enough.  Need to either emit a proper divide or use the
    * iterative technique described below in emit_rsqrt().
    */
   sse2_rcpps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
1104
1105 static void PIPE_CDECL
1106 rnd4f(
1107 float *store )
1108 {
1109 store[0] = floorf( store[0] + 0.5f );
1110 store[1] = floorf( store[1] + 0.5f );
1111 store[2] = floorf( store[2] + 0.5f );
1112 store[3] = floorf( store[3] + 0.5f );
1113 }
1114
1115 static void
1116 emit_rnd(
1117 struct x86_function *func,
1118 unsigned xmm_save,
1119 unsigned xmm_dst )
1120 {
1121 emit_func_call_dst_src1(
1122 func,
1123 xmm_save,
1124 xmm_dst,
1125 xmm_dst,
1126 rnd4f );
1127 }
1128
/**
 * dst = 1/sqrt(src), packed floats.
 * In the HIGH_PRECISION path this overwrites xmm_src and clobbers
 * xmm2 and xmm3 as scratch registers.
 */
static void
emit_rsqrt(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
#if HIGH_PRECISION
   /* Although rsqrtps() and rcpps() are low precision on some/all SSE
    * implementations, it is possible to improve its precision at
    * fairly low cost, using a newton/raphson step, as below:
    *
    * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
    * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
    *
    * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
    */
   {
      struct x86_reg dst = make_xmm( xmm_dst );
      struct x86_reg src = make_xmm( xmm_src );
      struct x86_reg tmp0 = make_xmm( 2 );
      struct x86_reg tmp1 = make_xmm( 3 );

      /* The fixed scratch registers must not alias either operand. */
      assert( xmm_dst != xmm_src );
      assert( xmm_dst != 2 && xmm_dst != 3 );
      assert( xmm_src != 2 && xmm_src != 3 );

      sse_movaps( func, dst, get_temp( TGSI_EXEC_TEMP_HALF_I, TGSI_EXEC_TEMP_HALF_C ) );     /* dst = 0.5 */
      sse_movaps( func, tmp0, get_temp( TGSI_EXEC_TEMP_THREE_I, TGSI_EXEC_TEMP_THREE_C ) );  /* tmp0 = 3.0 */
      sse_rsqrtps( func, tmp1, src );     /* tmp1 = rsqrtps(a), the low-precision estimate */
      sse_mulps( func, src, tmp1 );       /* src = a * tmp1 */
      sse_mulps( func, dst, tmp1 );       /* dst = 0.5 * tmp1 */
      sse_mulps( func, src, tmp1 );       /* src = a * tmp1 * tmp1 */
      sse_subps( func, tmp0, src );       /* tmp0 = 3.0 - a * tmp1^2 */
      sse_mulps( func, dst, tmp0 );       /* dst = 0.5 * tmp1 * (3.0 - a * tmp1^2) */
   }
#else
   /* On Intel CPUs at least, this is only accurate to 12 bits -- not
    * good enough.
    */
   sse_rsqrtps(
      func,
      make_xmm( xmm_dst ),
      make_xmm( xmm_src ) );
#endif
}
1174
1175 static void
1176 emit_setsign(
1177 struct x86_function *func,
1178 unsigned xmm )
1179 {
1180 sse_orps(
1181 func,
1182 make_xmm( xmm ),
1183 get_temp(
1184 TGSI_EXEC_TEMP_80000000_I,
1185 TGSI_EXEC_TEMP_80000000_C ) );
1186 }
1187
1188 static void PIPE_CDECL
1189 sgn4f(
1190 float *store )
1191 {
1192 store[0] = store[0] < 0.0f ? -1.0f : store[0] > 0.0f ? 1.0f : 0.0f;
1193 store[1] = store[1] < 0.0f ? -1.0f : store[1] > 0.0f ? 1.0f : 0.0f;
1194 store[2] = store[2] < 0.0f ? -1.0f : store[2] > 0.0f ? 1.0f : 0.0f;
1195 store[3] = store[3] < 0.0f ? -1.0f : store[3] > 0.0f ? 1.0f : 0.0f;
1196 }
1197
1198 static void
1199 emit_sgn(
1200 struct x86_function *func,
1201 unsigned xmm_save,
1202 unsigned xmm_dst )
1203 {
1204 emit_func_call_dst_src1(
1205 func,
1206 xmm_save,
1207 xmm_dst,
1208 xmm_dst,
1209 sgn4f );
1210 }
1211
1212 static void PIPE_CDECL
1213 sin4f(
1214 float *store )
1215 {
1216 store[0] = sinf( store[0] );
1217 store[1] = sinf( store[1] );
1218 store[2] = sinf( store[2] );
1219 store[3] = sinf( store[3] );
1220 }
1221
1222 static void
1223 emit_sin (struct x86_function *func,
1224 unsigned xmm_save,
1225 unsigned xmm_dst)
1226 {
1227 emit_func_call_dst_src1(
1228 func,
1229 xmm_save,
1230 xmm_dst,
1231 xmm_dst,
1232 sin4f );
1233 }
1234
/** dst = dst - src (packed float subtract). */
static void
emit_sub(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   sse_subps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
1246
/**
 * Register fetch: load one channel of a source operand into an xmm
 * register, applying the operand's swizzle and sign mode.
 */
static void
emit_fetch(
   struct x86_function *func,
   unsigned xmm,
   const struct tgsi_full_src_register *reg,
   const unsigned chan_index )
{
   unsigned swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );

   switch (swizzle) {
   case TGSI_SWIZZLE_X:
   case TGSI_SWIZZLE_Y:
   case TGSI_SWIZZLE_Z:
   case TGSI_SWIZZLE_W:
      /* Dispatch on the source register file. */
      switch (reg->Register.File) {
      case TGSI_FILE_CONSTANT:
         emit_const(
            func,
            xmm,
            reg->Register.Index,
            swizzle,
            reg->Register.Indirect,
            reg->Indirect.File,
            reg->Indirect.Index );
         break;

      case TGSI_FILE_IMMEDIATE:
         emit_immediate(
            func,
            xmm,
            reg->Register.Index,
            swizzle );
         break;

      case TGSI_FILE_INPUT:
      case TGSI_FILE_SYSTEM_VALUE:
         emit_inputf(
            func,
            xmm,
            reg->Register.Index,
            swizzle );
         break;

      case TGSI_FILE_TEMPORARY:
         emit_tempf(
            func,
            xmm,
            reg->Register.Index,
            swizzle );
         break;

      default:
         /* Unsupported source register file. */
         assert( 0 );
      }
      break;

   default:
      assert( 0 );
   }

   /* Apply the operand's sign mode (abs / set-sign / negate) in place. */
   switch( tgsi_util_get_full_src_register_sign_mode( reg, chan_index ) ) {
   case TGSI_UTIL_SIGN_CLEAR:
      emit_abs( func, xmm );
      break;

   case TGSI_UTIL_SIGN_SET:
      emit_setsign( func, xmm );
      break;

   case TGSI_UTIL_SIGN_TOGGLE:
      emit_neg( func, xmm );
      break;

   case TGSI_UTIL_SIGN_KEEP:
      break;
   }
}

/** Fetch one channel of source operand INDEX of INST into register XMM. */
#define FETCH( FUNC, INST, XMM, INDEX, CHAN )\
   emit_fetch( FUNC, XMM, &(INST).Src[INDEX], CHAN )
1330
/**
 * Register store: write an xmm register to one channel of the
 * destination operand, applying the instruction's saturate mode first.
 */
static void
emit_store(
   struct x86_function *func,
   unsigned xmm,
   const struct tgsi_full_dst_register *reg,
   const struct tgsi_full_instruction *inst,
   unsigned chan_index )
{
   /* Clamp the value according to the instruction's saturate mode. */
   switch( inst->Instruction.Saturate ) {
   case TGSI_SAT_NONE:
      break;

   case TGSI_SAT_ZERO_ONE:
      /* Clamp to [0,1]: max against 0.0, then min against 1.0. */
      sse_maxps(
         func,
         make_xmm( xmm ),
         get_temp(
            TGSI_EXEC_TEMP_00000000_I,
            TGSI_EXEC_TEMP_00000000_C ) );

      sse_minps(
         func,
         make_xmm( xmm ),
         get_temp(
            TGSI_EXEC_TEMP_ONE_I,
            TGSI_EXEC_TEMP_ONE_C ) );
      break;

   case TGSI_SAT_MINUS_PLUS_ONE:
      /* [-1,1] saturation is not implemented here. */
      assert( 0 );
      break;
   }


   /* Dispatch on the destination register file. */
   switch( reg->Register.File ) {
   case TGSI_FILE_OUTPUT:
      emit_output(
         func,
         xmm,
         reg->Register.Index,
         chan_index );
      break;

   case TGSI_FILE_TEMPORARY:
      emit_temps(
         func,
         xmm,
         reg->Register.Index,
         chan_index );
      break;

   case TGSI_FILE_ADDRESS:
      emit_addrs(
         func,
         xmm,
         reg->Register.Index,
         chan_index );
      break;

   default:
      /* Unsupported destination register file. */
      assert( 0 );
   }
}

/** Store register XMM to one channel of destination operand INDEX of INST. */
#define STORE( FUNC, INST, XMM, INDEX, CHAN )\
   emit_store( FUNC, XMM, &(INST).Dst[INDEX], &(INST), CHAN )
1400
1401
/**
 * Run-time helper called from generated code to sample a texture.
 * \param sampler  pointer to the sampler pointer for the texture unit
 * \param store    16-float SOA buffer: s[4], t[4], r[4], lodbias[4] on
 *                 input; overwritten with the resulting colors
 */
static void PIPE_CDECL
fetch_texel( struct tgsi_sampler **sampler,
             float *store )
{
#if 0
   /* Debug: dump the incoming texcoords for the quad. */
   uint j;

   debug_printf("%s sampler: %p (%p) store: %p\n",
                __FUNCTION__,
                sampler, *sampler,
                store );

   for (j = 0; j < 4; j++)
      debug_printf("sample %d texcoord %f %f %f lodbias %f\n",
                   j,
                   store[0+j],
                   store[4+j],
                   store[8 + j],
                   store[12 + j]);
#endif

   {
      float rgba[NUM_CHANNELS][QUAD_SIZE];
      (*sampler)->get_samples(*sampler,
                              &store[0],  /* s */
                              &store[4],  /* t */
                              &store[8],  /* r */
                              &store[12], /* lodbias */
                              tgsi_sampler_lod_bias,
                              rgba);      /* results */

      /* Overwrite the input buffer with the sampled colors. */
      memcpy( store, rgba, 16 * sizeof(float));
   }

#if 0
   /* Debug: dump the resulting colors. */
   for (j = 0; j < 4; j++)
      debug_printf("sample %d result %f %f %f %f\n",
                   j,
                   store[0+j],
                   store[4+j],
                   store[8+j],
                   store[12+j]);
#endif
}
1446
/**
 * High-level instruction translators.
 */

/**
 * Emit code for a texture sampling instruction (TEX/TXB/TXP).
 *
 * The texcoords (and the lod bias) are gathered into the TEMP_R0
 * scratch area, fetch_texel() is invoked through emit_func_call(),
 * and the results are copied from TEMP_R0 to the destination.
 *
 * \param lodbias    if true, fetch src[0].w as an explicit lod bias (TXB)
 * \param projected  if true, divide the coords by src[0].w (TXP)
 *
 * Note: lodbias and projected are never both true (see the callers in
 * emit_instruction), so the double use of xmm3 below is safe.
 */
static void
emit_tex( struct x86_function *func,
          const struct tgsi_full_instruction *inst,
          boolean lodbias,
          boolean projected)
{
   const uint unit = inst->Src[1].Register.Index;
   struct x86_reg args[2];
   unsigned count;   /* number of texcoord components to fetch */
   unsigned i;

   assert(inst->Instruction.Texture);
   switch (inst->Texture.Texture) {
   case TGSI_TEXTURE_1D:
      count = 1;
      break;
   case TGSI_TEXTURE_2D:
   case TGSI_TEXTURE_RECT:
      count = 2;
      break;
   case TGSI_TEXTURE_SHADOW1D:
   case TGSI_TEXTURE_SHADOW2D:
   case TGSI_TEXTURE_SHADOWRECT:
   case TGSI_TEXTURE_3D:
   case TGSI_TEXTURE_CUBE:
      count = 3;
      break;
   default:
      assert(0);
      return;
   }

   if (lodbias) {
      /* xmm3 = src[0].w (the explicit bias) */
      FETCH( func, *inst, 3, 0, 3 );
   }
   else {
      /* xmm3 = 0.0 (no bias) */
      emit_tempf(
         func,
         3,
         TGSI_EXEC_TEMP_00000000_I,
         TGSI_EXEC_TEMP_00000000_C );

   }

   /* store lodbias whether enabled or not -- fetch_texel currently
    * respects it always.
    */
   sse_movaps( func,
               get_temp( TEMP_R0, 3 ),
               make_xmm( 3 ) );

   if (projected) {
      /* xmm3 = 1 / src[0].w */
      FETCH( func, *inst, 3, 0, 3 );

      emit_rcp( func, 3, 3 );
   }

   /* Fetch each texcoord component, optionally project it, and copy it
    * into the argument buffer.  xmm0..xmm2 are used here, so they must
    * not clash with xmm3 above.
    */
   for (i = 0; i < count; i++) {
      FETCH( func, *inst, i, 0, i );

      if (projected) {
         sse_mulps(
            func,
            make_xmm( i ),
            make_xmm( 3 ) );
      }

      /* Store in the argument buffer:
       */
      sse_movaps(
         func,
         get_temp( TEMP_R0, i ),
         make_xmm( i ) );
   }

   args[0] = get_temp( TEMP_R0, 0 );
   args[1] = get_sampler_ptr( unit );

   emit_func_call( func,
                   0,
                   args,
                   Elements(args),
                   fetch_texel );

   /* If all four channels are enabled, could use a pointer to
    * dst[0].x instead of TEMP_R0 for store?
    */
   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, i ) {

      sse_movaps(
         func,
         make_xmm( 0 ),
         get_temp( TEMP_R0, i ) );

      STORE( func, *inst, 0, 0, i );
   }
}
1547
1548
/**
 * Emit code for TGSI_OPCODE_KIL: kill the fragments (quad elements)
 * for which any tested source component is negative.
 *
 * Each distinct swizzled component is fetched into its own xmm reg,
 * compared against 0.0 with CMPLTPS, and the resulting sign masks are
 * OR'ed together in EAX (via MOVMSKPS), then OR'ed into the KILMASK
 * temporary.  EAX/EDX are saved and restored around the sequence.
 */
static void
emit_kil(
   struct x86_function *func,
   const struct tgsi_full_src_register *reg )
{
   unsigned uniquemask;
   unsigned unique_count = 0;
   unsigned chan_index;
   unsigned i;

   /* This mask stores component bits that were already tested. Note that
    * we test if the value is less than zero, so 1.0 and 0.0 need not to be
    * tested.
    */
   uniquemask = 0;

   FOR_EACH_CHANNEL( chan_index ) {
      unsigned swizzle;

      /* unswizzle channel */
      swizzle = tgsi_util_get_full_src_register_swizzle(
         reg,
         chan_index );

      /* check if the component has not been already tested */
      if( !(uniquemask & (1 << swizzle)) ) {
         uniquemask |= 1 << swizzle;

         /* allocate register */
         emit_fetch(
            func,
            unique_count++,
            reg,
            chan_index );
      }
   }

   /* Preserve EAX/EDX, which are used as scratch below. */
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_AX ) );
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_DX ) );

   for (i = 0 ; i < unique_count; i++ ) {
      struct x86_reg dataXMM = make_xmm(i);

      /* dataXMM = all-ones where component < 0.0, else zero */
      sse_cmpps(
         func,
         dataXMM,
         get_temp(
            TGSI_EXEC_TEMP_00000000_I,
            TGSI_EXEC_TEMP_00000000_C ),
         cc_LessThan );

      /* Accumulate the 4-bit per-quad sign mask in EAX. */
      if( i == 0 ) {
         sse_movmskps(
            func,
            x86_make_reg( file_REG32, reg_AX ),
            dataXMM );
      }
      else {
         sse_movmskps(
            func,
            x86_make_reg( file_REG32, reg_DX ),
            dataXMM );
         x86_or(
            func,
            x86_make_reg( file_REG32, reg_AX ),
            x86_make_reg( file_REG32, reg_DX ) );
      }
   }

   /* Merge the accumulated mask into the machine's kill mask. */
   x86_or(
      func,
      get_temp(
         TGSI_EXEC_TEMP_KILMASK_I,
         TGSI_EXEC_TEMP_KILMASK_C ),
      x86_make_reg( file_REG32, reg_AX ) );

   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_DX ) );
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_AX ) );
}
1636
1637
/**
 * Emit code for TGSI_OPCODE_KILP (unconditional/predicated kill).
 * Not implemented: generates no code.  The KILP case in
 * emit_instruction() returns 0 afterwards, so shaders using KILP fall
 * back to the interpreter.
 */
static void
emit_kilp(
   struct x86_function *func )
{
   /* XXX todo / fix me */
}
1644
1645
1646 static void
1647 emit_setcc(
1648 struct x86_function *func,
1649 struct tgsi_full_instruction *inst,
1650 enum sse_cc cc )
1651 {
1652 unsigned chan_index;
1653
1654 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1655 FETCH( func, *inst, 0, 0, chan_index );
1656 FETCH( func, *inst, 1, 1, chan_index );
1657 sse_cmpps(
1658 func,
1659 make_xmm( 0 ),
1660 make_xmm( 1 ),
1661 cc );
1662 sse_andps(
1663 func,
1664 make_xmm( 0 ),
1665 get_temp(
1666 TEMP_ONE_I,
1667 TEMP_ONE_C ) );
1668 STORE( func, *inst, 0, 0, chan_index );
1669 }
1670 }
1671
1672 static void
1673 emit_cmp(
1674 struct x86_function *func,
1675 struct tgsi_full_instruction *inst )
1676 {
1677 unsigned chan_index;
1678
1679 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1680 FETCH( func, *inst, 0, 0, chan_index );
1681 FETCH( func, *inst, 1, 1, chan_index );
1682 FETCH( func, *inst, 2, 2, chan_index );
1683 sse_cmpps(
1684 func,
1685 make_xmm( 0 ),
1686 get_temp(
1687 TGSI_EXEC_TEMP_00000000_I,
1688 TGSI_EXEC_TEMP_00000000_C ),
1689 cc_LessThan );
1690 sse_andps(
1691 func,
1692 make_xmm( 1 ),
1693 make_xmm( 0 ) );
1694 sse_andnps(
1695 func,
1696 make_xmm( 0 ),
1697 make_xmm( 2 ) );
1698 sse_orps(
1699 func,
1700 make_xmm( 0 ),
1701 make_xmm( 1 ) );
1702 STORE( func, *inst, 0, 0, chan_index );
1703 }
1704 }
1705
1706
1707 /**
1708 * Check if inst src/dest regs use indirect addressing into temporary,
1709 * input or output register files.
1710 */
1711 static boolean
1712 indirect_reg_reference(const struct tgsi_full_instruction *inst)
1713 {
1714 uint i;
1715 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1716 const struct tgsi_full_src_register *reg = &inst->Src[i];
1717 if ((reg->Register.File == TGSI_FILE_TEMPORARY ||
1718 reg->Register.File == TGSI_FILE_INPUT ||
1719 reg->Register.File == TGSI_FILE_OUTPUT) &&
1720 reg->Register.Indirect)
1721 return TRUE;
1722 }
1723 for (i = 0; i < inst->Instruction.NumDstRegs; i++) {
1724 const struct tgsi_full_dst_register *reg = &inst->Dst[i];
1725 if ((reg->Register.File == TGSI_FILE_TEMPORARY ||
1726 reg->Register.File == TGSI_FILE_INPUT ||
1727 reg->Register.File == TGSI_FILE_OUTPUT) &&
1728 reg->Register.Indirect)
1729 return TRUE;
1730 }
1731 return FALSE;
1732 }
1733
1734
1735 static int
1736 emit_instruction(
1737 struct x86_function *func,
1738 struct tgsi_full_instruction *inst )
1739 {
1740 unsigned chan_index;
1741
1742 /* we can't handle indirect addressing into temp register file yet */
1743 if (indirect_reg_reference(inst))
1744 return FALSE;
1745
1746 switch (inst->Instruction.Opcode) {
1747 case TGSI_OPCODE_ARL:
1748 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1749 FETCH( func, *inst, 0, 0, chan_index );
1750 emit_flr(func, 0, 0);
1751 emit_f2it( func, 0 );
1752 STORE( func, *inst, 0, 0, chan_index );
1753 }
1754 break;
1755
1756 case TGSI_OPCODE_MOV:
1757 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1758 FETCH( func, *inst, 4 + chan_index, 0, chan_index );
1759 }
1760 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1761 STORE( func, *inst, 4 + chan_index, 0, chan_index );
1762 }
1763 break;
1764
1765 case TGSI_OPCODE_LIT:
1766 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1767 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1768 emit_tempf(
1769 func,
1770 0,
1771 TEMP_ONE_I,
1772 TEMP_ONE_C);
1773 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
1774 STORE( func, *inst, 0, 0, CHAN_X );
1775 }
1776 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1777 STORE( func, *inst, 0, 0, CHAN_W );
1778 }
1779 }
1780 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1781 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1782 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
1783 FETCH( func, *inst, 0, 0, CHAN_X );
1784 sse_maxps(
1785 func,
1786 make_xmm( 0 ),
1787 get_temp(
1788 TGSI_EXEC_TEMP_00000000_I,
1789 TGSI_EXEC_TEMP_00000000_C ) );
1790 STORE( func, *inst, 0, 0, CHAN_Y );
1791 }
1792 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1793 /* XMM[1] = SrcReg[0].yyyy */
1794 FETCH( func, *inst, 1, 0, CHAN_Y );
1795 /* XMM[1] = max(XMM[1], 0) */
1796 sse_maxps(
1797 func,
1798 make_xmm( 1 ),
1799 get_temp(
1800 TGSI_EXEC_TEMP_00000000_I,
1801 TGSI_EXEC_TEMP_00000000_C ) );
1802 /* XMM[2] = SrcReg[0].wwww */
1803 FETCH( func, *inst, 2, 0, CHAN_W );
1804 /* XMM[2] = min(XMM[2], 128.0) */
1805 sse_minps(
1806 func,
1807 make_xmm( 2 ),
1808 get_temp(
1809 TGSI_EXEC_TEMP_128_I,
1810 TGSI_EXEC_TEMP_128_C ) );
1811 /* XMM[2] = max(XMM[2], -128.0) */
1812 sse_maxps(
1813 func,
1814 make_xmm( 2 ),
1815 get_temp(
1816 TGSI_EXEC_TEMP_MINUS_128_I,
1817 TGSI_EXEC_TEMP_MINUS_128_C ) );
1818 emit_pow( func, 3, 1, 1, 2 );
1819 FETCH( func, *inst, 0, 0, CHAN_X );
1820 sse_xorps(
1821 func,
1822 make_xmm( 2 ),
1823 make_xmm( 2 ) );
1824 sse_cmpps(
1825 func,
1826 make_xmm( 2 ),
1827 make_xmm( 0 ),
1828 cc_LessThan );
1829 sse_andps(
1830 func,
1831 make_xmm( 2 ),
1832 make_xmm( 1 ) );
1833 STORE( func, *inst, 2, 0, CHAN_Z );
1834 }
1835 }
1836 break;
1837
1838 case TGSI_OPCODE_RCP:
1839 FETCH( func, *inst, 0, 0, CHAN_X );
1840 emit_rcp( func, 0, 0 );
1841 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1842 STORE( func, *inst, 0, 0, chan_index );
1843 }
1844 break;
1845
1846 case TGSI_OPCODE_RSQ:
1847 FETCH( func, *inst, 0, 0, CHAN_X );
1848 emit_abs( func, 0 );
1849 emit_rsqrt( func, 1, 0 );
1850 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1851 STORE( func, *inst, 1, 0, chan_index );
1852 }
1853 break;
1854
1855 case TGSI_OPCODE_EXP:
1856 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1857 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1858 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1859 FETCH( func, *inst, 0, 0, CHAN_X );
1860 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1861 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1862 emit_MOV( func, 1, 0 );
1863 emit_flr( func, 2, 1 );
1864 /* dst.x = ex2(floor(src.x)) */
1865 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
1866 emit_MOV( func, 2, 1 );
1867 emit_ex2( func, 3, 2 );
1868 STORE( func, *inst, 2, 0, CHAN_X );
1869 }
1870 /* dst.y = src.x - floor(src.x) */
1871 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1872 emit_MOV( func, 2, 0 );
1873 emit_sub( func, 2, 1 );
1874 STORE( func, *inst, 2, 0, CHAN_Y );
1875 }
1876 }
1877 /* dst.z = ex2(src.x) */
1878 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1879 emit_ex2( func, 3, 0 );
1880 STORE( func, *inst, 0, 0, CHAN_Z );
1881 }
1882 }
1883 /* dst.w = 1.0 */
1884 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
1885 emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
1886 STORE( func, *inst, 0, 0, CHAN_W );
1887 }
1888 break;
1889
1890 case TGSI_OPCODE_LOG:
1891 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1892 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1893 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1894 FETCH( func, *inst, 0, 0, CHAN_X );
1895 emit_abs( func, 0 );
1896 emit_MOV( func, 1, 0 );
1897 emit_lg2( func, 2, 1 );
1898 /* dst.z = lg2(abs(src.x)) */
1899 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1900 STORE( func, *inst, 1, 0, CHAN_Z );
1901 }
1902 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1903 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1904 emit_flr( func, 2, 1 );
1905 /* dst.x = floor(lg2(abs(src.x))) */
1906 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
1907 STORE( func, *inst, 1, 0, CHAN_X );
1908 }
1909 /* dst.x = abs(src)/ex2(floor(lg2(abs(src.x)))) */
1910 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1911 emit_ex2( func, 2, 1 );
1912 emit_rcp( func, 1, 1 );
1913 emit_mul( func, 0, 1 );
1914 STORE( func, *inst, 0, 0, CHAN_Y );
1915 }
1916 }
1917 }
1918 /* dst.w = 1.0 */
1919 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
1920 emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
1921 STORE( func, *inst, 0, 0, CHAN_W );
1922 }
1923 break;
1924
1925 case TGSI_OPCODE_MUL:
1926 /* do all fetches and adds, storing results in temp regs */
1927 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1928 int r = chan_index + 1;
1929 FETCH( func, *inst, 0, 0, chan_index ); /* load xmm[0] */
1930 FETCH( func, *inst, r, 1, chan_index ); /* load xmm[r] */
1931 emit_mul( func, r, 0 ); /* xmm[r] = xmm[r] * xmm[0] */
1932 }
1933 /* do all stores of the temp regs */
1934 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1935 int r = chan_index + 1;
1936 STORE( func, *inst, r, 0, chan_index ); /* store xmm[r] */
1937 }
1938 break;
1939
1940 case TGSI_OPCODE_ADD:
1941 /* do all fetches and adds, storing results in temp regs */
1942 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1943 int r = chan_index + 1;
1944 FETCH( func, *inst, 0, 0, chan_index ); /* load xmm[0] */
1945 FETCH( func, *inst, r, 1, chan_index ); /* load xmm[r] */
1946 emit_add( func, r, 0 ); /* xmm[r] = xmm[r] + xmm[0] */
1947 }
1948 /* do all stores of the temp regs */
1949 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1950 int r = chan_index + 1;
1951 STORE( func, *inst, r, 0, chan_index ); /* store xmm[r] */
1952 }
1953 break;
1954
1955 case TGSI_OPCODE_DP3:
1956 FETCH( func, *inst, 0, 0, CHAN_X );
1957 FETCH( func, *inst, 1, 1, CHAN_X );
1958 emit_mul( func, 0, 1 );
1959 FETCH( func, *inst, 1, 0, CHAN_Y );
1960 FETCH( func, *inst, 2, 1, CHAN_Y );
1961 emit_mul( func, 1, 2 );
1962 emit_add( func, 0, 1 );
1963 FETCH( func, *inst, 1, 0, CHAN_Z );
1964 FETCH( func, *inst, 2, 1, CHAN_Z );
1965 emit_mul( func, 1, 2 );
1966 emit_add( func, 0, 1 );
1967 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1968 STORE( func, *inst, 0, 0, chan_index );
1969 }
1970 break;
1971
1972 case TGSI_OPCODE_DP4:
1973 FETCH( func, *inst, 0, 0, CHAN_X );
1974 FETCH( func, *inst, 1, 1, CHAN_X );
1975 emit_mul( func, 0, 1 );
1976 FETCH( func, *inst, 1, 0, CHAN_Y );
1977 FETCH( func, *inst, 2, 1, CHAN_Y );
1978 emit_mul( func, 1, 2 );
1979 emit_add( func, 0, 1 );
1980 FETCH( func, *inst, 1, 0, CHAN_Z );
1981 FETCH( func, *inst, 2, 1, CHAN_Z );
1982 emit_mul(func, 1, 2 );
1983 emit_add(func, 0, 1 );
1984 FETCH( func, *inst, 1, 0, CHAN_W );
1985 FETCH( func, *inst, 2, 1, CHAN_W );
1986 emit_mul( func, 1, 2 );
1987 emit_add( func, 0, 1 );
1988 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1989 STORE( func, *inst, 0, 0, chan_index );
1990 }
1991 break;
1992
1993 case TGSI_OPCODE_DST:
1994 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
1995 emit_tempf(
1996 func,
1997 0,
1998 TEMP_ONE_I,
1999 TEMP_ONE_C );
2000 STORE( func, *inst, 0, 0, CHAN_X );
2001 }
2002 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2003 FETCH( func, *inst, 0, 0, CHAN_Y );
2004 FETCH( func, *inst, 1, 1, CHAN_Y );
2005 emit_mul( func, 0, 1 );
2006 STORE( func, *inst, 0, 0, CHAN_Y );
2007 }
2008 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2009 FETCH( func, *inst, 0, 0, CHAN_Z );
2010 STORE( func, *inst, 0, 0, CHAN_Z );
2011 }
2012 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2013 FETCH( func, *inst, 0, 1, CHAN_W );
2014 STORE( func, *inst, 0, 0, CHAN_W );
2015 }
2016 break;
2017
2018 case TGSI_OPCODE_MIN:
2019 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2020 FETCH( func, *inst, 0, 0, chan_index );
2021 FETCH( func, *inst, 1, 1, chan_index );
2022 sse_minps(
2023 func,
2024 make_xmm( 0 ),
2025 make_xmm( 1 ) );
2026 STORE( func, *inst, 0, 0, chan_index );
2027 }
2028 break;
2029
2030 case TGSI_OPCODE_MAX:
2031 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2032 FETCH( func, *inst, 0, 0, chan_index );
2033 FETCH( func, *inst, 1, 1, chan_index );
2034 sse_maxps(
2035 func,
2036 make_xmm( 0 ),
2037 make_xmm( 1 ) );
2038 STORE( func, *inst, 0, 0, chan_index );
2039 }
2040 break;
2041
2042 case TGSI_OPCODE_SLT:
2043 emit_setcc( func, inst, cc_LessThan );
2044 break;
2045
2046 case TGSI_OPCODE_SGE:
2047 emit_setcc( func, inst, cc_NotLessThan );
2048 break;
2049
2050 case TGSI_OPCODE_MAD:
2051 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2052 FETCH( func, *inst, 0, 0, chan_index );
2053 FETCH( func, *inst, 1, 1, chan_index );
2054 FETCH( func, *inst, 2, 2, chan_index );
2055 emit_mul( func, 0, 1 );
2056 emit_add( func, 0, 2 );
2057 STORE( func, *inst, 0, 0, chan_index );
2058 }
2059 break;
2060
2061 case TGSI_OPCODE_SUB:
2062 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2063 FETCH( func, *inst, 0, 0, chan_index );
2064 FETCH( func, *inst, 1, 1, chan_index );
2065 emit_sub( func, 0, 1 );
2066 STORE( func, *inst, 0, 0, chan_index );
2067 }
2068 break;
2069
2070 case TGSI_OPCODE_LRP:
2071 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2072 FETCH( func, *inst, 0, 0, chan_index );
2073 FETCH( func, *inst, 1, 1, chan_index );
2074 FETCH( func, *inst, 2, 2, chan_index );
2075 emit_sub( func, 1, 2 );
2076 emit_mul( func, 0, 1 );
2077 emit_add( func, 0, 2 );
2078 STORE( func, *inst, 0, 0, chan_index );
2079 }
2080 break;
2081
2082 case TGSI_OPCODE_CND:
2083 return 0;
2084 break;
2085
2086 case TGSI_OPCODE_DP2A:
2087 FETCH( func, *inst, 0, 0, CHAN_X ); /* xmm0 = src[0].x */
2088 FETCH( func, *inst, 1, 1, CHAN_X ); /* xmm1 = src[1].x */
2089 emit_mul( func, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
2090 FETCH( func, *inst, 1, 0, CHAN_Y ); /* xmm1 = src[0].y */
2091 FETCH( func, *inst, 2, 1, CHAN_Y ); /* xmm2 = src[1].y */
2092 emit_mul( func, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
2093 emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2094 FETCH( func, *inst, 1, 2, CHAN_X ); /* xmm1 = src[2].x */
2095 emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2096 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2097 STORE( func, *inst, 0, 0, chan_index ); /* dest[ch] = xmm0 */
2098 }
2099 break;
2100
2101 case TGSI_OPCODE_FRC:
2102 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2103 FETCH( func, *inst, 0, 0, chan_index );
2104 emit_frc( func, 0, 0 );
2105 STORE( func, *inst, 0, 0, chan_index );
2106 }
2107 break;
2108
2109 case TGSI_OPCODE_CLAMP:
2110 return 0;
2111 break;
2112
2113 case TGSI_OPCODE_FLR:
2114 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2115 FETCH( func, *inst, 0, 0, chan_index );
2116 emit_flr( func, 0, 0 );
2117 STORE( func, *inst, 0, 0, chan_index );
2118 }
2119 break;
2120
2121 case TGSI_OPCODE_ROUND:
2122 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2123 FETCH( func, *inst, 0, 0, chan_index );
2124 emit_rnd( func, 0, 0 );
2125 STORE( func, *inst, 0, 0, chan_index );
2126 }
2127 break;
2128
2129 case TGSI_OPCODE_EX2:
2130 FETCH( func, *inst, 0, 0, CHAN_X );
2131 emit_ex2( func, 0, 0 );
2132 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2133 STORE( func, *inst, 0, 0, chan_index );
2134 }
2135 break;
2136
2137 case TGSI_OPCODE_LG2:
2138 FETCH( func, *inst, 0, 0, CHAN_X );
2139 emit_lg2( func, 0, 0 );
2140 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2141 STORE( func, *inst, 0, 0, chan_index );
2142 }
2143 break;
2144
2145 case TGSI_OPCODE_POW:
2146 FETCH( func, *inst, 0, 0, CHAN_X );
2147 FETCH( func, *inst, 1, 1, CHAN_X );
2148 emit_pow( func, 0, 0, 0, 1 );
2149 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2150 STORE( func, *inst, 0, 0, chan_index );
2151 }
2152 break;
2153
2154 case TGSI_OPCODE_XPD:
2155 /* Note: we do all stores after all operands have been fetched
2156 * to avoid src/dst register aliasing issues for an instruction
2157 * such as: XPD TEMP[2].xyz, TEMP[0], TEMP[2];
2158 */
2159 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
2160 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
2161 FETCH( func, *inst, 1, 1, CHAN_Z ); /* xmm[1] = src[1].z */
2162 FETCH( func, *inst, 3, 0, CHAN_Z ); /* xmm[3] = src[0].z */
2163 }
2164 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
2165 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
2166 FETCH( func, *inst, 0, 0, CHAN_Y ); /* xmm[0] = src[0].y */
2167 FETCH( func, *inst, 4, 1, CHAN_Y ); /* xmm[4] = src[1].y */
2168 }
2169 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
2170 emit_MOV( func, 7, 0 ); /* xmm[7] = xmm[0] */
2171 emit_mul( func, 7, 1 ); /* xmm[7] = xmm[2] * xmm[1] */
2172 emit_MOV( func, 5, 3 ); /* xmm[5] = xmm[3] */
2173 emit_mul( func, 5, 4 ); /* xmm[5] = xmm[5] * xmm[4] */
2174 emit_sub( func, 7, 5 ); /* xmm[7] = xmm[2] - xmm[5] */
2175 /* store xmm[7] in dst.x below */
2176 }
2177 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
2178 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
2179 FETCH( func, *inst, 2, 1, CHAN_X ); /* xmm[2] = src[1].x */
2180 FETCH( func, *inst, 5, 0, CHAN_X ); /* xmm[5] = src[0].x */
2181 }
2182 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2183 emit_mul( func, 3, 2 ); /* xmm[3] = xmm[3] * xmm[2] */
2184 emit_mul( func, 1, 5 ); /* xmm[1] = xmm[1] * xmm[5] */
2185 emit_sub( func, 3, 1 ); /* xmm[3] = xmm[3] - xmm[1] */
2186 /* store xmm[3] in dst.y below */
2187 }
2188 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2189 emit_mul( func, 5, 4 ); /* xmm[5] = xmm[5] * xmm[4] */
2190 emit_mul( func, 0, 2 ); /* xmm[0] = xmm[0] * xmm[2] */
2191 emit_sub( func, 5, 0 ); /* xmm[5] = xmm[5] - xmm[0] */
2192 STORE( func, *inst, 5, 0, CHAN_Z ); /* dst.z = xmm[5] */
2193 }
2194 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
2195 STORE( func, *inst, 7, 0, CHAN_X ); /* dst.x = xmm[7] */
2196 }
2197 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2198 STORE( func, *inst, 3, 0, CHAN_Y ); /* dst.y = xmm[3] */
2199 }
2200 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2201 emit_tempf(
2202 func,
2203 0,
2204 TEMP_ONE_I,
2205 TEMP_ONE_C );
2206 STORE( func, *inst, 0, 0, CHAN_W );
2207 }
2208 break;
2209
2210 case TGSI_OPCODE_ABS:
2211 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2212 FETCH( func, *inst, 0, 0, chan_index );
2213 emit_abs( func, 0) ;
2214
2215 STORE( func, *inst, 0, 0, chan_index );
2216 }
2217 break;
2218
2219 case TGSI_OPCODE_RCC:
2220 return 0;
2221 break;
2222
2223 case TGSI_OPCODE_DPH:
2224 FETCH( func, *inst, 0, 0, CHAN_X );
2225 FETCH( func, *inst, 1, 1, CHAN_X );
2226 emit_mul( func, 0, 1 );
2227 FETCH( func, *inst, 1, 0, CHAN_Y );
2228 FETCH( func, *inst, 2, 1, CHAN_Y );
2229 emit_mul( func, 1, 2 );
2230 emit_add( func, 0, 1 );
2231 FETCH( func, *inst, 1, 0, CHAN_Z );
2232 FETCH( func, *inst, 2, 1, CHAN_Z );
2233 emit_mul( func, 1, 2 );
2234 emit_add( func, 0, 1 );
2235 FETCH( func, *inst, 1, 1, CHAN_W );
2236 emit_add( func, 0, 1 );
2237 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2238 STORE( func, *inst, 0, 0, chan_index );
2239 }
2240 break;
2241
2242 case TGSI_OPCODE_COS:
2243 FETCH( func, *inst, 0, 0, CHAN_X );
2244 emit_cos( func, 0, 0 );
2245 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2246 STORE( func, *inst, 0, 0, chan_index );
2247 }
2248 break;
2249
2250 case TGSI_OPCODE_DDX:
2251 return 0;
2252 break;
2253
2254 case TGSI_OPCODE_DDY:
2255 return 0;
2256 break;
2257
2258 case TGSI_OPCODE_KILP:
2259 /* predicated kill */
2260 emit_kilp( func );
2261 return 0; /* XXX fix me */
2262 break;
2263
2264 case TGSI_OPCODE_KIL:
2265 /* conditional kill */
2266 emit_kil( func, &inst->Src[0] );
2267 break;
2268
2269 case TGSI_OPCODE_PK2H:
2270 return 0;
2271 break;
2272
2273 case TGSI_OPCODE_PK2US:
2274 return 0;
2275 break;
2276
2277 case TGSI_OPCODE_PK4B:
2278 return 0;
2279 break;
2280
2281 case TGSI_OPCODE_PK4UB:
2282 return 0;
2283 break;
2284
2285 case TGSI_OPCODE_RFL:
2286 return 0;
2287 break;
2288
2289 case TGSI_OPCODE_SEQ:
2290 emit_setcc( func, inst, cc_Equal );
2291 break;
2292
2293 case TGSI_OPCODE_SFL:
2294 return 0;
2295 break;
2296
2297 case TGSI_OPCODE_SGT:
2298 emit_setcc( func, inst, cc_NotLessThanEqual );
2299 break;
2300
2301 case TGSI_OPCODE_SIN:
2302 FETCH( func, *inst, 0, 0, CHAN_X );
2303 emit_sin( func, 0, 0 );
2304 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2305 STORE( func, *inst, 0, 0, chan_index );
2306 }
2307 break;
2308
2309 case TGSI_OPCODE_SLE:
2310 emit_setcc( func, inst, cc_LessThanEqual );
2311 break;
2312
2313 case TGSI_OPCODE_SNE:
2314 emit_setcc( func, inst, cc_NotEqual );
2315 break;
2316
2317 case TGSI_OPCODE_STR:
2318 return 0;
2319 break;
2320
2321 case TGSI_OPCODE_TEX:
2322 emit_tex( func, inst, FALSE, FALSE );
2323 break;
2324
2325 case TGSI_OPCODE_TXD:
2326 return 0;
2327 break;
2328
2329 case TGSI_OPCODE_UP2H:
2330 return 0;
2331 break;
2332
2333 case TGSI_OPCODE_UP2US:
2334 return 0;
2335 break;
2336
2337 case TGSI_OPCODE_UP4B:
2338 return 0;
2339 break;
2340
2341 case TGSI_OPCODE_UP4UB:
2342 return 0;
2343 break;
2344
2345 case TGSI_OPCODE_X2D:
2346 return 0;
2347 break;
2348
2349 case TGSI_OPCODE_ARA:
2350 return 0;
2351 break;
2352
2353 case TGSI_OPCODE_ARR:
2354 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2355 FETCH( func, *inst, 0, 0, chan_index );
2356 emit_rnd( func, 0, 0 );
2357 emit_f2it( func, 0 );
2358 STORE( func, *inst, 0, 0, chan_index );
2359 }
2360 break;
2361
2362 case TGSI_OPCODE_BRA:
2363 return 0;
2364 break;
2365
2366 case TGSI_OPCODE_CAL:
2367 return 0;
2368 break;
2369
2370 case TGSI_OPCODE_RET:
2371 emit_ret( func );
2372 break;
2373
2374 case TGSI_OPCODE_END:
2375 break;
2376
2377 case TGSI_OPCODE_SSG:
2378 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2379 FETCH( func, *inst, 0, 0, chan_index );
2380 emit_sgn( func, 0, 0 );
2381 STORE( func, *inst, 0, 0, chan_index );
2382 }
2383 break;
2384
2385 case TGSI_OPCODE_CMP:
2386 emit_cmp (func, inst);
2387 break;
2388
2389 case TGSI_OPCODE_SCS:
2390 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
2391 FETCH( func, *inst, 0, 0, CHAN_X );
2392 emit_cos( func, 0, 0 );
2393 STORE( func, *inst, 0, 0, CHAN_X );
2394 }
2395 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2396 FETCH( func, *inst, 0, 0, CHAN_X );
2397 emit_sin( func, 0, 0 );
2398 STORE( func, *inst, 0, 0, CHAN_Y );
2399 }
2400 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2401 emit_tempf(
2402 func,
2403 0,
2404 TGSI_EXEC_TEMP_00000000_I,
2405 TGSI_EXEC_TEMP_00000000_C );
2406 STORE( func, *inst, 0, 0, CHAN_Z );
2407 }
2408 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2409 emit_tempf(
2410 func,
2411 0,
2412 TEMP_ONE_I,
2413 TEMP_ONE_C );
2414 STORE( func, *inst, 0, 0, CHAN_W );
2415 }
2416 break;
2417
2418 case TGSI_OPCODE_TXB:
2419 emit_tex( func, inst, TRUE, FALSE );
2420 break;
2421
2422 case TGSI_OPCODE_NRM:
2423 /* fall-through */
2424 case TGSI_OPCODE_NRM4:
2425 /* 3 or 4-component normalization */
2426 {
2427 uint dims = (inst->Instruction.Opcode == TGSI_OPCODE_NRM) ? 3 : 4;
2428
2429 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) ||
2430 IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y) ||
2431 IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z) ||
2432 (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 4)) {
2433
2434 /* NOTE: Cannot use xmm regs 2/3 here (see emit_rsqrt() above). */
2435
2436 /* xmm4 = src.x */
2437 /* xmm0 = src.x * src.x */
2438 FETCH(func, *inst, 0, 0, CHAN_X);
2439 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
2440 emit_MOV(func, 4, 0);
2441 }
2442 emit_mul(func, 0, 0);
2443
2444 /* xmm5 = src.y */
2445 /* xmm0 = xmm0 + src.y * src.y */
2446 FETCH(func, *inst, 1, 0, CHAN_Y);
2447 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2448 emit_MOV(func, 5, 1);
2449 }
2450 emit_mul(func, 1, 1);
2451 emit_add(func, 0, 1);
2452
2453 /* xmm6 = src.z */
2454 /* xmm0 = xmm0 + src.z * src.z */
2455 FETCH(func, *inst, 1, 0, CHAN_Z);
2456 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2457 emit_MOV(func, 6, 1);
2458 }
2459 emit_mul(func, 1, 1);
2460 emit_add(func, 0, 1);
2461
2462 if (dims == 4) {
2463 /* xmm7 = src.w */
2464 /* xmm0 = xmm0 + src.w * src.w */
2465 FETCH(func, *inst, 1, 0, CHAN_W);
2466 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W)) {
2467 emit_MOV(func, 7, 1);
2468 }
2469 emit_mul(func, 1, 1);
2470 emit_add(func, 0, 1);
2471 }
2472
2473 /* xmm1 = 1 / sqrt(xmm0) */
2474 emit_rsqrt(func, 1, 0);
2475
2476 /* dst.x = xmm1 * src.x */
2477 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
2478 emit_mul(func, 4, 1);
2479 STORE(func, *inst, 4, 0, CHAN_X);
2480 }
2481
2482 /* dst.y = xmm1 * src.y */
2483 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2484 emit_mul(func, 5, 1);
2485 STORE(func, *inst, 5, 0, CHAN_Y);
2486 }
2487
2488 /* dst.z = xmm1 * src.z */
2489 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2490 emit_mul(func, 6, 1);
2491 STORE(func, *inst, 6, 0, CHAN_Z);
2492 }
2493
2494 /* dst.w = xmm1 * src.w */
2495 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) && dims == 4) {
2496 emit_mul(func, 7, 1);
2497 STORE(func, *inst, 7, 0, CHAN_W);
2498 }
2499 }
2500
2501 /* dst0.w = 1.0 */
2502 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 3) {
2503 emit_tempf(func, 0, TEMP_ONE_I, TEMP_ONE_C);
2504 STORE(func, *inst, 0, 0, CHAN_W);
2505 }
2506 }
2507 break;
2508
2509 case TGSI_OPCODE_DIV:
2510 return 0;
2511 break;
2512
2513 case TGSI_OPCODE_DP2:
2514 FETCH( func, *inst, 0, 0, CHAN_X ); /* xmm0 = src[0].x */
2515 FETCH( func, *inst, 1, 1, CHAN_X ); /* xmm1 = src[1].x */
2516 emit_mul( func, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
2517 FETCH( func, *inst, 1, 0, CHAN_Y ); /* xmm1 = src[0].y */
2518 FETCH( func, *inst, 2, 1, CHAN_Y ); /* xmm2 = src[1].y */
2519 emit_mul( func, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
2520 emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2521 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2522 STORE( func, *inst, 0, 0, chan_index ); /* dest[ch] = xmm0 */
2523 }
2524 break;
2525
2526 case TGSI_OPCODE_TXL:
2527 return 0;
2528 break;
2529
2530 case TGSI_OPCODE_TXP:
2531 emit_tex( func, inst, FALSE, TRUE );
2532 break;
2533
2534 case TGSI_OPCODE_BRK:
2535 return 0;
2536 break;
2537
2538 case TGSI_OPCODE_IF:
2539 return 0;
2540 break;
2541
2542 case TGSI_OPCODE_ELSE:
2543 return 0;
2544 break;
2545
2546 case TGSI_OPCODE_ENDIF:
2547 return 0;
2548 break;
2549
2550 case TGSI_OPCODE_PUSHA:
2551 return 0;
2552 break;
2553
2554 case TGSI_OPCODE_POPA:
2555 return 0;
2556 break;
2557
2558 case TGSI_OPCODE_CEIL:
2559 return 0;
2560 break;
2561
2562 case TGSI_OPCODE_I2F:
2563 return 0;
2564 break;
2565
2566 case TGSI_OPCODE_NOT:
2567 return 0;
2568 break;
2569
2570 case TGSI_OPCODE_TRUNC:
2571 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2572 FETCH( func, *inst, 0, 0, chan_index );
2573 emit_f2it( func, 0 );
2574 emit_i2f( func, 0 );
2575 STORE( func, *inst, 0, 0, chan_index );
2576 }
2577 break;
2578
2579 case TGSI_OPCODE_SHL:
2580 return 0;
2581 break;
2582
2583 case TGSI_OPCODE_ISHR:
2584 return 0;
2585 break;
2586
2587 case TGSI_OPCODE_AND:
2588 return 0;
2589 break;
2590
2591 case TGSI_OPCODE_OR:
2592 return 0;
2593 break;
2594
2595 case TGSI_OPCODE_MOD:
2596 return 0;
2597 break;
2598
2599 case TGSI_OPCODE_XOR:
2600 return 0;
2601 break;
2602
2603 case TGSI_OPCODE_SAD:
2604 return 0;
2605 break;
2606
2607 case TGSI_OPCODE_TXF:
2608 return 0;
2609 break;
2610
2611 case TGSI_OPCODE_TXQ:
2612 return 0;
2613 break;
2614
2615 case TGSI_OPCODE_CONT:
2616 return 0;
2617 break;
2618
2619 case TGSI_OPCODE_EMIT:
2620 return 0;
2621 break;
2622
2623 case TGSI_OPCODE_ENDPRIM:
2624 return 0;
2625 break;
2626
2627 default:
2628 return 0;
2629 }
2630
2631 return 1;
2632 }
2633
2634 static void
2635 emit_declaration(
2636 struct x86_function *func,
2637 struct tgsi_full_declaration *decl )
2638 {
2639 if( decl->Declaration.File == TGSI_FILE_INPUT ||
2640 decl->Declaration.File == TGSI_FILE_SYSTEM_VALUE ) {
2641 unsigned first, last, mask;
2642 unsigned i, j;
2643
2644 first = decl->Range.First;
2645 last = decl->Range.Last;
2646 mask = decl->Declaration.UsageMask;
2647
2648 for( i = first; i <= last; i++ ) {
2649 for( j = 0; j < NUM_CHANNELS; j++ ) {
2650 if( mask & (1 << j) ) {
2651 switch( decl->Declaration.Interpolate ) {
2652 case TGSI_INTERPOLATE_CONSTANT:
2653 emit_coef_a0( func, 0, i, j );
2654 emit_inputs( func, 0, i, j );
2655 break;
2656
2657 case TGSI_INTERPOLATE_LINEAR:
2658 emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
2659 emit_coef_dadx( func, 1, i, j );
2660 emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
2661 emit_coef_dady( func, 3, i, j );
2662 emit_mul( func, 0, 1 ); /* x * dadx */
2663 emit_coef_a0( func, 4, i, j );
2664 emit_mul( func, 2, 3 ); /* y * dady */
2665 emit_add( func, 0, 4 ); /* x * dadx + a0 */
2666 emit_add( func, 0, 2 ); /* x * dadx + y * dady + a0 */
2667 emit_inputs( func, 0, i, j );
2668 break;
2669
2670 case TGSI_INTERPOLATE_PERSPECTIVE:
2671 emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
2672 emit_coef_dadx( func, 1, i, j );
2673 emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
2674 emit_coef_dady( func, 3, i, j );
2675 emit_mul( func, 0, 1 ); /* x * dadx */
2676 emit_tempf( func, 4, 0, TGSI_SWIZZLE_W );
2677 emit_coef_a0( func, 5, i, j );
2678 emit_rcp( func, 4, 4 ); /* 1.0 / w */
2679 emit_mul( func, 2, 3 ); /* y * dady */
2680 emit_add( func, 0, 5 ); /* x * dadx + a0 */
2681 emit_add( func, 0, 2 ); /* x * dadx + y * dady + a0 */
2682 emit_mul( func, 0, 4 ); /* (x * dadx + y * dady + a0) / w */
2683 emit_inputs( func, 0, i, j );
2684 break;
2685
2686 default:
2687 assert( 0 );
2688 break;
2689 }
2690 }
2691 }
2692 }
2693 }
2694 }
2695
2696 static void aos_to_soa( struct x86_function *func,
2697 uint arg_aos,
2698 uint arg_machine,
2699 uint arg_num,
2700 uint arg_stride )
2701 {
2702 struct x86_reg soa_input = x86_make_reg( file_REG32, reg_AX );
2703 struct x86_reg aos_input = x86_make_reg( file_REG32, reg_BX );
2704 struct x86_reg num_inputs = x86_make_reg( file_REG32, reg_CX );
2705 struct x86_reg stride = x86_make_reg( file_REG32, reg_DX );
2706 int inner_loop;
2707
2708
2709 /* Save EBX */
2710 x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
2711
2712 x86_mov( func, aos_input, x86_fn_arg( func, arg_aos ) );
2713 x86_mov( func, soa_input, x86_fn_arg( func, arg_machine ) );
2714 x86_lea( func, soa_input,
2715 x86_make_disp( soa_input,
2716 Offset(struct tgsi_exec_machine, Inputs) ) );
2717 x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) );
2718 x86_mov( func, stride, x86_fn_arg( func, arg_stride ) );
2719
2720 /* do */
2721 inner_loop = x86_get_label( func );
2722 {
2723 x86_push( func, aos_input );
2724 sse_movlps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
2725 sse_movlps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
2726 x86_add( func, aos_input, stride );
2727 sse_movhps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
2728 sse_movhps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
2729 x86_add( func, aos_input, stride );
2730 sse_movlps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
2731 sse_movlps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
2732 x86_add( func, aos_input, stride );
2733 sse_movhps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
2734 sse_movhps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
2735 x86_pop( func, aos_input );
2736
2737 sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
2738 sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
2739 sse_shufps( func, make_xmm( 0 ), make_xmm( 1 ), 0x88 );
2740 sse_shufps( func, make_xmm( 2 ), make_xmm( 1 ), 0xdd );
2741 sse_shufps( func, make_xmm( 3 ), make_xmm( 4 ), 0x88 );
2742 sse_shufps( func, make_xmm( 5 ), make_xmm( 4 ), 0xdd );
2743
2744 sse_movups( func, x86_make_disp( soa_input, 0 ), make_xmm( 0 ) );
2745 sse_movups( func, x86_make_disp( soa_input, 16 ), make_xmm( 2 ) );
2746 sse_movups( func, x86_make_disp( soa_input, 32 ), make_xmm( 3 ) );
2747 sse_movups( func, x86_make_disp( soa_input, 48 ), make_xmm( 5 ) );
2748
2749 /* Advance to next input */
2750 x86_lea( func, aos_input, x86_make_disp(aos_input, 16) );
2751 x86_lea( func, soa_input, x86_make_disp(soa_input, 64) );
2752 }
2753 /* while --num_inputs */
2754 x86_dec( func, num_inputs );
2755 x86_jcc( func, cc_NE, inner_loop );
2756
2757 /* Restore EBX */
2758 x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
2759 }
2760
2761 static void soa_to_aos( struct x86_function *func,
2762 uint arg_aos,
2763 uint arg_machine,
2764 uint arg_num,
2765 uint arg_stride )
2766 {
2767 struct x86_reg soa_output = x86_make_reg( file_REG32, reg_AX );
2768 struct x86_reg aos_output = x86_make_reg( file_REG32, reg_BX );
2769 struct x86_reg num_outputs = x86_make_reg( file_REG32, reg_CX );
2770 struct x86_reg temp = x86_make_reg( file_REG32, reg_DX );
2771 int inner_loop;
2772
2773 /* Save EBX */
2774 x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
2775
2776 x86_mov( func, aos_output, x86_fn_arg( func, arg_aos ) );
2777 x86_mov( func, soa_output, x86_fn_arg( func, arg_machine ) );
2778 x86_lea( func, soa_output,
2779 x86_make_disp( soa_output,
2780 Offset(struct tgsi_exec_machine, Outputs) ) );
2781 x86_mov( func, num_outputs, x86_fn_arg( func, arg_num ) );
2782
2783 /* do */
2784 inner_loop = x86_get_label( func );
2785 {
2786 sse_movups( func, make_xmm( 0 ), x86_make_disp( soa_output, 0 ) );
2787 sse_movups( func, make_xmm( 1 ), x86_make_disp( soa_output, 16 ) );
2788 sse_movups( func, make_xmm( 3 ), x86_make_disp( soa_output, 32 ) );
2789 sse_movups( func, make_xmm( 4 ), x86_make_disp( soa_output, 48 ) );
2790
2791 sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
2792 sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
2793 sse_unpcklps( func, make_xmm( 0 ), make_xmm( 1 ) );
2794 sse_unpckhps( func, make_xmm( 2 ), make_xmm( 1 ) );
2795 sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) );
2796 sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) );
2797
2798 x86_mov( func, temp, x86_fn_arg( func, arg_stride ) );
2799 x86_push( func, aos_output );
2800 sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
2801 sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
2802 x86_add( func, aos_output, temp );
2803 sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
2804 sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
2805 x86_add( func, aos_output, temp );
2806 sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
2807 sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
2808 x86_add( func, aos_output, temp );
2809 sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
2810 sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
2811 x86_pop( func, aos_output );
2812
2813 /* Advance to next output */
2814 x86_lea( func, aos_output, x86_make_disp(aos_output, 16) );
2815 x86_lea( func, soa_output, x86_make_disp(soa_output, 64) );
2816 }
2817 /* while --num_outputs */
2818 x86_dec( func, num_outputs );
2819 x86_jcc( func, cc_NE, inner_loop );
2820
2821 /* Restore EBX */
2822 x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
2823 }
2824
2825
/**
 * Check if the instruction's dst register is the same as any src
 * register and warn if there's a possible SOA dependency.
 */
2830 static void
2831 check_soa_dependencies(const struct tgsi_full_instruction *inst)
2832 {
2833 switch (inst->Instruction.Opcode) {
2834 case TGSI_OPCODE_ADD:
2835 case TGSI_OPCODE_MOV:
2836 case TGSI_OPCODE_MUL:
2837 case TGSI_OPCODE_XPD:
2838 /* OK - these opcodes correctly handle SOA dependencies */
2839 break;
2840 default:
2841 if (tgsi_check_soa_dependencies(inst)) {
2842 uint opcode = inst->Instruction.Opcode;
2843
2844 /* XXX: we only handle src/dst aliasing in a few opcodes
2845 * currently. Need to use an additional temporay to hold
2846 * the result in the cases where the code is too opaque to
2847 * fix.
2848 */
2849 if (opcode != TGSI_OPCODE_MOV) {
2850 debug_printf("Warning: src/dst aliasing in instruction"
2851 " is not handled:\n");
2852 tgsi_dump_instruction(inst, 1);
2853 }
2854 }
2855 }
2856 }
2857
2858
/**
 * Translate a TGSI vertex/fragment shader to SSE2 code.
 * Slightly different things are done for vertex vs. fragment shaders.
 *
 * \param tokens  the TGSI input shader
 * \param func  the output SSE code/function
 * \param immediates  buffer to place immediates, later passed to SSE func
 * \param do_swizzles  for vertex shaders, convert AoS inputs to SoA on
 *                     entry and SoA outputs back to AoS on exit
 * \return 1 for success, 0 if translation failed
 */
2868 unsigned
2869 tgsi_emit_sse2(
2870 const struct tgsi_token *tokens,
2871 struct x86_function *func,
2872 float (*immediates)[4],
2873 boolean do_swizzles )
2874 {
2875 struct tgsi_parse_context parse;
2876 unsigned ok = 1;
2877 uint num_immediates = 0;
2878
2879 util_init_math();
2880
2881 func->csr = func->store;
2882
2883 tgsi_parse_init( &parse, tokens );
2884
2885 /* Can't just use EDI, EBX without save/restoring them:
2886 */
2887 x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
2888 x86_push( func, x86_make_reg( file_REG32, reg_DI ) );
2889
2890 /*
2891 * Different function args for vertex/fragment shaders:
2892 */
2893 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
2894 if (do_swizzles)
2895 aos_to_soa( func,
2896 4, /* aos_input */
2897 1, /* machine */
2898 5, /* num_inputs */
2899 6 ); /* input_stride */
2900 }
2901
2902 x86_mov(
2903 func,
2904 get_machine_base(),
2905 x86_fn_arg( func, 1 ) );
2906 x86_mov(
2907 func,
2908 get_const_base(),
2909 x86_fn_arg( func, 2 ) );
2910 x86_mov(
2911 func,
2912 get_immediate_base(),
2913 x86_fn_arg( func, 3 ) );
2914
2915 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
2916 x86_mov(
2917 func,
2918 get_coef_base(),
2919 x86_fn_arg( func, 4 ) );
2920 }
2921
2922 x86_mov(
2923 func,
2924 get_sampler_base(),
2925 x86_make_disp( get_machine_base(),
2926 Offset( struct tgsi_exec_machine, Samplers ) ) );
2927
2928 while( !tgsi_parse_end_of_tokens( &parse ) && ok ) {
2929 tgsi_parse_token( &parse );
2930
2931 switch( parse.FullToken.Token.Type ) {
2932 case TGSI_TOKEN_TYPE_DECLARATION:
2933 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
2934 emit_declaration(
2935 func,
2936 &parse.FullToken.FullDeclaration );
2937 }
2938 break;
2939
2940 case TGSI_TOKEN_TYPE_INSTRUCTION:
2941 ok = emit_instruction(
2942 func,
2943 &parse.FullToken.FullInstruction );
2944
2945 if (!ok) {
2946 uint opcode = parse.FullToken.FullInstruction.Instruction.Opcode;
2947 uint proc = parse.FullHeader.Processor.Processor;
2948 debug_printf("failed to translate tgsi opcode %d (%s) to SSE (%s)\n",
2949 opcode,
2950 tgsi_get_opcode_name(opcode),
2951 tgsi_get_processor_name(proc));
2952 }
2953
2954 check_soa_dependencies(&parse.FullToken.FullInstruction);
2955 break;
2956
2957 case TGSI_TOKEN_TYPE_IMMEDIATE:
2958 /* simply copy the immediate values into the next immediates[] slot */
2959 {
2960 const uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
2961 uint i;
2962 assert(size <= 4);
2963 assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES);
2964 for( i = 0; i < size; i++ ) {
2965 immediates[num_immediates][i] =
2966 parse.FullToken.FullImmediate.u[i].Float;
2967 }
2968 #if 0
2969 debug_printf("SSE FS immediate[%d] = %f %f %f %f\n",
2970 num_immediates,
2971 immediates[num_immediates][0],
2972 immediates[num_immediates][1],
2973 immediates[num_immediates][2],
2974 immediates[num_immediates][3]);
2975 #endif
2976 num_immediates++;
2977 }
2978 break;
2979 case TGSI_TOKEN_TYPE_PROPERTY:
2980 /* we just ignore them for now */
2981 break;
2982
2983 default:
2984 ok = 0;
2985 assert( 0 );
2986 }
2987 }
2988
2989 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
2990 if (do_swizzles)
2991 soa_to_aos( func,
2992 7, /* aos_output */
2993 1, /* machine */
2994 8, /* num_outputs */
2995 9 ); /* output_stride */
2996 }
2997
2998 /* Can't just use EBX, EDI without save/restoring them:
2999 */
3000 x86_pop( func, x86_make_reg( file_REG32, reg_DI ) );
3001 x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
3002
3003 emit_ret( func );
3004
3005 tgsi_parse_free( &parse );
3006
3007 return ok;
3008 }
3009
3010 #endif /* PIPE_ARCH_X86 */