tgsi: Implement OPCODE_ROUND for SSE2 backend.
[mesa.git] / src / gallium / auxiliary / tgsi / tgsi_sse2.c
1 /**************************************************************************
2 *
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28 #include "pipe/p_debug.h"
29 #include "pipe/p_shader_tokens.h"
30 #include "util/u_math.h"
31 #include "tgsi/tgsi_parse.h"
32 #include "tgsi/tgsi_util.h"
33 #include "tgsi_exec.h"
34 #include "tgsi_sse2.h"
35
36 #include "rtasm/rtasm_x86sse.h"
37
38 #ifdef PIPE_ARCH_X86
39
40 /* for 1/sqrt()
41 *
42 * This costs about 100fps (close to 10%) in gears:
43 */
44 #define HIGH_PRECISION 1
45
46 #define FAST_MATH 1
47
48
49 #define FOR_EACH_CHANNEL( CHAN )\
50 for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)
51
52 #define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
53 ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))
54
55 #define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
56 if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))
57
58 #define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
59 FOR_EACH_CHANNEL( CHAN )\
60 IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )
61
62 #define CHAN_X 0
63 #define CHAN_Y 1
64 #define CHAN_Z 2
65 #define CHAN_W 3
66
67 #define TEMP_ONE_I TGSI_EXEC_TEMP_ONE_I
68 #define TEMP_ONE_C TGSI_EXEC_TEMP_ONE_C
69
70 #define TEMP_R0 TGSI_EXEC_TEMP_R0
71 #define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
72 #define TEMP_EXEC_MASK_I TGSI_EXEC_MASK_I
73 #define TEMP_EXEC_MASK_C TGSI_EXEC_MASK_C
74
75
76 /**
77 * X86 utility functions.
78 */
79
80 static struct x86_reg
81 make_xmm(
82 unsigned xmm )
83 {
84 return x86_make_reg(
85 file_XMM,
86 (enum x86_reg_name) xmm );
87 }
88
89 /**
90 * X86 register mapping helpers.
91 */
92
93 static struct x86_reg
94 get_const_base( void )
95 {
96 return x86_make_reg(
97 file_REG32,
98 reg_CX );
99 }
100
101 static struct x86_reg
102 get_input_base( void )
103 {
104 return x86_make_reg(
105 file_REG32,
106 reg_AX );
107 }
108
109 static struct x86_reg
110 get_output_base( void )
111 {
112 return x86_make_reg(
113 file_REG32,
114 reg_DX );
115 }
116
117 static struct x86_reg
118 get_temp_base( void )
119 {
120 return x86_make_reg(
121 file_REG32,
122 reg_BX );
123 }
124
125 static struct x86_reg
126 get_coef_base( void )
127 {
128 return get_output_base();
129 }
130
131 static struct x86_reg
132 get_immediate_base( void )
133 {
134 return x86_make_reg(
135 file_REG32,
136 reg_DI );
137 }
138
139
140 /**
141 * Data access helpers.
142 */
143
144
145 static struct x86_reg
146 get_immediate(
147 unsigned vec,
148 unsigned chan )
149 {
150 return x86_make_disp(
151 get_immediate_base(),
152 (vec * 4 + chan) * 4 );
153 }
154
155 static struct x86_reg
156 get_const(
157 unsigned vec,
158 unsigned chan )
159 {
160 return x86_make_disp(
161 get_const_base(),
162 (vec * 4 + chan) * 4 );
163 }
164
165 static struct x86_reg
166 get_input(
167 unsigned vec,
168 unsigned chan )
169 {
170 return x86_make_disp(
171 get_input_base(),
172 (vec * 4 + chan) * 16 );
173 }
174
175 static struct x86_reg
176 get_output(
177 unsigned vec,
178 unsigned chan )
179 {
180 return x86_make_disp(
181 get_output_base(),
182 (vec * 4 + chan) * 16 );
183 }
184
185 static struct x86_reg
186 get_temp(
187 unsigned vec,
188 unsigned chan )
189 {
190 return x86_make_disp(
191 get_temp_base(),
192 (vec * 4 + chan) * 16 );
193 }
194
195 static struct x86_reg
196 get_coef(
197 unsigned vec,
198 unsigned chan,
199 unsigned member )
200 {
201 return x86_make_disp(
202 get_coef_base(),
203 ((vec * 3 + member) * 4 + chan) * 4 );
204 }
205
206
/* Emit a function return instruction. */
static void
emit_ret( struct x86_function *func )
{
   x86_ret( func );
}
213
214
215 /**
216 * Data fetch helpers.
217 */
218
219 /**
220 * Copy a shader constant to xmm register
221 * \param xmm the destination xmm register
222 * \param vec the src const buffer index
223 * \param chan src channel to fetch (X, Y, Z or W)
224 */
static void
emit_const(
   struct x86_function *func,
   uint xmm,
   int vec,
   uint chan,
   uint indirect,
   uint indirectFile,
   int indirectIndex )
{
   if (indirect) {
      /* 'vec' is the offset from the address register's value.
       * We're loading CONST[ADDR+vec] into an xmm register.
       */
      struct x86_reg r0 = get_input_base();
      struct x86_reg r1 = get_output_base();
      uint i;

      /* Only ADDR[0] relative addressing is supported here. */
      assert( indirectFile == TGSI_FILE_ADDRESS );
      assert( indirectIndex == 0 );

      /* r0/r1 normally hold the input/output base pointers; save them
       * so they can be used as scratch registers for the gather below.
       */
      x86_push( func, r0 );
      x86_push( func, r1 );

      /*
       * Loop over the four pixels or vertices in the quad.
       * Get the value of the address (offset) register for pixel/vertex[i],
       * add it to the src offset and index into the constant buffer.
       * Note that we're working on SOA data.
       * If any of the pixel/vertex execution channels are unused their
       * values will be garbage. It's very important that we don't use
       * those garbage values as indexes into the constant buffer since
       * that'll cause segfaults.
       * The solution is to bitwise-AND the offset with the execution mask
       * register whose values are either 0 or ~0.
       * The caller must setup the execution mask register to indicate
       * which channels are valid/alive before running the shader.
       * The execution mask will also figure into loops and conditionals
       * someday.
       */
      for (i = 0; i < QUAD_SIZE; i++) {
         /* r1 = address register[i] */
         x86_mov( func, r1, x86_make_disp( get_temp( TEMP_ADDR, CHAN_X ), i * 4 ) );
         /* r0 = execution mask[i] */
         x86_mov( func, r0, x86_make_disp( get_temp( TEMP_EXEC_MASK_I, TEMP_EXEC_MASK_C ), i * 4 ) );
         /* r1 = r1 & r0 */
         x86_and( func, r1, r0 );
         /* r0 = 'vec', the offset */
         x86_lea( func, r0, get_const( vec, chan ) );

         /* Quick hack to multiply r1 by 16 -- need to add SHL to rtasm.
          */
         x86_add( func, r1, r1 );
         x86_add( func, r1, r1 );
         x86_add( func, r1, r1 );
         x86_add( func, r1, r1 );

         x86_add( func, r0, r1 );  /* r0 = r0 + r1 */
         /* Load the fetched constant and stage it in TEMP_R0[i]. */
         x86_mov( func, r1, x86_deref( r0 ) );
         x86_mov( func, x86_make_disp( get_temp( TEMP_R0, CHAN_X ), i * 4 ), r1 );
      }

      /* Restore the clobbered base pointers (reverse order). */
      x86_pop( func, r1 );
      x86_pop( func, r0 );

      /* Move the four gathered constants from TEMP_R0 into the xmm reg. */
      sse_movaps(
         func,
         make_xmm( xmm ),
         get_temp( TEMP_R0, CHAN_X ) );
   }
   else {
      /* 'vec' is the index into the src register file, such as TEMP[vec] */
      assert( vec >= 0 );

      /* Load the scalar constant and broadcast it to all four lanes. */
      sse_movss(
         func,
         make_xmm( xmm ),
         get_const( vec, chan ) );
      sse_shufps(
         func,
         make_xmm( xmm ),
         make_xmm( xmm ),
         SHUF( 0, 0, 0, 0 ) );
   }
}
310
/* Load IMM[vec].chan into an xmm register, broadcast to all four lanes. */
static void
emit_immediate( struct x86_function *func, unsigned xmm, unsigned vec, unsigned chan )
{
   sse_movss( func, make_xmm( xmm ), get_immediate( vec, chan ) );
   sse_shufps( func, make_xmm( xmm ), make_xmm( xmm ), SHUF( 0, 0, 0, 0 ) );
}
328
329
330 /**
331 * Copy a shader input to xmm register
332 * \param xmm the destination xmm register
333 * \param vec the src input attrib
334 * \param chan src channel to fetch (X, Y, Z or W)
335 */
static void
emit_inputf( struct x86_function *func, unsigned xmm, unsigned vec, unsigned chan )
{
   /* movups: unaligned load of the four SoA input values */
   sse_movups( func, make_xmm( xmm ), get_input( vec, chan ) );
}
348
349 /**
350 * Store an xmm register to a shader output
351 * \param xmm the source xmm register
352 * \param vec the dest output attrib
353 * \param chan src dest channel to store (X, Y, Z or W)
354 */
static void
emit_output( struct x86_function *func, unsigned xmm, unsigned vec, unsigned chan )
{
   /* movups: unaligned store of the four SoA output values */
   sse_movups( func, get_output( vec, chan ), make_xmm( xmm ) );
}
367
368 /**
369 * Copy a shader temporary to xmm register
370 * \param xmm the destination xmm register
371 * \param vec the src temp register
372 * \param chan src channel to fetch (X, Y, Z or W)
373 */
static void
emit_tempf( struct x86_function *func, unsigned xmm, unsigned vec, unsigned chan )
{
   /* movaps: temporaries are 16-byte aligned */
   sse_movaps( func, make_xmm( xmm ), get_temp( vec, chan ) );
}
386
387 /**
388 * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
389 * \param xmm the destination xmm register
390 * \param vec the src input/attribute coefficient index
391 * \param chan src channel to fetch (X, Y, Z or W)
392 * \param member 0=a0, 1=dadx, 2=dady
393 */
static void
emit_coef( struct x86_function *func, unsigned xmm, unsigned vec, unsigned chan, unsigned member )
{
   /* Load the scalar coefficient, then broadcast it to all four lanes. */
   sse_movss( func, make_xmm( xmm ), get_coef( vec, chan, member ) );
   sse_shufps( func, make_xmm( xmm ), make_xmm( xmm ), SHUF( 0, 0, 0, 0 ) );
}
412
413 /**
414 * Data store helpers.
415 */
416
/* Store an xmm register back to shader input IN[vec].chan (unaligned). */
static void
emit_inputs( struct x86_function *func, unsigned xmm, unsigned vec, unsigned chan )
{
   sse_movups( func, get_input( vec, chan ), make_xmm( xmm ) );
}
429
/* Store an xmm register to shader temporary TEMP[vec].chan (aligned). */
static void
emit_temps( struct x86_function *func, unsigned xmm, unsigned vec, unsigned chan )
{
   sse_movaps( func, get_temp( vec, chan ), make_xmm( xmm ) );
}
442
443 static void
444 emit_addrs(
445 struct x86_function *func,
446 unsigned xmm,
447 unsigned vec,
448 unsigned chan )
449 {
450 assert( vec == 0 );
451
452 emit_temps(
453 func,
454 xmm,
455 vec + TGSI_EXEC_TEMP_ADDR,
456 chan );
457 }
458
459 /**
460 * Coefficent fetch helpers.
461 */
462
/* Load the a0 (constant term) coefficient for attrib `vec`, channel `chan`. */
static void
emit_coef_a0( struct x86_function *func, unsigned xmm, unsigned vec, unsigned chan )
{
   emit_coef( func, xmm, vec, chan, 0 );
}
477
/* Load the dadx (x-derivative) coefficient for attrib `vec`, channel `chan`. */
static void
emit_coef_dadx( struct x86_function *func, unsigned xmm, unsigned vec, unsigned chan )
{
   emit_coef( func, xmm, vec, chan, 1 );
}
492
/* Load the dady (y-derivative) coefficient for attrib `vec`, channel `chan`. */
static void
emit_coef_dady( struct x86_function *func, unsigned xmm, unsigned vec, unsigned chan )
{
   emit_coef( func, xmm, vec, chan, 2 );
}
507
508 /**
509 * Function call helpers.
510 */
511
512 static void
513 emit_push_gp(
514 struct x86_function *func )
515 {
516 x86_push(
517 func,
518 x86_make_reg( file_REG32, reg_AX) );
519 x86_push(
520 func,
521 x86_make_reg( file_REG32, reg_CX) );
522 x86_push(
523 func,
524 x86_make_reg( file_REG32, reg_DX) );
525 }
526
527 static void
528 x86_pop_gp(
529 struct x86_function *func )
530 {
531 /* Restore GP registers in a reverse order.
532 */
533 x86_pop(
534 func,
535 x86_make_reg( file_REG32, reg_DX) );
536 x86_pop(
537 func,
538 x86_make_reg( file_REG32, reg_CX) );
539 x86_pop(
540 func,
541 x86_make_reg( file_REG32, reg_AX) );
542 }
543
/**
 * Emit a call to a C helper that operates in place on a 4-float SoA
 * vector: xmm_dst is spilled to TEMP_R0, `code` is called (cdecl) with
 * a pointer to that storage, and the result is loaded back into xmm_dst.
 * Note: the helper may also read extra operands staged in TEMP_R0 by
 * the caller (see emit_func_call_dst_src()).
 */
static void
emit_func_call_dst(
   struct x86_function *func,
   unsigned xmm_dst,
   void (PIPE_CDECL *code)() )
{
   /* Spill the destination register so the C function can access it. */
   sse_movaps(
      func,
      get_temp( TEMP_R0, 0 ),
      make_xmm( xmm_dst ) );

   /* eax/ecx/edx hold live base pointers -- preserve across the call. */
   emit_push_gp(
      func );

   {
      struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );

      /* ecx = &TEMP_R0, pushed as the single cdecl argument. */
      x86_lea(
         func,
         ecx,
         get_temp( TEMP_R0, 0 ) );

      x86_push( func, ecx );
      /* Call through a register: the helper's absolute address is baked
       * into the generated code as an immediate.
       */
      x86_mov_reg_imm( func, ecx, (unsigned long) code );
      x86_call( func, ecx );
      /* Pop the argument (cdecl: caller cleans up the stack). */
      x86_pop(func, ecx );
   }


   x86_pop_gp(
      func );

   /* Reload the (possibly modified) result into the destination reg. */
   sse_movaps(
      func,
      make_xmm( xmm_dst ),
      get_temp( TEMP_R0, 0 ) );
}
581
582 static void
583 emit_func_call_dst_src(
584 struct x86_function *func,
585 unsigned xmm_dst,
586 unsigned xmm_src,
587 void (PIPE_CDECL *code)() )
588 {
589 sse_movaps(
590 func,
591 get_temp( TEMP_R0, 1 ),
592 make_xmm( xmm_src ) );
593
594 emit_func_call_dst(
595 func,
596 xmm_dst,
597 code );
598 }
599
600 /**
601 * Low-level instruction translators.
602 */
603
604 static void
605 emit_abs(
606 struct x86_function *func,
607 unsigned xmm )
608 {
609 sse_andps(
610 func,
611 make_xmm( xmm ),
612 get_temp(
613 TGSI_EXEC_TEMP_7FFFFFFF_I,
614 TGSI_EXEC_TEMP_7FFFFFFF_C ) );
615 }
616
/* xmm_dst += xmm_src, componentwise. */
static void
emit_add( struct x86_function *func, unsigned xmm_dst, unsigned xmm_src )
{
   sse_addps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
628
629 static void PIPE_CDECL
630 cos4f(
631 float *store )
632 {
633 store[0] = cosf( store[0] );
634 store[1] = cosf( store[1] );
635 store[2] = cosf( store[2] );
636 store[3] = cosf( store[3] );
637 }
638
639 static void
640 emit_cos(
641 struct x86_function *func,
642 unsigned xmm_dst )
643 {
644 emit_func_call_dst(
645 func,
646 xmm_dst,
647 cos4f );
648 }
649
/* Compute 2^x for each of the four floats at store[0..3].
 * With FAST_MATH, uses the approximate util_fast_exp2() from u_math;
 * otherwise falls back to the precise powf().
 */
static void PIPE_CDECL
ex24f(
   float *store )
{
#if FAST_MATH
   store[0] = util_fast_exp2( store[0] );
   store[1] = util_fast_exp2( store[1] );
   store[2] = util_fast_exp2( store[2] );
   store[3] = util_fast_exp2( store[3] );
#else
   store[0] = powf( 2.0f, store[0] );
   store[1] = powf( 2.0f, store[1] );
   store[2] = powf( 2.0f, store[2] );
   store[3] = powf( 2.0f, store[3] );
#endif
}
666
667 static void
668 emit_ex2(
669 struct x86_function *func,
670 unsigned xmm_dst )
671 {
672 emit_func_call_dst(
673 func,
674 xmm_dst,
675 ex24f );
676 }
677
/* Convert four floats to ints in place (cvttps2dq truncates toward zero). */
static void
emit_f2it( struct x86_function *func, unsigned xmm )
{
   sse2_cvttps2dq( func, make_xmm( xmm ), make_xmm( xmm ) );
}
688
/* Convert four ints to floats in place (cvtdq2ps). */
static void
emit_i2f( struct x86_function *func, unsigned xmm )
{
   sse2_cvtdq2ps( func, make_xmm( xmm ), make_xmm( xmm ) );
}
699
700 static void PIPE_CDECL
701 flr4f(
702 float *store )
703 {
704 store[0] = floorf( store[0] );
705 store[1] = floorf( store[1] );
706 store[2] = floorf( store[2] );
707 store[3] = floorf( store[3] );
708 }
709
710 static void
711 emit_flr(
712 struct x86_function *func,
713 unsigned xmm_dst )
714 {
715 emit_func_call_dst(
716 func,
717 xmm_dst,
718 flr4f );
719 }
720
721 static void PIPE_CDECL
722 frc4f(
723 float *store )
724 {
725 store[0] -= floorf( store[0] );
726 store[1] -= floorf( store[1] );
727 store[2] -= floorf( store[2] );
728 store[3] -= floorf( store[3] );
729 }
730
731 static void
732 emit_frc(
733 struct x86_function *func,
734 unsigned xmm_dst )
735 {
736 emit_func_call_dst(
737 func,
738 xmm_dst,
739 frc4f );
740 }
741
742 static void PIPE_CDECL
743 lg24f(
744 float *store )
745 {
746 store[0] = util_fast_log2( store[0] );
747 store[1] = util_fast_log2( store[1] );
748 store[2] = util_fast_log2( store[2] );
749 store[3] = util_fast_log2( store[3] );
750 }
751
752 static void
753 emit_lg2(
754 struct x86_function *func,
755 unsigned xmm_dst )
756 {
757 emit_func_call_dst(
758 func,
759 xmm_dst,
760 lg24f );
761 }
762
/* Register-to-register copy: xmm_dst = xmm_src. */
static void
emit_MOV( struct x86_function *func, unsigned xmm_dst, unsigned xmm_src )
{
   sse_movups( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
774
/* xmm_dst *= xmm_src, componentwise. */
static void
emit_mul (struct x86_function *func,
          unsigned xmm_dst,
          unsigned xmm_src)
{
   sse_mulps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
785
786 static void
787 emit_neg(
788 struct x86_function *func,
789 unsigned xmm )
790 {
791 sse_xorps(
792 func,
793 make_xmm( xmm ),
794 get_temp(
795 TGSI_EXEC_TEMP_80000000_I,
796 TGSI_EXEC_TEMP_80000000_C ) );
797 }
798
/* Compute store[i] = store[i] ^ store[i+4] for i in 0..3 -- base in the
 * first quad, exponent staged in the second (see emit_func_call_dst_src).
 * With FAST_MATH, uses the approximate util_fast_pow() from u_math.
 */
static void PIPE_CDECL
pow4f(
   float *store )
{
#if FAST_MATH
   store[0] = util_fast_pow( store[0], store[4] );
   store[1] = util_fast_pow( store[1], store[5] );
   store[2] = util_fast_pow( store[2], store[6] );
   store[3] = util_fast_pow( store[3], store[7] );
#else
   store[0] = powf( store[0], store[4] );
   store[1] = powf( store[1], store[5] );
   store[2] = powf( store[2], store[6] );
   store[3] = powf( store[3], store[7] );
#endif
}
815
816 static void
817 emit_pow(
818 struct x86_function *func,
819 unsigned xmm_dst,
820 unsigned xmm_src )
821 {
822 emit_func_call_dst_src(
823 func,
824 xmm_dst,
825 xmm_src,
826 pow4f );
827 }
828
/* xmm_dst = approx 1/xmm_src, componentwise (RCPPS).
 *
 * On Intel CPUs at least, this is only accurate to 12 bits -- not
 * good enough.  Need to either emit a proper divide or use the
 * iterative technique described below in emit_rsqrt().
 */
static void
emit_rcp (
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   sse2_rcpps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
844
845 static void PIPE_CDECL
846 rnd4f(
847 float *store )
848 {
849 store[0] = floorf( store[0] + 0.5f );
850 store[1] = floorf( store[1] + 0.5f );
851 store[2] = floorf( store[2] + 0.5f );
852 store[3] = floorf( store[3] + 0.5f );
853 }
854
855 static void
856 emit_rnd(
857 struct x86_function *func,
858 unsigned xmm_save,
859 unsigned xmm_dst )
860 {
861 emit_func_call_dst(
862 func,
863 xmm_save,
864 xmm_dst,
865 rnd4f );
866 }
867
/**
 * xmm_dst = approx 1/sqrt(xmm_src), componentwise.
 * NOTE: clobbers xmm_src and (in the HIGH_PRECISION path) xmm2/xmm3.
 */
static void
emit_rsqrt(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
#if HIGH_PRECISION
   /* Although rsqrtps() and rcpps() are low precision on some/all SSE
    * implementations, it is possible to improve its precision at
    * fairly low cost, using a newton/raphson step, as below:
    *
    * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
    * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
    *
    * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
    */
   {
      struct x86_reg dst = make_xmm( xmm_dst );
      struct x86_reg src = make_xmm( xmm_src );
      struct x86_reg tmp0 = make_xmm( 2 );
      struct x86_reg tmp1 = make_xmm( 3 );

      /* The fixed scratch registers 2 and 3 must not alias the operands. */
      assert( xmm_dst != xmm_src );
      assert( xmm_dst != 2 && xmm_dst != 3 );
      assert( xmm_src != 2 && xmm_src != 3 );

      sse_movaps( func, dst, get_temp( TGSI_EXEC_TEMP_HALF_I, TGSI_EXEC_TEMP_HALF_C ) );   /* dst = 0.5 */
      sse_movaps( func, tmp0, get_temp( TGSI_EXEC_TEMP_THREE_I, TGSI_EXEC_TEMP_THREE_C ) );   /* tmp0 = 3.0 */
      sse_rsqrtps( func, tmp1, src );        /* tmp1 = rsqrtps(a) */
      sse_mulps( func, src, tmp1 );          /* src = a * rsqrtps(a) */
      sse_mulps( func, dst, tmp1 );          /* dst = 0.5 * rsqrtps(a) */
      sse_mulps( func, src, tmp1 );          /* src = a * rsqrtps(a)^2 */
      sse_subps( func, tmp0, src );          /* tmp0 = 3.0 - a * rsqrtps(a)^2 */
      sse_mulps( func, dst, tmp0 );          /* dst = 0.5 * rsqrtps(a) * tmp0 */
   }
#else
   /* On Intel CPUs at least, this is only accurate to 12 bits -- not
    * good enough.
    */
   sse_rsqrtps(
      func,
      make_xmm( xmm_dst ),
      make_xmm( xmm_src ) );
#endif
}
913
914 static void
915 emit_setsign(
916 struct x86_function *func,
917 unsigned xmm )
918 {
919 sse_orps(
920 func,
921 make_xmm( xmm ),
922 get_temp(
923 TGSI_EXEC_TEMP_80000000_I,
924 TGSI_EXEC_TEMP_80000000_C ) );
925 }
926
927 static void PIPE_CDECL
928 sgn4f(
929 float *store )
930 {
931 store[0] = store[0] < 0.0f ? -1.0f : store[0] > 0.0f ? 1.0f : 0.0f;
932 store[1] = store[1] < 0.0f ? -1.0f : store[1] > 0.0f ? 1.0f : 0.0f;
933 store[2] = store[2] < 0.0f ? -1.0f : store[2] > 0.0f ? 1.0f : 0.0f;
934 store[3] = store[3] < 0.0f ? -1.0f : store[3] > 0.0f ? 1.0f : 0.0f;
935 }
936
937 static void
938 emit_sgn(
939 struct x86_function *func,
940 unsigned xmm_save,
941 unsigned xmm_dst )
942 {
943 emit_func_call_dst(
944 func,
945 xmm_save,
946 xmm_dst,
947 sgn4f );
948 }
949
950 static void PIPE_CDECL
951 sin4f(
952 float *store )
953 {
954 store[0] = sinf( store[0] );
955 store[1] = sinf( store[1] );
956 store[2] = sinf( store[2] );
957 store[3] = sinf( store[3] );
958 }
959
960 static void
961 emit_sin (struct x86_function *func,
962 unsigned xmm_dst)
963 {
964 emit_func_call_dst(
965 func,
966 xmm_dst,
967 sin4f );
968 }
969
/* xmm_dst -= xmm_src, componentwise. */
static void
emit_sub( struct x86_function *func, unsigned xmm_dst, unsigned xmm_src )
{
   sse_subps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
981
982 /**
983 * Register fetch.
984 */
985
/**
 * Load one channel of a TGSI source operand into an xmm register,
 * resolving the operand's register file, (extended) swizzle and sign
 * mode.  Scalar files (constants, immediates) are broadcast to all
 * four lanes; SoA files (inputs, temporaries) are loaded as quads.
 */
static void
emit_fetch(
   struct x86_function *func,
   unsigned xmm,
   const struct tgsi_full_src_register *reg,
   const unsigned chan_index )
{
   unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index );

   switch (swizzle) {
   case TGSI_EXTSWIZZLE_X:
   case TGSI_EXTSWIZZLE_Y:
   case TGSI_EXTSWIZZLE_Z:
   case TGSI_EXTSWIZZLE_W:
      /* Ordinary component select: dispatch on the source file. */
      switch (reg->SrcRegister.File) {
      case TGSI_FILE_CONSTANT:
         /* Constants may be indirectly addressed via ADDR[0]. */
         emit_const(
            func,
            xmm,
            reg->SrcRegister.Index,
            swizzle,
            reg->SrcRegister.Indirect,
            reg->SrcRegisterInd.File,
            reg->SrcRegisterInd.Index );
         break;

      case TGSI_FILE_IMMEDIATE:
         emit_immediate(
            func,
            xmm,
            reg->SrcRegister.Index,
            swizzle );
         break;

      case TGSI_FILE_INPUT:
         emit_inputf(
            func,
            xmm,
            reg->SrcRegister.Index,
            swizzle );
         break;

      case TGSI_FILE_TEMPORARY:
         emit_tempf(
            func,
            xmm,
            reg->SrcRegister.Index,
            swizzle );
         break;

      default:
         /* unsupported source file */
         assert( 0 );
      }
      break;

   case TGSI_EXTSWIZZLE_ZERO:
      /* Extended swizzle constant 0.0, served from a prepared temp. */
      emit_tempf(
         func,
         xmm,
         TGSI_EXEC_TEMP_00000000_I,
         TGSI_EXEC_TEMP_00000000_C );
      break;

   case TGSI_EXTSWIZZLE_ONE:
      /* Extended swizzle constant 1.0, served from a prepared temp. */
      emit_tempf(
         func,
         xmm,
         TEMP_ONE_I,
         TEMP_ONE_C );
      break;

   default:
      assert( 0 );
   }

   /* Apply the operand's sign mode to the loaded value. */
   switch( tgsi_util_get_full_src_register_sign_mode( reg, chan_index ) ) {
   case TGSI_UTIL_SIGN_CLEAR:
      emit_abs( func, xmm );
      break;

   case TGSI_UTIL_SIGN_SET:
      emit_setsign( func, xmm );
      break;

   case TGSI_UTIL_SIGN_TOGGLE:
      emit_neg( func, xmm );
      break;

   case TGSI_UTIL_SIGN_KEEP:
      break;
   }
}
1078
1079 #define FETCH( FUNC, INST, XMM, INDEX, CHAN )\
1080 emit_fetch( FUNC, XMM, &(INST).FullSrcRegisters[INDEX], CHAN )
1081
1082 /**
1083 * Register store.
1084 */
1085
/**
 * Store one channel of an xmm register to a TGSI destination operand,
 * dispatching on the destination register file.
 * NOTE(review): saturation is not implemented -- TGSI_SAT_ZERO_ONE is
 * silently accepted (the assert is commented out) and the value is
 * stored unclamped; TGSI_SAT_MINUS_PLUS_ONE asserts.
 */
static void
emit_store(
   struct x86_function *func,
   unsigned xmm,
   const struct tgsi_full_dst_register *reg,
   const struct tgsi_full_instruction *inst,
   unsigned chan_index )
{
   switch( reg->DstRegister.File ) {
   case TGSI_FILE_OUTPUT:
      emit_output(
         func,
         xmm,
         reg->DstRegister.Index,
         chan_index );
      break;

   case TGSI_FILE_TEMPORARY:
      emit_temps(
         func,
         xmm,
         reg->DstRegister.Index,
         chan_index );
      break;

   case TGSI_FILE_ADDRESS:
      emit_addrs(
         func,
         xmm,
         reg->DstRegister.Index,
         chan_index );
      break;

   default:
      /* unsupported destination file */
      assert( 0 );
   }

   switch( inst->Instruction.Saturate ) {
   case TGSI_SAT_NONE:
      break;

   case TGSI_SAT_ZERO_ONE:
      /* assert( 0 ); */
      break;

   case TGSI_SAT_MINUS_PLUS_ONE:
      assert( 0 );
      break;
   }
}
1136
1137 #define STORE( FUNC, INST, XMM, INDEX, CHAN )\
1138 emit_store( FUNC, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN )
1139
1140 /**
1141 * High-level instruction translators.
1142 */
1143
/**
 * Emit code for the KIL instruction: compare each (unique) swizzled
 * source component against zero, OR the per-pixel "less than zero"
 * masks together, and accumulate the result into the kill mask temp.
 * Clobbers xmm registers 0..3 (one per unique component) and uses
 * eax/edx as scratch (saved/restored around the comparison loop).
 */
static void
emit_kil(
   struct x86_function *func,
   const struct tgsi_full_src_register *reg )
{
   unsigned uniquemask;
   unsigned registers[4];
   unsigned nextregister = 0;
   unsigned firstchan = ~0;
   unsigned chan_index;

   /* This mask stores component bits that were already tested. Note that
    * we test if the value is less than zero, so 1.0 and 0.0 need not to be
    * tested. */
   uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE);

   FOR_EACH_CHANNEL( chan_index ) {
      unsigned swizzle;

      /* unswizzle channel */
      swizzle = tgsi_util_get_full_src_register_extswizzle(
         reg,
         chan_index );

      /* check if the component has not been already tested */
      if( !(uniquemask & (1 << swizzle)) ) {
         uniquemask |= 1 << swizzle;

         /* allocate register */
         registers[chan_index] = nextregister;
         emit_fetch(
            func,
            nextregister,
            reg,
            chan_index );
         nextregister++;

         /* mark the first channel used */
         if( firstchan == ~0 ) {
            firstchan = chan_index;
         }
      }
   }

   x86_push(
      func,
      x86_make_reg( file_REG32, reg_AX ) );
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_DX ) );

   /* NOTE(review): this loop re-tests uniquemask with chan_index while
    * the bits were set above by *swizzle* value, and registers[] was
    * filled per chan_index -- the two only line up when the swizzle is
    * the identity.  Looks suspect for swizzled KIL operands; verify
    * against tgsi_exec behavior before relying on it.
    */
   FOR_EACH_CHANNEL( chan_index ) {
      if( uniquemask & (1 << chan_index) ) {
         /* xmm = (xmm < 0) ? ~0 : 0, per pixel */
         sse_cmpps(
            func,
            make_xmm( registers[chan_index] ),
            get_temp(
               TGSI_EXEC_TEMP_00000000_I,
               TGSI_EXEC_TEMP_00000000_C ),
            cc_LessThan );

         if( chan_index == firstchan ) {
            /* eax = byte mask of the comparison result */
            sse_pmovmskb(
               func,
               x86_make_reg( file_REG32, reg_AX ),
               make_xmm( registers[chan_index] ) );
         }
         else {
            /* OR this component's mask into the accumulator in eax */
            sse_pmovmskb(
               func,
               x86_make_reg( file_REG32, reg_DX ),
               make_xmm( registers[chan_index] ) );
            x86_or(
               func,
               x86_make_reg( file_REG32, reg_AX ),
               x86_make_reg( file_REG32, reg_DX ) );
         }
      }
   }

   /* Merge the accumulated mask into the persistent kill mask temp. */
   x86_or(
      func,
      get_temp(
         TGSI_EXEC_TEMP_KILMASK_I,
         TGSI_EXEC_TEMP_KILMASK_C ),
      x86_make_reg( file_REG32, reg_AX ) );

   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_DX ) );
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_AX ) );
}
1238
1239
/**
 * Unconditional (predicated) kill -- unimplemented stub; emits no code.
 */
static void
emit_kilp(
   struct x86_function *func )
{
   /* XXX todo / fix me */
}
1246
1247
/**
 * Emit code for a comparison instruction (SLT, SGE, SEQ, ...):
 * for each enabled dst channel, dst = (src0 cc src1) ? 1.0 : 0.0.
 * cmpps yields an all-ones/all-zeros mask per lane; ANDing with 1.0
 * converts that mask to the float result.  Clobbers xmm0/xmm1.
 */
static void
emit_setcc(
   struct x86_function *func,
   struct tgsi_full_instruction *inst,
   enum sse_cc cc )
{
   unsigned chan_index;

   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
      FETCH( func, *inst, 0, 0, chan_index );
      FETCH( func, *inst, 1, 1, chan_index );
      sse_cmpps(
         func,
         make_xmm( 0 ),
         make_xmm( 1 ),
         cc );
      sse_andps(
         func,
         make_xmm( 0 ),
         get_temp(
            TEMP_ONE_I,
            TEMP_ONE_C ) );
      STORE( func, *inst, 0, 0, chan_index );
   }
}
1273
/**
 * Emit code for the CMP instruction:
 * for each enabled dst channel, dst = (src0 < 0.0) ? src1 : src2.
 * Implemented with a cmpps mask and AND/ANDN/OR blending.
 * Clobbers xmm0/xmm1/xmm2.
 */
static void
emit_cmp(
   struct x86_function *func,
   struct tgsi_full_instruction *inst )
{
   unsigned chan_index;

   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
      FETCH( func, *inst, 0, 0, chan_index );
      FETCH( func, *inst, 1, 1, chan_index );
      FETCH( func, *inst, 2, 2, chan_index );
      /* xmm0 = (src0 < 0) ? ~0 : 0, per lane */
      sse_cmpps(
         func,
         make_xmm( 0 ),
         get_temp(
            TGSI_EXEC_TEMP_00000000_I,
            TGSI_EXEC_TEMP_00000000_C ),
         cc_LessThan );
      /* xmm1 = src1 where mask set */
      sse_andps(
         func,
         make_xmm( 1 ),
         make_xmm( 0 ) );
      /* xmm0 = src2 where mask clear */
      sse_andnps(
         func,
         make_xmm( 0 ),
         make_xmm( 2 ) );
      /* xmm0 = blended result */
      sse_orps(
         func,
         make_xmm( 0 ),
         make_xmm( 1 ) );
      STORE( func, *inst, 0, 0, chan_index );
   }
}
1307
1308 static int
1309 emit_instruction(
1310 struct x86_function *func,
1311 struct tgsi_full_instruction *inst )
1312 {
1313 unsigned chan_index;
1314
1315 switch (inst->Instruction.Opcode) {
1316 case TGSI_OPCODE_ARL:
1317 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1318 FETCH( func, *inst, 0, 0, chan_index );
1319 emit_f2it( func, 0 );
1320 STORE( func, *inst, 0, 0, chan_index );
1321 }
1322 break;
1323
1324 case TGSI_OPCODE_MOV:
1325 case TGSI_OPCODE_SWZ:
1326 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1327 FETCH( func, *inst, 0, 0, chan_index );
1328 STORE( func, *inst, 0, 0, chan_index );
1329 }
1330 break;
1331
1332 case TGSI_OPCODE_LIT:
1333 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1334 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1335 emit_tempf(
1336 func,
1337 0,
1338 TEMP_ONE_I,
1339 TEMP_ONE_C);
1340 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
1341 STORE( func, *inst, 0, 0, CHAN_X );
1342 }
1343 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1344 STORE( func, *inst, 0, 0, CHAN_W );
1345 }
1346 }
1347 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1348 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1349 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
1350 FETCH( func, *inst, 0, 0, CHAN_X );
1351 sse_maxps(
1352 func,
1353 make_xmm( 0 ),
1354 get_temp(
1355 TGSI_EXEC_TEMP_00000000_I,
1356 TGSI_EXEC_TEMP_00000000_C ) );
1357 STORE( func, *inst, 0, 0, CHAN_Y );
1358 }
1359 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1360 /* XMM[1] = SrcReg[0].yyyy */
1361 FETCH( func, *inst, 1, 0, CHAN_Y );
1362 /* XMM[1] = max(XMM[1], 0) */
1363 sse_maxps(
1364 func,
1365 make_xmm( 1 ),
1366 get_temp(
1367 TGSI_EXEC_TEMP_00000000_I,
1368 TGSI_EXEC_TEMP_00000000_C ) );
1369 /* XMM[2] = SrcReg[0].wwww */
1370 FETCH( func, *inst, 2, 0, CHAN_W );
1371 /* XMM[2] = min(XMM[2], 128.0) */
1372 sse_minps(
1373 func,
1374 make_xmm( 2 ),
1375 get_temp(
1376 TGSI_EXEC_TEMP_128_I,
1377 TGSI_EXEC_TEMP_128_C ) );
1378 /* XMM[2] = max(XMM[2], -128.0) */
1379 sse_maxps(
1380 func,
1381 make_xmm( 2 ),
1382 get_temp(
1383 TGSI_EXEC_TEMP_MINUS_128_I,
1384 TGSI_EXEC_TEMP_MINUS_128_C ) );
1385 emit_pow( func, 1, 2 );
1386 FETCH( func, *inst, 0, 0, CHAN_X );
1387 sse_xorps(
1388 func,
1389 make_xmm( 2 ),
1390 make_xmm( 2 ) );
1391 sse_cmpps(
1392 func,
1393 make_xmm( 2 ),
1394 make_xmm( 0 ),
1395 cc_LessThanEqual );
1396 sse_andps(
1397 func,
1398 make_xmm( 2 ),
1399 make_xmm( 1 ) );
1400 STORE( func, *inst, 2, 0, CHAN_Z );
1401 }
1402 }
1403 break;
1404
1405 case TGSI_OPCODE_RCP:
1406 /* TGSI_OPCODE_RECIP */
1407 FETCH( func, *inst, 0, 0, CHAN_X );
1408 emit_rcp( func, 0, 0 );
1409 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1410 STORE( func, *inst, 0, 0, chan_index );
1411 }
1412 break;
1413
1414 case TGSI_OPCODE_RSQ:
1415 /* TGSI_OPCODE_RECIPSQRT */
1416 FETCH( func, *inst, 0, 0, CHAN_X );
1417 emit_rsqrt( func, 1, 0 );
1418 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1419 STORE( func, *inst, 1, 0, chan_index );
1420 }
1421 break;
1422
1423 case TGSI_OPCODE_EXP:
1424 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1425 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1426 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1427 FETCH( func, *inst, 0, 0, CHAN_X );
1428 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1429 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1430 emit_MOV( func, 1, 0 );
1431 emit_flr( func, 1 );
1432 /* dst.x = ex2(floor(src.x)) */
1433 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
1434 emit_MOV( func, 2, 1 );
1435 emit_ex2( func, 2 );
1436 STORE( func, *inst, 2, 0, CHAN_X );
1437 }
1438 /* dst.y = src.x - floor(src.x) */
1439 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1440 emit_MOV( func, 2, 0 );
1441 emit_sub( func, 2, 1 );
1442 STORE( func, *inst, 2, 0, CHAN_Y );
1443 }
1444 }
1445 /* dst.z = ex2(src.x) */
1446 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1447 emit_ex2( func, 0 );
1448 STORE( func, *inst, 0, 0, CHAN_Z );
1449 }
1450 }
1451 /* dst.w = 1.0 */
1452 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
1453 emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
1454 STORE( func, *inst, 0, 0, CHAN_W );
1455 }
1456 break;
1457
1458 case TGSI_OPCODE_LOG:
1459 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1460 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1461 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1462 FETCH( func, *inst, 0, 0, CHAN_X );
1463 emit_abs( func, 0 );
1464 emit_MOV( func, 1, 0 );
1465 emit_lg2( func, 1 );
1466 /* dst.z = lg2(abs(src.x)) */
1467 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1468 STORE( func, *inst, 1, 0, CHAN_Z );
1469 }
1470 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1471 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1472 emit_flr( func, 1 );
1473 /* dst.x = floor(lg2(abs(src.x))) */
1474 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
1475 STORE( func, *inst, 1, 0, CHAN_X );
1476 }
1477 /* dst.x = abs(src)/ex2(floor(lg2(abs(src.x)))) */
1478 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1479 emit_ex2( func, 1 );
1480 emit_rcp( func, 1, 1 );
1481 emit_mul( func, 0, 1 );
1482 STORE( func, *inst, 0, 0, CHAN_Y );
1483 }
1484 }
1485 }
1486 /* dst.w = 1.0 */
1487 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
1488 emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
1489 STORE( func, *inst, 0, 0, CHAN_W );
1490 }
1491 break;
1492
1493 case TGSI_OPCODE_MUL:
1494 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1495 FETCH( func, *inst, 0, 0, chan_index );
1496 FETCH( func, *inst, 1, 1, chan_index );
1497 emit_mul( func, 0, 1 );
1498 STORE( func, *inst, 0, 0, chan_index );
1499 }
1500 break;
1501
1502 case TGSI_OPCODE_ADD:
1503 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1504 FETCH( func, *inst, 0, 0, chan_index );
1505 FETCH( func, *inst, 1, 1, chan_index );
1506 emit_add( func, 0, 1 );
1507 STORE( func, *inst, 0, 0, chan_index );
1508 }
1509 break;
1510
1511 case TGSI_OPCODE_DP3:
1512 /* TGSI_OPCODE_DOT3 */
1513 FETCH( func, *inst, 0, 0, CHAN_X );
1514 FETCH( func, *inst, 1, 1, CHAN_X );
1515 emit_mul( func, 0, 1 );
1516 FETCH( func, *inst, 1, 0, CHAN_Y );
1517 FETCH( func, *inst, 2, 1, CHAN_Y );
1518 emit_mul( func, 1, 2 );
1519 emit_add( func, 0, 1 );
1520 FETCH( func, *inst, 1, 0, CHAN_Z );
1521 FETCH( func, *inst, 2, 1, CHAN_Z );
1522 emit_mul( func, 1, 2 );
1523 emit_add( func, 0, 1 );
1524 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1525 STORE( func, *inst, 0, 0, chan_index );
1526 }
1527 break;
1528
1529 case TGSI_OPCODE_DP4:
1530 /* TGSI_OPCODE_DOT4 */
1531 FETCH( func, *inst, 0, 0, CHAN_X );
1532 FETCH( func, *inst, 1, 1, CHAN_X );
1533 emit_mul( func, 0, 1 );
1534 FETCH( func, *inst, 1, 0, CHAN_Y );
1535 FETCH( func, *inst, 2, 1, CHAN_Y );
1536 emit_mul( func, 1, 2 );
1537 emit_add( func, 0, 1 );
1538 FETCH( func, *inst, 1, 0, CHAN_Z );
1539 FETCH( func, *inst, 2, 1, CHAN_Z );
1540 emit_mul(func, 1, 2 );
1541 emit_add(func, 0, 1 );
1542 FETCH( func, *inst, 1, 0, CHAN_W );
1543 FETCH( func, *inst, 2, 1, CHAN_W );
1544 emit_mul( func, 1, 2 );
1545 emit_add( func, 0, 1 );
1546 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1547 STORE( func, *inst, 0, 0, chan_index );
1548 }
1549 break;
1550
1551 case TGSI_OPCODE_DST:
1552 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
1553 emit_tempf(
1554 func,
1555 0,
1556 TEMP_ONE_I,
1557 TEMP_ONE_C );
1558 STORE( func, *inst, 0, 0, CHAN_X );
1559 }
1560 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
1561 FETCH( func, *inst, 0, 0, CHAN_Y );
1562 FETCH( func, *inst, 1, 1, CHAN_Y );
1563 emit_mul( func, 0, 1 );
1564 STORE( func, *inst, 0, 0, CHAN_Y );
1565 }
1566 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
1567 FETCH( func, *inst, 0, 0, CHAN_Z );
1568 STORE( func, *inst, 0, 0, CHAN_Z );
1569 }
1570 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
1571 FETCH( func, *inst, 0, 1, CHAN_W );
1572 STORE( func, *inst, 0, 0, CHAN_W );
1573 }
1574 break;
1575
1576 case TGSI_OPCODE_MIN:
1577 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1578 FETCH( func, *inst, 0, 0, chan_index );
1579 FETCH( func, *inst, 1, 1, chan_index );
1580 sse_minps(
1581 func,
1582 make_xmm( 0 ),
1583 make_xmm( 1 ) );
1584 STORE( func, *inst, 0, 0, chan_index );
1585 }
1586 break;
1587
1588 case TGSI_OPCODE_MAX:
1589 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1590 FETCH( func, *inst, 0, 0, chan_index );
1591 FETCH( func, *inst, 1, 1, chan_index );
1592 sse_maxps(
1593 func,
1594 make_xmm( 0 ),
1595 make_xmm( 1 ) );
1596 STORE( func, *inst, 0, 0, chan_index );
1597 }
1598 break;
1599
1600 case TGSI_OPCODE_SLT:
1601 /* TGSI_OPCODE_SETLT */
1602 emit_setcc( func, inst, cc_LessThan );
1603 break;
1604
1605 case TGSI_OPCODE_SGE:
1606 /* TGSI_OPCODE_SETGE */
1607 emit_setcc( func, inst, cc_NotLessThan );
1608 break;
1609
1610 case TGSI_OPCODE_MAD:
1611 /* TGSI_OPCODE_MADD */
1612 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1613 FETCH( func, *inst, 0, 0, chan_index );
1614 FETCH( func, *inst, 1, 1, chan_index );
1615 FETCH( func, *inst, 2, 2, chan_index );
1616 emit_mul( func, 0, 1 );
1617 emit_add( func, 0, 2 );
1618 STORE( func, *inst, 0, 0, chan_index );
1619 }
1620 break;
1621
1622 case TGSI_OPCODE_SUB:
1623 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1624 FETCH( func, *inst, 0, 0, chan_index );
1625 FETCH( func, *inst, 1, 1, chan_index );
1626 emit_sub( func, 0, 1 );
1627 STORE( func, *inst, 0, 0, chan_index );
1628 }
1629 break;
1630
1631 case TGSI_OPCODE_LERP:
1632 /* TGSI_OPCODE_LRP */
1633 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1634 FETCH( func, *inst, 0, 0, chan_index );
1635 FETCH( func, *inst, 1, 1, chan_index );
1636 FETCH( func, *inst, 2, 2, chan_index );
1637 emit_sub( func, 1, 2 );
1638 emit_mul( func, 0, 1 );
1639 emit_add( func, 0, 2 );
1640 STORE( func, *inst, 0, 0, chan_index );
1641 }
1642 break;
1643
1644 case TGSI_OPCODE_CND:
1645 return 0;
1646 break;
1647
1648 case TGSI_OPCODE_CND0:
1649 return 0;
1650 break;
1651
1652 case TGSI_OPCODE_DOT2ADD:
1653 /* TGSI_OPCODE_DP2A */
1654 return 0;
1655 break;
1656
1657 case TGSI_OPCODE_INDEX:
1658 return 0;
1659 break;
1660
1661 case TGSI_OPCODE_NEGATE:
1662 return 0;
1663 break;
1664
1665 case TGSI_OPCODE_FRAC:
1666 /* TGSI_OPCODE_FRC */
1667 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1668 FETCH( func, *inst, 0, 0, chan_index );
1669 emit_frc( func, 0 );
1670 STORE( func, *inst, 0, 0, chan_index );
1671 }
1672 break;
1673
1674 case TGSI_OPCODE_CLAMP:
1675 return 0;
1676 break;
1677
1678 case TGSI_OPCODE_FLOOR:
1679 /* TGSI_OPCODE_FLR */
1680 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1681 FETCH( func, *inst, 0, 0, chan_index );
1682 emit_flr( func, 0 );
1683 STORE( func, *inst, 0, 0, chan_index );
1684 }
1685 break;
1686
1687 case TGSI_OPCODE_ROUND:
1688 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1689 FETCH( func, *inst, 0, 0, chan_index );
1690 emit_rnd( func, 0, 0 );
1691 STORE( func, *inst, 0, 0, chan_index );
1692 }
1693 break;
1694
1695 case TGSI_OPCODE_EXPBASE2:
1696 /* TGSI_OPCODE_EX2 */
1697 FETCH( func, *inst, 0, 0, CHAN_X );
1698 emit_ex2( func, 0 );
1699 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1700 STORE( func, *inst, 0, 0, chan_index );
1701 }
1702 break;
1703
1704 case TGSI_OPCODE_LOGBASE2:
1705 /* TGSI_OPCODE_LG2 */
1706 FETCH( func, *inst, 0, 0, CHAN_X );
1707 emit_lg2( func, 0 );
1708 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1709 STORE( func, *inst, 0, 0, chan_index );
1710 }
1711 break;
1712
1713 case TGSI_OPCODE_POWER:
1714 /* TGSI_OPCODE_POW */
1715 FETCH( func, *inst, 0, 0, CHAN_X );
1716 FETCH( func, *inst, 1, 1, CHAN_X );
1717 emit_pow( func, 0, 1 );
1718 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1719 STORE( func, *inst, 0, 0, chan_index );
1720 }
1721 break;
1722
1723 case TGSI_OPCODE_CROSSPRODUCT:
1724 /* TGSI_OPCODE_XPD */
1725 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1726 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
1727 FETCH( func, *inst, 1, 1, CHAN_Z );
1728 FETCH( func, *inst, 3, 0, CHAN_Z );
1729 }
1730 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1731 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1732 FETCH( func, *inst, 0, 0, CHAN_Y );
1733 FETCH( func, *inst, 4, 1, CHAN_Y );
1734 }
1735 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
1736 emit_MOV( func, 2, 0 );
1737 emit_mul( func, 2, 1 );
1738 emit_MOV( func, 5, 3 );
1739 emit_mul( func, 5, 4 );
1740 emit_sub( func, 2, 5 );
1741 STORE( func, *inst, 2, 0, CHAN_X );
1742 }
1743 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1744 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1745 FETCH( func, *inst, 2, 1, CHAN_X );
1746 FETCH( func, *inst, 5, 0, CHAN_X );
1747 }
1748 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
1749 emit_mul( func, 3, 2 );
1750 emit_mul( func, 1, 5 );
1751 emit_sub( func, 3, 1 );
1752 STORE( func, *inst, 3, 0, CHAN_Y );
1753 }
1754 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
1755 emit_mul( func, 5, 4 );
1756 emit_mul( func, 0, 2 );
1757 emit_sub( func, 5, 0 );
1758 STORE( func, *inst, 5, 0, CHAN_Z );
1759 }
1760 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
1761 emit_tempf(
1762 func,
1763 0,
1764 TEMP_ONE_I,
1765 TEMP_ONE_C );
1766 STORE( func, *inst, 0, 0, CHAN_W );
1767 }
1768 break;
1769
1770 case TGSI_OPCODE_MULTIPLYMATRIX:
1771 return 0;
1772 break;
1773
1774 case TGSI_OPCODE_ABS:
1775 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1776 FETCH( func, *inst, 0, 0, chan_index );
1777 emit_abs( func, 0) ;
1778
1779 STORE( func, *inst, 0, 0, chan_index );
1780 }
1781 break;
1782
1783 case TGSI_OPCODE_RCC:
1784 return 0;
1785 break;
1786
1787 case TGSI_OPCODE_DPH:
1788 FETCH( func, *inst, 0, 0, CHAN_X );
1789 FETCH( func, *inst, 1, 1, CHAN_X );
1790 emit_mul( func, 0, 1 );
1791 FETCH( func, *inst, 1, 0, CHAN_Y );
1792 FETCH( func, *inst, 2, 1, CHAN_Y );
1793 emit_mul( func, 1, 2 );
1794 emit_add( func, 0, 1 );
1795 FETCH( func, *inst, 1, 0, CHAN_Z );
1796 FETCH( func, *inst, 2, 1, CHAN_Z );
1797 emit_mul( func, 1, 2 );
1798 emit_add( func, 0, 1 );
1799 FETCH( func, *inst, 1, 1, CHAN_W );
1800 emit_add( func, 0, 1 );
1801 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1802 STORE( func, *inst, 0, 0, chan_index );
1803 }
1804 break;
1805
1806 case TGSI_OPCODE_COS:
1807 FETCH( func, *inst, 0, 0, CHAN_X );
1808 emit_cos( func, 0 );
1809 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1810 STORE( func, *inst, 0, 0, chan_index );
1811 }
1812 break;
1813
1814 case TGSI_OPCODE_DDX:
1815 return 0;
1816 break;
1817
1818 case TGSI_OPCODE_DDY:
1819 return 0;
1820 break;
1821
1822 case TGSI_OPCODE_KILP:
1823 /* predicated kill */
1824 emit_kilp( func );
1825 return 0; /* XXX fix me */
1826 break;
1827
1828 case TGSI_OPCODE_KIL:
1829 /* conditional kill */
1830 emit_kil( func, &inst->FullSrcRegisters[0] );
1831 break;
1832
1833 case TGSI_OPCODE_PK2H:
1834 return 0;
1835 break;
1836
1837 case TGSI_OPCODE_PK2US:
1838 return 0;
1839 break;
1840
1841 case TGSI_OPCODE_PK4B:
1842 return 0;
1843 break;
1844
1845 case TGSI_OPCODE_PK4UB:
1846 return 0;
1847 break;
1848
1849 case TGSI_OPCODE_RFL:
1850 return 0;
1851 break;
1852
1853 case TGSI_OPCODE_SEQ:
1854 return 0;
1855 break;
1856
1857 case TGSI_OPCODE_SFL:
1858 return 0;
1859 break;
1860
1861 case TGSI_OPCODE_SGT:
1862 return 0;
1863 break;
1864
1865 case TGSI_OPCODE_SIN:
1866 FETCH( func, *inst, 0, 0, CHAN_X );
1867 emit_sin( func, 0 );
1868 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1869 STORE( func, *inst, 0, 0, chan_index );
1870 }
1871 break;
1872
1873 case TGSI_OPCODE_SLE:
1874 return 0;
1875 break;
1876
1877 case TGSI_OPCODE_SNE:
1878 return 0;
1879 break;
1880
1881 case TGSI_OPCODE_STR:
1882 return 0;
1883 break;
1884
1885 case TGSI_OPCODE_TEX:
1886 if (0) {
1887 /* Disable dummy texture code:
1888 */
1889 emit_tempf(
1890 func,
1891 0,
1892 TEMP_ONE_I,
1893 TEMP_ONE_C );
1894 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1895 STORE( func, *inst, 0, 0, chan_index );
1896 }
1897 }
1898 else {
1899 return 0;
1900 }
1901 break;
1902
1903 case TGSI_OPCODE_TXD:
1904 return 0;
1905 break;
1906
1907 case TGSI_OPCODE_UP2H:
1908 return 0;
1909 break;
1910
1911 case TGSI_OPCODE_UP2US:
1912 return 0;
1913 break;
1914
1915 case TGSI_OPCODE_UP4B:
1916 return 0;
1917 break;
1918
1919 case TGSI_OPCODE_UP4UB:
1920 return 0;
1921 break;
1922
1923 case TGSI_OPCODE_X2D:
1924 return 0;
1925 break;
1926
1927 case TGSI_OPCODE_ARA:
1928 return 0;
1929 break;
1930
1931 #if 0
1932 case TGSI_OPCODE_ARR:
1933 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1934 FETCH( func, *inst, 0, 0, chan_index );
1935 emit_rnd( func, 0, 0 );
1936 emit_f2it( func, 0 );
1937 STORE( func, *inst, 0, 0, chan_index );
1938 }
1939 break;
1940 #endif
1941 case TGSI_OPCODE_BRA:
1942 return 0;
1943 break;
1944
1945 case TGSI_OPCODE_CAL:
1946 return 0;
1947 break;
1948
1949 case TGSI_OPCODE_RET:
1950 emit_ret( func );
1951 break;
1952
1953 case TGSI_OPCODE_END:
1954 break;
1955
1956 case TGSI_OPCODE_SSG:
1957 /* TGSI_OPCODE_SGN */
1958 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1959 FETCH( func, *inst, 0, 0, chan_index );
1960 emit_sgn( func, 0, 0 );
1961 STORE( func, *inst, 0, 0, chan_index );
1962 }
1963 break;
1964
1965 case TGSI_OPCODE_CMP:
1966 emit_cmp (func, inst);
1967 break;
1968
1969 case TGSI_OPCODE_SCS:
1970 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
1971 FETCH( func, *inst, 0, 0, CHAN_X );
1972 emit_cos( func, 0 );
1973 STORE( func, *inst, 0, 0, CHAN_X );
1974 }
1975 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
1976 FETCH( func, *inst, 0, 0, CHAN_X );
1977 emit_sin( func, 0 );
1978 STORE( func, *inst, 0, 0, CHAN_Y );
1979 }
1980 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
1981 emit_tempf(
1982 func,
1983 0,
1984 TGSI_EXEC_TEMP_00000000_I,
1985 TGSI_EXEC_TEMP_00000000_C );
1986 STORE( func, *inst, 0, 0, CHAN_Z );
1987 }
1988 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
1989 emit_tempf(
1990 func,
1991 0,
1992 TEMP_ONE_I,
1993 TEMP_ONE_C );
1994 STORE( func, *inst, 0, 0, CHAN_W );
1995 }
1996 break;
1997
1998 case TGSI_OPCODE_TXB:
1999 return 0;
2000 break;
2001
2002 case TGSI_OPCODE_NRM:
2003 return 0;
2004 break;
2005
2006 case TGSI_OPCODE_DIV:
2007 return 0;
2008 break;
2009
2010 case TGSI_OPCODE_DP2:
2011 return 0;
2012 break;
2013
2014 case TGSI_OPCODE_TXL:
2015 return 0;
2016 break;
2017
2018 case TGSI_OPCODE_BRK:
2019 return 0;
2020 break;
2021
2022 case TGSI_OPCODE_IF:
2023 return 0;
2024 break;
2025
2026 case TGSI_OPCODE_LOOP:
2027 return 0;
2028 break;
2029
2030 case TGSI_OPCODE_REP:
2031 return 0;
2032 break;
2033
2034 case TGSI_OPCODE_ELSE:
2035 return 0;
2036 break;
2037
2038 case TGSI_OPCODE_ENDIF:
2039 return 0;
2040 break;
2041
2042 case TGSI_OPCODE_ENDLOOP:
2043 return 0;
2044 break;
2045
2046 case TGSI_OPCODE_ENDREP:
2047 return 0;
2048 break;
2049
2050 case TGSI_OPCODE_PUSHA:
2051 return 0;
2052 break;
2053
2054 case TGSI_OPCODE_POPA:
2055 return 0;
2056 break;
2057
2058 case TGSI_OPCODE_CEIL:
2059 return 0;
2060 break;
2061
2062 case TGSI_OPCODE_I2F:
2063 return 0;
2064 break;
2065
2066 case TGSI_OPCODE_NOT:
2067 return 0;
2068 break;
2069
2070 case TGSI_OPCODE_TRUNC:
2071 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2072 FETCH( func, *inst, 0, 0, chan_index );
2073 emit_f2it( func, 0 );
2074 emit_i2f( func, 0 );
2075 STORE( func, *inst, 0, 0, chan_index );
2076 }
2077 break;
2078
2079 case TGSI_OPCODE_SHL:
2080 return 0;
2081 break;
2082
2083 case TGSI_OPCODE_SHR:
2084 return 0;
2085 break;
2086
2087 case TGSI_OPCODE_AND:
2088 return 0;
2089 break;
2090
2091 case TGSI_OPCODE_OR:
2092 return 0;
2093 break;
2094
2095 case TGSI_OPCODE_MOD:
2096 return 0;
2097 break;
2098
2099 case TGSI_OPCODE_XOR:
2100 return 0;
2101 break;
2102
2103 case TGSI_OPCODE_SAD:
2104 return 0;
2105 break;
2106
2107 case TGSI_OPCODE_TXF:
2108 return 0;
2109 break;
2110
2111 case TGSI_OPCODE_TXQ:
2112 return 0;
2113 break;
2114
2115 case TGSI_OPCODE_CONT:
2116 return 0;
2117 break;
2118
2119 case TGSI_OPCODE_EMIT:
2120 return 0;
2121 break;
2122
2123 case TGSI_OPCODE_ENDPRIM:
2124 return 0;
2125 break;
2126
2127 default:
2128 return 0;
2129 }
2130
2131 return 1;
2132 }
2133
/**
 * Emit code to compute the value of one declared shader input register.
 *
 * Only TGSI_FILE_INPUT declarations produce code; other declaration
 * files are ignored here.  For every enabled channel (per UsageMask) of
 * every register in the declaration range, interpolation code is
 * generated according to the declaration's Interpolate mode:
 *
 *  - CONSTANT:    input = a0
 *  - LINEAR:      input = x * dadx + y * dady + a0
 *  - PERSPECTIVE: input = (x * dadx + y * dady + a0) * (1 / w)
 *
 * where x, y, w are fetched from temp 0 (the fragment position, per the
 * emit_tempf(..., 0, TGSI_SWIZZLE_*) calls) and a0/dadx/dady come from
 * the coefficient block.  The result is stored with emit_inputs().
 * XMM registers 0-5 are used as scratch.
 */
static void
emit_declaration(
   struct x86_function *func,
   struct tgsi_full_declaration *decl )
{
   if( decl->Declaration.File == TGSI_FILE_INPUT ) {
      unsigned first, last, mask;
      unsigned i, j;

      first = decl->DeclarationRange.First;
      last = decl->DeclarationRange.Last;
      mask = decl->Declaration.UsageMask;

      for( i = first; i <= last; i++ ) {
         for( j = 0; j < NUM_CHANNELS; j++ ) {
            if( mask & (1 << j) ) {
               switch( decl->Declaration.Interpolate ) {
               case TGSI_INTERPOLATE_CONSTANT:
                  emit_coef_a0( func, 0, i, j );
                  emit_inputs( func, 0, i, j );
                  break;

               case TGSI_INTERPOLATE_LINEAR:
                  emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
                  emit_coef_dadx( func, 1, i, j );
                  emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
                  emit_coef_dady( func, 3, i, j );
                  emit_mul( func, 0, 1 );    /* x * dadx */
                  emit_coef_a0( func, 4, i, j );
                  emit_mul( func, 2, 3 );    /* y * dady */
                  emit_add( func, 0, 4 );    /* x * dadx + a0 */
                  emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
                  emit_inputs( func, 0, i, j );
                  break;

               case TGSI_INTERPOLATE_PERSPECTIVE:
                  emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
                  emit_coef_dadx( func, 1, i, j );
                  emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
                  emit_coef_dady( func, 3, i, j );
                  emit_mul( func, 0, 1 );    /* x * dadx */
                  emit_tempf( func, 4, 0, TGSI_SWIZZLE_W );
                  emit_coef_a0( func, 5, i, j );
                  emit_rcp( func, 4, 4 );    /* 1.0 / w */
                  emit_mul( func, 2, 3 );    /* y * dady */
                  emit_add( func, 0, 5 );    /* x * dadx + a0 */
                  emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
                  emit_mul( func, 0, 4 );    /* (x * dadx + y * dady + a0) / w */
                  emit_inputs( func, 0, i, j );
                  break;

               default:
                  /* Unknown interpolation mode -- should not happen for
                   * well-formed TGSI input.
                   */
                  assert( 0 );
                  break;
               }
            }
         }
      }
   }
}
2194
/**
 * Emit code that converts vertex data from AOS (array-of-structures,
 * one xyzw vector per vertex) layout to SOA (structure-of-arrays,
 * x[4] y[4] z[4] w[4]) layout, i.e. a 4x4 float transpose per group of
 * four vertices.
 *
 * The arg_* parameters are 1-based indices of the generated function's
 * own arguments (fetched with x86_fn_arg at code-generation time):
 *   arg_aos    - pointer to the AOS input vertices
 *   arg_soa    - pointer to the SOA output buffer
 *   arg_num    - number of 4-vertex groups to convert (must be >= 1;
 *                the loop is do/while style and always runs once)
 *   arg_stride - byte stride between consecutive AOS vertices
 *
 * Uses EAX/EBX/ECX/EDX and XMM0-5 as scratch; EBX is callee-saved and
 * therefore pushed/popped around the whole routine.
 */
static void aos_to_soa( struct x86_function *func,
                        uint arg_aos,
                        uint arg_soa,
                        uint arg_num,
                        uint arg_stride )
{
   struct x86_reg soa_input = x86_make_reg( file_REG32, reg_AX );
   struct x86_reg aos_input = x86_make_reg( file_REG32, reg_BX );
   struct x86_reg num_inputs = x86_make_reg( file_REG32, reg_CX );
   struct x86_reg stride = x86_make_reg( file_REG32, reg_DX );
   int inner_loop;


   /* Save EBX */
   x86_push( func, x86_make_reg( file_REG32, reg_BX ) );

   x86_mov( func, aos_input, x86_fn_arg( func, arg_aos ) );
   x86_mov( func, soa_input, x86_fn_arg( func, arg_soa ) );
   x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) );
   x86_mov( func, stride, x86_fn_arg( func, arg_stride ) );

   /* do */
   inner_loop = x86_get_label( func );
   {
      /* Preserve the group's base pointer; the loads below walk
       * aos_input forward by one vertex stride at a time.
       */
      x86_push( func, aos_input );
      /* Gather xy pairs into xmm0/xmm1 and zw pairs into xmm3/xmm4:
       * low halves from vertices 0 and 2, high halves from 1 and 3.
       */
      sse_movlps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
      sse_movlps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
      x86_add( func, aos_input, stride );
      sse_movhps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
      sse_movhps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
      x86_add( func, aos_input, stride );
      sse_movlps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
      sse_movlps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
      x86_add( func, aos_input, stride );
      sse_movhps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
      sse_movhps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
      x86_pop( func, aos_input );

      /* Finish the transpose: shuffle even lanes (0x88) into the x/z
       * vectors and odd lanes (0xdd) into the y/w vectors.
       */
      sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
      sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
      sse_shufps( func, make_xmm( 0 ), make_xmm( 1 ), 0x88 );
      sse_shufps( func, make_xmm( 2 ), make_xmm( 1 ), 0xdd );
      sse_shufps( func, make_xmm( 3 ), make_xmm( 4 ), 0x88 );
      sse_shufps( func, make_xmm( 5 ), make_xmm( 4 ), 0xdd );

      /* Write out x[4], y[4], z[4], w[4] contiguously. */
      sse_movups( func, x86_make_disp( soa_input, 0 ), make_xmm( 0 ) );
      sse_movups( func, x86_make_disp( soa_input, 16 ), make_xmm( 2 ) );
      sse_movups( func, x86_make_disp( soa_input, 32 ), make_xmm( 3 ) );
      sse_movups( func, x86_make_disp( soa_input, 48 ), make_xmm( 5 ) );

      /* Advance to next input */
      x86_lea( func, aos_input, x86_make_disp(aos_input, 16) );
      x86_lea( func, soa_input, x86_make_disp(soa_input, 64) );
   }
   /* while --num_inputs */
   x86_dec( func, num_inputs );
   x86_jcc( func, cc_NE, inner_loop );

   /* Restore EBX */
   x86_pop( func, aos_input );
}
2256
/**
 * Emit code that converts shader output data from SOA layout
 * (x[4] y[4] z[4] w[4]) back to AOS layout (one xyzw vector per
 * vertex) -- the inverse transpose of aos_to_soa().
 *
 * aos/soa/num/stride are 1-based indices of the generated function's
 * arguments (fetched via x86_fn_arg at code-generation time):
 *   aos    - pointer to the AOS destination vertices
 *   soa    - pointer to the SOA source buffer
 *   num    - number of 4-vertex groups (must be >= 1; do/while loop)
 *   stride - byte stride between consecutive AOS vertices
 *
 * Uses EAX/EBX/ECX/EDX and XMM0-5 as scratch; EBX is callee-saved and
 * therefore pushed/popped around the whole routine.
 */
static void soa_to_aos( struct x86_function *func, uint aos, uint soa, uint num, uint stride )
{
   struct x86_reg soa_output;
   struct x86_reg aos_output;
   struct x86_reg num_outputs;
   struct x86_reg temp;
   int inner_loop;

   soa_output = x86_make_reg( file_REG32, reg_AX );
   aos_output = x86_make_reg( file_REG32, reg_BX );
   num_outputs = x86_make_reg( file_REG32, reg_CX );
   temp = x86_make_reg( file_REG32, reg_DX );

   /* Save EBX */
   x86_push( func, aos_output );

   x86_mov( func, soa_output, x86_fn_arg( func, soa ) );
   x86_mov( func, aos_output, x86_fn_arg( func, aos ) );
   x86_mov( func, num_outputs, x86_fn_arg( func, num ) );

   /* do */
   inner_loop = x86_get_label( func );
   {
      /* Load the four SOA channel vectors. */
      sse_movups( func, make_xmm( 0 ), x86_make_disp( soa_output, 0 ) );
      sse_movups( func, make_xmm( 1 ), x86_make_disp( soa_output, 16 ) );
      sse_movups( func, make_xmm( 3 ), x86_make_disp( soa_output, 32 ) );
      sse_movups( func, make_xmm( 4 ), x86_make_disp( soa_output, 48 ) );

      /* Interleave x/y into xmm0 (lo) / xmm2 (hi) and z/w into
       * xmm3 (lo) / xmm5 (hi), forming per-vertex xy and zw pairs.
       */
      sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
      sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
      sse_unpcklps( func, make_xmm( 0 ), make_xmm( 1 ) );
      sse_unpckhps( func, make_xmm( 2 ), make_xmm( 1 ) );
      sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) );
      sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) );

      /* NOTE(review): the stride argument is reloaded on every loop
       * iteration here, while aos_to_soa() loads it once up front;
       * looks loop-invariant (nothing below clobbers EDX) -- confirm
       * before hoisting, since it changes the emitted code.
       */
      x86_mov( func, temp, x86_fn_arg( func, stride ) );
      /* Preserve the group's base pointer; the stores below walk
       * aos_output forward by one vertex stride at a time.
       */
      x86_push( func, aos_output );
      sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
      sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
      x86_add( func, aos_output, temp );
      sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
      sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
      x86_add( func, aos_output, temp );
      sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
      sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
      x86_add( func, aos_output, temp );
      sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
      sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
      x86_pop( func, aos_output );

      /* Advance to next output */
      x86_lea( func, aos_output, x86_make_disp(aos_output, 16) );
      x86_lea( func, soa_output, x86_make_disp(soa_output, 64) );
   }
   /* while --num_outputs */
   x86_dec( func, num_outputs );
   x86_jcc( func, cc_NE, inner_loop );

   /* Restore EBX */
   x86_pop( func, aos_output );
}
2318
2319 /**
2320 * Translate a TGSI vertex/fragment shader to SSE2 code.
2321 * Slightly different things are done for vertex vs. fragment shaders.
2322 *
2323 * Note that fragment shaders are responsible for interpolating shader
2324 * inputs. Because on x86 we have only 4 GP registers, and here we
2325 * have 5 shader arguments (input, output, const, temp and coef), the
2326 * code is split into two phases -- DECLARATION and INSTRUCTION phase.
2327 * GP register holding the output argument is aliased with the coeff
2328 * argument, as outputs are not needed in the DECLARATION phase.
2329 *
2330 * \param tokens the TGSI input shader
2331 * \param func the output SSE code/function
2332 * \param immediates buffer to place immediates, later passed to SSE func
 * \return 1 for success, 0 if translation failed
2334 */
unsigned
tgsi_emit_sse2(
   const struct tgsi_token *tokens,
   struct x86_function *func,
   float (*immediates)[4],
   boolean do_swizzles )
{
   struct tgsi_parse_context parse;
   boolean instruction_phase = FALSE;   /* fragment shaders: FALSE until the
                                         * first instruction token is seen */
   unsigned ok = 1;
   uint num_immediates = 0;

   util_init_math();

   /* Start emitting at the beginning of the function's code store. */
   func->csr = func->store;

   tgsi_parse_init( &parse, tokens );

   /* Can't just use EDI, EBX without save/restoring them:
    */
   x86_push(
      func,
      get_immediate_base() );

   x86_push(
      func,
      get_temp_base() );


   /*
    * Different function args for vertex/fragment shaders:
    */
   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
      /* DECLARATION phase, do not load output argument. */
      x86_mov(
         func,
         get_input_base(),
         x86_fn_arg( func, 1 ) );
      /* skipping outputs argument here */
      x86_mov(
         func,
         get_const_base(),
         x86_fn_arg( func, 3 ) );
      x86_mov(
         func,
         get_temp_base(),
         x86_fn_arg( func, 4 ) );
      x86_mov(
         func,
         get_coef_base(),
         x86_fn_arg( func, 5 ) );
      x86_mov(
         func,
         get_immediate_base(),
         x86_fn_arg( func, 6 ) );
   }
   else {
      assert(parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX);

      /* Vertex shaders optionally transpose the incoming AOS vertex
       * data into the machine's SOA input buffer first.
       */
      if (do_swizzles)
         aos_to_soa( func,
                     6,         /* aos_input */
                     1,         /* machine->input */
                     7,         /* num_inputs */
                     8 );       /* input_stride */

      x86_mov(
         func,
         get_input_base(),
         x86_fn_arg( func, 1 ) );
      x86_mov(
         func,
         get_output_base(),
         x86_fn_arg( func, 2 ) );
      x86_mov(
         func,
         get_const_base(),
         x86_fn_arg( func, 3 ) );
      x86_mov(
         func,
         get_temp_base(),
         x86_fn_arg( func, 4 ) );
      x86_mov(
         func,
         get_immediate_base(),
         x86_fn_arg( func, 5 ) );
   }

   /* Main token loop: stop early if any instruction fails to translate
    * (ok becomes 0) and report the failure to the caller.
    */
   while( !tgsi_parse_end_of_tokens( &parse ) && ok ) {
      tgsi_parse_token( &parse );

      switch( parse.FullToken.Token.Type ) {
      case TGSI_TOKEN_TYPE_DECLARATION:
         /* Only fragment shaders generate interpolation code for
          * declarations; vertex shader inputs arrive ready to use.
          */
         if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
            emit_declaration(
               func,
               &parse.FullToken.FullDeclaration );
         }
         break;

      case TGSI_TOKEN_TYPE_INSTRUCTION:
         if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
            if( !instruction_phase ) {
               /* INSTRUCTION phase, overwrite coeff with output. */
               instruction_phase = TRUE;
               x86_mov(
                  func,
                  get_output_base(),
                  x86_fn_arg( func, 2 ) );
            }
         }

         ok = emit_instruction(
            func,
            &parse.FullToken.FullInstruction );

         if (!ok) {
            debug_printf("failed to translate tgsi opcode %d to SSE (%s)\n",
                         parse.FullToken.FullInstruction.Instruction.Opcode,
                         parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX ?
                         "vertex shader" : "fragment shader");
         }
         break;

      case TGSI_TOKEN_TYPE_IMMEDIATE:
         /* simply copy the immediate values into the next immediates[] slot */
         {
            /* Size counts the immediate header token too, hence -1. */
            const uint size = parse.FullToken.FullImmediate.Immediate.Size - 1;
            uint i;
            assert(size <= 4);
            assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES);
            for( i = 0; i < size; i++ ) {
               immediates[num_immediates][i] =
		  parse.FullToken.FullImmediate.u.ImmediateFloat32[i].Float;
            }
#if 0
            debug_printf("SSE FS immediate[%d] = %f %f %f %f\n",
                   num_immediates,
                   immediates[num_immediates][0],
                   immediates[num_immediates][1],
                   immediates[num_immediates][2],
                   immediates[num_immediates][3]);
#endif
            num_immediates++;
         }
         break;

      default:
         ok = 0;
         assert( 0 );
      }
   }

   /* Vertex shaders optionally transpose SOA results back to the
    * caller's AOS output layout.
    */
   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
      if (do_swizzles)
         soa_to_aos( func, 9, 2, 10, 11 );
   }

   /* Can't just use EBX, EDI without save/restoring them:
    */
   x86_pop(
      func,
      get_temp_base() );

   x86_pop(
      func,
      get_immediate_base() );

   emit_ret( func );

   tgsi_parse_free( &parse );

   return ok;
}
2509
2510 #endif /* PIPE_ARCH_X86 */
2511