tgsi: Implement OPCODE_ARR.
[mesa.git] / src / gallium / auxiliary / tgsi / tgsi_sse2.c
1 /**************************************************************************
2 *
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28 #include "pipe/p_debug.h"
29 #include "pipe/p_shader_tokens.h"
30 #include "util/u_math.h"
31 #include "tgsi/tgsi_parse.h"
32 #include "tgsi/tgsi_util.h"
33 #include "tgsi_exec.h"
34 #include "tgsi_sse2.h"
35
36 #include "rtasm/rtasm_x86sse.h"
37
38 #ifdef PIPE_ARCH_X86
39
40 /* for 1/sqrt()
41 *
42 * This costs about 100fps (close to 10%) in gears:
43 */
44 #define HIGH_PRECISION 1
45
46 #define FAST_MATH 1
47
48
/* Iterate CHAN over all four vector components (X, Y, Z, W). */
#define FOR_EACH_CHANNEL( CHAN )\
   for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)

/* Test whether dst register 0's write mask enables component CHAN. */
#define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
   ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))

#define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
   if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))

/* Loop over only those components that dst register 0 actually writes. */
#define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
   FOR_EACH_CHANNEL( CHAN )\
      IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )

#define CHAN_X 0
#define CHAN_Y 1
#define CHAN_Z 2
#define CHAN_W 3

/* Short aliases for well-known tgsi_exec temporary-register slots. */
#define TEMP_ONE_I TGSI_EXEC_TEMP_ONE_I
#define TEMP_ONE_C TGSI_EXEC_TEMP_ONE_C

#define TEMP_R0 TGSI_EXEC_TEMP_R0
#define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
#define TEMP_EXEC_MASK_I TGSI_EXEC_MASK_I
#define TEMP_EXEC_MASK_C TGSI_EXEC_MASK_C
74
75
76 /**
77 * X86 utility functions.
78 */
79
80 static struct x86_reg
81 make_xmm(
82 unsigned xmm )
83 {
84 return x86_make_reg(
85 file_XMM,
86 (enum x86_reg_name) xmm );
87 }
88
89 /**
90 * X86 register mapping helpers.
91 */
92
93 static struct x86_reg
94 get_const_base( void )
95 {
96 return x86_make_reg(
97 file_REG32,
98 reg_CX );
99 }
100
101 static struct x86_reg
102 get_input_base( void )
103 {
104 return x86_make_reg(
105 file_REG32,
106 reg_AX );
107 }
108
109 static struct x86_reg
110 get_output_base( void )
111 {
112 return x86_make_reg(
113 file_REG32,
114 reg_DX );
115 }
116
117 static struct x86_reg
118 get_temp_base( void )
119 {
120 return x86_make_reg(
121 file_REG32,
122 reg_BX );
123 }
124
125 static struct x86_reg
126 get_coef_base( void )
127 {
128 return get_output_base();
129 }
130
131 static struct x86_reg
132 get_immediate_base( void )
133 {
134 return x86_make_reg(
135 file_REG32,
136 reg_DI );
137 }
138
139
140 /**
141 * Data access helpers.
142 */
143
144
145 static struct x86_reg
146 get_immediate(
147 unsigned vec,
148 unsigned chan )
149 {
150 return x86_make_disp(
151 get_immediate_base(),
152 (vec * 4 + chan) * 4 );
153 }
154
155 static struct x86_reg
156 get_const(
157 unsigned vec,
158 unsigned chan )
159 {
160 return x86_make_disp(
161 get_const_base(),
162 (vec * 4 + chan) * 4 );
163 }
164
165 static struct x86_reg
166 get_input(
167 unsigned vec,
168 unsigned chan )
169 {
170 return x86_make_disp(
171 get_input_base(),
172 (vec * 4 + chan) * 16 );
173 }
174
175 static struct x86_reg
176 get_output(
177 unsigned vec,
178 unsigned chan )
179 {
180 return x86_make_disp(
181 get_output_base(),
182 (vec * 4 + chan) * 16 );
183 }
184
185 static struct x86_reg
186 get_temp(
187 unsigned vec,
188 unsigned chan )
189 {
190 return x86_make_disp(
191 get_temp_base(),
192 (vec * 4 + chan) * 16 );
193 }
194
195 static struct x86_reg
196 get_coef(
197 unsigned vec,
198 unsigned chan,
199 unsigned member )
200 {
201 return x86_make_disp(
202 get_coef_base(),
203 ((vec * 3 + member) * 4 + chan) * 4 );
204 }
205
206
/** Emit a function-return instruction. */
static void
emit_ret( struct x86_function *func )
{
   x86_ret( func );
}
213
214
215 /**
216 * Data fetch helpers.
217 */
218
219 /**
220 * Copy a shader constant to xmm register
221 * \param xmm the destination xmm register
222 * \param vec the src const buffer index
223 * \param chan src channel to fetch (X, Y, Z or W)
224 */
225 static void
226 emit_const(
227 struct x86_function *func,
228 uint xmm,
229 int vec,
230 uint chan,
231 uint indirect,
232 uint indirectFile,
233 int indirectIndex )
234 {
235 if (indirect) {
236 /* 'vec' is the offset from the address register's value.
237 * We're loading CONST[ADDR+vec] into an xmm register.
238 */
239 struct x86_reg r0 = get_input_base();
240 struct x86_reg r1 = get_output_base();
241 uint i;
242
243 assert( indirectFile == TGSI_FILE_ADDRESS );
244 assert( indirectIndex == 0 );
245
246 x86_push( func, r0 );
247 x86_push( func, r1 );
248
249 /*
250 * Loop over the four pixels or vertices in the quad.
251 * Get the value of the address (offset) register for pixel/vertex[i],
252 * add it to the src offset and index into the constant buffer.
253 * Note that we're working on SOA data.
254 * If any of the pixel/vertex execution channels are unused their
255 * values will be garbage. It's very important that we don't use
256 * those garbage values as indexes into the constant buffer since
257 * that'll cause segfaults.
258 * The solution is to bitwise-AND the offset with the execution mask
259 * register whose values are either 0 or ~0.
260 * The caller must setup the execution mask register to indicate
261 * which channels are valid/alive before running the shader.
262 * The execution mask will also figure into loops and conditionals
263 * someday.
264 */
265 for (i = 0; i < QUAD_SIZE; i++) {
266 /* r1 = address register[i] */
267 x86_mov( func, r1, x86_make_disp( get_temp( TEMP_ADDR, CHAN_X ), i * 4 ) );
268 /* r0 = execution mask[i] */
269 x86_mov( func, r0, x86_make_disp( get_temp( TEMP_EXEC_MASK_I, TEMP_EXEC_MASK_C ), i * 4 ) );
270 /* r1 = r1 & r0 */
271 x86_and( func, r1, r0 );
272 /* r0 = 'vec', the offset */
273 x86_lea( func, r0, get_const( vec, chan ) );
274
275 /* Quick hack to multiply r1 by 16 -- need to add SHL to rtasm.
276 */
277 x86_add( func, r1, r1 );
278 x86_add( func, r1, r1 );
279 x86_add( func, r1, r1 );
280 x86_add( func, r1, r1 );
281
282 x86_add( func, r0, r1 ); /* r0 = r0 + r1 */
283 x86_mov( func, r1, x86_deref( r0 ) );
284 x86_mov( func, x86_make_disp( get_temp( TEMP_R0, CHAN_X ), i * 4 ), r1 );
285 }
286
287 x86_pop( func, r1 );
288 x86_pop( func, r0 );
289
290 sse_movaps(
291 func,
292 make_xmm( xmm ),
293 get_temp( TEMP_R0, CHAN_X ) );
294 }
295 else {
296 /* 'vec' is the index into the src register file, such as TEMP[vec] */
297 assert( vec >= 0 );
298
299 sse_movss(
300 func,
301 make_xmm( xmm ),
302 get_const( vec, chan ) );
303 sse_shufps(
304 func,
305 make_xmm( xmm ),
306 make_xmm( xmm ),
307 SHUF( 0, 0, 0, 0 ) );
308 }
309 }
310
311 static void
312 emit_immediate(
313 struct x86_function *func,
314 unsigned xmm,
315 unsigned vec,
316 unsigned chan )
317 {
318 sse_movss(
319 func,
320 make_xmm( xmm ),
321 get_immediate( vec, chan ) );
322 sse_shufps(
323 func,
324 make_xmm( xmm ),
325 make_xmm( xmm ),
326 SHUF( 0, 0, 0, 0 ) );
327 }
328
329
330 /**
331 * Copy a shader input to xmm register
332 * \param xmm the destination xmm register
333 * \param vec the src input attrib
334 * \param chan src channel to fetch (X, Y, Z or W)
335 */
336 static void
337 emit_inputf(
338 struct x86_function *func,
339 unsigned xmm,
340 unsigned vec,
341 unsigned chan )
342 {
343 sse_movups(
344 func,
345 make_xmm( xmm ),
346 get_input( vec, chan ) );
347 }
348
349 /**
350 * Store an xmm register to a shader output
351 * \param xmm the source xmm register
352 * \param vec the dest output attrib
353 * \param chan src dest channel to store (X, Y, Z or W)
354 */
355 static void
356 emit_output(
357 struct x86_function *func,
358 unsigned xmm,
359 unsigned vec,
360 unsigned chan )
361 {
362 sse_movups(
363 func,
364 get_output( vec, chan ),
365 make_xmm( xmm ) );
366 }
367
368 /**
369 * Copy a shader temporary to xmm register
370 * \param xmm the destination xmm register
371 * \param vec the src temp register
372 * \param chan src channel to fetch (X, Y, Z or W)
373 */
374 static void
375 emit_tempf(
376 struct x86_function *func,
377 unsigned xmm,
378 unsigned vec,
379 unsigned chan )
380 {
381 sse_movaps(
382 func,
383 make_xmm( xmm ),
384 get_temp( vec, chan ) );
385 }
386
387 /**
388 * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
389 * \param xmm the destination xmm register
390 * \param vec the src input/attribute coefficient index
391 * \param chan src channel to fetch (X, Y, Z or W)
392 * \param member 0=a0, 1=dadx, 2=dady
393 */
394 static void
395 emit_coef(
396 struct x86_function *func,
397 unsigned xmm,
398 unsigned vec,
399 unsigned chan,
400 unsigned member )
401 {
402 sse_movss(
403 func,
404 make_xmm( xmm ),
405 get_coef( vec, chan, member ) );
406 sse_shufps(
407 func,
408 make_xmm( xmm ),
409 make_xmm( xmm ),
410 SHUF( 0, 0, 0, 0 ) );
411 }
412
413 /**
414 * Data store helpers.
415 */
416
/** Store an xmm register back to a shader input slot (unaligned move). */
static void
emit_inputs(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   sse_movups( func, get_input( vec, chan ), make_xmm( xmm ) );
}
429
/** Store an xmm register to a shader temporary (aligned move). */
static void
emit_temps(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   sse_movaps( func, get_temp( vec, chan ), make_xmm( xmm ) );
}
442
443 static void
444 emit_addrs(
445 struct x86_function *func,
446 unsigned xmm,
447 unsigned vec,
448 unsigned chan )
449 {
450 assert( vec == 0 );
451
452 emit_temps(
453 func,
454 xmm,
455 vec + TGSI_EXEC_TEMP_ADDR,
456 chan );
457 }
458
459 /**
460 * Coefficent fetch helpers.
461 */
462
/** Fetch the a0 (constant) interpolation coefficient. */
static void
emit_coef_a0(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   emit_coef( func, xmm, vec, chan, 0 );
}
477
/** Fetch the dadx (x-derivative) interpolation coefficient. */
static void
emit_coef_dadx(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   emit_coef( func, xmm, vec, chan, 1 );
}
492
/** Fetch the dady (y-derivative) interpolation coefficient. */
static void
emit_coef_dady(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   emit_coef( func, xmm, vec, chan, 2 );
}
507
508 /**
509 * Function call helpers.
510 */
511
512 static void
513 emit_push_gp(
514 struct x86_function *func )
515 {
516 x86_push(
517 func,
518 x86_make_reg( file_REG32, reg_AX) );
519 x86_push(
520 func,
521 x86_make_reg( file_REG32, reg_CX) );
522 x86_push(
523 func,
524 x86_make_reg( file_REG32, reg_DX) );
525 }
526
527 static void
528 x86_pop_gp(
529 struct x86_function *func )
530 {
531 /* Restore GP registers in a reverse order.
532 */
533 x86_pop(
534 func,
535 x86_make_reg( file_REG32, reg_DX) );
536 x86_pop(
537 func,
538 x86_make_reg( file_REG32, reg_CX) );
539 x86_pop(
540 func,
541 x86_make_reg( file_REG32, reg_AX) );
542 }
543
/**
 * Emit a call to a C helper that modifies 4 packed floats in place.
 * The xmm_dst register is spilled to the TEMP_R0 scratch slot, a pointer
 * to that slot is passed as the single cdecl stack argument, and the
 * (possibly modified) slot is reloaded into xmm_dst afterwards.
 * eax/ecx/edx are preserved across the call.
 */
static void
emit_func_call_dst(
   struct x86_function *func,
   unsigned xmm_dst,
   void (PIPE_CDECL *code)() )
{
   /* Spill the operand to memory where the helper can reach it. */
   sse_movaps(
      func,
      get_temp( TEMP_R0, 0 ),
      make_xmm( xmm_dst ) );

   /* eax/ecx/edx are caller-saved under cdecl -- preserve them. */
   emit_push_gp(
      func );

   {
      struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );

      /* Push &TEMP_R0 as the helper's only argument. */
      x86_lea(
         func,
         ecx,
         get_temp( TEMP_R0, 0 ) );

      x86_push( func, ecx );
      x86_mov_reg_imm( func, ecx, (unsigned long) code );
      x86_call( func, ecx );
      /* cdecl: caller removes the argument from the stack. */
      x86_pop(func, ecx );
   }


   x86_pop_gp(
      func );

   /* Reload the in-place-modified result. */
   sse_movaps(
      func,
      make_xmm( xmm_dst ),
      get_temp( TEMP_R0, 0 ) );
}
581
582 static void
583 emit_func_call_dst_src(
584 struct x86_function *func,
585 unsigned xmm_dst,
586 unsigned xmm_src,
587 void (PIPE_CDECL *code)() )
588 {
589 sse_movaps(
590 func,
591 get_temp( TEMP_R0, 1 ),
592 make_xmm( xmm_src ) );
593
594 emit_func_call_dst(
595 func,
596 xmm_dst,
597 code );
598 }
599
600 /**
601 * Low-level instruction translators.
602 */
603
604 static void
605 emit_abs(
606 struct x86_function *func,
607 unsigned xmm )
608 {
609 sse_andps(
610 func,
611 make_xmm( xmm ),
612 get_temp(
613 TGSI_EXEC_TEMP_7FFFFFFF_I,
614 TGSI_EXEC_TEMP_7FFFFFFF_C ) );
615 }
616
/** xmm_dst += xmm_src (4 floats, addps). */
static void
emit_add(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   sse_addps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
628
629 static void PIPE_CDECL
630 cos4f(
631 float *store )
632 {
633 store[0] = cosf( store[0] );
634 store[1] = cosf( store[1] );
635 store[2] = cosf( store[2] );
636 store[3] = cosf( store[3] );
637 }
638
639 static void
640 emit_cos(
641 struct x86_function *func,
642 unsigned xmm_dst )
643 {
644 emit_func_call_dst(
645 func,
646 xmm_dst,
647 cos4f );
648 }
649
650 static void PIPE_CDECL
651 ex24f(
652 float *store )
653 {
654 #if FAST_MATH
655 store[0] = util_fast_exp2( store[0] );
656 store[1] = util_fast_exp2( store[1] );
657 store[2] = util_fast_exp2( store[2] );
658 store[3] = util_fast_exp2( store[3] );
659 #else
660 store[0] = powf( 2.0f, store[0] );
661 store[1] = powf( 2.0f, store[1] );
662 store[2] = powf( 2.0f, store[2] );
663 store[3] = powf( 2.0f, store[3] );
664 #endif
665 }
666
667 static void
668 emit_ex2(
669 struct x86_function *func,
670 unsigned xmm_dst )
671 {
672 emit_func_call_dst(
673 func,
674 xmm_dst,
675 ex24f );
676 }
677
/** Convert 4 floats to 4 ints in place (cvttps2dq, truncating). */
static void
emit_f2it(
   struct x86_function *func,
   unsigned xmm )
{
   sse2_cvttps2dq( func, make_xmm( xmm ), make_xmm( xmm ) );
}
688
/** Convert 4 ints to 4 floats in place (cvtdq2ps). */
static void
emit_i2f(
   struct x86_function *func,
   unsigned xmm )
{
   sse2_cvtdq2ps( func, make_xmm( xmm ), make_xmm( xmm ) );
}
699
700 static void PIPE_CDECL
701 flr4f(
702 float *store )
703 {
704 store[0] = floorf( store[0] );
705 store[1] = floorf( store[1] );
706 store[2] = floorf( store[2] );
707 store[3] = floorf( store[3] );
708 }
709
710 static void
711 emit_flr(
712 struct x86_function *func,
713 unsigned xmm_dst )
714 {
715 emit_func_call_dst(
716 func,
717 xmm_dst,
718 flr4f );
719 }
720
721 static void PIPE_CDECL
722 frc4f(
723 float *store )
724 {
725 store[0] -= floorf( store[0] );
726 store[1] -= floorf( store[1] );
727 store[2] -= floorf( store[2] );
728 store[3] -= floorf( store[3] );
729 }
730
731 static void
732 emit_frc(
733 struct x86_function *func,
734 unsigned xmm_dst )
735 {
736 emit_func_call_dst(
737 func,
738 xmm_dst,
739 frc4f );
740 }
741
742 static void PIPE_CDECL
743 lg24f(
744 float *store )
745 {
746 store[0] = util_fast_log2( store[0] );
747 store[1] = util_fast_log2( store[1] );
748 store[2] = util_fast_log2( store[2] );
749 store[3] = util_fast_log2( store[3] );
750 }
751
752 static void
753 emit_lg2(
754 struct x86_function *func,
755 unsigned xmm_dst )
756 {
757 emit_func_call_dst(
758 func,
759 xmm_dst,
760 lg24f );
761 }
762
/** xmm_dst = xmm_src (register-to-register movups). */
static void
emit_MOV(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   sse_movups( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
774
/** xmm_dst *= xmm_src (4 floats, mulps). */
static void
emit_mul (struct x86_function *func,
          unsigned xmm_dst,
          unsigned xmm_src)
{
   sse_mulps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
785
786 static void
787 emit_neg(
788 struct x86_function *func,
789 unsigned xmm )
790 {
791 sse_xorps(
792 func,
793 make_xmm( xmm ),
794 get_temp(
795 TGSI_EXEC_TEMP_80000000_I,
796 TGSI_EXEC_TEMP_80000000_C ) );
797 }
798
799 static void PIPE_CDECL
800 pow4f(
801 float *store )
802 {
803 #if FAST_MATH
804 store[0] = util_fast_pow( store[0], store[4] );
805 store[1] = util_fast_pow( store[1], store[5] );
806 store[2] = util_fast_pow( store[2], store[6] );
807 store[3] = util_fast_pow( store[3], store[7] );
808 #else
809 store[0] = powf( store[0], store[4] );
810 store[1] = powf( store[1], store[5] );
811 store[2] = powf( store[2], store[6] );
812 store[3] = powf( store[3], store[7] );
813 #endif
814 }
815
816 static void
817 emit_pow(
818 struct x86_function *func,
819 unsigned xmm_dst,
820 unsigned xmm_src )
821 {
822 emit_func_call_dst_src(
823 func,
824 xmm_dst,
825 xmm_src,
826 pow4f );
827 }
828
/** xmm_dst = approx 1/xmm_src.
 *
 * On Intel CPUs at least, rcpps is only accurate to 12 bits -- not
 * good enough.  Need to either emit a proper divide or use the
 * iterative technique described below in emit_rsqrt().
 */
static void
emit_rcp (
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   sse2_rcpps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
844
845 static void PIPE_CDECL
846 rnd4f(
847 float *store )
848 {
849 store[0] = floorf( store[0] + 0.5f );
850 store[1] = floorf( store[1] + 0.5f );
851 store[2] = floorf( store[2] + 0.5f );
852 store[3] = floorf( store[3] + 0.5f );
853 }
854
855 static void
856 emit_rnd(
857 struct x86_function *func,
858 unsigned xmm_save,
859 unsigned xmm_dst )
860 {
861 emit_func_call_dst(
862 func,
863 xmm_save,
864 xmm_dst,
865 rnd4f );
866 }
867
/**
 * Emit RSQ: xmm_dst = 1/sqrt(xmm_src).
 * With HIGH_PRECISION the raw rsqrtps estimate is refined by one
 * Newton-Raphson step.  Clobbers xmm2, xmm3 and xmm_src (asserted below),
 * so the caller must keep live values out of those registers.
 */
static void
emit_rsqrt(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
#if HIGH_PRECISION
   /* Although rsqrtps() and rcpps() are low precision on some/all SSE
    * implementations, it is possible to improve its precision at
    * fairly low cost, using a newton/raphson step, as below:
    *
    * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
    * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
    *
    * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
    */
   {
      struct x86_reg dst = make_xmm( xmm_dst );
      struct x86_reg src = make_xmm( xmm_src );
      struct x86_reg tmp0 = make_xmm( 2 );
      struct x86_reg tmp1 = make_xmm( 3 );

      assert( xmm_dst != xmm_src );
      assert( xmm_dst != 2 && xmm_dst != 3 );
      assert( xmm_src != 2 && xmm_src != 3 );

      sse_movaps( func, dst, get_temp( TGSI_EXEC_TEMP_HALF_I, TGSI_EXEC_TEMP_HALF_C ) );   /* dst = 0.5 */
      sse_movaps( func, tmp0, get_temp( TGSI_EXEC_TEMP_THREE_I, TGSI_EXEC_TEMP_THREE_C ) ); /* tmp0 = 3.0 */
      sse_rsqrtps( func, tmp1, src );       /* tmp1 = rsqrtps(a), the estimate */
      sse_mulps( func, src, tmp1 );         /* src = a * est */
      sse_mulps( func, dst, tmp1 );         /* dst = 0.5 * est */
      sse_mulps( func, src, tmp1 );         /* src = a * est * est */
      sse_subps( func, tmp0, src );         /* tmp0 = 3.0 - a*est*est */
      sse_mulps( func, dst, tmp0 );         /* dst = 0.5*est * (3.0 - a*est*est) */
   }
#else
   /* On Intel CPUs at least, this is only accurate to 12 bits -- not
    * good enough.
    */
   sse_rsqrtps(
      func,
      make_xmm( xmm_dst ),
      make_xmm( xmm_src ) );
#endif
}
913
914 static void
915 emit_setsign(
916 struct x86_function *func,
917 unsigned xmm )
918 {
919 sse_orps(
920 func,
921 make_xmm( xmm ),
922 get_temp(
923 TGSI_EXEC_TEMP_80000000_I,
924 TGSI_EXEC_TEMP_80000000_C ) );
925 }
926
927 static void PIPE_CDECL
928 sin4f(
929 float *store )
930 {
931 store[0] = sinf( store[0] );
932 store[1] = sinf( store[1] );
933 store[2] = sinf( store[2] );
934 store[3] = sinf( store[3] );
935 }
936
937 static void
938 emit_sin (struct x86_function *func,
939 unsigned xmm_dst)
940 {
941 emit_func_call_dst(
942 func,
943 xmm_dst,
944 sin4f );
945 }
946
/** xmm_dst -= xmm_src (4 floats, subps). */
static void
emit_sub(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   sse_subps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
958
959 /**
960 * Register fetch.
961 */
962
/**
 * Load one channel of a TGSI source operand into an xmm register,
 * honoring the extended swizzle (X/Y/Z/W/ZERO/ONE) and the operand's
 * sign mode (abs / set-sign / negate / keep).
 * \param xmm        destination xmm register
 * \param reg        full src register description
 * \param chan_index channel of the instruction being generated
 */
static void
emit_fetch(
   struct x86_function *func,
   unsigned xmm,
   const struct tgsi_full_src_register *reg,
   const unsigned chan_index )
{
   unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index );

   switch (swizzle) {
   case TGSI_EXTSWIZZLE_X:
   case TGSI_EXTSWIZZLE_Y:
   case TGSI_EXTSWIZZLE_Z:
   case TGSI_EXTSWIZZLE_W:
      /* Real component: dispatch on the source register file. */
      switch (reg->SrcRegister.File) {
      case TGSI_FILE_CONSTANT:
         emit_const(
            func,
            xmm,
            reg->SrcRegister.Index,
            swizzle,
            reg->SrcRegister.Indirect,
            reg->SrcRegisterInd.File,
            reg->SrcRegisterInd.Index );
         break;

      case TGSI_FILE_IMMEDIATE:
         emit_immediate(
            func,
            xmm,
            reg->SrcRegister.Index,
            swizzle );
         break;

      case TGSI_FILE_INPUT:
         emit_inputf(
            func,
            xmm,
            reg->SrcRegister.Index,
            swizzle );
         break;

      case TGSI_FILE_TEMPORARY:
         emit_tempf(
            func,
            xmm,
            reg->SrcRegister.Index,
            swizzle );
         break;

      default:
         assert( 0 );
      }
      break;

   case TGSI_EXTSWIZZLE_ZERO:
      /* Synthetic 0.0 component, read from the pre-loaded temp slot. */
      emit_tempf(
         func,
         xmm,
         TGSI_EXEC_TEMP_00000000_I,
         TGSI_EXEC_TEMP_00000000_C );
      break;

   case TGSI_EXTSWIZZLE_ONE:
      /* Synthetic 1.0 component. */
      emit_tempf(
         func,
         xmm,
         TEMP_ONE_I,
         TEMP_ONE_C );
      break;

   default:
      assert( 0 );
   }

   /* Apply the operand's sign mode to the fetched value. */
   switch( tgsi_util_get_full_src_register_sign_mode( reg, chan_index ) ) {
   case TGSI_UTIL_SIGN_CLEAR:
      emit_abs( func, xmm );
      break;

   case TGSI_UTIL_SIGN_SET:
      emit_setsign( func, xmm );
      break;

   case TGSI_UTIL_SIGN_TOGGLE:
      emit_neg( func, xmm );
      break;

   case TGSI_UTIL_SIGN_KEEP:
      break;
   }
}
1055
/** Fetch channel CHAN of src operand INDEX into register XMM (see emit_fetch()). */
#define FETCH( FUNC, INST, XMM, INDEX, CHAN )\
   emit_fetch( FUNC, XMM, &(INST).FullSrcRegisters[INDEX], CHAN )
1058
1059 /**
1060 * Register store.
1061 */
1062
/**
 * Store an xmm register to one channel of a TGSI destination operand.
 * \param xmm        source xmm register
 * \param reg        full dst register description
 * \param inst       instruction (consulted for the saturate mode)
 * \param chan_index channel being written
 *
 * NOTE: saturation is not implemented -- TGSI_SAT_ZERO_ONE is silently
 * ignored (the assert is commented out) and TGSI_SAT_MINUS_PLUS_ONE
 * asserts.
 */
static void
emit_store(
   struct x86_function *func,
   unsigned xmm,
   const struct tgsi_full_dst_register *reg,
   const struct tgsi_full_instruction *inst,
   unsigned chan_index )
{
   switch( reg->DstRegister.File ) {
   case TGSI_FILE_OUTPUT:
      emit_output(
         func,
         xmm,
         reg->DstRegister.Index,
         chan_index );
      break;

   case TGSI_FILE_TEMPORARY:
      emit_temps(
         func,
         xmm,
         reg->DstRegister.Index,
         chan_index );
      break;

   case TGSI_FILE_ADDRESS:
      emit_addrs(
         func,
         xmm,
         reg->DstRegister.Index,
         chan_index );
      break;

   default:
      assert( 0 );
   }

   switch( inst->Instruction.Saturate ) {
   case TGSI_SAT_NONE:
      break;

   case TGSI_SAT_ZERO_ONE:
      /* assert( 0 ); */
      break;

   case TGSI_SAT_MINUS_PLUS_ONE:
      assert( 0 );
      break;
   }
}
1113
/** Store register XMM to channel CHAN of dst operand INDEX (see emit_store()). */
#define STORE( FUNC, INST, XMM, INDEX, CHAN )\
   emit_store( FUNC, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN )
1116
1117 /**
1118 * High-level instruction translators.
1119 */
1120
/**
 * Emit TGSI KIL: for each distinct source component, compare against zero
 * and OR the resulting per-pixel mask bits into the kill-mask temp.
 * Components swizzled to ZERO or ONE are never < 0 and are skipped.
 * Uses eax/edx as scratch (saved/restored around the comparison loop).
 */
static void
emit_kil(
   struct x86_function *func,
   const struct tgsi_full_src_register *reg )
{
   unsigned uniquemask;
   unsigned registers[4];
   unsigned nextregister = 0;
   unsigned firstchan = ~0;
   unsigned chan_index;

   /* This mask stores component bits that were already tested. Note that
    * we test if the value is less than zero, so 1.0 and 0.0 need not to be
    * tested. */
   uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE);

   FOR_EACH_CHANNEL( chan_index ) {
      unsigned swizzle;

      /* unswizzle channel */
      swizzle = tgsi_util_get_full_src_register_extswizzle(
         reg,
         chan_index );

      /* check if the component has not been already tested */
      if( !(uniquemask & (1 << swizzle)) ) {
         uniquemask |= 1 << swizzle;

         /* allocate register */
         registers[chan_index] = nextregister;
         emit_fetch(
            func,
            nextregister,
            reg,
            chan_index );
         nextregister++;

         /* mark the first channel used */
         if( firstchan == ~0 ) {
            firstchan = chan_index;
         }
      }
   }

   x86_push(
      func,
      x86_make_reg( file_REG32, reg_AX ) );
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_DX ) );

   /* NOTE(review): this loop masks with chan_index while the bits above
    * were set by swizzle value; the two coincide only when the source is
    * unswizzled -- verify behavior for swizzled KIL operands.
    */
   FOR_EACH_CHANNEL( chan_index ) {
      if( uniquemask & (1 << chan_index) ) {
         /* mask = (component < 0) ? ~0 : 0, per pixel */
         sse_cmpps(
            func,
            make_xmm( registers[chan_index] ),
            get_temp(
               TGSI_EXEC_TEMP_00000000_I,
               TGSI_EXEC_TEMP_00000000_C ),
            cc_LessThan );

         if( chan_index == firstchan ) {
            /* First component: initialize the accumulated mask in eax. */
            sse_pmovmskb(
               func,
               x86_make_reg( file_REG32, reg_AX ),
               make_xmm( registers[chan_index] ) );
         }
         else {
            /* Subsequent components: OR their mask bits into eax via edx. */
            sse_pmovmskb(
               func,
               x86_make_reg( file_REG32, reg_DX ),
               make_xmm( registers[chan_index] ) );
            x86_or(
               func,
               x86_make_reg( file_REG32, reg_AX ),
               x86_make_reg( file_REG32, reg_DX ) );
         }
      }
   }

   /* Merge the new kill bits into the persistent kill-mask temp. */
   x86_or(
      func,
      get_temp(
         TGSI_EXEC_TEMP_KILMASK_I,
         TGSI_EXEC_TEMP_KILMASK_C ),
      x86_make_reg( file_REG32, reg_AX ) );

   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_DX ) );
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_AX ) );
}
1215
1216
/** Emit TGSI KILP (predicated kill) -- not implemented yet; emits nothing. */
static void
emit_kilp(
   struct x86_function *func )
{
   /* XXX todo / fix me */
}
1223
1224
/**
 * Emit a SET* comparison: dst = (src0 <cc> src1) ? 1.0 : 0.0 per enabled
 * channel.  cmpps produces an all-ones/all-zeros mask; ANDing with 1.0
 * converts that mask into a float boolean.
 */
static void
emit_setcc(
   struct x86_function *func,
   struct tgsi_full_instruction *inst,
   enum sse_cc cc )
{
   unsigned chan_index;

   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
      FETCH( func, *inst, 0, 0, chan_index );
      FETCH( func, *inst, 1, 1, chan_index );
      sse_cmpps(
         func,
         make_xmm( 0 ),
         make_xmm( 1 ),
         cc );
      /* mask & 1.0f -> 1.0f where true, 0.0f where false */
      sse_andps(
         func,
         make_xmm( 0 ),
         get_temp(
            TEMP_ONE_I,
            TEMP_ONE_C ) );
      STORE( func, *inst, 0, 0, chan_index );
   }
}
1250
/**
 * Emit TGSI CMP: dst = (src0 < 0.0) ? src1 : src2, per enabled channel.
 * Builds a less-than-zero mask in xmm0, then selects between src1 (AND)
 * and src2 (ANDN) and merges with OR.
 */
static void
emit_cmp(
   struct x86_function *func,
   struct tgsi_full_instruction *inst )
{
   unsigned chan_index;

   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
      FETCH( func, *inst, 0, 0, chan_index );
      FETCH( func, *inst, 1, 1, chan_index );
      FETCH( func, *inst, 2, 2, chan_index );
      /* xmm0 = (src0 < 0) ? ~0 : 0 */
      sse_cmpps(
         func,
         make_xmm( 0 ),
         get_temp(
            TGSI_EXEC_TEMP_00000000_I,
            TGSI_EXEC_TEMP_00000000_C ),
         cc_LessThan );
      /* xmm1 = src1 & mask */
      sse_andps(
         func,
         make_xmm( 1 ),
         make_xmm( 0 ) );
      /* xmm0 = ~mask & src2 */
      sse_andnps(
         func,
         make_xmm( 0 ),
         make_xmm( 2 ) );
      /* xmm0 = selected result */
      sse_orps(
         func,
         make_xmm( 0 ),
         make_xmm( 1 ) );
      STORE( func, *inst, 0, 0, chan_index );
   }
}
1284
1285 static int
1286 emit_instruction(
1287 struct x86_function *func,
1288 struct tgsi_full_instruction *inst )
1289 {
1290 unsigned chan_index;
1291
1292 switch (inst->Instruction.Opcode) {
1293 case TGSI_OPCODE_ARL:
1294 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1295 FETCH( func, *inst, 0, 0, chan_index );
1296 emit_f2it( func, 0 );
1297 STORE( func, *inst, 0, 0, chan_index );
1298 }
1299 break;
1300
1301 case TGSI_OPCODE_MOV:
1302 case TGSI_OPCODE_SWZ:
1303 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1304 FETCH( func, *inst, 0, 0, chan_index );
1305 STORE( func, *inst, 0, 0, chan_index );
1306 }
1307 break;
1308
1309 case TGSI_OPCODE_LIT:
1310 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1311 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1312 emit_tempf(
1313 func,
1314 0,
1315 TEMP_ONE_I,
1316 TEMP_ONE_C);
1317 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
1318 STORE( func, *inst, 0, 0, CHAN_X );
1319 }
1320 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1321 STORE( func, *inst, 0, 0, CHAN_W );
1322 }
1323 }
1324 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1325 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1326 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
1327 FETCH( func, *inst, 0, 0, CHAN_X );
1328 sse_maxps(
1329 func,
1330 make_xmm( 0 ),
1331 get_temp(
1332 TGSI_EXEC_TEMP_00000000_I,
1333 TGSI_EXEC_TEMP_00000000_C ) );
1334 STORE( func, *inst, 0, 0, CHAN_Y );
1335 }
1336 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1337 /* XMM[1] = SrcReg[0].yyyy */
1338 FETCH( func, *inst, 1, 0, CHAN_Y );
1339 /* XMM[1] = max(XMM[1], 0) */
1340 sse_maxps(
1341 func,
1342 make_xmm( 1 ),
1343 get_temp(
1344 TGSI_EXEC_TEMP_00000000_I,
1345 TGSI_EXEC_TEMP_00000000_C ) );
1346 /* XMM[2] = SrcReg[0].wwww */
1347 FETCH( func, *inst, 2, 0, CHAN_W );
1348 /* XMM[2] = min(XMM[2], 128.0) */
1349 sse_minps(
1350 func,
1351 make_xmm( 2 ),
1352 get_temp(
1353 TGSI_EXEC_TEMP_128_I,
1354 TGSI_EXEC_TEMP_128_C ) );
1355 /* XMM[2] = max(XMM[2], -128.0) */
1356 sse_maxps(
1357 func,
1358 make_xmm( 2 ),
1359 get_temp(
1360 TGSI_EXEC_TEMP_MINUS_128_I,
1361 TGSI_EXEC_TEMP_MINUS_128_C ) );
1362 emit_pow( func, 1, 2 );
1363 FETCH( func, *inst, 0, 0, CHAN_X );
1364 sse_xorps(
1365 func,
1366 make_xmm( 2 ),
1367 make_xmm( 2 ) );
1368 sse_cmpps(
1369 func,
1370 make_xmm( 2 ),
1371 make_xmm( 0 ),
1372 cc_LessThanEqual );
1373 sse_andps(
1374 func,
1375 make_xmm( 2 ),
1376 make_xmm( 1 ) );
1377 STORE( func, *inst, 2, 0, CHAN_Z );
1378 }
1379 }
1380 break;
1381
1382 case TGSI_OPCODE_RCP:
1383 /* TGSI_OPCODE_RECIP */
1384 FETCH( func, *inst, 0, 0, CHAN_X );
1385 emit_rcp( func, 0, 0 );
1386 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1387 STORE( func, *inst, 0, 0, chan_index );
1388 }
1389 break;
1390
1391 case TGSI_OPCODE_RSQ:
1392 /* TGSI_OPCODE_RECIPSQRT */
1393 FETCH( func, *inst, 0, 0, CHAN_X );
1394 emit_rsqrt( func, 1, 0 );
1395 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1396 STORE( func, *inst, 1, 0, chan_index );
1397 }
1398 break;
1399
1400 case TGSI_OPCODE_EXP:
1401 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1402 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1403 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1404 FETCH( func, *inst, 0, 0, CHAN_X );
1405 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1406 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1407 emit_MOV( func, 1, 0 );
1408 emit_flr( func, 1 );
1409 /* dst.x = ex2(floor(src.x)) */
1410 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
1411 emit_MOV( func, 2, 1 );
1412 emit_ex2( func, 2 );
1413 STORE( func, *inst, 2, 0, CHAN_X );
1414 }
1415 /* dst.y = src.x - floor(src.x) */
1416 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1417 emit_MOV( func, 2, 0 );
1418 emit_sub( func, 2, 1 );
1419 STORE( func, *inst, 2, 0, CHAN_Y );
1420 }
1421 }
1422 /* dst.z = ex2(src.x) */
1423 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1424 emit_ex2( func, 0 );
1425 STORE( func, *inst, 0, 0, CHAN_Z );
1426 }
1427 }
1428 /* dst.w = 1.0 */
1429 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
1430 emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
1431 STORE( func, *inst, 0, 0, CHAN_W );
1432 }
1433 break;
1434
1435 case TGSI_OPCODE_LOG:
1436 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1437 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1438 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1439 FETCH( func, *inst, 0, 0, CHAN_X );
1440 emit_abs( func, 0 );
1441 emit_MOV( func, 1, 0 );
1442 emit_lg2( func, 1 );
1443 /* dst.z = lg2(abs(src.x)) */
1444 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1445 STORE( func, *inst, 1, 0, CHAN_Z );
1446 }
1447 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1448 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1449 emit_flr( func, 1 );
1450 /* dst.x = floor(lg2(abs(src.x))) */
1451 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
1452 STORE( func, *inst, 1, 0, CHAN_X );
1453 }
1454 /* dst.x = abs(src)/ex2(floor(lg2(abs(src.x)))) */
1455 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1456 emit_ex2( func, 1 );
1457 emit_rcp( func, 1, 1 );
1458 emit_mul( func, 0, 1 );
1459 STORE( func, *inst, 0, 0, CHAN_Y );
1460 }
1461 }
1462 }
1463 /* dst.w = 1.0 */
1464 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
1465 emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
1466 STORE( func, *inst, 0, 0, CHAN_W );
1467 }
1468 break;
1469
1470 case TGSI_OPCODE_MUL:
1471 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1472 FETCH( func, *inst, 0, 0, chan_index );
1473 FETCH( func, *inst, 1, 1, chan_index );
1474 emit_mul( func, 0, 1 );
1475 STORE( func, *inst, 0, 0, chan_index );
1476 }
1477 break;
1478
1479 case TGSI_OPCODE_ADD:
1480 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1481 FETCH( func, *inst, 0, 0, chan_index );
1482 FETCH( func, *inst, 1, 1, chan_index );
1483 emit_add( func, 0, 1 );
1484 STORE( func, *inst, 0, 0, chan_index );
1485 }
1486 break;
1487
1488 case TGSI_OPCODE_DP3:
1489 /* TGSI_OPCODE_DOT3 */
1490 FETCH( func, *inst, 0, 0, CHAN_X );
1491 FETCH( func, *inst, 1, 1, CHAN_X );
1492 emit_mul( func, 0, 1 );
1493 FETCH( func, *inst, 1, 0, CHAN_Y );
1494 FETCH( func, *inst, 2, 1, CHAN_Y );
1495 emit_mul( func, 1, 2 );
1496 emit_add( func, 0, 1 );
1497 FETCH( func, *inst, 1, 0, CHAN_Z );
1498 FETCH( func, *inst, 2, 1, CHAN_Z );
1499 emit_mul( func, 1, 2 );
1500 emit_add( func, 0, 1 );
1501 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1502 STORE( func, *inst, 0, 0, chan_index );
1503 }
1504 break;
1505
1506 case TGSI_OPCODE_DP4:
1507 /* TGSI_OPCODE_DOT4 */
1508 FETCH( func, *inst, 0, 0, CHAN_X );
1509 FETCH( func, *inst, 1, 1, CHAN_X );
1510 emit_mul( func, 0, 1 );
1511 FETCH( func, *inst, 1, 0, CHAN_Y );
1512 FETCH( func, *inst, 2, 1, CHAN_Y );
1513 emit_mul( func, 1, 2 );
1514 emit_add( func, 0, 1 );
1515 FETCH( func, *inst, 1, 0, CHAN_Z );
1516 FETCH( func, *inst, 2, 1, CHAN_Z );
1517 emit_mul(func, 1, 2 );
1518 emit_add(func, 0, 1 );
1519 FETCH( func, *inst, 1, 0, CHAN_W );
1520 FETCH( func, *inst, 2, 1, CHAN_W );
1521 emit_mul( func, 1, 2 );
1522 emit_add( func, 0, 1 );
1523 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1524 STORE( func, *inst, 0, 0, chan_index );
1525 }
1526 break;
1527
1528 case TGSI_OPCODE_DST:
1529 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
1530 emit_tempf(
1531 func,
1532 0,
1533 TEMP_ONE_I,
1534 TEMP_ONE_C );
1535 STORE( func, *inst, 0, 0, CHAN_X );
1536 }
1537 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
1538 FETCH( func, *inst, 0, 0, CHAN_Y );
1539 FETCH( func, *inst, 1, 1, CHAN_Y );
1540 emit_mul( func, 0, 1 );
1541 STORE( func, *inst, 0, 0, CHAN_Y );
1542 }
1543 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
1544 FETCH( func, *inst, 0, 0, CHAN_Z );
1545 STORE( func, *inst, 0, 0, CHAN_Z );
1546 }
1547 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
1548 FETCH( func, *inst, 0, 1, CHAN_W );
1549 STORE( func, *inst, 0, 0, CHAN_W );
1550 }
1551 break;
1552
1553 case TGSI_OPCODE_MIN:
1554 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1555 FETCH( func, *inst, 0, 0, chan_index );
1556 FETCH( func, *inst, 1, 1, chan_index );
1557 sse_minps(
1558 func,
1559 make_xmm( 0 ),
1560 make_xmm( 1 ) );
1561 STORE( func, *inst, 0, 0, chan_index );
1562 }
1563 break;
1564
1565 case TGSI_OPCODE_MAX:
1566 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1567 FETCH( func, *inst, 0, 0, chan_index );
1568 FETCH( func, *inst, 1, 1, chan_index );
1569 sse_maxps(
1570 func,
1571 make_xmm( 0 ),
1572 make_xmm( 1 ) );
1573 STORE( func, *inst, 0, 0, chan_index );
1574 }
1575 break;
1576
1577 case TGSI_OPCODE_SLT:
1578 /* TGSI_OPCODE_SETLT */
1579 emit_setcc( func, inst, cc_LessThan );
1580 break;
1581
1582 case TGSI_OPCODE_SGE:
1583 /* TGSI_OPCODE_SETGE */
1584 emit_setcc( func, inst, cc_NotLessThan );
1585 break;
1586
1587 case TGSI_OPCODE_MAD:
1588 /* TGSI_OPCODE_MADD */
1589 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1590 FETCH( func, *inst, 0, 0, chan_index );
1591 FETCH( func, *inst, 1, 1, chan_index );
1592 FETCH( func, *inst, 2, 2, chan_index );
1593 emit_mul( func, 0, 1 );
1594 emit_add( func, 0, 2 );
1595 STORE( func, *inst, 0, 0, chan_index );
1596 }
1597 break;
1598
1599 case TGSI_OPCODE_SUB:
1600 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1601 FETCH( func, *inst, 0, 0, chan_index );
1602 FETCH( func, *inst, 1, 1, chan_index );
1603 emit_sub( func, 0, 1 );
1604 STORE( func, *inst, 0, 0, chan_index );
1605 }
1606 break;
1607
1608 case TGSI_OPCODE_LERP:
1609 /* TGSI_OPCODE_LRP */
1610 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1611 FETCH( func, *inst, 0, 0, chan_index );
1612 FETCH( func, *inst, 1, 1, chan_index );
1613 FETCH( func, *inst, 2, 2, chan_index );
1614 emit_sub( func, 1, 2 );
1615 emit_mul( func, 0, 1 );
1616 emit_add( func, 0, 2 );
1617 STORE( func, *inst, 0, 0, chan_index );
1618 }
1619 break;
1620
1621 case TGSI_OPCODE_CND:
1622 return 0;
1623 break;
1624
1625 case TGSI_OPCODE_CND0:
1626 return 0;
1627 break;
1628
1629 case TGSI_OPCODE_DOT2ADD:
1630 /* TGSI_OPCODE_DP2A */
1631 return 0;
1632 break;
1633
1634 case TGSI_OPCODE_INDEX:
1635 return 0;
1636 break;
1637
1638 case TGSI_OPCODE_NEGATE:
1639 return 0;
1640 break;
1641
1642 case TGSI_OPCODE_FRAC:
1643 /* TGSI_OPCODE_FRC */
1644 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1645 FETCH( func, *inst, 0, 0, chan_index );
1646 emit_frc( func, 0 );
1647 STORE( func, *inst, 0, 0, chan_index );
1648 }
1649 break;
1650
1651 case TGSI_OPCODE_CLAMP:
1652 return 0;
1653 break;
1654
1655 case TGSI_OPCODE_FLOOR:
1656 /* TGSI_OPCODE_FLR */
1657 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1658 FETCH( func, *inst, 0, 0, chan_index );
1659 emit_flr( func, 0 );
1660 STORE( func, *inst, 0, 0, chan_index );
1661 }
1662 break;
1663
1664 case TGSI_OPCODE_ROUND:
1665 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1666 FETCH( func, *inst, 0, 0, chan_index );
1667 emit_rnd( func, 0, 0 );
1668 STORE( func, *inst, 0, 0, chan_index );
1669 }
1670 break;
1671
1672 case TGSI_OPCODE_EXPBASE2:
1673 /* TGSI_OPCODE_EX2 */
1674 FETCH( func, *inst, 0, 0, CHAN_X );
1675 emit_ex2( func, 0 );
1676 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1677 STORE( func, *inst, 0, 0, chan_index );
1678 }
1679 break;
1680
1681 case TGSI_OPCODE_LOGBASE2:
1682 /* TGSI_OPCODE_LG2 */
1683 FETCH( func, *inst, 0, 0, CHAN_X );
1684 emit_lg2( func, 0 );
1685 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1686 STORE( func, *inst, 0, 0, chan_index );
1687 }
1688 break;
1689
1690 case TGSI_OPCODE_POWER:
1691 /* TGSI_OPCODE_POW */
1692 FETCH( func, *inst, 0, 0, CHAN_X );
1693 FETCH( func, *inst, 1, 1, CHAN_X );
1694 emit_pow( func, 0, 1 );
1695 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1696 STORE( func, *inst, 0, 0, chan_index );
1697 }
1698 break;
1699
1700 case TGSI_OPCODE_CROSSPRODUCT:
1701 /* TGSI_OPCODE_XPD */
1702 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1703 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
1704 FETCH( func, *inst, 1, 1, CHAN_Z );
1705 FETCH( func, *inst, 3, 0, CHAN_Z );
1706 }
1707 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1708 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1709 FETCH( func, *inst, 0, 0, CHAN_Y );
1710 FETCH( func, *inst, 4, 1, CHAN_Y );
1711 }
1712 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
1713 emit_MOV( func, 2, 0 );
1714 emit_mul( func, 2, 1 );
1715 emit_MOV( func, 5, 3 );
1716 emit_mul( func, 5, 4 );
1717 emit_sub( func, 2, 5 );
1718 STORE( func, *inst, 2, 0, CHAN_X );
1719 }
1720 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1721 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1722 FETCH( func, *inst, 2, 1, CHAN_X );
1723 FETCH( func, *inst, 5, 0, CHAN_X );
1724 }
1725 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
1726 emit_mul( func, 3, 2 );
1727 emit_mul( func, 1, 5 );
1728 emit_sub( func, 3, 1 );
1729 STORE( func, *inst, 3, 0, CHAN_Y );
1730 }
1731 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
1732 emit_mul( func, 5, 4 );
1733 emit_mul( func, 0, 2 );
1734 emit_sub( func, 5, 0 );
1735 STORE( func, *inst, 5, 0, CHAN_Z );
1736 }
1737 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
1738 emit_tempf(
1739 func,
1740 0,
1741 TEMP_ONE_I,
1742 TEMP_ONE_C );
1743 STORE( func, *inst, 0, 0, CHAN_W );
1744 }
1745 break;
1746
1747 case TGSI_OPCODE_MULTIPLYMATRIX:
1748 return 0;
1749 break;
1750
1751 case TGSI_OPCODE_ABS:
1752 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1753 FETCH( func, *inst, 0, 0, chan_index );
1754 emit_abs( func, 0) ;
1755
1756 STORE( func, *inst, 0, 0, chan_index );
1757 }
1758 break;
1759
1760 case TGSI_OPCODE_RCC:
1761 return 0;
1762 break;
1763
1764 case TGSI_OPCODE_DPH:
1765 FETCH( func, *inst, 0, 0, CHAN_X );
1766 FETCH( func, *inst, 1, 1, CHAN_X );
1767 emit_mul( func, 0, 1 );
1768 FETCH( func, *inst, 1, 0, CHAN_Y );
1769 FETCH( func, *inst, 2, 1, CHAN_Y );
1770 emit_mul( func, 1, 2 );
1771 emit_add( func, 0, 1 );
1772 FETCH( func, *inst, 1, 0, CHAN_Z );
1773 FETCH( func, *inst, 2, 1, CHAN_Z );
1774 emit_mul( func, 1, 2 );
1775 emit_add( func, 0, 1 );
1776 FETCH( func, *inst, 1, 1, CHAN_W );
1777 emit_add( func, 0, 1 );
1778 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1779 STORE( func, *inst, 0, 0, chan_index );
1780 }
1781 break;
1782
1783 case TGSI_OPCODE_COS:
1784 FETCH( func, *inst, 0, 0, CHAN_X );
1785 emit_cos( func, 0 );
1786 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1787 STORE( func, *inst, 0, 0, chan_index );
1788 }
1789 break;
1790
1791 case TGSI_OPCODE_DDX:
1792 return 0;
1793 break;
1794
1795 case TGSI_OPCODE_DDY:
1796 return 0;
1797 break;
1798
1799 case TGSI_OPCODE_KILP:
1800 /* predicated kill */
1801 emit_kilp( func );
1802 return 0; /* XXX fix me */
1803 break;
1804
1805 case TGSI_OPCODE_KIL:
1806 /* conditional kill */
1807 emit_kil( func, &inst->FullSrcRegisters[0] );
1808 break;
1809
1810 case TGSI_OPCODE_PK2H:
1811 return 0;
1812 break;
1813
1814 case TGSI_OPCODE_PK2US:
1815 return 0;
1816 break;
1817
1818 case TGSI_OPCODE_PK4B:
1819 return 0;
1820 break;
1821
1822 case TGSI_OPCODE_PK4UB:
1823 return 0;
1824 break;
1825
1826 case TGSI_OPCODE_RFL:
1827 return 0;
1828 break;
1829
1830 case TGSI_OPCODE_SEQ:
1831 return 0;
1832 break;
1833
1834 case TGSI_OPCODE_SFL:
1835 return 0;
1836 break;
1837
1838 case TGSI_OPCODE_SGT:
1839 return 0;
1840 break;
1841
1842 case TGSI_OPCODE_SIN:
1843 FETCH( func, *inst, 0, 0, CHAN_X );
1844 emit_sin( func, 0 );
1845 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1846 STORE( func, *inst, 0, 0, chan_index );
1847 }
1848 break;
1849
1850 case TGSI_OPCODE_SLE:
1851 return 0;
1852 break;
1853
1854 case TGSI_OPCODE_SNE:
1855 return 0;
1856 break;
1857
1858 case TGSI_OPCODE_STR:
1859 return 0;
1860 break;
1861
1862 case TGSI_OPCODE_TEX:
1863 if (0) {
1864 /* Disable dummy texture code:
1865 */
1866 emit_tempf(
1867 func,
1868 0,
1869 TEMP_ONE_I,
1870 TEMP_ONE_C );
1871 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1872 STORE( func, *inst, 0, 0, chan_index );
1873 }
1874 }
1875 else {
1876 return 0;
1877 }
1878 break;
1879
1880 case TGSI_OPCODE_TXD:
1881 return 0;
1882 break;
1883
1884 case TGSI_OPCODE_UP2H:
1885 return 0;
1886 break;
1887
1888 case TGSI_OPCODE_UP2US:
1889 return 0;
1890 break;
1891
1892 case TGSI_OPCODE_UP4B:
1893 return 0;
1894 break;
1895
1896 case TGSI_OPCODE_UP4UB:
1897 return 0;
1898 break;
1899
1900 case TGSI_OPCODE_X2D:
1901 return 0;
1902 break;
1903
1904 case TGSI_OPCODE_ARA:
1905 return 0;
1906 break;
1907
1908 case TGSI_OPCODE_ARR:
1909 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1910 FETCH( func, *inst, 0, 0, chan_index );
1911 emit_rnd( func, 0, 0 );
1912 emit_f2it( func, 0 );
1913 STORE( func, *inst, 0, 0, chan_index );
1914 }
1915 break;
1916
1917 case TGSI_OPCODE_BRA:
1918 return 0;
1919 break;
1920
1921 case TGSI_OPCODE_CAL:
1922 return 0;
1923 break;
1924
1925 case TGSI_OPCODE_RET:
1926 emit_ret( func );
1927 break;
1928
1929 case TGSI_OPCODE_END:
1930 break;
1931
1932 case TGSI_OPCODE_SSG:
1933 return 0;
1934 break;
1935
1936 case TGSI_OPCODE_CMP:
1937 emit_cmp (func, inst);
1938 break;
1939
1940 case TGSI_OPCODE_SCS:
1941 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
1942 FETCH( func, *inst, 0, 0, CHAN_X );
1943 emit_cos( func, 0 );
1944 STORE( func, *inst, 0, 0, CHAN_X );
1945 }
1946 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
1947 FETCH( func, *inst, 0, 0, CHAN_X );
1948 emit_sin( func, 0 );
1949 STORE( func, *inst, 0, 0, CHAN_Y );
1950 }
1951 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
1952 emit_tempf(
1953 func,
1954 0,
1955 TGSI_EXEC_TEMP_00000000_I,
1956 TGSI_EXEC_TEMP_00000000_C );
1957 STORE( func, *inst, 0, 0, CHAN_Z );
1958 }
1959 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
1960 emit_tempf(
1961 func,
1962 0,
1963 TEMP_ONE_I,
1964 TEMP_ONE_C );
1965 STORE( func, *inst, 0, 0, CHAN_W );
1966 }
1967 break;
1968
1969 case TGSI_OPCODE_TXB:
1970 return 0;
1971 break;
1972
1973 case TGSI_OPCODE_NRM:
1974 return 0;
1975 break;
1976
1977 case TGSI_OPCODE_DIV:
1978 return 0;
1979 break;
1980
1981 case TGSI_OPCODE_DP2:
1982 return 0;
1983 break;
1984
1985 case TGSI_OPCODE_TXL:
1986 return 0;
1987 break;
1988
1989 case TGSI_OPCODE_BRK:
1990 return 0;
1991 break;
1992
1993 case TGSI_OPCODE_IF:
1994 return 0;
1995 break;
1996
1997 case TGSI_OPCODE_LOOP:
1998 return 0;
1999 break;
2000
2001 case TGSI_OPCODE_REP:
2002 return 0;
2003 break;
2004
2005 case TGSI_OPCODE_ELSE:
2006 return 0;
2007 break;
2008
2009 case TGSI_OPCODE_ENDIF:
2010 return 0;
2011 break;
2012
2013 case TGSI_OPCODE_ENDLOOP:
2014 return 0;
2015 break;
2016
2017 case TGSI_OPCODE_ENDREP:
2018 return 0;
2019 break;
2020
2021 case TGSI_OPCODE_PUSHA:
2022 return 0;
2023 break;
2024
2025 case TGSI_OPCODE_POPA:
2026 return 0;
2027 break;
2028
2029 case TGSI_OPCODE_CEIL:
2030 return 0;
2031 break;
2032
2033 case TGSI_OPCODE_I2F:
2034 return 0;
2035 break;
2036
2037 case TGSI_OPCODE_NOT:
2038 return 0;
2039 break;
2040
2041 case TGSI_OPCODE_TRUNC:
2042 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2043 FETCH( func, *inst, 0, 0, chan_index );
2044 emit_f2it( func, 0 );
2045 emit_i2f( func, 0 );
2046 STORE( func, *inst, 0, 0, chan_index );
2047 }
2048 break;
2049
2050 case TGSI_OPCODE_SHL:
2051 return 0;
2052 break;
2053
2054 case TGSI_OPCODE_SHR:
2055 return 0;
2056 break;
2057
2058 case TGSI_OPCODE_AND:
2059 return 0;
2060 break;
2061
2062 case TGSI_OPCODE_OR:
2063 return 0;
2064 break;
2065
2066 case TGSI_OPCODE_MOD:
2067 return 0;
2068 break;
2069
2070 case TGSI_OPCODE_XOR:
2071 return 0;
2072 break;
2073
2074 case TGSI_OPCODE_SAD:
2075 return 0;
2076 break;
2077
2078 case TGSI_OPCODE_TXF:
2079 return 0;
2080 break;
2081
2082 case TGSI_OPCODE_TXQ:
2083 return 0;
2084 break;
2085
2086 case TGSI_OPCODE_CONT:
2087 return 0;
2088 break;
2089
2090 case TGSI_OPCODE_EMIT:
2091 return 0;
2092 break;
2093
2094 case TGSI_OPCODE_ENDPRIM:
2095 return 0;
2096 break;
2097
2098 default:
2099 return 0;
2100 }
2101
2102 return 1;
2103 }
2104
/**
 * Emit per-fragment interpolation code for one input declaration.
 *
 * Only TGSI_FILE_INPUT declarations generate code; everything else is
 * ignored here.  For each attribute in the declared range and each
 * channel enabled in the usage mask, code is emitted to evaluate the
 * plane equation a0 + x*dadx + y*dady (optionally divided by w for
 * perspective correction) and store the result into the input file.
 *
 * NOTE(review): xmm registers 0-5 are used as scratch in a fixed
 * pattern per interpolation mode; the emit ordering below is
 * significant and must not be reordered.
 *
 * \param func  output x86 code buffer being appended to
 * \param decl  parsed TGSI declaration token
 */
static void
emit_declaration(
   struct x86_function *func,
   struct tgsi_full_declaration *decl )
{
   if( decl->Declaration.File == TGSI_FILE_INPUT ) {
      unsigned first, last, mask;
      unsigned i, j;

      first = decl->DeclarationRange.First;
      last = decl->DeclarationRange.Last;
      mask = decl->Declaration.UsageMask;

      /* One pass per declared attribute, one pass per enabled channel. */
      for( i = first; i <= last; i++ ) {
         for( j = 0; j < NUM_CHANNELS; j++ ) {
            if( mask & (1 << j) ) {
               switch( decl->Declaration.Interpolate ) {
               case TGSI_INTERPOLATE_CONSTANT:
                  /* Flat shading: attribute = a0, no gradient terms. */
                  emit_coef_a0( func, 0, i, j );
                  emit_inputs( func, 0, i, j );
                  break;

               case TGSI_INTERPOLATE_LINEAR:
                  /* attr = a0 + x*dadx + y*dady
                   * (x and y are fetched from temp 0, presumably the
                   * machine's position temp -- see emit_tempf usage.)
                   */
                  emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
                  emit_coef_dadx( func, 1, i, j );
                  emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
                  emit_coef_dady( func, 3, i, j );
                  emit_mul( func, 0, 1 );    /* x * dadx */
                  emit_coef_a0( func, 4, i, j );
                  emit_mul( func, 2, 3 );    /* y * dady */
                  emit_add( func, 0, 4 );    /* x * dadx + a0 */
                  emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
                  emit_inputs( func, 0, i, j );
                  break;

               case TGSI_INTERPOLATE_PERSPECTIVE:
                  /* attr = (a0 + x*dadx + y*dady) / w */
                  emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
                  emit_coef_dadx( func, 1, i, j );
                  emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
                  emit_coef_dady( func, 3, i, j );
                  emit_mul( func, 0, 1 );    /* x * dadx */
                  emit_tempf( func, 4, 0, TGSI_SWIZZLE_W );
                  emit_coef_a0( func, 5, i, j );
                  emit_rcp( func, 4, 4 );    /* 1.0 / w */
                  emit_mul( func, 2, 3 );    /* y * dady */
                  emit_add( func, 0, 5 );    /* x * dadx + a0 */
                  emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
                  emit_mul( func, 0, 4 );    /* (x * dadx + y * dady + a0) / w */
                  emit_inputs( func, 0, i, j );
                  break;

               default:
                  assert( 0 );
                  break;
               }
            }
         }
      }
   }
}
2165
2166 static void aos_to_soa( struct x86_function *func,
2167 uint arg_aos,
2168 uint arg_soa,
2169 uint arg_num,
2170 uint arg_stride )
2171 {
2172 struct x86_reg soa_input = x86_make_reg( file_REG32, reg_AX );
2173 struct x86_reg aos_input = x86_make_reg( file_REG32, reg_BX );
2174 struct x86_reg num_inputs = x86_make_reg( file_REG32, reg_CX );
2175 struct x86_reg stride = x86_make_reg( file_REG32, reg_DX );
2176 int inner_loop;
2177
2178
2179 /* Save EBX */
2180 x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
2181
2182 x86_mov( func, aos_input, x86_fn_arg( func, arg_aos ) );
2183 x86_mov( func, soa_input, x86_fn_arg( func, arg_soa ) );
2184 x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) );
2185 x86_mov( func, stride, x86_fn_arg( func, arg_stride ) );
2186
2187 /* do */
2188 inner_loop = x86_get_label( func );
2189 {
2190 x86_push( func, aos_input );
2191 sse_movlps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
2192 sse_movlps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
2193 x86_add( func, aos_input, stride );
2194 sse_movhps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
2195 sse_movhps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
2196 x86_add( func, aos_input, stride );
2197 sse_movlps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
2198 sse_movlps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
2199 x86_add( func, aos_input, stride );
2200 sse_movhps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
2201 sse_movhps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
2202 x86_pop( func, aos_input );
2203
2204 sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
2205 sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
2206 sse_shufps( func, make_xmm( 0 ), make_xmm( 1 ), 0x88 );
2207 sse_shufps( func, make_xmm( 2 ), make_xmm( 1 ), 0xdd );
2208 sse_shufps( func, make_xmm( 3 ), make_xmm( 4 ), 0x88 );
2209 sse_shufps( func, make_xmm( 5 ), make_xmm( 4 ), 0xdd );
2210
2211 sse_movups( func, x86_make_disp( soa_input, 0 ), make_xmm( 0 ) );
2212 sse_movups( func, x86_make_disp( soa_input, 16 ), make_xmm( 2 ) );
2213 sse_movups( func, x86_make_disp( soa_input, 32 ), make_xmm( 3 ) );
2214 sse_movups( func, x86_make_disp( soa_input, 48 ), make_xmm( 5 ) );
2215
2216 /* Advance to next input */
2217 x86_lea( func, aos_input, x86_make_disp(aos_input, 16) );
2218 x86_lea( func, soa_input, x86_make_disp(soa_input, 64) );
2219 }
2220 /* while --num_inputs */
2221 x86_dec( func, num_inputs );
2222 x86_jcc( func, cc_NE, inner_loop );
2223
2224 /* Restore EBX */
2225 x86_pop( func, aos_input );
2226 }
2227
2228 static void soa_to_aos( struct x86_function *func, uint aos, uint soa, uint num, uint stride )
2229 {
2230 struct x86_reg soa_output;
2231 struct x86_reg aos_output;
2232 struct x86_reg num_outputs;
2233 struct x86_reg temp;
2234 int inner_loop;
2235
2236 soa_output = x86_make_reg( file_REG32, reg_AX );
2237 aos_output = x86_make_reg( file_REG32, reg_BX );
2238 num_outputs = x86_make_reg( file_REG32, reg_CX );
2239 temp = x86_make_reg( file_REG32, reg_DX );
2240
2241 /* Save EBX */
2242 x86_push( func, aos_output );
2243
2244 x86_mov( func, soa_output, x86_fn_arg( func, soa ) );
2245 x86_mov( func, aos_output, x86_fn_arg( func, aos ) );
2246 x86_mov( func, num_outputs, x86_fn_arg( func, num ) );
2247
2248 /* do */
2249 inner_loop = x86_get_label( func );
2250 {
2251 sse_movups( func, make_xmm( 0 ), x86_make_disp( soa_output, 0 ) );
2252 sse_movups( func, make_xmm( 1 ), x86_make_disp( soa_output, 16 ) );
2253 sse_movups( func, make_xmm( 3 ), x86_make_disp( soa_output, 32 ) );
2254 sse_movups( func, make_xmm( 4 ), x86_make_disp( soa_output, 48 ) );
2255
2256 sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
2257 sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
2258 sse_unpcklps( func, make_xmm( 0 ), make_xmm( 1 ) );
2259 sse_unpckhps( func, make_xmm( 2 ), make_xmm( 1 ) );
2260 sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) );
2261 sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) );
2262
2263 x86_mov( func, temp, x86_fn_arg( func, stride ) );
2264 x86_push( func, aos_output );
2265 sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
2266 sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
2267 x86_add( func, aos_output, temp );
2268 sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
2269 sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
2270 x86_add( func, aos_output, temp );
2271 sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
2272 sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
2273 x86_add( func, aos_output, temp );
2274 sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
2275 sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
2276 x86_pop( func, aos_output );
2277
2278 /* Advance to next output */
2279 x86_lea( func, aos_output, x86_make_disp(aos_output, 16) );
2280 x86_lea( func, soa_output, x86_make_disp(soa_output, 64) );
2281 }
2282 /* while --num_outputs */
2283 x86_dec( func, num_outputs );
2284 x86_jcc( func, cc_NE, inner_loop );
2285
2286 /* Restore EBX */
2287 x86_pop( func, aos_output );
2288 }
2289
2290 /**
2291 * Translate a TGSI vertex/fragment shader to SSE2 code.
2292 * Slightly different things are done for vertex vs. fragment shaders.
2293 *
2294 * Note that fragment shaders are responsible for interpolating shader
2295 * inputs. Because on x86 we have only 4 GP registers, and here we
2296 * have 5 shader arguments (input, output, const, temp and coef), the
2297 * code is split into two phases -- DECLARATION and INSTRUCTION phase.
2298 * GP register holding the output argument is aliased with the coeff
2299 * argument, as outputs are not needed in the DECLARATION phase.
2300 *
2301 * \param tokens the TGSI input shader
2302 * \param func the output SSE code/function
2303 * \param immediates buffer to place immediates, later passed to SSE func
 * \return 1 for success, 0 if translation failed
2305 */
unsigned
tgsi_emit_sse2(
   const struct tgsi_token *tokens,
   struct x86_function *func,
   float (*immediates)[4],
   boolean do_swizzles )
{
   struct tgsi_parse_context parse;
   boolean instruction_phase = FALSE;  /* FS only: set once coef base is
                                        * overwritten with the output base */
   unsigned ok = 1;
   uint num_immediates = 0;

   util_init_math();

   /* Start emitting at the beginning of the code store. */
   func->csr = func->store;

   tgsi_parse_init( &parse, tokens );

   /* Can't just use EDI, EBX without save/restoring them:
    */
   x86_push(
      func,
      get_immediate_base() );

   x86_push(
      func,
      get_temp_base() );


   /*
    * Different function args for vertex/fragment shaders:
    */
   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
      /* DECLARATION phase, do not load output argument.
       * FS args: 1=inputs, 2=outputs, 3=consts, 4=temps, 5=coefs,
       * 6=immediates.  Output (arg 2) shares a GP register with the
       * coef base and is loaded later, in the INSTRUCTION phase.
       */
      x86_mov(
         func,
         get_input_base(),
         x86_fn_arg( func, 1 ) );
      /* skipping outputs argument here */
      x86_mov(
         func,
         get_const_base(),
         x86_fn_arg( func, 3 ) );
      x86_mov(
         func,
         get_temp_base(),
         x86_fn_arg( func, 4 ) );
      x86_mov(
         func,
         get_coef_base(),
         x86_fn_arg( func, 5 ) );
      x86_mov(
         func,
         get_immediate_base(),
         x86_fn_arg( func, 6 ) );
   }
   else {
      assert(parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX);

      /* VS args: 1=inputs, 2=outputs, 3=consts, 4=temps,
       * 5=immediates, 6=aos_input, 7=num_inputs, 8=input_stride
       * (and 9-11 for the outgoing soa_to_aos below).
       */
      if (do_swizzles)
         aos_to_soa( func,
                     6,   /* aos_input */
                     1,   /* machine->input */
                     7,   /* num_inputs */
                     8 ); /* input_stride */

      x86_mov(
         func,
         get_input_base(),
         x86_fn_arg( func, 1 ) );
      x86_mov(
         func,
         get_output_base(),
         x86_fn_arg( func, 2 ) );
      x86_mov(
         func,
         get_const_base(),
         x86_fn_arg( func, 3 ) );
      x86_mov(
         func,
         get_temp_base(),
         x86_fn_arg( func, 4 ) );
      x86_mov(
         func,
         get_immediate_base(),
         x86_fn_arg( func, 5 ) );
   }

   /* Walk the token stream; stop early if an opcode can't be translated. */
   while( !tgsi_parse_end_of_tokens( &parse ) && ok ) {
      tgsi_parse_token( &parse );

      switch( parse.FullToken.Token.Type ) {
      case TGSI_TOKEN_TYPE_DECLARATION:
         /* Only FS declarations emit code (input interpolation);
          * VS inputs need no setup here.
          */
         if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
            emit_declaration(
               func,
               &parse.FullToken.FullDeclaration );
         }
         break;

      case TGSI_TOKEN_TYPE_INSTRUCTION:
         if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
            if( !instruction_phase ) {
               /* INSTRUCTION phase, overwrite coeff with output.
                * Coefs are only read during declarations, so their
                * register can now hold the output pointer (arg 2).
                */
               instruction_phase = TRUE;
               x86_mov(
                  func,
                  get_output_base(),
                  x86_fn_arg( func, 2 ) );
            }
         }

         ok = emit_instruction(
            func,
            &parse.FullToken.FullInstruction );

         if (!ok) {
            debug_printf("failed to translate tgsi opcode %d to SSE (%s)\n",
                         parse.FullToken.FullInstruction.Instruction.Opcode,
                         parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX ?
                         "vertex shader" : "fragment shader");
         }
         break;

      case TGSI_TOKEN_TYPE_IMMEDIATE:
         /* simply copy the immediate values into the next immediates[] slot */
         {
            /* NOTE(review): Immediate.Size appears to include the
             * immediate header token, hence the -1 -- confirm against
             * tgsi token docs.
             */
            const uint size = parse.FullToken.FullImmediate.Immediate.Size - 1;
            uint i;
            assert(size <= 4);
            assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES);
            for( i = 0; i < size; i++ ) {
               immediates[num_immediates][i] =
                  parse.FullToken.FullImmediate.u.ImmediateFloat32[i].Float;
            }
#if 0
            debug_printf("SSE FS immediate[%d] = %f %f %f %f\n",
                         num_immediates,
                         immediates[num_immediates][0],
                         immediates[num_immediates][1],
                         immediates[num_immediates][2],
                         immediates[num_immediates][3]);
#endif
            num_immediates++;
         }
         break;

      default:
         ok = 0;
         assert( 0 );
      }
   }

   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
      /* Transpose VS results back to AoS; args 9=aos_output,
       * 2=machine->output, 10=num_outputs, 11=output_stride.
       */
      if (do_swizzles)
         soa_to_aos( func, 9, 2, 10, 11 );
   }

   /* Can't just use EBX, EDI without save/restoring them:
    */
   x86_pop(
      func,
      get_temp_base() );

   x86_pop(
      func,
      get_immediate_base() );

   emit_ret( func );

   tgsi_parse_free( &parse );

   return ok;
}
2480
2481 #endif /* PIPE_ARCH_X86 */
2482