1 /**************************************************************************
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **************************************************************************/
28 #include "pipe/p_config.h"
30 #if defined(PIPE_ARCH_X86)
32 #include "util/u_debug.h"
33 #include "pipe/p_shader_tokens.h"
34 #include "util/u_math.h"
35 #if defined(PIPE_ARCH_SSE)
36 #include "util/u_sse.h"
38 #include "tgsi/tgsi_parse.h"
39 #include "tgsi/tgsi_util.h"
40 #include "tgsi_exec.h"
41 #include "tgsi_sse2.h"
43 #include "rtasm/rtasm_x86sse.h"
47 * This costs about 100fps (close to 10%) in gears:
49 #define HIGH_PRECISION 1
/* Iterate CHAN over all four SOA channels (X, Y, Z, W) of a register. */
#define FOR_EACH_CHANNEL( CHAN )\
   for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)

/* Non-zero if channel CHAN is enabled in dst register 0's writemask. */
#define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
   ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))

/* Guard the following statement on CHAN being write-enabled in dst 0. */
#define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
   if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))

/* Iterate CHAN over only the channels enabled in dst register 0's writemask. */
#define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
   FOR_EACH_CHANNEL( CHAN )\
      IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )
/* Short aliases for well-known slots in the TGSI exec machine's temp file.
 * The *_I names are temp-register indices, the *_C names channel indices. */
#define TEMP_ONE_I TGSI_EXEC_TEMP_ONE_I
#define TEMP_ONE_C TGSI_EXEC_TEMP_ONE_C

#define TEMP_R0   TGSI_EXEC_TEMP_R0
#define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
#define TEMP_EXEC_MASK_I TGSI_EXEC_MASK_I
#define TEMP_EXEC_MASK_C TGSI_EXEC_MASK_C
82 * X86 utility functions.
91 (enum x86_reg_name
) xmm
);
95 * X86 register mapping helpers.
99 get_const_base( void )
106 static struct x86_reg
107 get_input_base( void )
114 static struct x86_reg
115 get_output_base( void )
122 static struct x86_reg
123 get_temp_base( void )
/**
 * Return the base x86 register through which interpolation coefficients
 * are addressed.  Coefficients share the same base register as shader
 * outputs (see get_output_base()).
 */
static struct x86_reg
get_coef_base( void )
{
   return get_output_base();
}
136 static struct x86_reg
137 get_immediate_base( void )
146 * Data access helpers.
150 static struct x86_reg
155 return x86_make_disp(
156 get_immediate_base(),
157 (vec
* 4 + chan
) * 4 );
160 static struct x86_reg
165 return x86_make_disp(
167 (vec
* 4 + chan
) * 4 );
170 static struct x86_reg
175 return x86_make_disp(
177 (vec
* 4 + chan
) * 16 );
180 static struct x86_reg
185 return x86_make_disp(
187 (vec
* 4 + chan
) * 16 );
190 static struct x86_reg
195 return x86_make_disp(
197 (vec
* 4 + chan
) * 16 );
200 static struct x86_reg
206 return x86_make_disp(
208 ((vec
* 3 + member
) * 4 + chan
) * 4 );
214 struct x86_function
*func
)
221 * Data fetch helpers.
225 * Copy a shader constant to xmm register
226 * \param xmm the destination xmm register
227 * \param vec the src const buffer index
228 * \param chan src channel to fetch (X, Y, Z or W)
232 struct x86_function
*func
,
241 /* 'vec' is the offset from the address register's value.
242 * We're loading CONST[ADDR+vec] into an xmm register.
244 struct x86_reg r0
= get_input_base();
245 struct x86_reg r1
= get_output_base();
248 assert( indirectFile
== TGSI_FILE_ADDRESS
);
249 assert( indirectIndex
== 0 );
251 x86_push( func
, r0
);
252 x86_push( func
, r1
);
255 * Loop over the four pixels or vertices in the quad.
256 * Get the value of the address (offset) register for pixel/vertex[i],
257 * add it to the src offset and index into the constant buffer.
258 * Note that we're working on SOA data.
259 * If any of the pixel/vertex execution channels are unused their
260 * values will be garbage. It's very important that we don't use
261 * those garbage values as indexes into the constant buffer since
262 * that'll cause segfaults.
263 * The solution is to bitwise-AND the offset with the execution mask
264 * register whose values are either 0 or ~0.
265 * The caller must setup the execution mask register to indicate
266 * which channels are valid/alive before running the shader.
267 * The execution mask will also figure into loops and conditionals
270 for (i
= 0; i
< QUAD_SIZE
; i
++) {
271 /* r1 = address register[i] */
272 x86_mov( func
, r1
, x86_make_disp( get_temp( TEMP_ADDR
, CHAN_X
), i
* 4 ) );
273 /* r0 = execution mask[i] */
274 x86_mov( func
, r0
, x86_make_disp( get_temp( TEMP_EXEC_MASK_I
, TEMP_EXEC_MASK_C
), i
* 4 ) );
276 x86_and( func
, r1
, r0
);
277 /* r0 = 'vec', the offset */
278 x86_lea( func
, r0
, get_const( vec
, chan
) );
280 /* Quick hack to multiply r1 by 16 -- need to add SHL to rtasm.
282 x86_add( func
, r1
, r1
);
283 x86_add( func
, r1
, r1
);
284 x86_add( func
, r1
, r1
);
285 x86_add( func
, r1
, r1
);
287 x86_add( func
, r0
, r1
); /* r0 = r0 + r1 */
288 x86_mov( func
, r1
, x86_deref( r0
) );
289 x86_mov( func
, x86_make_disp( get_temp( TEMP_R0
, CHAN_X
), i
* 4 ), r1
);
298 get_temp( TEMP_R0
, CHAN_X
) );
301 /* 'vec' is the index into the src register file, such as TEMP[vec] */
307 get_const( vec
, chan
) );
312 SHUF( 0, 0, 0, 0 ) );
318 struct x86_function
*func
,
326 get_immediate( vec
, chan
) );
331 SHUF( 0, 0, 0, 0 ) );
336 * Copy a shader input to xmm register
337 * \param xmm the destination xmm register
338 * \param vec the src input attrib
339 * \param chan src channel to fetch (X, Y, Z or W)
343 struct x86_function
*func
,
351 get_input( vec
, chan
) );
355 * Store an xmm register to a shader output
356 * \param xmm the source xmm register
357 * \param vec the dest output attrib
358 * \param chan src dest channel to store (X, Y, Z or W)
362 struct x86_function
*func
,
369 get_output( vec
, chan
),
374 * Copy a shader temporary to xmm register
375 * \param xmm the destination xmm register
376 * \param vec the src temp register
377 * \param chan src channel to fetch (X, Y, Z or W)
381 struct x86_function
*func
,
389 get_temp( vec
, chan
) );
393 * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
394 * \param xmm the destination xmm register
395 * \param vec the src input/attribute coefficient index
396 * \param chan src channel to fetch (X, Y, Z or W)
397 * \param member 0=a0, 1=dadx, 2=dady
401 struct x86_function
*func
,
410 get_coef( vec
, chan
, member
) );
415 SHUF( 0, 0, 0, 0 ) );
419 * Data store helpers.
424 struct x86_function
*func
,
431 get_input( vec
, chan
),
437 struct x86_function
*func
,
444 get_temp( vec
, chan
),
450 struct x86_function
*func
,
460 vec
+ TGSI_EXEC_TEMP_ADDR
,
465 * Coefficent fetch helpers.
470 struct x86_function
*func
,
485 struct x86_function
*func
,
500 struct x86_function
*func
,
514 * Function call helpers.
518 * NOTE: In gcc, if the destination uses the SSE intrinsics, then it must be
519 * defined with __attribute__((force_align_arg_pointer)), as we do not guarantee
520 * that the stack pointer is 16 byte aligned, as expected.
524 struct x86_function
*func
,
527 void (PIPE_CDECL
*code
)() )
529 struct x86_reg ecx
= x86_make_reg( file_REG32
, reg_CX
);
533 /* Bitmask of the xmm registers to save */
534 xmm_mask
= (1 << xmm_save
) - 1;
535 xmm_mask
&= ~(1 << xmm_dst
);
539 get_temp( TEMP_R0
, 0 ),
540 make_xmm( xmm_dst
) );
544 x86_make_reg( file_REG32
, reg_AX
) );
547 x86_make_reg( file_REG32
, reg_CX
) );
550 x86_make_reg( file_REG32
, reg_DX
) );
552 for(i
= 0, n
= 0; i
< 8; ++i
)
553 if(xmm_mask
& (1 << i
))
558 x86_make_reg( file_REG32
, reg_SP
),
561 for(i
= 0, n
= 0; i
< 8; ++i
)
562 if(xmm_mask
& (1 << i
)) {
565 x86_make_disp( x86_make_reg( file_REG32
, reg_SP
), n
*16 ),
573 get_temp( TEMP_R0
, 0 ) );
575 x86_push( func
, ecx
);
576 x86_mov_reg_imm( func
, ecx
, (unsigned long) code
);
577 x86_call( func
, ecx
);
580 for(i
= 0, n
= 0; i
< 8; ++i
)
581 if(xmm_mask
& (1 << i
)) {
585 x86_make_disp( x86_make_reg( file_REG32
, reg_SP
), n
*16 ) );
591 x86_make_reg( file_REG32
, reg_SP
),
594 /* Restore GP registers in a reverse order.
598 x86_make_reg( file_REG32
, reg_DX
) );
601 x86_make_reg( file_REG32
, reg_CX
) );
604 x86_make_reg( file_REG32
, reg_AX
) );
609 get_temp( TEMP_R0
, 0 ) );
613 emit_func_call_dst_src(
614 struct x86_function
*func
,
618 void (PIPE_CDECL
*code
)() )
622 get_temp( TEMP_R0
, 1 ),
623 make_xmm( xmm_src
) );
633 #if defined(PIPE_ARCH_SSE)
636 * Fast SSE2 implementation of special math functions.
/*
 * Horner-scheme polynomial evaluation over 4-wide SSE floats:
 * POLYn(x, c0, ..., cn) computes c0 + c1*x + c2*x^2 + ... + cn*x^n.
 * Each degree is built by one multiply-add on top of the next lower degree.
 */
#define POLY0(x, c0) _mm_set1_ps(c0)
#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))

/* Degrees of the minimax polynomial fits used below; higher degree is
 * more accurate but slower. */
#define EXP_POLY_DEGREE 3
#define LOG_POLY_DEGREE 5
650 * See http://www.devmaster.net/forums/showthread.php?p=43580
656 __m128 fpart
, expipart
, expfpart
;
658 x
= _mm_min_ps(x
, _mm_set1_ps( 129.00000f
));
659 x
= _mm_max_ps(x
, _mm_set1_ps(-126.99999f
));
661 /* ipart = int(x - 0.5) */
662 ipart
= _mm_cvtps_epi32(_mm_sub_ps(x
, _mm_set1_ps(0.5f
)));
664 /* fpart = x - ipart */
665 fpart
= _mm_sub_ps(x
, _mm_cvtepi32_ps(ipart
));
667 /* expipart = (float) (1 << ipart) */
668 expipart
= _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart
, _mm_set1_epi32(127)), 23));
670 /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
671 #if EXP_POLY_DEGREE == 5
672 expfpart
= POLY5(fpart
, 9.9999994e-1f
, 6.9315308e-1f
, 2.4015361e-1f
, 5.5826318e-2f
, 8.9893397e-3f
, 1.8775767e-3f
);
673 #elif EXP_POLY_DEGREE == 4
674 expfpart
= POLY4(fpart
, 1.0000026f
, 6.9300383e-1f
, 2.4144275e-1f
, 5.2011464e-2f
, 1.3534167e-2f
);
675 #elif EXP_POLY_DEGREE == 3
676 expfpart
= POLY3(fpart
, 9.9992520e-1f
, 6.9583356e-1f
, 2.2606716e-1f
, 7.8024521e-2f
);
677 #elif EXP_POLY_DEGREE == 2
678 expfpart
= POLY2(fpart
, 1.0017247f
, 6.5763628e-1f
, 3.3718944e-1f
);
683 return _mm_mul_ps(expipart
, expfpart
);
688 * See http://www.devmaster.net/forums/showthread.php?p=43580
693 __m128i expmask
= _mm_set1_epi32(0x7f800000);
694 __m128i mantmask
= _mm_set1_epi32(0x007fffff);
695 __m128 one
= _mm_set1_ps(1.0f
);
697 __m128i i
= _mm_castps_si128(x
);
699 /* exp = (float) exponent(x) */
700 __m128 exp
= _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i
, expmask
), 23), _mm_set1_epi32(127)));
702 /* mant = (float) mantissa(x) */
703 __m128 mant
= _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i
, mantmask
)), one
);
707 /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
708 * These coefficients can be generate with
709 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
711 #if LOG_POLY_DEGREE == 6
712 logmant
= POLY5(mant
, 3.11578814719469302614f
, -3.32419399085241980044f
, 2.59883907202499966007f
, -1.23152682416275988241f
, 0.318212422185251071475f
, -0.0344359067839062357313f
);
713 #elif LOG_POLY_DEGREE == 5
714 logmant
= POLY4(mant
, 2.8882704548164776201f
, -2.52074962577807006663f
, 1.48116647521213171641f
, -0.465725644288844778798f
, 0.0596515482674574969533f
);
715 #elif LOG_POLY_DEGREE == 4
716 logmant
= POLY3(mant
, 2.61761038894603480148f
, -1.75647175389045657003f
, 0.688243882994381274313f
, -0.107254423828329604454f
);
717 #elif LOG_POLY_DEGREE == 3
718 logmant
= POLY2(mant
, 2.28330284476918490682f
, -1.04913055217340124191f
, 0.204446009836232697516f
);
723 /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
724 logmant
= _mm_mul_ps(logmant
, _mm_sub_ps(mant
, one
));
726 return _mm_add_ps(logmant
, exp
);
731 powf4(__m128 x
, __m128 y
)
733 return exp2f4(_mm_mul_ps(log2f4(x
), y
));
736 #endif /* PIPE_ARCH_SSE */
741 * Low-level instruction translators.
746 struct x86_function
*func
,
753 TGSI_EXEC_TEMP_7FFFFFFF_I
,
754 TGSI_EXEC_TEMP_7FFFFFFF_C
) );
759 struct x86_function
*func
,
766 make_xmm( xmm_src
) );
769 static void PIPE_CDECL
773 store
[0] = cosf( store
[0] );
774 store
[1] = cosf( store
[1] );
775 store
[2] = cosf( store
[2] );
776 store
[3] = cosf( store
[3] );
781 struct x86_function
*func
,
792 static void PIPE_CDECL
793 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
794 __attribute__((force_align_arg_pointer
))
799 #if defined(PIPE_ARCH_SSE)
800 _mm_store_ps(&store
[0], exp2f4( _mm_load_ps(&store
[0]) ));
802 store
[0] = util_fast_exp2( store
[0] );
803 store
[1] = util_fast_exp2( store
[1] );
804 store
[2] = util_fast_exp2( store
[2] );
805 store
[3] = util_fast_exp2( store
[3] );
811 struct x86_function
*func
,
824 struct x86_function
*func
,
835 struct x86_function
*func
,
844 static void PIPE_CDECL
848 store
[0] = floorf( store
[0] );
849 store
[1] = floorf( store
[1] );
850 store
[2] = floorf( store
[2] );
851 store
[3] = floorf( store
[3] );
856 struct x86_function
*func
,
867 static void PIPE_CDECL
871 store
[0] -= floorf( store
[0] );
872 store
[1] -= floorf( store
[1] );
873 store
[2] -= floorf( store
[2] );
874 store
[3] -= floorf( store
[3] );
879 struct x86_function
*func
,
890 static void PIPE_CDECL
891 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
892 __attribute__((force_align_arg_pointer
))
897 #if defined(PIPE_ARCH_SSE)
898 _mm_store_ps(&store
[0], log2f4( _mm_load_ps(&store
[0]) ));
900 store
[0] = util_fast_log2( store
[0] );
901 store
[1] = util_fast_log2( store
[1] );
902 store
[2] = util_fast_log2( store
[2] );
903 store
[3] = util_fast_log2( store
[3] );
909 struct x86_function
*func
,
922 struct x86_function
*func
,
929 make_xmm( xmm_src
) );
933 emit_mul (struct x86_function
*func
,
940 make_xmm( xmm_src
) );
945 struct x86_function
*func
,
952 TGSI_EXEC_TEMP_80000000_I
,
953 TGSI_EXEC_TEMP_80000000_C
) );
956 static void PIPE_CDECL
957 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
958 __attribute__((force_align_arg_pointer
))
963 #if defined(PIPE_ARCH_SSE)
964 _mm_store_ps(&store
[0], powf4( _mm_load_ps(&store
[0]), _mm_load_ps(&store
[4]) ));
966 store
[0] = util_fast_pow( store
[0], store
[4] );
967 store
[1] = util_fast_pow( store
[1], store
[5] );
968 store
[2] = util_fast_pow( store
[2], store
[6] );
969 store
[3] = util_fast_pow( store
[3], store
[7] );
975 struct x86_function
*func
,
980 emit_func_call_dst_src(
990 struct x86_function
*func
,
994 /* On Intel CPUs at least, this is only accurate to 12 bits -- not
995 * good enough. Need to either emit a proper divide or use the
996 * iterative technique described below in emit_rsqrt().
1000 make_xmm( xmm_dst
),
1001 make_xmm( xmm_src
) );
1004 static void PIPE_CDECL
1008 store
[0] = floorf( store
[0] + 0.5f
);
1009 store
[1] = floorf( store
[1] + 0.5f
);
1010 store
[2] = floorf( store
[2] + 0.5f
);
1011 store
[3] = floorf( store
[3] + 0.5f
);
1016 struct x86_function
*func
,
1029 struct x86_function
*func
,
1034 /* Although rsqrtps() and rcpps() are low precision on some/all SSE
1035 * implementations, it is possible to improve its precision at
1036 * fairly low cost, using a newton/raphson step, as below:
1038 * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
1039 * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
1041 * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
1044 struct x86_reg dst
= make_xmm( xmm_dst
);
1045 struct x86_reg src
= make_xmm( xmm_src
);
1046 struct x86_reg tmp0
= make_xmm( 2 );
1047 struct x86_reg tmp1
= make_xmm( 3 );
1049 assert( xmm_dst
!= xmm_src
);
1050 assert( xmm_dst
!= 2 && xmm_dst
!= 3 );
1051 assert( xmm_src
!= 2 && xmm_src
!= 3 );
1053 sse_movaps( func
, dst
, get_temp( TGSI_EXEC_TEMP_HALF_I
, TGSI_EXEC_TEMP_HALF_C
) );
1054 sse_movaps( func
, tmp0
, get_temp( TGSI_EXEC_TEMP_THREE_I
, TGSI_EXEC_TEMP_THREE_C
) );
1055 sse_rsqrtps( func
, tmp1
, src
);
1056 sse_mulps( func
, src
, tmp1
);
1057 sse_mulps( func
, dst
, tmp1
);
1058 sse_mulps( func
, src
, tmp1
);
1059 sse_subps( func
, tmp0
, src
);
1060 sse_mulps( func
, dst
, tmp0
);
1063 /* On Intel CPUs at least, this is only accurate to 12 bits -- not
1068 make_xmm( xmm_dst
),
1069 make_xmm( xmm_src
) );
1075 struct x86_function
*func
,
1082 TGSI_EXEC_TEMP_80000000_I
,
1083 TGSI_EXEC_TEMP_80000000_C
) );
1086 static void PIPE_CDECL
1090 store
[0] = store
[0] < 0.0f
? -1.0f
: store
[0] > 0.0f
? 1.0f
: 0.0f
;
1091 store
[1] = store
[1] < 0.0f
? -1.0f
: store
[1] > 0.0f
? 1.0f
: 0.0f
;
1092 store
[2] = store
[2] < 0.0f
? -1.0f
: store
[2] > 0.0f
? 1.0f
: 0.0f
;
1093 store
[3] = store
[3] < 0.0f
? -1.0f
: store
[3] > 0.0f
? 1.0f
: 0.0f
;
1098 struct x86_function
*func
,
1109 static void PIPE_CDECL
1113 store
[0] = sinf( store
[0] );
1114 store
[1] = sinf( store
[1] );
1115 store
[2] = sinf( store
[2] );
1116 store
[3] = sinf( store
[3] );
1120 emit_sin (struct x86_function
*func
,
1133 struct x86_function
*func
,
1139 make_xmm( xmm_dst
),
1140 make_xmm( xmm_src
) );
1149 struct x86_function
*func
,
1151 const struct tgsi_full_src_register
*reg
,
1152 const unsigned chan_index
)
1154 unsigned swizzle
= tgsi_util_get_full_src_register_extswizzle( reg
, chan_index
);
1157 case TGSI_EXTSWIZZLE_X
:
1158 case TGSI_EXTSWIZZLE_Y
:
1159 case TGSI_EXTSWIZZLE_Z
:
1160 case TGSI_EXTSWIZZLE_W
:
1161 switch (reg
->SrcRegister
.File
) {
1162 case TGSI_FILE_CONSTANT
:
1166 reg
->SrcRegister
.Index
,
1168 reg
->SrcRegister
.Indirect
,
1169 reg
->SrcRegisterInd
.File
,
1170 reg
->SrcRegisterInd
.Index
);
1173 case TGSI_FILE_IMMEDIATE
:
1177 reg
->SrcRegister
.Index
,
1181 case TGSI_FILE_INPUT
:
1185 reg
->SrcRegister
.Index
,
1189 case TGSI_FILE_TEMPORARY
:
1193 reg
->SrcRegister
.Index
,
1202 case TGSI_EXTSWIZZLE_ZERO
:
1206 TGSI_EXEC_TEMP_00000000_I
,
1207 TGSI_EXEC_TEMP_00000000_C
);
1210 case TGSI_EXTSWIZZLE_ONE
:
1222 switch( tgsi_util_get_full_src_register_sign_mode( reg
, chan_index
) ) {
1223 case TGSI_UTIL_SIGN_CLEAR
:
1224 emit_abs( func
, xmm
);
1227 case TGSI_UTIL_SIGN_SET
:
1228 emit_setsign( func
, xmm
);
1231 case TGSI_UTIL_SIGN_TOGGLE
:
1232 emit_neg( func
, xmm
);
1235 case TGSI_UTIL_SIGN_KEEP
:
/* Fetch channel CHAN of src register INDEX of instruction INST into xmm XMM. */
#define FETCH( FUNC, INST, XMM, INDEX, CHAN )\
   emit_fetch( FUNC, XMM, &(INST).FullSrcRegisters[INDEX], CHAN )
1249 struct x86_function
*func
,
1251 const struct tgsi_full_dst_register
*reg
,
1252 const struct tgsi_full_instruction
*inst
,
1253 unsigned chan_index
)
1255 switch( reg
->DstRegister
.File
) {
1256 case TGSI_FILE_OUTPUT
:
1260 reg
->DstRegister
.Index
,
1264 case TGSI_FILE_TEMPORARY
:
1268 reg
->DstRegister
.Index
,
1272 case TGSI_FILE_ADDRESS
:
1276 reg
->DstRegister
.Index
,
1284 switch( inst
->Instruction
.Saturate
) {
1288 case TGSI_SAT_ZERO_ONE
:
1292 case TGSI_SAT_MINUS_PLUS_ONE
:
/* Store xmm XMM into channel CHAN of dst register INDEX of instruction INST.
 * INST is also passed through so the store can honor saturation modes. */
#define STORE( FUNC, INST, XMM, INDEX, CHAN )\
   emit_store( FUNC, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN )
1302 * High-level instruction translators.
1307 struct x86_function
*func
,
1308 const struct tgsi_full_src_register
*reg
)
1310 unsigned uniquemask
;
1311 unsigned registers
[4];
1312 unsigned nextregister
= 0;
1313 unsigned firstchan
= ~0;
1314 unsigned chan_index
;
1316 /* This mask stores component bits that were already tested. Note that
1317 * we test if the value is less than zero, so 1.0 and 0.0 need not to be
1319 uniquemask
= (1 << TGSI_EXTSWIZZLE_ZERO
) | (1 << TGSI_EXTSWIZZLE_ONE
);
1321 FOR_EACH_CHANNEL( chan_index
) {
1324 /* unswizzle channel */
1325 swizzle
= tgsi_util_get_full_src_register_extswizzle(
1329 /* check if the component has not been already tested */
1330 if( !(uniquemask
& (1 << swizzle
)) ) {
1331 uniquemask
|= 1 << swizzle
;
1333 /* allocate register */
1334 registers
[chan_index
] = nextregister
;
1342 /* mark the first channel used */
1343 if( firstchan
== ~0 ) {
1344 firstchan
= chan_index
;
1351 x86_make_reg( file_REG32
, reg_AX
) );
1354 x86_make_reg( file_REG32
, reg_DX
) );
1356 FOR_EACH_CHANNEL( chan_index
) {
1357 if( uniquemask
& (1 << chan_index
) ) {
1360 make_xmm( registers
[chan_index
] ),
1362 TGSI_EXEC_TEMP_00000000_I
,
1363 TGSI_EXEC_TEMP_00000000_C
),
1366 if( chan_index
== firstchan
) {
1369 x86_make_reg( file_REG32
, reg_AX
),
1370 make_xmm( registers
[chan_index
] ) );
1375 x86_make_reg( file_REG32
, reg_DX
),
1376 make_xmm( registers
[chan_index
] ) );
1379 x86_make_reg( file_REG32
, reg_AX
),
1380 x86_make_reg( file_REG32
, reg_DX
) );
1388 TGSI_EXEC_TEMP_KILMASK_I
,
1389 TGSI_EXEC_TEMP_KILMASK_C
),
1390 x86_make_reg( file_REG32
, reg_AX
) );
1394 x86_make_reg( file_REG32
, reg_DX
) );
1397 x86_make_reg( file_REG32
, reg_AX
) );
1403 struct x86_function
*func
)
1405 /* XXX todo / fix me */
1411 struct x86_function
*func
,
1412 struct tgsi_full_instruction
*inst
,
1415 unsigned chan_index
;
1417 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1418 FETCH( func
, *inst
, 0, 0, chan_index
);
1419 FETCH( func
, *inst
, 1, 1, chan_index
);
1431 STORE( func
, *inst
, 0, 0, chan_index
);
1437 struct x86_function
*func
,
1438 struct tgsi_full_instruction
*inst
)
1440 unsigned chan_index
;
1442 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1443 FETCH( func
, *inst
, 0, 0, chan_index
);
1444 FETCH( func
, *inst
, 1, 1, chan_index
);
1445 FETCH( func
, *inst
, 2, 2, chan_index
);
1450 TGSI_EXEC_TEMP_00000000_I
,
1451 TGSI_EXEC_TEMP_00000000_C
),
1465 STORE( func
, *inst
, 0, 0, chan_index
);
1471 struct x86_function
*func
,
1472 struct tgsi_full_instruction
*inst
)
1474 unsigned chan_index
;
1476 switch (inst
->Instruction
.Opcode
) {
1477 case TGSI_OPCODE_ARL
:
1478 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1479 FETCH( func
, *inst
, 0, 0, chan_index
);
1480 emit_f2it( func
, 0 );
1481 STORE( func
, *inst
, 0, 0, chan_index
);
1485 case TGSI_OPCODE_MOV
:
1486 case TGSI_OPCODE_SWZ
:
1487 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1488 FETCH( func
, *inst
, 0, 0, chan_index
);
1489 STORE( func
, *inst
, 0, 0, chan_index
);
1493 case TGSI_OPCODE_LIT
:
1494 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1495 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) ) {
1501 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ) {
1502 STORE( func
, *inst
, 0, 0, CHAN_X
);
1504 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) ) {
1505 STORE( func
, *inst
, 0, 0, CHAN_W
);
1508 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ||
1509 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
1510 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ) {
1511 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1516 TGSI_EXEC_TEMP_00000000_I
,
1517 TGSI_EXEC_TEMP_00000000_C
) );
1518 STORE( func
, *inst
, 0, 0, CHAN_Y
);
1520 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
1521 /* XMM[1] = SrcReg[0].yyyy */
1522 FETCH( func
, *inst
, 1, 0, CHAN_Y
);
1523 /* XMM[1] = max(XMM[1], 0) */
1528 TGSI_EXEC_TEMP_00000000_I
,
1529 TGSI_EXEC_TEMP_00000000_C
) );
1530 /* XMM[2] = SrcReg[0].wwww */
1531 FETCH( func
, *inst
, 2, 0, CHAN_W
);
1532 /* XMM[2] = min(XMM[2], 128.0) */
1537 TGSI_EXEC_TEMP_128_I
,
1538 TGSI_EXEC_TEMP_128_C
) );
1539 /* XMM[2] = max(XMM[2], -128.0) */
1544 TGSI_EXEC_TEMP_MINUS_128_I
,
1545 TGSI_EXEC_TEMP_MINUS_128_C
) );
1546 emit_pow( func
, 3, 1, 2 );
1547 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1561 STORE( func
, *inst
, 2, 0, CHAN_Z
);
1566 case TGSI_OPCODE_RCP
:
1567 /* TGSI_OPCODE_RECIP */
1568 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1569 emit_rcp( func
, 0, 0 );
1570 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1571 STORE( func
, *inst
, 0, 0, chan_index
);
1575 case TGSI_OPCODE_RSQ
:
1576 /* TGSI_OPCODE_RECIPSQRT */
1577 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1578 emit_abs( func
, 0 );
1579 emit_rsqrt( func
, 1, 0 );
1580 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1581 STORE( func
, *inst
, 1, 0, chan_index
);
1585 case TGSI_OPCODE_EXP
:
1586 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1587 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ||
1588 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
1589 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1590 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1591 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
1592 emit_MOV( func
, 1, 0 );
1593 emit_flr( func
, 2, 1 );
1594 /* dst.x = ex2(floor(src.x)) */
1595 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
)) {
1596 emit_MOV( func
, 2, 1 );
1597 emit_ex2( func
, 3, 2 );
1598 STORE( func
, *inst
, 2, 0, CHAN_X
);
1600 /* dst.y = src.x - floor(src.x) */
1601 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
1602 emit_MOV( func
, 2, 0 );
1603 emit_sub( func
, 2, 1 );
1604 STORE( func
, *inst
, 2, 0, CHAN_Y
);
1607 /* dst.z = ex2(src.x) */
1608 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
1609 emit_ex2( func
, 3, 0 );
1610 STORE( func
, *inst
, 0, 0, CHAN_Z
);
1614 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
)) {
1615 emit_tempf( func
, 0, TEMP_ONE_I
, TEMP_ONE_C
);
1616 STORE( func
, *inst
, 0, 0, CHAN_W
);
1620 case TGSI_OPCODE_LOG
:
1621 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1622 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ||
1623 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
1624 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1625 emit_abs( func
, 0 );
1626 emit_MOV( func
, 1, 0 );
1627 emit_lg2( func
, 2, 1 );
1628 /* dst.z = lg2(abs(src.x)) */
1629 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
1630 STORE( func
, *inst
, 1, 0, CHAN_Z
);
1632 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1633 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
1634 emit_flr( func
, 2, 1 );
1635 /* dst.x = floor(lg2(abs(src.x))) */
1636 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
)) {
1637 STORE( func
, *inst
, 1, 0, CHAN_X
);
1639 /* dst.x = abs(src)/ex2(floor(lg2(abs(src.x)))) */
1640 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
1641 emit_ex2( func
, 2, 1 );
1642 emit_rcp( func
, 1, 1 );
1643 emit_mul( func
, 0, 1 );
1644 STORE( func
, *inst
, 0, 0, CHAN_Y
);
1649 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
)) {
1650 emit_tempf( func
, 0, TEMP_ONE_I
, TEMP_ONE_C
);
1651 STORE( func
, *inst
, 0, 0, CHAN_W
);
1655 case TGSI_OPCODE_MUL
:
1656 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1657 FETCH( func
, *inst
, 0, 0, chan_index
);
1658 FETCH( func
, *inst
, 1, 1, chan_index
);
1659 emit_mul( func
, 0, 1 );
1660 STORE( func
, *inst
, 0, 0, chan_index
);
1664 case TGSI_OPCODE_ADD
:
1665 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1666 FETCH( func
, *inst
, 0, 0, chan_index
);
1667 FETCH( func
, *inst
, 1, 1, chan_index
);
1668 emit_add( func
, 0, 1 );
1669 STORE( func
, *inst
, 0, 0, chan_index
);
1673 case TGSI_OPCODE_DP3
:
1674 /* TGSI_OPCODE_DOT3 */
1675 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1676 FETCH( func
, *inst
, 1, 1, CHAN_X
);
1677 emit_mul( func
, 0, 1 );
1678 FETCH( func
, *inst
, 1, 0, CHAN_Y
);
1679 FETCH( func
, *inst
, 2, 1, CHAN_Y
);
1680 emit_mul( func
, 1, 2 );
1681 emit_add( func
, 0, 1 );
1682 FETCH( func
, *inst
, 1, 0, CHAN_Z
);
1683 FETCH( func
, *inst
, 2, 1, CHAN_Z
);
1684 emit_mul( func
, 1, 2 );
1685 emit_add( func
, 0, 1 );
1686 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1687 STORE( func
, *inst
, 0, 0, chan_index
);
1691 case TGSI_OPCODE_DP4
:
1692 /* TGSI_OPCODE_DOT4 */
1693 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1694 FETCH( func
, *inst
, 1, 1, CHAN_X
);
1695 emit_mul( func
, 0, 1 );
1696 FETCH( func
, *inst
, 1, 0, CHAN_Y
);
1697 FETCH( func
, *inst
, 2, 1, CHAN_Y
);
1698 emit_mul( func
, 1, 2 );
1699 emit_add( func
, 0, 1 );
1700 FETCH( func
, *inst
, 1, 0, CHAN_Z
);
1701 FETCH( func
, *inst
, 2, 1, CHAN_Z
);
1702 emit_mul(func
, 1, 2 );
1703 emit_add(func
, 0, 1 );
1704 FETCH( func
, *inst
, 1, 0, CHAN_W
);
1705 FETCH( func
, *inst
, 2, 1, CHAN_W
);
1706 emit_mul( func
, 1, 2 );
1707 emit_add( func
, 0, 1 );
1708 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1709 STORE( func
, *inst
, 0, 0, chan_index
);
1713 case TGSI_OPCODE_DST
:
1714 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) {
1720 STORE( func
, *inst
, 0, 0, CHAN_X
);
1722 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) {
1723 FETCH( func
, *inst
, 0, 0, CHAN_Y
);
1724 FETCH( func
, *inst
, 1, 1, CHAN_Y
);
1725 emit_mul( func
, 0, 1 );
1726 STORE( func
, *inst
, 0, 0, CHAN_Y
);
1728 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) {
1729 FETCH( func
, *inst
, 0, 0, CHAN_Z
);
1730 STORE( func
, *inst
, 0, 0, CHAN_Z
);
1732 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) {
1733 FETCH( func
, *inst
, 0, 1, CHAN_W
);
1734 STORE( func
, *inst
, 0, 0, CHAN_W
);
1738 case TGSI_OPCODE_MIN
:
1739 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1740 FETCH( func
, *inst
, 0, 0, chan_index
);
1741 FETCH( func
, *inst
, 1, 1, chan_index
);
1746 STORE( func
, *inst
, 0, 0, chan_index
);
1750 case TGSI_OPCODE_MAX
:
1751 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1752 FETCH( func
, *inst
, 0, 0, chan_index
);
1753 FETCH( func
, *inst
, 1, 1, chan_index
);
1758 STORE( func
, *inst
, 0, 0, chan_index
);
1762 case TGSI_OPCODE_SLT
:
1763 /* TGSI_OPCODE_SETLT */
1764 emit_setcc( func
, inst
, cc_LessThan
);
1767 case TGSI_OPCODE_SGE
:
1768 /* TGSI_OPCODE_SETGE */
1769 emit_setcc( func
, inst
, cc_NotLessThan
);
1772 case TGSI_OPCODE_MAD
:
1773 /* TGSI_OPCODE_MADD */
1774 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1775 FETCH( func
, *inst
, 0, 0, chan_index
);
1776 FETCH( func
, *inst
, 1, 1, chan_index
);
1777 FETCH( func
, *inst
, 2, 2, chan_index
);
1778 emit_mul( func
, 0, 1 );
1779 emit_add( func
, 0, 2 );
1780 STORE( func
, *inst
, 0, 0, chan_index
);
1784 case TGSI_OPCODE_SUB
:
1785 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1786 FETCH( func
, *inst
, 0, 0, chan_index
);
1787 FETCH( func
, *inst
, 1, 1, chan_index
);
1788 emit_sub( func
, 0, 1 );
1789 STORE( func
, *inst
, 0, 0, chan_index
);
1793 case TGSI_OPCODE_LERP
:
1794 /* TGSI_OPCODE_LRP */
1795 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1796 FETCH( func
, *inst
, 0, 0, chan_index
);
1797 FETCH( func
, *inst
, 1, 1, chan_index
);
1798 FETCH( func
, *inst
, 2, 2, chan_index
);
1799 emit_sub( func
, 1, 2 );
1800 emit_mul( func
, 0, 1 );
1801 emit_add( func
, 0, 2 );
1802 STORE( func
, *inst
, 0, 0, chan_index
);
1806 case TGSI_OPCODE_CND
:
1810 case TGSI_OPCODE_CND0
:
1814 case TGSI_OPCODE_DOT2ADD
:
1815 /* TGSI_OPCODE_DP2A */
1816 FETCH( func
, *inst
, 0, 0, CHAN_X
); /* xmm0 = src[0].x */
1817 FETCH( func
, *inst
, 1, 1, CHAN_X
); /* xmm1 = src[1].x */
1818 emit_mul( func
, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
1819 FETCH( func
, *inst
, 1, 0, CHAN_Y
); /* xmm1 = src[0].y */
1820 FETCH( func
, *inst
, 2, 1, CHAN_Y
); /* xmm2 = src[1].y */
1821 emit_mul( func
, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
1822 emit_add( func
, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
1823 FETCH( func
, *inst
, 1, 2, CHAN_X
); /* xmm1 = src[2].x */
1824 emit_add( func
, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
1825 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1826 STORE( func
, *inst
, 0, 0, chan_index
); /* dest[ch] = xmm0 */
case TGSI_OPCODE_INDEX:
case TGSI_OPCODE_NEGATE:
case TGSI_OPCODE_FRAC:
   /* TGSI_OPCODE_FRC: dst.ch = fractional part of src0.ch */
   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
      FETCH( func, *inst, 0, 0, chan_index );
      emit_frc( func, 0, 0 );
      STORE( func, *inst, 0, 0, chan_index );
case TGSI_OPCODE_CLAMP:
case TGSI_OPCODE_FLOOR:
   /* TGSI_OPCODE_FLR: dst.ch = floor(src0.ch) */
   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
      FETCH( func, *inst, 0, 0, chan_index );
      emit_flr( func, 0, 0 );
      STORE( func, *inst, 0, 0, chan_index );
case TGSI_OPCODE_ROUND:
   /* dst.ch = src0.ch rounded to an integer value (see emit_rnd) */
   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
      FETCH( func, *inst, 0, 0, chan_index );
      emit_rnd( func, 0, 0 );
      STORE( func, *inst, 0, 0, chan_index );
case TGSI_OPCODE_EXPBASE2:
   /* TGSI_OPCODE_EX2: scalar 2^(src0.x), replicated to all
    * enabled destination channels */
   FETCH( func, *inst, 0, 0, CHAN_X );
   emit_ex2( func, 0, 0 );
   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
      STORE( func, *inst, 0, 0, chan_index );
case TGSI_OPCODE_LOGBASE2:
   /* TGSI_OPCODE_LG2: scalar log2(src0.x), replicated to all
    * enabled destination channels */
   FETCH( func, *inst, 0, 0, CHAN_X );
   emit_lg2( func, 0, 0 );
   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
      STORE( func, *inst, 0, 0, chan_index );
case TGSI_OPCODE_POWER:
   /* TGSI_OPCODE_POW: scalar src0.x ^ src1.x, replicated to all
    * enabled destination channels */
   FETCH( func, *inst, 0, 0, CHAN_X );
   FETCH( func, *inst, 1, 1, CHAN_X );
   emit_pow( func, 0, 0, 1 );
   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
      STORE( func, *inst, 0, 0, chan_index );
case TGSI_OPCODE_CROSSPRODUCT:
   /* TGSI_OPCODE_XPD: dst.xyz = src0 cross src1 (dst.w handled last).
    * Register plan after the conditional fetches below:
    *   xmm0 = src0.y   xmm1 = src1.z   xmm3 = src0.z   xmm4 = src1.y
    *   xmm2 = src1.x   xmm5 = src0.x   (xmm2/xmm5 are scratch for the
    *   X result first, then reloaded with the .x components) */
   if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
       IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
      FETCH( func, *inst, 1, 1, CHAN_Z );   /* xmm1 = src[1].z */
      FETCH( func, *inst, 3, 0, CHAN_Z );   /* xmm3 = src[0].z */
   if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
       IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
      FETCH( func, *inst, 0, 0, CHAN_Y );   /* xmm0 = src[0].y */
      FETCH( func, *inst, 4, 1, CHAN_Y );   /* xmm4 = src[1].y */
   IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
      /* dst.x = src0.y * src1.z - src0.z * src1.y */
      emit_MOV( func, 2, 0 );    /* xmm2 = src0.y */
      emit_mul( func, 2, 1 );    /* xmm2 *= src1.z */
      emit_MOV( func, 5, 3 );    /* xmm5 = src0.z */
      emit_mul( func, 5, 4 );    /* xmm5 *= src1.y */
      emit_sub( func, 2, 5 );    /* xmm2 -= xmm5 */
      STORE( func, *inst, 2, 0, CHAN_X );
   if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
       IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
      FETCH( func, *inst, 2, 1, CHAN_X );   /* xmm2 = src[1].x */
      FETCH( func, *inst, 5, 0, CHAN_X );   /* xmm5 = src[0].x */
   IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
      /* dst.y = src0.z * src1.x - src0.x * src1.z */
      emit_mul( func, 3, 2 );    /* xmm3 = src0.z * src1.x */
      emit_mul( func, 1, 5 );    /* xmm1 = src1.z * src0.x */
      emit_sub( func, 3, 1 );    /* xmm3 -= xmm1 */
      STORE( func, *inst, 3, 0, CHAN_Y );
   IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
      /* dst.z = src0.x * src1.y - src0.y * src1.x */
      emit_mul( func, 5, 4 );    /* xmm5 = src0.x * src1.y */
      emit_mul( func, 0, 2 );    /* xmm0 = src0.y * src1.x */
      emit_sub( func, 5, 0 );    /* xmm5 -= xmm0 */
      STORE( func, *inst, 5, 0, CHAN_Z );
   IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
      /* NOTE(review): presumably xmm0 is loaded with 1.0 just before
       * this store (the load is elided in this view) — confirm against
       * the full file. */
      STORE( func, *inst, 0, 0, CHAN_W );
case TGSI_OPCODE_MULTIPLYMATRIX:
case TGSI_OPCODE_ABS:
   /* dst.ch = |src0.ch| */
   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
      FETCH( func, *inst, 0, 0, chan_index );
      emit_abs( func, 0 );
      STORE( func, *inst, 0, 0, chan_index );
case TGSI_OPCODE_RCC:
case TGSI_OPCODE_DPH:
   /* homogeneous dot product:
    * dst = src0.x*src1.x + src0.y*src1.y + src0.z*src1.z + src1.w
    * (src0.w is implicitly 1.0), replicated to enabled channels */
   FETCH( func, *inst, 0, 0, CHAN_X );   /* xmm0 = src[0].x */
   FETCH( func, *inst, 1, 1, CHAN_X );   /* xmm1 = src[1].x */
   emit_mul( func, 0, 1 );               /* xmm0 = x*x' */
   FETCH( func, *inst, 1, 0, CHAN_Y );   /* xmm1 = src[0].y */
   FETCH( func, *inst, 2, 1, CHAN_Y );   /* xmm2 = src[1].y */
   emit_mul( func, 1, 2 );               /* xmm1 = y*y' */
   emit_add( func, 0, 1 );               /* xmm0 += xmm1 */
   FETCH( func, *inst, 1, 0, CHAN_Z );   /* xmm1 = src[0].z */
   FETCH( func, *inst, 2, 1, CHAN_Z );   /* xmm2 = src[1].z */
   emit_mul( func, 1, 2 );               /* xmm1 = z*z' */
   emit_add( func, 0, 1 );               /* xmm0 += xmm1 */
   FETCH( func, *inst, 1, 1, CHAN_W );   /* xmm1 = src[1].w */
   emit_add( func, 0, 1 );               /* xmm0 += src1.w */
   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
      STORE( func, *inst, 0, 0, chan_index );
case TGSI_OPCODE_COS:
   /* scalar cos(src0.x), replicated to enabled channels */
   FETCH( func, *inst, 0, 0, CHAN_X );
   emit_cos( func, 0, 0 );
   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
      STORE( func, *inst, 0, 0, chan_index );
case TGSI_OPCODE_DDX:
case TGSI_OPCODE_DDY:
case TGSI_OPCODE_KILP:
   /* predicated kill */
   return 0; /* XXX fix me */
case TGSI_OPCODE_KIL:
   /* conditional kill: discard fragment based on src[0] */
   emit_kil( func, &inst->FullSrcRegisters[0] );
case TGSI_OPCODE_PK2H:
case TGSI_OPCODE_PK2US:
case TGSI_OPCODE_PK4B:
case TGSI_OPCODE_PK4UB:
case TGSI_OPCODE_RFL:
case TGSI_OPCODE_SEQ:
case TGSI_OPCODE_SFL:
case TGSI_OPCODE_SGT:
case TGSI_OPCODE_SIN:
   /* scalar sin(src0.x), replicated to enabled channels */
   FETCH( func, *inst, 0, 0, CHAN_X );
   emit_sin( func, 0, 0 );
   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
      STORE( func, *inst, 0, 0, chan_index );
case TGSI_OPCODE_SLE:
case TGSI_OPCODE_SNE:
case TGSI_OPCODE_STR:
case TGSI_OPCODE_TEX:
   /* Disable dummy texture code (the dummy sampling statements are
    * elided in this view): */
   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
      STORE( func, *inst, 0, 0, chan_index );
case TGSI_OPCODE_TXD:
case TGSI_OPCODE_UP2H:
case TGSI_OPCODE_UP2US:
case TGSI_OPCODE_UP4B:
case TGSI_OPCODE_UP4UB:
case TGSI_OPCODE_X2D:
case TGSI_OPCODE_ARA:
case TGSI_OPCODE_ARR:
   /* address-register load with rounding: round src0.ch to the nearest
    * integer, then convert float -> int before storing */
   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
      FETCH( func, *inst, 0, 0, chan_index );
      emit_rnd( func, 0, 0 );
      emit_f2it( func, 0 );
      STORE( func, *inst, 0, 0, chan_index );
case TGSI_OPCODE_BRA:
case TGSI_OPCODE_CAL:
case TGSI_OPCODE_RET:
case TGSI_OPCODE_END:
case TGSI_OPCODE_SSG:
   /* TGSI_OPCODE_SGN: dst.ch = sign of src0.ch (see emit_sgn) */
   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
      FETCH( func, *inst, 0, 0, chan_index );
      emit_sgn( func, 0, 0 );
      STORE( func, *inst, 0, 0, chan_index );
case TGSI_OPCODE_CMP:
   /* whole-instruction helper handles all channels */
   emit_cmp( func, inst );
case TGSI_OPCODE_SCS:
   /* sine/cosine: dst.x = cos(src0.x), dst.y = sin(src0.x),
    * dst.z = 0.0, dst.w = 1.0 */
   IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
      FETCH( func, *inst, 0, 0, CHAN_X );
      emit_cos( func, 0, 0 );
      STORE( func, *inst, 0, 0, CHAN_X );
   IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
      FETCH( func, *inst, 0, 0, CHAN_X );
      emit_sin( func, 0, 0 );
      STORE( func, *inst, 0, 0, CHAN_Y );
   IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
      /* tail of an emit_tempf( func, 0, ... ) call loading the 0.0
       * constant; the opening of the call is elided in this view */
      TGSI_EXEC_TEMP_00000000_I,
      TGSI_EXEC_TEMP_00000000_C );
      STORE( func, *inst, 0, 0, CHAN_Z );
   IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
      /* NOTE(review): presumably xmm0 is loaded with 1.0 just before
       * this store (load elided in this view) — confirm */
      STORE( func, *inst, 0, 0, CHAN_W );
case TGSI_OPCODE_TXB:
2174 case TGSI_OPCODE_NRM
:
2176 case TGSI_OPCODE_NRM4
:
2177 /* 3 or 4-component normalization */
2179 uint dims
= (inst
->Instruction
.Opcode
== TGSI_OPCODE_NRM
) ? 3 : 4;
2180 /* note: cannot use xmm regs 2/3 here (see emit_rsqrt() above) */
2181 FETCH( func
, *inst
, 4, 0, CHAN_X
); /* xmm4 = src[0].x */
2182 FETCH( func
, *inst
, 5, 0, CHAN_Y
); /* xmm5 = src[0].y */
2183 FETCH( func
, *inst
, 6, 0, CHAN_Z
); /* xmm6 = src[0].z */
2185 FETCH( func
, *inst
, 7, 0, CHAN_W
); /* xmm7 = src[0].w */
2187 emit_MOV( func
, 0, 4 ); /* xmm0 = xmm3 */
2188 emit_mul( func
, 0, 4 ); /* xmm0 *= xmm3 */
2189 emit_MOV( func
, 1, 5 ); /* xmm1 = xmm4 */
2190 emit_mul( func
, 1, 5 ); /* xmm1 *= xmm4 */
2191 emit_add( func
, 0, 1 ); /* xmm0 += xmm1 */
2192 emit_MOV( func
, 1, 6 ); /* xmm1 = xmm5 */
2193 emit_mul( func
, 1, 6 ); /* xmm1 *= xmm5 */
2194 emit_add( func
, 0, 1 ); /* xmm0 += xmm1 */
2196 emit_MOV( func
, 1, 7 ); /* xmm1 = xmm7 */
2197 emit_mul( func
, 1, 7 ); /* xmm1 *= xmm7 */
2198 emit_add( func
, 0, 0 ); /* xmm0 += xmm1 */
2200 emit_rsqrt( func
, 1, 0 ); /* xmm1 = 1/sqrt(xmm0) */
2201 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2202 if (chan_index
< dims
) {
2203 emit_mul( func
, 4+chan_index
, 1); /* xmm[4+ch] *= xmm1 */
2204 STORE( func
, *inst
, 4+chan_index
, 0, chan_index
);
case TGSI_OPCODE_DIV:
case TGSI_OPCODE_DP2:
   /* 2-component dot product: dst = src0.x*src1.x + src0.y*src1.y,
    * the scalar result replicated to every enabled channel */
   FETCH( func, *inst, 0, 0, CHAN_X );  /* xmm0 = src[0].x */
   FETCH( func, *inst, 1, 1, CHAN_X );  /* xmm1 = src[1].x */
   emit_mul( func, 0, 1 );              /* xmm0 = xmm0 * xmm1 */
   FETCH( func, *inst, 1, 0, CHAN_Y );  /* xmm1 = src[0].y */
   FETCH( func, *inst, 2, 1, CHAN_Y );  /* xmm2 = src[1].y */
   emit_mul( func, 1, 2 );              /* xmm1 = xmm1 * xmm2 */
   emit_add( func, 0, 1 );              /* xmm0 = xmm0 + xmm1 */
   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
      STORE( func, *inst, 0, 0, chan_index );  /* dest[ch] = xmm0 */
case TGSI_OPCODE_TXL:
case TGSI_OPCODE_BRK:
case TGSI_OPCODE_IF:
case TGSI_OPCODE_LOOP:
case TGSI_OPCODE_REP:
case TGSI_OPCODE_ELSE:
case TGSI_OPCODE_ENDIF:
case TGSI_OPCODE_ENDLOOP:
case TGSI_OPCODE_ENDREP:
case TGSI_OPCODE_PUSHA:
case TGSI_OPCODE_POPA:
case TGSI_OPCODE_CEIL:
case TGSI_OPCODE_I2F:
case TGSI_OPCODE_NOT:
case TGSI_OPCODE_TRUNC:
   /* dst.ch = trunc(src0.ch): the float -> int -> float round-trip
    * truncates toward zero */
   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
      FETCH( func, *inst, 0, 0, chan_index );
      emit_f2it( func, 0 );
      emit_i2f( func, 0 );
      STORE( func, *inst, 0, 0, chan_index );
case TGSI_OPCODE_SHL:
case TGSI_OPCODE_SHR:
case TGSI_OPCODE_AND:
case TGSI_OPCODE_OR:
case TGSI_OPCODE_MOD:
case TGSI_OPCODE_XOR:
case TGSI_OPCODE_SAD:
case TGSI_OPCODE_TXF:
case TGSI_OPCODE_TXQ:
case TGSI_OPCODE_CONT:
case TGSI_OPCODE_EMIT:
case TGSI_OPCODE_ENDPRIM:
struct x86_function *func,
struct tgsi_full_declaration *decl )
/* Fragment of emit_declaration(): emits per-fragment interpolation code
 * for one fragment-shader INPUT declaration.  For each declared register
 * i and each channel j enabled in the usage mask, computes the
 * interpolated input value into xmm0 and stores it via emit_inputs().
 * (The function's opening line and several scope braces are elided in
 * this view.) */
if( decl->Declaration.File == TGSI_FILE_INPUT ) {
   unsigned first, last, mask;

   first = decl->DeclarationRange.First;
   last = decl->DeclarationRange.Last;
   mask = decl->Declaration.UsageMask;

   for( i = first; i <= last; i++ ) {
      for( j = 0; j < NUM_CHANNELS; j++ ) {
         if( mask & (1 << j) ) {
            switch( decl->Declaration.Interpolate ) {
            case TGSI_INTERPOLATE_CONSTANT:
               /* flat: input = a0 coefficient only */
               emit_coef_a0( func, 0, i, j );
               emit_inputs( func, 0, i, j );
            case TGSI_INTERPOLATE_LINEAR:
               /* input = a0 + x*dadx + y*dady */
               emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
               emit_coef_dadx( func, 1, i, j );
               emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
               emit_coef_dady( func, 3, i, j );
               emit_mul( func, 0, 1 );    /* x * dadx */
               emit_coef_a0( func, 4, i, j );
               emit_mul( func, 2, 3 );    /* y * dady */
               emit_add( func, 0, 4 );    /* x * dadx + a0 */
               emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
               emit_inputs( func, 0, i, j );
            case TGSI_INTERPOLATE_PERSPECTIVE:
               /* input = (a0 + x*dadx + y*dady) / w */
               emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
               emit_coef_dadx( func, 1, i, j );
               emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
               emit_coef_dady( func, 3, i, j );
               emit_mul( func, 0, 1 );    /* x * dadx */
               emit_tempf( func, 4, 0, TGSI_SWIZZLE_W );
               emit_coef_a0( func, 5, i, j );
               emit_rcp( func, 4, 4 );    /* 1.0 / w */
               emit_mul( func, 2, 3 );    /* y * dady */
               emit_add( func, 0, 5 );    /* x * dadx + a0 */
               emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
               emit_mul( func, 0, 4 );    /* (x * dadx + y * dady + a0) / w */
               emit_inputs( func, 0, i, j );
static void aos_to_soa( struct x86_function *func,
/* Emit x86/SSE code that converts vertex attributes from AoS layout
 * (attributes interleaved per vertex, `arg_stride` bytes between
 * vertices) to SoA layout (x[4], y[4], z[4], w[4] per attribute).
 * Each loop iteration gathers one 4-float attribute from four vertices
 * and transposes the 4x4 float matrix with movlps/movhps + shufps.
 * (Remaining parameter declarations and scope braces are elided in this
 * view; arg_* are x86_fn_arg() argument indexes.) */
struct x86_reg soa_input = x86_make_reg( file_REG32, reg_AX );
struct x86_reg aos_input = x86_make_reg( file_REG32, reg_BX );
struct x86_reg num_inputs = x86_make_reg( file_REG32, reg_CX );
struct x86_reg stride = x86_make_reg( file_REG32, reg_DX );

/* EBX is callee-saved in the x86 ABI: preserve it around the loop */
x86_push( func, x86_make_reg( file_REG32, reg_BX ) );

x86_mov( func, aos_input, x86_fn_arg( func, arg_aos ) );
x86_mov( func, soa_input, x86_fn_arg( func, arg_soa ) );
x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) );
x86_mov( func, stride, x86_fn_arg( func, arg_stride ) );

inner_loop = x86_get_label( func );

/* gather the current attribute from 4 vertices:
 *   xmm0 = v0.xy | v1.xy     xmm3 = v0.zw | v1.zw
 *   xmm1 = v2.xy | v3.xy     xmm4 = v2.zw | v3.zw
 * (aos_input is saved/restored so the stride walk is temporary) */
x86_push( func, aos_input );
sse_movlps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
sse_movlps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
x86_add( func, aos_input, stride );
sse_movhps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
sse_movhps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
x86_add( func, aos_input, stride );
sse_movlps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
sse_movlps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
x86_add( func, aos_input, stride );
sse_movhps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
sse_movhps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
x86_pop( func, aos_input );

/* transpose: shufps 0x88 picks even lanes (x or z of each vertex),
 * 0xdd picks odd lanes (y or w) */
sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
sse_shufps( func, make_xmm( 0 ), make_xmm( 1 ), 0x88 );   /* x[4] */
sse_shufps( func, make_xmm( 2 ), make_xmm( 1 ), 0xdd );   /* y[4] */
sse_shufps( func, make_xmm( 3 ), make_xmm( 4 ), 0x88 );   /* z[4] */
sse_shufps( func, make_xmm( 5 ), make_xmm( 4 ), 0xdd );   /* w[4] */

/* store the SoA vectors contiguously (4 * 16 bytes per attribute) */
sse_movups( func, x86_make_disp( soa_input, 0 ), make_xmm( 0 ) );
sse_movups( func, x86_make_disp( soa_input, 16 ), make_xmm( 2 ) );
sse_movups( func, x86_make_disp( soa_input, 32 ), make_xmm( 3 ) );
sse_movups( func, x86_make_disp( soa_input, 48 ), make_xmm( 5 ) );

/* Advance to next input */
x86_lea( func, aos_input, x86_make_disp(aos_input, 16) );
x86_lea( func, soa_input, x86_make_disp(soa_input, 64) );

/* while --num_inputs */
x86_dec( func, num_inputs );
x86_jcc( func, cc_NE, inner_loop );

/* restore callee-saved EBX (pushed above as aos_input's register) */
x86_pop( func, aos_input );
static void soa_to_aos( struct x86_function *func, uint aos, uint soa, uint num, uint stride )
/* Emit x86/SSE code performing the inverse of aos_to_soa(): convert
 * `num` attributes from SoA layout (x[4], y[4], z[4], w[4]) back to AoS
 * layout with `stride` bytes between vertices.  The 4x4 transpose is
 * done with unpcklps/unpckhps.  aos/soa/num/stride are x86_fn_arg()
 * argument indexes, not values.
 * (Scope braces and the inner_loop declaration are elided in this
 * view.) */
struct x86_reg soa_output;
struct x86_reg aos_output;
struct x86_reg num_outputs;
struct x86_reg temp;

soa_output = x86_make_reg( file_REG32, reg_AX );
aos_output = x86_make_reg( file_REG32, reg_BX );
num_outputs = x86_make_reg( file_REG32, reg_CX );
temp = x86_make_reg( file_REG32, reg_DX );

/* EBX is callee-saved: preserve it around the loop */
x86_push( func, aos_output );

x86_mov( func, soa_output, x86_fn_arg( func, soa ) );
x86_mov( func, aos_output, x86_fn_arg( func, aos ) );
x86_mov( func, num_outputs, x86_fn_arg( func, num ) );

inner_loop = x86_get_label( func );

/* load one attribute's SoA vectors: xmm0 = x[4], xmm1 = y[4],
 * xmm3 = z[4], xmm4 = w[4] */
sse_movups( func, make_xmm( 0 ), x86_make_disp( soa_output, 0 ) );
sse_movups( func, make_xmm( 1 ), x86_make_disp( soa_output, 16 ) );
sse_movups( func, make_xmm( 3 ), x86_make_disp( soa_output, 32 ) );
sse_movups( func, make_xmm( 4 ), x86_make_disp( soa_output, 48 ) );

/* transpose via interleave:
 *   xmm0 = x0 y0 x1 y1   xmm2 = x2 y2 x3 y3
 *   xmm3 = z0 w0 z1 w1   xmm5 = z2 w2 z3 w3 */
sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
sse_unpcklps( func, make_xmm( 0 ), make_xmm( 1 ) );
sse_unpckhps( func, make_xmm( 2 ), make_xmm( 1 ) );
sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) );
sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) );

/* scatter one vertex's xyzw per step, walking by the vertex stride
 * (aos_output is saved/restored so the walk is temporary) */
x86_mov( func, temp, x86_fn_arg( func, stride ) );
x86_push( func, aos_output );
sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );   /* v0.xy */
sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );   /* v0.zw */
x86_add( func, aos_output, temp );
sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );   /* v1.xy */
sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );   /* v1.zw */
x86_add( func, aos_output, temp );
sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );   /* v2.xy */
sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );   /* v2.zw */
x86_add( func, aos_output, temp );
sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );   /* v3.xy */
sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );   /* v3.zw */
x86_pop( func, aos_output );

/* Advance to next output */
x86_lea( func, aos_output, x86_make_disp(aos_output, 16) );
x86_lea( func, soa_output, x86_make_disp(soa_output, 64) );

/* while --num_outputs */
x86_dec( func, num_outputs );
x86_jcc( func, cc_NE, inner_loop );

/* restore callee-saved EBX */
x86_pop( func, aos_output );
2533 * Translate a TGSI vertex/fragment shader to SSE2 code.
2534 * Slightly different things are done for vertex vs. fragment shaders.
2536 * Note that fragment shaders are responsible for interpolating shader
2537 * inputs. Because on x86 we have only 4 GP registers, and here we
2538 * have 5 shader arguments (input, output, const, temp and coef), the
2539 * code is split into two phases -- DECLARATION and INSTRUCTION phase.
2540 * GP register holding the output argument is aliased with the coeff
2541 * argument, as outputs are not needed in the DECLARATION phase.
2543 * \param tokens the TGSI input shader
2544 * \param func the output SSE code/function
2545 * \param immediates buffer to place immediates, later passed to SSE func
2546 * \return 1 for success, 0 if translation failed
const struct tgsi_token *tokens,
struct x86_function *func,
float (*immediates )[4],
boolean do_swizzles )
/* Fragment of tgsi_emit_sse2(): walks the parsed TGSI token stream and
 * emits SSE code per token.  (The function's opening, several x86_mov
 * openers for base-register setup, and most closing braces are elided
 * in this view — the dangling "x86_fn_arg( func, N ) );" lines below
 * are the tails of those elided x86_mov() calls.) */
struct tgsi_parse_context parse;
boolean instruction_phase = FALSE;      /* fragment shaders: FALSE until first instruction */

uint num_immediates = 0;

/* start emitting at the beginning of the code buffer */
func->csr = func->store;

tgsi_parse_init( &parse, tokens );

/* Can't just use EDI, EBX without save/restoring them: */
get_immediate_base() );

/* Different function args for vertex/fragment shaders: */
if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
   /* DECLARATION phase, do not load output argument. */
   x86_fn_arg( func, 1 ) );
   /* skipping outputs argument here */
   x86_fn_arg( func, 3 ) );
   x86_fn_arg( func, 4 ) );
   x86_fn_arg( func, 5 ) );
   get_immediate_base(),
   x86_fn_arg( func, 6 ) );
assert(parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX);
/* vertex shader: de-interleave the input vertices first
 * (arguments below are x86_fn_arg indexes, not values) */
1, /* machine->input */
8 ); /* input_stride */
x86_fn_arg( func, 1 ) );
x86_fn_arg( func, 2 ) );
x86_fn_arg( func, 3 ) );
x86_fn_arg( func, 4 ) );
get_immediate_base(),
x86_fn_arg( func, 5 ) );

/* main translation loop: one TGSI token at a time, stopping early if
 * an instruction fails to translate */
while( !tgsi_parse_end_of_tokens( &parse ) && ok ) {
   tgsi_parse_token( &parse );

   switch( parse.FullToken.Token.Type ) {
   case TGSI_TOKEN_TYPE_DECLARATION:
      /* only fragment shaders emit code for declarations (input
       * interpolation); vertex shader declarations are no-ops here */
      if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
         &parse.FullToken.FullDeclaration );
   case TGSI_TOKEN_TYPE_INSTRUCTION:
      if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
         if( !instruction_phase ) {
            /* INSTRUCTION phase, overwrite coeff with output. */
            instruction_phase = TRUE;
            x86_fn_arg( func, 2 ) );
      ok = emit_instruction(
         &parse.FullToken.FullInstruction );
      /* on failure, report which opcode could not be translated */
      debug_printf("failed to translate tgsi opcode %d to SSE (%s)\n",
                   parse.FullToken.FullInstruction.Instruction.Opcode,
                   parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX ?
                   "vertex shader" : "fragment shader");
   case TGSI_TOKEN_TYPE_IMMEDIATE:
      /* simply copy the immediate values into the next immediates[] slot */
      const uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
      assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES);
      for( i = 0; i < size; i++ ) {
         immediates[num_immediates][i] =
            parse.FullToken.FullImmediate.u.ImmediateFloat32[i].Float;
      debug_printf("SSE FS immediate[%d] = %f %f %f %f\n",
                   immediates[num_immediates][0],
                   immediates[num_immediates][1],
                   immediates[num_immediates][2],
                   immediates[num_immediates][3]);
/* vertex shaders: re-interleave the SoA results back to AoS
 * (9/2/10/11 are x86_fn_arg indexes for aos/soa/num/stride) */
if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
   soa_to_aos( func, 9, 2, 10, 11 );
/* Can't just use EBX, EDI without save/restoring them: */
get_immediate_base() );
tgsi_parse_free( &parse );