1 /**************************************************************************
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **************************************************************************/
28 #include "pipe/p_config.h"
30 #if defined(PIPE_ARCH_X86) && defined(PIPE_ARCH_SSE)
32 #include "pipe/p_debug.h"
33 #include "pipe/p_shader_tokens.h"
34 #include "util/u_math.h"
35 #include "util/u_sse.h"
36 #include "tgsi/tgsi_parse.h"
37 #include "tgsi/tgsi_util.h"
38 #include "tgsi_exec.h"
39 #include "tgsi_sse2.h"
41 #include "rtasm/rtasm_x86sse.h"
45 * This costs about 100fps (close to 10%) in gears:
47 #define HIGH_PRECISION 1
/* Iterate CHAN over all four SOA channels (X, Y, Z, W) of a register. */
#define FOR_EACH_CHANNEL( CHAN )\
   for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)

/* Non-zero iff channel CHAN is enabled in instruction INST's dst[0] writemask. */
#define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
   ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))

/* Guard the following statement on dst[0]'s writemask for CHAN. */
#define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
   if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))

/* Iterate CHAN over only those channels enabled in dst[0]'s writemask. */
#define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
   FOR_EACH_CHANNEL( CHAN )\
   IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )
/* Short aliases for the tgsi_exec machine's reserved temporaries.
 * The _I suffix names the temp-register index, _C the channel within it.
 * TEMP_ONE holds the constant 1.0 (see its use for EXP/LOG dst.w below);
 * TEMP_R0 is general scratch, TEMP_ADDR the address register, and
 * TEMP_EXEC_MASK the per-channel execution mask (0 or ~0 per pixel/vertex).
 */
#define TEMP_ONE_I TGSI_EXEC_TEMP_ONE_I
#define TEMP_ONE_C TGSI_EXEC_TEMP_ONE_C
#define TEMP_R0 TGSI_EXEC_TEMP_R0
#define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
#define TEMP_EXEC_MASK_I TGSI_EXEC_MASK_I
#define TEMP_EXEC_MASK_C TGSI_EXEC_MASK_C
80 * X86 utility functions.
89 (enum x86_reg_name
) xmm
);
93 * X86 register mapping helpers.
97 get_const_base( void )
104 static struct x86_reg
105 get_input_base( void )
112 static struct x86_reg
113 get_output_base( void )
120 static struct x86_reg
121 get_temp_base( void )
128 static struct x86_reg
129 get_coef_base( void )
131 return get_output_base();
134 static struct x86_reg
135 get_immediate_base( void )
144 * Data access helpers.
148 static struct x86_reg
153 return x86_make_disp(
154 get_immediate_base(),
155 (vec
* 4 + chan
) * 4 );
158 static struct x86_reg
163 return x86_make_disp(
165 (vec
* 4 + chan
) * 4 );
168 static struct x86_reg
173 return x86_make_disp(
175 (vec
* 4 + chan
) * 16 );
178 static struct x86_reg
183 return x86_make_disp(
185 (vec
* 4 + chan
) * 16 );
188 static struct x86_reg
193 return x86_make_disp(
195 (vec
* 4 + chan
) * 16 );
198 static struct x86_reg
204 return x86_make_disp(
206 ((vec
* 3 + member
) * 4 + chan
) * 4 );
212 struct x86_function
*func
)
219 * Data fetch helpers.
223 * Copy a shader constant to xmm register
224 * \param xmm the destination xmm register
225 * \param vec the src const buffer index
226 * \param chan src channel to fetch (X, Y, Z or W)
230 struct x86_function
*func
,
239 /* 'vec' is the offset from the address register's value.
240 * We're loading CONST[ADDR+vec] into an xmm register.
242 struct x86_reg r0
= get_input_base();
243 struct x86_reg r1
= get_output_base();
246 assert( indirectFile
== TGSI_FILE_ADDRESS
);
247 assert( indirectIndex
== 0 );
249 x86_push( func
, r0
);
250 x86_push( func
, r1
);
253 * Loop over the four pixels or vertices in the quad.
254 * Get the value of the address (offset) register for pixel/vertex[i],
255 * add it to the src offset and index into the constant buffer.
256 * Note that we're working on SOA data.
257 * If any of the pixel/vertex execution channels are unused their
258 * values will be garbage. It's very important that we don't use
259 * those garbage values as indexes into the constant buffer since
260 * that'll cause segfaults.
261 * The solution is to bitwise-AND the offset with the execution mask
262 * register whose values are either 0 or ~0.
263 * The caller must setup the execution mask register to indicate
264 * which channels are valid/alive before running the shader.
265 * The execution mask will also figure into loops and conditionals
268 for (i
= 0; i
< QUAD_SIZE
; i
++) {
269 /* r1 = address register[i] */
270 x86_mov( func
, r1
, x86_make_disp( get_temp( TEMP_ADDR
, CHAN_X
), i
* 4 ) );
271 /* r0 = execution mask[i] */
272 x86_mov( func
, r0
, x86_make_disp( get_temp( TEMP_EXEC_MASK_I
, TEMP_EXEC_MASK_C
), i
* 4 ) );
274 x86_and( func
, r1
, r0
);
275 /* r0 = 'vec', the offset */
276 x86_lea( func
, r0
, get_const( vec
, chan
) );
278 /* Quick hack to multiply r1 by 16 -- need to add SHL to rtasm.
280 x86_add( func
, r1
, r1
);
281 x86_add( func
, r1
, r1
);
282 x86_add( func
, r1
, r1
);
283 x86_add( func
, r1
, r1
);
285 x86_add( func
, r0
, r1
); /* r0 = r0 + r1 */
286 x86_mov( func
, r1
, x86_deref( r0
) );
287 x86_mov( func
, x86_make_disp( get_temp( TEMP_R0
, CHAN_X
), i
* 4 ), r1
);
296 get_temp( TEMP_R0
, CHAN_X
) );
299 /* 'vec' is the index into the src register file, such as TEMP[vec] */
305 get_const( vec
, chan
) );
310 SHUF( 0, 0, 0, 0 ) );
316 struct x86_function
*func
,
324 get_immediate( vec
, chan
) );
329 SHUF( 0, 0, 0, 0 ) );
334 * Copy a shader input to xmm register
335 * \param xmm the destination xmm register
336 * \param vec the src input attrib
337 * \param chan src channel to fetch (X, Y, Z or W)
341 struct x86_function
*func
,
349 get_input( vec
, chan
) );
353 * Store an xmm register to a shader output
354 * \param xmm the source xmm register
355 * \param vec the dest output attrib
356 * \param chan src dest channel to store (X, Y, Z or W)
360 struct x86_function
*func
,
367 get_output( vec
, chan
),
372 * Copy a shader temporary to xmm register
373 * \param xmm the destination xmm register
374 * \param vec the src temp register
375 * \param chan src channel to fetch (X, Y, Z or W)
379 struct x86_function
*func
,
387 get_temp( vec
, chan
) );
391 * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
392 * \param xmm the destination xmm register
393 * \param vec the src input/attribute coefficient index
394 * \param chan src channel to fetch (X, Y, Z or W)
395 * \param member 0=a0, 1=dadx, 2=dady
399 struct x86_function
*func
,
408 get_coef( vec
, chan
, member
) );
413 SHUF( 0, 0, 0, 0 ) );
417 * Data store helpers.
422 struct x86_function
*func
,
429 get_input( vec
, chan
),
435 struct x86_function
*func
,
442 get_temp( vec
, chan
),
448 struct x86_function
*func
,
458 vec
+ TGSI_EXEC_TEMP_ADDR
,
463 * Coefficent fetch helpers.
468 struct x86_function
*func
,
483 struct x86_function
*func
,
498 struct x86_function
*func
,
512 * Function call helpers.
516 * NOTE: In gcc, if the destination uses the SSE intrinsics, then it must be
517 * defined with __attribute__((force_align_arg_pointer)), as we do not guarantee
518 * that the stack pointer is 16 byte aligned, as expected.
522 struct x86_function
*func
,
525 void (PIPE_CDECL
*code
)() )
527 struct x86_reg ecx
= x86_make_reg( file_REG32
, reg_CX
);
531 /* Bitmask of the xmm registers to save */
532 xmm_mask
= (1 << xmm_save
) - 1;
533 xmm_mask
&= ~(1 << xmm_dst
);
537 get_temp( TEMP_R0
, 0 ),
538 make_xmm( xmm_dst
) );
542 x86_make_reg( file_REG32
, reg_AX
) );
545 x86_make_reg( file_REG32
, reg_CX
) );
548 x86_make_reg( file_REG32
, reg_DX
) );
550 for(i
= 0, n
= 0; i
< 8; ++i
)
551 if(xmm_mask
& (1 << i
))
556 x86_make_reg( file_REG32
, reg_SP
),
559 for(i
= 0, n
= 0; i
< 8; ++i
)
560 if(xmm_mask
& (1 << i
)) {
563 x86_make_disp( x86_make_reg( file_REG32
, reg_SP
), n
*16 ),
571 get_temp( TEMP_R0
, 0 ) );
573 x86_push( func
, ecx
);
574 x86_mov_reg_imm( func
, ecx
, (unsigned long) code
);
575 x86_call( func
, ecx
);
578 for(i
= 0, n
= 0; i
< 8; ++i
)
579 if(xmm_mask
& (1 << i
)) {
583 x86_make_disp( x86_make_reg( file_REG32
, reg_SP
), n
*16 ) );
589 x86_make_reg( file_REG32
, reg_SP
),
592 /* Restore GP registers in a reverse order.
596 x86_make_reg( file_REG32
, reg_DX
) );
599 x86_make_reg( file_REG32
, reg_CX
) );
602 x86_make_reg( file_REG32
, reg_AX
) );
607 get_temp( TEMP_R0
, 0 ) );
611 emit_func_call_dst_src(
612 struct x86_function
*func
,
616 void (PIPE_CDECL
*code
)() )
620 get_temp( TEMP_R0
, 1 ),
621 make_xmm( xmm_src
) );
631 * Fast SSE2 implementation of special math functions.
/* Horner-scheme polynomial evaluation on an SSE float vector x:
 * POLYn(x, c0, ..., cn) computes c0 + c1*x + ... + cn*x^n.
 * NOTE(review): these macros expand `x` multiple times, so the argument
 * must be a side-effect-free variable (it is at all call sites below).
 */
#define POLY0(x, c0) _mm_set1_ps(c0)
#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))

/* Polynomial degrees used by exp2f4()/log2f4(): higher = more accurate, slower. */
#define EXP_POLY_DEGREE 3
#define LOG_POLY_DEGREE 5
645 * See http://www.devmaster.net/forums/showthread.php?p=43580
651 __m128 fpart
, expipart
, expfpart
;
653 x
= _mm_min_ps(x
, _mm_set1_ps( 129.00000f
));
654 x
= _mm_max_ps(x
, _mm_set1_ps(-126.99999f
));
656 /* ipart = int(x - 0.5) */
657 ipart
= _mm_cvtps_epi32(_mm_sub_ps(x
, _mm_set1_ps(0.5f
)));
659 /* fpart = x - ipart */
660 fpart
= _mm_sub_ps(x
, _mm_cvtepi32_ps(ipart
));
662 /* expipart = (float) (1 << ipart) */
663 expipart
= _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart
, _mm_set1_epi32(127)), 23));
665 /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
666 #if EXP_POLY_DEGREE == 5
667 expfpart
= POLY5(fpart
, 9.9999994e-1f
, 6.9315308e-1f
, 2.4015361e-1f
, 5.5826318e-2f
, 8.9893397e-3f
, 1.8775767e-3f
);
668 #elif EXP_POLY_DEGREE == 4
669 expfpart
= POLY4(fpart
, 1.0000026f
, 6.9300383e-1f
, 2.4144275e-1f
, 5.2011464e-2f
, 1.3534167e-2f
);
670 #elif EXP_POLY_DEGREE == 3
671 expfpart
= POLY3(fpart
, 9.9992520e-1f
, 6.9583356e-1f
, 2.2606716e-1f
, 7.8024521e-2f
);
672 #elif EXP_POLY_DEGREE == 2
673 expfpart
= POLY2(fpart
, 1.0017247f
, 6.5763628e-1f
, 3.3718944e-1f
);
678 return _mm_mul_ps(expipart
, expfpart
);
682 * See http://www.devmaster.net/forums/showthread.php?p=43580
687 __m128i expmask
= _mm_set1_epi32(0x7f800000);
688 __m128i mantmask
= _mm_set1_epi32(0x007fffff);
689 __m128 one
= _mm_set1_ps(1.0f
);
691 __m128i i
= _mm_castps_si128(x
);
693 /* exp = (float) exponent(x) */
694 __m128 exp
= _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i
, expmask
), 23), _mm_set1_epi32(127)));
696 /* mant = (float) mantissa(x) */
697 __m128 mant
= _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i
, mantmask
)), one
);
701 /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
702 * These coefficients can be generate with
703 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
705 #if LOG_POLY_DEGREE == 6
706 logmant
= POLY5(mant
, 3.11578814719469302614f
, -3.32419399085241980044f
, 2.59883907202499966007f
, -1.23152682416275988241f
, 0.318212422185251071475f
, -0.0344359067839062357313f
);
707 #elif LOG_POLY_DEGREE == 5
708 logmant
= POLY4(mant
, 2.8882704548164776201f
, -2.52074962577807006663f
, 1.48116647521213171641f
, -0.465725644288844778798f
, 0.0596515482674574969533f
);
709 #elif LOG_POLY_DEGREE == 4
710 logmant
= POLY3(mant
, 2.61761038894603480148f
, -1.75647175389045657003f
, 0.688243882994381274313f
, -0.107254423828329604454f
);
711 #elif LOG_POLY_DEGREE == 3
712 logmant
= POLY2(mant
, 2.28330284476918490682f
, -1.04913055217340124191f
, 0.204446009836232697516f
);
717 /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
718 logmant
= _mm_mul_ps(logmant
, _mm_sub_ps(mant
, one
));
720 return _mm_add_ps(logmant
, exp
);
724 powf4(__m128 x
, __m128 y
)
726 return exp2f4(_mm_mul_ps(log2f4(x
), y
));
731 * Low-level instruction translators.
736 struct x86_function
*func
,
743 TGSI_EXEC_TEMP_7FFFFFFF_I
,
744 TGSI_EXEC_TEMP_7FFFFFFF_C
) );
749 struct x86_function
*func
,
756 make_xmm( xmm_src
) );
759 static void PIPE_CDECL
763 store
[0] = cosf( store
[0] );
764 store
[1] = cosf( store
[1] );
765 store
[2] = cosf( store
[2] );
766 store
[3] = cosf( store
[3] );
771 struct x86_function
*func
,
782 static void PIPE_CDECL
783 #if defined(PIPE_CC_GCC)
784 __attribute__((force_align_arg_pointer
))
789 _mm_store_ps(&store
[0], exp2f4( _mm_load_ps(&store
[0]) ));
794 struct x86_function
*func
,
807 struct x86_function
*func
,
818 struct x86_function
*func
,
827 static void PIPE_CDECL
831 store
[0] = floorf( store
[0] );
832 store
[1] = floorf( store
[1] );
833 store
[2] = floorf( store
[2] );
834 store
[3] = floorf( store
[3] );
839 struct x86_function
*func
,
850 static void PIPE_CDECL
854 store
[0] -= floorf( store
[0] );
855 store
[1] -= floorf( store
[1] );
856 store
[2] -= floorf( store
[2] );
857 store
[3] -= floorf( store
[3] );
862 struct x86_function
*func
,
873 static void PIPE_CDECL
874 #if defined(PIPE_CC_GCC)
875 __attribute__((force_align_arg_pointer
))
880 _mm_store_ps(&store
[0], log2f4( _mm_load_ps(&store
[0]) ));
885 struct x86_function
*func
,
898 struct x86_function
*func
,
905 make_xmm( xmm_src
) );
909 emit_mul (struct x86_function
*func
,
916 make_xmm( xmm_src
) );
921 struct x86_function
*func
,
928 TGSI_EXEC_TEMP_80000000_I
,
929 TGSI_EXEC_TEMP_80000000_C
) );
932 static void PIPE_CDECL
933 #if defined(PIPE_CC_GCC)
934 __attribute__((force_align_arg_pointer
))
940 _mm_store_ps(&store
[0], powf4( _mm_load_ps(&store
[0]), _mm_load_ps(&store
[4]) ));
942 store
[0] = powf( store
[0], store
[4] );
943 store
[1] = powf( store
[1], store
[5] );
944 store
[2] = powf( store
[2], store
[6] );
945 store
[3] = powf( store
[3], store
[7] );
951 struct x86_function
*func
,
956 emit_func_call_dst_src(
966 struct x86_function
*func
,
970 /* On Intel CPUs at least, this is only accurate to 12 bits -- not
971 * good enough. Need to either emit a proper divide or use the
972 * iterative technique described below in emit_rsqrt().
977 make_xmm( xmm_src
) );
982 struct x86_function
*func
,
987 /* Although rsqrtps() and rcpps() are low precision on some/all SSE
988 * implementations, it is possible to improve its precision at
989 * fairly low cost, using a newton/raphson step, as below:
991 * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
992 * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
994 * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
997 struct x86_reg dst
= make_xmm( xmm_dst
);
998 struct x86_reg src
= make_xmm( xmm_src
);
999 struct x86_reg tmp0
= make_xmm( 2 );
1000 struct x86_reg tmp1
= make_xmm( 3 );
1002 assert( xmm_dst
!= xmm_src
);
1003 assert( xmm_dst
!= 2 && xmm_dst
!= 3 );
1004 assert( xmm_src
!= 2 && xmm_src
!= 3 );
1006 sse_movaps( func
, dst
, get_temp( TGSI_EXEC_TEMP_HALF_I
, TGSI_EXEC_TEMP_HALF_C
) );
1007 sse_movaps( func
, tmp0
, get_temp( TGSI_EXEC_TEMP_THREE_I
, TGSI_EXEC_TEMP_THREE_C
) );
1008 sse_rsqrtps( func
, tmp1
, src
);
1009 sse_mulps( func
, src
, tmp1
);
1010 sse_mulps( func
, dst
, tmp1
);
1011 sse_mulps( func
, src
, tmp1
);
1012 sse_subps( func
, tmp0
, src
);
1013 sse_mulps( func
, dst
, tmp0
);
1016 /* On Intel CPUs at least, this is only accurate to 12 bits -- not
1021 make_xmm( xmm_dst
),
1022 make_xmm( xmm_src
) );
1028 struct x86_function
*func
,
1035 TGSI_EXEC_TEMP_80000000_I
,
1036 TGSI_EXEC_TEMP_80000000_C
) );
1039 static void PIPE_CDECL
1043 store
[0] = sinf( store
[0] );
1044 store
[1] = sinf( store
[1] );
1045 store
[2] = sinf( store
[2] );
1046 store
[3] = sinf( store
[3] );
1050 emit_sin (struct x86_function
*func
,
1063 struct x86_function
*func
,
1069 make_xmm( xmm_dst
),
1070 make_xmm( xmm_src
) );
1079 struct x86_function
*func
,
1081 const struct tgsi_full_src_register
*reg
,
1082 const unsigned chan_index
)
1084 unsigned swizzle
= tgsi_util_get_full_src_register_extswizzle( reg
, chan_index
);
1087 case TGSI_EXTSWIZZLE_X
:
1088 case TGSI_EXTSWIZZLE_Y
:
1089 case TGSI_EXTSWIZZLE_Z
:
1090 case TGSI_EXTSWIZZLE_W
:
1091 switch (reg
->SrcRegister
.File
) {
1092 case TGSI_FILE_CONSTANT
:
1096 reg
->SrcRegister
.Index
,
1098 reg
->SrcRegister
.Indirect
,
1099 reg
->SrcRegisterInd
.File
,
1100 reg
->SrcRegisterInd
.Index
);
1103 case TGSI_FILE_IMMEDIATE
:
1107 reg
->SrcRegister
.Index
,
1111 case TGSI_FILE_INPUT
:
1115 reg
->SrcRegister
.Index
,
1119 case TGSI_FILE_TEMPORARY
:
1123 reg
->SrcRegister
.Index
,
1132 case TGSI_EXTSWIZZLE_ZERO
:
1136 TGSI_EXEC_TEMP_00000000_I
,
1137 TGSI_EXEC_TEMP_00000000_C
);
1140 case TGSI_EXTSWIZZLE_ONE
:
1152 switch( tgsi_util_get_full_src_register_sign_mode( reg
, chan_index
) ) {
1153 case TGSI_UTIL_SIGN_CLEAR
:
1154 emit_abs( func
, xmm
);
1157 case TGSI_UTIL_SIGN_SET
:
1158 emit_setsign( func
, xmm
);
1161 case TGSI_UTIL_SIGN_TOGGLE
:
1162 emit_neg( func
, xmm
);
1165 case TGSI_UTIL_SIGN_KEEP
:
1170 #define FETCH( FUNC, INST, XMM, INDEX, CHAN )\
1171 emit_fetch( FUNC, XMM, &(INST).FullSrcRegisters[INDEX], CHAN )
1179 struct x86_function
*func
,
1181 const struct tgsi_full_dst_register
*reg
,
1182 const struct tgsi_full_instruction
*inst
,
1183 unsigned chan_index
)
1185 switch( reg
->DstRegister
.File
) {
1186 case TGSI_FILE_OUTPUT
:
1190 reg
->DstRegister
.Index
,
1194 case TGSI_FILE_TEMPORARY
:
1198 reg
->DstRegister
.Index
,
1202 case TGSI_FILE_ADDRESS
:
1206 reg
->DstRegister
.Index
,
1214 switch( inst
->Instruction
.Saturate
) {
1218 case TGSI_SAT_ZERO_ONE
:
1222 case TGSI_SAT_MINUS_PLUS_ONE
:
1228 #define STORE( FUNC, INST, XMM, INDEX, CHAN )\
1229 emit_store( FUNC, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN )
1232 * High-level instruction translators.
1237 struct x86_function
*func
,
1238 const struct tgsi_full_src_register
*reg
)
1240 unsigned uniquemask
;
1241 unsigned registers
[4];
1242 unsigned nextregister
= 0;
1243 unsigned firstchan
= ~0;
1244 unsigned chan_index
;
1246 /* This mask stores component bits that were already tested. Note that
1247 * we test if the value is less than zero, so 1.0 and 0.0 need not to be
1249 uniquemask
= (1 << TGSI_EXTSWIZZLE_ZERO
) | (1 << TGSI_EXTSWIZZLE_ONE
);
1251 FOR_EACH_CHANNEL( chan_index
) {
1254 /* unswizzle channel */
1255 swizzle
= tgsi_util_get_full_src_register_extswizzle(
1259 /* check if the component has not been already tested */
1260 if( !(uniquemask
& (1 << swizzle
)) ) {
1261 uniquemask
|= 1 << swizzle
;
1263 /* allocate register */
1264 registers
[chan_index
] = nextregister
;
1272 /* mark the first channel used */
1273 if( firstchan
== ~0 ) {
1274 firstchan
= chan_index
;
1281 x86_make_reg( file_REG32
, reg_AX
) );
1284 x86_make_reg( file_REG32
, reg_DX
) );
1286 FOR_EACH_CHANNEL( chan_index
) {
1287 if( uniquemask
& (1 << chan_index
) ) {
1290 make_xmm( registers
[chan_index
] ),
1292 TGSI_EXEC_TEMP_00000000_I
,
1293 TGSI_EXEC_TEMP_00000000_C
),
1296 if( chan_index
== firstchan
) {
1299 x86_make_reg( file_REG32
, reg_AX
),
1300 make_xmm( registers
[chan_index
] ) );
1305 x86_make_reg( file_REG32
, reg_DX
),
1306 make_xmm( registers
[chan_index
] ) );
1309 x86_make_reg( file_REG32
, reg_AX
),
1310 x86_make_reg( file_REG32
, reg_DX
) );
1318 TGSI_EXEC_TEMP_KILMASK_I
,
1319 TGSI_EXEC_TEMP_KILMASK_C
),
1320 x86_make_reg( file_REG32
, reg_AX
) );
1324 x86_make_reg( file_REG32
, reg_DX
) );
1327 x86_make_reg( file_REG32
, reg_AX
) );
1333 struct x86_function
*func
)
1335 /* XXX todo / fix me */
1341 struct x86_function
*func
,
1342 struct tgsi_full_instruction
*inst
,
1345 unsigned chan_index
;
1347 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1348 FETCH( func
, *inst
, 0, 0, chan_index
);
1349 FETCH( func
, *inst
, 1, 1, chan_index
);
1361 STORE( func
, *inst
, 0, 0, chan_index
);
1367 struct x86_function
*func
,
1368 struct tgsi_full_instruction
*inst
)
1370 unsigned chan_index
;
1372 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1373 FETCH( func
, *inst
, 0, 0, chan_index
);
1374 FETCH( func
, *inst
, 1, 1, chan_index
);
1375 FETCH( func
, *inst
, 2, 2, chan_index
);
1380 TGSI_EXEC_TEMP_00000000_I
,
1381 TGSI_EXEC_TEMP_00000000_C
),
1395 STORE( func
, *inst
, 0, 0, chan_index
);
1401 struct x86_function
*func
,
1402 struct tgsi_full_instruction
*inst
)
1404 unsigned chan_index
;
1406 switch (inst
->Instruction
.Opcode
) {
1407 case TGSI_OPCODE_ARL
:
1408 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1409 FETCH( func
, *inst
, 0, 0, chan_index
);
1410 emit_f2it( func
, 0 );
1411 STORE( func
, *inst
, 0, 0, chan_index
);
1415 case TGSI_OPCODE_MOV
:
1416 case TGSI_OPCODE_SWZ
:
1417 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1418 FETCH( func
, *inst
, 0, 0, chan_index
);
1419 STORE( func
, *inst
, 0, 0, chan_index
);
1423 case TGSI_OPCODE_LIT
:
1424 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1425 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) ) {
1431 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ) {
1432 STORE( func
, *inst
, 0, 0, CHAN_X
);
1434 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) ) {
1435 STORE( func
, *inst
, 0, 0, CHAN_W
);
1438 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ||
1439 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
1440 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ) {
1441 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1446 TGSI_EXEC_TEMP_00000000_I
,
1447 TGSI_EXEC_TEMP_00000000_C
) );
1448 STORE( func
, *inst
, 0, 0, CHAN_Y
);
1450 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
1451 /* XMM[1] = SrcReg[0].yyyy */
1452 FETCH( func
, *inst
, 1, 0, CHAN_Y
);
1453 /* XMM[1] = max(XMM[1], 0) */
1458 TGSI_EXEC_TEMP_00000000_I
,
1459 TGSI_EXEC_TEMP_00000000_C
) );
1460 /* XMM[2] = SrcReg[0].wwww */
1461 FETCH( func
, *inst
, 2, 0, CHAN_W
);
1462 /* XMM[2] = min(XMM[2], 128.0) */
1467 TGSI_EXEC_TEMP_128_I
,
1468 TGSI_EXEC_TEMP_128_C
) );
1469 /* XMM[2] = max(XMM[2], -128.0) */
1474 TGSI_EXEC_TEMP_MINUS_128_I
,
1475 TGSI_EXEC_TEMP_MINUS_128_C
) );
1476 emit_pow( func
, 3, 1, 2 );
1477 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1491 STORE( func
, *inst
, 2, 0, CHAN_Z
);
1496 case TGSI_OPCODE_RCP
:
1497 /* TGSI_OPCODE_RECIP */
1498 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1499 emit_rcp( func
, 0, 0 );
1500 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1501 STORE( func
, *inst
, 0, 0, chan_index
);
1505 case TGSI_OPCODE_RSQ
:
1506 /* TGSI_OPCODE_RECIPSQRT */
1507 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1508 emit_rsqrt( func
, 1, 0 );
1509 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1510 STORE( func
, *inst
, 1, 0, chan_index
);
1514 case TGSI_OPCODE_EXP
:
1515 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1516 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ||
1517 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
1518 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1519 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1520 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
1521 emit_MOV( func
, 1, 0 );
1522 emit_flr( func
, 2, 1 );
1523 /* dst.x = ex2(floor(src.x)) */
1524 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
)) {
1525 emit_MOV( func
, 2, 1 );
1526 emit_ex2( func
, 3, 2 );
1527 STORE( func
, *inst
, 2, 0, CHAN_X
);
1529 /* dst.y = src.x - floor(src.x) */
1530 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
1531 emit_MOV( func
, 2, 0 );
1532 emit_sub( func
, 2, 1 );
1533 STORE( func
, *inst
, 2, 0, CHAN_Y
);
1536 /* dst.z = ex2(src.x) */
1537 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
1538 emit_ex2( func
, 3, 0 );
1539 STORE( func
, *inst
, 0, 0, CHAN_Z
);
1543 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
)) {
1544 emit_tempf( func
, 0, TEMP_ONE_I
, TEMP_ONE_C
);
1545 STORE( func
, *inst
, 0, 0, CHAN_W
);
1549 case TGSI_OPCODE_LOG
:
1550 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1551 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ||
1552 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
1553 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1554 emit_abs( func
, 0 );
1555 emit_MOV( func
, 1, 0 );
1556 emit_lg2( func
, 2, 1 );
1557 /* dst.z = lg2(abs(src.x)) */
1558 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
1559 STORE( func
, *inst
, 1, 0, CHAN_Z
);
1561 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1562 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
1563 emit_flr( func
, 2, 1 );
1564 /* dst.x = floor(lg2(abs(src.x))) */
1565 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
)) {
1566 STORE( func
, *inst
, 1, 0, CHAN_X
);
1568 /* dst.x = abs(src)/ex2(floor(lg2(abs(src.x)))) */
1569 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
1570 emit_ex2( func
, 2, 1 );
1571 emit_rcp( func
, 1, 1 );
1572 emit_mul( func
, 0, 1 );
1573 STORE( func
, *inst
, 0, 0, CHAN_Y
);
1578 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
)) {
1579 emit_tempf( func
, 0, TEMP_ONE_I
, TEMP_ONE_C
);
1580 STORE( func
, *inst
, 0, 0, CHAN_W
);
1584 case TGSI_OPCODE_MUL
:
1585 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1586 FETCH( func
, *inst
, 0, 0, chan_index
);
1587 FETCH( func
, *inst
, 1, 1, chan_index
);
1588 emit_mul( func
, 0, 1 );
1589 STORE( func
, *inst
, 0, 0, chan_index
);
1593 case TGSI_OPCODE_ADD
:
1594 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1595 FETCH( func
, *inst
, 0, 0, chan_index
);
1596 FETCH( func
, *inst
, 1, 1, chan_index
);
1597 emit_add( func
, 0, 1 );
1598 STORE( func
, *inst
, 0, 0, chan_index
);
1602 case TGSI_OPCODE_DP3
:
1603 /* TGSI_OPCODE_DOT3 */
1604 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1605 FETCH( func
, *inst
, 1, 1, CHAN_X
);
1606 emit_mul( func
, 0, 1 );
1607 FETCH( func
, *inst
, 1, 0, CHAN_Y
);
1608 FETCH( func
, *inst
, 2, 1, CHAN_Y
);
1609 emit_mul( func
, 1, 2 );
1610 emit_add( func
, 0, 1 );
1611 FETCH( func
, *inst
, 1, 0, CHAN_Z
);
1612 FETCH( func
, *inst
, 2, 1, CHAN_Z
);
1613 emit_mul( func
, 1, 2 );
1614 emit_add( func
, 0, 1 );
1615 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1616 STORE( func
, *inst
, 0, 0, chan_index
);
1620 case TGSI_OPCODE_DP4
:
1621 /* TGSI_OPCODE_DOT4 */
1622 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1623 FETCH( func
, *inst
, 1, 1, CHAN_X
);
1624 emit_mul( func
, 0, 1 );
1625 FETCH( func
, *inst
, 1, 0, CHAN_Y
);
1626 FETCH( func
, *inst
, 2, 1, CHAN_Y
);
1627 emit_mul( func
, 1, 2 );
1628 emit_add( func
, 0, 1 );
1629 FETCH( func
, *inst
, 1, 0, CHAN_Z
);
1630 FETCH( func
, *inst
, 2, 1, CHAN_Z
);
1631 emit_mul(func
, 1, 2 );
1632 emit_add(func
, 0, 1 );
1633 FETCH( func
, *inst
, 1, 0, CHAN_W
);
1634 FETCH( func
, *inst
, 2, 1, CHAN_W
);
1635 emit_mul( func
, 1, 2 );
1636 emit_add( func
, 0, 1 );
1637 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1638 STORE( func
, *inst
, 0, 0, chan_index
);
1642 case TGSI_OPCODE_DST
:
1643 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) {
1649 STORE( func
, *inst
, 0, 0, CHAN_X
);
1651 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) {
1652 FETCH( func
, *inst
, 0, 0, CHAN_Y
);
1653 FETCH( func
, *inst
, 1, 1, CHAN_Y
);
1654 emit_mul( func
, 0, 1 );
1655 STORE( func
, *inst
, 0, 0, CHAN_Y
);
1657 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) {
1658 FETCH( func
, *inst
, 0, 0, CHAN_Z
);
1659 STORE( func
, *inst
, 0, 0, CHAN_Z
);
1661 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) {
1662 FETCH( func
, *inst
, 0, 1, CHAN_W
);
1663 STORE( func
, *inst
, 0, 0, CHAN_W
);
1667 case TGSI_OPCODE_MIN
:
1668 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1669 FETCH( func
, *inst
, 0, 0, chan_index
);
1670 FETCH( func
, *inst
, 1, 1, chan_index
);
1675 STORE( func
, *inst
, 0, 0, chan_index
);
1679 case TGSI_OPCODE_MAX
:
1680 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1681 FETCH( func
, *inst
, 0, 0, chan_index
);
1682 FETCH( func
, *inst
, 1, 1, chan_index
);
1687 STORE( func
, *inst
, 0, 0, chan_index
);
1691 case TGSI_OPCODE_SLT
:
1692 /* TGSI_OPCODE_SETLT */
1693 emit_setcc( func
, inst
, cc_LessThan
);
1696 case TGSI_OPCODE_SGE
:
1697 /* TGSI_OPCODE_SETGE */
1698 emit_setcc( func
, inst
, cc_NotLessThan
);
1701 case TGSI_OPCODE_MAD
:
1702 /* TGSI_OPCODE_MADD */
1703 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1704 FETCH( func
, *inst
, 0, 0, chan_index
);
1705 FETCH( func
, *inst
, 1, 1, chan_index
);
1706 FETCH( func
, *inst
, 2, 2, chan_index
);
1707 emit_mul( func
, 0, 1 );
1708 emit_add( func
, 0, 2 );
1709 STORE( func
, *inst
, 0, 0, chan_index
);
1713 case TGSI_OPCODE_SUB
:
1714 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1715 FETCH( func
, *inst
, 0, 0, chan_index
);
1716 FETCH( func
, *inst
, 1, 1, chan_index
);
1717 emit_sub( func
, 0, 1 );
1718 STORE( func
, *inst
, 0, 0, chan_index
);
1722 case TGSI_OPCODE_LERP
:
1723 /* TGSI_OPCODE_LRP */
1724 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1725 FETCH( func
, *inst
, 0, 0, chan_index
);
1726 FETCH( func
, *inst
, 1, 1, chan_index
);
1727 FETCH( func
, *inst
, 2, 2, chan_index
);
1728 emit_sub( func
, 1, 2 );
1729 emit_mul( func
, 0, 1 );
1730 emit_add( func
, 0, 2 );
1731 STORE( func
, *inst
, 0, 0, chan_index
);
1735 case TGSI_OPCODE_CND
:
1739 case TGSI_OPCODE_CND0
:
1743 case TGSI_OPCODE_DOT2ADD
:
1744 /* TGSI_OPCODE_DP2A */
1745 FETCH( func
, *inst
, 0, 0, CHAN_X
); /* xmm0 = src[0].x */
1746 FETCH( func
, *inst
, 1, 1, CHAN_X
); /* xmm1 = src[1].x */
1747 emit_mul( func
, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
1748 FETCH( func
, *inst
, 1, 0, CHAN_Y
); /* xmm1 = src[0].y */
1749 FETCH( func
, *inst
, 2, 1, CHAN_Y
); /* xmm2 = src[1].y */
1750 emit_mul( func
, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
1751 emit_add( func
, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
1752 FETCH( func
, *inst
, 1, 2, CHAN_X
); /* xmm1 = src[2].x */
1753 emit_add( func
, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
1754 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1755 STORE( func
, *inst
, 0, 0, chan_index
); /* dest[ch] = xmm0 */
1759 case TGSI_OPCODE_INDEX
:
1763 case TGSI_OPCODE_NEGATE
:
1767 case TGSI_OPCODE_FRAC
:
1768 /* TGSI_OPCODE_FRC */
1769 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1770 FETCH( func
, *inst
, 0, 0, chan_index
);
1771 emit_frc( func
, 0, 0 );
1772 STORE( func
, *inst
, 0, 0, chan_index
);
1776 case TGSI_OPCODE_CLAMP
:
1780 case TGSI_OPCODE_FLOOR
:
1781 /* TGSI_OPCODE_FLR */
1782 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1783 FETCH( func
, *inst
, 0, 0, chan_index
);
1784 emit_flr( func
, 0, 0 );
1785 STORE( func
, *inst
, 0, 0, chan_index
);
1789 case TGSI_OPCODE_ROUND
:
1793 case TGSI_OPCODE_EXPBASE2
:
1794 /* TGSI_OPCODE_EX2 */
1795 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1796 emit_ex2( func
, 0, 0 );
1797 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1798 STORE( func
, *inst
, 0, 0, chan_index
);
1802 case TGSI_OPCODE_LOGBASE2
:
1803 /* TGSI_OPCODE_LG2 */
1804 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1805 emit_lg2( func
, 0, 0 );
1806 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1807 STORE( func
, *inst
, 0, 0, chan_index
);
1811 case TGSI_OPCODE_POWER
:
1812 /* TGSI_OPCODE_POW */
1813 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1814 FETCH( func
, *inst
, 1, 1, CHAN_X
);
1815 emit_pow( func
, 0, 0, 1 );
1816 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1817 STORE( func
, *inst
, 0, 0, chan_index
);
1821 case TGSI_OPCODE_CROSSPRODUCT
:
1822 /* TGSI_OPCODE_XPD */
1823 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1824 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ) {
1825 FETCH( func
, *inst
, 1, 1, CHAN_Z
);
1826 FETCH( func
, *inst
, 3, 0, CHAN_Z
);
1828 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1829 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
1830 FETCH( func
, *inst
, 0, 0, CHAN_Y
);
1831 FETCH( func
, *inst
, 4, 1, CHAN_Y
);
1833 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) {
1834 emit_MOV( func
, 2, 0 );
1835 emit_mul( func
, 2, 1 );
1836 emit_MOV( func
, 5, 3 );
1837 emit_mul( func
, 5, 4 );
1838 emit_sub( func
, 2, 5 );
1839 STORE( func
, *inst
, 2, 0, CHAN_X
);
1841 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ||
1842 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
1843 FETCH( func
, *inst
, 2, 1, CHAN_X
);
1844 FETCH( func
, *inst
, 5, 0, CHAN_X
);
1846 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) {
1847 emit_mul( func
, 3, 2 );
1848 emit_mul( func
, 1, 5 );
1849 emit_sub( func
, 3, 1 );
1850 STORE( func
, *inst
, 3, 0, CHAN_Y
);
1852 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) {
1853 emit_mul( func
, 5, 4 );
1854 emit_mul( func
, 0, 2 );
1855 emit_sub( func
, 5, 0 );
1856 STORE( func
, *inst
, 5, 0, CHAN_Z
);
1858 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) {
1864 STORE( func
, *inst
, 0, 0, CHAN_W
);
1868 case TGSI_OPCODE_MULTIPLYMATRIX
:
1872 case TGSI_OPCODE_ABS
:
1873 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1874 FETCH( func
, *inst
, 0, 0, chan_index
);
1875 emit_abs( func
, 0) ;
1877 STORE( func
, *inst
, 0, 0, chan_index
);
1881 case TGSI_OPCODE_RCC
:
1885 case TGSI_OPCODE_DPH
:
1886 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1887 FETCH( func
, *inst
, 1, 1, CHAN_X
);
1888 emit_mul( func
, 0, 1 );
1889 FETCH( func
, *inst
, 1, 0, CHAN_Y
);
1890 FETCH( func
, *inst
, 2, 1, CHAN_Y
);
1891 emit_mul( func
, 1, 2 );
1892 emit_add( func
, 0, 1 );
1893 FETCH( func
, *inst
, 1, 0, CHAN_Z
);
1894 FETCH( func
, *inst
, 2, 1, CHAN_Z
);
1895 emit_mul( func
, 1, 2 );
1896 emit_add( func
, 0, 1 );
1897 FETCH( func
, *inst
, 1, 1, CHAN_W
);
1898 emit_add( func
, 0, 1 );
1899 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1900 STORE( func
, *inst
, 0, 0, chan_index
);
1904 case TGSI_OPCODE_COS
:
1905 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1906 emit_cos( func
, 0, 0 );
1907 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1908 STORE( func
, *inst
, 0, 0, chan_index
);
1912 case TGSI_OPCODE_DDX
:
1916 case TGSI_OPCODE_DDY
:
1920 case TGSI_OPCODE_KILP
:
1921 /* predicated kill */
1923 return 0; /* XXX fix me */
1926 case TGSI_OPCODE_KIL
:
1927 /* conditional kill */
1928 emit_kil( func
, &inst
->FullSrcRegisters
[0] );
1931 case TGSI_OPCODE_PK2H
:
1935 case TGSI_OPCODE_PK2US
:
1939 case TGSI_OPCODE_PK4B
:
1943 case TGSI_OPCODE_PK4UB
:
1947 case TGSI_OPCODE_RFL
:
1951 case TGSI_OPCODE_SEQ
:
1955 case TGSI_OPCODE_SFL
:
1959 case TGSI_OPCODE_SGT
:
1963 case TGSI_OPCODE_SIN
:
1964 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1965 emit_sin( func
, 0, 0 );
1966 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1967 STORE( func
, *inst
, 0, 0, chan_index
);
1971 case TGSI_OPCODE_SLE
:
1975 case TGSI_OPCODE_SNE
:
1979 case TGSI_OPCODE_STR
:
1983 case TGSI_OPCODE_TEX
:
1985 /* Disable dummy texture code:
1992 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1993 STORE( func
, *inst
, 0, 0, chan_index
);
2001 case TGSI_OPCODE_TXD
:
2005 case TGSI_OPCODE_UP2H
:
2009 case TGSI_OPCODE_UP2US
:
2013 case TGSI_OPCODE_UP4B
:
2017 case TGSI_OPCODE_UP4UB
:
2021 case TGSI_OPCODE_X2D
:
2025 case TGSI_OPCODE_ARA
:
2029 case TGSI_OPCODE_ARR
:
2033 case TGSI_OPCODE_BRA
:
2037 case TGSI_OPCODE_CAL
:
2041 case TGSI_OPCODE_RET
:
2045 case TGSI_OPCODE_END
:
2048 case TGSI_OPCODE_SSG
:
2052 case TGSI_OPCODE_CMP
:
2053 emit_cmp (func
, inst
);
2056 case TGSI_OPCODE_SCS
:
2057 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) {
2058 FETCH( func
, *inst
, 0, 0, CHAN_X
);
2059 emit_cos( func
, 0, 0 );
2060 STORE( func
, *inst
, 0, 0, CHAN_X
);
2062 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) {
2063 FETCH( func
, *inst
, 0, 0, CHAN_X
);
2064 emit_sin( func
, 0, 0 );
2065 STORE( func
, *inst
, 0, 0, CHAN_Y
);
2067 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) {
2071 TGSI_EXEC_TEMP_00000000_I
,
2072 TGSI_EXEC_TEMP_00000000_C
);
2073 STORE( func
, *inst
, 0, 0, CHAN_Z
);
2075 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) {
2081 STORE( func
, *inst
, 0, 0, CHAN_W
);
2085 case TGSI_OPCODE_TXB
:
2089 case TGSI_OPCODE_NRM
:
2091 case TGSI_OPCODE_NRM4
:
2092 /* 3 or 4-component normalization */
2094 uint dims
= (inst
->Instruction
.Opcode
== TGSI_OPCODE_NRM
) ? 3 : 4;
2095 /* note: cannot use xmm regs 2/3 here (see emit_rsqrt() above) */
2096 FETCH( func
, *inst
, 4, 0, CHAN_X
); /* xmm4 = src[0].x */
2097 FETCH( func
, *inst
, 5, 0, CHAN_Y
); /* xmm5 = src[0].y */
2098 FETCH( func
, *inst
, 6, 0, CHAN_Z
); /* xmm6 = src[0].z */
2100 FETCH( func
, *inst
, 7, 0, CHAN_W
); /* xmm7 = src[0].w */
2102 emit_MOV( func
, 0, 4 ); /* xmm0 = xmm3 */
2103 emit_mul( func
, 0, 4 ); /* xmm0 *= xmm3 */
2104 emit_MOV( func
, 1, 5 ); /* xmm1 = xmm4 */
2105 emit_mul( func
, 1, 5 ); /* xmm1 *= xmm4 */
2106 emit_add( func
, 0, 1 ); /* xmm0 += xmm1 */
2107 emit_MOV( func
, 1, 6 ); /* xmm1 = xmm5 */
2108 emit_mul( func
, 1, 6 ); /* xmm1 *= xmm5 */
2109 emit_add( func
, 0, 1 ); /* xmm0 += xmm1 */
2111 emit_MOV( func
, 1, 7 ); /* xmm1 = xmm7 */
2112 emit_mul( func
, 1, 7 ); /* xmm1 *= xmm7 */
2113 emit_add( func
, 0, 0 ); /* xmm0 += xmm1 */
2115 emit_rsqrt( func
, 1, 0 ); /* xmm1 = 1/sqrt(xmm0) */
2116 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2117 if (chan_index
< dims
) {
2118 emit_mul( func
, 4+chan_index
, 1); /* xmm[4+ch] *= xmm1 */
2119 STORE( func
, *inst
, 4+chan_index
, 0, chan_index
);
2125 case TGSI_OPCODE_DIV
:
2129 case TGSI_OPCODE_DP2
:
2130 FETCH( func
, *inst
, 0, 0, CHAN_X
); /* xmm0 = src[0].x */
2131 FETCH( func
, *inst
, 1, 1, CHAN_X
); /* xmm1 = src[1].x */
2132 emit_mul( func
, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
2133 FETCH( func
, *inst
, 1, 0, CHAN_Y
); /* xmm1 = src[0].y */
2134 FETCH( func
, *inst
, 2, 1, CHAN_Y
); /* xmm2 = src[1].y */
2135 emit_mul( func
, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
2136 emit_add( func
, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2137 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2138 STORE( func
, *inst
, 0, 0, chan_index
); /* dest[ch] = xmm0 */
2142 case TGSI_OPCODE_TXL
:
2146 case TGSI_OPCODE_BRK
:
2150 case TGSI_OPCODE_IF
:
2154 case TGSI_OPCODE_LOOP
:
2158 case TGSI_OPCODE_REP
:
2162 case TGSI_OPCODE_ELSE
:
2166 case TGSI_OPCODE_ENDIF
:
2170 case TGSI_OPCODE_ENDLOOP
:
2174 case TGSI_OPCODE_ENDREP
:
2178 case TGSI_OPCODE_PUSHA
:
2182 case TGSI_OPCODE_POPA
:
2186 case TGSI_OPCODE_CEIL
:
2190 case TGSI_OPCODE_I2F
:
2194 case TGSI_OPCODE_NOT
:
2198 case TGSI_OPCODE_TRUNC
:
2199 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2200 FETCH( func
, *inst
, 0, 0, chan_index
);
2201 emit_f2it( func
, 0 );
2202 emit_i2f( func
, 0 );
2203 STORE( func
, *inst
, 0, 0, chan_index
);
2207 case TGSI_OPCODE_SHL
:
2211 case TGSI_OPCODE_SHR
:
2215 case TGSI_OPCODE_AND
:
2219 case TGSI_OPCODE_OR
:
2223 case TGSI_OPCODE_MOD
:
2227 case TGSI_OPCODE_XOR
:
2231 case TGSI_OPCODE_SAD
:
2235 case TGSI_OPCODE_TXF
:
2239 case TGSI_OPCODE_TXQ
:
2243 case TGSI_OPCODE_CONT
:
2247 case TGSI_OPCODE_EMIT
:
2251 case TGSI_OPCODE_ENDPRIM
:
2264 struct x86_function
*func
,
2265 struct tgsi_full_declaration
*decl
)
2267 if( decl
->Declaration
.File
== TGSI_FILE_INPUT
) {
2268 unsigned first
, last
, mask
;
2271 first
= decl
->DeclarationRange
.First
;
2272 last
= decl
->DeclarationRange
.Last
;
2273 mask
= decl
->Declaration
.UsageMask
;
2275 for( i
= first
; i
<= last
; i
++ ) {
2276 for( j
= 0; j
< NUM_CHANNELS
; j
++ ) {
2277 if( mask
& (1 << j
) ) {
2278 switch( decl
->Declaration
.Interpolate
) {
2279 case TGSI_INTERPOLATE_CONSTANT
:
2280 emit_coef_a0( func
, 0, i
, j
);
2281 emit_inputs( func
, 0, i
, j
);
2284 case TGSI_INTERPOLATE_LINEAR
:
2285 emit_tempf( func
, 0, 0, TGSI_SWIZZLE_X
);
2286 emit_coef_dadx( func
, 1, i
, j
);
2287 emit_tempf( func
, 2, 0, TGSI_SWIZZLE_Y
);
2288 emit_coef_dady( func
, 3, i
, j
);
2289 emit_mul( func
, 0, 1 ); /* x * dadx */
2290 emit_coef_a0( func
, 4, i
, j
);
2291 emit_mul( func
, 2, 3 ); /* y * dady */
2292 emit_add( func
, 0, 4 ); /* x * dadx + a0 */
2293 emit_add( func
, 0, 2 ); /* x * dadx + y * dady + a0 */
2294 emit_inputs( func
, 0, i
, j
);
2297 case TGSI_INTERPOLATE_PERSPECTIVE
:
2298 emit_tempf( func
, 0, 0, TGSI_SWIZZLE_X
);
2299 emit_coef_dadx( func
, 1, i
, j
);
2300 emit_tempf( func
, 2, 0, TGSI_SWIZZLE_Y
);
2301 emit_coef_dady( func
, 3, i
, j
);
2302 emit_mul( func
, 0, 1 ); /* x * dadx */
2303 emit_tempf( func
, 4, 0, TGSI_SWIZZLE_W
);
2304 emit_coef_a0( func
, 5, i
, j
);
2305 emit_rcp( func
, 4, 4 ); /* 1.0 / w */
2306 emit_mul( func
, 2, 3 ); /* y * dady */
2307 emit_add( func
, 0, 5 ); /* x * dadx + a0 */
2308 emit_add( func
, 0, 2 ); /* x * dadx + y * dady + a0 */
2309 emit_mul( func
, 0, 4 ); /* (x * dadx + y * dady + a0) / w */
2310 emit_inputs( func
, 0, i
, j
);
2323 static void aos_to_soa( struct x86_function
*func
,
2329 struct x86_reg soa_input
= x86_make_reg( file_REG32
, reg_AX
);
2330 struct x86_reg aos_input
= x86_make_reg( file_REG32
, reg_BX
);
2331 struct x86_reg num_inputs
= x86_make_reg( file_REG32
, reg_CX
);
2332 struct x86_reg stride
= x86_make_reg( file_REG32
, reg_DX
);
2337 x86_push( func
, x86_make_reg( file_REG32
, reg_BX
) );
2339 x86_mov( func
, aos_input
, x86_fn_arg( func
, arg_aos
) );
2340 x86_mov( func
, soa_input
, x86_fn_arg( func
, arg_soa
) );
2341 x86_mov( func
, num_inputs
, x86_fn_arg( func
, arg_num
) );
2342 x86_mov( func
, stride
, x86_fn_arg( func
, arg_stride
) );
2345 inner_loop
= x86_get_label( func
);
2347 x86_push( func
, aos_input
);
2348 sse_movlps( func
, make_xmm( 0 ), x86_make_disp( aos_input
, 0 ) );
2349 sse_movlps( func
, make_xmm( 3 ), x86_make_disp( aos_input
, 8 ) );
2350 x86_add( func
, aos_input
, stride
);
2351 sse_movhps( func
, make_xmm( 0 ), x86_make_disp( aos_input
, 0 ) );
2352 sse_movhps( func
, make_xmm( 3 ), x86_make_disp( aos_input
, 8 ) );
2353 x86_add( func
, aos_input
, stride
);
2354 sse_movlps( func
, make_xmm( 1 ), x86_make_disp( aos_input
, 0 ) );
2355 sse_movlps( func
, make_xmm( 4 ), x86_make_disp( aos_input
, 8 ) );
2356 x86_add( func
, aos_input
, stride
);
2357 sse_movhps( func
, make_xmm( 1 ), x86_make_disp( aos_input
, 0 ) );
2358 sse_movhps( func
, make_xmm( 4 ), x86_make_disp( aos_input
, 8 ) );
2359 x86_pop( func
, aos_input
);
2361 sse_movaps( func
, make_xmm( 2 ), make_xmm( 0 ) );
2362 sse_movaps( func
, make_xmm( 5 ), make_xmm( 3 ) );
2363 sse_shufps( func
, make_xmm( 0 ), make_xmm( 1 ), 0x88 );
2364 sse_shufps( func
, make_xmm( 2 ), make_xmm( 1 ), 0xdd );
2365 sse_shufps( func
, make_xmm( 3 ), make_xmm( 4 ), 0x88 );
2366 sse_shufps( func
, make_xmm( 5 ), make_xmm( 4 ), 0xdd );
2368 sse_movups( func
, x86_make_disp( soa_input
, 0 ), make_xmm( 0 ) );
2369 sse_movups( func
, x86_make_disp( soa_input
, 16 ), make_xmm( 2 ) );
2370 sse_movups( func
, x86_make_disp( soa_input
, 32 ), make_xmm( 3 ) );
2371 sse_movups( func
, x86_make_disp( soa_input
, 48 ), make_xmm( 5 ) );
2373 /* Advance to next input */
2374 x86_lea( func
, aos_input
, x86_make_disp(aos_input
, 16) );
2375 x86_lea( func
, soa_input
, x86_make_disp(soa_input
, 64) );
2377 /* while --num_inputs */
2378 x86_dec( func
, num_inputs
);
2379 x86_jcc( func
, cc_NE
, inner_loop
);
2382 x86_pop( func
, aos_input
);
2385 static void soa_to_aos( struct x86_function
*func
, uint aos
, uint soa
, uint num
, uint stride
)
2387 struct x86_reg soa_output
;
2388 struct x86_reg aos_output
;
2389 struct x86_reg num_outputs
;
2390 struct x86_reg temp
;
2393 soa_output
= x86_make_reg( file_REG32
, reg_AX
);
2394 aos_output
= x86_make_reg( file_REG32
, reg_BX
);
2395 num_outputs
= x86_make_reg( file_REG32
, reg_CX
);
2396 temp
= x86_make_reg( file_REG32
, reg_DX
);
2399 x86_push( func
, aos_output
);
2401 x86_mov( func
, soa_output
, x86_fn_arg( func
, soa
) );
2402 x86_mov( func
, aos_output
, x86_fn_arg( func
, aos
) );
2403 x86_mov( func
, num_outputs
, x86_fn_arg( func
, num
) );
2406 inner_loop
= x86_get_label( func
);
2408 sse_movups( func
, make_xmm( 0 ), x86_make_disp( soa_output
, 0 ) );
2409 sse_movups( func
, make_xmm( 1 ), x86_make_disp( soa_output
, 16 ) );
2410 sse_movups( func
, make_xmm( 3 ), x86_make_disp( soa_output
, 32 ) );
2411 sse_movups( func
, make_xmm( 4 ), x86_make_disp( soa_output
, 48 ) );
2413 sse_movaps( func
, make_xmm( 2 ), make_xmm( 0 ) );
2414 sse_movaps( func
, make_xmm( 5 ), make_xmm( 3 ) );
2415 sse_unpcklps( func
, make_xmm( 0 ), make_xmm( 1 ) );
2416 sse_unpckhps( func
, make_xmm( 2 ), make_xmm( 1 ) );
2417 sse_unpcklps( func
, make_xmm( 3 ), make_xmm( 4 ) );
2418 sse_unpckhps( func
, make_xmm( 5 ), make_xmm( 4 ) );
2420 x86_mov( func
, temp
, x86_fn_arg( func
, stride
) );
2421 x86_push( func
, aos_output
);
2422 sse_movlps( func
, x86_make_disp( aos_output
, 0 ), make_xmm( 0 ) );
2423 sse_movlps( func
, x86_make_disp( aos_output
, 8 ), make_xmm( 3 ) );
2424 x86_add( func
, aos_output
, temp
);
2425 sse_movhps( func
, x86_make_disp( aos_output
, 0 ), make_xmm( 0 ) );
2426 sse_movhps( func
, x86_make_disp( aos_output
, 8 ), make_xmm( 3 ) );
2427 x86_add( func
, aos_output
, temp
);
2428 sse_movlps( func
, x86_make_disp( aos_output
, 0 ), make_xmm( 2 ) );
2429 sse_movlps( func
, x86_make_disp( aos_output
, 8 ), make_xmm( 5 ) );
2430 x86_add( func
, aos_output
, temp
);
2431 sse_movhps( func
, x86_make_disp( aos_output
, 0 ), make_xmm( 2 ) );
2432 sse_movhps( func
, x86_make_disp( aos_output
, 8 ), make_xmm( 5 ) );
2433 x86_pop( func
, aos_output
);
2435 /* Advance to next output */
2436 x86_lea( func
, aos_output
, x86_make_disp(aos_output
, 16) );
2437 x86_lea( func
, soa_output
, x86_make_disp(soa_output
, 64) );
2439 /* while --num_outputs */
2440 x86_dec( func
, num_outputs
);
2441 x86_jcc( func
, cc_NE
, inner_loop
);
2444 x86_pop( func
, aos_output
);
2448 * Translate a TGSI vertex/fragment shader to SSE2 code.
2449 * Slightly different things are done for vertex vs. fragment shaders.
2451 * Note that fragment shaders are responsible for interpolating shader
2452 * inputs. Because on x86 we have only 4 GP registers, and here we
2453 * have 5 shader arguments (input, output, const, temp and coef), the
2454 * code is split into two phases -- DECLARATION and INSTRUCTION phase.
2455 * GP register holding the output argument is aliased with the coeff
2456 * argument, as outputs are not needed in the DECLARATION phase.
2458 * \param tokens the TGSI input shader
2459 * \param func the output SSE code/function
2460 * \param immediates buffer to place immediates, later passed to SSE func
2461 * \param return 1 for success, 0 if translation failed
2465 const struct tgsi_token
*tokens
,
2466 struct x86_function
*func
,
2467 float (*immediates
)[4],
2468 boolean do_swizzles
)
2470 struct tgsi_parse_context parse
;
2471 boolean instruction_phase
= FALSE
;
2473 uint num_immediates
= 0;
2477 func
->csr
= func
->store
;
2479 tgsi_parse_init( &parse
, tokens
);
2481 /* Can't just use EDI, EBX without save/restoring them:
2485 get_immediate_base() );
2493 * Different function args for vertex/fragment shaders:
2495 if (parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_FRAGMENT
) {
2496 /* DECLARATION phase, do not load output argument. */
2500 x86_fn_arg( func
, 1 ) );
2501 /* skipping outputs argument here */
2505 x86_fn_arg( func
, 3 ) );
2509 x86_fn_arg( func
, 4 ) );
2513 x86_fn_arg( func
, 5 ) );
2516 get_immediate_base(),
2517 x86_fn_arg( func
, 6 ) );
2520 assert(parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_VERTEX
);
2525 1, /* machine->input */
2527 8 ); /* input_stride */
2532 x86_fn_arg( func
, 1 ) );
2536 x86_fn_arg( func
, 2 ) );
2540 x86_fn_arg( func
, 3 ) );
2544 x86_fn_arg( func
, 4 ) );
2547 get_immediate_base(),
2548 x86_fn_arg( func
, 5 ) );
2551 while( !tgsi_parse_end_of_tokens( &parse
) && ok
) {
2552 tgsi_parse_token( &parse
);
2554 switch( parse
.FullToken
.Token
.Type
) {
2555 case TGSI_TOKEN_TYPE_DECLARATION
:
2556 if (parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_FRAGMENT
) {
2559 &parse
.FullToken
.FullDeclaration
);
2563 case TGSI_TOKEN_TYPE_INSTRUCTION
:
2564 if (parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_FRAGMENT
) {
2565 if( !instruction_phase
) {
2566 /* INSTRUCTION phase, overwrite coeff with output. */
2567 instruction_phase
= TRUE
;
2571 x86_fn_arg( func
, 2 ) );
2575 ok
= emit_instruction(
2577 &parse
.FullToken
.FullInstruction
);
2580 debug_printf("failed to translate tgsi opcode %d to SSE (%s)\n",
2581 parse
.FullToken
.FullInstruction
.Instruction
.Opcode
,
2582 parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_VERTEX
?
2583 "vertex shader" : "fragment shader");
2587 case TGSI_TOKEN_TYPE_IMMEDIATE
:
2588 /* simply copy the immediate values into the next immediates[] slot */
2590 const uint size
= parse
.FullToken
.FullImmediate
.Immediate
.Size
- 1;
2593 assert(num_immediates
< TGSI_EXEC_NUM_IMMEDIATES
);
2594 for( i
= 0; i
< size
; i
++ ) {
2595 immediates
[num_immediates
][i
] =
2596 parse
.FullToken
.FullImmediate
.u
.ImmediateFloat32
[i
].Float
;
2599 debug_printf("SSE FS immediate[%d] = %f %f %f %f\n",
2601 immediates
[num_immediates
][0],
2602 immediates
[num_immediates
][1],
2603 immediates
[num_immediates
][2],
2604 immediates
[num_immediates
][3]);
2616 if (parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_VERTEX
) {
2618 soa_to_aos( func
, 9, 2, 10, 11 );
2621 /* Can't just use EBX, EDI without save/restoring them:
2629 get_immediate_base() );
2633 tgsi_parse_free( &parse
);
2638 #endif /* PIPE_ARCH_X86 */