1 /**************************************************************************
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **************************************************************************/
30 #include "pipe/p_debug.h"
31 #include "pipe/p_shader_tokens.h"
32 #include "util/u_math.h"
33 #include "util/u_sse.h"
34 #include "tgsi/tgsi_parse.h"
35 #include "tgsi/tgsi_util.h"
36 #include "tgsi_exec.h"
37 #include "tgsi_sse2.h"
39 #include "rtasm/rtasm_x86sse.h"
43 * This costs about 100fps (close to 10%) in gears:
45 #define HIGH_PRECISION 1
50 #define FOR_EACH_CHANNEL( CHAN )\
51 for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)
53 #define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
54 ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))
56 #define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
57 if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))
59 #define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
60 FOR_EACH_CHANNEL( CHAN )\
61 IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )
68 #define TEMP_ONE_I TGSI_EXEC_TEMP_ONE_I
69 #define TEMP_ONE_C TGSI_EXEC_TEMP_ONE_C
71 #define TEMP_R0 TGSI_EXEC_TEMP_R0
72 #define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
75 * X86 utility functions.
84 (enum x86_reg_name
) xmm
);
88 * X86 register mapping helpers.
92 get_const_base( void )
100 get_input_base( void )
107 static struct x86_reg
108 get_output_base( void )
115 static struct x86_reg
116 get_temp_base( void )
123 static struct x86_reg
124 get_coef_base( void )
126 return get_output_base();
129 static struct x86_reg
130 get_immediate_base( void )
139 * Data access helpers.
143 static struct x86_reg
148 return x86_make_disp(
149 get_immediate_base(),
150 (vec
* 4 + chan
) * 4 );
153 static struct x86_reg
158 return x86_make_disp(
160 (vec
* 4 + chan
) * 4 );
163 static struct x86_reg
168 return x86_make_disp(
170 (vec
* 4 + chan
) * 16 );
173 static struct x86_reg
178 return x86_make_disp(
180 (vec
* 4 + chan
) * 16 );
183 static struct x86_reg
188 return x86_make_disp(
190 (vec
* 4 + chan
) * 16 );
193 static struct x86_reg
199 return x86_make_disp(
201 ((vec
* 3 + member
) * 4 + chan
) * 4 );
207 struct x86_function
*func
)
214 * Data fetch helpers.
218 * Copy a shader constant to xmm register
219 * \param xmm the destination xmm register
220 * \param vec the src const buffer index
221 * \param chan src channel to fetch (X, Y, Z or W)
225 struct x86_function
*func
,
234 struct x86_reg r0
= get_input_base();
235 struct x86_reg r1
= get_output_base();
238 assert( indirectFile
== TGSI_FILE_ADDRESS
);
239 assert( indirectIndex
== 0 );
241 x86_push( func
, r0
);
242 x86_push( func
, r1
);
244 for (i
= 0; i
< QUAD_SIZE
; i
++) {
245 x86_lea( func
, r0
, get_const( vec
, chan
) );
246 x86_mov( func
, r1
, x86_make_disp( get_temp( TEMP_ADDR
, CHAN_X
), i
* 4 ) );
248 /* Quick hack to multiply by 16 -- need to add SHL to rtasm.
250 x86_add( func
, r1
, r1
);
251 x86_add( func
, r1
, r1
);
252 x86_add( func
, r1
, r1
);
253 x86_add( func
, r1
, r1
);
255 x86_add( func
, r0
, r1
);
256 x86_mov( func
, r1
, x86_deref( r0
) );
257 x86_mov( func
, x86_make_disp( get_temp( TEMP_R0
, CHAN_X
), i
* 4 ), r1
);
266 get_temp( TEMP_R0
, CHAN_X
) );
274 get_const( vec
, chan
) );
279 SHUF( 0, 0, 0, 0 ) );
285 struct x86_function
*func
,
293 get_immediate( vec
, chan
) );
298 SHUF( 0, 0, 0, 0 ) );
303 * Copy a shader input to xmm register
304 * \param xmm the destination xmm register
305 * \param vec the src input attrib
306 * \param chan src channel to fetch (X, Y, Z or W)
310 struct x86_function
*func
,
318 get_input( vec
, chan
) );
322 * Store an xmm register to a shader output
323 * \param xmm the source xmm register
324 * \param vec the dest output attrib
325 * \param chan src dest channel to store (X, Y, Z or W)
329 struct x86_function
*func
,
336 get_output( vec
, chan
),
341 * Copy a shader temporary to xmm register
342 * \param xmm the destination xmm register
343 * \param vec the src temp register
344 * \param chan src channel to fetch (X, Y, Z or W)
348 struct x86_function
*func
,
356 get_temp( vec
, chan
) );
360 * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
361 * \param xmm the destination xmm register
362 * \param vec the src input/attribute coefficient index
363 * \param chan src channel to fetch (X, Y, Z or W)
364 * \param member 0=a0, 1=dadx, 2=dady
368 struct x86_function
*func
,
377 get_coef( vec
, chan
, member
) );
382 SHUF( 0, 0, 0, 0 ) );
386 * Data store helpers.
391 struct x86_function
*func
,
398 get_input( vec
, chan
),
404 struct x86_function
*func
,
411 get_temp( vec
, chan
),
417 struct x86_function
*func
,
427 vec
+ TGSI_EXEC_TEMP_ADDR
,
432 * Coefficent fetch helpers.
437 struct x86_function
*func
,
452 struct x86_function
*func
,
467 struct x86_function
*func
,
481 * Function call helpers.
485 * NOTE: In gcc, if the destination uses the SSE intrinsics, then it must be
486 * defined with __attribute__((force_align_arg_pointer)), as we do not guarantee
487 * that the stack pointer is 16 byte aligned, as expected.
491 struct x86_function
*func
,
494 void (PIPE_CDECL
*code
)() )
496 struct x86_reg ecx
= x86_make_reg( file_REG32
, reg_CX
);
500 /* Bitmask of the xmm registers to save */
501 xmm_mask
= (1 << xmm_save
) - 1;
502 xmm_mask
&= ~(1 << xmm_dst
);
506 get_temp( TEMP_R0
, 0 ),
507 make_xmm( xmm_dst
) );
511 x86_make_reg( file_REG32
, reg_AX
) );
514 x86_make_reg( file_REG32
, reg_CX
) );
517 x86_make_reg( file_REG32
, reg_DX
) );
519 for(i
= 0, n
= 0; i
< 8; ++i
)
520 if(xmm_mask
& (1 << i
))
525 x86_make_reg( file_REG32
, reg_SP
),
528 for(i
= 0, n
= 0; i
< 8; ++i
)
529 if(xmm_mask
& (1 << i
)) {
532 x86_make_disp( x86_make_reg( file_REG32
, reg_SP
), n
*16 ),
540 get_temp( TEMP_R0
, 0 ) );
542 x86_push( func
, ecx
);
543 x86_mov_reg_imm( func
, ecx
, (unsigned long) code
);
544 x86_call( func
, ecx
);
547 for(i
= 0, n
= 0; i
< 8; ++i
)
548 if(xmm_mask
& (1 << i
)) {
552 x86_make_disp( x86_make_reg( file_REG32
, reg_SP
), n
*16 ) );
558 x86_make_reg( file_REG32
, reg_SP
),
561 /* Restore GP registers in a reverse order.
565 x86_make_reg( file_REG32
, reg_DX
) );
568 x86_make_reg( file_REG32
, reg_CX
) );
571 x86_make_reg( file_REG32
, reg_AX
) );
576 get_temp( TEMP_R0
, 0 ) );
580 emit_func_call_dst_src(
581 struct x86_function
*func
,
585 void (PIPE_CDECL
*code
)() )
589 get_temp( TEMP_R0
, 1 ),
590 make_xmm( xmm_src
) );
600 * Fast SSE2 implementation of special math functions.
603 #define POLY0(x, c0) _mm_set1_ps(c0)
604 #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
605 #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
606 #define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
607 #define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
608 #define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
610 #define EXP_POLY_DEGREE 3
611 #define LOG_POLY_DEGREE 5
614 * See http://www.devmaster.net/forums/showthread.php?p=43580
620 __m128 fpart
, expipart
, expfpart
;
622 x
= _mm_min_ps(x
, _mm_set1_ps( 129.00000f
));
623 x
= _mm_max_ps(x
, _mm_set1_ps(-126.99999f
));
625 /* ipart = int(x - 0.5) */
626 ipart
= _mm_cvtps_epi32(_mm_sub_ps(x
, _mm_set1_ps(0.5f
)));
628 /* fpart = x - ipart */
629 fpart
= _mm_sub_ps(x
, _mm_cvtepi32_ps(ipart
));
631 /* expipart = (float) (1 << ipart) */
632 expipart
= _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart
, _mm_set1_epi32(127)), 23));
634 /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
635 #if EXP_POLY_DEGREE == 5
636 expfpart
= POLY5(fpart
, 9.9999994e-1f
, 6.9315308e-1f
, 2.4015361e-1f
, 5.5826318e-2f
, 8.9893397e-3f
, 1.8775767e-3f
);
637 #elif EXP_POLY_DEGREE == 4
638 expfpart
= POLY4(fpart
, 1.0000026f
, 6.9300383e-1f
, 2.4144275e-1f
, 5.2011464e-2f
, 1.3534167e-2f
);
639 #elif EXP_POLY_DEGREE == 3
640 expfpart
= POLY3(fpart
, 9.9992520e-1f
, 6.9583356e-1f
, 2.2606716e-1f
, 7.8024521e-2f
);
641 #elif EXP_POLY_DEGREE == 2
642 expfpart
= POLY2(fpart
, 1.0017247f
, 6.5763628e-1f
, 3.3718944e-1f
);
647 return _mm_mul_ps(expipart
, expfpart
);
651 * See http://www.devmaster.net/forums/showthread.php?p=43580
656 __m128i expmask
= _mm_set1_epi32(0x7f800000);
657 __m128i mantmask
= _mm_set1_epi32(0x007fffff);
658 __m128 one
= _mm_set1_ps(1.0f
);
660 __m128i i
= _mm_castps_si128(x
);
662 /* exp = (float) exponent(x) */
663 __m128 exp
= _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i
, expmask
), 23), _mm_set1_epi32(127)));
665 /* mant = (float) mantissa(x) */
666 __m128 mant
= _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i
, mantmask
)), one
);
670 /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
671 * These coefficients can be generate with
672 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
674 #if LOG_POLY_DEGREE == 6
675 logmant
= POLY5(mant
, 3.11578814719469302614f
, -3.32419399085241980044f
, 2.59883907202499966007f
, -1.23152682416275988241f
, 0.318212422185251071475f
, -0.0344359067839062357313f
);
676 #elif LOG_POLY_DEGREE == 5
677 logmant
= POLY4(mant
, 2.8882704548164776201f
, -2.52074962577807006663f
, 1.48116647521213171641f
, -0.465725644288844778798f
, 0.0596515482674574969533f
);
678 #elif LOG_POLY_DEGREE == 4
679 logmant
= POLY3(mant
, 2.61761038894603480148f
, -1.75647175389045657003f
, 0.688243882994381274313f
, -0.107254423828329604454f
);
680 #elif LOG_POLY_DEGREE == 3
681 logmant
= POLY2(mant
, 2.28330284476918490682f
, -1.04913055217340124191f
, 0.204446009836232697516f
);
686 /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
687 logmant
= _mm_mul_ps(logmant
, _mm_sub_ps(mant
, one
));
689 return _mm_add_ps(logmant
, exp
);
693 powf4(__m128 x
, __m128 y
)
695 return exp2f4(_mm_mul_ps(log2f4(x
), y
));
700 * Low-level instruction translators.
705 struct x86_function
*func
,
712 TGSI_EXEC_TEMP_7FFFFFFF_I
,
713 TGSI_EXEC_TEMP_7FFFFFFF_C
) );
718 struct x86_function
*func
,
725 make_xmm( xmm_src
) );
728 static void PIPE_CDECL
732 store
[0] = cosf( store
[0] );
733 store
[1] = cosf( store
[1] );
734 store
[2] = cosf( store
[2] );
735 store
[3] = cosf( store
[3] );
740 struct x86_function
*func
,
751 static void PIPE_CDECL
752 #if defined(PIPE_CC_GCC)
753 __attribute__((force_align_arg_pointer
))
758 _mm_store_ps(&store
[0], exp2f4( _mm_load_ps(&store
[0]) ));
763 struct x86_function
*func
,
776 struct x86_function
*func
,
785 static void PIPE_CDECL
789 store
[0] = floorf( store
[0] );
790 store
[1] = floorf( store
[1] );
791 store
[2] = floorf( store
[2] );
792 store
[3] = floorf( store
[3] );
797 struct x86_function
*func
,
808 static void PIPE_CDECL
812 store
[0] -= floorf( store
[0] );
813 store
[1] -= floorf( store
[1] );
814 store
[2] -= floorf( store
[2] );
815 store
[3] -= floorf( store
[3] );
820 struct x86_function
*func
,
831 static void PIPE_CDECL
832 #if defined(PIPE_CC_GCC)
833 __attribute__((force_align_arg_pointer
))
838 _mm_store_ps(&store
[0], log2f4( _mm_load_ps(&store
[0]) ));
843 struct x86_function
*func
,
856 struct x86_function
*func
,
863 make_xmm( xmm_src
) );
867 emit_mul (struct x86_function
*func
,
874 make_xmm( xmm_src
) );
879 struct x86_function
*func
,
886 TGSI_EXEC_TEMP_80000000_I
,
887 TGSI_EXEC_TEMP_80000000_C
) );
890 static void PIPE_CDECL
891 #if defined(PIPE_CC_GCC)
892 __attribute__((force_align_arg_pointer
))
898 _mm_store_ps(&store
[0], powf4( _mm_load_ps(&store
[0]), _mm_load_ps(&store
[4]) ));
900 store
[0] = powf( store
[0], store
[4] );
901 store
[1] = powf( store
[1], store
[5] );
902 store
[2] = powf( store
[2], store
[6] );
903 store
[3] = powf( store
[3], store
[7] );
909 struct x86_function
*func
,
914 emit_func_call_dst_src(
924 struct x86_function
*func
,
928 /* On Intel CPUs at least, this is only accurate to 12 bits -- not
929 * good enough. Need to either emit a proper divide or use the
930 * iterative technique described below in emit_rsqrt().
935 make_xmm( xmm_src
) );
940 struct x86_function
*func
,
945 /* Although rsqrtps() and rcpps() are low precision on some/all SSE
946 * implementations, it is possible to improve its precision at
947 * fairly low cost, using a newton/raphson step, as below:
949 * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
950 * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
952 * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
955 struct x86_reg dst
= make_xmm( xmm_dst
);
956 struct x86_reg src
= make_xmm( xmm_src
);
957 struct x86_reg tmp0
= make_xmm( 2 );
958 struct x86_reg tmp1
= make_xmm( 3 );
960 assert( xmm_dst
!= xmm_src
);
961 assert( xmm_dst
!= 2 && xmm_dst
!= 3 );
962 assert( xmm_src
!= 2 && xmm_src
!= 3 );
964 sse_movaps( func
, dst
, get_temp( TGSI_EXEC_TEMP_HALF_I
, TGSI_EXEC_TEMP_HALF_C
) );
965 sse_movaps( func
, tmp0
, get_temp( TGSI_EXEC_TEMP_THREE_I
, TGSI_EXEC_TEMP_THREE_C
) );
966 sse_rsqrtps( func
, tmp1
, src
);
967 sse_mulps( func
, src
, tmp1
);
968 sse_mulps( func
, dst
, tmp1
);
969 sse_mulps( func
, src
, tmp1
);
970 sse_subps( func
, tmp0
, src
);
971 sse_mulps( func
, dst
, tmp0
);
974 /* On Intel CPUs at least, this is only accurate to 12 bits -- not
980 make_xmm( xmm_src
) );
986 struct x86_function
*func
,
993 TGSI_EXEC_TEMP_80000000_I
,
994 TGSI_EXEC_TEMP_80000000_C
) );
997 static void PIPE_CDECL
1001 store
[0] = sinf( store
[0] );
1002 store
[1] = sinf( store
[1] );
1003 store
[2] = sinf( store
[2] );
1004 store
[3] = sinf( store
[3] );
1008 emit_sin (struct x86_function
*func
,
1021 struct x86_function
*func
,
1027 make_xmm( xmm_dst
),
1028 make_xmm( xmm_src
) );
1037 struct x86_function
*func
,
1039 const struct tgsi_full_src_register
*reg
,
1040 const unsigned chan_index
)
1042 unsigned swizzle
= tgsi_util_get_full_src_register_extswizzle( reg
, chan_index
);
1045 case TGSI_EXTSWIZZLE_X
:
1046 case TGSI_EXTSWIZZLE_Y
:
1047 case TGSI_EXTSWIZZLE_Z
:
1048 case TGSI_EXTSWIZZLE_W
:
1049 switch (reg
->SrcRegister
.File
) {
1050 case TGSI_FILE_CONSTANT
:
1054 reg
->SrcRegister
.Index
,
1056 reg
->SrcRegister
.Indirect
,
1057 reg
->SrcRegisterInd
.File
,
1058 reg
->SrcRegisterInd
.Index
);
1061 case TGSI_FILE_IMMEDIATE
:
1065 reg
->SrcRegister
.Index
,
1069 case TGSI_FILE_INPUT
:
1073 reg
->SrcRegister
.Index
,
1077 case TGSI_FILE_TEMPORARY
:
1081 reg
->SrcRegister
.Index
,
1090 case TGSI_EXTSWIZZLE_ZERO
:
1094 TGSI_EXEC_TEMP_00000000_I
,
1095 TGSI_EXEC_TEMP_00000000_C
);
1098 case TGSI_EXTSWIZZLE_ONE
:
1110 switch( tgsi_util_get_full_src_register_sign_mode( reg
, chan_index
) ) {
1111 case TGSI_UTIL_SIGN_CLEAR
:
1112 emit_abs( func
, xmm
);
1115 case TGSI_UTIL_SIGN_SET
:
1116 emit_setsign( func
, xmm
);
1119 case TGSI_UTIL_SIGN_TOGGLE
:
1120 emit_neg( func
, xmm
);
1123 case TGSI_UTIL_SIGN_KEEP
:
1128 #define FETCH( FUNC, INST, XMM, INDEX, CHAN )\
1129 emit_fetch( FUNC, XMM, &(INST).FullSrcRegisters[INDEX], CHAN )
1137 struct x86_function
*func
,
1139 const struct tgsi_full_dst_register
*reg
,
1140 const struct tgsi_full_instruction
*inst
,
1141 unsigned chan_index
)
1143 switch( reg
->DstRegister
.File
) {
1144 case TGSI_FILE_OUTPUT
:
1148 reg
->DstRegister
.Index
,
1152 case TGSI_FILE_TEMPORARY
:
1156 reg
->DstRegister
.Index
,
1160 case TGSI_FILE_ADDRESS
:
1164 reg
->DstRegister
.Index
,
1172 switch( inst
->Instruction
.Saturate
) {
1176 case TGSI_SAT_ZERO_ONE
:
1180 case TGSI_SAT_MINUS_PLUS_ONE
:
1186 #define STORE( FUNC, INST, XMM, INDEX, CHAN )\
1187 emit_store( FUNC, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN )
1190 * High-level instruction translators.
1195 struct x86_function
*func
,
1196 const struct tgsi_full_src_register
*reg
)
1198 unsigned uniquemask
;
1199 unsigned registers
[4];
1200 unsigned nextregister
= 0;
1201 unsigned firstchan
= ~0;
1202 unsigned chan_index
;
1204 /* This mask stores component bits that were already tested. Note that
1205 * we test if the value is less than zero, so 1.0 and 0.0 need not to be
1207 uniquemask
= (1 << TGSI_EXTSWIZZLE_ZERO
) | (1 << TGSI_EXTSWIZZLE_ONE
);
1209 FOR_EACH_CHANNEL( chan_index
) {
1212 /* unswizzle channel */
1213 swizzle
= tgsi_util_get_full_src_register_extswizzle(
1217 /* check if the component has not been already tested */
1218 if( !(uniquemask
& (1 << swizzle
)) ) {
1219 uniquemask
|= 1 << swizzle
;
1221 /* allocate register */
1222 registers
[chan_index
] = nextregister
;
1230 /* mark the first channel used */
1231 if( firstchan
== ~0 ) {
1232 firstchan
= chan_index
;
1239 x86_make_reg( file_REG32
, reg_AX
) );
1242 x86_make_reg( file_REG32
, reg_DX
) );
1244 FOR_EACH_CHANNEL( chan_index
) {
1245 if( uniquemask
& (1 << chan_index
) ) {
1248 make_xmm( registers
[chan_index
] ),
1250 TGSI_EXEC_TEMP_00000000_I
,
1251 TGSI_EXEC_TEMP_00000000_C
),
1254 if( chan_index
== firstchan
) {
1257 x86_make_reg( file_REG32
, reg_AX
),
1258 make_xmm( registers
[chan_index
] ) );
1263 x86_make_reg( file_REG32
, reg_DX
),
1264 make_xmm( registers
[chan_index
] ) );
1267 x86_make_reg( file_REG32
, reg_AX
),
1268 x86_make_reg( file_REG32
, reg_DX
) );
1276 TGSI_EXEC_TEMP_KILMASK_I
,
1277 TGSI_EXEC_TEMP_KILMASK_C
),
1278 x86_make_reg( file_REG32
, reg_AX
) );
1282 x86_make_reg( file_REG32
, reg_DX
) );
1285 x86_make_reg( file_REG32
, reg_AX
) );
1291 struct x86_function
*func
)
1293 /* XXX todo / fix me */
1299 struct x86_function
*func
,
1300 struct tgsi_full_instruction
*inst
,
1303 unsigned chan_index
;
1305 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1306 FETCH( func
, *inst
, 0, 0, chan_index
);
1307 FETCH( func
, *inst
, 1, 1, chan_index
);
1319 STORE( func
, *inst
, 0, 0, chan_index
);
1325 struct x86_function
*func
,
1326 struct tgsi_full_instruction
*inst
)
1328 unsigned chan_index
;
1330 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1331 FETCH( func
, *inst
, 0, 0, chan_index
);
1332 FETCH( func
, *inst
, 1, 1, chan_index
);
1333 FETCH( func
, *inst
, 2, 2, chan_index
);
1338 TGSI_EXEC_TEMP_00000000_I
,
1339 TGSI_EXEC_TEMP_00000000_C
),
1353 STORE( func
, *inst
, 0, 0, chan_index
);
1359 struct x86_function
*func
,
1360 struct tgsi_full_instruction
*inst
)
1362 unsigned chan_index
;
1364 switch (inst
->Instruction
.Opcode
) {
1365 case TGSI_OPCODE_ARL
:
1366 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1367 FETCH( func
, *inst
, 0, 0, chan_index
);
1368 emit_f2it( func
, 0 );
1369 STORE( func
, *inst
, 0, 0, chan_index
);
1373 case TGSI_OPCODE_MOV
:
1374 case TGSI_OPCODE_SWZ
:
1375 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1376 FETCH( func
, *inst
, 0, 0, chan_index
);
1377 STORE( func
, *inst
, 0, 0, chan_index
);
1381 case TGSI_OPCODE_LIT
:
1382 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1383 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) ) {
1389 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ) {
1390 STORE( func
, *inst
, 0, 0, CHAN_X
);
1392 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) ) {
1393 STORE( func
, *inst
, 0, 0, CHAN_W
);
1396 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ||
1397 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
1398 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ) {
1399 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1404 TGSI_EXEC_TEMP_00000000_I
,
1405 TGSI_EXEC_TEMP_00000000_C
) );
1406 STORE( func
, *inst
, 0, 0, CHAN_Y
);
1408 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
1409 /* XMM[1] = SrcReg[0].yyyy */
1410 FETCH( func
, *inst
, 1, 0, CHAN_Y
);
1411 /* XMM[1] = max(XMM[1], 0) */
1416 TGSI_EXEC_TEMP_00000000_I
,
1417 TGSI_EXEC_TEMP_00000000_C
) );
1418 /* XMM[2] = SrcReg[0].wwww */
1419 FETCH( func
, *inst
, 2, 0, CHAN_W
);
1420 /* XMM[2] = min(XMM[2], 128.0) */
1425 TGSI_EXEC_TEMP_128_I
,
1426 TGSI_EXEC_TEMP_128_C
) );
1427 /* XMM[2] = max(XMM[2], -128.0) */
1432 TGSI_EXEC_TEMP_MINUS_128_I
,
1433 TGSI_EXEC_TEMP_MINUS_128_C
) );
1434 emit_pow( func
, 3, 1, 2 );
1435 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1449 STORE( func
, *inst
, 2, 0, CHAN_Z
);
1454 case TGSI_OPCODE_RCP
:
1455 /* TGSI_OPCODE_RECIP */
1456 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1457 emit_rcp( func
, 0, 0 );
1458 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1459 STORE( func
, *inst
, 0, 0, chan_index
);
1463 case TGSI_OPCODE_RSQ
:
1464 /* TGSI_OPCODE_RECIPSQRT */
1465 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1466 emit_rsqrt( func
, 1, 0 );
1467 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1468 STORE( func
, *inst
, 1, 0, chan_index
);
1472 case TGSI_OPCODE_EXP
:
1473 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1474 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ||
1475 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
1476 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1477 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1478 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
1479 emit_MOV( func
, 1, 0 );
1480 emit_flr( func
, 2, 1 );
1481 /* dst.x = ex2(floor(src.x)) */
1482 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
)) {
1483 emit_MOV( func
, 2, 1 );
1484 emit_ex2( func
, 3, 2 );
1485 STORE( func
, *inst
, 2, 0, CHAN_X
);
1487 /* dst.y = src.x - floor(src.x) */
1488 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
1489 emit_MOV( func
, 2, 0 );
1490 emit_sub( func
, 2, 1 );
1491 STORE( func
, *inst
, 2, 0, CHAN_Y
);
1494 /* dst.z = ex2(src.x) */
1495 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
1496 emit_ex2( func
, 3, 0 );
1497 STORE( func
, *inst
, 0, 0, CHAN_Z
);
1501 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
)) {
1502 emit_tempf( func
, 0, TEMP_ONE_I
, TEMP_ONE_C
);
1503 STORE( func
, *inst
, 0, 0, CHAN_W
);
1507 case TGSI_OPCODE_LOG
:
1508 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1509 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ||
1510 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
1511 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1512 emit_abs( func
, 0 );
1513 emit_MOV( func
, 1, 0 );
1514 emit_lg2( func
, 2, 1 );
1515 /* dst.z = lg2(abs(src.x)) */
1516 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
1517 STORE( func
, *inst
, 1, 0, CHAN_Z
);
1519 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1520 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
1521 emit_flr( func
, 2, 1 );
1522 /* dst.x = floor(lg2(abs(src.x))) */
1523 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
)) {
1524 STORE( func
, *inst
, 1, 0, CHAN_X
);
1526 /* dst.x = abs(src)/ex2(floor(lg2(abs(src.x)))) */
1527 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
1528 emit_ex2( func
, 2, 1 );
1529 emit_rcp( func
, 1, 1 );
1530 emit_mul( func
, 0, 1 );
1531 STORE( func
, *inst
, 0, 0, CHAN_Y
);
1536 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
)) {
1537 emit_tempf( func
, 0, TEMP_ONE_I
, TEMP_ONE_C
);
1538 STORE( func
, *inst
, 0, 0, CHAN_W
);
1542 case TGSI_OPCODE_MUL
:
1543 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1544 FETCH( func
, *inst
, 0, 0, chan_index
);
1545 FETCH( func
, *inst
, 1, 1, chan_index
);
1546 emit_mul( func
, 0, 1 );
1547 STORE( func
, *inst
, 0, 0, chan_index
);
1551 case TGSI_OPCODE_ADD
:
1552 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1553 FETCH( func
, *inst
, 0, 0, chan_index
);
1554 FETCH( func
, *inst
, 1, 1, chan_index
);
1555 emit_add( func
, 0, 1 );
1556 STORE( func
, *inst
, 0, 0, chan_index
);
1560 case TGSI_OPCODE_DP3
:
1561 /* TGSI_OPCODE_DOT3 */
1562 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1563 FETCH( func
, *inst
, 1, 1, CHAN_X
);
1564 emit_mul( func
, 0, 1 );
1565 FETCH( func
, *inst
, 1, 0, CHAN_Y
);
1566 FETCH( func
, *inst
, 2, 1, CHAN_Y
);
1567 emit_mul( func
, 1, 2 );
1568 emit_add( func
, 0, 1 );
1569 FETCH( func
, *inst
, 1, 0, CHAN_Z
);
1570 FETCH( func
, *inst
, 2, 1, CHAN_Z
);
1571 emit_mul( func
, 1, 2 );
1572 emit_add( func
, 0, 1 );
1573 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1574 STORE( func
, *inst
, 0, 0, chan_index
);
1578 case TGSI_OPCODE_DP4
:
1579 /* TGSI_OPCODE_DOT4 */
1580 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1581 FETCH( func
, *inst
, 1, 1, CHAN_X
);
1582 emit_mul( func
, 0, 1 );
1583 FETCH( func
, *inst
, 1, 0, CHAN_Y
);
1584 FETCH( func
, *inst
, 2, 1, CHAN_Y
);
1585 emit_mul( func
, 1, 2 );
1586 emit_add( func
, 0, 1 );
1587 FETCH( func
, *inst
, 1, 0, CHAN_Z
);
1588 FETCH( func
, *inst
, 2, 1, CHAN_Z
);
1589 emit_mul(func
, 1, 2 );
1590 emit_add(func
, 0, 1 );
1591 FETCH( func
, *inst
, 1, 0, CHAN_W
);
1592 FETCH( func
, *inst
, 2, 1, CHAN_W
);
1593 emit_mul( func
, 1, 2 );
1594 emit_add( func
, 0, 1 );
1595 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1596 STORE( func
, *inst
, 0, 0, chan_index
);
1600 case TGSI_OPCODE_DST
:
1601 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) {
1607 STORE( func
, *inst
, 0, 0, CHAN_X
);
1609 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) {
1610 FETCH( func
, *inst
, 0, 0, CHAN_Y
);
1611 FETCH( func
, *inst
, 1, 1, CHAN_Y
);
1612 emit_mul( func
, 0, 1 );
1613 STORE( func
, *inst
, 0, 0, CHAN_Y
);
1615 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) {
1616 FETCH( func
, *inst
, 0, 0, CHAN_Z
);
1617 STORE( func
, *inst
, 0, 0, CHAN_Z
);
1619 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) {
1620 FETCH( func
, *inst
, 0, 1, CHAN_W
);
1621 STORE( func
, *inst
, 0, 0, CHAN_W
);
1625 case TGSI_OPCODE_MIN
:
1626 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1627 FETCH( func
, *inst
, 0, 0, chan_index
);
1628 FETCH( func
, *inst
, 1, 1, chan_index
);
1633 STORE( func
, *inst
, 0, 0, chan_index
);
1637 case TGSI_OPCODE_MAX
:
1638 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1639 FETCH( func
, *inst
, 0, 0, chan_index
);
1640 FETCH( func
, *inst
, 1, 1, chan_index
);
1645 STORE( func
, *inst
, 0, 0, chan_index
);
1649 case TGSI_OPCODE_SLT
:
1650 /* TGSI_OPCODE_SETLT */
1651 emit_setcc( func
, inst
, cc_LessThan
);
1654 case TGSI_OPCODE_SGE
:
1655 /* TGSI_OPCODE_SETGE */
1656 emit_setcc( func
, inst
, cc_NotLessThan
);
1659 case TGSI_OPCODE_MAD
:
1660 /* TGSI_OPCODE_MADD */
1661 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1662 FETCH( func
, *inst
, 0, 0, chan_index
);
1663 FETCH( func
, *inst
, 1, 1, chan_index
);
1664 FETCH( func
, *inst
, 2, 2, chan_index
);
1665 emit_mul( func
, 0, 1 );
1666 emit_add( func
, 0, 2 );
1667 STORE( func
, *inst
, 0, 0, chan_index
);
1671 case TGSI_OPCODE_SUB
:
1672 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1673 FETCH( func
, *inst
, 0, 0, chan_index
);
1674 FETCH( func
, *inst
, 1, 1, chan_index
);
1675 emit_sub( func
, 0, 1 );
1676 STORE( func
, *inst
, 0, 0, chan_index
);
1680 case TGSI_OPCODE_LERP
:
1681 /* TGSI_OPCODE_LRP */
1682 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1683 FETCH( func
, *inst
, 0, 0, chan_index
);
1684 FETCH( func
, *inst
, 1, 1, chan_index
);
1685 FETCH( func
, *inst
, 2, 2, chan_index
);
1686 emit_sub( func
, 1, 2 );
1687 emit_mul( func
, 0, 1 );
1688 emit_add( func
, 0, 2 );
1689 STORE( func
, *inst
, 0, 0, chan_index
);
1693 case TGSI_OPCODE_CND
:
1697 case TGSI_OPCODE_CND0
:
1701 case TGSI_OPCODE_DOT2ADD
:
1702 /* TGSI_OPCODE_DP2A */
1706 case TGSI_OPCODE_INDEX
:
1710 case TGSI_OPCODE_NEGATE
:
1714 case TGSI_OPCODE_FRAC
:
1715 /* TGSI_OPCODE_FRC */
1716 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1717 FETCH( func
, *inst
, 0, 0, chan_index
);
1718 emit_frc( func
, 0, 0 );
1719 STORE( func
, *inst
, 0, 0, chan_index
);
1723 case TGSI_OPCODE_CLAMP
:
1727 case TGSI_OPCODE_FLOOR
:
1728 /* TGSI_OPCODE_FLR */
1729 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1730 FETCH( func
, *inst
, 0, 0, chan_index
);
1731 emit_flr( func
, 0, 0 );
1732 STORE( func
, *inst
, 0, 0, chan_index
);
1736 case TGSI_OPCODE_ROUND
:
1740 case TGSI_OPCODE_EXPBASE2
:
1741 /* TGSI_OPCODE_EX2 */
1742 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1743 emit_ex2( func
, 0, 0 );
1744 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1745 STORE( func
, *inst
, 0, 0, chan_index
);
1749 case TGSI_OPCODE_LOGBASE2
:
1750 /* TGSI_OPCODE_LG2 */
1751 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1752 emit_lg2( func
, 0, 0 );
1753 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1754 STORE( func
, *inst
, 0, 0, chan_index
);
1758 case TGSI_OPCODE_POWER
:
1759 /* TGSI_OPCODE_POW */
1760 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1761 FETCH( func
, *inst
, 1, 1, CHAN_X
);
1762 emit_pow( func
, 0, 0, 1 );
1763 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1764 STORE( func
, *inst
, 0, 0, chan_index
);
1768 case TGSI_OPCODE_CROSSPRODUCT
:
1769 /* TGSI_OPCODE_XPD */
1770 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1771 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ) {
1772 FETCH( func
, *inst
, 1, 1, CHAN_Z
);
1773 FETCH( func
, *inst
, 3, 0, CHAN_Z
);
1775 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1776 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
1777 FETCH( func
, *inst
, 0, 0, CHAN_Y
);
1778 FETCH( func
, *inst
, 4, 1, CHAN_Y
);
1780 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) {
1781 emit_MOV( func
, 2, 0 );
1782 emit_mul( func
, 2, 1 );
1783 emit_MOV( func
, 5, 3 );
1784 emit_mul( func
, 5, 4 );
1785 emit_sub( func
, 2, 5 );
1786 STORE( func
, *inst
, 2, 0, CHAN_X
);
1788 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ||
1789 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
1790 FETCH( func
, *inst
, 2, 1, CHAN_X
);
1791 FETCH( func
, *inst
, 5, 0, CHAN_X
);
1793 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) {
1794 emit_mul( func
, 3, 2 );
1795 emit_mul( func
, 1, 5 );
1796 emit_sub( func
, 3, 1 );
1797 STORE( func
, *inst
, 3, 0, CHAN_Y
);
1799 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) {
1800 emit_mul( func
, 5, 4 );
1801 emit_mul( func
, 0, 2 );
1802 emit_sub( func
, 5, 0 );
1803 STORE( func
, *inst
, 5, 0, CHAN_Z
);
1805 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) {
1811 STORE( func
, *inst
, 0, 0, CHAN_W
);
1815 case TGSI_OPCODE_MULTIPLYMATRIX
:
1819 case TGSI_OPCODE_ABS
:
1820 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1821 FETCH( func
, *inst
, 0, 0, chan_index
);
1822 emit_abs( func
, 0) ;
1824 STORE( func
, *inst
, 0, 0, chan_index
);
1828 case TGSI_OPCODE_RCC
:
1832 case TGSI_OPCODE_DPH
:
1833 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1834 FETCH( func
, *inst
, 1, 1, CHAN_X
);
1835 emit_mul( func
, 0, 1 );
1836 FETCH( func
, *inst
, 1, 0, CHAN_Y
);
1837 FETCH( func
, *inst
, 2, 1, CHAN_Y
);
1838 emit_mul( func
, 1, 2 );
1839 emit_add( func
, 0, 1 );
1840 FETCH( func
, *inst
, 1, 0, CHAN_Z
);
1841 FETCH( func
, *inst
, 2, 1, CHAN_Z
);
1842 emit_mul( func
, 1, 2 );
1843 emit_add( func
, 0, 1 );
1844 FETCH( func
, *inst
, 1, 1, CHAN_W
);
1845 emit_add( func
, 0, 1 );
1846 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1847 STORE( func
, *inst
, 0, 0, chan_index
);
1851 case TGSI_OPCODE_COS
:
1852 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1853 emit_cos( func
, 0, 0 );
1854 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1855 STORE( func
, *inst
, 0, 0, chan_index
);
1859 case TGSI_OPCODE_DDX
:
1863 case TGSI_OPCODE_DDY
:
1867 case TGSI_OPCODE_KILP
:
1868 /* predicated kill */
1870 return 0; /* XXX fix me */
1873 case TGSI_OPCODE_KIL
:
1874 /* conditional kill */
1875 emit_kil( func
, &inst
->FullSrcRegisters
[0] );
1878 case TGSI_OPCODE_PK2H
:
1882 case TGSI_OPCODE_PK2US
:
1886 case TGSI_OPCODE_PK4B
:
1890 case TGSI_OPCODE_PK4UB
:
1894 case TGSI_OPCODE_RFL
:
1898 case TGSI_OPCODE_SEQ
:
1902 case TGSI_OPCODE_SFL
:
1906 case TGSI_OPCODE_SGT
:
1910 case TGSI_OPCODE_SIN
:
1911 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1912 emit_sin( func
, 0, 0 );
1913 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1914 STORE( func
, *inst
, 0, 0, chan_index
);
1918 case TGSI_OPCODE_SLE
:
1922 case TGSI_OPCODE_SNE
:
1926 case TGSI_OPCODE_STR
:
1930 case TGSI_OPCODE_TEX
:
1932 /* Disable dummy texture code:
1939 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1940 STORE( func
, *inst
, 0, 0, chan_index
);
1948 case TGSI_OPCODE_TXD
:
1952 case TGSI_OPCODE_UP2H
:
1956 case TGSI_OPCODE_UP2US
:
1960 case TGSI_OPCODE_UP4B
:
1964 case TGSI_OPCODE_UP4UB
:
1968 case TGSI_OPCODE_X2D
:
1972 case TGSI_OPCODE_ARA
:
1976 case TGSI_OPCODE_ARR
:
1980 case TGSI_OPCODE_BRA
:
1984 case TGSI_OPCODE_CAL
:
1988 case TGSI_OPCODE_RET
:
1992 case TGSI_OPCODE_END
:
1995 case TGSI_OPCODE_SSG
:
1999 case TGSI_OPCODE_CMP
:
2000 emit_cmp (func
, inst
);
2003 case TGSI_OPCODE_SCS
:
2004 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) {
2005 FETCH( func
, *inst
, 0, 0, CHAN_X
);
2006 emit_cos( func
, 0, 0 );
2007 STORE( func
, *inst
, 0, 0, CHAN_X
);
2009 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) {
2010 FETCH( func
, *inst
, 0, 0, CHAN_X
);
2011 emit_sin( func
, 0, 0 );
2012 STORE( func
, *inst
, 0, 0, CHAN_Y
);
2014 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) {
2018 TGSI_EXEC_TEMP_00000000_I
,
2019 TGSI_EXEC_TEMP_00000000_C
);
2020 STORE( func
, *inst
, 0, 0, CHAN_Z
);
2022 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) {
2028 STORE( func
, *inst
, 0, 0, CHAN_W
);
2032 case TGSI_OPCODE_TXB
:
2036 case TGSI_OPCODE_NRM
:
2040 case TGSI_OPCODE_DIV
:
2044 case TGSI_OPCODE_DP2
:
2048 case TGSI_OPCODE_TXL
:
2052 case TGSI_OPCODE_BRK
:
2056 case TGSI_OPCODE_IF
:
2060 case TGSI_OPCODE_LOOP
:
2064 case TGSI_OPCODE_REP
:
2068 case TGSI_OPCODE_ELSE
:
2072 case TGSI_OPCODE_ENDIF
:
2076 case TGSI_OPCODE_ENDLOOP
:
2080 case TGSI_OPCODE_ENDREP
:
2084 case TGSI_OPCODE_PUSHA
:
2088 case TGSI_OPCODE_POPA
:
2092 case TGSI_OPCODE_CEIL
:
2096 case TGSI_OPCODE_I2F
:
2100 case TGSI_OPCODE_NOT
:
2104 case TGSI_OPCODE_TRUNC
:
2108 case TGSI_OPCODE_SHL
:
2112 case TGSI_OPCODE_SHR
:
2116 case TGSI_OPCODE_AND
:
2120 case TGSI_OPCODE_OR
:
2124 case TGSI_OPCODE_MOD
:
2128 case TGSI_OPCODE_XOR
:
2132 case TGSI_OPCODE_SAD
:
2136 case TGSI_OPCODE_TXF
:
2140 case TGSI_OPCODE_TXQ
:
2144 case TGSI_OPCODE_CONT
:
2148 case TGSI_OPCODE_EMIT
:
2152 case TGSI_OPCODE_ENDPRIM
:
2165 struct x86_function
*func
,
2166 struct tgsi_full_declaration
*decl
)
2168 if( decl
->Declaration
.File
== TGSI_FILE_INPUT
) {
2169 unsigned first
, last
, mask
;
2172 first
= decl
->DeclarationRange
.First
;
2173 last
= decl
->DeclarationRange
.Last
;
2174 mask
= decl
->Declaration
.UsageMask
;
2176 for( i
= first
; i
<= last
; i
++ ) {
2177 for( j
= 0; j
< NUM_CHANNELS
; j
++ ) {
2178 if( mask
& (1 << j
) ) {
2179 switch( decl
->Declaration
.Interpolate
) {
2180 case TGSI_INTERPOLATE_CONSTANT
:
2181 emit_coef_a0( func
, 0, i
, j
);
2182 emit_inputs( func
, 0, i
, j
);
2185 case TGSI_INTERPOLATE_LINEAR
:
2186 emit_tempf( func
, 0, 0, TGSI_SWIZZLE_X
);
2187 emit_coef_dadx( func
, 1, i
, j
);
2188 emit_tempf( func
, 2, 0, TGSI_SWIZZLE_Y
);
2189 emit_coef_dady( func
, 3, i
, j
);
2190 emit_mul( func
, 0, 1 ); /* x * dadx */
2191 emit_coef_a0( func
, 4, i
, j
);
2192 emit_mul( func
, 2, 3 ); /* y * dady */
2193 emit_add( func
, 0, 4 ); /* x * dadx + a0 */
2194 emit_add( func
, 0, 2 ); /* x * dadx + y * dady + a0 */
2195 emit_inputs( func
, 0, i
, j
);
2198 case TGSI_INTERPOLATE_PERSPECTIVE
:
2199 emit_tempf( func
, 0, 0, TGSI_SWIZZLE_X
);
2200 emit_coef_dadx( func
, 1, i
, j
);
2201 emit_tempf( func
, 2, 0, TGSI_SWIZZLE_Y
);
2202 emit_coef_dady( func
, 3, i
, j
);
2203 emit_mul( func
, 0, 1 ); /* x * dadx */
2204 emit_tempf( func
, 4, 0, TGSI_SWIZZLE_W
);
2205 emit_coef_a0( func
, 5, i
, j
);
2206 emit_rcp( func
, 4, 4 ); /* 1.0 / w */
2207 emit_mul( func
, 2, 3 ); /* y * dady */
2208 emit_add( func
, 0, 5 ); /* x * dadx + a0 */
2209 emit_add( func
, 0, 2 ); /* x * dadx + y * dady + a0 */
2210 emit_mul( func
, 0, 4 ); /* (x * dadx + y * dady + a0) / w */
2211 emit_inputs( func
, 0, i
, j
);
2224 static void aos_to_soa( struct x86_function
*func
,
2230 struct x86_reg soa_input
= x86_make_reg( file_REG32
, reg_AX
);
2231 struct x86_reg aos_input
= x86_make_reg( file_REG32
, reg_BX
);
2232 struct x86_reg num_inputs
= x86_make_reg( file_REG32
, reg_CX
);
2233 struct x86_reg stride
= x86_make_reg( file_REG32
, reg_DX
);
2238 x86_push( func
, x86_make_reg( file_REG32
, reg_BX
) );
2240 x86_mov( func
, aos_input
, x86_fn_arg( func
, arg_aos
) );
2241 x86_mov( func
, soa_input
, x86_fn_arg( func
, arg_soa
) );
2242 x86_mov( func
, num_inputs
, x86_fn_arg( func
, arg_num
) );
2243 x86_mov( func
, stride
, x86_fn_arg( func
, arg_stride
) );
2246 inner_loop
= x86_get_label( func
);
2248 x86_push( func
, aos_input
);
2249 sse_movlps( func
, make_xmm( 0 ), x86_make_disp( aos_input
, 0 ) );
2250 sse_movlps( func
, make_xmm( 3 ), x86_make_disp( aos_input
, 8 ) );
2251 x86_add( func
, aos_input
, stride
);
2252 sse_movhps( func
, make_xmm( 0 ), x86_make_disp( aos_input
, 0 ) );
2253 sse_movhps( func
, make_xmm( 3 ), x86_make_disp( aos_input
, 8 ) );
2254 x86_add( func
, aos_input
, stride
);
2255 sse_movlps( func
, make_xmm( 1 ), x86_make_disp( aos_input
, 0 ) );
2256 sse_movlps( func
, make_xmm( 4 ), x86_make_disp( aos_input
, 8 ) );
2257 x86_add( func
, aos_input
, stride
);
2258 sse_movhps( func
, make_xmm( 1 ), x86_make_disp( aos_input
, 0 ) );
2259 sse_movhps( func
, make_xmm( 4 ), x86_make_disp( aos_input
, 8 ) );
2260 x86_pop( func
, aos_input
);
2262 sse_movaps( func
, make_xmm( 2 ), make_xmm( 0 ) );
2263 sse_movaps( func
, make_xmm( 5 ), make_xmm( 3 ) );
2264 sse_shufps( func
, make_xmm( 0 ), make_xmm( 1 ), 0x88 );
2265 sse_shufps( func
, make_xmm( 2 ), make_xmm( 1 ), 0xdd );
2266 sse_shufps( func
, make_xmm( 3 ), make_xmm( 4 ), 0x88 );
2267 sse_shufps( func
, make_xmm( 5 ), make_xmm( 4 ), 0xdd );
2269 sse_movups( func
, x86_make_disp( soa_input
, 0 ), make_xmm( 0 ) );
2270 sse_movups( func
, x86_make_disp( soa_input
, 16 ), make_xmm( 2 ) );
2271 sse_movups( func
, x86_make_disp( soa_input
, 32 ), make_xmm( 3 ) );
2272 sse_movups( func
, x86_make_disp( soa_input
, 48 ), make_xmm( 5 ) );
2274 /* Advance to next input */
2275 x86_lea( func
, aos_input
, x86_make_disp(aos_input
, 16) );
2276 x86_lea( func
, soa_input
, x86_make_disp(soa_input
, 64) );
2278 /* while --num_inputs */
2279 x86_dec( func
, num_inputs
);
2280 x86_jcc( func
, cc_NE
, inner_loop
);
2283 x86_pop( func
, aos_input
);
2286 static void soa_to_aos( struct x86_function
*func
, uint aos
, uint soa
, uint num
, uint stride
)
2288 struct x86_reg soa_output
;
2289 struct x86_reg aos_output
;
2290 struct x86_reg num_outputs
;
2291 struct x86_reg temp
;
2294 soa_output
= x86_make_reg( file_REG32
, reg_AX
);
2295 aos_output
= x86_make_reg( file_REG32
, reg_BX
);
2296 num_outputs
= x86_make_reg( file_REG32
, reg_CX
);
2297 temp
= x86_make_reg( file_REG32
, reg_DX
);
2300 x86_push( func
, aos_output
);
2302 x86_mov( func
, soa_output
, x86_fn_arg( func
, soa
) );
2303 x86_mov( func
, aos_output
, x86_fn_arg( func
, aos
) );
2304 x86_mov( func
, num_outputs
, x86_fn_arg( func
, num
) );
2307 inner_loop
= x86_get_label( func
);
2309 sse_movups( func
, make_xmm( 0 ), x86_make_disp( soa_output
, 0 ) );
2310 sse_movups( func
, make_xmm( 1 ), x86_make_disp( soa_output
, 16 ) );
2311 sse_movups( func
, make_xmm( 3 ), x86_make_disp( soa_output
, 32 ) );
2312 sse_movups( func
, make_xmm( 4 ), x86_make_disp( soa_output
, 48 ) );
2314 sse_movaps( func
, make_xmm( 2 ), make_xmm( 0 ) );
2315 sse_movaps( func
, make_xmm( 5 ), make_xmm( 3 ) );
2316 sse_unpcklps( func
, make_xmm( 0 ), make_xmm( 1 ) );
2317 sse_unpckhps( func
, make_xmm( 2 ), make_xmm( 1 ) );
2318 sse_unpcklps( func
, make_xmm( 3 ), make_xmm( 4 ) );
2319 sse_unpckhps( func
, make_xmm( 5 ), make_xmm( 4 ) );
2321 x86_mov( func
, temp
, x86_fn_arg( func
, stride
) );
2322 x86_push( func
, aos_output
);
2323 sse_movlps( func
, x86_make_disp( aos_output
, 0 ), make_xmm( 0 ) );
2324 sse_movlps( func
, x86_make_disp( aos_output
, 8 ), make_xmm( 3 ) );
2325 x86_add( func
, aos_output
, temp
);
2326 sse_movhps( func
, x86_make_disp( aos_output
, 0 ), make_xmm( 0 ) );
2327 sse_movhps( func
, x86_make_disp( aos_output
, 8 ), make_xmm( 3 ) );
2328 x86_add( func
, aos_output
, temp
);
2329 sse_movlps( func
, x86_make_disp( aos_output
, 0 ), make_xmm( 2 ) );
2330 sse_movlps( func
, x86_make_disp( aos_output
, 8 ), make_xmm( 5 ) );
2331 x86_add( func
, aos_output
, temp
);
2332 sse_movhps( func
, x86_make_disp( aos_output
, 0 ), make_xmm( 2 ) );
2333 sse_movhps( func
, x86_make_disp( aos_output
, 8 ), make_xmm( 5 ) );
2334 x86_pop( func
, aos_output
);
2336 /* Advance to next output */
2337 x86_lea( func
, aos_output
, x86_make_disp(aos_output
, 16) );
2338 x86_lea( func
, soa_output
, x86_make_disp(soa_output
, 64) );
2340 /* while --num_outputs */
2341 x86_dec( func
, num_outputs
);
2342 x86_jcc( func
, cc_NE
, inner_loop
);
2345 x86_pop( func
, aos_output
);
 * Translate a TGSI vertex/fragment shader to SSE2 code.
 * Slightly different things are done for vertex vs. fragment shaders.
 *
 * Note that fragment shaders are responsible for interpolating shader
 * inputs. Because on x86 we have only 4 GP registers, and here we
 * have 5 shader arguments (input, output, const, temp and coef), the
 * code is split into two phases -- DECLARATION and INSTRUCTION phase.
 * GP register holding the output argument is aliased with the coeff
 * argument, as outputs are not needed in the DECLARATION phase.
 *
 * \param tokens  the TGSI input shader
 * \param func  the output SSE code/function
 * \param immediates  buffer to place immediates, later passed to SSE func
 * \param return  1 for success, 0 if translation failed
2366 const struct tgsi_token
*tokens
,
2367 struct x86_function
*func
,
2368 float (*immediates
)[4],
2369 boolean do_swizzles
)
2371 struct tgsi_parse_context parse
;
2372 boolean instruction_phase
= FALSE
;
2374 uint num_immediates
= 0;
2378 func
->csr
= func
->store
;
2380 tgsi_parse_init( &parse
, tokens
);
2382 /* Can't just use EDI, EBX without save/restoring them:
2386 get_immediate_base() );
2394 * Different function args for vertex/fragment shaders:
2396 if (parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_FRAGMENT
) {
2397 /* DECLARATION phase, do not load output argument. */
2401 x86_fn_arg( func
, 1 ) );
2402 /* skipping outputs argument here */
2406 x86_fn_arg( func
, 3 ) );
2410 x86_fn_arg( func
, 4 ) );
2414 x86_fn_arg( func
, 5 ) );
2417 get_immediate_base(),
2418 x86_fn_arg( func
, 6 ) );
2421 assert(parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_VERTEX
);
2426 1, /* machine->input */
2428 8 ); /* input_stride */
2433 x86_fn_arg( func
, 1 ) );
2437 x86_fn_arg( func
, 2 ) );
2441 x86_fn_arg( func
, 3 ) );
2445 x86_fn_arg( func
, 4 ) );
2448 get_immediate_base(),
2449 x86_fn_arg( func
, 5 ) );
2452 while( !tgsi_parse_end_of_tokens( &parse
) && ok
) {
2453 tgsi_parse_token( &parse
);
2455 switch( parse
.FullToken
.Token
.Type
) {
2456 case TGSI_TOKEN_TYPE_DECLARATION
:
2457 if (parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_FRAGMENT
) {
2460 &parse
.FullToken
.FullDeclaration
);
2464 case TGSI_TOKEN_TYPE_INSTRUCTION
:
2465 if (parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_FRAGMENT
) {
2466 if( !instruction_phase
) {
2467 /* INSTRUCTION phase, overwrite coeff with output. */
2468 instruction_phase
= TRUE
;
2472 x86_fn_arg( func
, 2 ) );
2476 ok
= emit_instruction(
2478 &parse
.FullToken
.FullInstruction
);
2481 debug_printf("failed to translate tgsi opcode %d to SSE (%s)\n",
2482 parse
.FullToken
.FullInstruction
.Instruction
.Opcode
,
2483 parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_VERTEX
?
2484 "vertex shader" : "fragment shader");
2488 case TGSI_TOKEN_TYPE_IMMEDIATE
:
2489 /* simply copy the immediate values into the next immediates[] slot */
2491 const uint size
= parse
.FullToken
.FullImmediate
.Immediate
.Size
- 1;
2494 assert(num_immediates
< TGSI_EXEC_NUM_IMMEDIATES
);
2495 for( i
= 0; i
< size
; i
++ ) {
2496 immediates
[num_immediates
][i
] =
2497 parse
.FullToken
.FullImmediate
.u
.ImmediateFloat32
[i
].Float
;
2500 debug_printf("SSE FS immediate[%d] = %f %f %f %f\n",
2502 immediates
[num_immediates
][0],
2503 immediates
[num_immediates
][1],
2504 immediates
[num_immediates
][2],
2505 immediates
[num_immediates
][3]);
2517 if (parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_VERTEX
) {
2519 soa_to_aos( func
, 9, 2, 10, 11 );
2522 /* Can't just use EBX, EDI without save/restoring them:
2530 get_immediate_base() );
2534 tgsi_parse_free( &parse
);
2539 #endif /* PIPE_ARCH_X86 */