1 /**************************************************************************
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **************************************************************************/
28 #include "pipe/p_config.h"
30 #if defined(PIPE_ARCH_X86) && defined(PIPE_ARCH_SSE)
32 #include "pipe/p_debug.h"
33 #include "pipe/p_shader_tokens.h"
34 #include "util/u_math.h"
35 #include "util/u_sse.h"
36 #include "tgsi/tgsi_parse.h"
37 #include "tgsi/tgsi_util.h"
38 #include "tgsi_exec.h"
39 #include "tgsi_sse2.h"
41 #include "rtasm/rtasm_x86sse.h"
45 * This costs about 100fps (close to 10%) in gears:
/* Non-zero selects the higher-precision math paths (e.g. Newton/Raphson
 * refinement of rcpps/rsqrtps results).  NOTE(review): the use site is not
 * visible in this chunk — confirm against the full file. */
47 #define HIGH_PRECISION 1
/* Iterate CHAN over every channel index in [0, NUM_CHANNELS). */
52 #define FOR_EACH_CHANNEL( CHAN )\
53 for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)
/* Non-zero when channel CHAN is set in the write mask of the
 * instruction's first destination register. */
55 #define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
56 ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))
/* Guard: execute the following statement only when CHAN is enabled in
 * dst register 0's write mask. */
58 #define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
59 if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))
/* Iterate CHAN over only those channels enabled in dst register 0's
 * write mask (composition of the two macros above). */
61 #define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
62 FOR_EACH_CHANNEL( CHAN )\
63 IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )
/* Short local aliases for well-known temp-register slots declared by the
 * TGSI exec machine (constants 1.0, scratch R0, and the address register). */
70 #define TEMP_ONE_I TGSI_EXEC_TEMP_ONE_I
71 #define TEMP_ONE_C TGSI_EXEC_TEMP_ONE_C
73 #define TEMP_R0 TGSI_EXEC_TEMP_R0
74 #define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
77 * X86 utility functions.
86 (enum x86_reg_name
) xmm
);
90 * X86 register mapping helpers.
94 get_const_base( void )
101 static struct x86_reg
102 get_input_base( void )
109 static struct x86_reg
110 get_output_base( void )
117 static struct x86_reg
118 get_temp_base( void )
125 static struct x86_reg
126 get_coef_base( void )
128 return get_output_base();
131 static struct x86_reg
132 get_immediate_base( void )
141 * Data access helpers.
145 static struct x86_reg
150 return x86_make_disp(
151 get_immediate_base(),
152 (vec
* 4 + chan
) * 4 );
155 static struct x86_reg
160 return x86_make_disp(
162 (vec
* 4 + chan
) * 4 );
165 static struct x86_reg
170 return x86_make_disp(
172 (vec
* 4 + chan
) * 16 );
175 static struct x86_reg
180 return x86_make_disp(
182 (vec
* 4 + chan
) * 16 );
185 static struct x86_reg
190 return x86_make_disp(
192 (vec
* 4 + chan
) * 16 );
195 static struct x86_reg
201 return x86_make_disp(
203 ((vec
* 3 + member
) * 4 + chan
) * 4 );
209 struct x86_function
*func
)
216 * Data fetch helpers.
220 * Copy a shader constant to xmm register
221 * \param xmm the destination xmm register
222 * \param vec the src const buffer index
223 * \param chan src channel to fetch (X, Y, Z or W)
227 struct x86_function
*func
,
236 struct x86_reg r0
= get_input_base();
237 struct x86_reg r1
= get_output_base();
240 assert( indirectFile
== TGSI_FILE_ADDRESS
);
241 assert( indirectIndex
== 0 );
243 x86_push( func
, r0
);
244 x86_push( func
, r1
);
246 for (i
= 0; i
< QUAD_SIZE
; i
++) {
247 x86_lea( func
, r0
, get_const( vec
, chan
) );
248 x86_mov( func
, r1
, x86_make_disp( get_temp( TEMP_ADDR
, CHAN_X
), i
* 4 ) );
250 /* Quick hack to multiply by 16 -- need to add SHL to rtasm.
252 x86_add( func
, r1
, r1
);
253 x86_add( func
, r1
, r1
);
254 x86_add( func
, r1
, r1
);
255 x86_add( func
, r1
, r1
);
257 x86_add( func
, r0
, r1
);
258 x86_mov( func
, r1
, x86_deref( r0
) );
259 x86_mov( func
, x86_make_disp( get_temp( TEMP_R0
, CHAN_X
), i
* 4 ), r1
);
268 get_temp( TEMP_R0
, CHAN_X
) );
276 get_const( vec
, chan
) );
281 SHUF( 0, 0, 0, 0 ) );
287 struct x86_function
*func
,
295 get_immediate( vec
, chan
) );
300 SHUF( 0, 0, 0, 0 ) );
305 * Copy a shader input to xmm register
306 * \param xmm the destination xmm register
307 * \param vec the src input attrib
308 * \param chan src channel to fetch (X, Y, Z or W)
312 struct x86_function
*func
,
320 get_input( vec
, chan
) );
324 * Store an xmm register to a shader output
325 * \param xmm the source xmm register
326 * \param vec the dest output attrib
327 * \param chan src dest channel to store (X, Y, Z or W)
331 struct x86_function
*func
,
338 get_output( vec
, chan
),
343 * Copy a shader temporary to xmm register
344 * \param xmm the destination xmm register
345 * \param vec the src temp register
346 * \param chan src channel to fetch (X, Y, Z or W)
350 struct x86_function
*func
,
358 get_temp( vec
, chan
) );
362 * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
363 * \param xmm the destination xmm register
364 * \param vec the src input/attribute coefficient index
365 * \param chan src channel to fetch (X, Y, Z or W)
366 * \param member 0=a0, 1=dadx, 2=dady
370 struct x86_function
*func
,
379 get_coef( vec
, chan
, member
) );
384 SHUF( 0, 0, 0, 0 ) );
388 * Data store helpers.
393 struct x86_function
*func
,
400 get_input( vec
, chan
),
406 struct x86_function
*func
,
413 get_temp( vec
, chan
),
419 struct x86_function
*func
,
429 vec
+ TGSI_EXEC_TEMP_ADDR
,
434 * Coefficent fetch helpers.
439 struct x86_function
*func
,
454 struct x86_function
*func
,
469 struct x86_function
*func
,
483 * Function call helpers.
487 * NOTE: In gcc, if the destination uses the SSE intrinsics, then it must be
488 * defined with __attribute__((force_align_arg_pointer)), as we do not guarantee
489 * that the stack pointer is 16 byte aligned, as expected.
493 struct x86_function
*func
,
496 void (PIPE_CDECL
*code
)() )
498 struct x86_reg ecx
= x86_make_reg( file_REG32
, reg_CX
);
502 /* Bitmask of the xmm registers to save */
503 xmm_mask
= (1 << xmm_save
) - 1;
504 xmm_mask
&= ~(1 << xmm_dst
);
508 get_temp( TEMP_R0
, 0 ),
509 make_xmm( xmm_dst
) );
513 x86_make_reg( file_REG32
, reg_AX
) );
516 x86_make_reg( file_REG32
, reg_CX
) );
519 x86_make_reg( file_REG32
, reg_DX
) );
521 for(i
= 0, n
= 0; i
< 8; ++i
)
522 if(xmm_mask
& (1 << i
))
527 x86_make_reg( file_REG32
, reg_SP
),
530 for(i
= 0, n
= 0; i
< 8; ++i
)
531 if(xmm_mask
& (1 << i
)) {
534 x86_make_disp( x86_make_reg( file_REG32
, reg_SP
), n
*16 ),
542 get_temp( TEMP_R0
, 0 ) );
544 x86_push( func
, ecx
);
545 x86_mov_reg_imm( func
, ecx
, (unsigned long) code
);
546 x86_call( func
, ecx
);
549 for(i
= 0, n
= 0; i
< 8; ++i
)
550 if(xmm_mask
& (1 << i
)) {
554 x86_make_disp( x86_make_reg( file_REG32
, reg_SP
), n
*16 ) );
560 x86_make_reg( file_REG32
, reg_SP
),
563 /* Restore GP registers in a reverse order.
567 x86_make_reg( file_REG32
, reg_DX
) );
570 x86_make_reg( file_REG32
, reg_CX
) );
573 x86_make_reg( file_REG32
, reg_AX
) );
578 get_temp( TEMP_R0
, 0 ) );
582 emit_func_call_dst_src(
583 struct x86_function
*func
,
587 void (PIPE_CDECL
*code
)() )
591 get_temp( TEMP_R0
, 1 ),
592 make_xmm( xmm_src
) );
602 * Fast SSE2 implementation of special math functions.
/* Horner-scheme polynomial evaluation over an SSE vector:
 * POLYn(x, c0..cn) computes c0 + x*(c1 + x*(... + x*cn)) with one
 * _mm_mul_ps/_mm_add_ps pair per degree.  Each POLYn is defined in
 * terms of POLY(n-1), so a degree-n fit uses n+1 coefficients. */
605 #define POLY0(x, c0) _mm_set1_ps(c0)
606 #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
607 #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
608 #define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
609 #define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
610 #define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
/* Compile-time selectors for which minimax polynomial fit the #if chains
 * in exp2f4()/log2f4() below use (higher degree = more accurate, slower). */
612 #define EXP_POLY_DEGREE 3
613 #define LOG_POLY_DEGREE 5
616 * See http://www.devmaster.net/forums/showthread.php?p=43580
622 __m128 fpart
, expipart
, expfpart
;
624 x
= _mm_min_ps(x
, _mm_set1_ps( 129.00000f
));
625 x
= _mm_max_ps(x
, _mm_set1_ps(-126.99999f
));
627 /* ipart = int(x - 0.5) */
628 ipart
= _mm_cvtps_epi32(_mm_sub_ps(x
, _mm_set1_ps(0.5f
)));
630 /* fpart = x - ipart */
631 fpart
= _mm_sub_ps(x
, _mm_cvtepi32_ps(ipart
));
633 /* expipart = (float) (1 << ipart) */
634 expipart
= _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart
, _mm_set1_epi32(127)), 23));
636 /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
637 #if EXP_POLY_DEGREE == 5
638 expfpart
= POLY5(fpart
, 9.9999994e-1f
, 6.9315308e-1f
, 2.4015361e-1f
, 5.5826318e-2f
, 8.9893397e-3f
, 1.8775767e-3f
);
639 #elif EXP_POLY_DEGREE == 4
640 expfpart
= POLY4(fpart
, 1.0000026f
, 6.9300383e-1f
, 2.4144275e-1f
, 5.2011464e-2f
, 1.3534167e-2f
);
641 #elif EXP_POLY_DEGREE == 3
642 expfpart
= POLY3(fpart
, 9.9992520e-1f
, 6.9583356e-1f
, 2.2606716e-1f
, 7.8024521e-2f
);
643 #elif EXP_POLY_DEGREE == 2
644 expfpart
= POLY2(fpart
, 1.0017247f
, 6.5763628e-1f
, 3.3718944e-1f
);
649 return _mm_mul_ps(expipart
, expfpart
);
653 * See http://www.devmaster.net/forums/showthread.php?p=43580
658 __m128i expmask
= _mm_set1_epi32(0x7f800000);
659 __m128i mantmask
= _mm_set1_epi32(0x007fffff);
660 __m128 one
= _mm_set1_ps(1.0f
);
662 __m128i i
= _mm_castps_si128(x
);
664 /* exp = (float) exponent(x) */
665 __m128 exp
= _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i
, expmask
), 23), _mm_set1_epi32(127)));
667 /* mant = (float) mantissa(x) */
668 __m128 mant
= _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i
, mantmask
)), one
);
672 /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
673 * These coefficients can be generate with
674 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
676 #if LOG_POLY_DEGREE == 6
677 logmant
= POLY5(mant
, 3.11578814719469302614f
, -3.32419399085241980044f
, 2.59883907202499966007f
, -1.23152682416275988241f
, 0.318212422185251071475f
, -0.0344359067839062357313f
);
678 #elif LOG_POLY_DEGREE == 5
679 logmant
= POLY4(mant
, 2.8882704548164776201f
, -2.52074962577807006663f
, 1.48116647521213171641f
, -0.465725644288844778798f
, 0.0596515482674574969533f
);
680 #elif LOG_POLY_DEGREE == 4
681 logmant
= POLY3(mant
, 2.61761038894603480148f
, -1.75647175389045657003f
, 0.688243882994381274313f
, -0.107254423828329604454f
);
682 #elif LOG_POLY_DEGREE == 3
683 logmant
= POLY2(mant
, 2.28330284476918490682f
, -1.04913055217340124191f
, 0.204446009836232697516f
);
688 /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
689 logmant
= _mm_mul_ps(logmant
, _mm_sub_ps(mant
, one
));
691 return _mm_add_ps(logmant
, exp
);
695 powf4(__m128 x
, __m128 y
)
697 return exp2f4(_mm_mul_ps(log2f4(x
), y
));
702 * Low-level instruction translators.
707 struct x86_function
*func
,
714 TGSI_EXEC_TEMP_7FFFFFFF_I
,
715 TGSI_EXEC_TEMP_7FFFFFFF_C
) );
720 struct x86_function
*func
,
727 make_xmm( xmm_src
) );
730 static void PIPE_CDECL
734 store
[0] = cosf( store
[0] );
735 store
[1] = cosf( store
[1] );
736 store
[2] = cosf( store
[2] );
737 store
[3] = cosf( store
[3] );
742 struct x86_function
*func
,
753 static void PIPE_CDECL
754 #if defined(PIPE_CC_GCC)
755 __attribute__((force_align_arg_pointer
))
760 _mm_store_ps(&store
[0], exp2f4( _mm_load_ps(&store
[0]) ));
765 struct x86_function
*func
,
778 struct x86_function
*func
,
787 static void PIPE_CDECL
791 store
[0] = floorf( store
[0] );
792 store
[1] = floorf( store
[1] );
793 store
[2] = floorf( store
[2] );
794 store
[3] = floorf( store
[3] );
799 struct x86_function
*func
,
810 static void PIPE_CDECL
814 store
[0] -= floorf( store
[0] );
815 store
[1] -= floorf( store
[1] );
816 store
[2] -= floorf( store
[2] );
817 store
[3] -= floorf( store
[3] );
822 struct x86_function
*func
,
833 static void PIPE_CDECL
834 #if defined(PIPE_CC_GCC)
835 __attribute__((force_align_arg_pointer
))
840 _mm_store_ps(&store
[0], log2f4( _mm_load_ps(&store
[0]) ));
845 struct x86_function
*func
,
858 struct x86_function
*func
,
865 make_xmm( xmm_src
) );
869 emit_mul (struct x86_function
*func
,
876 make_xmm( xmm_src
) );
881 struct x86_function
*func
,
888 TGSI_EXEC_TEMP_80000000_I
,
889 TGSI_EXEC_TEMP_80000000_C
) );
892 static void PIPE_CDECL
893 #if defined(PIPE_CC_GCC)
894 __attribute__((force_align_arg_pointer
))
900 _mm_store_ps(&store
[0], powf4( _mm_load_ps(&store
[0]), _mm_load_ps(&store
[4]) ));
902 store
[0] = powf( store
[0], store
[4] );
903 store
[1] = powf( store
[1], store
[5] );
904 store
[2] = powf( store
[2], store
[6] );
905 store
[3] = powf( store
[3], store
[7] );
911 struct x86_function
*func
,
916 emit_func_call_dst_src(
926 struct x86_function
*func
,
930 /* On Intel CPUs at least, this is only accurate to 12 bits -- not
931 * good enough. Need to either emit a proper divide or use the
932 * iterative technique described below in emit_rsqrt().
937 make_xmm( xmm_src
) );
942 struct x86_function
*func
,
947 /* Although rsqrtps() and rcpps() are low precision on some/all SSE
948 * implementations, it is possible to improve its precision at
949 * fairly low cost, using a newton/raphson step, as below:
951 * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
952 * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
954 * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
957 struct x86_reg dst
= make_xmm( xmm_dst
);
958 struct x86_reg src
= make_xmm( xmm_src
);
959 struct x86_reg tmp0
= make_xmm( 2 );
960 struct x86_reg tmp1
= make_xmm( 3 );
962 assert( xmm_dst
!= xmm_src
);
963 assert( xmm_dst
!= 2 && xmm_dst
!= 3 );
964 assert( xmm_src
!= 2 && xmm_src
!= 3 );
966 sse_movaps( func
, dst
, get_temp( TGSI_EXEC_TEMP_HALF_I
, TGSI_EXEC_TEMP_HALF_C
) );
967 sse_movaps( func
, tmp0
, get_temp( TGSI_EXEC_TEMP_THREE_I
, TGSI_EXEC_TEMP_THREE_C
) );
968 sse_rsqrtps( func
, tmp1
, src
);
969 sse_mulps( func
, src
, tmp1
);
970 sse_mulps( func
, dst
, tmp1
);
971 sse_mulps( func
, src
, tmp1
);
972 sse_subps( func
, tmp0
, src
);
973 sse_mulps( func
, dst
, tmp0
);
976 /* On Intel CPUs at least, this is only accurate to 12 bits -- not
982 make_xmm( xmm_src
) );
988 struct x86_function
*func
,
995 TGSI_EXEC_TEMP_80000000_I
,
996 TGSI_EXEC_TEMP_80000000_C
) );
999 static void PIPE_CDECL
1003 store
[0] = sinf( store
[0] );
1004 store
[1] = sinf( store
[1] );
1005 store
[2] = sinf( store
[2] );
1006 store
[3] = sinf( store
[3] );
1010 emit_sin (struct x86_function
*func
,
1023 struct x86_function
*func
,
1029 make_xmm( xmm_dst
),
1030 make_xmm( xmm_src
) );
1039 struct x86_function
*func
,
1041 const struct tgsi_full_src_register
*reg
,
1042 const unsigned chan_index
)
1044 unsigned swizzle
= tgsi_util_get_full_src_register_extswizzle( reg
, chan_index
);
1047 case TGSI_EXTSWIZZLE_X
:
1048 case TGSI_EXTSWIZZLE_Y
:
1049 case TGSI_EXTSWIZZLE_Z
:
1050 case TGSI_EXTSWIZZLE_W
:
1051 switch (reg
->SrcRegister
.File
) {
1052 case TGSI_FILE_CONSTANT
:
1056 reg
->SrcRegister
.Index
,
1058 reg
->SrcRegister
.Indirect
,
1059 reg
->SrcRegisterInd
.File
,
1060 reg
->SrcRegisterInd
.Index
);
1063 case TGSI_FILE_IMMEDIATE
:
1067 reg
->SrcRegister
.Index
,
1071 case TGSI_FILE_INPUT
:
1075 reg
->SrcRegister
.Index
,
1079 case TGSI_FILE_TEMPORARY
:
1083 reg
->SrcRegister
.Index
,
1092 case TGSI_EXTSWIZZLE_ZERO
:
1096 TGSI_EXEC_TEMP_00000000_I
,
1097 TGSI_EXEC_TEMP_00000000_C
);
1100 case TGSI_EXTSWIZZLE_ONE
:
1112 switch( tgsi_util_get_full_src_register_sign_mode( reg
, chan_index
) ) {
1113 case TGSI_UTIL_SIGN_CLEAR
:
1114 emit_abs( func
, xmm
);
1117 case TGSI_UTIL_SIGN_SET
:
1118 emit_setsign( func
, xmm
);
1121 case TGSI_UTIL_SIGN_TOGGLE
:
1122 emit_neg( func
, xmm
);
1125 case TGSI_UTIL_SIGN_KEEP
:
/* Load src register INDEX of INST, channel CHAN, into xmm register XMM
 * (thin wrapper around emit_fetch taking the instruction by value). */
1130 #define FETCH( FUNC, INST, XMM, INDEX, CHAN )\
1131 emit_fetch( FUNC, XMM, &(INST).FullSrcRegisters[INDEX], CHAN )
1139 struct x86_function
*func
,
1141 const struct tgsi_full_dst_register
*reg
,
1142 const struct tgsi_full_instruction
*inst
,
1143 unsigned chan_index
)
1145 switch( reg
->DstRegister
.File
) {
1146 case TGSI_FILE_OUTPUT
:
1150 reg
->DstRegister
.Index
,
1154 case TGSI_FILE_TEMPORARY
:
1158 reg
->DstRegister
.Index
,
1162 case TGSI_FILE_ADDRESS
:
1166 reg
->DstRegister
.Index
,
1174 switch( inst
->Instruction
.Saturate
) {
1178 case TGSI_SAT_ZERO_ONE
:
1182 case TGSI_SAT_MINUS_PLUS_ONE
:
/* Store xmm register XMM to dst register INDEX of INST, channel CHAN.
 * Passes INST through so emit_store can apply the saturate mode. */
1188 #define STORE( FUNC, INST, XMM, INDEX, CHAN )\
1189 emit_store( FUNC, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN )
1192 * High-level instruction translators.
1197 struct x86_function
*func
,
1198 const struct tgsi_full_src_register
*reg
)
1200 unsigned uniquemask
;
1201 unsigned registers
[4];
1202 unsigned nextregister
= 0;
1203 unsigned firstchan
= ~0;
1204 unsigned chan_index
;
1206 /* This mask stores component bits that were already tested. Note that
1207 * we test if the value is less than zero, so 1.0 and 0.0 need not to be
1209 uniquemask
= (1 << TGSI_EXTSWIZZLE_ZERO
) | (1 << TGSI_EXTSWIZZLE_ONE
);
1211 FOR_EACH_CHANNEL( chan_index
) {
1214 /* unswizzle channel */
1215 swizzle
= tgsi_util_get_full_src_register_extswizzle(
1219 /* check if the component has not been already tested */
1220 if( !(uniquemask
& (1 << swizzle
)) ) {
1221 uniquemask
|= 1 << swizzle
;
1223 /* allocate register */
1224 registers
[chan_index
] = nextregister
;
1232 /* mark the first channel used */
1233 if( firstchan
== ~0 ) {
1234 firstchan
= chan_index
;
1241 x86_make_reg( file_REG32
, reg_AX
) );
1244 x86_make_reg( file_REG32
, reg_DX
) );
1246 FOR_EACH_CHANNEL( chan_index
) {
1247 if( uniquemask
& (1 << chan_index
) ) {
1250 make_xmm( registers
[chan_index
] ),
1252 TGSI_EXEC_TEMP_00000000_I
,
1253 TGSI_EXEC_TEMP_00000000_C
),
1256 if( chan_index
== firstchan
) {
1259 x86_make_reg( file_REG32
, reg_AX
),
1260 make_xmm( registers
[chan_index
] ) );
1265 x86_make_reg( file_REG32
, reg_DX
),
1266 make_xmm( registers
[chan_index
] ) );
1269 x86_make_reg( file_REG32
, reg_AX
),
1270 x86_make_reg( file_REG32
, reg_DX
) );
1278 TGSI_EXEC_TEMP_KILMASK_I
,
1279 TGSI_EXEC_TEMP_KILMASK_C
),
1280 x86_make_reg( file_REG32
, reg_AX
) );
1284 x86_make_reg( file_REG32
, reg_DX
) );
1287 x86_make_reg( file_REG32
, reg_AX
) );
1293 struct x86_function
*func
)
1295 /* XXX todo / fix me */
1301 struct x86_function
*func
,
1302 struct tgsi_full_instruction
*inst
,
1305 unsigned chan_index
;
1307 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1308 FETCH( func
, *inst
, 0, 0, chan_index
);
1309 FETCH( func
, *inst
, 1, 1, chan_index
);
1321 STORE( func
, *inst
, 0, 0, chan_index
);
1327 struct x86_function
*func
,
1328 struct tgsi_full_instruction
*inst
)
1330 unsigned chan_index
;
1332 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1333 FETCH( func
, *inst
, 0, 0, chan_index
);
1334 FETCH( func
, *inst
, 1, 1, chan_index
);
1335 FETCH( func
, *inst
, 2, 2, chan_index
);
1340 TGSI_EXEC_TEMP_00000000_I
,
1341 TGSI_EXEC_TEMP_00000000_C
),
1355 STORE( func
, *inst
, 0, 0, chan_index
);
1361 struct x86_function
*func
,
1362 struct tgsi_full_instruction
*inst
)
1364 unsigned chan_index
;
1366 switch (inst
->Instruction
.Opcode
) {
1367 case TGSI_OPCODE_ARL
:
1368 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1369 FETCH( func
, *inst
, 0, 0, chan_index
);
1370 emit_f2it( func
, 0 );
1371 STORE( func
, *inst
, 0, 0, chan_index
);
1375 case TGSI_OPCODE_MOV
:
1376 case TGSI_OPCODE_SWZ
:
1377 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1378 FETCH( func
, *inst
, 0, 0, chan_index
);
1379 STORE( func
, *inst
, 0, 0, chan_index
);
1383 case TGSI_OPCODE_LIT
:
1384 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1385 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) ) {
1391 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ) {
1392 STORE( func
, *inst
, 0, 0, CHAN_X
);
1394 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) ) {
1395 STORE( func
, *inst
, 0, 0, CHAN_W
);
1398 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ||
1399 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
1400 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ) {
1401 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1406 TGSI_EXEC_TEMP_00000000_I
,
1407 TGSI_EXEC_TEMP_00000000_C
) );
1408 STORE( func
, *inst
, 0, 0, CHAN_Y
);
1410 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
1411 /* XMM[1] = SrcReg[0].yyyy */
1412 FETCH( func
, *inst
, 1, 0, CHAN_Y
);
1413 /* XMM[1] = max(XMM[1], 0) */
1418 TGSI_EXEC_TEMP_00000000_I
,
1419 TGSI_EXEC_TEMP_00000000_C
) );
1420 /* XMM[2] = SrcReg[0].wwww */
1421 FETCH( func
, *inst
, 2, 0, CHAN_W
);
1422 /* XMM[2] = min(XMM[2], 128.0) */
1427 TGSI_EXEC_TEMP_128_I
,
1428 TGSI_EXEC_TEMP_128_C
) );
1429 /* XMM[2] = max(XMM[2], -128.0) */
1434 TGSI_EXEC_TEMP_MINUS_128_I
,
1435 TGSI_EXEC_TEMP_MINUS_128_C
) );
1436 emit_pow( func
, 3, 1, 2 );
1437 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1451 STORE( func
, *inst
, 2, 0, CHAN_Z
);
1456 case TGSI_OPCODE_RCP
:
1457 /* TGSI_OPCODE_RECIP */
1458 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1459 emit_rcp( func
, 0, 0 );
1460 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1461 STORE( func
, *inst
, 0, 0, chan_index
);
1465 case TGSI_OPCODE_RSQ
:
1466 /* TGSI_OPCODE_RECIPSQRT */
1467 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1468 emit_rsqrt( func
, 1, 0 );
1469 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1470 STORE( func
, *inst
, 1, 0, chan_index
);
1474 case TGSI_OPCODE_EXP
:
1475 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1476 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ||
1477 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
1478 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1479 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1480 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
1481 emit_MOV( func
, 1, 0 );
1482 emit_flr( func
, 2, 1 );
1483 /* dst.x = ex2(floor(src.x)) */
1484 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
)) {
1485 emit_MOV( func
, 2, 1 );
1486 emit_ex2( func
, 3, 2 );
1487 STORE( func
, *inst
, 2, 0, CHAN_X
);
1489 /* dst.y = src.x - floor(src.x) */
1490 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
1491 emit_MOV( func
, 2, 0 );
1492 emit_sub( func
, 2, 1 );
1493 STORE( func
, *inst
, 2, 0, CHAN_Y
);
1496 /* dst.z = ex2(src.x) */
1497 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
1498 emit_ex2( func
, 3, 0 );
1499 STORE( func
, *inst
, 0, 0, CHAN_Z
);
1503 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
)) {
1504 emit_tempf( func
, 0, TEMP_ONE_I
, TEMP_ONE_C
);
1505 STORE( func
, *inst
, 0, 0, CHAN_W
);
1509 case TGSI_OPCODE_LOG
:
1510 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1511 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ||
1512 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
1513 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1514 emit_abs( func
, 0 );
1515 emit_MOV( func
, 1, 0 );
1516 emit_lg2( func
, 2, 1 );
1517 /* dst.z = lg2(abs(src.x)) */
1518 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
1519 STORE( func
, *inst
, 1, 0, CHAN_Z
);
1521 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1522 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
1523 emit_flr( func
, 2, 1 );
1524 /* dst.x = floor(lg2(abs(src.x))) */
1525 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
)) {
1526 STORE( func
, *inst
, 1, 0, CHAN_X
);
1528 /* dst.x = abs(src)/ex2(floor(lg2(abs(src.x)))) */
1529 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
1530 emit_ex2( func
, 2, 1 );
1531 emit_rcp( func
, 1, 1 );
1532 emit_mul( func
, 0, 1 );
1533 STORE( func
, *inst
, 0, 0, CHAN_Y
);
1538 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
)) {
1539 emit_tempf( func
, 0, TEMP_ONE_I
, TEMP_ONE_C
);
1540 STORE( func
, *inst
, 0, 0, CHAN_W
);
1544 case TGSI_OPCODE_MUL
:
1545 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1546 FETCH( func
, *inst
, 0, 0, chan_index
);
1547 FETCH( func
, *inst
, 1, 1, chan_index
);
1548 emit_mul( func
, 0, 1 );
1549 STORE( func
, *inst
, 0, 0, chan_index
);
1553 case TGSI_OPCODE_ADD
:
1554 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1555 FETCH( func
, *inst
, 0, 0, chan_index
);
1556 FETCH( func
, *inst
, 1, 1, chan_index
);
1557 emit_add( func
, 0, 1 );
1558 STORE( func
, *inst
, 0, 0, chan_index
);
1562 case TGSI_OPCODE_DP3
:
1563 /* TGSI_OPCODE_DOT3 */
1564 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1565 FETCH( func
, *inst
, 1, 1, CHAN_X
);
1566 emit_mul( func
, 0, 1 );
1567 FETCH( func
, *inst
, 1, 0, CHAN_Y
);
1568 FETCH( func
, *inst
, 2, 1, CHAN_Y
);
1569 emit_mul( func
, 1, 2 );
1570 emit_add( func
, 0, 1 );
1571 FETCH( func
, *inst
, 1, 0, CHAN_Z
);
1572 FETCH( func
, *inst
, 2, 1, CHAN_Z
);
1573 emit_mul( func
, 1, 2 );
1574 emit_add( func
, 0, 1 );
1575 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1576 STORE( func
, *inst
, 0, 0, chan_index
);
1580 case TGSI_OPCODE_DP4
:
1581 /* TGSI_OPCODE_DOT4 */
1582 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1583 FETCH( func
, *inst
, 1, 1, CHAN_X
);
1584 emit_mul( func
, 0, 1 );
1585 FETCH( func
, *inst
, 1, 0, CHAN_Y
);
1586 FETCH( func
, *inst
, 2, 1, CHAN_Y
);
1587 emit_mul( func
, 1, 2 );
1588 emit_add( func
, 0, 1 );
1589 FETCH( func
, *inst
, 1, 0, CHAN_Z
);
1590 FETCH( func
, *inst
, 2, 1, CHAN_Z
);
1591 emit_mul(func
, 1, 2 );
1592 emit_add(func
, 0, 1 );
1593 FETCH( func
, *inst
, 1, 0, CHAN_W
);
1594 FETCH( func
, *inst
, 2, 1, CHAN_W
);
1595 emit_mul( func
, 1, 2 );
1596 emit_add( func
, 0, 1 );
1597 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1598 STORE( func
, *inst
, 0, 0, chan_index
);
1602 case TGSI_OPCODE_DST
:
1603 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) {
1609 STORE( func
, *inst
, 0, 0, CHAN_X
);
1611 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) {
1612 FETCH( func
, *inst
, 0, 0, CHAN_Y
);
1613 FETCH( func
, *inst
, 1, 1, CHAN_Y
);
1614 emit_mul( func
, 0, 1 );
1615 STORE( func
, *inst
, 0, 0, CHAN_Y
);
1617 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) {
1618 FETCH( func
, *inst
, 0, 0, CHAN_Z
);
1619 STORE( func
, *inst
, 0, 0, CHAN_Z
);
1621 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) {
1622 FETCH( func
, *inst
, 0, 1, CHAN_W
);
1623 STORE( func
, *inst
, 0, 0, CHAN_W
);
1627 case TGSI_OPCODE_MIN
:
1628 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1629 FETCH( func
, *inst
, 0, 0, chan_index
);
1630 FETCH( func
, *inst
, 1, 1, chan_index
);
1635 STORE( func
, *inst
, 0, 0, chan_index
);
1639 case TGSI_OPCODE_MAX
:
1640 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1641 FETCH( func
, *inst
, 0, 0, chan_index
);
1642 FETCH( func
, *inst
, 1, 1, chan_index
);
1647 STORE( func
, *inst
, 0, 0, chan_index
);
1651 case TGSI_OPCODE_SLT
:
1652 /* TGSI_OPCODE_SETLT */
1653 emit_setcc( func
, inst
, cc_LessThan
);
1656 case TGSI_OPCODE_SGE
:
1657 /* TGSI_OPCODE_SETGE */
1658 emit_setcc( func
, inst
, cc_NotLessThan
);
1661 case TGSI_OPCODE_MAD
:
1662 /* TGSI_OPCODE_MADD */
1663 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1664 FETCH( func
, *inst
, 0, 0, chan_index
);
1665 FETCH( func
, *inst
, 1, 1, chan_index
);
1666 FETCH( func
, *inst
, 2, 2, chan_index
);
1667 emit_mul( func
, 0, 1 );
1668 emit_add( func
, 0, 2 );
1669 STORE( func
, *inst
, 0, 0, chan_index
);
1673 case TGSI_OPCODE_SUB
:
1674 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1675 FETCH( func
, *inst
, 0, 0, chan_index
);
1676 FETCH( func
, *inst
, 1, 1, chan_index
);
1677 emit_sub( func
, 0, 1 );
1678 STORE( func
, *inst
, 0, 0, chan_index
);
1682 case TGSI_OPCODE_LERP
:
1683 /* TGSI_OPCODE_LRP */
1684 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1685 FETCH( func
, *inst
, 0, 0, chan_index
);
1686 FETCH( func
, *inst
, 1, 1, chan_index
);
1687 FETCH( func
, *inst
, 2, 2, chan_index
);
1688 emit_sub( func
, 1, 2 );
1689 emit_mul( func
, 0, 1 );
1690 emit_add( func
, 0, 2 );
1691 STORE( func
, *inst
, 0, 0, chan_index
);
1695 case TGSI_OPCODE_CND
:
1699 case TGSI_OPCODE_CND0
:
1703 case TGSI_OPCODE_DOT2ADD
:
1704 /* TGSI_OPCODE_DP2A */
1708 case TGSI_OPCODE_INDEX
:
1712 case TGSI_OPCODE_NEGATE
:
1716 case TGSI_OPCODE_FRAC
:
1717 /* TGSI_OPCODE_FRC */
1718 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1719 FETCH( func
, *inst
, 0, 0, chan_index
);
1720 emit_frc( func
, 0, 0 );
1721 STORE( func
, *inst
, 0, 0, chan_index
);
1725 case TGSI_OPCODE_CLAMP
:
1729 case TGSI_OPCODE_FLOOR
:
1730 /* TGSI_OPCODE_FLR */
1731 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1732 FETCH( func
, *inst
, 0, 0, chan_index
);
1733 emit_flr( func
, 0, 0 );
1734 STORE( func
, *inst
, 0, 0, chan_index
);
1738 case TGSI_OPCODE_ROUND
:
1742 case TGSI_OPCODE_EXPBASE2
:
1743 /* TGSI_OPCODE_EX2 */
1744 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1745 emit_ex2( func
, 0, 0 );
1746 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1747 STORE( func
, *inst
, 0, 0, chan_index
);
1751 case TGSI_OPCODE_LOGBASE2
:
1752 /* TGSI_OPCODE_LG2 */
1753 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1754 emit_lg2( func
, 0, 0 );
1755 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1756 STORE( func
, *inst
, 0, 0, chan_index
);
1760 case TGSI_OPCODE_POWER
:
1761 /* TGSI_OPCODE_POW */
1762 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1763 FETCH( func
, *inst
, 1, 1, CHAN_X
);
1764 emit_pow( func
, 0, 0, 1 );
1765 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1766 STORE( func
, *inst
, 0, 0, chan_index
);
1770 case TGSI_OPCODE_CROSSPRODUCT
:
1771 /* TGSI_OPCODE_XPD */
1772 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1773 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ) {
1774 FETCH( func
, *inst
, 1, 1, CHAN_Z
);
1775 FETCH( func
, *inst
, 3, 0, CHAN_Z
);
1777 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1778 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
1779 FETCH( func
, *inst
, 0, 0, CHAN_Y
);
1780 FETCH( func
, *inst
, 4, 1, CHAN_Y
);
1782 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) {
1783 emit_MOV( func
, 2, 0 );
1784 emit_mul( func
, 2, 1 );
1785 emit_MOV( func
, 5, 3 );
1786 emit_mul( func
, 5, 4 );
1787 emit_sub( func
, 2, 5 );
1788 STORE( func
, *inst
, 2, 0, CHAN_X
);
1790 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ||
1791 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
1792 FETCH( func
, *inst
, 2, 1, CHAN_X
);
1793 FETCH( func
, *inst
, 5, 0, CHAN_X
);
1795 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) {
1796 emit_mul( func
, 3, 2 );
1797 emit_mul( func
, 1, 5 );
1798 emit_sub( func
, 3, 1 );
1799 STORE( func
, *inst
, 3, 0, CHAN_Y
);
1801 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) {
1802 emit_mul( func
, 5, 4 );
1803 emit_mul( func
, 0, 2 );
1804 emit_sub( func
, 5, 0 );
1805 STORE( func
, *inst
, 5, 0, CHAN_Z
);
1807 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) {
1813 STORE( func
, *inst
, 0, 0, CHAN_W
);
1817 case TGSI_OPCODE_MULTIPLYMATRIX
:
1821 case TGSI_OPCODE_ABS
:
1822 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1823 FETCH( func
, *inst
, 0, 0, chan_index
);
1824 emit_abs( func
, 0) ;
1826 STORE( func
, *inst
, 0, 0, chan_index
);
1830 case TGSI_OPCODE_RCC
:
1834 case TGSI_OPCODE_DPH
:
1835 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1836 FETCH( func
, *inst
, 1, 1, CHAN_X
);
1837 emit_mul( func
, 0, 1 );
1838 FETCH( func
, *inst
, 1, 0, CHAN_Y
);
1839 FETCH( func
, *inst
, 2, 1, CHAN_Y
);
1840 emit_mul( func
, 1, 2 );
1841 emit_add( func
, 0, 1 );
1842 FETCH( func
, *inst
, 1, 0, CHAN_Z
);
1843 FETCH( func
, *inst
, 2, 1, CHAN_Z
);
1844 emit_mul( func
, 1, 2 );
1845 emit_add( func
, 0, 1 );
1846 FETCH( func
, *inst
, 1, 1, CHAN_W
);
1847 emit_add( func
, 0, 1 );
1848 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1849 STORE( func
, *inst
, 0, 0, chan_index
);
1853 case TGSI_OPCODE_COS
:
1854 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1855 emit_cos( func
, 0, 0 );
1856 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1857 STORE( func
, *inst
, 0, 0, chan_index
);
1861 case TGSI_OPCODE_DDX
:
1865 case TGSI_OPCODE_DDY
:
1869 case TGSI_OPCODE_KILP
:
1870 /* predicated kill */
1872 return 0; /* XXX fix me */
1875 case TGSI_OPCODE_KIL
:
1876 /* conditional kill */
1877 emit_kil( func
, &inst
->FullSrcRegisters
[0] );
1880 case TGSI_OPCODE_PK2H
:
1884 case TGSI_OPCODE_PK2US
:
1888 case TGSI_OPCODE_PK4B
:
1892 case TGSI_OPCODE_PK4UB
:
1896 case TGSI_OPCODE_RFL
:
1900 case TGSI_OPCODE_SEQ
:
1904 case TGSI_OPCODE_SFL
:
1908 case TGSI_OPCODE_SGT
:
1912 case TGSI_OPCODE_SIN
:
1913 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1914 emit_sin( func
, 0, 0 );
1915 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1916 STORE( func
, *inst
, 0, 0, chan_index
);
1920 case TGSI_OPCODE_SLE
:
1924 case TGSI_OPCODE_SNE
:
1928 case TGSI_OPCODE_STR
:
1932 case TGSI_OPCODE_TEX
:
1934 /* Disable dummy texture code:
1941 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1942 STORE( func
, *inst
, 0, 0, chan_index
);
1950 case TGSI_OPCODE_TXD
:
1954 case TGSI_OPCODE_UP2H
:
1958 case TGSI_OPCODE_UP2US
:
1962 case TGSI_OPCODE_UP4B
:
1966 case TGSI_OPCODE_UP4UB
:
1970 case TGSI_OPCODE_X2D
:
1974 case TGSI_OPCODE_ARA
:
1978 case TGSI_OPCODE_ARR
:
1982 case TGSI_OPCODE_BRA
:
1986 case TGSI_OPCODE_CAL
:
1990 case TGSI_OPCODE_RET
:
1994 case TGSI_OPCODE_END
:
1997 case TGSI_OPCODE_SSG
:
2001 case TGSI_OPCODE_CMP
:
2002 emit_cmp (func
, inst
);
2005 case TGSI_OPCODE_SCS
:
2006 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) {
2007 FETCH( func
, *inst
, 0, 0, CHAN_X
);
2008 emit_cos( func
, 0, 0 );
2009 STORE( func
, *inst
, 0, 0, CHAN_X
);
2011 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) {
2012 FETCH( func
, *inst
, 0, 0, CHAN_X
);
2013 emit_sin( func
, 0, 0 );
2014 STORE( func
, *inst
, 0, 0, CHAN_Y
);
2016 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) {
2020 TGSI_EXEC_TEMP_00000000_I
,
2021 TGSI_EXEC_TEMP_00000000_C
);
2022 STORE( func
, *inst
, 0, 0, CHAN_Z
);
2024 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) {
2030 STORE( func
, *inst
, 0, 0, CHAN_W
);
2034 case TGSI_OPCODE_TXB
:
2038 case TGSI_OPCODE_NRM
:
2042 case TGSI_OPCODE_DIV
:
2046 case TGSI_OPCODE_DP2
:
2050 case TGSI_OPCODE_TXL
:
2054 case TGSI_OPCODE_BRK
:
2058 case TGSI_OPCODE_IF
:
2062 case TGSI_OPCODE_LOOP
:
2066 case TGSI_OPCODE_REP
:
2070 case TGSI_OPCODE_ELSE
:
2074 case TGSI_OPCODE_ENDIF
:
2078 case TGSI_OPCODE_ENDLOOP
:
2082 case TGSI_OPCODE_ENDREP
:
2086 case TGSI_OPCODE_PUSHA
:
2090 case TGSI_OPCODE_POPA
:
2094 case TGSI_OPCODE_CEIL
:
2098 case TGSI_OPCODE_I2F
:
2102 case TGSI_OPCODE_NOT
:
2106 case TGSI_OPCODE_TRUNC
:
2110 case TGSI_OPCODE_SHL
:
2114 case TGSI_OPCODE_SHR
:
2118 case TGSI_OPCODE_AND
:
2122 case TGSI_OPCODE_OR
:
2126 case TGSI_OPCODE_MOD
:
2130 case TGSI_OPCODE_XOR
:
2134 case TGSI_OPCODE_SAD
:
2138 case TGSI_OPCODE_TXF
:
2142 case TGSI_OPCODE_TXQ
:
2146 case TGSI_OPCODE_CONT
:
2150 case TGSI_OPCODE_EMIT
:
2154 case TGSI_OPCODE_ENDPRIM
:
2167 struct x86_function
*func
,
2168 struct tgsi_full_declaration
*decl
)
2170 if( decl
->Declaration
.File
== TGSI_FILE_INPUT
) {
2171 unsigned first
, last
, mask
;
2174 first
= decl
->DeclarationRange
.First
;
2175 last
= decl
->DeclarationRange
.Last
;
2176 mask
= decl
->Declaration
.UsageMask
;
2178 for( i
= first
; i
<= last
; i
++ ) {
2179 for( j
= 0; j
< NUM_CHANNELS
; j
++ ) {
2180 if( mask
& (1 << j
) ) {
2181 switch( decl
->Declaration
.Interpolate
) {
2182 case TGSI_INTERPOLATE_CONSTANT
:
2183 emit_coef_a0( func
, 0, i
, j
);
2184 emit_inputs( func
, 0, i
, j
);
2187 case TGSI_INTERPOLATE_LINEAR
:
2188 emit_tempf( func
, 0, 0, TGSI_SWIZZLE_X
);
2189 emit_coef_dadx( func
, 1, i
, j
);
2190 emit_tempf( func
, 2, 0, TGSI_SWIZZLE_Y
);
2191 emit_coef_dady( func
, 3, i
, j
);
2192 emit_mul( func
, 0, 1 ); /* x * dadx */
2193 emit_coef_a0( func
, 4, i
, j
);
2194 emit_mul( func
, 2, 3 ); /* y * dady */
2195 emit_add( func
, 0, 4 ); /* x * dadx + a0 */
2196 emit_add( func
, 0, 2 ); /* x * dadx + y * dady + a0 */
2197 emit_inputs( func
, 0, i
, j
);
2200 case TGSI_INTERPOLATE_PERSPECTIVE
:
2201 emit_tempf( func
, 0, 0, TGSI_SWIZZLE_X
);
2202 emit_coef_dadx( func
, 1, i
, j
);
2203 emit_tempf( func
, 2, 0, TGSI_SWIZZLE_Y
);
2204 emit_coef_dady( func
, 3, i
, j
);
2205 emit_mul( func
, 0, 1 ); /* x * dadx */
2206 emit_tempf( func
, 4, 0, TGSI_SWIZZLE_W
);
2207 emit_coef_a0( func
, 5, i
, j
);
2208 emit_rcp( func
, 4, 4 ); /* 1.0 / w */
2209 emit_mul( func
, 2, 3 ); /* y * dady */
2210 emit_add( func
, 0, 5 ); /* x * dadx + a0 */
2211 emit_add( func
, 0, 2 ); /* x * dadx + y * dady + a0 */
2212 emit_mul( func
, 0, 4 ); /* (x * dadx + y * dady + a0) / w */
2213 emit_inputs( func
, 0, i
, j
);
2226 static void aos_to_soa( struct x86_function
*func
,
2232 struct x86_reg soa_input
= x86_make_reg( file_REG32
, reg_AX
);
2233 struct x86_reg aos_input
= x86_make_reg( file_REG32
, reg_BX
);
2234 struct x86_reg num_inputs
= x86_make_reg( file_REG32
, reg_CX
);
2235 struct x86_reg stride
= x86_make_reg( file_REG32
, reg_DX
);
2240 x86_push( func
, x86_make_reg( file_REG32
, reg_BX
) );
2242 x86_mov( func
, aos_input
, x86_fn_arg( func
, arg_aos
) );
2243 x86_mov( func
, soa_input
, x86_fn_arg( func
, arg_soa
) );
2244 x86_mov( func
, num_inputs
, x86_fn_arg( func
, arg_num
) );
2245 x86_mov( func
, stride
, x86_fn_arg( func
, arg_stride
) );
2248 inner_loop
= x86_get_label( func
);
2250 x86_push( func
, aos_input
);
2251 sse_movlps( func
, make_xmm( 0 ), x86_make_disp( aos_input
, 0 ) );
2252 sse_movlps( func
, make_xmm( 3 ), x86_make_disp( aos_input
, 8 ) );
2253 x86_add( func
, aos_input
, stride
);
2254 sse_movhps( func
, make_xmm( 0 ), x86_make_disp( aos_input
, 0 ) );
2255 sse_movhps( func
, make_xmm( 3 ), x86_make_disp( aos_input
, 8 ) );
2256 x86_add( func
, aos_input
, stride
);
2257 sse_movlps( func
, make_xmm( 1 ), x86_make_disp( aos_input
, 0 ) );
2258 sse_movlps( func
, make_xmm( 4 ), x86_make_disp( aos_input
, 8 ) );
2259 x86_add( func
, aos_input
, stride
);
2260 sse_movhps( func
, make_xmm( 1 ), x86_make_disp( aos_input
, 0 ) );
2261 sse_movhps( func
, make_xmm( 4 ), x86_make_disp( aos_input
, 8 ) );
2262 x86_pop( func
, aos_input
);
2264 sse_movaps( func
, make_xmm( 2 ), make_xmm( 0 ) );
2265 sse_movaps( func
, make_xmm( 5 ), make_xmm( 3 ) );
2266 sse_shufps( func
, make_xmm( 0 ), make_xmm( 1 ), 0x88 );
2267 sse_shufps( func
, make_xmm( 2 ), make_xmm( 1 ), 0xdd );
2268 sse_shufps( func
, make_xmm( 3 ), make_xmm( 4 ), 0x88 );
2269 sse_shufps( func
, make_xmm( 5 ), make_xmm( 4 ), 0xdd );
2271 sse_movups( func
, x86_make_disp( soa_input
, 0 ), make_xmm( 0 ) );
2272 sse_movups( func
, x86_make_disp( soa_input
, 16 ), make_xmm( 2 ) );
2273 sse_movups( func
, x86_make_disp( soa_input
, 32 ), make_xmm( 3 ) );
2274 sse_movups( func
, x86_make_disp( soa_input
, 48 ), make_xmm( 5 ) );
2276 /* Advance to next input */
2277 x86_lea( func
, aos_input
, x86_make_disp(aos_input
, 16) );
2278 x86_lea( func
, soa_input
, x86_make_disp(soa_input
, 64) );
2280 /* while --num_inputs */
2281 x86_dec( func
, num_inputs
);
2282 x86_jcc( func
, cc_NE
, inner_loop
);
2285 x86_pop( func
, aos_input
);
2288 static void soa_to_aos( struct x86_function
*func
, uint aos
, uint soa
, uint num
, uint stride
)
2290 struct x86_reg soa_output
;
2291 struct x86_reg aos_output
;
2292 struct x86_reg num_outputs
;
2293 struct x86_reg temp
;
2296 soa_output
= x86_make_reg( file_REG32
, reg_AX
);
2297 aos_output
= x86_make_reg( file_REG32
, reg_BX
);
2298 num_outputs
= x86_make_reg( file_REG32
, reg_CX
);
2299 temp
= x86_make_reg( file_REG32
, reg_DX
);
2302 x86_push( func
, aos_output
);
2304 x86_mov( func
, soa_output
, x86_fn_arg( func
, soa
) );
2305 x86_mov( func
, aos_output
, x86_fn_arg( func
, aos
) );
2306 x86_mov( func
, num_outputs
, x86_fn_arg( func
, num
) );
2309 inner_loop
= x86_get_label( func
);
2311 sse_movups( func
, make_xmm( 0 ), x86_make_disp( soa_output
, 0 ) );
2312 sse_movups( func
, make_xmm( 1 ), x86_make_disp( soa_output
, 16 ) );
2313 sse_movups( func
, make_xmm( 3 ), x86_make_disp( soa_output
, 32 ) );
2314 sse_movups( func
, make_xmm( 4 ), x86_make_disp( soa_output
, 48 ) );
2316 sse_movaps( func
, make_xmm( 2 ), make_xmm( 0 ) );
2317 sse_movaps( func
, make_xmm( 5 ), make_xmm( 3 ) );
2318 sse_unpcklps( func
, make_xmm( 0 ), make_xmm( 1 ) );
2319 sse_unpckhps( func
, make_xmm( 2 ), make_xmm( 1 ) );
2320 sse_unpcklps( func
, make_xmm( 3 ), make_xmm( 4 ) );
2321 sse_unpckhps( func
, make_xmm( 5 ), make_xmm( 4 ) );
2323 x86_mov( func
, temp
, x86_fn_arg( func
, stride
) );
2324 x86_push( func
, aos_output
);
2325 sse_movlps( func
, x86_make_disp( aos_output
, 0 ), make_xmm( 0 ) );
2326 sse_movlps( func
, x86_make_disp( aos_output
, 8 ), make_xmm( 3 ) );
2327 x86_add( func
, aos_output
, temp
);
2328 sse_movhps( func
, x86_make_disp( aos_output
, 0 ), make_xmm( 0 ) );
2329 sse_movhps( func
, x86_make_disp( aos_output
, 8 ), make_xmm( 3 ) );
2330 x86_add( func
, aos_output
, temp
);
2331 sse_movlps( func
, x86_make_disp( aos_output
, 0 ), make_xmm( 2 ) );
2332 sse_movlps( func
, x86_make_disp( aos_output
, 8 ), make_xmm( 5 ) );
2333 x86_add( func
, aos_output
, temp
);
2334 sse_movhps( func
, x86_make_disp( aos_output
, 0 ), make_xmm( 2 ) );
2335 sse_movhps( func
, x86_make_disp( aos_output
, 8 ), make_xmm( 5 ) );
2336 x86_pop( func
, aos_output
);
2338 /* Advance to next output */
2339 x86_lea( func
, aos_output
, x86_make_disp(aos_output
, 16) );
2340 x86_lea( func
, soa_output
, x86_make_disp(soa_output
, 64) );
2342 /* while --num_outputs */
2343 x86_dec( func
, num_outputs
);
2344 x86_jcc( func
, cc_NE
, inner_loop
);
2347 x86_pop( func
, aos_output
);
 * Translate a TGSI vertex/fragment shader to SSE2 code.
 * Slightly different things are done for vertex vs. fragment shaders.
 *
 * Note that fragment shaders are responsible for interpolating shader
 * inputs. Because on x86 we have only 4 GP registers, and here we
 * have 5 shader arguments (input, output, const, temp and coef), the
 * code is split into two phases -- DECLARATION and INSTRUCTION phase.
 * The GP register holding the output argument is aliased with the coeff
 * argument, as outputs are not needed in the DECLARATION phase.
 *
 * \param tokens  the TGSI input shader
 * \param func  the output SSE code/function
 * \param immediates  buffer to place immediates, later passed to SSE func
 * \return  1 for success, 0 if translation failed
2368 const struct tgsi_token
*tokens
,
2369 struct x86_function
*func
,
2370 float (*immediates
)[4],
2371 boolean do_swizzles
)
2373 struct tgsi_parse_context parse
;
2374 boolean instruction_phase
= FALSE
;
2376 uint num_immediates
= 0;
2380 func
->csr
= func
->store
;
2382 tgsi_parse_init( &parse
, tokens
);
2384 /* Can't just use EDI, EBX without save/restoring them:
2388 get_immediate_base() );
2396 * Different function args for vertex/fragment shaders:
2398 if (parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_FRAGMENT
) {
2399 /* DECLARATION phase, do not load output argument. */
2403 x86_fn_arg( func
, 1 ) );
2404 /* skipping outputs argument here */
2408 x86_fn_arg( func
, 3 ) );
2412 x86_fn_arg( func
, 4 ) );
2416 x86_fn_arg( func
, 5 ) );
2419 get_immediate_base(),
2420 x86_fn_arg( func
, 6 ) );
2423 assert(parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_VERTEX
);
2428 1, /* machine->input */
2430 8 ); /* input_stride */
2435 x86_fn_arg( func
, 1 ) );
2439 x86_fn_arg( func
, 2 ) );
2443 x86_fn_arg( func
, 3 ) );
2447 x86_fn_arg( func
, 4 ) );
2450 get_immediate_base(),
2451 x86_fn_arg( func
, 5 ) );
2454 while( !tgsi_parse_end_of_tokens( &parse
) && ok
) {
2455 tgsi_parse_token( &parse
);
2457 switch( parse
.FullToken
.Token
.Type
) {
2458 case TGSI_TOKEN_TYPE_DECLARATION
:
2459 if (parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_FRAGMENT
) {
2462 &parse
.FullToken
.FullDeclaration
);
2466 case TGSI_TOKEN_TYPE_INSTRUCTION
:
2467 if (parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_FRAGMENT
) {
2468 if( !instruction_phase
) {
2469 /* INSTRUCTION phase, overwrite coeff with output. */
2470 instruction_phase
= TRUE
;
2474 x86_fn_arg( func
, 2 ) );
2478 ok
= emit_instruction(
2480 &parse
.FullToken
.FullInstruction
);
2483 debug_printf("failed to translate tgsi opcode %d to SSE (%s)\n",
2484 parse
.FullToken
.FullInstruction
.Instruction
.Opcode
,
2485 parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_VERTEX
?
2486 "vertex shader" : "fragment shader");
2490 case TGSI_TOKEN_TYPE_IMMEDIATE
:
2491 /* simply copy the immediate values into the next immediates[] slot */
2493 const uint size
= parse
.FullToken
.FullImmediate
.Immediate
.Size
- 1;
2496 assert(num_immediates
< TGSI_EXEC_NUM_IMMEDIATES
);
2497 for( i
= 0; i
< size
; i
++ ) {
2498 immediates
[num_immediates
][i
] =
2499 parse
.FullToken
.FullImmediate
.u
.ImmediateFloat32
[i
].Float
;
2502 debug_printf("SSE FS immediate[%d] = %f %f %f %f\n",
2504 immediates
[num_immediates
][0],
2505 immediates
[num_immediates
][1],
2506 immediates
[num_immediates
][2],
2507 immediates
[num_immediates
][3]);
2519 if (parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_VERTEX
) {
2521 soa_to_aos( func
, 9, 2, 10, 11 );
2524 /* Can't just use EBX, EDI without save/restoring them:
2532 get_immediate_base() );
2536 tgsi_parse_free( &parse
);
2541 #endif /* PIPE_ARCH_X86 */