/**************************************************************************
 *
 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/
28 #include "pipe/p_debug.h"
29 #include "pipe/p_shader_tokens.h"
30 #include "util/u_math.h"
31 #include "tgsi/tgsi_parse.h"
32 #include "tgsi/tgsi_util.h"
33 #include "tgsi_exec.h"
34 #include "tgsi_sse2.h"
36 #include "rtasm/rtasm_x86sse.h"
42 * This costs about 100fps (close to 10%) in gears:
44 #define HIGH_PRECISION 1
/* Iterate CHAN over all NUM_CHANNELS SOA channels (X, Y, Z, W). */
49 #define FOR_EACH_CHANNEL( CHAN )\
50 for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)
/* Non-zero when channel CHAN is set in the write mask of instruction
 * INST's first destination register.
 */
52 #define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
53 ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))
/* Guard the following statement so it runs only when dst0 writes CHAN. */
55 #define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
56 if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))
/* Iterate CHAN over only the channels enabled in dst0's write mask. */
58 #define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
59 FOR_EACH_CHANNEL( CHAN )\
60 IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )
/* Short aliases for the tgsi_exec scratch-temporary slots that the
 * generated SSE code reads and writes (slot assignments come from
 * tgsi_exec.h).
 */
67 #define TEMP_ONE_I TGSI_EXEC_TEMP_ONE_I
68 #define TEMP_ONE_C TGSI_EXEC_TEMP_ONE_C
/* General-purpose scratch temp and the address-register temp. */
70 #define TEMP_R0 TGSI_EXEC_TEMP_R0
71 #define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
/* Execution mask temp: per pixel/vertex values of ~0 (live) or 0 (dead);
 * the caller must set it up before running the shader (see the comment
 * in the indirect-constant fetch loop below).
 */
72 #define TEMP_EXEC_MASK_I TGSI_EXEC_MASK_I
73 #define TEMP_EXEC_MASK_C TGSI_EXEC_MASK_C
77 * X86 utility functions.
86 (enum x86_reg_name
) xmm
);
90 * X86 register mapping helpers.
94 get_const_base( void )
101 static struct x86_reg
102 get_input_base( void )
109 static struct x86_reg
110 get_output_base( void )
117 static struct x86_reg
118 get_temp_base( void )
125 static struct x86_reg
126 get_coef_base( void )
128 return get_output_base();
131 static struct x86_reg
132 get_immediate_base( void )
141 * Data access helpers.
145 static struct x86_reg
150 return x86_make_disp(
151 get_immediate_base(),
152 (vec
* 4 + chan
) * 4 );
155 static struct x86_reg
160 return x86_make_disp(
162 (vec
* 4 + chan
) * 4 );
165 static struct x86_reg
170 return x86_make_disp(
172 (vec
* 4 + chan
) * 16 );
175 static struct x86_reg
180 return x86_make_disp(
182 (vec
* 4 + chan
) * 16 );
185 static struct x86_reg
190 return x86_make_disp(
192 (vec
* 4 + chan
) * 16 );
195 static struct x86_reg
201 return x86_make_disp(
203 ((vec
* 3 + member
) * 4 + chan
) * 4 );
209 struct x86_function
*func
)
216 * Data fetch helpers.
220 * Copy a shader constant to xmm register
221 * \param xmm the destination xmm register
222 * \param vec the src const buffer index
223 * \param chan src channel to fetch (X, Y, Z or W)
227 struct x86_function
*func
,
236 /* 'vec' is the offset from the address register's value.
237 * We're loading CONST[ADDR+vec] into an xmm register.
239 struct x86_reg r0
= get_input_base();
240 struct x86_reg r1
= get_output_base();
243 assert( indirectFile
== TGSI_FILE_ADDRESS
);
244 assert( indirectIndex
== 0 );
246 x86_push( func
, r0
);
247 x86_push( func
, r1
);
250 * Loop over the four pixels or vertices in the quad.
251 * Get the value of the address (offset) register for pixel/vertex[i],
252 * add it to the src offset and index into the constant buffer.
253 * Note that we're working on SOA data.
254 * If any of the pixel/vertex execution channels are unused their
255 * values will be garbage. It's very important that we don't use
256 * those garbage values as indexes into the constant buffer since
257 * that'll cause segfaults.
258 * The solution is to bitwise-AND the offset with the execution mask
259 * register whose values are either 0 or ~0.
260 * The caller must setup the execution mask register to indicate
261 * which channels are valid/alive before running the shader.
262 * The execution mask will also figure into loops and conditionals
265 for (i
= 0; i
< QUAD_SIZE
; i
++) {
266 /* r1 = address register[i] */
267 x86_mov( func
, r1
, x86_make_disp( get_temp( TEMP_ADDR
, CHAN_X
), i
* 4 ) );
268 /* r0 = execution mask[i] */
269 x86_mov( func
, r0
, x86_make_disp( get_temp( TEMP_EXEC_MASK_I
, TEMP_EXEC_MASK_C
), i
* 4 ) );
271 x86_and( func
, r1
, r0
);
272 /* r0 = 'vec', the offset */
273 x86_lea( func
, r0
, get_const( vec
, chan
) );
275 /* Quick hack to multiply r1 by 16 -- need to add SHL to rtasm.
277 x86_add( func
, r1
, r1
);
278 x86_add( func
, r1
, r1
);
279 x86_add( func
, r1
, r1
);
280 x86_add( func
, r1
, r1
);
282 x86_add( func
, r0
, r1
); /* r0 = r0 + r1 */
283 x86_mov( func
, r1
, x86_deref( r0
) );
284 x86_mov( func
, x86_make_disp( get_temp( TEMP_R0
, CHAN_X
), i
* 4 ), r1
);
293 get_temp( TEMP_R0
, CHAN_X
) );
296 /* 'vec' is the index into the src register file, such as TEMP[vec] */
302 get_const( vec
, chan
) );
307 SHUF( 0, 0, 0, 0 ) );
313 struct x86_function
*func
,
321 get_immediate( vec
, chan
) );
326 SHUF( 0, 0, 0, 0 ) );
331 * Copy a shader input to xmm register
332 * \param xmm the destination xmm register
333 * \param vec the src input attrib
334 * \param chan src channel to fetch (X, Y, Z or W)
338 struct x86_function
*func
,
346 get_input( vec
, chan
) );
350 * Store an xmm register to a shader output
351 * \param xmm the source xmm register
352 * \param vec the dest output attrib
353 * \param chan src dest channel to store (X, Y, Z or W)
357 struct x86_function
*func
,
364 get_output( vec
, chan
),
369 * Copy a shader temporary to xmm register
370 * \param xmm the destination xmm register
371 * \param vec the src temp register
372 * \param chan src channel to fetch (X, Y, Z or W)
376 struct x86_function
*func
,
384 get_temp( vec
, chan
) );
388 * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
389 * \param xmm the destination xmm register
390 * \param vec the src input/attribute coefficient index
391 * \param chan src channel to fetch (X, Y, Z or W)
392 * \param member 0=a0, 1=dadx, 2=dady
396 struct x86_function
*func
,
405 get_coef( vec
, chan
, member
) );
410 SHUF( 0, 0, 0, 0 ) );
414 * Data store helpers.
419 struct x86_function
*func
,
426 get_input( vec
, chan
),
432 struct x86_function
*func
,
439 get_temp( vec
, chan
),
445 struct x86_function
*func
,
455 vec
+ TGSI_EXEC_TEMP_ADDR
,
460 * Coefficent fetch helpers.
465 struct x86_function
*func
,
480 struct x86_function
*func
,
495 struct x86_function
*func
,
509 * Function call helpers.
514 struct x86_function
*func
)
518 x86_make_reg( file_REG32
, reg_AX
) );
521 x86_make_reg( file_REG32
, reg_CX
) );
524 x86_make_reg( file_REG32
, reg_DX
) );
529 struct x86_function
*func
)
531 /* Restore GP registers in a reverse order.
535 x86_make_reg( file_REG32
, reg_DX
) );
538 x86_make_reg( file_REG32
, reg_CX
) );
541 x86_make_reg( file_REG32
, reg_AX
) );
546 struct x86_function
*func
,
548 void (PIPE_CDECL
*code
)() )
552 get_temp( TEMP_R0
, 0 ),
553 make_xmm( xmm_dst
) );
559 struct x86_reg ecx
= x86_make_reg( file_REG32
, reg_CX
);
564 get_temp( TEMP_R0
, 0 ) );
566 x86_push( func
, ecx
);
567 x86_mov_reg_imm( func
, ecx
, (unsigned long) code
);
568 x86_call( func
, ecx
);
579 get_temp( TEMP_R0
, 0 ) );
583 emit_func_call_dst_src(
584 struct x86_function
*func
,
587 void (PIPE_CDECL
*code
)() )
591 get_temp( TEMP_R0
, 1 ),
592 make_xmm( xmm_src
) );
601 * Low-level instruction translators.
606 struct x86_function
*func
,
613 TGSI_EXEC_TEMP_7FFFFFFF_I
,
614 TGSI_EXEC_TEMP_7FFFFFFF_C
) );
619 struct x86_function
*func
,
626 make_xmm( xmm_src
) );
629 static void PIPE_CDECL
633 store
[0] = cosf( store
[0] );
634 store
[1] = cosf( store
[1] );
635 store
[2] = cosf( store
[2] );
636 store
[3] = cosf( store
[3] );
641 struct x86_function
*func
,
650 static void PIPE_CDECL
655 store
[0] = util_fast_exp2( store
[0] );
656 store
[1] = util_fast_exp2( store
[1] );
657 store
[2] = util_fast_exp2( store
[2] );
658 store
[3] = util_fast_exp2( store
[3] );
660 store
[0] = powf( 2.0f
, store
[0] );
661 store
[1] = powf( 2.0f
, store
[1] );
662 store
[2] = powf( 2.0f
, store
[2] );
663 store
[3] = powf( 2.0f
, store
[3] );
669 struct x86_function
*func
,
680 struct x86_function
*func
,
691 struct x86_function
*func
,
700 static void PIPE_CDECL
704 store
[0] = floorf( store
[0] );
705 store
[1] = floorf( store
[1] );
706 store
[2] = floorf( store
[2] );
707 store
[3] = floorf( store
[3] );
712 struct x86_function
*func
,
721 static void PIPE_CDECL
725 store
[0] -= floorf( store
[0] );
726 store
[1] -= floorf( store
[1] );
727 store
[2] -= floorf( store
[2] );
728 store
[3] -= floorf( store
[3] );
733 struct x86_function
*func
,
742 static void PIPE_CDECL
746 store
[0] = util_fast_log2( store
[0] );
747 store
[1] = util_fast_log2( store
[1] );
748 store
[2] = util_fast_log2( store
[2] );
749 store
[3] = util_fast_log2( store
[3] );
754 struct x86_function
*func
,
765 struct x86_function
*func
,
772 make_xmm( xmm_src
) );
776 emit_mul (struct x86_function
*func
,
783 make_xmm( xmm_src
) );
788 struct x86_function
*func
,
795 TGSI_EXEC_TEMP_80000000_I
,
796 TGSI_EXEC_TEMP_80000000_C
) );
799 static void PIPE_CDECL
804 store
[0] = util_fast_pow( store
[0], store
[4] );
805 store
[1] = util_fast_pow( store
[1], store
[5] );
806 store
[2] = util_fast_pow( store
[2], store
[6] );
807 store
[3] = util_fast_pow( store
[3], store
[7] );
809 store
[0] = powf( store
[0], store
[4] );
810 store
[1] = powf( store
[1], store
[5] );
811 store
[2] = powf( store
[2], store
[6] );
812 store
[3] = powf( store
[3], store
[7] );
818 struct x86_function
*func
,
822 emit_func_call_dst_src(
831 struct x86_function
*func
,
835 /* On Intel CPUs at least, this is only accurate to 12 bits -- not
836 * good enough. Need to either emit a proper divide or use the
837 * iterative technique described below in emit_rsqrt().
842 make_xmm( xmm_src
) );
845 static void PIPE_CDECL
849 store
[0] = floorf( store
[0] + 0.5f
);
850 store
[1] = floorf( store
[1] + 0.5f
);
851 store
[2] = floorf( store
[2] + 0.5f
);
852 store
[3] = floorf( store
[3] + 0.5f
);
857 struct x86_function
*func
,
870 struct x86_function
*func
,
875 /* Although rsqrtps() and rcpps() are low precision on some/all SSE
876 * implementations, it is possible to improve its precision at
877 * fairly low cost, using a newton/raphson step, as below:
879 * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
880 * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
882 * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
885 struct x86_reg dst
= make_xmm( xmm_dst
);
886 struct x86_reg src
= make_xmm( xmm_src
);
887 struct x86_reg tmp0
= make_xmm( 2 );
888 struct x86_reg tmp1
= make_xmm( 3 );
890 assert( xmm_dst
!= xmm_src
);
891 assert( xmm_dst
!= 2 && xmm_dst
!= 3 );
892 assert( xmm_src
!= 2 && xmm_src
!= 3 );
894 sse_movaps( func
, dst
, get_temp( TGSI_EXEC_TEMP_HALF_I
, TGSI_EXEC_TEMP_HALF_C
) );
895 sse_movaps( func
, tmp0
, get_temp( TGSI_EXEC_TEMP_THREE_I
, TGSI_EXEC_TEMP_THREE_C
) );
896 sse_rsqrtps( func
, tmp1
, src
);
897 sse_mulps( func
, src
, tmp1
);
898 sse_mulps( func
, dst
, tmp1
);
899 sse_mulps( func
, src
, tmp1
);
900 sse_subps( func
, tmp0
, src
);
901 sse_mulps( func
, dst
, tmp0
);
904 /* On Intel CPUs at least, this is only accurate to 12 bits -- not
910 make_xmm( xmm_src
) );
916 struct x86_function
*func
,
923 TGSI_EXEC_TEMP_80000000_I
,
924 TGSI_EXEC_TEMP_80000000_C
) );
927 static void PIPE_CDECL
931 store
[0] = store
[0] < 0.0f
? -1.0f
: store
[0] > 0.0f
? 1.0f
: 0.0f
;
932 store
[1] = store
[1] < 0.0f
? -1.0f
: store
[1] > 0.0f
? 1.0f
: 0.0f
;
933 store
[2] = store
[2] < 0.0f
? -1.0f
: store
[2] > 0.0f
? 1.0f
: 0.0f
;
934 store
[3] = store
[3] < 0.0f
? -1.0f
: store
[3] > 0.0f
? 1.0f
: 0.0f
;
939 struct x86_function
*func
,
950 static void PIPE_CDECL
954 store
[0] = sinf( store
[0] );
955 store
[1] = sinf( store
[1] );
956 store
[2] = sinf( store
[2] );
957 store
[3] = sinf( store
[3] );
961 emit_sin (struct x86_function
*func
,
972 struct x86_function
*func
,
979 make_xmm( xmm_src
) );
988 struct x86_function
*func
,
990 const struct tgsi_full_src_register
*reg
,
991 const unsigned chan_index
)
993 unsigned swizzle
= tgsi_util_get_full_src_register_extswizzle( reg
, chan_index
);
996 case TGSI_EXTSWIZZLE_X
:
997 case TGSI_EXTSWIZZLE_Y
:
998 case TGSI_EXTSWIZZLE_Z
:
999 case TGSI_EXTSWIZZLE_W
:
1000 switch (reg
->SrcRegister
.File
) {
1001 case TGSI_FILE_CONSTANT
:
1005 reg
->SrcRegister
.Index
,
1007 reg
->SrcRegister
.Indirect
,
1008 reg
->SrcRegisterInd
.File
,
1009 reg
->SrcRegisterInd
.Index
);
1012 case TGSI_FILE_IMMEDIATE
:
1016 reg
->SrcRegister
.Index
,
1020 case TGSI_FILE_INPUT
:
1024 reg
->SrcRegister
.Index
,
1028 case TGSI_FILE_TEMPORARY
:
1032 reg
->SrcRegister
.Index
,
1041 case TGSI_EXTSWIZZLE_ZERO
:
1045 TGSI_EXEC_TEMP_00000000_I
,
1046 TGSI_EXEC_TEMP_00000000_C
);
1049 case TGSI_EXTSWIZZLE_ONE
:
1061 switch( tgsi_util_get_full_src_register_sign_mode( reg
, chan_index
) ) {
1062 case TGSI_UTIL_SIGN_CLEAR
:
1063 emit_abs( func
, xmm
);
1066 case TGSI_UTIL_SIGN_SET
:
1067 emit_setsign( func
, xmm
);
1070 case TGSI_UTIL_SIGN_TOGGLE
:
1071 emit_neg( func
, xmm
);
1074 case TGSI_UTIL_SIGN_KEEP
:
/* Emit code fetching channel CHAN of instruction INST's source register
 * INDEX into xmm register XMM (delegates to emit_fetch, which handles
 * swizzles and sign modes).
 */
1079 #define FETCH( FUNC, INST, XMM, INDEX, CHAN )\
1080 emit_fetch( FUNC, XMM, &(INST).FullSrcRegisters[INDEX], CHAN )
1088 struct x86_function
*func
,
1090 const struct tgsi_full_dst_register
*reg
,
1091 const struct tgsi_full_instruction
*inst
,
1092 unsigned chan_index
)
1094 switch( reg
->DstRegister
.File
) {
1095 case TGSI_FILE_OUTPUT
:
1099 reg
->DstRegister
.Index
,
1103 case TGSI_FILE_TEMPORARY
:
1107 reg
->DstRegister
.Index
,
1111 case TGSI_FILE_ADDRESS
:
1115 reg
->DstRegister
.Index
,
1123 switch( inst
->Instruction
.Saturate
) {
1127 case TGSI_SAT_ZERO_ONE
:
1131 case TGSI_SAT_MINUS_PLUS_ONE
:
/* Emit code storing xmm register XMM to channel CHAN of instruction
 * INST's destination register INDEX (delegates to emit_store; the
 * instruction itself is passed along so saturate modes can be applied).
 */
1137 #define STORE( FUNC, INST, XMM, INDEX, CHAN )\
1138 emit_store( FUNC, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN )
1141 * High-level instruction translators.
1146 struct x86_function
*func
,
1147 const struct tgsi_full_src_register
*reg
)
1149 unsigned uniquemask
;
1150 unsigned registers
[4];
1151 unsigned nextregister
= 0;
1152 unsigned firstchan
= ~0;
1153 unsigned chan_index
;
1155 /* This mask stores component bits that were already tested. Note that
1156 * we test if the value is less than zero, so 1.0 and 0.0 need not to be
1158 uniquemask
= (1 << TGSI_EXTSWIZZLE_ZERO
) | (1 << TGSI_EXTSWIZZLE_ONE
);
1160 FOR_EACH_CHANNEL( chan_index
) {
1163 /* unswizzle channel */
1164 swizzle
= tgsi_util_get_full_src_register_extswizzle(
1168 /* check if the component has not been already tested */
1169 if( !(uniquemask
& (1 << swizzle
)) ) {
1170 uniquemask
|= 1 << swizzle
;
1172 /* allocate register */
1173 registers
[chan_index
] = nextregister
;
1181 /* mark the first channel used */
1182 if( firstchan
== ~0 ) {
1183 firstchan
= chan_index
;
1190 x86_make_reg( file_REG32
, reg_AX
) );
1193 x86_make_reg( file_REG32
, reg_DX
) );
1195 FOR_EACH_CHANNEL( chan_index
) {
1196 if( uniquemask
& (1 << chan_index
) ) {
1199 make_xmm( registers
[chan_index
] ),
1201 TGSI_EXEC_TEMP_00000000_I
,
1202 TGSI_EXEC_TEMP_00000000_C
),
1205 if( chan_index
== firstchan
) {
1208 x86_make_reg( file_REG32
, reg_AX
),
1209 make_xmm( registers
[chan_index
] ) );
1214 x86_make_reg( file_REG32
, reg_DX
),
1215 make_xmm( registers
[chan_index
] ) );
1218 x86_make_reg( file_REG32
, reg_AX
),
1219 x86_make_reg( file_REG32
, reg_DX
) );
1227 TGSI_EXEC_TEMP_KILMASK_I
,
1228 TGSI_EXEC_TEMP_KILMASK_C
),
1229 x86_make_reg( file_REG32
, reg_AX
) );
1233 x86_make_reg( file_REG32
, reg_DX
) );
1236 x86_make_reg( file_REG32
, reg_AX
) );
1242 struct x86_function
*func
)
1244 /* XXX todo / fix me */
1250 struct x86_function
*func
,
1251 struct tgsi_full_instruction
*inst
,
1254 unsigned chan_index
;
1256 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1257 FETCH( func
, *inst
, 0, 0, chan_index
);
1258 FETCH( func
, *inst
, 1, 1, chan_index
);
1270 STORE( func
, *inst
, 0, 0, chan_index
);
1276 struct x86_function
*func
,
1277 struct tgsi_full_instruction
*inst
)
1279 unsigned chan_index
;
1281 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1282 FETCH( func
, *inst
, 0, 0, chan_index
);
1283 FETCH( func
, *inst
, 1, 1, chan_index
);
1284 FETCH( func
, *inst
, 2, 2, chan_index
);
1289 TGSI_EXEC_TEMP_00000000_I
,
1290 TGSI_EXEC_TEMP_00000000_C
),
1304 STORE( func
, *inst
, 0, 0, chan_index
);
1310 struct x86_function
*func
,
1311 struct tgsi_full_instruction
*inst
)
1313 unsigned chan_index
;
1315 switch (inst
->Instruction
.Opcode
) {
1316 case TGSI_OPCODE_ARL
:
1317 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1318 FETCH( func
, *inst
, 0, 0, chan_index
);
1319 emit_f2it( func
, 0 );
1320 STORE( func
, *inst
, 0, 0, chan_index
);
1324 case TGSI_OPCODE_MOV
:
1325 case TGSI_OPCODE_SWZ
:
1326 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1327 FETCH( func
, *inst
, 0, 0, chan_index
);
1328 STORE( func
, *inst
, 0, 0, chan_index
);
1332 case TGSI_OPCODE_LIT
:
1333 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1334 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) ) {
1340 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ) {
1341 STORE( func
, *inst
, 0, 0, CHAN_X
);
1343 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) ) {
1344 STORE( func
, *inst
, 0, 0, CHAN_W
);
1347 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ||
1348 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
1349 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ) {
1350 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1355 TGSI_EXEC_TEMP_00000000_I
,
1356 TGSI_EXEC_TEMP_00000000_C
) );
1357 STORE( func
, *inst
, 0, 0, CHAN_Y
);
1359 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
1360 /* XMM[1] = SrcReg[0].yyyy */
1361 FETCH( func
, *inst
, 1, 0, CHAN_Y
);
1362 /* XMM[1] = max(XMM[1], 0) */
1367 TGSI_EXEC_TEMP_00000000_I
,
1368 TGSI_EXEC_TEMP_00000000_C
) );
1369 /* XMM[2] = SrcReg[0].wwww */
1370 FETCH( func
, *inst
, 2, 0, CHAN_W
);
1371 /* XMM[2] = min(XMM[2], 128.0) */
1376 TGSI_EXEC_TEMP_128_I
,
1377 TGSI_EXEC_TEMP_128_C
) );
1378 /* XMM[2] = max(XMM[2], -128.0) */
1383 TGSI_EXEC_TEMP_MINUS_128_I
,
1384 TGSI_EXEC_TEMP_MINUS_128_C
) );
1385 emit_pow( func
, 1, 2 );
1386 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1400 STORE( func
, *inst
, 2, 0, CHAN_Z
);
1405 case TGSI_OPCODE_RCP
:
1406 /* TGSI_OPCODE_RECIP */
1407 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1408 emit_rcp( func
, 0, 0 );
1409 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1410 STORE( func
, *inst
, 0, 0, chan_index
);
1414 case TGSI_OPCODE_RSQ
:
1415 /* TGSI_OPCODE_RECIPSQRT */
1416 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1417 emit_rsqrt( func
, 1, 0 );
1418 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1419 STORE( func
, *inst
, 1, 0, chan_index
);
1423 case TGSI_OPCODE_EXP
:
1424 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1425 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ||
1426 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
1427 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1428 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1429 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
1430 emit_MOV( func
, 1, 0 );
1431 emit_flr( func
, 1 );
1432 /* dst.x = ex2(floor(src.x)) */
1433 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
)) {
1434 emit_MOV( func
, 2, 1 );
1435 emit_ex2( func
, 2 );
1436 STORE( func
, *inst
, 2, 0, CHAN_X
);
1438 /* dst.y = src.x - floor(src.x) */
1439 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
1440 emit_MOV( func
, 2, 0 );
1441 emit_sub( func
, 2, 1 );
1442 STORE( func
, *inst
, 2, 0, CHAN_Y
);
1445 /* dst.z = ex2(src.x) */
1446 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
1447 emit_ex2( func
, 0 );
1448 STORE( func
, *inst
, 0, 0, CHAN_Z
);
1452 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
)) {
1453 emit_tempf( func
, 0, TEMP_ONE_I
, TEMP_ONE_C
);
1454 STORE( func
, *inst
, 0, 0, CHAN_W
);
1458 case TGSI_OPCODE_LOG
:
1459 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1460 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ||
1461 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
1462 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1463 emit_abs( func
, 0 );
1464 emit_MOV( func
, 1, 0 );
1465 emit_lg2( func
, 1 );
1466 /* dst.z = lg2(abs(src.x)) */
1467 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
1468 STORE( func
, *inst
, 1, 0, CHAN_Z
);
1470 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1471 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
1472 emit_flr( func
, 1 );
1473 /* dst.x = floor(lg2(abs(src.x))) */
1474 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
)) {
1475 STORE( func
, *inst
, 1, 0, CHAN_X
);
1477 /* dst.x = abs(src)/ex2(floor(lg2(abs(src.x)))) */
1478 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
1479 emit_ex2( func
, 1 );
1480 emit_rcp( func
, 1, 1 );
1481 emit_mul( func
, 0, 1 );
1482 STORE( func
, *inst
, 0, 0, CHAN_Y
);
1487 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
)) {
1488 emit_tempf( func
, 0, TEMP_ONE_I
, TEMP_ONE_C
);
1489 STORE( func
, *inst
, 0, 0, CHAN_W
);
1493 case TGSI_OPCODE_MUL
:
1494 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1495 FETCH( func
, *inst
, 0, 0, chan_index
);
1496 FETCH( func
, *inst
, 1, 1, chan_index
);
1497 emit_mul( func
, 0, 1 );
1498 STORE( func
, *inst
, 0, 0, chan_index
);
1502 case TGSI_OPCODE_ADD
:
1503 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1504 FETCH( func
, *inst
, 0, 0, chan_index
);
1505 FETCH( func
, *inst
, 1, 1, chan_index
);
1506 emit_add( func
, 0, 1 );
1507 STORE( func
, *inst
, 0, 0, chan_index
);
1511 case TGSI_OPCODE_DP3
:
1512 /* TGSI_OPCODE_DOT3 */
1513 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1514 FETCH( func
, *inst
, 1, 1, CHAN_X
);
1515 emit_mul( func
, 0, 1 );
1516 FETCH( func
, *inst
, 1, 0, CHAN_Y
);
1517 FETCH( func
, *inst
, 2, 1, CHAN_Y
);
1518 emit_mul( func
, 1, 2 );
1519 emit_add( func
, 0, 1 );
1520 FETCH( func
, *inst
, 1, 0, CHAN_Z
);
1521 FETCH( func
, *inst
, 2, 1, CHAN_Z
);
1522 emit_mul( func
, 1, 2 );
1523 emit_add( func
, 0, 1 );
1524 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1525 STORE( func
, *inst
, 0, 0, chan_index
);
1529 case TGSI_OPCODE_DP4
:
1530 /* TGSI_OPCODE_DOT4 */
1531 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1532 FETCH( func
, *inst
, 1, 1, CHAN_X
);
1533 emit_mul( func
, 0, 1 );
1534 FETCH( func
, *inst
, 1, 0, CHAN_Y
);
1535 FETCH( func
, *inst
, 2, 1, CHAN_Y
);
1536 emit_mul( func
, 1, 2 );
1537 emit_add( func
, 0, 1 );
1538 FETCH( func
, *inst
, 1, 0, CHAN_Z
);
1539 FETCH( func
, *inst
, 2, 1, CHAN_Z
);
1540 emit_mul(func
, 1, 2 );
1541 emit_add(func
, 0, 1 );
1542 FETCH( func
, *inst
, 1, 0, CHAN_W
);
1543 FETCH( func
, *inst
, 2, 1, CHAN_W
);
1544 emit_mul( func
, 1, 2 );
1545 emit_add( func
, 0, 1 );
1546 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1547 STORE( func
, *inst
, 0, 0, chan_index
);
1551 case TGSI_OPCODE_DST
:
1552 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) {
1558 STORE( func
, *inst
, 0, 0, CHAN_X
);
1560 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) {
1561 FETCH( func
, *inst
, 0, 0, CHAN_Y
);
1562 FETCH( func
, *inst
, 1, 1, CHAN_Y
);
1563 emit_mul( func
, 0, 1 );
1564 STORE( func
, *inst
, 0, 0, CHAN_Y
);
1566 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) {
1567 FETCH( func
, *inst
, 0, 0, CHAN_Z
);
1568 STORE( func
, *inst
, 0, 0, CHAN_Z
);
1570 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) {
1571 FETCH( func
, *inst
, 0, 1, CHAN_W
);
1572 STORE( func
, *inst
, 0, 0, CHAN_W
);
1576 case TGSI_OPCODE_MIN
:
1577 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1578 FETCH( func
, *inst
, 0, 0, chan_index
);
1579 FETCH( func
, *inst
, 1, 1, chan_index
);
1584 STORE( func
, *inst
, 0, 0, chan_index
);
1588 case TGSI_OPCODE_MAX
:
1589 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1590 FETCH( func
, *inst
, 0, 0, chan_index
);
1591 FETCH( func
, *inst
, 1, 1, chan_index
);
1596 STORE( func
, *inst
, 0, 0, chan_index
);
1600 case TGSI_OPCODE_SLT
:
1601 /* TGSI_OPCODE_SETLT */
1602 emit_setcc( func
, inst
, cc_LessThan
);
1605 case TGSI_OPCODE_SGE
:
1606 /* TGSI_OPCODE_SETGE */
1607 emit_setcc( func
, inst
, cc_NotLessThan
);
1610 case TGSI_OPCODE_MAD
:
1611 /* TGSI_OPCODE_MADD */
1612 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1613 FETCH( func
, *inst
, 0, 0, chan_index
);
1614 FETCH( func
, *inst
, 1, 1, chan_index
);
1615 FETCH( func
, *inst
, 2, 2, chan_index
);
1616 emit_mul( func
, 0, 1 );
1617 emit_add( func
, 0, 2 );
1618 STORE( func
, *inst
, 0, 0, chan_index
);
1622 case TGSI_OPCODE_SUB
:
1623 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1624 FETCH( func
, *inst
, 0, 0, chan_index
);
1625 FETCH( func
, *inst
, 1, 1, chan_index
);
1626 emit_sub( func
, 0, 1 );
1627 STORE( func
, *inst
, 0, 0, chan_index
);
1631 case TGSI_OPCODE_LERP
:
1632 /* TGSI_OPCODE_LRP */
1633 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1634 FETCH( func
, *inst
, 0, 0, chan_index
);
1635 FETCH( func
, *inst
, 1, 1, chan_index
);
1636 FETCH( func
, *inst
, 2, 2, chan_index
);
1637 emit_sub( func
, 1, 2 );
1638 emit_mul( func
, 0, 1 );
1639 emit_add( func
, 0, 2 );
1640 STORE( func
, *inst
, 0, 0, chan_index
);
1644 case TGSI_OPCODE_CND
:
1648 case TGSI_OPCODE_CND0
:
1652 case TGSI_OPCODE_DOT2ADD
:
1653 /* TGSI_OPCODE_DP2A */
1657 case TGSI_OPCODE_INDEX
:
1661 case TGSI_OPCODE_NEGATE
:
1665 case TGSI_OPCODE_FRAC
:
1666 /* TGSI_OPCODE_FRC */
1667 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1668 FETCH( func
, *inst
, 0, 0, chan_index
);
1669 emit_frc( func
, 0 );
1670 STORE( func
, *inst
, 0, 0, chan_index
);
1674 case TGSI_OPCODE_CLAMP
:
1678 case TGSI_OPCODE_FLOOR
:
1679 /* TGSI_OPCODE_FLR */
1680 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1681 FETCH( func
, *inst
, 0, 0, chan_index
);
1682 emit_flr( func
, 0 );
1683 STORE( func
, *inst
, 0, 0, chan_index
);
1687 case TGSI_OPCODE_ROUND
:
1688 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1689 FETCH( func
, *inst
, 0, 0, chan_index
);
1690 emit_rnd( func
, 0, 0 );
1691 STORE( func
, *inst
, 0, 0, chan_index
);
1695 case TGSI_OPCODE_EXPBASE2
:
1696 /* TGSI_OPCODE_EX2 */
1697 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1698 emit_ex2( func
, 0 );
1699 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1700 STORE( func
, *inst
, 0, 0, chan_index
);
1704 case TGSI_OPCODE_LOGBASE2
:
1705 /* TGSI_OPCODE_LG2 */
1706 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1707 emit_lg2( func
, 0 );
1708 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1709 STORE( func
, *inst
, 0, 0, chan_index
);
1713 case TGSI_OPCODE_POWER
:
1714 /* TGSI_OPCODE_POW */
1715 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1716 FETCH( func
, *inst
, 1, 1, CHAN_X
);
1717 emit_pow( func
, 0, 1 );
1718 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1719 STORE( func
, *inst
, 0, 0, chan_index
);
1723 case TGSI_OPCODE_CROSSPRODUCT
:
1724 /* TGSI_OPCODE_XPD */
1725 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1726 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ) {
1727 FETCH( func
, *inst
, 1, 1, CHAN_Z
);
1728 FETCH( func
, *inst
, 3, 0, CHAN_Z
);
1730 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1731 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
1732 FETCH( func
, *inst
, 0, 0, CHAN_Y
);
1733 FETCH( func
, *inst
, 4, 1, CHAN_Y
);
1735 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) {
1736 emit_MOV( func
, 2, 0 );
1737 emit_mul( func
, 2, 1 );
1738 emit_MOV( func
, 5, 3 );
1739 emit_mul( func
, 5, 4 );
1740 emit_sub( func
, 2, 5 );
1741 STORE( func
, *inst
, 2, 0, CHAN_X
);
1743 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ||
1744 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
1745 FETCH( func
, *inst
, 2, 1, CHAN_X
);
1746 FETCH( func
, *inst
, 5, 0, CHAN_X
);
1748 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) {
1749 emit_mul( func
, 3, 2 );
1750 emit_mul( func
, 1, 5 );
1751 emit_sub( func
, 3, 1 );
1752 STORE( func
, *inst
, 3, 0, CHAN_Y
);
1754 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) {
1755 emit_mul( func
, 5, 4 );
1756 emit_mul( func
, 0, 2 );
1757 emit_sub( func
, 5, 0 );
1758 STORE( func
, *inst
, 5, 0, CHAN_Z
);
1760 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) {
1766 STORE( func
, *inst
, 0, 0, CHAN_W
);
1770 case TGSI_OPCODE_MULTIPLYMATRIX
:
1774 case TGSI_OPCODE_ABS
:
1775 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1776 FETCH( func
, *inst
, 0, 0, chan_index
);
1777 emit_abs( func
, 0) ;
1779 STORE( func
, *inst
, 0, 0, chan_index
);
1783 case TGSI_OPCODE_RCC
:
1787 case TGSI_OPCODE_DPH
:
1788 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1789 FETCH( func
, *inst
, 1, 1, CHAN_X
);
1790 emit_mul( func
, 0, 1 );
1791 FETCH( func
, *inst
, 1, 0, CHAN_Y
);
1792 FETCH( func
, *inst
, 2, 1, CHAN_Y
);
1793 emit_mul( func
, 1, 2 );
1794 emit_add( func
, 0, 1 );
1795 FETCH( func
, *inst
, 1, 0, CHAN_Z
);
1796 FETCH( func
, *inst
, 2, 1, CHAN_Z
);
1797 emit_mul( func
, 1, 2 );
1798 emit_add( func
, 0, 1 );
1799 FETCH( func
, *inst
, 1, 1, CHAN_W
);
1800 emit_add( func
, 0, 1 );
1801 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1802 STORE( func
, *inst
, 0, 0, chan_index
);
1806 case TGSI_OPCODE_COS
:
1807 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1808 emit_cos( func
, 0 );
1809 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1810 STORE( func
, *inst
, 0, 0, chan_index
);
1814 case TGSI_OPCODE_DDX
:
1818 case TGSI_OPCODE_DDY
:
1822 case TGSI_OPCODE_KILP
:
1823 /* predicated kill */
1825 return 0; /* XXX fix me */
1828 case TGSI_OPCODE_KIL
:
1829 /* conditional kill */
1830 emit_kil( func
, &inst
->FullSrcRegisters
[0] );
1833 case TGSI_OPCODE_PK2H
:
1837 case TGSI_OPCODE_PK2US
:
1841 case TGSI_OPCODE_PK4B
:
1845 case TGSI_OPCODE_PK4UB
:
1849 case TGSI_OPCODE_RFL
:
1853 case TGSI_OPCODE_SEQ
:
1857 case TGSI_OPCODE_SFL
:
1861 case TGSI_OPCODE_SGT
:
1865 case TGSI_OPCODE_SIN
:
1866 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1867 emit_sin( func
, 0 );
1868 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1869 STORE( func
, *inst
, 0, 0, chan_index
);
1873 case TGSI_OPCODE_SLE
:
1877 case TGSI_OPCODE_SNE
:
1881 case TGSI_OPCODE_STR
:
1885 case TGSI_OPCODE_TEX
:
1887 /* Disable dummy texture code:
1894 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1895 STORE( func
, *inst
, 0, 0, chan_index
);
1903 case TGSI_OPCODE_TXD
:
1907 case TGSI_OPCODE_UP2H
:
1911 case TGSI_OPCODE_UP2US
:
1915 case TGSI_OPCODE_UP4B
:
1919 case TGSI_OPCODE_UP4UB
:
1923 case TGSI_OPCODE_X2D
:
1927 case TGSI_OPCODE_ARA
:
1932 case TGSI_OPCODE_ARR
:
1933 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1934 FETCH( func
, *inst
, 0, 0, chan_index
);
1935 emit_rnd( func
, 0, 0 );
1936 emit_f2it( func
, 0 );
1937 STORE( func
, *inst
, 0, 0, chan_index
);
1941 case TGSI_OPCODE_BRA
:
1945 case TGSI_OPCODE_CAL
:
1949 case TGSI_OPCODE_RET
:
1953 case TGSI_OPCODE_END
:
1956 case TGSI_OPCODE_SSG
:
1957 /* TGSI_OPCODE_SGN */
1958 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1959 FETCH( func
, *inst
, 0, 0, chan_index
);
1960 emit_sgn( func
, 0, 0 );
1961 STORE( func
, *inst
, 0, 0, chan_index
);
1965 case TGSI_OPCODE_CMP
:
1966 emit_cmp (func
, inst
);
1969 case TGSI_OPCODE_SCS
:
1970 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) {
1971 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1972 emit_cos( func
, 0 );
1973 STORE( func
, *inst
, 0, 0, CHAN_X
);
1975 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) {
1976 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1977 emit_sin( func
, 0 );
1978 STORE( func
, *inst
, 0, 0, CHAN_Y
);
1980 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) {
1984 TGSI_EXEC_TEMP_00000000_I
,
1985 TGSI_EXEC_TEMP_00000000_C
);
1986 STORE( func
, *inst
, 0, 0, CHAN_Z
);
1988 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) {
1994 STORE( func
, *inst
, 0, 0, CHAN_W
);
1998 case TGSI_OPCODE_TXB
:
2002 case TGSI_OPCODE_NRM
:
2006 case TGSI_OPCODE_DIV
:
2010 case TGSI_OPCODE_DP2
:
2014 case TGSI_OPCODE_TXL
:
2018 case TGSI_OPCODE_BRK
:
2022 case TGSI_OPCODE_IF
:
2026 case TGSI_OPCODE_LOOP
:
2030 case TGSI_OPCODE_REP
:
2034 case TGSI_OPCODE_ELSE
:
2038 case TGSI_OPCODE_ENDIF
:
2042 case TGSI_OPCODE_ENDLOOP
:
2046 case TGSI_OPCODE_ENDREP
:
2050 case TGSI_OPCODE_PUSHA
:
2054 case TGSI_OPCODE_POPA
:
2058 case TGSI_OPCODE_CEIL
:
2062 case TGSI_OPCODE_I2F
:
2066 case TGSI_OPCODE_NOT
:
2070 case TGSI_OPCODE_TRUNC
:
2071 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2072 FETCH( func
, *inst
, 0, 0, chan_index
);
2073 emit_f2it( func
, 0 );
2074 emit_i2f( func
, 0 );
2075 STORE( func
, *inst
, 0, 0, chan_index
);
2079 case TGSI_OPCODE_SHL
:
2083 case TGSI_OPCODE_SHR
:
2087 case TGSI_OPCODE_AND
:
2091 case TGSI_OPCODE_OR
:
2095 case TGSI_OPCODE_MOD
:
2099 case TGSI_OPCODE_XOR
:
2103 case TGSI_OPCODE_SAD
:
2107 case TGSI_OPCODE_TXF
:
2111 case TGSI_OPCODE_TXQ
:
2115 case TGSI_OPCODE_CONT
:
2119 case TGSI_OPCODE_EMIT
:
2123 case TGSI_OPCODE_ENDPRIM
:
2136 struct x86_function
*func
,
2137 struct tgsi_full_declaration
*decl
)
2139 if( decl
->Declaration
.File
== TGSI_FILE_INPUT
) {
2140 unsigned first
, last
, mask
;
2143 first
= decl
->DeclarationRange
.First
;
2144 last
= decl
->DeclarationRange
.Last
;
2145 mask
= decl
->Declaration
.UsageMask
;
2147 for( i
= first
; i
<= last
; i
++ ) {
2148 for( j
= 0; j
< NUM_CHANNELS
; j
++ ) {
2149 if( mask
& (1 << j
) ) {
2150 switch( decl
->Declaration
.Interpolate
) {
2151 case TGSI_INTERPOLATE_CONSTANT
:
2152 emit_coef_a0( func
, 0, i
, j
);
2153 emit_inputs( func
, 0, i
, j
);
2156 case TGSI_INTERPOLATE_LINEAR
:
2157 emit_tempf( func
, 0, 0, TGSI_SWIZZLE_X
);
2158 emit_coef_dadx( func
, 1, i
, j
);
2159 emit_tempf( func
, 2, 0, TGSI_SWIZZLE_Y
);
2160 emit_coef_dady( func
, 3, i
, j
);
2161 emit_mul( func
, 0, 1 ); /* x * dadx */
2162 emit_coef_a0( func
, 4, i
, j
);
2163 emit_mul( func
, 2, 3 ); /* y * dady */
2164 emit_add( func
, 0, 4 ); /* x * dadx + a0 */
2165 emit_add( func
, 0, 2 ); /* x * dadx + y * dady + a0 */
2166 emit_inputs( func
, 0, i
, j
);
2169 case TGSI_INTERPOLATE_PERSPECTIVE
:
2170 emit_tempf( func
, 0, 0, TGSI_SWIZZLE_X
);
2171 emit_coef_dadx( func
, 1, i
, j
);
2172 emit_tempf( func
, 2, 0, TGSI_SWIZZLE_Y
);
2173 emit_coef_dady( func
, 3, i
, j
);
2174 emit_mul( func
, 0, 1 ); /* x * dadx */
2175 emit_tempf( func
, 4, 0, TGSI_SWIZZLE_W
);
2176 emit_coef_a0( func
, 5, i
, j
);
2177 emit_rcp( func
, 4, 4 ); /* 1.0 / w */
2178 emit_mul( func
, 2, 3 ); /* y * dady */
2179 emit_add( func
, 0, 5 ); /* x * dadx + a0 */
2180 emit_add( func
, 0, 2 ); /* x * dadx + y * dady + a0 */
2181 emit_mul( func
, 0, 4 ); /* (x * dadx + y * dady + a0) / w */
2182 emit_inputs( func
, 0, i
, j
);
2195 static void aos_to_soa( struct x86_function
*func
,
2201 struct x86_reg soa_input
= x86_make_reg( file_REG32
, reg_AX
);
2202 struct x86_reg aos_input
= x86_make_reg( file_REG32
, reg_BX
);
2203 struct x86_reg num_inputs
= x86_make_reg( file_REG32
, reg_CX
);
2204 struct x86_reg stride
= x86_make_reg( file_REG32
, reg_DX
);
2209 x86_push( func
, x86_make_reg( file_REG32
, reg_BX
) );
2211 x86_mov( func
, aos_input
, x86_fn_arg( func
, arg_aos
) );
2212 x86_mov( func
, soa_input
, x86_fn_arg( func
, arg_soa
) );
2213 x86_mov( func
, num_inputs
, x86_fn_arg( func
, arg_num
) );
2214 x86_mov( func
, stride
, x86_fn_arg( func
, arg_stride
) );
2217 inner_loop
= x86_get_label( func
);
2219 x86_push( func
, aos_input
);
2220 sse_movlps( func
, make_xmm( 0 ), x86_make_disp( aos_input
, 0 ) );
2221 sse_movlps( func
, make_xmm( 3 ), x86_make_disp( aos_input
, 8 ) );
2222 x86_add( func
, aos_input
, stride
);
2223 sse_movhps( func
, make_xmm( 0 ), x86_make_disp( aos_input
, 0 ) );
2224 sse_movhps( func
, make_xmm( 3 ), x86_make_disp( aos_input
, 8 ) );
2225 x86_add( func
, aos_input
, stride
);
2226 sse_movlps( func
, make_xmm( 1 ), x86_make_disp( aos_input
, 0 ) );
2227 sse_movlps( func
, make_xmm( 4 ), x86_make_disp( aos_input
, 8 ) );
2228 x86_add( func
, aos_input
, stride
);
2229 sse_movhps( func
, make_xmm( 1 ), x86_make_disp( aos_input
, 0 ) );
2230 sse_movhps( func
, make_xmm( 4 ), x86_make_disp( aos_input
, 8 ) );
2231 x86_pop( func
, aos_input
);
2233 sse_movaps( func
, make_xmm( 2 ), make_xmm( 0 ) );
2234 sse_movaps( func
, make_xmm( 5 ), make_xmm( 3 ) );
2235 sse_shufps( func
, make_xmm( 0 ), make_xmm( 1 ), 0x88 );
2236 sse_shufps( func
, make_xmm( 2 ), make_xmm( 1 ), 0xdd );
2237 sse_shufps( func
, make_xmm( 3 ), make_xmm( 4 ), 0x88 );
2238 sse_shufps( func
, make_xmm( 5 ), make_xmm( 4 ), 0xdd );
2240 sse_movups( func
, x86_make_disp( soa_input
, 0 ), make_xmm( 0 ) );
2241 sse_movups( func
, x86_make_disp( soa_input
, 16 ), make_xmm( 2 ) );
2242 sse_movups( func
, x86_make_disp( soa_input
, 32 ), make_xmm( 3 ) );
2243 sse_movups( func
, x86_make_disp( soa_input
, 48 ), make_xmm( 5 ) );
2245 /* Advance to next input */
2246 x86_lea( func
, aos_input
, x86_make_disp(aos_input
, 16) );
2247 x86_lea( func
, soa_input
, x86_make_disp(soa_input
, 64) );
2249 /* while --num_inputs */
2250 x86_dec( func
, num_inputs
);
2251 x86_jcc( func
, cc_NE
, inner_loop
);
2254 x86_pop( func
, aos_input
);
2257 static void soa_to_aos( struct x86_function
*func
, uint aos
, uint soa
, uint num
, uint stride
)
2259 struct x86_reg soa_output
;
2260 struct x86_reg aos_output
;
2261 struct x86_reg num_outputs
;
2262 struct x86_reg temp
;
2265 soa_output
= x86_make_reg( file_REG32
, reg_AX
);
2266 aos_output
= x86_make_reg( file_REG32
, reg_BX
);
2267 num_outputs
= x86_make_reg( file_REG32
, reg_CX
);
2268 temp
= x86_make_reg( file_REG32
, reg_DX
);
2271 x86_push( func
, aos_output
);
2273 x86_mov( func
, soa_output
, x86_fn_arg( func
, soa
) );
2274 x86_mov( func
, aos_output
, x86_fn_arg( func
, aos
) );
2275 x86_mov( func
, num_outputs
, x86_fn_arg( func
, num
) );
2278 inner_loop
= x86_get_label( func
);
2280 sse_movups( func
, make_xmm( 0 ), x86_make_disp( soa_output
, 0 ) );
2281 sse_movups( func
, make_xmm( 1 ), x86_make_disp( soa_output
, 16 ) );
2282 sse_movups( func
, make_xmm( 3 ), x86_make_disp( soa_output
, 32 ) );
2283 sse_movups( func
, make_xmm( 4 ), x86_make_disp( soa_output
, 48 ) );
2285 sse_movaps( func
, make_xmm( 2 ), make_xmm( 0 ) );
2286 sse_movaps( func
, make_xmm( 5 ), make_xmm( 3 ) );
2287 sse_unpcklps( func
, make_xmm( 0 ), make_xmm( 1 ) );
2288 sse_unpckhps( func
, make_xmm( 2 ), make_xmm( 1 ) );
2289 sse_unpcklps( func
, make_xmm( 3 ), make_xmm( 4 ) );
2290 sse_unpckhps( func
, make_xmm( 5 ), make_xmm( 4 ) );
2292 x86_mov( func
, temp
, x86_fn_arg( func
, stride
) );
2293 x86_push( func
, aos_output
);
2294 sse_movlps( func
, x86_make_disp( aos_output
, 0 ), make_xmm( 0 ) );
2295 sse_movlps( func
, x86_make_disp( aos_output
, 8 ), make_xmm( 3 ) );
2296 x86_add( func
, aos_output
, temp
);
2297 sse_movhps( func
, x86_make_disp( aos_output
, 0 ), make_xmm( 0 ) );
2298 sse_movhps( func
, x86_make_disp( aos_output
, 8 ), make_xmm( 3 ) );
2299 x86_add( func
, aos_output
, temp
);
2300 sse_movlps( func
, x86_make_disp( aos_output
, 0 ), make_xmm( 2 ) );
2301 sse_movlps( func
, x86_make_disp( aos_output
, 8 ), make_xmm( 5 ) );
2302 x86_add( func
, aos_output
, temp
);
2303 sse_movhps( func
, x86_make_disp( aos_output
, 0 ), make_xmm( 2 ) );
2304 sse_movhps( func
, x86_make_disp( aos_output
, 8 ), make_xmm( 5 ) );
2305 x86_pop( func
, aos_output
);
2307 /* Advance to next output */
2308 x86_lea( func
, aos_output
, x86_make_disp(aos_output
, 16) );
2309 x86_lea( func
, soa_output
, x86_make_disp(soa_output
, 64) );
2311 /* while --num_outputs */
2312 x86_dec( func
, num_outputs
);
2313 x86_jcc( func
, cc_NE
, inner_loop
);
2316 x86_pop( func
, aos_output
);
2320 * Translate a TGSI vertex/fragment shader to SSE2 code.
2321 * Slightly different things are done for vertex vs. fragment shaders.
2323 * Note that fragment shaders are responsible for interpolating shader
2324 * inputs. Because on x86 we have only 4 GP registers, and here we
2325 * have 5 shader arguments (input, output, const, temp and coef), the
2326 * code is split into two phases -- DECLARATION and INSTRUCTION phase.
2327 * GP register holding the output argument is aliased with the coeff
2328 * argument, as outputs are not needed in the DECLARATION phase.
2330 * \param tokens the TGSI input shader
2331 * \param func the output SSE code/function
2332 * \param immediates buffer to place immediates, later passed to SSE func
2333 * \return 1 for success, 0 if translation failed
2337 const struct tgsi_token
*tokens
,
2338 struct x86_function
*func
,
2339 float (*immediates
)[4],
2340 boolean do_swizzles
)
2342 struct tgsi_parse_context parse
;
2343 boolean instruction_phase
= FALSE
;
2345 uint num_immediates
= 0;
2349 func
->csr
= func
->store
;
2351 tgsi_parse_init( &parse
, tokens
);
2353 /* Can't just use EDI, EBX without save/restoring them:
2357 get_immediate_base() );
2365 * Different function args for vertex/fragment shaders:
2367 if (parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_FRAGMENT
) {
2368 /* DECLARATION phase, do not load output argument. */
2372 x86_fn_arg( func
, 1 ) );
2373 /* skipping outputs argument here */
2377 x86_fn_arg( func
, 3 ) );
2381 x86_fn_arg( func
, 4 ) );
2385 x86_fn_arg( func
, 5 ) );
2388 get_immediate_base(),
2389 x86_fn_arg( func
, 6 ) );
2392 assert(parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_VERTEX
);
2397 1, /* machine->input */
2399 8 ); /* input_stride */
2404 x86_fn_arg( func
, 1 ) );
2408 x86_fn_arg( func
, 2 ) );
2412 x86_fn_arg( func
, 3 ) );
2416 x86_fn_arg( func
, 4 ) );
2419 get_immediate_base(),
2420 x86_fn_arg( func
, 5 ) );
2423 while( !tgsi_parse_end_of_tokens( &parse
) && ok
) {
2424 tgsi_parse_token( &parse
);
2426 switch( parse
.FullToken
.Token
.Type
) {
2427 case TGSI_TOKEN_TYPE_DECLARATION
:
2428 if (parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_FRAGMENT
) {
2431 &parse
.FullToken
.FullDeclaration
);
2435 case TGSI_TOKEN_TYPE_INSTRUCTION
:
2436 if (parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_FRAGMENT
) {
2437 if( !instruction_phase
) {
2438 /* INSTRUCTION phase, overwrite coeff with output. */
2439 instruction_phase
= TRUE
;
2443 x86_fn_arg( func
, 2 ) );
2447 ok
= emit_instruction(
2449 &parse
.FullToken
.FullInstruction
);
2452 debug_printf("failed to translate tgsi opcode %d to SSE (%s)\n",
2453 parse
.FullToken
.FullInstruction
.Instruction
.Opcode
,
2454 parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_VERTEX
?
2455 "vertex shader" : "fragment shader");
2459 case TGSI_TOKEN_TYPE_IMMEDIATE
:
2460 /* simply copy the immediate values into the next immediates[] slot */
2462 const uint size
= parse
.FullToken
.FullImmediate
.Immediate
.Size
- 1;
2465 assert(num_immediates
< TGSI_EXEC_NUM_IMMEDIATES
);
2466 for( i
= 0; i
< size
; i
++ ) {
2467 immediates
[num_immediates
][i
] =
2468 parse
.FullToken
.FullImmediate
.u
.ImmediateFloat32
[i
].Float
;
2471 debug_printf("SSE FS immediate[%d] = %f %f %f %f\n",
2473 immediates
[num_immediates
][0],
2474 immediates
[num_immediates
][1],
2475 immediates
[num_immediates
][2],
2476 immediates
[num_immediates
][3]);
2488 if (parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_VERTEX
) {
2490 soa_to_aos( func
, 9, 2, 10, 11 );
2493 /* Can't just use EBX, EDI without save/restoring them:
2501 get_immediate_base() );
2505 tgsi_parse_free( &parse
);
2510 #endif /* PIPE_ARCH_X86 */