1 /**************************************************************************
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **************************************************************************/
28 #include "pipe/p_debug.h"
29 #include "pipe/p_shader_tokens.h"
30 #include "util/u_math.h"
31 #include "tgsi/tgsi_parse.h"
32 #include "tgsi/tgsi_util.h"
33 #include "tgsi_exec.h"
34 #include "tgsi_sse2.h"
36 #include "rtasm/rtasm_x86sse.h"
42 * This costs about 100fps (close to 10%) in gears:
44 #define HIGH_PRECISION 1
49 #define FOR_EACH_CHANNEL( CHAN )\
50 for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)
52 #define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
53 ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))
55 #define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
56 if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))
58 #define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
59 FOR_EACH_CHANNEL( CHAN )\
60 IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )
67 #define TEMP_ONE_I TGSI_EXEC_TEMP_ONE_I
68 #define TEMP_ONE_C TGSI_EXEC_TEMP_ONE_C
70 #define TEMP_R0 TGSI_EXEC_TEMP_R0
71 #define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
72 #define TEMP_EXEC_MASK_I TGSI_EXEC_MASK_I
73 #define TEMP_EXEC_MASK_C TGSI_EXEC_MASK_C
77 * X86 utility functions.
86 (enum x86_reg_name
) xmm
);
90 * X86 register mapping helpers.
94 get_const_base( void )
101 static struct x86_reg
102 get_input_base( void )
109 static struct x86_reg
110 get_output_base( void )
117 static struct x86_reg
118 get_temp_base( void )
125 static struct x86_reg
126 get_coef_base( void )
128 return get_output_base();
131 static struct x86_reg
132 get_immediate_base( void )
141 * Data access helpers.
145 static struct x86_reg
150 return x86_make_disp(
151 get_immediate_base(),
152 (vec
* 4 + chan
) * 4 );
155 static struct x86_reg
160 return x86_make_disp(
162 (vec
* 4 + chan
) * 4 );
165 static struct x86_reg
170 return x86_make_disp(
172 (vec
* 4 + chan
) * 16 );
175 static struct x86_reg
180 return x86_make_disp(
182 (vec
* 4 + chan
) * 16 );
185 static struct x86_reg
190 return x86_make_disp(
192 (vec
* 4 + chan
) * 16 );
195 static struct x86_reg
201 return x86_make_disp(
203 ((vec
* 3 + member
) * 4 + chan
) * 4 );
209 struct x86_function
*func
)
216 * Data fetch helpers.
220 * Copy a shader constant to xmm register
221 * \param xmm the destination xmm register
222 * \param vec the src const buffer index
223 * \param chan src channel to fetch (X, Y, Z or W)
227 struct x86_function
*func
,
236 /* 'vec' is the offset from the address register's value.
237 * We're loading CONST[ADDR+vec] into an xmm register.
239 struct x86_reg r0
= get_input_base();
240 struct x86_reg r1
= get_output_base();
243 assert( indirectFile
== TGSI_FILE_ADDRESS
);
244 assert( indirectIndex
== 0 );
246 x86_push( func
, r0
);
247 x86_push( func
, r1
);
250 * Loop over the four pixels or vertices in the quad.
251 * Get the value of the address (offset) register for pixel/vertex[i],
252 * add it to the src offset and index into the constant buffer.
253 * Note that we're working on SOA data.
254 * If any of the pixel/vertex execution channels are unused their
255 * values will be garbage. It's very important that we don't use
256 * those garbage values as indexes into the constant buffer since
257 * that'll cause segfaults.
258 * The solution is to bitwise-AND the offset with the execution mask
259 * register whose values are either 0 or ~0.
260 * The caller must setup the execution mask register to indicate
261 * which channels are valid/alive before running the shader.
262 * The execution mask will also figure into loops and conditionals
265 for (i
= 0; i
< QUAD_SIZE
; i
++) {
266 /* r1 = address register[i] */
267 x86_mov( func
, r1
, x86_make_disp( get_temp( TEMP_ADDR
, CHAN_X
), i
* 4 ) );
268 /* r0 = execution mask[i] */
269 x86_mov( func
, r0
, x86_make_disp( get_temp( TEMP_EXEC_MASK_I
, TEMP_EXEC_MASK_C
), i
* 4 ) );
271 x86_and( func
, r1
, r0
);
272 /* r0 = 'vec', the offset */
273 x86_lea( func
, r0
, get_const( vec
, chan
) );
275 /* Quick hack to multiply r1 by 16 -- need to add SHL to rtasm.
277 x86_add( func
, r1
, r1
);
278 x86_add( func
, r1
, r1
);
279 x86_add( func
, r1
, r1
);
280 x86_add( func
, r1
, r1
);
282 x86_add( func
, r0
, r1
); /* r0 = r0 + r1 */
283 x86_mov( func
, r1
, x86_deref( r0
) );
284 x86_mov( func
, x86_make_disp( get_temp( TEMP_R0
, CHAN_X
), i
* 4 ), r1
);
293 get_temp( TEMP_R0
, CHAN_X
) );
296 /* 'vec' is the index into the src register file, such as TEMP[vec] */
302 get_const( vec
, chan
) );
307 SHUF( 0, 0, 0, 0 ) );
313 struct x86_function
*func
,
321 get_immediate( vec
, chan
) );
326 SHUF( 0, 0, 0, 0 ) );
331 * Copy a shader input to xmm register
332 * \param xmm the destination xmm register
333 * \param vec the src input attrib
334 * \param chan src channel to fetch (X, Y, Z or W)
338 struct x86_function
*func
,
346 get_input( vec
, chan
) );
350 * Store an xmm register to a shader output
351 * \param xmm the source xmm register
352 * \param vec the dest output attrib
353 * \param chan src dest channel to store (X, Y, Z or W)
357 struct x86_function
*func
,
364 get_output( vec
, chan
),
369 * Copy a shader temporary to xmm register
370 * \param xmm the destination xmm register
371 * \param vec the src temp register
372 * \param chan src channel to fetch (X, Y, Z or W)
376 struct x86_function
*func
,
384 get_temp( vec
, chan
) );
388 * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
389 * \param xmm the destination xmm register
390 * \param vec the src input/attribute coefficient index
391 * \param chan src channel to fetch (X, Y, Z or W)
392 * \param member 0=a0, 1=dadx, 2=dady
396 struct x86_function
*func
,
405 get_coef( vec
, chan
, member
) );
410 SHUF( 0, 0, 0, 0 ) );
414 * Data store helpers.
419 struct x86_function
*func
,
426 get_input( vec
, chan
),
432 struct x86_function
*func
,
439 get_temp( vec
, chan
),
445 struct x86_function
*func
,
455 vec
+ TGSI_EXEC_TEMP_ADDR
,
460 * Coefficent fetch helpers.
465 struct x86_function
*func
,
480 struct x86_function
*func
,
495 struct x86_function
*func
,
509 * Function call helpers.
514 struct x86_function
*func
)
518 x86_make_reg( file_REG32
, reg_AX
) );
521 x86_make_reg( file_REG32
, reg_CX
) );
524 x86_make_reg( file_REG32
, reg_DX
) );
529 struct x86_function
*func
)
531 /* Restore GP registers in a reverse order.
535 x86_make_reg( file_REG32
, reg_DX
) );
538 x86_make_reg( file_REG32
, reg_CX
) );
541 x86_make_reg( file_REG32
, reg_AX
) );
546 struct x86_function
*func
,
548 void (PIPE_CDECL
*code
)() )
552 get_temp( TEMP_R0
, 0 ),
553 make_xmm( xmm_dst
) );
559 struct x86_reg ecx
= x86_make_reg( file_REG32
, reg_CX
);
564 get_temp( TEMP_R0
, 0 ) );
566 x86_push( func
, ecx
);
567 x86_mov_reg_imm( func
, ecx
, (unsigned long) code
);
568 x86_call( func
, ecx
);
579 get_temp( TEMP_R0
, 0 ) );
583 emit_func_call_dst_src(
584 struct x86_function
*func
,
587 void (PIPE_CDECL
*code
)() )
591 get_temp( TEMP_R0
, 1 ),
592 make_xmm( xmm_src
) );
601 * Low-level instruction translators.
606 struct x86_function
*func
,
613 TGSI_EXEC_TEMP_7FFFFFFF_I
,
614 TGSI_EXEC_TEMP_7FFFFFFF_C
) );
619 struct x86_function
*func
,
626 make_xmm( xmm_src
) );
629 static void PIPE_CDECL
633 store
[0] = cosf( store
[0] );
634 store
[1] = cosf( store
[1] );
635 store
[2] = cosf( store
[2] );
636 store
[3] = cosf( store
[3] );
641 struct x86_function
*func
,
650 static void PIPE_CDECL
655 store
[0] = util_fast_exp2( store
[0] );
656 store
[1] = util_fast_exp2( store
[1] );
657 store
[2] = util_fast_exp2( store
[2] );
658 store
[3] = util_fast_exp2( store
[3] );
660 store
[0] = powf( 2.0f
, store
[0] );
661 store
[1] = powf( 2.0f
, store
[1] );
662 store
[2] = powf( 2.0f
, store
[2] );
663 store
[3] = powf( 2.0f
, store
[3] );
669 struct x86_function
*func
,
680 struct x86_function
*func
,
691 struct x86_function
*func
,
700 static void PIPE_CDECL
704 store
[0] = floorf( store
[0] );
705 store
[1] = floorf( store
[1] );
706 store
[2] = floorf( store
[2] );
707 store
[3] = floorf( store
[3] );
712 struct x86_function
*func
,
721 static void PIPE_CDECL
725 store
[0] -= floorf( store
[0] );
726 store
[1] -= floorf( store
[1] );
727 store
[2] -= floorf( store
[2] );
728 store
[3] -= floorf( store
[3] );
733 struct x86_function
*func
,
742 static void PIPE_CDECL
746 store
[0] = util_fast_log2( store
[0] );
747 store
[1] = util_fast_log2( store
[1] );
748 store
[2] = util_fast_log2( store
[2] );
749 store
[3] = util_fast_log2( store
[3] );
754 struct x86_function
*func
,
765 struct x86_function
*func
,
772 make_xmm( xmm_src
) );
776 emit_mul (struct x86_function
*func
,
783 make_xmm( xmm_src
) );
788 struct x86_function
*func
,
795 TGSI_EXEC_TEMP_80000000_I
,
796 TGSI_EXEC_TEMP_80000000_C
) );
799 static void PIPE_CDECL
804 store
[0] = util_fast_pow( store
[0], store
[4] );
805 store
[1] = util_fast_pow( store
[1], store
[5] );
806 store
[2] = util_fast_pow( store
[2], store
[6] );
807 store
[3] = util_fast_pow( store
[3], store
[7] );
809 store
[0] = powf( store
[0], store
[4] );
810 store
[1] = powf( store
[1], store
[5] );
811 store
[2] = powf( store
[2], store
[6] );
812 store
[3] = powf( store
[3], store
[7] );
818 struct x86_function
*func
,
822 emit_func_call_dst_src(
831 struct x86_function
*func
,
835 /* On Intel CPUs at least, this is only accurate to 12 bits -- not
836 * good enough. Need to either emit a proper divide or use the
837 * iterative technique described below in emit_rsqrt().
842 make_xmm( xmm_src
) );
845 static void PIPE_CDECL
849 store
[0] = floorf( store
[0] + 0.5f
);
850 store
[1] = floorf( store
[1] + 0.5f
);
851 store
[2] = floorf( store
[2] + 0.5f
);
852 store
[3] = floorf( store
[3] + 0.5f
);
857 struct x86_function
*func
,
870 struct x86_function
*func
,
875 /* Although rsqrtps() and rcpps() are low precision on some/all SSE
876 * implementations, it is possible to improve its precision at
877 * fairly low cost, using a newton/raphson step, as below:
879 * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
880 * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
882 * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
885 struct x86_reg dst
= make_xmm( xmm_dst
);
886 struct x86_reg src
= make_xmm( xmm_src
);
887 struct x86_reg tmp0
= make_xmm( 2 );
888 struct x86_reg tmp1
= make_xmm( 3 );
890 assert( xmm_dst
!= xmm_src
);
891 assert( xmm_dst
!= 2 && xmm_dst
!= 3 );
892 assert( xmm_src
!= 2 && xmm_src
!= 3 );
894 sse_movaps( func
, dst
, get_temp( TGSI_EXEC_TEMP_HALF_I
, TGSI_EXEC_TEMP_HALF_C
) );
895 sse_movaps( func
, tmp0
, get_temp( TGSI_EXEC_TEMP_THREE_I
, TGSI_EXEC_TEMP_THREE_C
) );
896 sse_rsqrtps( func
, tmp1
, src
);
897 sse_mulps( func
, src
, tmp1
);
898 sse_mulps( func
, dst
, tmp1
);
899 sse_mulps( func
, src
, tmp1
);
900 sse_subps( func
, tmp0
, src
);
901 sse_mulps( func
, dst
, tmp0
);
904 /* On Intel CPUs at least, this is only accurate to 12 bits -- not
910 make_xmm( xmm_src
) );
916 struct x86_function
*func
,
923 TGSI_EXEC_TEMP_80000000_I
,
924 TGSI_EXEC_TEMP_80000000_C
) );
927 static void PIPE_CDECL
931 store
[0] = sinf( store
[0] );
932 store
[1] = sinf( store
[1] );
933 store
[2] = sinf( store
[2] );
934 store
[3] = sinf( store
[3] );
938 emit_sin (struct x86_function
*func
,
949 struct x86_function
*func
,
956 make_xmm( xmm_src
) );
965 struct x86_function
*func
,
967 const struct tgsi_full_src_register
*reg
,
968 const unsigned chan_index
)
970 unsigned swizzle
= tgsi_util_get_full_src_register_extswizzle( reg
, chan_index
);
973 case TGSI_EXTSWIZZLE_X
:
974 case TGSI_EXTSWIZZLE_Y
:
975 case TGSI_EXTSWIZZLE_Z
:
976 case TGSI_EXTSWIZZLE_W
:
977 switch (reg
->SrcRegister
.File
) {
978 case TGSI_FILE_CONSTANT
:
982 reg
->SrcRegister
.Index
,
984 reg
->SrcRegister
.Indirect
,
985 reg
->SrcRegisterInd
.File
,
986 reg
->SrcRegisterInd
.Index
);
989 case TGSI_FILE_IMMEDIATE
:
993 reg
->SrcRegister
.Index
,
997 case TGSI_FILE_INPUT
:
1001 reg
->SrcRegister
.Index
,
1005 case TGSI_FILE_TEMPORARY
:
1009 reg
->SrcRegister
.Index
,
1018 case TGSI_EXTSWIZZLE_ZERO
:
1022 TGSI_EXEC_TEMP_00000000_I
,
1023 TGSI_EXEC_TEMP_00000000_C
);
1026 case TGSI_EXTSWIZZLE_ONE
:
1038 switch( tgsi_util_get_full_src_register_sign_mode( reg
, chan_index
) ) {
1039 case TGSI_UTIL_SIGN_CLEAR
:
1040 emit_abs( func
, xmm
);
1043 case TGSI_UTIL_SIGN_SET
:
1044 emit_setsign( func
, xmm
);
1047 case TGSI_UTIL_SIGN_TOGGLE
:
1048 emit_neg( func
, xmm
);
1051 case TGSI_UTIL_SIGN_KEEP
:
1056 #define FETCH( FUNC, INST, XMM, INDEX, CHAN )\
1057 emit_fetch( FUNC, XMM, &(INST).FullSrcRegisters[INDEX], CHAN )
1065 struct x86_function
*func
,
1067 const struct tgsi_full_dst_register
*reg
,
1068 const struct tgsi_full_instruction
*inst
,
1069 unsigned chan_index
)
1071 switch( reg
->DstRegister
.File
) {
1072 case TGSI_FILE_OUTPUT
:
1076 reg
->DstRegister
.Index
,
1080 case TGSI_FILE_TEMPORARY
:
1084 reg
->DstRegister
.Index
,
1088 case TGSI_FILE_ADDRESS
:
1092 reg
->DstRegister
.Index
,
1100 switch( inst
->Instruction
.Saturate
) {
1104 case TGSI_SAT_ZERO_ONE
:
1108 case TGSI_SAT_MINUS_PLUS_ONE
:
1114 #define STORE( FUNC, INST, XMM, INDEX, CHAN )\
1115 emit_store( FUNC, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN )
1118 * High-level instruction translators.
1123 struct x86_function
*func
,
1124 const struct tgsi_full_src_register
*reg
)
1126 unsigned uniquemask
;
1127 unsigned registers
[4];
1128 unsigned nextregister
= 0;
1129 unsigned firstchan
= ~0;
1130 unsigned chan_index
;
1132 /* This mask stores component bits that were already tested. Note that
1133 * we test if the value is less than zero, so 1.0 and 0.0 need not to be
1135 uniquemask
= (1 << TGSI_EXTSWIZZLE_ZERO
) | (1 << TGSI_EXTSWIZZLE_ONE
);
1137 FOR_EACH_CHANNEL( chan_index
) {
1140 /* unswizzle channel */
1141 swizzle
= tgsi_util_get_full_src_register_extswizzle(
1145 /* check if the component has not been already tested */
1146 if( !(uniquemask
& (1 << swizzle
)) ) {
1147 uniquemask
|= 1 << swizzle
;
1149 /* allocate register */
1150 registers
[chan_index
] = nextregister
;
1158 /* mark the first channel used */
1159 if( firstchan
== ~0 ) {
1160 firstchan
= chan_index
;
1167 x86_make_reg( file_REG32
, reg_AX
) );
1170 x86_make_reg( file_REG32
, reg_DX
) );
1172 FOR_EACH_CHANNEL( chan_index
) {
1173 if( uniquemask
& (1 << chan_index
) ) {
1176 make_xmm( registers
[chan_index
] ),
1178 TGSI_EXEC_TEMP_00000000_I
,
1179 TGSI_EXEC_TEMP_00000000_C
),
1182 if( chan_index
== firstchan
) {
1185 x86_make_reg( file_REG32
, reg_AX
),
1186 make_xmm( registers
[chan_index
] ) );
1191 x86_make_reg( file_REG32
, reg_DX
),
1192 make_xmm( registers
[chan_index
] ) );
1195 x86_make_reg( file_REG32
, reg_AX
),
1196 x86_make_reg( file_REG32
, reg_DX
) );
1204 TGSI_EXEC_TEMP_KILMASK_I
,
1205 TGSI_EXEC_TEMP_KILMASK_C
),
1206 x86_make_reg( file_REG32
, reg_AX
) );
1210 x86_make_reg( file_REG32
, reg_DX
) );
1213 x86_make_reg( file_REG32
, reg_AX
) );
1219 struct x86_function
*func
)
1221 /* XXX todo / fix me */
1227 struct x86_function
*func
,
1228 struct tgsi_full_instruction
*inst
,
1231 unsigned chan_index
;
1233 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1234 FETCH( func
, *inst
, 0, 0, chan_index
);
1235 FETCH( func
, *inst
, 1, 1, chan_index
);
1247 STORE( func
, *inst
, 0, 0, chan_index
);
1253 struct x86_function
*func
,
1254 struct tgsi_full_instruction
*inst
)
1256 unsigned chan_index
;
1258 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1259 FETCH( func
, *inst
, 0, 0, chan_index
);
1260 FETCH( func
, *inst
, 1, 1, chan_index
);
1261 FETCH( func
, *inst
, 2, 2, chan_index
);
1266 TGSI_EXEC_TEMP_00000000_I
,
1267 TGSI_EXEC_TEMP_00000000_C
),
1281 STORE( func
, *inst
, 0, 0, chan_index
);
1287 struct x86_function
*func
,
1288 struct tgsi_full_instruction
*inst
)
1290 unsigned chan_index
;
1292 switch (inst
->Instruction
.Opcode
) {
1293 case TGSI_OPCODE_ARL
:
1294 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1295 FETCH( func
, *inst
, 0, 0, chan_index
);
1296 emit_f2it( func
, 0 );
1297 STORE( func
, *inst
, 0, 0, chan_index
);
1301 case TGSI_OPCODE_MOV
:
1302 case TGSI_OPCODE_SWZ
:
1303 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1304 FETCH( func
, *inst
, 0, 0, chan_index
);
1305 STORE( func
, *inst
, 0, 0, chan_index
);
1309 case TGSI_OPCODE_LIT
:
1310 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1311 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) ) {
1317 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ) {
1318 STORE( func
, *inst
, 0, 0, CHAN_X
);
1320 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) ) {
1321 STORE( func
, *inst
, 0, 0, CHAN_W
);
1324 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ||
1325 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
1326 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ) {
1327 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1332 TGSI_EXEC_TEMP_00000000_I
,
1333 TGSI_EXEC_TEMP_00000000_C
) );
1334 STORE( func
, *inst
, 0, 0, CHAN_Y
);
1336 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
1337 /* XMM[1] = SrcReg[0].yyyy */
1338 FETCH( func
, *inst
, 1, 0, CHAN_Y
);
1339 /* XMM[1] = max(XMM[1], 0) */
1344 TGSI_EXEC_TEMP_00000000_I
,
1345 TGSI_EXEC_TEMP_00000000_C
) );
1346 /* XMM[2] = SrcReg[0].wwww */
1347 FETCH( func
, *inst
, 2, 0, CHAN_W
);
1348 /* XMM[2] = min(XMM[2], 128.0) */
1353 TGSI_EXEC_TEMP_128_I
,
1354 TGSI_EXEC_TEMP_128_C
) );
1355 /* XMM[2] = max(XMM[2], -128.0) */
1360 TGSI_EXEC_TEMP_MINUS_128_I
,
1361 TGSI_EXEC_TEMP_MINUS_128_C
) );
1362 emit_pow( func
, 1, 2 );
1363 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1377 STORE( func
, *inst
, 2, 0, CHAN_Z
);
1382 case TGSI_OPCODE_RCP
:
1383 /* TGSI_OPCODE_RECIP */
1384 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1385 emit_rcp( func
, 0, 0 );
1386 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1387 STORE( func
, *inst
, 0, 0, chan_index
);
1391 case TGSI_OPCODE_RSQ
:
1392 /* TGSI_OPCODE_RECIPSQRT */
1393 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1394 emit_rsqrt( func
, 1, 0 );
1395 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1396 STORE( func
, *inst
, 1, 0, chan_index
);
1400 case TGSI_OPCODE_EXP
:
1401 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1402 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ||
1403 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
1404 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1405 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1406 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
1407 emit_MOV( func
, 1, 0 );
1408 emit_flr( func
, 1 );
1409 /* dst.x = ex2(floor(src.x)) */
1410 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
)) {
1411 emit_MOV( func
, 2, 1 );
1412 emit_ex2( func
, 2 );
1413 STORE( func
, *inst
, 2, 0, CHAN_X
);
1415 /* dst.y = src.x - floor(src.x) */
1416 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
1417 emit_MOV( func
, 2, 0 );
1418 emit_sub( func
, 2, 1 );
1419 STORE( func
, *inst
, 2, 0, CHAN_Y
);
1422 /* dst.z = ex2(src.x) */
1423 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
1424 emit_ex2( func
, 0 );
1425 STORE( func
, *inst
, 0, 0, CHAN_Z
);
1429 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
)) {
1430 emit_tempf( func
, 0, TEMP_ONE_I
, TEMP_ONE_C
);
1431 STORE( func
, *inst
, 0, 0, CHAN_W
);
1435 case TGSI_OPCODE_LOG
:
1436 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1437 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ||
1438 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
1439 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1440 emit_abs( func
, 0 );
1441 emit_MOV( func
, 1, 0 );
1442 emit_lg2( func
, 1 );
1443 /* dst.z = lg2(abs(src.x)) */
1444 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
1445 STORE( func
, *inst
, 1, 0, CHAN_Z
);
1447 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1448 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
1449 emit_flr( func
, 1 );
1450 /* dst.x = floor(lg2(abs(src.x))) */
1451 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
)) {
1452 STORE( func
, *inst
, 1, 0, CHAN_X
);
1454 /* dst.x = abs(src)/ex2(floor(lg2(abs(src.x)))) */
1455 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
1456 emit_ex2( func
, 1 );
1457 emit_rcp( func
, 1, 1 );
1458 emit_mul( func
, 0, 1 );
1459 STORE( func
, *inst
, 0, 0, CHAN_Y
);
1464 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
)) {
1465 emit_tempf( func
, 0, TEMP_ONE_I
, TEMP_ONE_C
);
1466 STORE( func
, *inst
, 0, 0, CHAN_W
);
1470 case TGSI_OPCODE_MUL
:
1471 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1472 FETCH( func
, *inst
, 0, 0, chan_index
);
1473 FETCH( func
, *inst
, 1, 1, chan_index
);
1474 emit_mul( func
, 0, 1 );
1475 STORE( func
, *inst
, 0, 0, chan_index
);
1479 case TGSI_OPCODE_ADD
:
1480 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1481 FETCH( func
, *inst
, 0, 0, chan_index
);
1482 FETCH( func
, *inst
, 1, 1, chan_index
);
1483 emit_add( func
, 0, 1 );
1484 STORE( func
, *inst
, 0, 0, chan_index
);
1488 case TGSI_OPCODE_DP3
:
1489 /* TGSI_OPCODE_DOT3 */
1490 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1491 FETCH( func
, *inst
, 1, 1, CHAN_X
);
1492 emit_mul( func
, 0, 1 );
1493 FETCH( func
, *inst
, 1, 0, CHAN_Y
);
1494 FETCH( func
, *inst
, 2, 1, CHAN_Y
);
1495 emit_mul( func
, 1, 2 );
1496 emit_add( func
, 0, 1 );
1497 FETCH( func
, *inst
, 1, 0, CHAN_Z
);
1498 FETCH( func
, *inst
, 2, 1, CHAN_Z
);
1499 emit_mul( func
, 1, 2 );
1500 emit_add( func
, 0, 1 );
1501 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1502 STORE( func
, *inst
, 0, 0, chan_index
);
1506 case TGSI_OPCODE_DP4
:
1507 /* TGSI_OPCODE_DOT4 */
1508 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1509 FETCH( func
, *inst
, 1, 1, CHAN_X
);
1510 emit_mul( func
, 0, 1 );
1511 FETCH( func
, *inst
, 1, 0, CHAN_Y
);
1512 FETCH( func
, *inst
, 2, 1, CHAN_Y
);
1513 emit_mul( func
, 1, 2 );
1514 emit_add( func
, 0, 1 );
1515 FETCH( func
, *inst
, 1, 0, CHAN_Z
);
1516 FETCH( func
, *inst
, 2, 1, CHAN_Z
);
1517 emit_mul(func
, 1, 2 );
1518 emit_add(func
, 0, 1 );
1519 FETCH( func
, *inst
, 1, 0, CHAN_W
);
1520 FETCH( func
, *inst
, 2, 1, CHAN_W
);
1521 emit_mul( func
, 1, 2 );
1522 emit_add( func
, 0, 1 );
1523 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1524 STORE( func
, *inst
, 0, 0, chan_index
);
1528 case TGSI_OPCODE_DST
:
1529 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) {
1535 STORE( func
, *inst
, 0, 0, CHAN_X
);
1537 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) {
1538 FETCH( func
, *inst
, 0, 0, CHAN_Y
);
1539 FETCH( func
, *inst
, 1, 1, CHAN_Y
);
1540 emit_mul( func
, 0, 1 );
1541 STORE( func
, *inst
, 0, 0, CHAN_Y
);
1543 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) {
1544 FETCH( func
, *inst
, 0, 0, CHAN_Z
);
1545 STORE( func
, *inst
, 0, 0, CHAN_Z
);
1547 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) {
1548 FETCH( func
, *inst
, 0, 1, CHAN_W
);
1549 STORE( func
, *inst
, 0, 0, CHAN_W
);
1553 case TGSI_OPCODE_MIN
:
1554 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1555 FETCH( func
, *inst
, 0, 0, chan_index
);
1556 FETCH( func
, *inst
, 1, 1, chan_index
);
1561 STORE( func
, *inst
, 0, 0, chan_index
);
1565 case TGSI_OPCODE_MAX
:
1566 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1567 FETCH( func
, *inst
, 0, 0, chan_index
);
1568 FETCH( func
, *inst
, 1, 1, chan_index
);
1573 STORE( func
, *inst
, 0, 0, chan_index
);
1577 case TGSI_OPCODE_SLT
:
1578 /* TGSI_OPCODE_SETLT */
1579 emit_setcc( func
, inst
, cc_LessThan
);
1582 case TGSI_OPCODE_SGE
:
1583 /* TGSI_OPCODE_SETGE */
1584 emit_setcc( func
, inst
, cc_NotLessThan
);
1587 case TGSI_OPCODE_MAD
:
1588 /* TGSI_OPCODE_MADD */
1589 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1590 FETCH( func
, *inst
, 0, 0, chan_index
);
1591 FETCH( func
, *inst
, 1, 1, chan_index
);
1592 FETCH( func
, *inst
, 2, 2, chan_index
);
1593 emit_mul( func
, 0, 1 );
1594 emit_add( func
, 0, 2 );
1595 STORE( func
, *inst
, 0, 0, chan_index
);
1599 case TGSI_OPCODE_SUB
:
1600 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1601 FETCH( func
, *inst
, 0, 0, chan_index
);
1602 FETCH( func
, *inst
, 1, 1, chan_index
);
1603 emit_sub( func
, 0, 1 );
1604 STORE( func
, *inst
, 0, 0, chan_index
);
1608 case TGSI_OPCODE_LERP
:
1609 /* TGSI_OPCODE_LRP */
1610 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1611 FETCH( func
, *inst
, 0, 0, chan_index
);
1612 FETCH( func
, *inst
, 1, 1, chan_index
);
1613 FETCH( func
, *inst
, 2, 2, chan_index
);
1614 emit_sub( func
, 1, 2 );
1615 emit_mul( func
, 0, 1 );
1616 emit_add( func
, 0, 2 );
1617 STORE( func
, *inst
, 0, 0, chan_index
);
1621 case TGSI_OPCODE_CND
:
1625 case TGSI_OPCODE_CND0
:
1629 case TGSI_OPCODE_DOT2ADD
:
1630 /* TGSI_OPCODE_DP2A */
1634 case TGSI_OPCODE_INDEX
:
1638 case TGSI_OPCODE_NEGATE
:
1642 case TGSI_OPCODE_FRAC
:
1643 /* TGSI_OPCODE_FRC */
1644 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1645 FETCH( func
, *inst
, 0, 0, chan_index
);
1646 emit_frc( func
, 0 );
1647 STORE( func
, *inst
, 0, 0, chan_index
);
1651 case TGSI_OPCODE_CLAMP
:
1655 case TGSI_OPCODE_FLOOR
:
1656 /* TGSI_OPCODE_FLR */
1657 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1658 FETCH( func
, *inst
, 0, 0, chan_index
);
1659 emit_flr( func
, 0 );
1660 STORE( func
, *inst
, 0, 0, chan_index
);
1664 case TGSI_OPCODE_ROUND
:
1665 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1666 FETCH( func
, *inst
, 0, 0, chan_index
);
1667 emit_rnd( func
, 0, 0 );
1668 STORE( func
, *inst
, 0, 0, chan_index
);
1672 case TGSI_OPCODE_EXPBASE2
:
1673 /* TGSI_OPCODE_EX2 */
1674 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1675 emit_ex2( func
, 0 );
1676 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1677 STORE( func
, *inst
, 0, 0, chan_index
);
1681 case TGSI_OPCODE_LOGBASE2
:
1682 /* TGSI_OPCODE_LG2 */
1683 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1684 emit_lg2( func
, 0 );
1685 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1686 STORE( func
, *inst
, 0, 0, chan_index
);
1690 case TGSI_OPCODE_POWER
:
1691 /* TGSI_OPCODE_POW */
1692 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1693 FETCH( func
, *inst
, 1, 1, CHAN_X
);
1694 emit_pow( func
, 0, 1 );
1695 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1696 STORE( func
, *inst
, 0, 0, chan_index
);
1700 case TGSI_OPCODE_CROSSPRODUCT
:
1701 /* TGSI_OPCODE_XPD */
1702 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1703 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ) {
1704 FETCH( func
, *inst
, 1, 1, CHAN_Z
);
1705 FETCH( func
, *inst
, 3, 0, CHAN_Z
);
1707 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1708 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
1709 FETCH( func
, *inst
, 0, 0, CHAN_Y
);
1710 FETCH( func
, *inst
, 4, 1, CHAN_Y
);
1712 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) {
1713 emit_MOV( func
, 2, 0 );
1714 emit_mul( func
, 2, 1 );
1715 emit_MOV( func
, 5, 3 );
1716 emit_mul( func
, 5, 4 );
1717 emit_sub( func
, 2, 5 );
1718 STORE( func
, *inst
, 2, 0, CHAN_X
);
1720 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ||
1721 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
1722 FETCH( func
, *inst
, 2, 1, CHAN_X
);
1723 FETCH( func
, *inst
, 5, 0, CHAN_X
);
1725 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) {
1726 emit_mul( func
, 3, 2 );
1727 emit_mul( func
, 1, 5 );
1728 emit_sub( func
, 3, 1 );
1729 STORE( func
, *inst
, 3, 0, CHAN_Y
);
1731 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) {
1732 emit_mul( func
, 5, 4 );
1733 emit_mul( func
, 0, 2 );
1734 emit_sub( func
, 5, 0 );
1735 STORE( func
, *inst
, 5, 0, CHAN_Z
);
1737 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) {
1743 STORE( func
, *inst
, 0, 0, CHAN_W
);
1747 case TGSI_OPCODE_MULTIPLYMATRIX
:
1751 case TGSI_OPCODE_ABS
:
1752 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1753 FETCH( func
, *inst
, 0, 0, chan_index
);
1754 emit_abs( func
, 0) ;
1756 STORE( func
, *inst
, 0, 0, chan_index
);
1760 case TGSI_OPCODE_RCC
:
1764 case TGSI_OPCODE_DPH
:
1765 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1766 FETCH( func
, *inst
, 1, 1, CHAN_X
);
1767 emit_mul( func
, 0, 1 );
1768 FETCH( func
, *inst
, 1, 0, CHAN_Y
);
1769 FETCH( func
, *inst
, 2, 1, CHAN_Y
);
1770 emit_mul( func
, 1, 2 );
1771 emit_add( func
, 0, 1 );
1772 FETCH( func
, *inst
, 1, 0, CHAN_Z
);
1773 FETCH( func
, *inst
, 2, 1, CHAN_Z
);
1774 emit_mul( func
, 1, 2 );
1775 emit_add( func
, 0, 1 );
1776 FETCH( func
, *inst
, 1, 1, CHAN_W
);
1777 emit_add( func
, 0, 1 );
1778 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1779 STORE( func
, *inst
, 0, 0, chan_index
);
1783 case TGSI_OPCODE_COS
:
1784 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1785 emit_cos( func
, 0 );
1786 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1787 STORE( func
, *inst
, 0, 0, chan_index
);
1791 case TGSI_OPCODE_DDX
:
1795 case TGSI_OPCODE_DDY
:
1799 case TGSI_OPCODE_KILP
:
1800 /* predicated kill */
1802 return 0; /* XXX fix me */
1805 case TGSI_OPCODE_KIL
:
1806 /* conditional kill */
1807 emit_kil( func
, &inst
->FullSrcRegisters
[0] );
1810 case TGSI_OPCODE_PK2H
:
1814 case TGSI_OPCODE_PK2US
:
1818 case TGSI_OPCODE_PK4B
:
1822 case TGSI_OPCODE_PK4UB
:
1826 case TGSI_OPCODE_RFL
:
1830 case TGSI_OPCODE_SEQ
:
1834 case TGSI_OPCODE_SFL
:
1838 case TGSI_OPCODE_SGT
:
/* Tail of the emit_instruction() opcode switch -- the function header is
 * above this chunk.  Each case emits SSE code implementing one TGSI
 * opcode via the FETCH/STORE macros and the emit_* helpers.
 * NOTE(review): the extraction dropped interior lines (break statements,
 * closing braces, failure returns); do not assume this text is the
 * complete original.
 */
/* SIN: sin(src0.x), result replicated to every write-enabled channel. */
1842 case TGSI_OPCODE_SIN
:
1843 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1844 emit_sin( func
, 0 );
1845 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1846 STORE( func
, *inst
, 0, 0, chan_index
);
/* SLE/SNE/STR set-on-compare opcodes: bodies not visible in this chunk. */
1850 case TGSI_OPCODE_SLE
:
1854 case TGSI_OPCODE_SNE
:
1858 case TGSI_OPCODE_STR
:
1862 case TGSI_OPCODE_TEX
:
1864 /* Disable dummy texture code:
1871 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1872 STORE( func
, *inst
, 0, 0, chan_index
);
// TXD and the pack/unpack opcodes below have no SSE implementation
// visible here; presumably they fall through to an unsupported-opcode
// path -- confirm against the full file.
1880 case TGSI_OPCODE_TXD
:
1884 case TGSI_OPCODE_UP2H
:
1888 case TGSI_OPCODE_UP2US
:
1892 case TGSI_OPCODE_UP4B
:
1896 case TGSI_OPCODE_UP4UB
:
1900 case TGSI_OPCODE_X2D
:
1904 case TGSI_OPCODE_ARA
:
// ARR: for each enabled channel, round to nearest (emit_rnd), convert
// float->int (emit_f2it), then store.
1908 case TGSI_OPCODE_ARR
:
1909 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1910 FETCH( func
, *inst
, 0, 0, chan_index
);
1911 emit_rnd( func
, 0, 0 );
1912 emit_f2it( func
, 0 );
1913 STORE( func
, *inst
, 0, 0, chan_index
);
// Flow-control opcodes: no SSE implementation visible in this chunk.
1917 case TGSI_OPCODE_BRA
:
1921 case TGSI_OPCODE_CAL
:
1925 case TGSI_OPCODE_RET
:
1929 case TGSI_OPCODE_END
:
1932 case TGSI_OPCODE_SSG
:
// CMP: whole-instruction helper; per-channel select handled in emit_cmp().
1936 case TGSI_OPCODE_CMP
:
1937 emit_cmp (func
, inst
);
// SCS: dst.x = cos(src0.x), dst.y = sin(src0.x); dst.z is written from
// the 0.0 constant (TGSI_EXEC_TEMP_00000000).  The lines producing the
// value stored to dst.w were dropped by extraction -- presumably the
// 1.0 constant; confirm against the full file.
1940 case TGSI_OPCODE_SCS
:
1941 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) {
1942 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1943 emit_cos( func
, 0 );
1944 STORE( func
, *inst
, 0, 0, CHAN_X
);
1946 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) {
1947 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1948 emit_sin( func
, 0 );
1949 STORE( func
, *inst
, 0, 0, CHAN_Y
);
1951 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) {
1955 TGSI_EXEC_TEMP_00000000_I
,
1956 TGSI_EXEC_TEMP_00000000_C
);
1957 STORE( func
, *inst
, 0, 0, CHAN_Z
);
1959 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) {
1965 STORE( func
, *inst
, 0, 0, CHAN_W
);
// TXB/NRM/DIV/DP2/TXL: not implemented in the visible text.
1969 case TGSI_OPCODE_TXB
:
1973 case TGSI_OPCODE_NRM
:
1977 case TGSI_OPCODE_DIV
:
1981 case TGSI_OPCODE_DP2
:
1985 case TGSI_OPCODE_TXL
:
// Loop/conditional and address-stack opcodes: not implemented in the
// visible text.
1989 case TGSI_OPCODE_BRK
:
1993 case TGSI_OPCODE_IF
:
1997 case TGSI_OPCODE_LOOP
:
2001 case TGSI_OPCODE_REP
:
2005 case TGSI_OPCODE_ELSE
:
2009 case TGSI_OPCODE_ENDIF
:
2013 case TGSI_OPCODE_ENDLOOP
:
2017 case TGSI_OPCODE_ENDREP
:
2021 case TGSI_OPCODE_PUSHA
:
2025 case TGSI_OPCODE_POPA
:
2029 case TGSI_OPCODE_CEIL
:
2033 case TGSI_OPCODE_I2F
:
2037 case TGSI_OPCODE_NOT
:
// TRUNC: per enabled channel, the float->int->float round trip
// (emit_f2it then emit_i2f) truncates toward zero.
2041 case TGSI_OPCODE_TRUNC
:
2042 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2043 FETCH( func
, *inst
, 0, 0, chan_index
);
2044 emit_f2it( func
, 0 );
2045 emit_i2f( func
, 0 );
2046 STORE( func
, *inst
, 0, 0, chan_index
);
// Integer/bitwise, remaining texture query and geometry-shader opcodes:
// not implemented in the visible text.
2050 case TGSI_OPCODE_SHL
:
2054 case TGSI_OPCODE_SHR
:
2058 case TGSI_OPCODE_AND
:
2062 case TGSI_OPCODE_OR
:
2066 case TGSI_OPCODE_MOD
:
2070 case TGSI_OPCODE_XOR
:
2074 case TGSI_OPCODE_SAD
:
2078 case TGSI_OPCODE_TXF
:
2082 case TGSI_OPCODE_TXQ
:
2086 case TGSI_OPCODE_CONT
:
2090 case TGSI_OPCODE_EMIT
:
2094 case TGSI_OPCODE_ENDPRIM
:
/* Fragment of emit_declaration() -- the signature line and several
 * interior lines (breaks, closing braces, the i/j declarations) were
 * lost in extraction.  For TGSI_FILE_INPUT declarations this emits the
 * per-channel interpolation code for every input register in the
 * declared range, dispatched on the declaration's Interpolate mode.
 */
2107 struct x86_function
*func
,
2108 struct tgsi_full_declaration
*decl
)
2110 if( decl
->Declaration
.File
== TGSI_FILE_INPUT
) {
2111 unsigned first
, last
, mask
;
/* Range of declared input registers and per-channel usage mask. */
2114 first
= decl
->DeclarationRange
.First
;
2115 last
= decl
->DeclarationRange
.Last
;
2116 mask
= decl
->Declaration
.UsageMask
;
/* For each declared register and each channel used by the shader... */
2118 for( i
= first
; i
<= last
; i
++ ) {
2119 for( j
= 0; j
< NUM_CHANNELS
; j
++ ) {
2120 if( mask
& (1 << j
) ) {
2121 switch( decl
->Declaration
.Interpolate
) {
/* Constant (flat): input = a0 coefficient only. */
2122 case TGSI_INTERPOLATE_CONSTANT
:
2123 emit_coef_a0( func
, 0, i
, j
);
2124 emit_inputs( func
, 0, i
, j
);
/* Linear: input = x*dadx + y*dady + a0, with x and y read from
 * temp 0 (swizzles X and Y). */
2127 case TGSI_INTERPOLATE_LINEAR
:
2128 emit_tempf( func
, 0, 0, TGSI_SWIZZLE_X
);
2129 emit_coef_dadx( func
, 1, i
, j
);
2130 emit_tempf( func
, 2, 0, TGSI_SWIZZLE_Y
);
2131 emit_coef_dady( func
, 3, i
, j
);
2132 emit_mul( func
, 0, 1 ); /* x * dadx */
2133 emit_coef_a0( func
, 4, i
, j
);
2134 emit_mul( func
, 2, 3 ); /* y * dady */
2135 emit_add( func
, 0, 4 ); /* x * dadx + a0 */
2136 emit_add( func
, 0, 2 ); /* x * dadx + y * dady + a0 */
2137 emit_inputs( func
, 0, i
, j
);
/* Perspective: the linear result is additionally multiplied by
 * 1/w (emit_rcp of temp 0 swizzle W). */
2140 case TGSI_INTERPOLATE_PERSPECTIVE
:
2141 emit_tempf( func
, 0, 0, TGSI_SWIZZLE_X
);
2142 emit_coef_dadx( func
, 1, i
, j
);
2143 emit_tempf( func
, 2, 0, TGSI_SWIZZLE_Y
);
2144 emit_coef_dady( func
, 3, i
, j
);
2145 emit_mul( func
, 0, 1 ); /* x * dadx */
2146 emit_tempf( func
, 4, 0, TGSI_SWIZZLE_W
);
2147 emit_coef_a0( func
, 5, i
, j
);
2148 emit_rcp( func
, 4, 4 ); /* 1.0 / w */
2149 emit_mul( func
, 2, 3 ); /* y * dady */
2150 emit_add( func
, 0, 5 ); /* x * dadx + a0 */
2151 emit_add( func
, 0, 2 ); /* x * dadx + y * dady + a0 */
2152 emit_mul( func
, 0, 4 ); /* (x * dadx + y * dady + a0) / w */
2153 emit_inputs( func
, 0, i
, j
);
/* aos_to_soa(): emit x86/SSE code that transposes vertex attributes from
 * AoS layout (vertices 'stride' bytes apart, 4 floats per attribute)
 * into SoA layout (x[4], y[4], z[4], w[4]), four vertices per loop
 * iteration.
 * NOTE(review): the remaining parameter lines, the opening brace and
 * local declarations (e.g. the inner_loop label) were dropped by
 * extraction.
 */
2166 static void aos_to_soa( struct x86_function
*func
,
/* GP register assignment for the four function arguments. */
2172 struct x86_reg soa_input
= x86_make_reg( file_REG32
, reg_AX
);
2173 struct x86_reg aos_input
= x86_make_reg( file_REG32
, reg_BX
);
2174 struct x86_reg num_inputs
= x86_make_reg( file_REG32
, reg_CX
);
2175 struct x86_reg stride
= x86_make_reg( file_REG32
, reg_DX
);
/* EBX is callee-saved in the x86 ABI -- preserve it. */
2180 x86_push( func
, x86_make_reg( file_REG32
, reg_BX
) );
/* Fetch the four function arguments into the registers above. */
2182 x86_mov( func
, aos_input
, x86_fn_arg( func
, arg_aos
) );
2183 x86_mov( func
, soa_input
, x86_fn_arg( func
, arg_soa
) );
2184 x86_mov( func
, num_inputs
, x86_fn_arg( func
, arg_num
) );
2185 x86_mov( func
, stride
, x86_fn_arg( func
, arg_stride
) );
/* Top of the per-attribute loop. */
2188 inner_loop
= x86_get_label( func
);
/* Save the row pointer; it is advanced by 'stride' per vertex while
 * gathering, then restored by the pop below. */
2190 x86_push( func
, aos_input
);
/* Gather: xmm0/xmm3 collect vertices 0-1 (low/high 8-byte halves of
 * each 16-byte attribute), xmm1/xmm4 collect vertices 2-3. */
2191 sse_movlps( func
, make_xmm( 0 ), x86_make_disp( aos_input
, 0 ) );
2192 sse_movlps( func
, make_xmm( 3 ), x86_make_disp( aos_input
, 8 ) );
2193 x86_add( func
, aos_input
, stride
);
2194 sse_movhps( func
, make_xmm( 0 ), x86_make_disp( aos_input
, 0 ) );
2195 sse_movhps( func
, make_xmm( 3 ), x86_make_disp( aos_input
, 8 ) );
2196 x86_add( func
, aos_input
, stride
);
2197 sse_movlps( func
, make_xmm( 1 ), x86_make_disp( aos_input
, 0 ) );
2198 sse_movlps( func
, make_xmm( 4 ), x86_make_disp( aos_input
, 8 ) );
2199 x86_add( func
, aos_input
, stride
);
2200 sse_movhps( func
, make_xmm( 1 ), x86_make_disp( aos_input
, 0 ) );
2201 sse_movhps( func
, make_xmm( 4 ), x86_make_disp( aos_input
, 8 ) );
2202 x86_pop( func
, aos_input
);
/* Transpose: shufps 0x88 keeps the even lanes, 0xdd the odd lanes,
 * yielding the x, y, z, w vectors in xmm0, xmm2, xmm3, xmm5. */
2204 sse_movaps( func
, make_xmm( 2 ), make_xmm( 0 ) );
2205 sse_movaps( func
, make_xmm( 5 ), make_xmm( 3 ) );
2206 sse_shufps( func
, make_xmm( 0 ), make_xmm( 1 ), 0x88 );
2207 sse_shufps( func
, make_xmm( 2 ), make_xmm( 1 ), 0xdd );
2208 sse_shufps( func
, make_xmm( 3 ), make_xmm( 4 ), 0x88 );
2209 sse_shufps( func
, make_xmm( 5 ), make_xmm( 4 ), 0xdd );
/* Store the four 16-byte SoA vectors contiguously (64 bytes total). */
2211 sse_movups( func
, x86_make_disp( soa_input
, 0 ), make_xmm( 0 ) );
2212 sse_movups( func
, x86_make_disp( soa_input
, 16 ), make_xmm( 2 ) );
2213 sse_movups( func
, x86_make_disp( soa_input
, 32 ), make_xmm( 3 ) );
2214 sse_movups( func
, x86_make_disp( soa_input
, 48 ), make_xmm( 5 ) );
2216 /* Advance to next input */
2217 x86_lea( func
, aos_input
, x86_make_disp(aos_input
, 16) );
2218 x86_lea( func
, soa_input
, x86_make_disp(soa_input
, 64) );
2220 /* while --num_inputs */
2221 x86_dec( func
, num_inputs
);
2222 x86_jcc( func
, cc_NE
, inner_loop
);
/* Restore EBX (pushed at entry). */
2225 x86_pop( func
, aos_input
);
/* soa_to_aos(): emit x86/SSE code performing the inverse of
 * aos_to_soa(): transpose SoA results (x[4], y[4], z[4], w[4]) back to
 * AoS layout, four vertices per iteration, rows 'stride' bytes apart.
 * The aos/soa/num/stride parameters are x86 function-argument indices.
 * NOTE(review): the opening/closing braces and the inner_loop
 * declaration were dropped by extraction.
 */
2228 static void soa_to_aos( struct x86_function
*func
, uint aos
, uint soa
, uint num
, uint stride
)
2230 struct x86_reg soa_output
;
2231 struct x86_reg aos_output
;
2232 struct x86_reg num_outputs
;
2233 struct x86_reg temp
;
/* GP register assignment; EDX ('temp') later holds the stride. */
2236 soa_output
= x86_make_reg( file_REG32
, reg_AX
);
2237 aos_output
= x86_make_reg( file_REG32
, reg_BX
);
2238 num_outputs
= x86_make_reg( file_REG32
, reg_CX
);
2239 temp
= x86_make_reg( file_REG32
, reg_DX
);
/* EBX is callee-saved in the x86 ABI -- preserve it. */
2242 x86_push( func
, aos_output
);
/* Fetch the source/destination/count arguments. */
2244 x86_mov( func
, soa_output
, x86_fn_arg( func
, soa
) );
2245 x86_mov( func
, aos_output
, x86_fn_arg( func
, aos
) );
2246 x86_mov( func
, num_outputs
, x86_fn_arg( func
, num
) );
/* Top of the per-output loop. */
2249 inner_loop
= x86_get_label( func
);
/* Load the four SoA vectors (x, y, z, w -- 16 bytes each). */
2251 sse_movups( func
, make_xmm( 0 ), x86_make_disp( soa_output
, 0 ) );
2252 sse_movups( func
, make_xmm( 1 ), x86_make_disp( soa_output
, 16 ) );
2253 sse_movups( func
, make_xmm( 3 ), x86_make_disp( soa_output
, 32 ) );
2254 sse_movups( func
, make_xmm( 4 ), x86_make_disp( soa_output
, 48 ) );
/* Transpose via interleave: unpcklps/unpckhps pair x with y and z
 * with w for the low/high vertex pairs. */
2256 sse_movaps( func
, make_xmm( 2 ), make_xmm( 0 ) );
2257 sse_movaps( func
, make_xmm( 5 ), make_xmm( 3 ) );
2258 sse_unpcklps( func
, make_xmm( 0 ), make_xmm( 1 ) );
2259 sse_unpckhps( func
, make_xmm( 2 ), make_xmm( 1 ) );
2260 sse_unpcklps( func
, make_xmm( 3 ), make_xmm( 4 ) );
2261 sse_unpckhps( func
, make_xmm( 5 ), make_xmm( 4 ) );
/* The stride argument is (re)loaded into EDX each iteration. */
2263 x86_mov( func
, temp
, x86_fn_arg( func
, stride
) );
/* Save the row pointer, then scatter one vertex per row, advancing
 * by the stride between rows; restored by the pop below. */
2264 x86_push( func
, aos_output
);
2265 sse_movlps( func
, x86_make_disp( aos_output
, 0 ), make_xmm( 0 ) );
2266 sse_movlps( func
, x86_make_disp( aos_output
, 8 ), make_xmm( 3 ) );
2267 x86_add( func
, aos_output
, temp
);
2268 sse_movhps( func
, x86_make_disp( aos_output
, 0 ), make_xmm( 0 ) );
2269 sse_movhps( func
, x86_make_disp( aos_output
, 8 ), make_xmm( 3 ) );
2270 x86_add( func
, aos_output
, temp
);
2271 sse_movlps( func
, x86_make_disp( aos_output
, 0 ), make_xmm( 2 ) );
2272 sse_movlps( func
, x86_make_disp( aos_output
, 8 ), make_xmm( 5 ) );
2273 x86_add( func
, aos_output
, temp
);
2274 sse_movhps( func
, x86_make_disp( aos_output
, 0 ), make_xmm( 2 ) );
2275 sse_movhps( func
, x86_make_disp( aos_output
, 8 ), make_xmm( 5 ) );
2276 x86_pop( func
, aos_output
);
2278 /* Advance to next output */
2279 x86_lea( func
, aos_output
, x86_make_disp(aos_output
, 16) );
2280 x86_lea( func
, soa_output
, x86_make_disp(soa_output
, 64) );
2282 /* while --num_outputs */
2283 x86_dec( func
, num_outputs
);
2284 x86_jcc( func
, cc_NE
, inner_loop
);
/* Restore EBX (pushed at entry). */
2287 x86_pop( func
, aos_output
);
/* tgsi_emit_sse2() -- the signature line itself was lost in extraction;
 * the original doc comment follows.
 * NOTE(review): many interior lines (opening brace, EDI/EBX save and
 * restore, the emit_declaration call, return statements, closing
 * braces) are missing from this chunk; the text below is not the
 * complete function.
 */
2291 * Translate a TGSI vertex/fragment shader to SSE2 code.
2292 * Slightly different things are done for vertex vs. fragment shaders.
2294 * Note that fragment shaders are responsible for interpolating shader
2295 * inputs. Because on x86 we have only 4 GP registers, and here we
2296 * have 5 shader arguments (input, output, const, temp and coef), the
2297 * code is split into two phases -- DECLARATION and INSTRUCTION phase.
2298 * GP register holding the output argument is aliased with the coeff
2299 * argument, as outputs are not needed in the DECLARATION phase.
2301 * \param tokens the TGSI input shader
2302 * \param func the output SSE code/function
2303 * \param immediates buffer to place immediates, later passed to SSE func
2304 * \param return 1 for success, 0 if translation failed
2308 const struct tgsi_token
*tokens
,
2309 struct x86_function
*func
,
2310 float (*immediates
)[4],
2311 boolean do_swizzles
)
2313 struct tgsi_parse_context parse
;
2314 boolean instruction_phase
= FALSE
;
2316 uint num_immediates
= 0;
/* Start emitting at the beginning of the function's code store. */
2320 func
->csr
= func
->store
;
2322 tgsi_parse_init( &parse
, tokens
);
2324 /* Can't just use EDI, EBX without save/restoring them:
2328 get_immediate_base() );
2336 * Different function args for vertex/fragment shaders:
// Fragment shader: DECLARATION phase -- load the input (1), const (3),
// temp (4), coef (5) and immediates (6) argument pointers; the outputs
// argument (2) is deliberately skipped until the INSTRUCTION phase.
2338 if (parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_FRAGMENT
) {
2339 /* DECLARATION phase, do not load output argument. */
2343 x86_fn_arg( func
, 1 ) );
2344 /* skipping outputs argument here */
2348 x86_fn_arg( func
, 3 ) );
2352 x86_fn_arg( func
, 4 ) );
2356 x86_fn_arg( func
, 5 ) );
2359 get_immediate_base(),
2360 x86_fn_arg( func
, 6 ) );
/* Vertex shader: swizzle inputs AoS->SoA (presumably guarded by
 * do_swizzles -- the guard line is missing here), then load all
 * five argument pointers up front. */
2363 assert(parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_VERTEX
);
2368 1, /* machine->input */
2370 8 ); /* input_stride */
2375 x86_fn_arg( func
, 1 ) );
2379 x86_fn_arg( func
, 2 ) );
2383 x86_fn_arg( func
, 3 ) );
2387 x86_fn_arg( func
, 4 ) );
2390 get_immediate_base(),
2391 x86_fn_arg( func
, 5 ) );
/* Main parse loop: translate each token until the stream ends or an
 * instruction fails to translate ('ok' goes false). */
2394 while( !tgsi_parse_end_of_tokens( &parse
) && ok
) {
2395 tgsi_parse_token( &parse
);
2397 switch( parse
.FullToken
.Token
.Type
) {
/* Declarations only need generated code for fragment shaders
 * (input interpolation). */
2398 case TGSI_TOKEN_TYPE_DECLARATION
:
2399 if (parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_FRAGMENT
) {
2402 &parse
.FullToken
.FullDeclaration
);
/* First instruction of a fragment shader flips to INSTRUCTION
 * phase: the coeff register is overwritten with the outputs
 * pointer (arg 2). */
2406 case TGSI_TOKEN_TYPE_INSTRUCTION
:
2407 if (parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_FRAGMENT
) {
2408 if( !instruction_phase
) {
2409 /* INSTRUCTION phase, overwrite coeff with output. */
2410 instruction_phase
= TRUE
;
2414 x86_fn_arg( func
, 2 ) );
2418 ok
= emit_instruction(
2420 &parse
.FullToken
.FullInstruction
);
/* Failure path: report which opcode could not be translated. */
2423 debug_printf("failed to translate tgsi opcode %d to SSE (%s)\n",
2424 parse
.FullToken
.FullInstruction
.Instruction
.Opcode
,
2425 parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_VERTEX
?
2426 "vertex shader" : "fragment shader");
2430 case TGSI_TOKEN_TYPE_IMMEDIATE
:
2431 /* simply copy the immediate values into the next immediates[] slot */
/* Immediate.Size counts the header token as well, hence the -1
 * to get the number of float components. */
2433 const uint size
= parse
.FullToken
.FullImmediate
.Immediate
.Size
- 1;
2436 assert(num_immediates
< TGSI_EXEC_NUM_IMMEDIATES
);
2437 for( i
= 0; i
< size
; i
++ ) {
2438 immediates
[num_immediates
][i
] =
2439 parse
.FullToken
.FullImmediate
.u
.ImmediateFloat32
[i
].Float
;
/* NOTE(review): only 'size' components are copied above, yet all
 * four are printed below -- components beyond 'size' may be
 * uninitialized; confirm against the full file. */
2442 debug_printf("SSE FS immediate[%d] = %f %f %f %f\n",
2444 immediates
[num_immediates
][0],
2445 immediates
[num_immediates
][1],
2446 immediates
[num_immediates
][2],
2447 immediates
[num_immediates
][3]);
/* Vertex shaders: transpose results back SoA->AoS.  The literals
 * 9, 2, 10, 11 are x86 fn-arg indices for (aos dst, soa src,
 * count, stride), matching soa_to_aos()'s parameter order. */
2459 if (parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_VERTEX
) {
2461 soa_to_aos( func
, 9, 2, 10, 11 );
2464 /* Can't just use EBX, EDI without save/restoring them:
2472 get_immediate_base() );
2476 tgsi_parse_free( &parse
);
2481 #endif /* PIPE_ARCH_X86 */