/**************************************************************************
 *
 * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/
28 #include "pipe/p_util.h"
29 #include "pipe/p_shader_tokens.h"
30 #include "tgsi/tgsi_parse.h"
31 #include "tgsi/tgsi_util.h"
32 #include "tgsi_exec.h"
33 #include "tgsi_sse2.h"
35 #include "rtasm/rtasm_x86sse.h"
41 * This costs about 100fps (close to 10%) in gears:
43 #define HIGH_PRECISION 1
/* Iterate CHAN over the four vector components (X, Y, Z, W). */
#define FOR_EACH_CHANNEL( CHAN )\
   for( CHAN = 0; CHAN < 4; CHAN++ )

/* Non-zero when CHAN is enabled in the write mask of INST's first
 * destination register.
 */
#define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
   ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))

/* Execute the following statement only when dst0 writes CHAN. */
#define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
   if( IS_DST0_CHANNEL_ENABLED( INST, CHAN ))

/* Loop over exactly those channels enabled by dst0's write mask. */
#define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
   FOR_EACH_CHANNEL( CHAN )\
   IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )
/* Scratch temporary used for exchanging values between the generated
 * SSE code and C helper callbacks (see the emit_func_call helpers).
 */
#define TEMP_R0   TGSI_EXEC_TEMP_R0
67 * X86 utility functions.
76 (enum x86_reg_name
) xmm
);
80 * X86 register mapping helpers.
84 get_const_base( void )
92 get_input_base( void )
100 get_output_base( void )
107 static struct x86_reg
108 get_temp_base( void )
115 static struct x86_reg
116 get_coef_base( void )
118 return get_output_base();
121 static struct x86_reg
122 get_immediate_base( void )
131 * Data access helpers.
135 static struct x86_reg
140 return x86_make_disp(
141 get_immediate_base(),
142 (vec
* 4 + chan
) * 4 );
145 static struct x86_reg
150 return x86_make_disp(
152 (vec
* 4 + chan
) * 4 );
155 static struct x86_reg
160 return x86_make_disp(
162 (vec
* 4 + chan
) * 16 );
165 static struct x86_reg
170 return x86_make_disp(
172 (vec
* 4 + chan
) * 16 );
175 static struct x86_reg
180 return x86_make_disp(
182 (vec
* 4 + chan
) * 16 );
185 static struct x86_reg
191 return x86_make_disp(
193 ((vec
* 3 + member
) * 4 + chan
) * 4 );
199 struct x86_function
*func
)
206 * Data fetch helpers.
210 * Copy a shader constant to xmm register
211 * \param xmm the destination xmm register
212 * \param vec the src const buffer index
213 * \param chan src channel to fetch (X, Y, Z or W)
217 struct x86_function
*func
,
225 get_const( vec
, chan
) );
230 SHUF( 0, 0, 0, 0 ) );
235 struct x86_function
*func
,
243 get_immediate( vec
, chan
) );
248 SHUF( 0, 0, 0, 0 ) );
253 * Copy a shader input to xmm register
254 * \param xmm the destination xmm register
255 * \param vec the src input attrib
256 * \param chan src channel to fetch (X, Y, Z or W)
260 struct x86_function
*func
,
268 get_input( vec
, chan
) );
272 * Store an xmm register to a shader output
273 * \param xmm the source xmm register
274 * \param vec the dest output attrib
275 * \param chan src dest channel to store (X, Y, Z or W)
279 struct x86_function
*func
,
286 get_output( vec
, chan
),
291 * Copy a shader temporary to xmm register
292 * \param xmm the destination xmm register
293 * \param vec the src temp register
294 * \param chan src channel to fetch (X, Y, Z or W)
298 struct x86_function
*func
,
306 get_temp( vec
, chan
) );
310 * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
311 * \param xmm the destination xmm register
312 * \param vec the src input/attribute coefficient index
313 * \param chan src channel to fetch (X, Y, Z or W)
314 * \param member 0=a0, 1=dadx, 2=dady
318 struct x86_function
*func
,
327 get_coef( vec
, chan
, member
) );
332 SHUF( 0, 0, 0, 0 ) );
336 * Data store helpers.
341 struct x86_function
*func
,
348 get_input( vec
, chan
),
354 struct x86_function
*func
,
361 get_temp( vec
, chan
),
367 struct x86_function
*func
,
375 vec
+ TGSI_EXEC_NUM_TEMPS
,
380 * Coefficent fetch helpers.
385 struct x86_function
*func
,
400 struct x86_function
*func
,
415 struct x86_function
*func
,
429 * Function call helpers.
434 struct x86_function
*func
)
438 x86_make_reg( file_REG32
, reg_AX
) );
441 x86_make_reg( file_REG32
, reg_CX
) );
444 x86_make_reg( file_REG32
, reg_DX
) );
449 struct x86_function
*func
)
451 /* Restore GP registers in a reverse order.
455 x86_make_reg( file_REG32
, reg_DX
) );
458 x86_make_reg( file_REG32
, reg_CX
) );
461 x86_make_reg( file_REG32
, reg_AX
) );
466 struct x86_function
*func
,
468 void (PIPE_CDECL
*code
)() )
472 get_temp( TEMP_R0
, 0 ),
473 make_xmm( xmm_dst
) );
479 struct x86_reg ecx
= x86_make_reg( file_REG32
, reg_CX
);
484 get_temp( TEMP_R0
, 0 ) );
486 x86_push( func
, ecx
);
487 x86_mov_reg_imm( func
, ecx
, (unsigned long) code
);
488 x86_call( func
, ecx
);
499 get_temp( TEMP_R0
, 0 ) );
503 emit_func_call_dst_src(
504 struct x86_function
*func
,
507 void (PIPE_CDECL
*code
)() )
511 get_temp( TEMP_R0
, 1 ),
512 make_xmm( xmm_src
) );
521 * Low-level instruction translators.
526 struct x86_function
*func
,
533 TGSI_EXEC_TEMP_7FFFFFFF_I
,
534 TGSI_EXEC_TEMP_7FFFFFFF_C
) );
539 struct x86_function
*func
,
546 make_xmm( xmm_src
) );
549 static void PIPE_CDECL
553 const unsigned X
= 0;
555 store
[X
+ 0] = cosf( store
[X
+ 0] );
556 store
[X
+ 1] = cosf( store
[X
+ 1] );
557 store
[X
+ 2] = cosf( store
[X
+ 2] );
558 store
[X
+ 3] = cosf( store
[X
+ 3] );
563 struct x86_function
*func
,
572 static void PIPE_CDECL
576 const unsigned X
= 0;
578 store
[X
+ 0] = powf( 2.0f
, store
[X
+ 0] );
579 store
[X
+ 1] = powf( 2.0f
, store
[X
+ 1] );
580 store
[X
+ 2] = powf( 2.0f
, store
[X
+ 2] );
581 store
[X
+ 3] = powf( 2.0f
, store
[X
+ 3] );
586 struct x86_function
*func
,
597 struct x86_function
*func
,
606 static void PIPE_CDECL
610 const unsigned X
= 0;
612 store
[X
+ 0] = floorf( store
[X
+ 0] );
613 store
[X
+ 1] = floorf( store
[X
+ 1] );
614 store
[X
+ 2] = floorf( store
[X
+ 2] );
615 store
[X
+ 3] = floorf( store
[X
+ 3] );
620 struct x86_function
*func
,
629 static void PIPE_CDECL
633 const unsigned X
= 0;
635 store
[X
+ 0] -= floorf( store
[X
+ 0] );
636 store
[X
+ 1] -= floorf( store
[X
+ 1] );
637 store
[X
+ 2] -= floorf( store
[X
+ 2] );
638 store
[X
+ 3] -= floorf( store
[X
+ 3] );
643 struct x86_function
*func
,
652 static void PIPE_CDECL
656 const unsigned X
= 0;
658 store
[X
+ 0] = LOG2( store
[X
+ 0] );
659 store
[X
+ 1] = LOG2( store
[X
+ 1] );
660 store
[X
+ 2] = LOG2( store
[X
+ 2] );
661 store
[X
+ 3] = LOG2( store
[X
+ 3] );
666 struct x86_function
*func
,
677 struct x86_function
*func
,
684 make_xmm( xmm_src
) );
688 emit_mul (struct x86_function
*func
,
695 make_xmm( xmm_src
) );
700 struct x86_function
*func
,
707 TGSI_EXEC_TEMP_80000000_I
,
708 TGSI_EXEC_TEMP_80000000_C
) );
711 static void PIPE_CDECL
715 const unsigned X
= 0;
717 store
[X
+ 0] = powf( store
[X
+ 0], store
[X
+ 4] );
718 store
[X
+ 1] = powf( store
[X
+ 1], store
[X
+ 5] );
719 store
[X
+ 2] = powf( store
[X
+ 2], store
[X
+ 6] );
720 store
[X
+ 3] = powf( store
[X
+ 3], store
[X
+ 7] );
725 struct x86_function
*func
,
729 emit_func_call_dst_src(
738 struct x86_function
*func
,
742 /* On Intel CPUs at least, this is only accurate to 12 bits -- not
743 * good enough. Need to either emit a proper divide or use the
744 * iterative technique described below in emit_rsqrt().
749 make_xmm( xmm_src
) );
754 struct x86_function
*func
,
759 /* Although rsqrtps() and rcpps() are low precision on some/all SSE
760 * implementations, it is possible to improve its precision at
761 * fairly low cost, using a newton/raphson step, as below:
763 * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
764 * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
766 * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
769 struct x86_reg dst
= make_xmm( xmm_dst
);
770 struct x86_reg src
= make_xmm( xmm_src
);
771 struct x86_reg tmp0
= make_xmm( 2 );
772 struct x86_reg tmp1
= make_xmm( 3 );
774 assert( xmm_dst
!= xmm_src
);
775 assert( xmm_dst
!= 2 && xmm_dst
!= 3 );
776 assert( xmm_src
!= 2 && xmm_src
!= 3 );
778 sse_movaps( func
, dst
, get_temp( TGSI_EXEC_TEMP_HALF_I
, TGSI_EXEC_TEMP_HALF_C
) );
779 sse_movaps( func
, tmp0
, get_temp( TGSI_EXEC_TEMP_THREE_I
, TGSI_EXEC_TEMP_THREE_C
) );
780 sse_rsqrtps( func
, tmp1
, src
);
781 sse_mulps( func
, src
, tmp1
);
782 sse_mulps( func
, dst
, tmp1
);
783 sse_mulps( func
, src
, tmp1
);
784 sse_subps( func
, tmp0
, src
);
785 sse_mulps( func
, dst
, tmp0
);
788 /* On Intel CPUs at least, this is only accurate to 12 bits -- not
794 make_xmm( xmm_src
) );
800 struct x86_function
*func
,
807 TGSI_EXEC_TEMP_80000000_I
,
808 TGSI_EXEC_TEMP_80000000_C
) );
811 static void PIPE_CDECL
815 const unsigned X
= 0;
817 store
[X
+ 0] = sinf( store
[X
+ 0] );
818 store
[X
+ 1] = sinf( store
[X
+ 1] );
819 store
[X
+ 2] = sinf( store
[X
+ 2] );
820 store
[X
+ 3] = sinf( store
[X
+ 3] );
824 emit_sin (struct x86_function
*func
,
835 struct x86_function
*func
,
842 make_xmm( xmm_src
) );
851 struct x86_function
*func
,
853 const struct tgsi_full_src_register
*reg
,
854 const unsigned chan_index
)
856 unsigned swizzle
= tgsi_util_get_full_src_register_extswizzle( reg
, chan_index
);
859 case TGSI_EXTSWIZZLE_X
:
860 case TGSI_EXTSWIZZLE_Y
:
861 case TGSI_EXTSWIZZLE_Z
:
862 case TGSI_EXTSWIZZLE_W
:
863 switch( reg
->SrcRegister
.File
) {
864 case TGSI_FILE_CONSTANT
:
868 reg
->SrcRegister
.Index
,
872 case TGSI_FILE_IMMEDIATE
:
876 reg
->SrcRegister
.Index
,
880 case TGSI_FILE_INPUT
:
884 reg
->SrcRegister
.Index
,
888 case TGSI_FILE_TEMPORARY
:
892 reg
->SrcRegister
.Index
,
901 case TGSI_EXTSWIZZLE_ZERO
:
905 TGSI_EXEC_TEMP_00000000_I
,
906 TGSI_EXEC_TEMP_00000000_C
);
909 case TGSI_EXTSWIZZLE_ONE
:
913 TGSI_EXEC_TEMP_ONE_I
,
914 TGSI_EXEC_TEMP_ONE_C
);
921 switch( tgsi_util_get_full_src_register_sign_mode( reg
, chan_index
) ) {
922 case TGSI_UTIL_SIGN_CLEAR
:
923 emit_abs( func
, xmm
);
926 case TGSI_UTIL_SIGN_SET
:
927 emit_setsign( func
, xmm
);
930 case TGSI_UTIL_SIGN_TOGGLE
:
931 emit_neg( func
, xmm
);
934 case TGSI_UTIL_SIGN_KEEP
:
/* Fetch channel CHAN of source register INDEX of INST into xmm XMM. */
#define FETCH( FUNC, INST, XMM, INDEX, CHAN )\
   emit_fetch( FUNC, XMM, &(INST).FullSrcRegisters[INDEX], CHAN )
948 struct x86_function
*func
,
950 const struct tgsi_full_dst_register
*reg
,
951 const struct tgsi_full_instruction
*inst
,
952 unsigned chan_index
)
954 switch( reg
->DstRegister
.File
) {
955 case TGSI_FILE_OUTPUT
:
959 reg
->DstRegister
.Index
,
963 case TGSI_FILE_TEMPORARY
:
967 reg
->DstRegister
.Index
,
971 case TGSI_FILE_ADDRESS
:
975 reg
->DstRegister
.Index
,
983 switch( inst
->Instruction
.Saturate
) {
987 case TGSI_SAT_ZERO_ONE
:
991 case TGSI_SAT_MINUS_PLUS_ONE
:
/* Store xmm XMM into channel CHAN of destination register INDEX of INST
 * (INST is also passed so saturation modes can be applied).
 */
#define STORE( FUNC, INST, XMM, INDEX, CHAN )\
   emit_store( FUNC, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN )
1001 * High-level instruction translators.
1006 struct x86_function
*func
,
1007 const struct tgsi_full_src_register
*reg
)
1009 unsigned uniquemask
;
1010 unsigned registers
[4];
1011 unsigned nextregister
= 0;
1012 unsigned firstchan
= ~0;
1013 unsigned chan_index
;
1015 /* This mask stores component bits that were already tested. Note that
1016 * we test if the value is less than zero, so 1.0 and 0.0 need not to be
1018 uniquemask
= (1 << TGSI_EXTSWIZZLE_ZERO
) | (1 << TGSI_EXTSWIZZLE_ONE
);
1020 FOR_EACH_CHANNEL( chan_index
) {
1023 /* unswizzle channel */
1024 swizzle
= tgsi_util_get_full_src_register_extswizzle(
1028 /* check if the component has not been already tested */
1029 if( !(uniquemask
& (1 << swizzle
)) ) {
1030 uniquemask
|= 1 << swizzle
;
1032 /* allocate register */
1033 registers
[chan_index
] = nextregister
;
1041 /* mark the first channel used */
1042 if( firstchan
== ~0 ) {
1043 firstchan
= chan_index
;
1050 x86_make_reg( file_REG32
, reg_AX
) );
1053 x86_make_reg( file_REG32
, reg_DX
) );
1055 FOR_EACH_CHANNEL( chan_index
) {
1056 if( uniquemask
& (1 << chan_index
) ) {
1059 make_xmm( registers
[chan_index
] ),
1061 TGSI_EXEC_TEMP_00000000_I
,
1062 TGSI_EXEC_TEMP_00000000_C
),
1065 if( chan_index
== firstchan
) {
1068 x86_make_reg( file_REG32
, reg_AX
),
1069 make_xmm( registers
[chan_index
] ) );
1074 x86_make_reg( file_REG32
, reg_DX
),
1075 make_xmm( registers
[chan_index
] ) );
1078 x86_make_reg( file_REG32
, reg_AX
),
1079 x86_make_reg( file_REG32
, reg_DX
) );
1087 TGSI_EXEC_TEMP_KILMASK_I
,
1088 TGSI_EXEC_TEMP_KILMASK_C
),
1089 x86_make_reg( file_REG32
, reg_AX
) );
1093 x86_make_reg( file_REG32
, reg_DX
) );
1096 x86_make_reg( file_REG32
, reg_AX
) );
1102 struct x86_function
*func
)
1104 /* XXX todo / fix me */
1110 struct x86_function
*func
,
1111 struct tgsi_full_instruction
*inst
,
1114 unsigned chan_index
;
1116 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1117 FETCH( func
, *inst
, 0, 0, chan_index
);
1118 FETCH( func
, *inst
, 1, 1, chan_index
);
1128 TGSI_EXEC_TEMP_ONE_I
,
1129 TGSI_EXEC_TEMP_ONE_C
) );
1130 STORE( func
, *inst
, 0, 0, chan_index
);
1136 struct x86_function
*func
,
1137 struct tgsi_full_instruction
*inst
)
1139 unsigned chan_index
;
1141 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1142 FETCH( func
, *inst
, 0, 0, chan_index
);
1143 FETCH( func
, *inst
, 1, 1, chan_index
);
1144 FETCH( func
, *inst
, 2, 2, chan_index
);
1149 TGSI_EXEC_TEMP_00000000_I
,
1150 TGSI_EXEC_TEMP_00000000_C
),
1164 STORE( func
, *inst
, 0, 0, chan_index
);
1170 struct x86_function
*func
,
1171 struct tgsi_full_instruction
*inst
)
1173 unsigned chan_index
;
1175 switch( inst
->Instruction
.Opcode
) {
1176 case TGSI_OPCODE_ARL
:
1178 /* XXX this isn't working properly (see glean vertProg1 test) */
1179 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1180 FETCH( func
, *inst
, 0, 0, chan_index
);
1181 emit_f2it( func
, 0 );
1182 STORE( func
, *inst
, 0, 0, chan_index
);
1189 case TGSI_OPCODE_MOV
:
1190 case TGSI_OPCODE_SWZ
:
1191 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1192 FETCH( func
, *inst
, 0, 0, chan_index
);
1193 STORE( func
, *inst
, 0, 0, chan_index
);
1197 case TGSI_OPCODE_LIT
:
1198 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1199 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) ) {
1203 TGSI_EXEC_TEMP_ONE_I
,
1204 TGSI_EXEC_TEMP_ONE_C
);
1205 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ) {
1206 STORE( func
, *inst
, 0, 0, CHAN_X
);
1208 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) ) {
1209 STORE( func
, *inst
, 0, 0, CHAN_W
);
1212 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ||
1213 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
1214 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ) {
1215 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1220 TGSI_EXEC_TEMP_00000000_I
,
1221 TGSI_EXEC_TEMP_00000000_C
) );
1222 STORE( func
, *inst
, 0, 0, CHAN_Y
);
1224 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
1225 /* XMM[1] = SrcReg[0].yyyy */
1226 FETCH( func
, *inst
, 1, 0, CHAN_Y
);
1227 /* XMM[1] = max(XMM[1], 0) */
1232 TGSI_EXEC_TEMP_00000000_I
,
1233 TGSI_EXEC_TEMP_00000000_C
) );
1234 /* XMM[2] = SrcReg[0].wwww */
1235 FETCH( func
, *inst
, 2, 0, CHAN_W
);
1236 /* XMM[2] = min(XMM[2], 128.0) */
1241 TGSI_EXEC_TEMP_128_I
,
1242 TGSI_EXEC_TEMP_128_C
) );
1243 /* XMM[2] = max(XMM[2], -128.0) */
1248 TGSI_EXEC_TEMP_MINUS_128_I
,
1249 TGSI_EXEC_TEMP_MINUS_128_C
) );
1250 emit_pow( func
, 1, 2 );
1251 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1265 STORE( func
, *inst
, 2, 0, CHAN_Z
);
1270 case TGSI_OPCODE_RCP
:
1271 /* TGSI_OPCODE_RECIP */
1272 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1273 emit_rcp( func
, 0, 0 );
1274 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1275 STORE( func
, *inst
, 0, 0, chan_index
);
1279 case TGSI_OPCODE_RSQ
:
1280 /* TGSI_OPCODE_RECIPSQRT */
1281 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1282 emit_rsqrt( func
, 1, 0 );
1283 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1284 STORE( func
, *inst
, 1, 0, chan_index
);
1288 case TGSI_OPCODE_EXP
:
1292 case TGSI_OPCODE_LOG
:
1296 case TGSI_OPCODE_MUL
:
1297 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1298 FETCH( func
, *inst
, 0, 0, chan_index
);
1299 FETCH( func
, *inst
, 1, 1, chan_index
);
1300 emit_mul( func
, 0, 1 );
1301 STORE( func
, *inst
, 0, 0, chan_index
);
1305 case TGSI_OPCODE_ADD
:
1306 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1307 FETCH( func
, *inst
, 0, 0, chan_index
);
1308 FETCH( func
, *inst
, 1, 1, chan_index
);
1309 emit_add( func
, 0, 1 );
1310 STORE( func
, *inst
, 0, 0, chan_index
);
1314 case TGSI_OPCODE_DP3
:
1315 /* TGSI_OPCODE_DOT3 */
1316 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1317 FETCH( func
, *inst
, 1, 1, CHAN_X
);
1318 emit_mul( func
, 0, 1 );
1319 FETCH( func
, *inst
, 1, 0, CHAN_Y
);
1320 FETCH( func
, *inst
, 2, 1, CHAN_Y
);
1321 emit_mul( func
, 1, 2 );
1322 emit_add( func
, 0, 1 );
1323 FETCH( func
, *inst
, 1, 0, CHAN_Z
);
1324 FETCH( func
, *inst
, 2, 1, CHAN_Z
);
1325 emit_mul( func
, 1, 2 );
1326 emit_add( func
, 0, 1 );
1327 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1328 STORE( func
, *inst
, 0, 0, chan_index
);
1332 case TGSI_OPCODE_DP4
:
1333 /* TGSI_OPCODE_DOT4 */
1334 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1335 FETCH( func
, *inst
, 1, 1, CHAN_X
);
1336 emit_mul( func
, 0, 1 );
1337 FETCH( func
, *inst
, 1, 0, CHAN_Y
);
1338 FETCH( func
, *inst
, 2, 1, CHAN_Y
);
1339 emit_mul( func
, 1, 2 );
1340 emit_add( func
, 0, 1 );
1341 FETCH( func
, *inst
, 1, 0, CHAN_Z
);
1342 FETCH( func
, *inst
, 2, 1, CHAN_Z
);
1343 emit_mul(func
, 1, 2 );
1344 emit_add(func
, 0, 1 );
1345 FETCH( func
, *inst
, 1, 0, CHAN_W
);
1346 FETCH( func
, *inst
, 2, 1, CHAN_W
);
1347 emit_mul( func
, 1, 2 );
1348 emit_add( func
, 0, 1 );
1349 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1350 STORE( func
, *inst
, 0, 0, chan_index
);
1354 case TGSI_OPCODE_DST
:
1355 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) {
1359 TGSI_EXEC_TEMP_ONE_I
,
1360 TGSI_EXEC_TEMP_ONE_C
);
1361 STORE( func
, *inst
, 0, 0, CHAN_X
);
1363 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) {
1364 FETCH( func
, *inst
, 0, 0, CHAN_Y
);
1365 FETCH( func
, *inst
, 1, 1, CHAN_Y
);
1366 emit_mul( func
, 0, 1 );
1367 STORE( func
, *inst
, 0, 0, CHAN_Y
);
1369 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) {
1370 FETCH( func
, *inst
, 0, 0, CHAN_Z
);
1371 STORE( func
, *inst
, 0, 0, CHAN_Z
);
1373 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) {
1374 FETCH( func
, *inst
, 0, 1, CHAN_W
);
1375 STORE( func
, *inst
, 0, 0, CHAN_W
);
1379 case TGSI_OPCODE_MIN
:
1380 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1381 FETCH( func
, *inst
, 0, 0, chan_index
);
1382 FETCH( func
, *inst
, 1, 1, chan_index
);
1387 STORE( func
, *inst
, 0, 0, chan_index
);
1391 case TGSI_OPCODE_MAX
:
1392 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1393 FETCH( func
, *inst
, 0, 0, chan_index
);
1394 FETCH( func
, *inst
, 1, 1, chan_index
);
1399 STORE( func
, *inst
, 0, 0, chan_index
);
1403 case TGSI_OPCODE_SLT
:
1404 /* TGSI_OPCODE_SETLT */
1405 emit_setcc( func
, inst
, cc_LessThan
);
1408 case TGSI_OPCODE_SGE
:
1409 /* TGSI_OPCODE_SETGE */
1410 emit_setcc( func
, inst
, cc_NotLessThan
);
1413 case TGSI_OPCODE_MAD
:
1414 /* TGSI_OPCODE_MADD */
1415 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1416 FETCH( func
, *inst
, 0, 0, chan_index
);
1417 FETCH( func
, *inst
, 1, 1, chan_index
);
1418 FETCH( func
, *inst
, 2, 2, chan_index
);
1419 emit_mul( func
, 0, 1 );
1420 emit_add( func
, 0, 2 );
1421 STORE( func
, *inst
, 0, 0, chan_index
);
1425 case TGSI_OPCODE_SUB
:
1426 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1427 FETCH( func
, *inst
, 0, 0, chan_index
);
1428 FETCH( func
, *inst
, 1, 1, chan_index
);
1429 emit_sub( func
, 0, 1 );
1430 STORE( func
, *inst
, 0, 0, chan_index
);
1434 case TGSI_OPCODE_LERP
:
1435 /* TGSI_OPCODE_LRP */
1436 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1437 FETCH( func
, *inst
, 0, 0, chan_index
);
1438 FETCH( func
, *inst
, 1, 1, chan_index
);
1439 FETCH( func
, *inst
, 2, 2, chan_index
);
1440 emit_sub( func
, 1, 2 );
1441 emit_mul( func
, 0, 1 );
1442 emit_add( func
, 0, 2 );
1443 STORE( func
, *inst
, 0, 0, chan_index
);
1447 case TGSI_OPCODE_CND
:
1451 case TGSI_OPCODE_CND0
:
1455 case TGSI_OPCODE_DOT2ADD
:
1456 /* TGSI_OPCODE_DP2A */
1460 case TGSI_OPCODE_INDEX
:
1464 case TGSI_OPCODE_NEGATE
:
1468 case TGSI_OPCODE_FRAC
:
1469 /* TGSI_OPCODE_FRC */
1470 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1471 FETCH( func
, *inst
, 0, 0, chan_index
);
1472 emit_frc( func
, 0 );
1473 STORE( func
, *inst
, 0, 0, chan_index
);
1477 case TGSI_OPCODE_CLAMP
:
1481 case TGSI_OPCODE_FLOOR
:
1482 /* TGSI_OPCODE_FLR */
1483 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1484 FETCH( func
, *inst
, 0, 0, chan_index
);
1485 emit_flr( func
, 0 );
1486 STORE( func
, *inst
, 0, 0, chan_index
);
1490 case TGSI_OPCODE_ROUND
:
1494 case TGSI_OPCODE_EXPBASE2
:
1495 /* TGSI_OPCODE_EX2 */
1496 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1497 emit_ex2( func
, 0 );
1498 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1499 STORE( func
, *inst
, 0, 0, chan_index
);
1503 case TGSI_OPCODE_LOGBASE2
:
1504 /* TGSI_OPCODE_LG2 */
1505 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1506 emit_lg2( func
, 0 );
1507 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1508 STORE( func
, *inst
, 0, 0, chan_index
);
1512 case TGSI_OPCODE_POWER
:
1513 /* TGSI_OPCODE_POW */
1514 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1515 FETCH( func
, *inst
, 1, 1, CHAN_X
);
1516 emit_pow( func
, 0, 1 );
1517 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1518 STORE( func
, *inst
, 0, 0, chan_index
);
1522 case TGSI_OPCODE_CROSSPRODUCT
:
1523 /* TGSI_OPCODE_XPD */
1524 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1525 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ) {
1526 FETCH( func
, *inst
, 1, 1, CHAN_Z
);
1527 FETCH( func
, *inst
, 3, 0, CHAN_Z
);
1529 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1530 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
1531 FETCH( func
, *inst
, 0, 0, CHAN_Y
);
1532 FETCH( func
, *inst
, 4, 1, CHAN_Y
);
1534 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) {
1535 emit_MOV( func
, 2, 0 );
1536 emit_mul( func
, 2, 1 );
1537 emit_MOV( func
, 5, 3 );
1538 emit_mul( func
, 5, 4 );
1539 emit_sub( func
, 2, 5 );
1540 STORE( func
, *inst
, 2, 0, CHAN_X
);
1542 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ||
1543 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
1544 FETCH( func
, *inst
, 2, 1, CHAN_X
);
1545 FETCH( func
, *inst
, 5, 0, CHAN_X
);
1547 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) {
1548 emit_mul( func
, 3, 2 );
1549 emit_mul( func
, 1, 5 );
1550 emit_sub( func
, 3, 1 );
1551 STORE( func
, *inst
, 3, 0, CHAN_Y
);
1553 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) {
1554 emit_mul( func
, 5, 4 );
1555 emit_mul( func
, 0, 2 );
1556 emit_sub( func
, 5, 0 );
1557 STORE( func
, *inst
, 5, 0, CHAN_Z
);
1559 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) {
1563 TGSI_EXEC_TEMP_ONE_I
,
1564 TGSI_EXEC_TEMP_ONE_C
);
1565 STORE( func
, *inst
, 0, 0, CHAN_W
);
1569 case TGSI_OPCODE_MULTIPLYMATRIX
:
1573 case TGSI_OPCODE_ABS
:
1574 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1575 FETCH( func
, *inst
, 0, 0, chan_index
);
1576 emit_abs( func
, 0) ;
1578 STORE( func
, *inst
, 0, 0, chan_index
);
1582 case TGSI_OPCODE_RCC
:
1586 case TGSI_OPCODE_DPH
:
1587 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1588 FETCH( func
, *inst
, 1, 1, CHAN_X
);
1589 emit_mul( func
, 0, 1 );
1590 FETCH( func
, *inst
, 1, 0, CHAN_Y
);
1591 FETCH( func
, *inst
, 2, 1, CHAN_Y
);
1592 emit_mul( func
, 1, 2 );
1593 emit_add( func
, 0, 1 );
1594 FETCH( func
, *inst
, 1, 0, CHAN_Z
);
1595 FETCH( func
, *inst
, 2, 1, CHAN_Z
);
1596 emit_mul( func
, 1, 2 );
1597 emit_add( func
, 0, 1 );
1598 FETCH( func
, *inst
, 1, 1, CHAN_W
);
1599 emit_add( func
, 0, 1 );
1600 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1601 STORE( func
, *inst
, 0, 0, chan_index
);
1605 case TGSI_OPCODE_COS
:
1606 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1607 emit_cos( func
, 0 );
1608 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1609 STORE( func
, *inst
, 0, 0, chan_index
);
1613 case TGSI_OPCODE_DDX
:
1617 case TGSI_OPCODE_DDY
:
1621 case TGSI_OPCODE_KILP
:
1622 /* predicated kill */
1624 return 0; /* XXX fix me */
1627 case TGSI_OPCODE_KIL
:
1628 /* conditional kill */
1629 emit_kil( func
, &inst
->FullSrcRegisters
[0] );
1632 case TGSI_OPCODE_PK2H
:
1636 case TGSI_OPCODE_PK2US
:
1640 case TGSI_OPCODE_PK4B
:
1644 case TGSI_OPCODE_PK4UB
:
1648 case TGSI_OPCODE_RFL
:
1652 case TGSI_OPCODE_SEQ
:
1656 case TGSI_OPCODE_SFL
:
1660 case TGSI_OPCODE_SGT
:
1664 case TGSI_OPCODE_SIN
:
1665 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1666 emit_sin( func
, 0 );
1667 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1668 STORE( func
, *inst
, 0, 0, chan_index
);
1672 case TGSI_OPCODE_SLE
:
1676 case TGSI_OPCODE_SNE
:
1680 case TGSI_OPCODE_STR
:
1684 case TGSI_OPCODE_TEX
:
1686 /* Disable dummy texture code:
1691 TGSI_EXEC_TEMP_ONE_I
,
1692 TGSI_EXEC_TEMP_ONE_C
);
1693 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1694 STORE( func
, *inst
, 0, 0, chan_index
);
1702 case TGSI_OPCODE_TXD
:
1706 case TGSI_OPCODE_UP2H
:
1710 case TGSI_OPCODE_UP2US
:
1714 case TGSI_OPCODE_UP4B
:
1718 case TGSI_OPCODE_UP4UB
:
1722 case TGSI_OPCODE_X2D
:
1726 case TGSI_OPCODE_ARA
:
1730 case TGSI_OPCODE_ARR
:
1734 case TGSI_OPCODE_BRA
:
1738 case TGSI_OPCODE_CAL
:
1742 case TGSI_OPCODE_RET
:
1746 case TGSI_OPCODE_END
:
1749 case TGSI_OPCODE_SSG
:
1753 case TGSI_OPCODE_CMP
:
1754 emit_cmp (func
, inst
);
1757 case TGSI_OPCODE_SCS
:
1758 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) {
1759 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1760 emit_cos( func
, 0 );
1761 STORE( func
, *inst
, 0, 0, CHAN_X
);
1763 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) {
1764 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1765 emit_sin( func
, 0 );
1766 STORE( func
, *inst
, 0, 0, CHAN_Y
);
1768 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) {
1772 TGSI_EXEC_TEMP_00000000_I
,
1773 TGSI_EXEC_TEMP_00000000_C
);
1774 STORE( func
, *inst
, 0, 0, CHAN_Z
);
1776 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) {
1780 TGSI_EXEC_TEMP_ONE_I
,
1781 TGSI_EXEC_TEMP_ONE_C
);
1782 STORE( func
, *inst
, 0, 0, CHAN_W
);
1786 case TGSI_OPCODE_TXB
:
1790 case TGSI_OPCODE_NRM
:
1794 case TGSI_OPCODE_DIV
:
1798 case TGSI_OPCODE_DP2
:
1802 case TGSI_OPCODE_TXL
:
1806 case TGSI_OPCODE_BRK
:
1810 case TGSI_OPCODE_IF
:
1814 case TGSI_OPCODE_LOOP
:
1818 case TGSI_OPCODE_REP
:
1822 case TGSI_OPCODE_ELSE
:
1826 case TGSI_OPCODE_ENDIF
:
1830 case TGSI_OPCODE_ENDLOOP
:
1834 case TGSI_OPCODE_ENDREP
:
1838 case TGSI_OPCODE_PUSHA
:
1842 case TGSI_OPCODE_POPA
:
1846 case TGSI_OPCODE_CEIL
:
1850 case TGSI_OPCODE_I2F
:
1854 case TGSI_OPCODE_NOT
:
1858 case TGSI_OPCODE_TRUNC
:
1862 case TGSI_OPCODE_SHL
:
1866 case TGSI_OPCODE_SHR
:
1870 case TGSI_OPCODE_AND
:
1874 case TGSI_OPCODE_OR
:
1878 case TGSI_OPCODE_MOD
:
1882 case TGSI_OPCODE_XOR
:
1886 case TGSI_OPCODE_SAD
:
1890 case TGSI_OPCODE_TXF
:
1894 case TGSI_OPCODE_TXQ
:
1898 case TGSI_OPCODE_CONT
:
1902 case TGSI_OPCODE_EMIT
:
1906 case TGSI_OPCODE_ENDPRIM
:
1919 struct x86_function
*func
,
1920 struct tgsi_full_declaration
*decl
)
1922 if( decl
->Declaration
.File
== TGSI_FILE_INPUT
) {
1923 unsigned first
, last
, mask
;
1926 first
= decl
->DeclarationRange
.First
;
1927 last
= decl
->DeclarationRange
.Last
;
1928 mask
= decl
->Declaration
.UsageMask
;
1930 for( i
= first
; i
<= last
; i
++ ) {
1931 for( j
= 0; j
< NUM_CHANNELS
; j
++ ) {
1932 if( mask
& (1 << j
) ) {
1933 switch( decl
->Declaration
.Interpolate
) {
1934 case TGSI_INTERPOLATE_CONSTANT
:
1935 emit_coef_a0( func
, 0, i
, j
);
1936 emit_inputs( func
, 0, i
, j
);
1939 case TGSI_INTERPOLATE_LINEAR
:
1940 emit_tempf( func
, 0, 0, TGSI_SWIZZLE_X
);
1941 emit_coef_dadx( func
, 1, i
, j
);
1942 emit_tempf( func
, 2, 0, TGSI_SWIZZLE_Y
);
1943 emit_coef_dady( func
, 3, i
, j
);
1944 emit_mul( func
, 0, 1 ); /* x * dadx */
1945 emit_coef_a0( func
, 4, i
, j
);
1946 emit_mul( func
, 2, 3 ); /* y * dady */
1947 emit_add( func
, 0, 4 ); /* x * dadx + a0 */
1948 emit_add( func
, 0, 2 ); /* x * dadx + y * dady + a0 */
1949 emit_inputs( func
, 0, i
, j
);
1952 case TGSI_INTERPOLATE_PERSPECTIVE
:
1953 emit_tempf( func
, 0, 0, TGSI_SWIZZLE_X
);
1954 emit_coef_dadx( func
, 1, i
, j
);
1955 emit_tempf( func
, 2, 0, TGSI_SWIZZLE_Y
);
1956 emit_coef_dady( func
, 3, i
, j
);
1957 emit_mul( func
, 0, 1 ); /* x * dadx */
1958 emit_tempf( func
, 4, 0, TGSI_SWIZZLE_W
);
1959 emit_coef_a0( func
, 5, i
, j
);
1960 emit_rcp( func
, 4, 4 ); /* 1.0 / w */
1961 emit_mul( func
, 2, 3 ); /* y * dady */
1962 emit_add( func
, 0, 5 ); /* x * dadx + a0 */
1963 emit_add( func
, 0, 2 ); /* x * dadx + y * dady + a0 */
1964 emit_mul( func
, 0, 4 ); /* (x * dadx + y * dady + a0) / w */
1965 emit_inputs( func
, 0, i
, j
);
1978 static void aos_to_soa( struct x86_function
*func
,
1984 struct x86_reg soa_input
= x86_make_reg( file_REG32
, reg_AX
);
1985 struct x86_reg aos_input
= x86_make_reg( file_REG32
, reg_BX
);
1986 struct x86_reg num_inputs
= x86_make_reg( file_REG32
, reg_CX
);
1987 struct x86_reg stride
= x86_make_reg( file_REG32
, reg_DX
);
1992 x86_push( func
, x86_make_reg( file_REG32
, reg_BX
) );
1994 x86_mov( func
, aos_input
, x86_fn_arg( func
, arg_aos
) );
1995 x86_mov( func
, soa_input
, x86_fn_arg( func
, arg_soa
) );
1996 x86_mov( func
, num_inputs
, x86_fn_arg( func
, arg_num
) );
1997 x86_mov( func
, stride
, x86_fn_arg( func
, arg_stride
) );
2000 inner_loop
= x86_get_label( func
);
2002 x86_push( func
, aos_input
);
2003 sse_movlps( func
, make_xmm( 0 ), x86_make_disp( aos_input
, 0 ) );
2004 sse_movlps( func
, make_xmm( 3 ), x86_make_disp( aos_input
, 8 ) );
2005 x86_add( func
, aos_input
, stride
);
2006 sse_movhps( func
, make_xmm( 0 ), x86_make_disp( aos_input
, 0 ) );
2007 sse_movhps( func
, make_xmm( 3 ), x86_make_disp( aos_input
, 8 ) );
2008 x86_add( func
, aos_input
, stride
);
2009 sse_movlps( func
, make_xmm( 1 ), x86_make_disp( aos_input
, 0 ) );
2010 sse_movlps( func
, make_xmm( 4 ), x86_make_disp( aos_input
, 8 ) );
2011 x86_add( func
, aos_input
, stride
);
2012 sse_movhps( func
, make_xmm( 1 ), x86_make_disp( aos_input
, 0 ) );
2013 sse_movhps( func
, make_xmm( 4 ), x86_make_disp( aos_input
, 8 ) );
2014 x86_pop( func
, aos_input
);
2016 sse_movaps( func
, make_xmm( 2 ), make_xmm( 0 ) );
2017 sse_movaps( func
, make_xmm( 5 ), make_xmm( 3 ) );
2018 sse_shufps( func
, make_xmm( 0 ), make_xmm( 1 ), 0x88 );
2019 sse_shufps( func
, make_xmm( 2 ), make_xmm( 1 ), 0xdd );
2020 sse_shufps( func
, make_xmm( 3 ), make_xmm( 4 ), 0x88 );
2021 sse_shufps( func
, make_xmm( 5 ), make_xmm( 4 ), 0xdd );
2023 sse_movups( func
, x86_make_disp( soa_input
, 0 ), make_xmm( 0 ) );
2024 sse_movups( func
, x86_make_disp( soa_input
, 16 ), make_xmm( 2 ) );
2025 sse_movups( func
, x86_make_disp( soa_input
, 32 ), make_xmm( 3 ) );
2026 sse_movups( func
, x86_make_disp( soa_input
, 48 ), make_xmm( 5 ) );
2028 /* Advance to next input */
2029 x86_lea( func
, aos_input
, x86_make_disp(aos_input
, 16) );
2030 x86_lea( func
, soa_input
, x86_make_disp(soa_input
, 64) );
2032 /* while --num_inputs */
2033 x86_dec( func
, num_inputs
);
2034 x86_jcc( func
, cc_NE
, inner_loop
);
2037 x86_pop( func
, aos_input
);
2040 static void soa_to_aos( struct x86_function
*func
, uint aos
, uint soa
, uint num
, uint stride
)
2042 struct x86_reg soa_output
;
2043 struct x86_reg aos_output
;
2044 struct x86_reg num_outputs
;
2045 struct x86_reg temp
;
2048 soa_output
= x86_make_reg( file_REG32
, reg_AX
);
2049 aos_output
= x86_make_reg( file_REG32
, reg_BX
);
2050 num_outputs
= x86_make_reg( file_REG32
, reg_CX
);
2051 temp
= x86_make_reg( file_REG32
, reg_DX
);
2054 x86_push( func
, aos_output
);
2056 x86_mov( func
, soa_output
, x86_fn_arg( func
, soa
) );
2057 x86_mov( func
, aos_output
, x86_fn_arg( func
, aos
) );
2058 x86_mov( func
, num_outputs
, x86_fn_arg( func
, num
) );
2061 inner_loop
= x86_get_label( func
);
2063 sse_movups( func
, make_xmm( 0 ), x86_make_disp( soa_output
, 0 ) );
2064 sse_movups( func
, make_xmm( 1 ), x86_make_disp( soa_output
, 16 ) );
2065 sse_movups( func
, make_xmm( 3 ), x86_make_disp( soa_output
, 32 ) );
2066 sse_movups( func
, make_xmm( 4 ), x86_make_disp( soa_output
, 48 ) );
2068 sse_movaps( func
, make_xmm( 2 ), make_xmm( 0 ) );
2069 sse_movaps( func
, make_xmm( 5 ), make_xmm( 3 ) );
2070 sse_unpcklps( func
, make_xmm( 0 ), make_xmm( 1 ) );
2071 sse_unpckhps( func
, make_xmm( 2 ), make_xmm( 1 ) );
2072 sse_unpcklps( func
, make_xmm( 3 ), make_xmm( 4 ) );
2073 sse_unpckhps( func
, make_xmm( 5 ), make_xmm( 4 ) );
2075 x86_mov( func
, temp
, x86_fn_arg( func
, stride
) );
2076 x86_push( func
, aos_output
);
2077 sse_movlps( func
, x86_make_disp( aos_output
, 0 ), make_xmm( 0 ) );
2078 sse_movlps( func
, x86_make_disp( aos_output
, 8 ), make_xmm( 3 ) );
2079 x86_add( func
, aos_output
, temp
);
2080 sse_movhps( func
, x86_make_disp( aos_output
, 0 ), make_xmm( 0 ) );
2081 sse_movhps( func
, x86_make_disp( aos_output
, 8 ), make_xmm( 3 ) );
2082 x86_add( func
, aos_output
, temp
);
2083 sse_movlps( func
, x86_make_disp( aos_output
, 0 ), make_xmm( 2 ) );
2084 sse_movlps( func
, x86_make_disp( aos_output
, 8 ), make_xmm( 5 ) );
2085 x86_add( func
, aos_output
, temp
);
2086 sse_movhps( func
, x86_make_disp( aos_output
, 0 ), make_xmm( 2 ) );
2087 sse_movhps( func
, x86_make_disp( aos_output
, 8 ), make_xmm( 5 ) );
2088 x86_pop( func
, aos_output
);
2090 /* Advance to next output */
2091 x86_lea( func
, aos_output
, x86_make_disp(aos_output
, 16) );
2092 x86_lea( func
, soa_output
, x86_make_disp(soa_output
, 64) );
2094 /* while --num_outputs */
2095 x86_dec( func
, num_outputs
);
2096 x86_jcc( func
, cc_NE
, inner_loop
);
2099 x86_pop( func
, aos_output
);
/**
 * Translate a TGSI vertex/fragment shader to SSE2 code.
 * Slightly different things are done for vertex vs. fragment shaders.
 *
 * Note that fragment shaders are responsible for interpolating shader
 * inputs. Because on x86 we have only 4 GP registers, and here we
 * have 5 shader arguments (input, output, const, temp and coef), the
 * code is split into two phases -- DECLARATION and INSTRUCTION phase.
 * The GP register holding the output argument is aliased with the coeff
 * argument, as outputs are not needed in the DECLARATION phase.
 *
 * \param tokens  the TGSI input shader
 * \param func  the output SSE code/function
 * \param immediates  buffer to place immediates, later passed to SSE func
 * \return  1 for success, 0 if translation failed
 */
/* NOTE(review): this region was garbled in extraction -- statement
 * fragments carry fused original line numbers, and several lines are
 * missing entirely (including the function-name line, the x86_mov
 * destination operands, and the register save/restore pushes).  The
 * comments below annotate the visible structure; verify every detail
 * against the upstream file before relying on it.
 */
/* Parameter list of the translator entry point (presumably
 * tgsi_emit_sse2 -- the name line was dropped by extraction; TODO
 * confirm).
 */
2120 const struct tgsi_token
*tokens
,
2121 struct x86_function
*func
,
2122 float (*immediates
)[4],
2123 boolean do_swizzles
)
/* Parser state plus DECLARATION/INSTRUCTION phase bookkeeping. */
2125 struct tgsi_parse_context parse
;
2126 boolean instruction_phase
= FALSE
;
2128 uint num_immediates
= 0;
/* Reset the code-store cursor so emission starts at the beginning. */
2130 func
->csr
= func
->store
;
2132 tgsi_parse_init( &parse
, tokens
);
2134 /* Can't just use EDI, EBX without save/restoring them:
2138 get_immediate_base() );
2146 * Different function args for vertex/fragment shaders:
/* Fragment path: load input/const/temp/coef/immediate base registers
 * from fn args 1,3,4,5,6; the output arg (2) is deliberately skipped
 * until the INSTRUCTION phase (register aliased with coeff).
 * NOTE(review): the x86_mov destination operands were dropped by
 * extraction here -- only the x86_fn_arg sources remain visible.
 */
2148 if (parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_FRAGMENT
) {
2149 /* DECLARATION phase, do not load output argument. */
2153 x86_fn_arg( func
, 1 ) );
2154 /* skipping outputs argument here */
2158 x86_fn_arg( func
, 3 ) );
2162 x86_fn_arg( func
, 4 ) );
2166 x86_fn_arg( func
, 5 ) );
2169 get_immediate_base(),
2170 x86_fn_arg( func
, 6 ) );
/* Vertex path: optionally swizzle AoS inputs into SoA form first,
 * then load all five base registers from fn args 1..5.
 */
2173 assert(parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_VERTEX
);
2178 1, /* machine->input */
2180 8 ); /* input_stride */
2185 x86_fn_arg( func
, 1 ) );
2189 x86_fn_arg( func
, 2 ) );
2193 x86_fn_arg( func
, 3 ) );
2197 x86_fn_arg( func
, 4 ) );
2200 get_immediate_base(),
2201 x86_fn_arg( func
, 5 ) );
/* Main token loop: translate each declaration / instruction /
 * immediate until the token stream ends or a translation fails.
 */
2204 while( !tgsi_parse_end_of_tokens( &parse
) && ok
) {
2205 tgsi_parse_token( &parse
);
2207 switch( parse
.FullToken
.Token
.Type
) {
/* Declarations only generate code for fragment shaders (input
 * interpolation setup).
 */
2208 case TGSI_TOKEN_TYPE_DECLARATION
:
2209 if (parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_FRAGMENT
) {
2212 &parse
.FullToken
.FullDeclaration
);
/* First instruction of a fragment shader flips into the
 * INSTRUCTION phase: the coeff register is overwritten with the
 * output pointer (fn arg 2), since coefficients are no longer needed.
 */
2216 case TGSI_TOKEN_TYPE_INSTRUCTION
:
2217 if (parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_FRAGMENT
) {
2218 if( !instruction_phase
) {
2219 /* INSTRUCTION phase, overwrite coeff with output. */
2220 instruction_phase
= TRUE
;
2224 x86_fn_arg( func
, 2 ) );
/* emit_instruction returns false on unsupported opcodes; the
 * failure is logged with the opcode and shader kind.
 */
2228 ok
= emit_instruction(
2230 &parse
.FullToken
.FullInstruction
);
2233 debug_printf("failed to translate tgsi opcode %d to SSE (%s)\n",
2234 parse
.FullToken
.FullInstruction
.Instruction
.Opcode
,
2235 parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_VERTEX
?
2236 "vertex shader" : "fragment shader");
/* Immediates are copied verbatim into the caller-supplied
 * immediates[] buffer; Size includes the header token, hence the -1.
 */
2240 case TGSI_TOKEN_TYPE_IMMEDIATE
:
2241 /* simply copy the immediate values into the next immediates[] slot */
2243 const uint size
= parse
.FullToken
.FullImmediate
.Immediate
.Size
- 1;
2246 assert(num_immediates
< TGSI_EXEC_NUM_IMMEDIATES
);
2247 for( i
= 0; i
< size
; i
++ ) {
2248 immediates
[num_immediates
][i
] =
2249 parse
.FullToken
.FullImmediate
.u
.ImmediateFloat32
[i
].Float
;
/* Debug dump of the immediate just stored (presumably inside a
 * disabled/debug-only section -- TODO confirm).
 */
2252 debug_printf("SSE FS immediate[%d] = %f %f %f %f\n",
2254 immediates
[num_immediates
][0],
2255 immediates
[num_immediates
][1],
2256 immediates
[num_immediates
][2],
2257 immediates
[num_immediates
][3]);
/* Vertex shaders: convert the SoA results back to AoS before
 * returning (arg indices 9,2,10,11 -- offsets reflect pushed regs;
 * verify against caller).
 */
2269 if (parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_VERTEX
) {
2271 soa_to_aos( func
, 9, 2, 10, 11 );
/* Epilogue: restore saved registers and release the parser. */
2274 /* Can't just use EBX, EDI without save/restoring them:
2282 get_immediate_base() );
2286 tgsi_parse_free( &parse
);
2291 #endif /* PIPE_ARCH_X86 */