1 /**************************************************************************
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **************************************************************************/
28 #include "pipe/p_util.h"
29 #include "pipe/p_shader_tokens.h"
30 #include "tgsi/tgsi_parse.h"
31 #include "tgsi/tgsi_util.h"
32 #include "tgsi_exec.h"
33 #include "tgsi_sse2.h"
35 #include "rtasm/rtasm_x86sse.h"
41 * This costs about 100fps (close to 10%) in gears:
43 #define HIGH_PRECISION 1
46 #define FOR_EACH_CHANNEL( CHAN )\
47 for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)
49 #define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
50 ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))
52 #define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
53 if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))
55 #define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
56 FOR_EACH_CHANNEL( CHAN )\
57 IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )
64 #define TEMP_R0 TGSI_EXEC_TEMP_R0
65 #define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
68 * X86 utility functions.
77 (enum x86_reg_name
) xmm
);
81 * X86 register mapping helpers.
85 get_const_base( void )
93 get_input_base( void )
100 static struct x86_reg
101 get_output_base( void )
108 static struct x86_reg
109 get_temp_base( void )
116 static struct x86_reg
117 get_coef_base( void )
119 return get_output_base();
122 static struct x86_reg
123 get_immediate_base( void )
132 * Data access helpers.
136 static struct x86_reg
141 return x86_make_disp(
142 get_immediate_base(),
143 (vec
* 4 + chan
) * 4 );
146 static struct x86_reg
151 return x86_make_disp(
153 (vec
* 4 + chan
) * 4 );
156 static struct x86_reg
161 return x86_make_disp(
163 (vec
* 4 + chan
) * 16 );
166 static struct x86_reg
171 return x86_make_disp(
173 (vec
* 4 + chan
) * 16 );
176 static struct x86_reg
181 return x86_make_disp(
183 (vec
* 4 + chan
) * 16 );
186 static struct x86_reg
192 return x86_make_disp(
194 ((vec
* 3 + member
) * 4 + chan
) * 4 );
200 struct x86_function
*func
)
207 * Data fetch helpers.
211 * Copy a shader constant to xmm register
212 * \param xmm the destination xmm register
213 * \param vec the src const buffer index
214 * \param chan src channel to fetch (X, Y, Z or W)
218 struct x86_function
*func
,
227 struct x86_reg r0
= get_input_base();
228 struct x86_reg r1
= get_output_base();
231 assert( indirectFile
== TGSI_FILE_ADDRESS
);
232 assert( indirectIndex
== 0 );
234 x86_push( func
, r0
);
235 x86_push( func
, r1
);
237 for (i
= 0; i
< QUAD_SIZE
; i
++) {
238 x86_lea( func
, r0
, get_const( vec
, chan
) );
239 x86_mov( func
, r1
, x86_make_disp( get_temp( TEMP_ADDR
, CHAN_X
), i
* 4 ) );
241 /* Quick hack to multiply by 16 -- need to add SHL to rtasm.
243 x86_add( func
, r1
, r1
);
244 x86_add( func
, r1
, r1
);
245 x86_add( func
, r1
, r1
);
246 x86_add( func
, r1
, r1
);
248 x86_add( func
, r0
, r1
);
249 x86_mov( func
, r1
, x86_deref( r0
) );
250 x86_mov( func
, x86_make_disp( get_temp( TEMP_R0
, CHAN_X
), i
* 4 ), r1
);
259 get_temp( TEMP_R0
, CHAN_X
) );
267 get_const( vec
, chan
) );
272 SHUF( 0, 0, 0, 0 ) );
278 struct x86_function
*func
,
286 get_immediate( vec
, chan
) );
291 SHUF( 0, 0, 0, 0 ) );
296 * Copy a shader input to xmm register
297 * \param xmm the destination xmm register
298 * \param vec the src input attrib
299 * \param chan src channel to fetch (X, Y, Z or W)
303 struct x86_function
*func
,
311 get_input( vec
, chan
) );
315 * Store an xmm register to a shader output
316 * \param xmm the source xmm register
317 * \param vec the dest output attrib
318 * \param chan dest channel to store (X, Y, Z or W)
322 struct x86_function
*func
,
329 get_output( vec
, chan
),
334 * Copy a shader temporary to xmm register
335 * \param xmm the destination xmm register
336 * \param vec the src temp register
337 * \param chan src channel to fetch (X, Y, Z or W)
341 struct x86_function
*func
,
349 get_temp( vec
, chan
) );
353 * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
354 * \param xmm the destination xmm register
355 * \param vec the src input/attribute coefficient index
356 * \param chan src channel to fetch (X, Y, Z or W)
357 * \param member 0=a0, 1=dadx, 2=dady
361 struct x86_function
*func
,
370 get_coef( vec
, chan
, member
) );
375 SHUF( 0, 0, 0, 0 ) );
379 * Data store helpers.
384 struct x86_function
*func
,
391 get_input( vec
, chan
),
397 struct x86_function
*func
,
404 get_temp( vec
, chan
),
410 struct x86_function
*func
,
420 vec
+ TGSI_EXEC_TEMP_ADDR
,
425 * Coefficient fetch helpers.
430 struct x86_function
*func
,
445 struct x86_function
*func
,
460 struct x86_function
*func
,
474 * Function call helpers.
479 struct x86_function
*func
)
483 x86_make_reg( file_REG32
, reg_AX
) );
486 x86_make_reg( file_REG32
, reg_CX
) );
489 x86_make_reg( file_REG32
, reg_DX
) );
494 struct x86_function
*func
)
496 /* Restore GP registers in a reverse order.
500 x86_make_reg( file_REG32
, reg_DX
) );
503 x86_make_reg( file_REG32
, reg_CX
) );
506 x86_make_reg( file_REG32
, reg_AX
) );
511 struct x86_function
*func
,
513 void (PIPE_CDECL
*code
)() )
517 get_temp( TEMP_R0
, 0 ),
518 make_xmm( xmm_dst
) );
524 struct x86_reg ecx
= x86_make_reg( file_REG32
, reg_CX
);
529 get_temp( TEMP_R0
, 0 ) );
531 x86_push( func
, ecx
);
532 x86_mov_reg_imm( func
, ecx
, (unsigned long) code
);
533 x86_call( func
, ecx
);
544 get_temp( TEMP_R0
, 0 ) );
548 emit_func_call_dst_src(
549 struct x86_function
*func
,
552 void (PIPE_CDECL
*code
)() )
556 get_temp( TEMP_R0
, 1 ),
557 make_xmm( xmm_src
) );
566 * Low-level instruction translators.
571 struct x86_function
*func
,
578 TGSI_EXEC_TEMP_7FFFFFFF_I
,
579 TGSI_EXEC_TEMP_7FFFFFFF_C
) );
584 struct x86_function
*func
,
591 make_xmm( xmm_src
) );
594 static void PIPE_CDECL
598 const unsigned X
= 0;
600 store
[X
+ 0] = cosf( store
[X
+ 0] );
601 store
[X
+ 1] = cosf( store
[X
+ 1] );
602 store
[X
+ 2] = cosf( store
[X
+ 2] );
603 store
[X
+ 3] = cosf( store
[X
+ 3] );
608 struct x86_function
*func
,
617 static void PIPE_CDECL
621 const unsigned X
= 0;
623 store
[X
+ 0] = powf( 2.0f
, store
[X
+ 0] );
624 store
[X
+ 1] = powf( 2.0f
, store
[X
+ 1] );
625 store
[X
+ 2] = powf( 2.0f
, store
[X
+ 2] );
626 store
[X
+ 3] = powf( 2.0f
, store
[X
+ 3] );
631 struct x86_function
*func
,
642 struct x86_function
*func
,
651 static void PIPE_CDECL
655 const unsigned X
= 0;
657 store
[X
+ 0] = floorf( store
[X
+ 0] );
658 store
[X
+ 1] = floorf( store
[X
+ 1] );
659 store
[X
+ 2] = floorf( store
[X
+ 2] );
660 store
[X
+ 3] = floorf( store
[X
+ 3] );
665 struct x86_function
*func
,
674 static void PIPE_CDECL
678 const unsigned X
= 0;
680 store
[X
+ 0] -= floorf( store
[X
+ 0] );
681 store
[X
+ 1] -= floorf( store
[X
+ 1] );
682 store
[X
+ 2] -= floorf( store
[X
+ 2] );
683 store
[X
+ 3] -= floorf( store
[X
+ 3] );
688 struct x86_function
*func
,
697 static void PIPE_CDECL
701 const unsigned X
= 0;
703 store
[X
+ 0] = LOG2( store
[X
+ 0] );
704 store
[X
+ 1] = LOG2( store
[X
+ 1] );
705 store
[X
+ 2] = LOG2( store
[X
+ 2] );
706 store
[X
+ 3] = LOG2( store
[X
+ 3] );
711 struct x86_function
*func
,
722 struct x86_function
*func
,
729 make_xmm( xmm_src
) );
733 emit_mul (struct x86_function
*func
,
740 make_xmm( xmm_src
) );
745 struct x86_function
*func
,
752 TGSI_EXEC_TEMP_80000000_I
,
753 TGSI_EXEC_TEMP_80000000_C
) );
756 static void PIPE_CDECL
760 const unsigned X
= 0;
762 store
[X
+ 0] = powf( store
[X
+ 0], store
[X
+ 4] );
763 store
[X
+ 1] = powf( store
[X
+ 1], store
[X
+ 5] );
764 store
[X
+ 2] = powf( store
[X
+ 2], store
[X
+ 6] );
765 store
[X
+ 3] = powf( store
[X
+ 3], store
[X
+ 7] );
770 struct x86_function
*func
,
774 emit_func_call_dst_src(
783 struct x86_function
*func
,
787 /* On Intel CPUs at least, this is only accurate to 12 bits -- not
788 * good enough. Need to either emit a proper divide or use the
789 * iterative technique described below in emit_rsqrt().
794 make_xmm( xmm_src
) );
799 struct x86_function
*func
,
804 /* Although rsqrtps() and rcpps() are low precision on some/all SSE
805 * implementations, it is possible to improve its precision at
806 * fairly low cost, using a newton/raphson step, as below:
808 * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
809 * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
811 * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
814 struct x86_reg dst
= make_xmm( xmm_dst
);
815 struct x86_reg src
= make_xmm( xmm_src
);
816 struct x86_reg tmp0
= make_xmm( 2 );
817 struct x86_reg tmp1
= make_xmm( 3 );
819 assert( xmm_dst
!= xmm_src
);
820 assert( xmm_dst
!= 2 && xmm_dst
!= 3 );
821 assert( xmm_src
!= 2 && xmm_src
!= 3 );
823 sse_movaps( func
, dst
, get_temp( TGSI_EXEC_TEMP_HALF_I
, TGSI_EXEC_TEMP_HALF_C
) );
824 sse_movaps( func
, tmp0
, get_temp( TGSI_EXEC_TEMP_THREE_I
, TGSI_EXEC_TEMP_THREE_C
) );
825 sse_rsqrtps( func
, tmp1
, src
);
826 sse_mulps( func
, src
, tmp1
);
827 sse_mulps( func
, dst
, tmp1
);
828 sse_mulps( func
, src
, tmp1
);
829 sse_subps( func
, tmp0
, src
);
830 sse_mulps( func
, dst
, tmp0
);
833 /* On Intel CPUs at least, this is only accurate to 12 bits -- not
839 make_xmm( xmm_src
) );
845 struct x86_function
*func
,
852 TGSI_EXEC_TEMP_80000000_I
,
853 TGSI_EXEC_TEMP_80000000_C
) );
856 static void PIPE_CDECL
860 const unsigned X
= 0;
862 store
[X
+ 0] = sinf( store
[X
+ 0] );
863 store
[X
+ 1] = sinf( store
[X
+ 1] );
864 store
[X
+ 2] = sinf( store
[X
+ 2] );
865 store
[X
+ 3] = sinf( store
[X
+ 3] );
869 emit_sin (struct x86_function
*func
,
880 struct x86_function
*func
,
887 make_xmm( xmm_src
) );
896 struct x86_function
*func
,
898 const struct tgsi_full_src_register
*reg
,
899 const unsigned chan_index
)
901 unsigned swizzle
= tgsi_util_get_full_src_register_extswizzle( reg
, chan_index
);
904 case TGSI_EXTSWIZZLE_X
:
905 case TGSI_EXTSWIZZLE_Y
:
906 case TGSI_EXTSWIZZLE_Z
:
907 case TGSI_EXTSWIZZLE_W
:
908 switch (reg
->SrcRegister
.File
) {
909 case TGSI_FILE_CONSTANT
:
913 reg
->SrcRegister
.Index
,
915 reg
->SrcRegister
.Indirect
,
916 reg
->SrcRegisterInd
.File
,
917 reg
->SrcRegisterInd
.Index
);
920 case TGSI_FILE_IMMEDIATE
:
924 reg
->SrcRegister
.Index
,
928 case TGSI_FILE_INPUT
:
932 reg
->SrcRegister
.Index
,
936 case TGSI_FILE_TEMPORARY
:
940 reg
->SrcRegister
.Index
,
949 case TGSI_EXTSWIZZLE_ZERO
:
953 TGSI_EXEC_TEMP_00000000_I
,
954 TGSI_EXEC_TEMP_00000000_C
);
957 case TGSI_EXTSWIZZLE_ONE
:
961 TGSI_EXEC_TEMP_ONE_I
,
962 TGSI_EXEC_TEMP_ONE_C
);
969 switch( tgsi_util_get_full_src_register_sign_mode( reg
, chan_index
) ) {
970 case TGSI_UTIL_SIGN_CLEAR
:
971 emit_abs( func
, xmm
);
974 case TGSI_UTIL_SIGN_SET
:
975 emit_setsign( func
, xmm
);
978 case TGSI_UTIL_SIGN_TOGGLE
:
979 emit_neg( func
, xmm
);
982 case TGSI_UTIL_SIGN_KEEP
:
987 #define FETCH( FUNC, INST, XMM, INDEX, CHAN )\
988 emit_fetch( FUNC, XMM, &(INST).FullSrcRegisters[INDEX], CHAN )
996 struct x86_function
*func
,
998 const struct tgsi_full_dst_register
*reg
,
999 const struct tgsi_full_instruction
*inst
,
1000 unsigned chan_index
)
1002 switch( reg
->DstRegister
.File
) {
1003 case TGSI_FILE_OUTPUT
:
1007 reg
->DstRegister
.Index
,
1011 case TGSI_FILE_TEMPORARY
:
1015 reg
->DstRegister
.Index
,
1019 case TGSI_FILE_ADDRESS
:
1023 reg
->DstRegister
.Index
,
1031 switch( inst
->Instruction
.Saturate
) {
1035 case TGSI_SAT_ZERO_ONE
:
1039 case TGSI_SAT_MINUS_PLUS_ONE
:
1045 #define STORE( FUNC, INST, XMM, INDEX, CHAN )\
1046 emit_store( FUNC, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN )
1049 * High-level instruction translators.
1054 struct x86_function
*func
,
1055 const struct tgsi_full_src_register
*reg
)
1057 unsigned uniquemask
;
1058 unsigned registers
[4];
1059 unsigned nextregister
= 0;
1060 unsigned firstchan
= ~0;
1061 unsigned chan_index
;
1063 /* This mask stores component bits that were already tested. Note that
1064 * we test if the value is less than zero, so 1.0 and 0.0 need not to be
1066 uniquemask
= (1 << TGSI_EXTSWIZZLE_ZERO
) | (1 << TGSI_EXTSWIZZLE_ONE
);
1068 FOR_EACH_CHANNEL( chan_index
) {
1071 /* unswizzle channel */
1072 swizzle
= tgsi_util_get_full_src_register_extswizzle(
1076 /* check if the component has not been already tested */
1077 if( !(uniquemask
& (1 << swizzle
)) ) {
1078 uniquemask
|= 1 << swizzle
;
1080 /* allocate register */
1081 registers
[chan_index
] = nextregister
;
1089 /* mark the first channel used */
1090 if( firstchan
== ~0 ) {
1091 firstchan
= chan_index
;
1098 x86_make_reg( file_REG32
, reg_AX
) );
1101 x86_make_reg( file_REG32
, reg_DX
) );
1103 FOR_EACH_CHANNEL( chan_index
) {
1104 if( uniquemask
& (1 << chan_index
) ) {
1107 make_xmm( registers
[chan_index
] ),
1109 TGSI_EXEC_TEMP_00000000_I
,
1110 TGSI_EXEC_TEMP_00000000_C
),
1113 if( chan_index
== firstchan
) {
1116 x86_make_reg( file_REG32
, reg_AX
),
1117 make_xmm( registers
[chan_index
] ) );
1122 x86_make_reg( file_REG32
, reg_DX
),
1123 make_xmm( registers
[chan_index
] ) );
1126 x86_make_reg( file_REG32
, reg_AX
),
1127 x86_make_reg( file_REG32
, reg_DX
) );
1135 TGSI_EXEC_TEMP_KILMASK_I
,
1136 TGSI_EXEC_TEMP_KILMASK_C
),
1137 x86_make_reg( file_REG32
, reg_AX
) );
1141 x86_make_reg( file_REG32
, reg_DX
) );
1144 x86_make_reg( file_REG32
, reg_AX
) );
1150 struct x86_function
*func
)
1152 /* XXX todo / fix me */
1158 struct x86_function
*func
,
1159 struct tgsi_full_instruction
*inst
,
1162 unsigned chan_index
;
1164 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1165 FETCH( func
, *inst
, 0, 0, chan_index
);
1166 FETCH( func
, *inst
, 1, 1, chan_index
);
1176 TGSI_EXEC_TEMP_ONE_I
,
1177 TGSI_EXEC_TEMP_ONE_C
) );
1178 STORE( func
, *inst
, 0, 0, chan_index
);
1184 struct x86_function
*func
,
1185 struct tgsi_full_instruction
*inst
)
1187 unsigned chan_index
;
1189 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1190 FETCH( func
, *inst
, 0, 0, chan_index
);
1191 FETCH( func
, *inst
, 1, 1, chan_index
);
1192 FETCH( func
, *inst
, 2, 2, chan_index
);
1197 TGSI_EXEC_TEMP_00000000_I
,
1198 TGSI_EXEC_TEMP_00000000_C
),
1212 STORE( func
, *inst
, 0, 0, chan_index
);
1218 struct x86_function
*func
,
1219 struct tgsi_full_instruction
*inst
)
1221 unsigned chan_index
;
1223 switch (inst
->Instruction
.Opcode
) {
1224 case TGSI_OPCODE_ARL
:
1225 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1226 FETCH( func
, *inst
, 0, 0, chan_index
);
1227 emit_f2it( func
, 0 );
1228 STORE( func
, *inst
, 0, 0, chan_index
);
1232 case TGSI_OPCODE_MOV
:
1233 case TGSI_OPCODE_SWZ
:
1234 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1235 FETCH( func
, *inst
, 0, 0, chan_index
);
1236 STORE( func
, *inst
, 0, 0, chan_index
);
1240 case TGSI_OPCODE_LIT
:
1241 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1242 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) ) {
1246 TGSI_EXEC_TEMP_ONE_I
,
1247 TGSI_EXEC_TEMP_ONE_C
);
1248 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ) {
1249 STORE( func
, *inst
, 0, 0, CHAN_X
);
1251 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) ) {
1252 STORE( func
, *inst
, 0, 0, CHAN_W
);
1255 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ||
1256 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
1257 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ) {
1258 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1263 TGSI_EXEC_TEMP_00000000_I
,
1264 TGSI_EXEC_TEMP_00000000_C
) );
1265 STORE( func
, *inst
, 0, 0, CHAN_Y
);
1267 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
1268 /* XMM[1] = SrcReg[0].yyyy */
1269 FETCH( func
, *inst
, 1, 0, CHAN_Y
);
1270 /* XMM[1] = max(XMM[1], 0) */
1275 TGSI_EXEC_TEMP_00000000_I
,
1276 TGSI_EXEC_TEMP_00000000_C
) );
1277 /* XMM[2] = SrcReg[0].wwww */
1278 FETCH( func
, *inst
, 2, 0, CHAN_W
);
1279 /* XMM[2] = min(XMM[2], 128.0) */
1284 TGSI_EXEC_TEMP_128_I
,
1285 TGSI_EXEC_TEMP_128_C
) );
1286 /* XMM[2] = max(XMM[2], -128.0) */
1291 TGSI_EXEC_TEMP_MINUS_128_I
,
1292 TGSI_EXEC_TEMP_MINUS_128_C
) );
1293 emit_pow( func
, 1, 2 );
1294 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1308 STORE( func
, *inst
, 2, 0, CHAN_Z
);
1313 case TGSI_OPCODE_RCP
:
1314 /* TGSI_OPCODE_RECIP */
1315 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1316 emit_rcp( func
, 0, 0 );
1317 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1318 STORE( func
, *inst
, 0, 0, chan_index
);
1322 case TGSI_OPCODE_RSQ
:
1323 /* TGSI_OPCODE_RECIPSQRT */
1324 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1325 emit_rsqrt( func
, 1, 0 );
1326 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1327 STORE( func
, *inst
, 1, 0, chan_index
);
1331 case TGSI_OPCODE_EXP
:
1335 case TGSI_OPCODE_LOG
:
1339 case TGSI_OPCODE_MUL
:
1340 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1341 FETCH( func
, *inst
, 0, 0, chan_index
);
1342 FETCH( func
, *inst
, 1, 1, chan_index
);
1343 emit_mul( func
, 0, 1 );
1344 STORE( func
, *inst
, 0, 0, chan_index
);
1348 case TGSI_OPCODE_ADD
:
1349 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1350 FETCH( func
, *inst
, 0, 0, chan_index
);
1351 FETCH( func
, *inst
, 1, 1, chan_index
);
1352 emit_add( func
, 0, 1 );
1353 STORE( func
, *inst
, 0, 0, chan_index
);
1357 case TGSI_OPCODE_DP3
:
1358 /* TGSI_OPCODE_DOT3 */
1359 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1360 FETCH( func
, *inst
, 1, 1, CHAN_X
);
1361 emit_mul( func
, 0, 1 );
1362 FETCH( func
, *inst
, 1, 0, CHAN_Y
);
1363 FETCH( func
, *inst
, 2, 1, CHAN_Y
);
1364 emit_mul( func
, 1, 2 );
1365 emit_add( func
, 0, 1 );
1366 FETCH( func
, *inst
, 1, 0, CHAN_Z
);
1367 FETCH( func
, *inst
, 2, 1, CHAN_Z
);
1368 emit_mul( func
, 1, 2 );
1369 emit_add( func
, 0, 1 );
1370 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1371 STORE( func
, *inst
, 0, 0, chan_index
);
1375 case TGSI_OPCODE_DP4
:
1376 /* TGSI_OPCODE_DOT4 */
1377 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1378 FETCH( func
, *inst
, 1, 1, CHAN_X
);
1379 emit_mul( func
, 0, 1 );
1380 FETCH( func
, *inst
, 1, 0, CHAN_Y
);
1381 FETCH( func
, *inst
, 2, 1, CHAN_Y
);
1382 emit_mul( func
, 1, 2 );
1383 emit_add( func
, 0, 1 );
1384 FETCH( func
, *inst
, 1, 0, CHAN_Z
);
1385 FETCH( func
, *inst
, 2, 1, CHAN_Z
);
1386 emit_mul(func
, 1, 2 );
1387 emit_add(func
, 0, 1 );
1388 FETCH( func
, *inst
, 1, 0, CHAN_W
);
1389 FETCH( func
, *inst
, 2, 1, CHAN_W
);
1390 emit_mul( func
, 1, 2 );
1391 emit_add( func
, 0, 1 );
1392 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1393 STORE( func
, *inst
, 0, 0, chan_index
);
1397 case TGSI_OPCODE_DST
:
1398 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) {
1402 TGSI_EXEC_TEMP_ONE_I
,
1403 TGSI_EXEC_TEMP_ONE_C
);
1404 STORE( func
, *inst
, 0, 0, CHAN_X
);
1406 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) {
1407 FETCH( func
, *inst
, 0, 0, CHAN_Y
);
1408 FETCH( func
, *inst
, 1, 1, CHAN_Y
);
1409 emit_mul( func
, 0, 1 );
1410 STORE( func
, *inst
, 0, 0, CHAN_Y
);
1412 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) {
1413 FETCH( func
, *inst
, 0, 0, CHAN_Z
);
1414 STORE( func
, *inst
, 0, 0, CHAN_Z
);
1416 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) {
1417 FETCH( func
, *inst
, 0, 1, CHAN_W
);
1418 STORE( func
, *inst
, 0, 0, CHAN_W
);
1422 case TGSI_OPCODE_MIN
:
1423 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1424 FETCH( func
, *inst
, 0, 0, chan_index
);
1425 FETCH( func
, *inst
, 1, 1, chan_index
);
1430 STORE( func
, *inst
, 0, 0, chan_index
);
1434 case TGSI_OPCODE_MAX
:
1435 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1436 FETCH( func
, *inst
, 0, 0, chan_index
);
1437 FETCH( func
, *inst
, 1, 1, chan_index
);
1442 STORE( func
, *inst
, 0, 0, chan_index
);
1446 case TGSI_OPCODE_SLT
:
1447 /* TGSI_OPCODE_SETLT */
1448 emit_setcc( func
, inst
, cc_LessThan
);
1451 case TGSI_OPCODE_SGE
:
1452 /* TGSI_OPCODE_SETGE */
1453 emit_setcc( func
, inst
, cc_NotLessThan
);
1456 case TGSI_OPCODE_MAD
:
1457 /* TGSI_OPCODE_MADD */
1458 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1459 FETCH( func
, *inst
, 0, 0, chan_index
);
1460 FETCH( func
, *inst
, 1, 1, chan_index
);
1461 FETCH( func
, *inst
, 2, 2, chan_index
);
1462 emit_mul( func
, 0, 1 );
1463 emit_add( func
, 0, 2 );
1464 STORE( func
, *inst
, 0, 0, chan_index
);
1468 case TGSI_OPCODE_SUB
:
1469 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1470 FETCH( func
, *inst
, 0, 0, chan_index
);
1471 FETCH( func
, *inst
, 1, 1, chan_index
);
1472 emit_sub( func
, 0, 1 );
1473 STORE( func
, *inst
, 0, 0, chan_index
);
1477 case TGSI_OPCODE_LERP
:
1478 /* TGSI_OPCODE_LRP */
1479 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1480 FETCH( func
, *inst
, 0, 0, chan_index
);
1481 FETCH( func
, *inst
, 1, 1, chan_index
);
1482 FETCH( func
, *inst
, 2, 2, chan_index
);
1483 emit_sub( func
, 1, 2 );
1484 emit_mul( func
, 0, 1 );
1485 emit_add( func
, 0, 2 );
1486 STORE( func
, *inst
, 0, 0, chan_index
);
1490 case TGSI_OPCODE_CND
:
1494 case TGSI_OPCODE_CND0
:
1498 case TGSI_OPCODE_DOT2ADD
:
1499 /* TGSI_OPCODE_DP2A */
1503 case TGSI_OPCODE_INDEX
:
1507 case TGSI_OPCODE_NEGATE
:
1511 case TGSI_OPCODE_FRAC
:
1512 /* TGSI_OPCODE_FRC */
1513 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1514 FETCH( func
, *inst
, 0, 0, chan_index
);
1515 emit_frc( func
, 0 );
1516 STORE( func
, *inst
, 0, 0, chan_index
);
1520 case TGSI_OPCODE_CLAMP
:
1524 case TGSI_OPCODE_FLOOR
:
1525 /* TGSI_OPCODE_FLR */
1526 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1527 FETCH( func
, *inst
, 0, 0, chan_index
);
1528 emit_flr( func
, 0 );
1529 STORE( func
, *inst
, 0, 0, chan_index
);
1533 case TGSI_OPCODE_ROUND
:
1537 case TGSI_OPCODE_EXPBASE2
:
1538 /* TGSI_OPCODE_EX2 */
1539 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1540 emit_ex2( func
, 0 );
1541 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1542 STORE( func
, *inst
, 0, 0, chan_index
);
1546 case TGSI_OPCODE_LOGBASE2
:
1547 /* TGSI_OPCODE_LG2 */
1548 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1549 emit_lg2( func
, 0 );
1550 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1551 STORE( func
, *inst
, 0, 0, chan_index
);
1555 case TGSI_OPCODE_POWER
:
1556 /* TGSI_OPCODE_POW */
1557 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1558 FETCH( func
, *inst
, 1, 1, CHAN_X
);
1559 emit_pow( func
, 0, 1 );
1560 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1561 STORE( func
, *inst
, 0, 0, chan_index
);
1565 case TGSI_OPCODE_CROSSPRODUCT
:
1566 /* TGSI_OPCODE_XPD */
1567 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1568 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ) {
1569 FETCH( func
, *inst
, 1, 1, CHAN_Z
);
1570 FETCH( func
, *inst
, 3, 0, CHAN_Z
);
1572 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1573 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
1574 FETCH( func
, *inst
, 0, 0, CHAN_Y
);
1575 FETCH( func
, *inst
, 4, 1, CHAN_Y
);
1577 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) {
1578 emit_MOV( func
, 2, 0 );
1579 emit_mul( func
, 2, 1 );
1580 emit_MOV( func
, 5, 3 );
1581 emit_mul( func
, 5, 4 );
1582 emit_sub( func
, 2, 5 );
1583 STORE( func
, *inst
, 2, 0, CHAN_X
);
1585 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ||
1586 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
1587 FETCH( func
, *inst
, 2, 1, CHAN_X
);
1588 FETCH( func
, *inst
, 5, 0, CHAN_X
);
1590 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) {
1591 emit_mul( func
, 3, 2 );
1592 emit_mul( func
, 1, 5 );
1593 emit_sub( func
, 3, 1 );
1594 STORE( func
, *inst
, 3, 0, CHAN_Y
);
1596 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) {
1597 emit_mul( func
, 5, 4 );
1598 emit_mul( func
, 0, 2 );
1599 emit_sub( func
, 5, 0 );
1600 STORE( func
, *inst
, 5, 0, CHAN_Z
);
1602 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) {
1606 TGSI_EXEC_TEMP_ONE_I
,
1607 TGSI_EXEC_TEMP_ONE_C
);
1608 STORE( func
, *inst
, 0, 0, CHAN_W
);
1612 case TGSI_OPCODE_MULTIPLYMATRIX
:
1616 case TGSI_OPCODE_ABS
:
1617 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1618 FETCH( func
, *inst
, 0, 0, chan_index
);
1619 emit_abs( func
, 0) ;
1621 STORE( func
, *inst
, 0, 0, chan_index
);
1625 case TGSI_OPCODE_RCC
:
1629 case TGSI_OPCODE_DPH
:
1630 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1631 FETCH( func
, *inst
, 1, 1, CHAN_X
);
1632 emit_mul( func
, 0, 1 );
1633 FETCH( func
, *inst
, 1, 0, CHAN_Y
);
1634 FETCH( func
, *inst
, 2, 1, CHAN_Y
);
1635 emit_mul( func
, 1, 2 );
1636 emit_add( func
, 0, 1 );
1637 FETCH( func
, *inst
, 1, 0, CHAN_Z
);
1638 FETCH( func
, *inst
, 2, 1, CHAN_Z
);
1639 emit_mul( func
, 1, 2 );
1640 emit_add( func
, 0, 1 );
1641 FETCH( func
, *inst
, 1, 1, CHAN_W
);
1642 emit_add( func
, 0, 1 );
1643 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1644 STORE( func
, *inst
, 0, 0, chan_index
);
1648 case TGSI_OPCODE_COS
:
1649 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1650 emit_cos( func
, 0 );
1651 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1652 STORE( func
, *inst
, 0, 0, chan_index
);
1656 case TGSI_OPCODE_DDX
:
1660 case TGSI_OPCODE_DDY
:
1664 case TGSI_OPCODE_KILP
:
1665 /* predicated kill */
1667 return 0; /* XXX fix me */
1670 case TGSI_OPCODE_KIL
:
1671 /* conditional kill */
1672 emit_kil( func
, &inst
->FullSrcRegisters
[0] );
1675 case TGSI_OPCODE_PK2H
:
1679 case TGSI_OPCODE_PK2US
:
1683 case TGSI_OPCODE_PK4B
:
1687 case TGSI_OPCODE_PK4UB
:
1691 case TGSI_OPCODE_RFL
:
1695 case TGSI_OPCODE_SEQ
:
1699 case TGSI_OPCODE_SFL
:
1703 case TGSI_OPCODE_SGT
:
1707 case TGSI_OPCODE_SIN
:
1708 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1709 emit_sin( func
, 0 );
1710 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1711 STORE( func
, *inst
, 0, 0, chan_index
);
1715 case TGSI_OPCODE_SLE
:
1719 case TGSI_OPCODE_SNE
:
1723 case TGSI_OPCODE_STR
:
1727 case TGSI_OPCODE_TEX
:
1729 /* Disable dummy texture code:
1734 TGSI_EXEC_TEMP_ONE_I
,
1735 TGSI_EXEC_TEMP_ONE_C
);
1736 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1737 STORE( func
, *inst
, 0, 0, chan_index
);
1745 case TGSI_OPCODE_TXD
:
1749 case TGSI_OPCODE_UP2H
:
1753 case TGSI_OPCODE_UP2US
:
1757 case TGSI_OPCODE_UP4B
:
1761 case TGSI_OPCODE_UP4UB
:
1765 case TGSI_OPCODE_X2D
:
1769 case TGSI_OPCODE_ARA
:
1773 case TGSI_OPCODE_ARR
:
1777 case TGSI_OPCODE_BRA
:
1781 case TGSI_OPCODE_CAL
:
1785 case TGSI_OPCODE_RET
:
1789 case TGSI_OPCODE_END
:
1792 case TGSI_OPCODE_SSG
:
1796 case TGSI_OPCODE_CMP
:
1797 emit_cmp (func
, inst
);
1800 case TGSI_OPCODE_SCS
:
1801 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) {
1802 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1803 emit_cos( func
, 0 );
1804 STORE( func
, *inst
, 0, 0, CHAN_X
);
1806 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) {
1807 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1808 emit_sin( func
, 0 );
1809 STORE( func
, *inst
, 0, 0, CHAN_Y
);
1811 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) {
1815 TGSI_EXEC_TEMP_00000000_I
,
1816 TGSI_EXEC_TEMP_00000000_C
);
1817 STORE( func
, *inst
, 0, 0, CHAN_Z
);
1819 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) {
1823 TGSI_EXEC_TEMP_ONE_I
,
1824 TGSI_EXEC_TEMP_ONE_C
);
1825 STORE( func
, *inst
, 0, 0, CHAN_W
);
1829 case TGSI_OPCODE_TXB
:
1833 case TGSI_OPCODE_NRM
:
1837 case TGSI_OPCODE_DIV
:
1841 case TGSI_OPCODE_DP2
:
1845 case TGSI_OPCODE_TXL
:
1849 case TGSI_OPCODE_BRK
:
1853 case TGSI_OPCODE_IF
:
1857 case TGSI_OPCODE_LOOP
:
1861 case TGSI_OPCODE_REP
:
1865 case TGSI_OPCODE_ELSE
:
1869 case TGSI_OPCODE_ENDIF
:
1873 case TGSI_OPCODE_ENDLOOP
:
1877 case TGSI_OPCODE_ENDREP
:
1881 case TGSI_OPCODE_PUSHA
:
1885 case TGSI_OPCODE_POPA
:
1889 case TGSI_OPCODE_CEIL
:
1893 case TGSI_OPCODE_I2F
:
1897 case TGSI_OPCODE_NOT
:
1901 case TGSI_OPCODE_TRUNC
:
1905 case TGSI_OPCODE_SHL
:
1909 case TGSI_OPCODE_SHR
:
1913 case TGSI_OPCODE_AND
:
1917 case TGSI_OPCODE_OR
:
1921 case TGSI_OPCODE_MOD
:
1925 case TGSI_OPCODE_XOR
:
1929 case TGSI_OPCODE_SAD
:
1933 case TGSI_OPCODE_TXF
:
1937 case TGSI_OPCODE_TXQ
:
1941 case TGSI_OPCODE_CONT
:
1945 case TGSI_OPCODE_EMIT
:
1949 case TGSI_OPCODE_ENDPRIM
:
1962 struct x86_function
*func
,
1963 struct tgsi_full_declaration
*decl
)
1965 if( decl
->Declaration
.File
== TGSI_FILE_INPUT
) {
1966 unsigned first
, last
, mask
;
1969 first
= decl
->DeclarationRange
.First
;
1970 last
= decl
->DeclarationRange
.Last
;
1971 mask
= decl
->Declaration
.UsageMask
;
1973 for( i
= first
; i
<= last
; i
++ ) {
1974 for( j
= 0; j
< NUM_CHANNELS
; j
++ ) {
1975 if( mask
& (1 << j
) ) {
1976 switch( decl
->Declaration
.Interpolate
) {
1977 case TGSI_INTERPOLATE_CONSTANT
:
1978 emit_coef_a0( func
, 0, i
, j
);
1979 emit_inputs( func
, 0, i
, j
);
1982 case TGSI_INTERPOLATE_LINEAR
:
1983 emit_tempf( func
, 0, 0, TGSI_SWIZZLE_X
);
1984 emit_coef_dadx( func
, 1, i
, j
);
1985 emit_tempf( func
, 2, 0, TGSI_SWIZZLE_Y
);
1986 emit_coef_dady( func
, 3, i
, j
);
1987 emit_mul( func
, 0, 1 ); /* x * dadx */
1988 emit_coef_a0( func
, 4, i
, j
);
1989 emit_mul( func
, 2, 3 ); /* y * dady */
1990 emit_add( func
, 0, 4 ); /* x * dadx + a0 */
1991 emit_add( func
, 0, 2 ); /* x * dadx + y * dady + a0 */
1992 emit_inputs( func
, 0, i
, j
);
1995 case TGSI_INTERPOLATE_PERSPECTIVE
:
1996 emit_tempf( func
, 0, 0, TGSI_SWIZZLE_X
);
1997 emit_coef_dadx( func
, 1, i
, j
);
1998 emit_tempf( func
, 2, 0, TGSI_SWIZZLE_Y
);
1999 emit_coef_dady( func
, 3, i
, j
);
2000 emit_mul( func
, 0, 1 ); /* x * dadx */
2001 emit_tempf( func
, 4, 0, TGSI_SWIZZLE_W
);
2002 emit_coef_a0( func
, 5, i
, j
);
2003 emit_rcp( func
, 4, 4 ); /* 1.0 / w */
2004 emit_mul( func
, 2, 3 ); /* y * dady */
2005 emit_add( func
, 0, 5 ); /* x * dadx + a0 */
2006 emit_add( func
, 0, 2 ); /* x * dadx + y * dady + a0 */
2007 emit_mul( func
, 0, 4 ); /* (x * dadx + y * dady + a0) / w */
2008 emit_inputs( func
, 0, i
, j
);
2021 static void aos_to_soa( struct x86_function
*func
,
2027 struct x86_reg soa_input
= x86_make_reg( file_REG32
, reg_AX
);
2028 struct x86_reg aos_input
= x86_make_reg( file_REG32
, reg_BX
);
2029 struct x86_reg num_inputs
= x86_make_reg( file_REG32
, reg_CX
);
2030 struct x86_reg stride
= x86_make_reg( file_REG32
, reg_DX
);
2035 x86_push( func
, x86_make_reg( file_REG32
, reg_BX
) );
2037 x86_mov( func
, aos_input
, x86_fn_arg( func
, arg_aos
) );
2038 x86_mov( func
, soa_input
, x86_fn_arg( func
, arg_soa
) );
2039 x86_mov( func
, num_inputs
, x86_fn_arg( func
, arg_num
) );
2040 x86_mov( func
, stride
, x86_fn_arg( func
, arg_stride
) );
2043 inner_loop
= x86_get_label( func
);
2045 x86_push( func
, aos_input
);
2046 sse_movlps( func
, make_xmm( 0 ), x86_make_disp( aos_input
, 0 ) );
2047 sse_movlps( func
, make_xmm( 3 ), x86_make_disp( aos_input
, 8 ) );
2048 x86_add( func
, aos_input
, stride
);
2049 sse_movhps( func
, make_xmm( 0 ), x86_make_disp( aos_input
, 0 ) );
2050 sse_movhps( func
, make_xmm( 3 ), x86_make_disp( aos_input
, 8 ) );
2051 x86_add( func
, aos_input
, stride
);
2052 sse_movlps( func
, make_xmm( 1 ), x86_make_disp( aos_input
, 0 ) );
2053 sse_movlps( func
, make_xmm( 4 ), x86_make_disp( aos_input
, 8 ) );
2054 x86_add( func
, aos_input
, stride
);
2055 sse_movhps( func
, make_xmm( 1 ), x86_make_disp( aos_input
, 0 ) );
2056 sse_movhps( func
, make_xmm( 4 ), x86_make_disp( aos_input
, 8 ) );
2057 x86_pop( func
, aos_input
);
2059 sse_movaps( func
, make_xmm( 2 ), make_xmm( 0 ) );
2060 sse_movaps( func
, make_xmm( 5 ), make_xmm( 3 ) );
2061 sse_shufps( func
, make_xmm( 0 ), make_xmm( 1 ), 0x88 );
2062 sse_shufps( func
, make_xmm( 2 ), make_xmm( 1 ), 0xdd );
2063 sse_shufps( func
, make_xmm( 3 ), make_xmm( 4 ), 0x88 );
2064 sse_shufps( func
, make_xmm( 5 ), make_xmm( 4 ), 0xdd );
2066 sse_movups( func
, x86_make_disp( soa_input
, 0 ), make_xmm( 0 ) );
2067 sse_movups( func
, x86_make_disp( soa_input
, 16 ), make_xmm( 2 ) );
2068 sse_movups( func
, x86_make_disp( soa_input
, 32 ), make_xmm( 3 ) );
2069 sse_movups( func
, x86_make_disp( soa_input
, 48 ), make_xmm( 5 ) );
2071 /* Advance to next input */
2072 x86_lea( func
, aos_input
, x86_make_disp(aos_input
, 16) );
2073 x86_lea( func
, soa_input
, x86_make_disp(soa_input
, 64) );
2075 /* while --num_inputs */
2076 x86_dec( func
, num_inputs
);
2077 x86_jcc( func
, cc_NE
, inner_loop
);
2080 x86_pop( func
, aos_input
);
2083 static void soa_to_aos( struct x86_function
*func
, uint aos
, uint soa
, uint num
, uint stride
)
2085 struct x86_reg soa_output
;
2086 struct x86_reg aos_output
;
2087 struct x86_reg num_outputs
;
2088 struct x86_reg temp
;
2091 soa_output
= x86_make_reg( file_REG32
, reg_AX
);
2092 aos_output
= x86_make_reg( file_REG32
, reg_BX
);
2093 num_outputs
= x86_make_reg( file_REG32
, reg_CX
);
2094 temp
= x86_make_reg( file_REG32
, reg_DX
);
2097 x86_push( func
, aos_output
);
2099 x86_mov( func
, soa_output
, x86_fn_arg( func
, soa
) );
2100 x86_mov( func
, aos_output
, x86_fn_arg( func
, aos
) );
2101 x86_mov( func
, num_outputs
, x86_fn_arg( func
, num
) );
2104 inner_loop
= x86_get_label( func
);
2106 sse_movups( func
, make_xmm( 0 ), x86_make_disp( soa_output
, 0 ) );
2107 sse_movups( func
, make_xmm( 1 ), x86_make_disp( soa_output
, 16 ) );
2108 sse_movups( func
, make_xmm( 3 ), x86_make_disp( soa_output
, 32 ) );
2109 sse_movups( func
, make_xmm( 4 ), x86_make_disp( soa_output
, 48 ) );
2111 sse_movaps( func
, make_xmm( 2 ), make_xmm( 0 ) );
2112 sse_movaps( func
, make_xmm( 5 ), make_xmm( 3 ) );
2113 sse_unpcklps( func
, make_xmm( 0 ), make_xmm( 1 ) );
2114 sse_unpckhps( func
, make_xmm( 2 ), make_xmm( 1 ) );
2115 sse_unpcklps( func
, make_xmm( 3 ), make_xmm( 4 ) );
2116 sse_unpckhps( func
, make_xmm( 5 ), make_xmm( 4 ) );
2118 x86_mov( func
, temp
, x86_fn_arg( func
, stride
) );
2119 x86_push( func
, aos_output
);
2120 sse_movlps( func
, x86_make_disp( aos_output
, 0 ), make_xmm( 0 ) );
2121 sse_movlps( func
, x86_make_disp( aos_output
, 8 ), make_xmm( 3 ) );
2122 x86_add( func
, aos_output
, temp
);
2123 sse_movhps( func
, x86_make_disp( aos_output
, 0 ), make_xmm( 0 ) );
2124 sse_movhps( func
, x86_make_disp( aos_output
, 8 ), make_xmm( 3 ) );
2125 x86_add( func
, aos_output
, temp
);
2126 sse_movlps( func
, x86_make_disp( aos_output
, 0 ), make_xmm( 2 ) );
2127 sse_movlps( func
, x86_make_disp( aos_output
, 8 ), make_xmm( 5 ) );
2128 x86_add( func
, aos_output
, temp
);
2129 sse_movhps( func
, x86_make_disp( aos_output
, 0 ), make_xmm( 2 ) );
2130 sse_movhps( func
, x86_make_disp( aos_output
, 8 ), make_xmm( 5 ) );
2131 x86_pop( func
, aos_output
);
2133 /* Advance to next output */
2134 x86_lea( func
, aos_output
, x86_make_disp(aos_output
, 16) );
2135 x86_lea( func
, soa_output
, x86_make_disp(soa_output
, 64) );
2137 /* while --num_outputs */
2138 x86_dec( func
, num_outputs
);
2139 x86_jcc( func
, cc_NE
, inner_loop
);
2142 x86_pop( func
, aos_output
);
2146 * Translate a TGSI vertex/fragment shader to SSE2 code.
2147 * Slightly different things are done for vertex vs. fragment shaders.
2149 * Note that fragment shaders are responsible for interpolating shader
2150 * inputs. Because on x86 we have only 4 GP registers, and here we
2151 * have 5 shader arguments (input, output, const, temp and coef), the
2152 * code is split into two phases -- DECLARATION and INSTRUCTION phase.
2153 * GP register holding the output argument is aliased with the coeff
2154 * argument, as outputs are not needed in the DECLARATION phase.
2156 * \param tokens the TGSI input shader
2157 * \param func the output SSE code/function
2158 * \param immediates buffer to place immediates, later passed to SSE func
2159 * \return 1 for success, 0 if translation failed
/* Parameters as used below: tokens is handed to tgsi_parse_init(), func
 * receives the generated code, and immediates[] is filled from IMMEDIATE
 * tokens, one slot per token.
 * NOTE(review): do_swizzles presumably gates the AoS/SoA conversion
 * calls -- confirm against the full function body.
 */
2163 const struct tgsi_token
*tokens
,
2164 struct x86_function
*func
,
2165 float (*immediates
)[4],
2166 boolean do_swizzles
)
2168 struct tgsi_parse_context parse
;
/* Tracks whether a fragment shader has reached its first instruction. */
2169 boolean instruction_phase
= FALSE
;
/* Index of the immediates[] slot that the next IMMEDIATE token fills. */
2171 uint num_immediates
= 0;
/* Reset func->csr to func->store (presumably rewinds the emit cursor to
 * the start of the code buffer -- confirm).
 */
2173 func
->csr
= func
->store
;
2175 tgsi_parse_init( &parse
, tokens
);
2177 /* Can't just use EDI, EBX without save/restoring them:
2181 get_immediate_base() );
2189 * Different function args for vertex/fragment shaders:
2191 if (parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_FRAGMENT
) {
2192 /* DECLARATION phase, do not load output argument. */
2196 x86_fn_arg( func
, 1 ) );
2197 /* skipping outputs argument here */
2201 x86_fn_arg( func
, 3 ) );
2205 x86_fn_arg( func
, 4 ) );
2209 x86_fn_arg( func
, 5 ) );
2212 get_immediate_base(),
2213 x86_fn_arg( func
, 6 ) );
2216 assert(parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_VERTEX
);
2221 1, /* machine->input */
2223 8 ); /* input_stride */
2228 x86_fn_arg( func
, 1 ) );
2232 x86_fn_arg( func
, 2 ) );
2236 x86_fn_arg( func
, 3 ) );
2240 x86_fn_arg( func
, 4 ) );
2243 get_immediate_base(),
2244 x86_fn_arg( func
, 5 ) );
/* Walk the token stream; the loop stops early once ok becomes false. */
2247 while( !tgsi_parse_end_of_tokens( &parse
) && ok
) {
2248 tgsi_parse_token( &parse
);
2250 switch( parse
.FullToken
.Token
.Type
) {
2251 case TGSI_TOKEN_TYPE_DECLARATION
:
2252 if (parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_FRAGMENT
) {
2255 &parse
.FullToken
.FullDeclaration
);
/* The first instruction of a fragment shader switches the translation to
 * the INSTRUCTION phase; this happens only once per shader.
 */
2259 case TGSI_TOKEN_TYPE_INSTRUCTION
:
2260 if (parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_FRAGMENT
) {
2261 if( !instruction_phase
) {
2262 /* INSTRUCTION phase, overwrite coeff with output. */
2263 instruction_phase
= TRUE
;
2267 x86_fn_arg( func
, 2 ) );
/* The result of emit_instruction() is stored in ok, which also feeds the
 * loop condition above.
 */
2271 ok
= emit_instruction(
2273 &parse
.FullToken
.FullInstruction
);
2276 debug_printf("failed to translate tgsi opcode %d to SSE (%s)\n",
2277 parse
.FullToken
.FullInstruction
.Instruction
.Opcode
,
2278 parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_VERTEX
?
2279 "vertex shader" : "fragment shader");
2283 case TGSI_TOKEN_TYPE_IMMEDIATE
:
2284 /* simply copy the immediate values into the next immediates[] slot */
/* The number of values copied is Immediate.Size - 1 (presumably Size also
 * counts the header token of the immediate -- confirm).
 */
2286 const uint size
= parse
.FullToken
.FullImmediate
.Immediate
.Size
- 1;
/* Check the slot index before writing into immediates[]. */
2289 assert(num_immediates
< TGSI_EXEC_NUM_IMMEDIATES
);
2290 for( i
= 0; i
< size
; i
++ ) {
2291 immediates
[num_immediates
][i
] =
2292 parse
.FullToken
.FullImmediate
.u
.ImmediateFloat32
[i
].Float
;
2295 debug_printf("SSE FS immediate[%d] = %f %f %f %f\n",
2297 immediates
[num_immediates
][0],
2298 immediates
[num_immediates
][1],
2299 immediates
[num_immediates
][2],
2300 immediates
[num_immediates
][3]);
/* Vertex shaders end with the SoA to AoS conversion; 9, 2, 10 and 11 are
 * the function-argument indices of the aos pointer, soa pointer, count and
 * stride, in the parameter order of soa_to_aos().
 */
2312 if (parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_VERTEX
) {
2314 soa_to_aos( func
, 9, 2, 10, 11 );
2317 /* Can't just use EBX, EDI without save/restoring them:
2325 get_immediate_base() );
2329 tgsi_parse_free( &parse
);
2334 #endif /* PIPE_ARCH_X86 */