1 /**************************************************************************
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
5 * Copyright 2009-2010 VMware, Inc. All rights Reserved.
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the
9 * "Software"), to deal in the Software without restriction, including
10 * without limitation the rights to use, copy, modify, merge, publish,
11 * distribute, sub license, and/or sell copies of the Software, and to
12 * permit persons to whom the Software is furnished to do so, subject to
13 * the following conditions:
15 * The above copyright notice and this permission notice (including the
16 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
22 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
23 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 **************************************************************************/
29 #include "pipe/p_config.h"
31 #if defined(PIPE_ARCH_X86)
33 #include "util/u_debug.h"
34 #include "pipe/p_shader_tokens.h"
35 #include "util/u_math.h"
36 #include "util/u_memory.h"
37 #if defined(PIPE_ARCH_SSE)
38 #include "util/u_sse.h"
40 #include "tgsi/tgsi_info.h"
41 #include "tgsi/tgsi_parse.h"
42 #include "tgsi/tgsi_util.h"
43 #include "tgsi/tgsi_dump.h"
44 #include "tgsi/tgsi_exec.h"
45 #include "tgsi/tgsi_sse2.h"
47 #include "rtasm/rtasm_x86sse.h"
/* This costs about 100fps (close to 10%) in gears:
 */
#define HIGH_PRECISION 1
/* Iterate CHAN over every channel index in [0, NUM_CHANNELS). */
#define FOR_EACH_CHANNEL( CHAN )\
   for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)

/* Non-zero when channel CHAN is set in dst-register 0's writemask. */
#define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
   ((INST).Dst[0].Register.WriteMask & (1 << (CHAN)))

/* Guard a statement so it runs only when dst0 channel CHAN is written. */
#define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
   if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))

/* Iterate CHAN over only those channels enabled in dst0's writemask. */
#define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
   FOR_EACH_CHANNEL( CHAN )\
   IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )
/* Shorthand aliases for well-known tgsi_exec temporary register slots. */
#define TEMP_ONE_I         TGSI_EXEC_TEMP_ONE_I
#define TEMP_ONE_C         TGSI_EXEC_TEMP_ONE_C

#define TEMP_R0            TGSI_EXEC_TEMP_R0
#define TEMP_ADDR          TGSI_EXEC_TEMP_ADDR

#define TEMP_EXEC_MASK_I   TGSI_EXEC_MASK_I
#define TEMP_EXEC_MASK_C   TGSI_EXEC_MASK_C
86 * X86 utility functions.
95 (enum x86_reg_name
) xmm
);
99 * X86 register mapping helpers.
102 static struct x86_reg
103 get_const_base( void )
110 static struct x86_reg
111 get_machine_base( void )
118 static struct x86_reg
119 get_input_base( void )
121 return x86_make_disp(
123 Offset(struct tgsi_exec_machine
, Inputs
) );
126 static struct x86_reg
127 get_output_base( void )
129 return x86_make_disp(
131 Offset(struct tgsi_exec_machine
, Outputs
) );
134 static struct x86_reg
135 get_temp_base( void )
137 return x86_make_disp(
139 Offset(struct tgsi_exec_machine
, Temps
) );
142 static struct x86_reg
143 get_coef_base( void )
150 static struct x86_reg
151 get_sampler_base( void )
158 static struct x86_reg
159 get_immediate_base( void )
168 * Data access helpers.
172 static struct x86_reg
177 return x86_make_disp(
178 get_immediate_base(),
179 (vec
* 4 + chan
) * 4 );
182 static struct x86_reg
187 return x86_make_disp(
189 (vec
* 4 + chan
) * 4 );
192 static struct x86_reg
196 return x86_make_disp(
198 unit
* sizeof( struct tgsi_sampler
* ) );
201 static struct x86_reg
206 return x86_make_disp(
208 (vec
* 4 + chan
) * 16 );
211 static struct x86_reg
216 return x86_make_disp(
218 (vec
* 4 + chan
) * 16 );
221 static struct x86_reg
226 return x86_make_disp(
228 (vec
* 4 + chan
) * 16 );
231 static struct x86_reg
237 return x86_make_disp(
239 ((vec
* 3 + member
) * 4 + chan
) * 4 );
245 struct x86_function
*func
)
252 * Data fetch helpers.
256 * Copy a shader constant to xmm register
257 * \param xmm the destination xmm register
258 * \param vec the src const buffer index
259 * \param chan src channel to fetch (X, Y, Z or W)
263 struct x86_function
*func
,
272 /* 'vec' is the offset from the address register's value.
273 * We're loading CONST[ADDR+vec] into an xmm register.
275 struct x86_reg r0
= get_immediate_base();
276 struct x86_reg r1
= get_coef_base();
279 assert( indirectFile
== TGSI_FILE_ADDRESS
);
280 assert( indirectIndex
== 0 );
281 assert( r0
.mod
== mod_REG
);
282 assert( r1
.mod
== mod_REG
);
284 x86_push( func
, r0
);
285 x86_push( func
, r1
);
288 * Loop over the four pixels or vertices in the quad.
289 * Get the value of the address (offset) register for pixel/vertex[i],
290 * add it to the src offset and index into the constant buffer.
291 * Note that we're working on SOA data.
292 * If any of the pixel/vertex execution channels are unused their
293 * values will be garbage. It's very important that we don't use
294 * those garbage values as indexes into the constant buffer since
295 * that'll cause segfaults.
296 * The solution is to bitwise-AND the offset with the execution mask
297 * register whose values are either 0 or ~0.
298 * The caller must setup the execution mask register to indicate
299 * which channels are valid/alive before running the shader.
300 * The execution mask will also figure into loops and conditionals
303 for (i
= 0; i
< QUAD_SIZE
; i
++) {
304 /* r1 = address register[i] */
305 x86_mov( func
, r1
, x86_make_disp( get_temp( TEMP_ADDR
, CHAN_X
), i
* 4 ) );
306 /* r0 = execution mask[i] */
307 x86_mov( func
, r0
, x86_make_disp( get_temp( TEMP_EXEC_MASK_I
, TEMP_EXEC_MASK_C
), i
* 4 ) );
309 x86_and( func
, r1
, r0
);
310 /* r0 = 'vec', the offset */
311 x86_lea( func
, r0
, get_const( vec
, chan
) );
313 /* Quick hack to multiply r1 by 16 -- need to add SHL to rtasm.
315 x86_add( func
, r1
, r1
);
316 x86_add( func
, r1
, r1
);
317 x86_add( func
, r1
, r1
);
318 x86_add( func
, r1
, r1
);
320 x86_add( func
, r0
, r1
); /* r0 = r0 + r1 */
321 x86_mov( func
, r1
, x86_deref( r0
) );
322 x86_mov( func
, x86_make_disp( get_temp( TEMP_R0
, CHAN_X
), i
* 4 ), r1
);
331 get_temp( TEMP_R0
, CHAN_X
) );
334 /* 'vec' is the index into the src register file, such as TEMP[vec] */
340 get_const( vec
, chan
) );
345 SHUF( 0, 0, 0, 0 ) );
351 struct x86_function
*func
,
359 get_immediate( vec
, chan
) );
364 SHUF( 0, 0, 0, 0 ) );
369 * Copy a shader input to xmm register
370 * \param xmm the destination xmm register
371 * \param vec the src input attrib
372 * \param chan src channel to fetch (X, Y, Z or W)
376 struct x86_function
*func
,
384 get_input( vec
, chan
) );
388 * Store an xmm register to a shader output
389 * \param xmm the source xmm register
390 * \param vec the dest output attrib
391 * \param chan src dest channel to store (X, Y, Z or W)
395 struct x86_function
*func
,
402 get_output( vec
, chan
),
407 * Copy a shader temporary to xmm register
408 * \param xmm the destination xmm register
409 * \param vec the src temp register
410 * \param chan src channel to fetch (X, Y, Z or W)
414 struct x86_function
*func
,
422 get_temp( vec
, chan
) );
426 * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
427 * \param xmm the destination xmm register
428 * \param vec the src input/attribute coefficient index
429 * \param chan src channel to fetch (X, Y, Z or W)
430 * \param member 0=a0, 1=dadx, 2=dady
434 struct x86_function
*func
,
443 get_coef( vec
, chan
, member
) );
448 SHUF( 0, 0, 0, 0 ) );
452 * Data store helpers.
457 struct x86_function
*func
,
464 get_input( vec
, chan
),
470 struct x86_function
*func
,
477 get_temp( vec
, chan
),
483 struct x86_function
*func
,
493 vec
+ TGSI_EXEC_TEMP_ADDR
,
498 * Coefficent fetch helpers.
503 struct x86_function
*func
,
518 struct x86_function
*func
,
533 struct x86_function
*func
,
547 * Function call helpers.
551 * NOTE: In gcc, if the destination uses the SSE intrinsics, then it must be
552 * defined with __attribute__((force_align_arg_pointer)), as we do not guarantee
553 * that the stack pointer is 16 byte aligned, as expected.
557 struct x86_function
*func
,
558 unsigned xmm_save_mask
,
559 const struct x86_reg
*arg
,
561 void (PIPE_CDECL
*code
)() )
563 struct x86_reg ecx
= x86_make_reg( file_REG32
, reg_CX
);
568 x86_make_reg( file_REG32
, reg_AX
) );
571 x86_make_reg( file_REG32
, reg_CX
) );
574 x86_make_reg( file_REG32
, reg_DX
) );
576 /* Store XMM regs to the stack
578 for(i
= 0, n
= 0; i
< 8; ++i
)
579 if(xmm_save_mask
& (1 << i
))
584 x86_make_reg( file_REG32
, reg_SP
),
587 for(i
= 0, n
= 0; i
< 8; ++i
)
588 if(xmm_save_mask
& (1 << i
)) {
591 x86_make_disp( x86_make_reg( file_REG32
, reg_SP
), n
*16 ),
596 for (i
= 0; i
< nr_args
; i
++) {
597 /* Load the address of the buffer we use for passing arguments and
605 /* Push actual function arguments (currently just the pointer to
606 * the buffer above), and call the function:
608 x86_push( func
, ecx
);
611 x86_mov_reg_imm( func
, ecx
, (unsigned long) code
);
612 x86_call( func
, ecx
);
614 /* Pop the arguments (or just add an immediate to esp)
616 for (i
= 0; i
< nr_args
; i
++) {
620 /* Pop the saved XMM regs:
622 for(i
= 0, n
= 0; i
< 8; ++i
)
623 if(xmm_save_mask
& (1 << i
)) {
627 x86_make_disp( x86_make_reg( file_REG32
, reg_SP
), n
*16 ) );
633 x86_make_reg( file_REG32
, reg_SP
),
636 /* Restore GP registers in a reverse order.
640 x86_make_reg( file_REG32
, reg_DX
) );
643 x86_make_reg( file_REG32
, reg_CX
) );
646 x86_make_reg( file_REG32
, reg_AX
) );
650 emit_func_call_dst_src1(
651 struct x86_function
*func
,
655 void (PIPE_CDECL
*code
)() )
657 struct x86_reg store
= get_temp( TEMP_R0
, 0 );
658 unsigned xmm_mask
= ((1 << xmm_save
) - 1) & ~(1 << xmm_dst
);
660 /* Store our input parameters (in xmm regs) to the buffer we use
661 * for passing arguments. We will pass a pointer to this buffer as
662 * the actual function argument.
667 make_xmm( xmm_src0
) );
669 emit_func_call( func
,
683 emit_func_call_dst_src2(
684 struct x86_function
*func
,
689 void (PIPE_CDECL
*code
)() )
691 struct x86_reg store
= get_temp( TEMP_R0
, 0 );
692 unsigned xmm_mask
= ((1 << xmm_save
) - 1) & ~(1 << xmm_dst
);
694 /* Store two inputs to parameter buffer.
699 make_xmm( xmm_src0
) );
703 x86_make_disp( store
, 4 * sizeof(float) ),
704 make_xmm( xmm_src1
) );
709 emit_func_call( func
,
715 /* Retrieve the results:
727 #if defined(PIPE_ARCH_SSE)
730 * Fast SSE2 implementation of special math functions.
/* Horner-scheme polynomial evaluation on 4-wide SSE float vectors:
 * POLYn(x, c0..cn) computes c0 + c1*x + ... + cn*x^n per component.
 */
#define POLY0(x, c0) _mm_set1_ps(c0)
#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
/* Polynomial degrees used by the exp2/log2 minimax approximations below. */
#define EXP_POLY_DEGREE 3
#define LOG_POLY_DEGREE 5
744 * See http://www.devmaster.net/forums/showthread.php?p=43580
750 __m128 fpart
, expipart
, expfpart
;
752 x
= _mm_min_ps(x
, _mm_set1_ps( 129.00000f
));
753 x
= _mm_max_ps(x
, _mm_set1_ps(-126.99999f
));
755 /* ipart = int(x - 0.5) */
756 ipart
= _mm_cvtps_epi32(_mm_sub_ps(x
, _mm_set1_ps(0.5f
)));
758 /* fpart = x - ipart */
759 fpart
= _mm_sub_ps(x
, _mm_cvtepi32_ps(ipart
));
761 /* expipart = (float) (1 << ipart) */
762 expipart
= _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart
, _mm_set1_epi32(127)), 23));
764 /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
765 #if EXP_POLY_DEGREE == 5
766 expfpart
= POLY5(fpart
, 9.9999994e-1f
, 6.9315308e-1f
, 2.4015361e-1f
, 5.5826318e-2f
, 8.9893397e-3f
, 1.8775767e-3f
);
767 #elif EXP_POLY_DEGREE == 4
768 expfpart
= POLY4(fpart
, 1.0000026f
, 6.9300383e-1f
, 2.4144275e-1f
, 5.2011464e-2f
, 1.3534167e-2f
);
769 #elif EXP_POLY_DEGREE == 3
770 expfpart
= POLY3(fpart
, 9.9992520e-1f
, 6.9583356e-1f
, 2.2606716e-1f
, 7.8024521e-2f
);
771 #elif EXP_POLY_DEGREE == 2
772 expfpart
= POLY2(fpart
, 1.0017247f
, 6.5763628e-1f
, 3.3718944e-1f
);
777 return _mm_mul_ps(expipart
, expfpart
);
782 * See http://www.devmaster.net/forums/showthread.php?p=43580
787 __m128i expmask
= _mm_set1_epi32(0x7f800000);
788 __m128i mantmask
= _mm_set1_epi32(0x007fffff);
789 __m128 one
= _mm_set1_ps(1.0f
);
791 __m128i i
= _mm_castps_si128(x
);
793 /* exp = (float) exponent(x) */
794 __m128 exp
= _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i
, expmask
), 23), _mm_set1_epi32(127)));
796 /* mant = (float) mantissa(x) */
797 __m128 mant
= _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i
, mantmask
)), one
);
801 /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
802 * These coefficients can be generate with
803 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
805 #if LOG_POLY_DEGREE == 6
806 logmant
= POLY5(mant
, 3.11578814719469302614f
, -3.32419399085241980044f
, 2.59883907202499966007f
, -1.23152682416275988241f
, 0.318212422185251071475f
, -0.0344359067839062357313f
);
807 #elif LOG_POLY_DEGREE == 5
808 logmant
= POLY4(mant
, 2.8882704548164776201f
, -2.52074962577807006663f
, 1.48116647521213171641f
, -0.465725644288844778798f
, 0.0596515482674574969533f
);
809 #elif LOG_POLY_DEGREE == 4
810 logmant
= POLY3(mant
, 2.61761038894603480148f
, -1.75647175389045657003f
, 0.688243882994381274313f
, -0.107254423828329604454f
);
811 #elif LOG_POLY_DEGREE == 3
812 logmant
= POLY2(mant
, 2.28330284476918490682f
, -1.04913055217340124191f
, 0.204446009836232697516f
);
817 /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
818 logmant
= _mm_mul_ps(logmant
, _mm_sub_ps(mant
, one
));
820 return _mm_add_ps(logmant
, exp
);
825 powf4(__m128 x
, __m128 y
)
827 return exp2f4(_mm_mul_ps(log2f4(x
), y
));
830 #endif /* PIPE_ARCH_SSE */
835 * Low-level instruction translators.
840 struct x86_function
*func
,
847 TGSI_EXEC_TEMP_7FFFFFFF_I
,
848 TGSI_EXEC_TEMP_7FFFFFFF_C
) );
853 struct x86_function
*func
,
860 make_xmm( xmm_src
) );
863 static void PIPE_CDECL
867 store
[0] = cosf( store
[0] );
868 store
[1] = cosf( store
[1] );
869 store
[2] = cosf( store
[2] );
870 store
[3] = cosf( store
[3] );
875 struct x86_function
*func
,
879 emit_func_call_dst_src1(
887 static void PIPE_CDECL
888 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
889 __attribute__((force_align_arg_pointer
))
894 #if defined(PIPE_ARCH_SSE)
895 _mm_store_ps(&store
[0], exp2f4( _mm_load_ps(&store
[0]) ));
897 store
[0] = util_fast_exp2( store
[0] );
898 store
[1] = util_fast_exp2( store
[1] );
899 store
[2] = util_fast_exp2( store
[2] );
900 store
[3] = util_fast_exp2( store
[3] );
906 struct x86_function
*func
,
910 emit_func_call_dst_src1(
920 struct x86_function
*func
,
931 struct x86_function
*func
,
940 static void PIPE_CDECL
944 store
[0] = floorf( store
[0] );
945 store
[1] = floorf( store
[1] );
946 store
[2] = floorf( store
[2] );
947 store
[3] = floorf( store
[3] );
952 struct x86_function
*func
,
956 emit_func_call_dst_src1(
964 static void PIPE_CDECL
968 store
[0] -= floorf( store
[0] );
969 store
[1] -= floorf( store
[1] );
970 store
[2] -= floorf( store
[2] );
971 store
[3] -= floorf( store
[3] );
976 struct x86_function
*func
,
980 emit_func_call_dst_src1(
988 static void PIPE_CDECL
989 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
990 __attribute__((force_align_arg_pointer
))
995 #if defined(PIPE_ARCH_SSE)
996 _mm_store_ps(&store
[0], log2f4( _mm_load_ps(&store
[0]) ));
998 store
[0] = util_fast_log2( store
[0] );
999 store
[1] = util_fast_log2( store
[1] );
1000 store
[2] = util_fast_log2( store
[2] );
1001 store
[3] = util_fast_log2( store
[3] );
1007 struct x86_function
*func
,
1011 emit_func_call_dst_src1(
1021 struct x86_function
*func
,
1027 make_xmm( xmm_dst
),
1028 make_xmm( xmm_src
) );
1032 emit_mul (struct x86_function
*func
,
1038 make_xmm( xmm_dst
),
1039 make_xmm( xmm_src
) );
1044 struct x86_function
*func
,
1051 TGSI_EXEC_TEMP_80000000_I
,
1052 TGSI_EXEC_TEMP_80000000_C
) );
1055 static void PIPE_CDECL
1056 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
1057 __attribute__((force_align_arg_pointer
))
1062 #if defined(PIPE_ARCH_SSE)
1063 _mm_store_ps(&store
[0], powf4( _mm_load_ps(&store
[0]), _mm_load_ps(&store
[4]) ));
1065 store
[0] = util_fast_pow( store
[0], store
[4] );
1066 store
[1] = util_fast_pow( store
[1], store
[5] );
1067 store
[2] = util_fast_pow( store
[2], store
[6] );
1068 store
[3] = util_fast_pow( store
[3], store
[7] );
1074 struct x86_function
*func
,
1080 emit_func_call_dst_src2(
1091 struct x86_function
*func
,
1095 /* On Intel CPUs at least, this is only accurate to 12 bits -- not
1096 * good enough. Need to either emit a proper divide or use the
1097 * iterative technique described below in emit_rsqrt().
1101 make_xmm( xmm_dst
),
1102 make_xmm( xmm_src
) );
1105 static void PIPE_CDECL
1109 store
[0] = floorf( store
[0] + 0.5f
);
1110 store
[1] = floorf( store
[1] + 0.5f
);
1111 store
[2] = floorf( store
[2] + 0.5f
);
1112 store
[3] = floorf( store
[3] + 0.5f
);
1117 struct x86_function
*func
,
1121 emit_func_call_dst_src1(
1131 struct x86_function
*func
,
1136 /* Although rsqrtps() and rcpps() are low precision on some/all SSE
1137 * implementations, it is possible to improve its precision at
1138 * fairly low cost, using a newton/raphson step, as below:
1140 * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
1141 * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
1143 * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
1146 struct x86_reg dst
= make_xmm( xmm_dst
);
1147 struct x86_reg src
= make_xmm( xmm_src
);
1148 struct x86_reg tmp0
= make_xmm( 2 );
1149 struct x86_reg tmp1
= make_xmm( 3 );
1151 assert( xmm_dst
!= xmm_src
);
1152 assert( xmm_dst
!= 2 && xmm_dst
!= 3 );
1153 assert( xmm_src
!= 2 && xmm_src
!= 3 );
1155 sse_movaps( func
, dst
, get_temp( TGSI_EXEC_TEMP_HALF_I
, TGSI_EXEC_TEMP_HALF_C
) );
1156 sse_movaps( func
, tmp0
, get_temp( TGSI_EXEC_TEMP_THREE_I
, TGSI_EXEC_TEMP_THREE_C
) );
1157 sse_rsqrtps( func
, tmp1
, src
);
1158 sse_mulps( func
, src
, tmp1
);
1159 sse_mulps( func
, dst
, tmp1
);
1160 sse_mulps( func
, src
, tmp1
);
1161 sse_subps( func
, tmp0
, src
);
1162 sse_mulps( func
, dst
, tmp0
);
1165 /* On Intel CPUs at least, this is only accurate to 12 bits -- not
1170 make_xmm( xmm_dst
),
1171 make_xmm( xmm_src
) );
1177 struct x86_function
*func
,
1184 TGSI_EXEC_TEMP_80000000_I
,
1185 TGSI_EXEC_TEMP_80000000_C
) );
1188 static void PIPE_CDECL
1192 store
[0] = store
[0] < 0.0f
? -1.0f
: store
[0] > 0.0f
? 1.0f
: 0.0f
;
1193 store
[1] = store
[1] < 0.0f
? -1.0f
: store
[1] > 0.0f
? 1.0f
: 0.0f
;
1194 store
[2] = store
[2] < 0.0f
? -1.0f
: store
[2] > 0.0f
? 1.0f
: 0.0f
;
1195 store
[3] = store
[3] < 0.0f
? -1.0f
: store
[3] > 0.0f
? 1.0f
: 0.0f
;
1200 struct x86_function
*func
,
1204 emit_func_call_dst_src1(
1212 static void PIPE_CDECL
1216 store
[0] = sinf( store
[0] );
1217 store
[1] = sinf( store
[1] );
1218 store
[2] = sinf( store
[2] );
1219 store
[3] = sinf( store
[3] );
1223 emit_sin (struct x86_function
*func
,
1227 emit_func_call_dst_src1(
1237 struct x86_function
*func
,
1243 make_xmm( xmm_dst
),
1244 make_xmm( xmm_src
) );
1259 struct x86_function
*func
,
1261 const struct tgsi_full_src_register
*reg
,
1262 const unsigned chan_index
)
1264 unsigned swizzle
= tgsi_util_get_full_src_register_swizzle( reg
, chan_index
);
1267 case TGSI_SWIZZLE_X
:
1268 case TGSI_SWIZZLE_Y
:
1269 case TGSI_SWIZZLE_Z
:
1270 case TGSI_SWIZZLE_W
:
1271 switch (reg
->Register
.File
) {
1272 case TGSI_FILE_CONSTANT
:
1276 reg
->Register
.Index
,
1278 reg
->Register
.Indirect
,
1280 reg
->Indirect
.Index
);
1283 case TGSI_FILE_IMMEDIATE
:
1287 reg
->Register
.Index
,
1291 case TGSI_FILE_INPUT
:
1292 case TGSI_FILE_SYSTEM_VALUE
:
1296 reg
->Register
.Index
,
1300 case TGSI_FILE_TEMPORARY
:
1304 reg
->Register
.Index
,
1317 switch( tgsi_util_get_full_src_register_sign_mode( reg
, chan_index
) ) {
1318 case TGSI_UTIL_SIGN_CLEAR
:
1319 emit_abs( func
, xmm
);
1322 case TGSI_UTIL_SIGN_SET
:
1323 emit_setsign( func
, xmm
);
1326 case TGSI_UTIL_SIGN_TOGGLE
:
1327 emit_neg( func
, xmm
);
1330 case TGSI_UTIL_SIGN_KEEP
:
/* Fetch channel CHAN of source operand INDEX of INST into xmm register XMM. */
#define FETCH( FUNC, INST, XMM, INDEX, CHAN )\
   emit_fetch( FUNC, XMM, &(INST).Src[INDEX], CHAN )
1344 struct x86_function
*func
,
1346 const struct tgsi_full_dst_register
*reg
,
1347 const struct tgsi_full_instruction
*inst
,
1348 unsigned chan_index
)
1350 switch( inst
->Instruction
.Saturate
) {
1354 case TGSI_SAT_ZERO_ONE
:
1359 TGSI_EXEC_TEMP_00000000_I
,
1360 TGSI_EXEC_TEMP_00000000_C
) );
1366 TGSI_EXEC_TEMP_ONE_I
,
1367 TGSI_EXEC_TEMP_ONE_C
) );
1370 case TGSI_SAT_MINUS_PLUS_ONE
:
1376 switch( reg
->Register
.File
) {
1377 case TGSI_FILE_OUTPUT
:
1381 reg
->Register
.Index
,
1385 case TGSI_FILE_TEMPORARY
:
1389 reg
->Register
.Index
,
1393 case TGSI_FILE_ADDRESS
:
1397 reg
->Register
.Index
,
/* Store xmm register XMM into channel CHAN of dst operand INDEX of INST. */
#define STORE( FUNC, INST, XMM, INDEX, CHAN )\
   emit_store( FUNC, XMM, &(INST).Dst[INDEX], &(INST), CHAN )
1410 static void PIPE_CDECL
1411 fetch_texel( struct tgsi_sampler
**sampler
,
1417 debug_printf("%s sampler: %p (%p) store: %p\n",
1422 for (j
= 0; j
< 4; j
++)
1423 debug_printf("sample %d texcoord %f %f %f lodbias %f\n",
1432 float rgba
[NUM_CHANNELS
][QUAD_SIZE
];
1433 (*sampler
)->get_samples(*sampler
,
1437 &store
[12], /* lodbias */
1438 tgsi_sampler_lod_bias
,
1439 rgba
); /* results */
1441 memcpy( store
, rgba
, 16 * sizeof(float));
1445 for (j
= 0; j
< 4; j
++)
1446 debug_printf("sample %d result %f %f %f %f\n",
1456 * High-level instruction translators.
1460 emit_tex( struct x86_function
*func
,
1461 const struct tgsi_full_instruction
*inst
,
1465 const uint unit
= inst
->Src
[1].Register
.Index
;
1466 struct x86_reg args
[2];
1470 assert(inst
->Instruction
.Texture
);
1471 switch (inst
->Texture
.Texture
) {
1472 case TGSI_TEXTURE_1D
:
1475 case TGSI_TEXTURE_2D
:
1476 case TGSI_TEXTURE_RECT
:
1479 case TGSI_TEXTURE_SHADOW1D
:
1480 case TGSI_TEXTURE_SHADOW2D
:
1481 case TGSI_TEXTURE_SHADOWRECT
:
1482 case TGSI_TEXTURE_3D
:
1483 case TGSI_TEXTURE_CUBE
:
1492 FETCH( func
, *inst
, 3, 0, 3 );
1498 TGSI_EXEC_TEMP_00000000_I
,
1499 TGSI_EXEC_TEMP_00000000_C
);
1503 /* store lodbias whether enabled or not -- fetch_texel currently
1504 * respects it always.
1507 get_temp( TEMP_R0
, 3 ),
1512 FETCH( func
, *inst
, 3, 0, 3 );
1514 emit_rcp( func
, 3, 3 );
1517 for (i
= 0; i
< count
; i
++) {
1518 FETCH( func
, *inst
, i
, 0, i
);
1527 /* Store in the argument buffer:
1531 get_temp( TEMP_R0
, i
),
1535 args
[0] = get_temp( TEMP_R0
, 0 );
1536 args
[1] = get_sampler_ptr( unit
);
1539 emit_func_call( func
,
1545 /* If all four channels are enabled, could use a pointer to
1546 * dst[0].x instead of TEMP_R0 for store?
1548 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, i
) {
1553 get_temp( TEMP_R0
, i
) );
1555 STORE( func
, *inst
, 0, 0, i
);
1562 struct x86_function
*func
,
1563 const struct tgsi_full_src_register
*reg
)
1565 unsigned uniquemask
;
1566 unsigned unique_count
= 0;
1567 unsigned chan_index
;
1570 /* This mask stores component bits that were already tested. Note that
1571 * we test if the value is less than zero, so 1.0 and 0.0 need not to be
1575 FOR_EACH_CHANNEL( chan_index
) {
1578 /* unswizzle channel */
1579 swizzle
= tgsi_util_get_full_src_register_swizzle(
1583 /* check if the component has not been already tested */
1584 if( !(uniquemask
& (1 << swizzle
)) ) {
1585 uniquemask
|= 1 << swizzle
;
1587 /* allocate register */
1598 x86_make_reg( file_REG32
, reg_AX
) );
1601 x86_make_reg( file_REG32
, reg_DX
) );
1603 for (i
= 0 ; i
< unique_count
; i
++ ) {
1604 struct x86_reg dataXMM
= make_xmm(i
);
1610 TGSI_EXEC_TEMP_00000000_I
,
1611 TGSI_EXEC_TEMP_00000000_C
),
1617 x86_make_reg( file_REG32
, reg_AX
),
1623 x86_make_reg( file_REG32
, reg_DX
),
1627 x86_make_reg( file_REG32
, reg_AX
),
1628 x86_make_reg( file_REG32
, reg_DX
) );
1635 TGSI_EXEC_TEMP_KILMASK_I
,
1636 TGSI_EXEC_TEMP_KILMASK_C
),
1637 x86_make_reg( file_REG32
, reg_AX
) );
1641 x86_make_reg( file_REG32
, reg_DX
) );
1644 x86_make_reg( file_REG32
, reg_AX
) );
1650 struct x86_function
*func
)
1652 /* XXX todo / fix me */
1658 struct x86_function
*func
,
1659 struct tgsi_full_instruction
*inst
,
1662 unsigned chan_index
;
1664 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1665 FETCH( func
, *inst
, 0, 0, chan_index
);
1666 FETCH( func
, *inst
, 1, 1, chan_index
);
1678 STORE( func
, *inst
, 0, 0, chan_index
);
1684 struct x86_function
*func
,
1685 struct tgsi_full_instruction
*inst
)
1687 unsigned chan_index
;
1689 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1690 FETCH( func
, *inst
, 0, 0, chan_index
);
1691 FETCH( func
, *inst
, 1, 1, chan_index
);
1692 FETCH( func
, *inst
, 2, 2, chan_index
);
1697 TGSI_EXEC_TEMP_00000000_I
,
1698 TGSI_EXEC_TEMP_00000000_C
),
1712 STORE( func
, *inst
, 0, 0, chan_index
);
1718 * Check if inst src/dest regs use indirect addressing into temporary
1722 indirect_temp_reference(const struct tgsi_full_instruction
*inst
)
1725 for (i
= 0; i
< inst
->Instruction
.NumSrcRegs
; i
++) {
1726 const struct tgsi_full_src_register
*reg
= &inst
->Src
[i
];
1727 if (reg
->Register
.File
== TGSI_FILE_TEMPORARY
&&
1728 reg
->Register
.Indirect
)
1731 for (i
= 0; i
< inst
->Instruction
.NumDstRegs
; i
++) {
1732 const struct tgsi_full_dst_register
*reg
= &inst
->Dst
[i
];
1733 if (reg
->Register
.File
== TGSI_FILE_TEMPORARY
&&
1734 reg
->Register
.Indirect
)
1743 struct x86_function
*func
,
1744 struct tgsi_full_instruction
*inst
)
1746 unsigned chan_index
;
1748 /* we can't handle indirect addressing into temp register file yet */
1749 if (indirect_temp_reference(inst
))
1752 switch (inst
->Instruction
.Opcode
) {
1753 case TGSI_OPCODE_ARL
:
1754 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1755 FETCH( func
, *inst
, 0, 0, chan_index
);
1756 emit_flr(func
, 0, 0);
1757 emit_f2it( func
, 0 );
1758 STORE( func
, *inst
, 0, 0, chan_index
);
1762 case TGSI_OPCODE_MOV
:
1763 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1764 FETCH( func
, *inst
, 4 + chan_index
, 0, chan_index
);
1766 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1767 STORE( func
, *inst
, 4 + chan_index
, 0, chan_index
);
1771 case TGSI_OPCODE_LIT
:
1772 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1773 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) ) {
1779 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ) {
1780 STORE( func
, *inst
, 0, 0, CHAN_X
);
1782 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) ) {
1783 STORE( func
, *inst
, 0, 0, CHAN_W
);
1786 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ||
1787 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
1788 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ) {
1789 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1794 TGSI_EXEC_TEMP_00000000_I
,
1795 TGSI_EXEC_TEMP_00000000_C
) );
1796 STORE( func
, *inst
, 0, 0, CHAN_Y
);
1798 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
1799 /* XMM[1] = SrcReg[0].yyyy */
1800 FETCH( func
, *inst
, 1, 0, CHAN_Y
);
1801 /* XMM[1] = max(XMM[1], 0) */
1806 TGSI_EXEC_TEMP_00000000_I
,
1807 TGSI_EXEC_TEMP_00000000_C
) );
1808 /* XMM[2] = SrcReg[0].wwww */
1809 FETCH( func
, *inst
, 2, 0, CHAN_W
);
1810 /* XMM[2] = min(XMM[2], 128.0) */
1815 TGSI_EXEC_TEMP_128_I
,
1816 TGSI_EXEC_TEMP_128_C
) );
1817 /* XMM[2] = max(XMM[2], -128.0) */
1822 TGSI_EXEC_TEMP_MINUS_128_I
,
1823 TGSI_EXEC_TEMP_MINUS_128_C
) );
1824 emit_pow( func
, 3, 1, 1, 2 );
1825 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1839 STORE( func
, *inst
, 2, 0, CHAN_Z
);
1844 case TGSI_OPCODE_RCP
:
1845 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1846 emit_rcp( func
, 0, 0 );
1847 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1848 STORE( func
, *inst
, 0, 0, chan_index
);
1852 case TGSI_OPCODE_RSQ
:
1853 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1854 emit_abs( func
, 0 );
1855 emit_rsqrt( func
, 1, 0 );
1856 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1857 STORE( func
, *inst
, 1, 0, chan_index
);
1861 case TGSI_OPCODE_EXP
:
1862 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1863 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ||
1864 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
1865 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1866 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1867 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
1868 emit_MOV( func
, 1, 0 );
1869 emit_flr( func
, 2, 1 );
1870 /* dst.x = ex2(floor(src.x)) */
1871 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
)) {
1872 emit_MOV( func
, 2, 1 );
1873 emit_ex2( func
, 3, 2 );
1874 STORE( func
, *inst
, 2, 0, CHAN_X
);
1876 /* dst.y = src.x - floor(src.x) */
1877 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
1878 emit_MOV( func
, 2, 0 );
1879 emit_sub( func
, 2, 1 );
1880 STORE( func
, *inst
, 2, 0, CHAN_Y
);
1883 /* dst.z = ex2(src.x) */
1884 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
1885 emit_ex2( func
, 3, 0 );
1886 STORE( func
, *inst
, 0, 0, CHAN_Z
);
1890 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
)) {
1891 emit_tempf( func
, 0, TEMP_ONE_I
, TEMP_ONE_C
);
1892 STORE( func
, *inst
, 0, 0, CHAN_W
);
1896 case TGSI_OPCODE_LOG
:
1897 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1898 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ||
1899 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
1900 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1901 emit_abs( func
, 0 );
1902 emit_MOV( func
, 1, 0 );
1903 emit_lg2( func
, 2, 1 );
1904 /* dst.z = lg2(abs(src.x)) */
1905 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
1906 STORE( func
, *inst
, 1, 0, CHAN_Z
);
1908 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1909 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
1910 emit_flr( func
, 2, 1 );
1911 /* dst.x = floor(lg2(abs(src.x))) */
1912 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
)) {
1913 STORE( func
, *inst
, 1, 0, CHAN_X
);
1915 /* dst.x = abs(src)/ex2(floor(lg2(abs(src.x)))) */
1916 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
1917 emit_ex2( func
, 2, 1 );
1918 emit_rcp( func
, 1, 1 );
1919 emit_mul( func
, 0, 1 );
1920 STORE( func
, *inst
, 0, 0, CHAN_Y
);
1925 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
)) {
1926 emit_tempf( func
, 0, TEMP_ONE_I
, TEMP_ONE_C
);
1927 STORE( func
, *inst
, 0, 0, CHAN_W
);
1931 case TGSI_OPCODE_MUL
:
1932 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1933 FETCH( func
, *inst
, 0, 0, chan_index
);
1934 FETCH( func
, *inst
, 1, 1, chan_index
);
1935 emit_mul( func
, 0, 1 );
1936 STORE( func
, *inst
, 0, 0, chan_index
);
1940 case TGSI_OPCODE_ADD
:
1941 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1942 FETCH( func
, *inst
, 0, 0, chan_index
);
1943 FETCH( func
, *inst
, 1, 1, chan_index
);
1944 emit_add( func
, 0, 1 );
1945 STORE( func
, *inst
, 0, 0, chan_index
);
1949 case TGSI_OPCODE_DP3
:
1950 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1951 FETCH( func
, *inst
, 1, 1, CHAN_X
);
1952 emit_mul( func
, 0, 1 );
1953 FETCH( func
, *inst
, 1, 0, CHAN_Y
);
1954 FETCH( func
, *inst
, 2, 1, CHAN_Y
);
1955 emit_mul( func
, 1, 2 );
1956 emit_add( func
, 0, 1 );
1957 FETCH( func
, *inst
, 1, 0, CHAN_Z
);
1958 FETCH( func
, *inst
, 2, 1, CHAN_Z
);
1959 emit_mul( func
, 1, 2 );
1960 emit_add( func
, 0, 1 );
1961 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1962 STORE( func
, *inst
, 0, 0, chan_index
);
1966 case TGSI_OPCODE_DP4
:
1967 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1968 FETCH( func
, *inst
, 1, 1, CHAN_X
);
1969 emit_mul( func
, 0, 1 );
1970 FETCH( func
, *inst
, 1, 0, CHAN_Y
);
1971 FETCH( func
, *inst
, 2, 1, CHAN_Y
);
1972 emit_mul( func
, 1, 2 );
1973 emit_add( func
, 0, 1 );
1974 FETCH( func
, *inst
, 1, 0, CHAN_Z
);
1975 FETCH( func
, *inst
, 2, 1, CHAN_Z
);
1976 emit_mul(func
, 1, 2 );
1977 emit_add(func
, 0, 1 );
1978 FETCH( func
, *inst
, 1, 0, CHAN_W
);
1979 FETCH( func
, *inst
, 2, 1, CHAN_W
);
1980 emit_mul( func
, 1, 2 );
1981 emit_add( func
, 0, 1 );
1982 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1983 STORE( func
, *inst
, 0, 0, chan_index
);
1987 case TGSI_OPCODE_DST
:
1988 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) {
1994 STORE( func
, *inst
, 0, 0, CHAN_X
);
1996 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) {
1997 FETCH( func
, *inst
, 0, 0, CHAN_Y
);
1998 FETCH( func
, *inst
, 1, 1, CHAN_Y
);
1999 emit_mul( func
, 0, 1 );
2000 STORE( func
, *inst
, 0, 0, CHAN_Y
);
2002 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) {
2003 FETCH( func
, *inst
, 0, 0, CHAN_Z
);
2004 STORE( func
, *inst
, 0, 0, CHAN_Z
);
2006 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) {
2007 FETCH( func
, *inst
, 0, 1, CHAN_W
);
2008 STORE( func
, *inst
, 0, 0, CHAN_W
);
2012 case TGSI_OPCODE_MIN
:
2013 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2014 FETCH( func
, *inst
, 0, 0, chan_index
);
2015 FETCH( func
, *inst
, 1, 1, chan_index
);
2020 STORE( func
, *inst
, 0, 0, chan_index
);
2024 case TGSI_OPCODE_MAX
:
2025 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2026 FETCH( func
, *inst
, 0, 0, chan_index
);
2027 FETCH( func
, *inst
, 1, 1, chan_index
);
2032 STORE( func
, *inst
, 0, 0, chan_index
);
2036 case TGSI_OPCODE_SLT
:
2037 emit_setcc( func
, inst
, cc_LessThan
);
2040 case TGSI_OPCODE_SGE
:
2041 emit_setcc( func
, inst
, cc_NotLessThan
);
2044 case TGSI_OPCODE_MAD
:
2045 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2046 FETCH( func
, *inst
, 0, 0, chan_index
);
2047 FETCH( func
, *inst
, 1, 1, chan_index
);
2048 FETCH( func
, *inst
, 2, 2, chan_index
);
2049 emit_mul( func
, 0, 1 );
2050 emit_add( func
, 0, 2 );
2051 STORE( func
, *inst
, 0, 0, chan_index
);
2055 case TGSI_OPCODE_SUB
:
2056 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2057 FETCH( func
, *inst
, 0, 0, chan_index
);
2058 FETCH( func
, *inst
, 1, 1, chan_index
);
2059 emit_sub( func
, 0, 1 );
2060 STORE( func
, *inst
, 0, 0, chan_index
);
2064 case TGSI_OPCODE_LRP
:
2065 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2066 FETCH( func
, *inst
, 0, 0, chan_index
);
2067 FETCH( func
, *inst
, 1, 1, chan_index
);
2068 FETCH( func
, *inst
, 2, 2, chan_index
);
2069 emit_sub( func
, 1, 2 );
2070 emit_mul( func
, 0, 1 );
2071 emit_add( func
, 0, 2 );
2072 STORE( func
, *inst
, 0, 0, chan_index
);
2076 case TGSI_OPCODE_CND
:
2080 case TGSI_OPCODE_DP2A
:
2081 FETCH( func
, *inst
, 0, 0, CHAN_X
); /* xmm0 = src[0].x */
2082 FETCH( func
, *inst
, 1, 1, CHAN_X
); /* xmm1 = src[1].x */
2083 emit_mul( func
, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
2084 FETCH( func
, *inst
, 1, 0, CHAN_Y
); /* xmm1 = src[0].y */
2085 FETCH( func
, *inst
, 2, 1, CHAN_Y
); /* xmm2 = src[1].y */
2086 emit_mul( func
, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
2087 emit_add( func
, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2088 FETCH( func
, *inst
, 1, 2, CHAN_X
); /* xmm1 = src[2].x */
2089 emit_add( func
, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2090 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2091 STORE( func
, *inst
, 0, 0, chan_index
); /* dest[ch] = xmm0 */
2095 case TGSI_OPCODE_FRC
:
2096 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2097 FETCH( func
, *inst
, 0, 0, chan_index
);
2098 emit_frc( func
, 0, 0 );
2099 STORE( func
, *inst
, 0, 0, chan_index
);
2103 case TGSI_OPCODE_CLAMP
:
2107 case TGSI_OPCODE_FLR
:
2108 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2109 FETCH( func
, *inst
, 0, 0, chan_index
);
2110 emit_flr( func
, 0, 0 );
2111 STORE( func
, *inst
, 0, 0, chan_index
);
2115 case TGSI_OPCODE_ROUND
:
2116 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2117 FETCH( func
, *inst
, 0, 0, chan_index
);
2118 emit_rnd( func
, 0, 0 );
2119 STORE( func
, *inst
, 0, 0, chan_index
);
2123 case TGSI_OPCODE_EX2
:
2124 FETCH( func
, *inst
, 0, 0, CHAN_X
);
2125 emit_ex2( func
, 0, 0 );
2126 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2127 STORE( func
, *inst
, 0, 0, chan_index
);
2131 case TGSI_OPCODE_LG2
:
2132 FETCH( func
, *inst
, 0, 0, CHAN_X
);
2133 emit_lg2( func
, 0, 0 );
2134 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2135 STORE( func
, *inst
, 0, 0, chan_index
);
2139 case TGSI_OPCODE_POW
:
2140 FETCH( func
, *inst
, 0, 0, CHAN_X
);
2141 FETCH( func
, *inst
, 1, 1, CHAN_X
);
2142 emit_pow( func
, 0, 0, 0, 1 );
2143 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2144 STORE( func
, *inst
, 0, 0, chan_index
);
2148 case TGSI_OPCODE_XPD
:
2149 /* Note: we do all stores after all operands have been fetched
2150 * to avoid src/dst register aliasing issues for an instruction
2151 * such as: XPD TEMP[2].xyz, TEMP[0], TEMP[2];
2153 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
2154 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ) {
2155 FETCH( func
, *inst
, 1, 1, CHAN_Z
); /* xmm[1] = src[1].z */
2156 FETCH( func
, *inst
, 3, 0, CHAN_Z
); /* xmm[3] = src[0].z */
2158 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
2159 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
2160 FETCH( func
, *inst
, 0, 0, CHAN_Y
); /* xmm[0] = src[0].y */
2161 FETCH( func
, *inst
, 4, 1, CHAN_Y
); /* xmm[4] = src[1].y */
2163 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) {
2164 emit_MOV( func
, 7, 0 ); /* xmm[7] = xmm[0] */
2165 emit_mul( func
, 7, 1 ); /* xmm[7] = xmm[2] * xmm[1] */
2166 emit_MOV( func
, 5, 3 ); /* xmm[5] = xmm[3] */
2167 emit_mul( func
, 5, 4 ); /* xmm[5] = xmm[5] * xmm[4] */
2168 emit_sub( func
, 7, 5 ); /* xmm[7] = xmm[2] - xmm[5] */
2169 /* store xmm[7] in dst.x below */
2171 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ||
2172 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
2173 FETCH( func
, *inst
, 2, 1, CHAN_X
); /* xmm[2] = src[1].x */
2174 FETCH( func
, *inst
, 5, 0, CHAN_X
); /* xmm[5] = src[0].x */
2176 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) {
2177 emit_mul( func
, 3, 2 ); /* xmm[3] = xmm[3] * xmm[2] */
2178 emit_mul( func
, 1, 5 ); /* xmm[1] = xmm[1] * xmm[5] */
2179 emit_sub( func
, 3, 1 ); /* xmm[3] = xmm[3] - xmm[1] */
2180 /* store xmm[3] in dst.y below */
2182 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) {
2183 emit_mul( func
, 5, 4 ); /* xmm[5] = xmm[5] * xmm[4] */
2184 emit_mul( func
, 0, 2 ); /* xmm[0] = xmm[0] * xmm[2] */
2185 emit_sub( func
, 5, 0 ); /* xmm[5] = xmm[5] - xmm[0] */
2186 STORE( func
, *inst
, 5, 0, CHAN_Z
); /* dst.z = xmm[5] */
2188 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) {
2189 STORE( func
, *inst
, 7, 0, CHAN_X
); /* dst.x = xmm[7] */
2191 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) {
2192 STORE( func
, *inst
, 3, 0, CHAN_Y
); /* dst.y = xmm[3] */
2194 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) {
2200 STORE( func
, *inst
, 0, 0, CHAN_W
);
2204 case TGSI_OPCODE_ABS
:
2205 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2206 FETCH( func
, *inst
, 0, 0, chan_index
);
2207 emit_abs( func
, 0) ;
2209 STORE( func
, *inst
, 0, 0, chan_index
);
2213 case TGSI_OPCODE_RCC
:
2217 case TGSI_OPCODE_DPH
:
2218 FETCH( func
, *inst
, 0, 0, CHAN_X
);
2219 FETCH( func
, *inst
, 1, 1, CHAN_X
);
2220 emit_mul( func
, 0, 1 );
2221 FETCH( func
, *inst
, 1, 0, CHAN_Y
);
2222 FETCH( func
, *inst
, 2, 1, CHAN_Y
);
2223 emit_mul( func
, 1, 2 );
2224 emit_add( func
, 0, 1 );
2225 FETCH( func
, *inst
, 1, 0, CHAN_Z
);
2226 FETCH( func
, *inst
, 2, 1, CHAN_Z
);
2227 emit_mul( func
, 1, 2 );
2228 emit_add( func
, 0, 1 );
2229 FETCH( func
, *inst
, 1, 1, CHAN_W
);
2230 emit_add( func
, 0, 1 );
2231 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2232 STORE( func
, *inst
, 0, 0, chan_index
);
2236 case TGSI_OPCODE_COS
:
2237 FETCH( func
, *inst
, 0, 0, CHAN_X
);
2238 emit_cos( func
, 0, 0 );
2239 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2240 STORE( func
, *inst
, 0, 0, chan_index
);
2244 case TGSI_OPCODE_DDX
:
2248 case TGSI_OPCODE_DDY
:
2252 case TGSI_OPCODE_KILP
:
2253 /* predicated kill */
2255 return 0; /* XXX fix me */
2258 case TGSI_OPCODE_KIL
:
2259 /* conditional kill */
2260 emit_kil( func
, &inst
->Src
[0] );
2263 case TGSI_OPCODE_PK2H
:
2267 case TGSI_OPCODE_PK2US
:
2271 case TGSI_OPCODE_PK4B
:
2275 case TGSI_OPCODE_PK4UB
:
2279 case TGSI_OPCODE_RFL
:
2283 case TGSI_OPCODE_SEQ
:
2284 emit_setcc( func
, inst
, cc_Equal
);
2287 case TGSI_OPCODE_SFL
:
2291 case TGSI_OPCODE_SGT
:
2292 emit_setcc( func
, inst
, cc_NotLessThanEqual
);
2295 case TGSI_OPCODE_SIN
:
2296 FETCH( func
, *inst
, 0, 0, CHAN_X
);
2297 emit_sin( func
, 0, 0 );
2298 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2299 STORE( func
, *inst
, 0, 0, chan_index
);
2303 case TGSI_OPCODE_SLE
:
2304 emit_setcc( func
, inst
, cc_LessThanEqual
);
2307 case TGSI_OPCODE_SNE
:
2308 emit_setcc( func
, inst
, cc_NotEqual
);
2311 case TGSI_OPCODE_STR
:
2315 case TGSI_OPCODE_TEX
:
2316 emit_tex( func
, inst
, FALSE
, FALSE
);
2319 case TGSI_OPCODE_TXD
:
2323 case TGSI_OPCODE_UP2H
:
2327 case TGSI_OPCODE_UP2US
:
2331 case TGSI_OPCODE_UP4B
:
2335 case TGSI_OPCODE_UP4UB
:
2339 case TGSI_OPCODE_X2D
:
2343 case TGSI_OPCODE_ARA
:
2347 case TGSI_OPCODE_ARR
:
2348 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2349 FETCH( func
, *inst
, 0, 0, chan_index
);
2350 emit_rnd( func
, 0, 0 );
2351 emit_f2it( func
, 0 );
2352 STORE( func
, *inst
, 0, 0, chan_index
);
2356 case TGSI_OPCODE_BRA
:
2360 case TGSI_OPCODE_CAL
:
2364 case TGSI_OPCODE_RET
:
2368 case TGSI_OPCODE_END
:
2371 case TGSI_OPCODE_SSG
:
2372 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2373 FETCH( func
, *inst
, 0, 0, chan_index
);
2374 emit_sgn( func
, 0, 0 );
2375 STORE( func
, *inst
, 0, 0, chan_index
);
2379 case TGSI_OPCODE_CMP
:
2380 emit_cmp (func
, inst
);
2383 case TGSI_OPCODE_SCS
:
2384 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) {
2385 FETCH( func
, *inst
, 0, 0, CHAN_X
);
2386 emit_cos( func
, 0, 0 );
2387 STORE( func
, *inst
, 0, 0, CHAN_X
);
2389 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) {
2390 FETCH( func
, *inst
, 0, 0, CHAN_X
);
2391 emit_sin( func
, 0, 0 );
2392 STORE( func
, *inst
, 0, 0, CHAN_Y
);
2394 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) {
2398 TGSI_EXEC_TEMP_00000000_I
,
2399 TGSI_EXEC_TEMP_00000000_C
);
2400 STORE( func
, *inst
, 0, 0, CHAN_Z
);
2402 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) {
2408 STORE( func
, *inst
, 0, 0, CHAN_W
);
2412 case TGSI_OPCODE_TXB
:
2413 emit_tex( func
, inst
, TRUE
, FALSE
);
2416 case TGSI_OPCODE_NRM
:
2418 case TGSI_OPCODE_NRM4
:
2419 /* 3 or 4-component normalization */
2421 uint dims
= (inst
->Instruction
.Opcode
== TGSI_OPCODE_NRM
) ? 3 : 4;
2423 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_X
) ||
2424 IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_Y
) ||
2425 IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_Z
) ||
2426 (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_W
) && dims
== 4)) {
2428 /* NOTE: Cannot use xmm regs 2/3 here (see emit_rsqrt() above). */
2431 /* xmm0 = src.x * src.x */
2432 FETCH(func
, *inst
, 0, 0, CHAN_X
);
2433 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_X
)) {
2434 emit_MOV(func
, 4, 0);
2436 emit_mul(func
, 0, 0);
2439 /* xmm0 = xmm0 + src.y * src.y */
2440 FETCH(func
, *inst
, 1, 0, CHAN_Y
);
2441 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_Y
)) {
2442 emit_MOV(func
, 5, 1);
2444 emit_mul(func
, 1, 1);
2445 emit_add(func
, 0, 1);
2448 /* xmm0 = xmm0 + src.z * src.z */
2449 FETCH(func
, *inst
, 1, 0, CHAN_Z
);
2450 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_Z
)) {
2451 emit_MOV(func
, 6, 1);
2453 emit_mul(func
, 1, 1);
2454 emit_add(func
, 0, 1);
2458 /* xmm0 = xmm0 + src.w * src.w */
2459 FETCH(func
, *inst
, 1, 0, CHAN_W
);
2460 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_W
)) {
2461 emit_MOV(func
, 7, 1);
2463 emit_mul(func
, 1, 1);
2464 emit_add(func
, 0, 1);
2467 /* xmm1 = 1 / sqrt(xmm0) */
2468 emit_rsqrt(func
, 1, 0);
2470 /* dst.x = xmm1 * src.x */
2471 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_X
)) {
2472 emit_mul(func
, 4, 1);
2473 STORE(func
, *inst
, 4, 0, CHAN_X
);
2476 /* dst.y = xmm1 * src.y */
2477 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_Y
)) {
2478 emit_mul(func
, 5, 1);
2479 STORE(func
, *inst
, 5, 0, CHAN_Y
);
2482 /* dst.z = xmm1 * src.z */
2483 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_Z
)) {
2484 emit_mul(func
, 6, 1);
2485 STORE(func
, *inst
, 6, 0, CHAN_Z
);
2488 /* dst.w = xmm1 * src.w */
2489 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_X
) && dims
== 4) {
2490 emit_mul(func
, 7, 1);
2491 STORE(func
, *inst
, 7, 0, CHAN_W
);
2496 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_W
) && dims
== 3) {
2497 emit_tempf(func
, 0, TEMP_ONE_I
, TEMP_ONE_C
);
2498 STORE(func
, *inst
, 0, 0, CHAN_W
);
2503 case TGSI_OPCODE_DIV
:
2507 case TGSI_OPCODE_DP2
:
2508 FETCH( func
, *inst
, 0, 0, CHAN_X
); /* xmm0 = src[0].x */
2509 FETCH( func
, *inst
, 1, 1, CHAN_X
); /* xmm1 = src[1].x */
2510 emit_mul( func
, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
2511 FETCH( func
, *inst
, 1, 0, CHAN_Y
); /* xmm1 = src[0].y */
2512 FETCH( func
, *inst
, 2, 1, CHAN_Y
); /* xmm2 = src[1].y */
2513 emit_mul( func
, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
2514 emit_add( func
, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2515 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2516 STORE( func
, *inst
, 0, 0, chan_index
); /* dest[ch] = xmm0 */
2520 case TGSI_OPCODE_TXL
:
2524 case TGSI_OPCODE_TXP
:
2525 emit_tex( func
, inst
, FALSE
, TRUE
);
2528 case TGSI_OPCODE_BRK
:
2532 case TGSI_OPCODE_IF
:
2536 case TGSI_OPCODE_BGNFOR
:
2540 case TGSI_OPCODE_REP
:
2544 case TGSI_OPCODE_ELSE
:
2548 case TGSI_OPCODE_ENDIF
:
2552 case TGSI_OPCODE_ENDFOR
:
2556 case TGSI_OPCODE_ENDREP
:
2560 case TGSI_OPCODE_PUSHA
:
2564 case TGSI_OPCODE_POPA
:
2568 case TGSI_OPCODE_CEIL
:
2572 case TGSI_OPCODE_I2F
:
2576 case TGSI_OPCODE_NOT
:
2580 case TGSI_OPCODE_TRUNC
:
2581 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2582 FETCH( func
, *inst
, 0, 0, chan_index
);
2583 emit_f2it( func
, 0 );
2584 emit_i2f( func
, 0 );
2585 STORE( func
, *inst
, 0, 0, chan_index
);
2589 case TGSI_OPCODE_SHL
:
2593 case TGSI_OPCODE_ISHR
:
2597 case TGSI_OPCODE_AND
:
2601 case TGSI_OPCODE_OR
:
2605 case TGSI_OPCODE_MOD
:
2609 case TGSI_OPCODE_XOR
:
2613 case TGSI_OPCODE_SAD
:
2617 case TGSI_OPCODE_TXF
:
2621 case TGSI_OPCODE_TXQ
:
2625 case TGSI_OPCODE_CONT
:
2629 case TGSI_OPCODE_EMIT
:
2633 case TGSI_OPCODE_ENDPRIM
:
2646 struct x86_function
*func
,
2647 struct tgsi_full_declaration
*decl
)
2649 if( decl
->Declaration
.File
== TGSI_FILE_INPUT
||
2650 decl
->Declaration
.File
== TGSI_FILE_SYSTEM_VALUE
) {
2651 unsigned first
, last
, mask
;
2654 first
= decl
->Range
.First
;
2655 last
= decl
->Range
.Last
;
2656 mask
= decl
->Declaration
.UsageMask
;
2658 for( i
= first
; i
<= last
; i
++ ) {
2659 for( j
= 0; j
< NUM_CHANNELS
; j
++ ) {
2660 if( mask
& (1 << j
) ) {
2661 switch( decl
->Declaration
.Interpolate
) {
2662 case TGSI_INTERPOLATE_CONSTANT
:
2663 emit_coef_a0( func
, 0, i
, j
);
2664 emit_inputs( func
, 0, i
, j
);
2667 case TGSI_INTERPOLATE_LINEAR
:
2668 emit_tempf( func
, 0, 0, TGSI_SWIZZLE_X
);
2669 emit_coef_dadx( func
, 1, i
, j
);
2670 emit_tempf( func
, 2, 0, TGSI_SWIZZLE_Y
);
2671 emit_coef_dady( func
, 3, i
, j
);
2672 emit_mul( func
, 0, 1 ); /* x * dadx */
2673 emit_coef_a0( func
, 4, i
, j
);
2674 emit_mul( func
, 2, 3 ); /* y * dady */
2675 emit_add( func
, 0, 4 ); /* x * dadx + a0 */
2676 emit_add( func
, 0, 2 ); /* x * dadx + y * dady + a0 */
2677 emit_inputs( func
, 0, i
, j
);
2680 case TGSI_INTERPOLATE_PERSPECTIVE
:
2681 emit_tempf( func
, 0, 0, TGSI_SWIZZLE_X
);
2682 emit_coef_dadx( func
, 1, i
, j
);
2683 emit_tempf( func
, 2, 0, TGSI_SWIZZLE_Y
);
2684 emit_coef_dady( func
, 3, i
, j
);
2685 emit_mul( func
, 0, 1 ); /* x * dadx */
2686 emit_tempf( func
, 4, 0, TGSI_SWIZZLE_W
);
2687 emit_coef_a0( func
, 5, i
, j
);
2688 emit_rcp( func
, 4, 4 ); /* 1.0 / w */
2689 emit_mul( func
, 2, 3 ); /* y * dady */
2690 emit_add( func
, 0, 5 ); /* x * dadx + a0 */
2691 emit_add( func
, 0, 2 ); /* x * dadx + y * dady + a0 */
2692 emit_mul( func
, 0, 4 ); /* (x * dadx + y * dady + a0) / w */
2693 emit_inputs( func
, 0, i
, j
);
2706 static void aos_to_soa( struct x86_function
*func
,
2712 struct x86_reg soa_input
= x86_make_reg( file_REG32
, reg_AX
);
2713 struct x86_reg aos_input
= x86_make_reg( file_REG32
, reg_BX
);
2714 struct x86_reg num_inputs
= x86_make_reg( file_REG32
, reg_CX
);
2715 struct x86_reg stride
= x86_make_reg( file_REG32
, reg_DX
);
2720 x86_push( func
, x86_make_reg( file_REG32
, reg_BX
) );
2722 x86_mov( func
, aos_input
, x86_fn_arg( func
, arg_aos
) );
2723 x86_mov( func
, soa_input
, x86_fn_arg( func
, arg_machine
) );
2724 x86_lea( func
, soa_input
,
2725 x86_make_disp( soa_input
,
2726 Offset(struct tgsi_exec_machine
, Inputs
) ) );
2727 x86_mov( func
, num_inputs
, x86_fn_arg( func
, arg_num
) );
2728 x86_mov( func
, stride
, x86_fn_arg( func
, arg_stride
) );
2731 inner_loop
= x86_get_label( func
);
2733 x86_push( func
, aos_input
);
2734 sse_movlps( func
, make_xmm( 0 ), x86_make_disp( aos_input
, 0 ) );
2735 sse_movlps( func
, make_xmm( 3 ), x86_make_disp( aos_input
, 8 ) );
2736 x86_add( func
, aos_input
, stride
);
2737 sse_movhps( func
, make_xmm( 0 ), x86_make_disp( aos_input
, 0 ) );
2738 sse_movhps( func
, make_xmm( 3 ), x86_make_disp( aos_input
, 8 ) );
2739 x86_add( func
, aos_input
, stride
);
2740 sse_movlps( func
, make_xmm( 1 ), x86_make_disp( aos_input
, 0 ) );
2741 sse_movlps( func
, make_xmm( 4 ), x86_make_disp( aos_input
, 8 ) );
2742 x86_add( func
, aos_input
, stride
);
2743 sse_movhps( func
, make_xmm( 1 ), x86_make_disp( aos_input
, 0 ) );
2744 sse_movhps( func
, make_xmm( 4 ), x86_make_disp( aos_input
, 8 ) );
2745 x86_pop( func
, aos_input
);
2747 sse_movaps( func
, make_xmm( 2 ), make_xmm( 0 ) );
2748 sse_movaps( func
, make_xmm( 5 ), make_xmm( 3 ) );
2749 sse_shufps( func
, make_xmm( 0 ), make_xmm( 1 ), 0x88 );
2750 sse_shufps( func
, make_xmm( 2 ), make_xmm( 1 ), 0xdd );
2751 sse_shufps( func
, make_xmm( 3 ), make_xmm( 4 ), 0x88 );
2752 sse_shufps( func
, make_xmm( 5 ), make_xmm( 4 ), 0xdd );
2754 sse_movups( func
, x86_make_disp( soa_input
, 0 ), make_xmm( 0 ) );
2755 sse_movups( func
, x86_make_disp( soa_input
, 16 ), make_xmm( 2 ) );
2756 sse_movups( func
, x86_make_disp( soa_input
, 32 ), make_xmm( 3 ) );
2757 sse_movups( func
, x86_make_disp( soa_input
, 48 ), make_xmm( 5 ) );
2759 /* Advance to next input */
2760 x86_lea( func
, aos_input
, x86_make_disp(aos_input
, 16) );
2761 x86_lea( func
, soa_input
, x86_make_disp(soa_input
, 64) );
2763 /* while --num_inputs */
2764 x86_dec( func
, num_inputs
);
2765 x86_jcc( func
, cc_NE
, inner_loop
);
2768 x86_pop( func
, x86_make_reg( file_REG32
, reg_BX
) );
2771 static void soa_to_aos( struct x86_function
*func
,
2777 struct x86_reg soa_output
= x86_make_reg( file_REG32
, reg_AX
);
2778 struct x86_reg aos_output
= x86_make_reg( file_REG32
, reg_BX
);
2779 struct x86_reg num_outputs
= x86_make_reg( file_REG32
, reg_CX
);
2780 struct x86_reg temp
= x86_make_reg( file_REG32
, reg_DX
);
2784 x86_push( func
, x86_make_reg( file_REG32
, reg_BX
) );
2786 x86_mov( func
, aos_output
, x86_fn_arg( func
, arg_aos
) );
2787 x86_mov( func
, soa_output
, x86_fn_arg( func
, arg_machine
) );
2788 x86_lea( func
, soa_output
,
2789 x86_make_disp( soa_output
,
2790 Offset(struct tgsi_exec_machine
, Outputs
) ) );
2791 x86_mov( func
, num_outputs
, x86_fn_arg( func
, arg_num
) );
2794 inner_loop
= x86_get_label( func
);
2796 sse_movups( func
, make_xmm( 0 ), x86_make_disp( soa_output
, 0 ) );
2797 sse_movups( func
, make_xmm( 1 ), x86_make_disp( soa_output
, 16 ) );
2798 sse_movups( func
, make_xmm( 3 ), x86_make_disp( soa_output
, 32 ) );
2799 sse_movups( func
, make_xmm( 4 ), x86_make_disp( soa_output
, 48 ) );
2801 sse_movaps( func
, make_xmm( 2 ), make_xmm( 0 ) );
2802 sse_movaps( func
, make_xmm( 5 ), make_xmm( 3 ) );
2803 sse_unpcklps( func
, make_xmm( 0 ), make_xmm( 1 ) );
2804 sse_unpckhps( func
, make_xmm( 2 ), make_xmm( 1 ) );
2805 sse_unpcklps( func
, make_xmm( 3 ), make_xmm( 4 ) );
2806 sse_unpckhps( func
, make_xmm( 5 ), make_xmm( 4 ) );
2808 x86_mov( func
, temp
, x86_fn_arg( func
, arg_stride
) );
2809 x86_push( func
, aos_output
);
2810 sse_movlps( func
, x86_make_disp( aos_output
, 0 ), make_xmm( 0 ) );
2811 sse_movlps( func
, x86_make_disp( aos_output
, 8 ), make_xmm( 3 ) );
2812 x86_add( func
, aos_output
, temp
);
2813 sse_movhps( func
, x86_make_disp( aos_output
, 0 ), make_xmm( 0 ) );
2814 sse_movhps( func
, x86_make_disp( aos_output
, 8 ), make_xmm( 3 ) );
2815 x86_add( func
, aos_output
, temp
);
2816 sse_movlps( func
, x86_make_disp( aos_output
, 0 ), make_xmm( 2 ) );
2817 sse_movlps( func
, x86_make_disp( aos_output
, 8 ), make_xmm( 5 ) );
2818 x86_add( func
, aos_output
, temp
);
2819 sse_movhps( func
, x86_make_disp( aos_output
, 0 ), make_xmm( 2 ) );
2820 sse_movhps( func
, x86_make_disp( aos_output
, 8 ), make_xmm( 5 ) );
2821 x86_pop( func
, aos_output
);
2823 /* Advance to next output */
2824 x86_lea( func
, aos_output
, x86_make_disp(aos_output
, 16) );
2825 x86_lea( func
, soa_output
, x86_make_disp(soa_output
, 64) );
2827 /* while --num_outputs */
2828 x86_dec( func
, num_outputs
);
2829 x86_jcc( func
, cc_NE
, inner_loop
);
2832 x86_pop( func
, x86_make_reg( file_REG32
, reg_BX
) );
2836 * Translate a TGSI vertex/fragment shader to SSE2 code.
2837 * Slightly different things are done for vertex vs. fragment shaders.
2839 * \param tokens the TGSI input shader
2840 * \param func the output SSE code/function
2841 * \param immediates buffer to place immediates, later passed to SSE func
2842 * \param return 1 for success, 0 if translation failed
2846 const struct tgsi_token
*tokens
,
2847 struct x86_function
*func
,
2848 float (*immediates
)[4],
2849 boolean do_swizzles
)
2851 struct tgsi_parse_context parse
;
2853 uint num_immediates
= 0;
2857 func
->csr
= func
->store
;
2859 tgsi_parse_init( &parse
, tokens
);
2861 /* Can't just use EDI, EBX without save/restoring them:
2863 x86_push( func
, x86_make_reg( file_REG32
, reg_BX
) );
2864 x86_push( func
, x86_make_reg( file_REG32
, reg_DI
) );
2867 * Different function args for vertex/fragment shaders:
2869 if (parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_VERTEX
) {
2875 6 ); /* input_stride */
2881 x86_fn_arg( func
, 1 ) );
2885 x86_fn_arg( func
, 2 ) );
2888 get_immediate_base(),
2889 x86_fn_arg( func
, 3 ) );
2891 if (parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_FRAGMENT
) {
2895 x86_fn_arg( func
, 4 ) );
2901 x86_make_disp( get_machine_base(),
2902 Offset( struct tgsi_exec_machine
, Samplers
) ) );
2905 while( !tgsi_parse_end_of_tokens( &parse
) && ok
) {
2906 tgsi_parse_token( &parse
);
2908 switch( parse
.FullToken
.Token
.Type
) {
2909 case TGSI_TOKEN_TYPE_DECLARATION
:
2910 if (parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_FRAGMENT
) {
2913 &parse
.FullToken
.FullDeclaration
);
2917 case TGSI_TOKEN_TYPE_INSTRUCTION
:
2918 ok
= emit_instruction(
2920 &parse
.FullToken
.FullInstruction
);
2923 uint opcode
= parse
.FullToken
.FullInstruction
.Instruction
.Opcode
;
2924 debug_printf("failed to translate tgsi opcode %d (%s) to SSE (%s)\n",
2926 tgsi_get_opcode_name(opcode
),
2927 parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_VERTEX
?
2928 "vertex shader" : "fragment shader");
2931 if (tgsi_check_soa_dependencies(&parse
.FullToken
.FullInstruction
)) {
2932 uint opcode
= parse
.FullToken
.FullInstruction
.Instruction
.Opcode
;
2934 /* XXX: we only handle src/dst aliasing in a few opcodes
2935 * currently. Need to use an additional temporay to hold
2936 * the result in the cases where the code is too opaque to
2939 if (opcode
!= TGSI_OPCODE_MOV
) {
2940 debug_printf("Warning: src/dst aliasing in instruction"
2941 " is not handled:\n");
2942 tgsi_dump_instruction(&parse
.FullToken
.FullInstruction
, 1);
2947 case TGSI_TOKEN_TYPE_IMMEDIATE
:
2948 /* simply copy the immediate values into the next immediates[] slot */
2950 const uint size
= parse
.FullToken
.FullImmediate
.Immediate
.NrTokens
- 1;
2953 assert(num_immediates
< TGSI_EXEC_NUM_IMMEDIATES
);
2954 for( i
= 0; i
< size
; i
++ ) {
2955 immediates
[num_immediates
][i
] =
2956 parse
.FullToken
.FullImmediate
.u
[i
].Float
;
2959 debug_printf("SSE FS immediate[%d] = %f %f %f %f\n",
2961 immediates
[num_immediates
][0],
2962 immediates
[num_immediates
][1],
2963 immediates
[num_immediates
][2],
2964 immediates
[num_immediates
][3]);
2969 case TGSI_TOKEN_TYPE_PROPERTY
:
2970 /* we just ignore them for now */
2979 if (parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_VERTEX
) {
2984 8, /* num_outputs */
2985 9 ); /* output_stride */
2988 /* Can't just use EBX, EDI without save/restoring them:
2990 x86_pop( func
, x86_make_reg( file_REG32
, reg_DI
) );
2991 x86_pop( func
, x86_make_reg( file_REG32
, reg_BX
) );
2995 tgsi_parse_free( &parse
);
3000 #endif /* PIPE_ARCH_X86 */