1 /**************************************************************************
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
5 * Copyright 2009-2010 VMware, Inc. All rights Reserved.
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the
9 * "Software"), to deal in the Software without restriction, including
10 * without limitation the rights to use, copy, modify, merge, publish,
11 * distribute, sub license, and/or sell copies of the Software, and to
12 * permit persons to whom the Software is furnished to do so, subject to
13 * the following conditions:
15 * The above copyright notice and this permission notice (including the
16 * next paragraph) shall be included in all copies or substantial portions
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
22 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
23 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 **************************************************************************/
29 #include "pipe/p_config.h"
31 #if defined(PIPE_ARCH_X86)
33 #include "util/u_debug.h"
34 #include "pipe/p_shader_tokens.h"
35 #include "util/u_math.h"
36 #include "util/u_memory.h"
37 #if defined(PIPE_ARCH_SSE)
38 #include "util/u_sse.h"
40 #include "tgsi/tgsi_info.h"
41 #include "tgsi/tgsi_parse.h"
42 #include "tgsi/tgsi_util.h"
43 #include "tgsi/tgsi_dump.h"
44 #include "tgsi/tgsi_exec.h"
45 #include "tgsi/tgsi_sse2.h"
47 #include "rtasm/rtasm_x86sse.h"
51 * This costs about 100fps (close to 10%) in gears:
53 #define HIGH_PRECISION 1
58 #define FOR_EACH_CHANNEL( CHAN )\
59 for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)
61 #define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
62 ((INST).Dst[0].Register.WriteMask & (1 << (CHAN)))
64 #define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
65 if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))
67 #define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
68 FOR_EACH_CHANNEL( CHAN )\
69 IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )
76 #define TEMP_ONE_I TGSI_EXEC_TEMP_ONE_I
77 #define TEMP_ONE_C TGSI_EXEC_TEMP_ONE_C
79 #define TEMP_R0 TGSI_EXEC_TEMP_R0
80 #define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
81 #define TEMP_EXEC_MASK_I TGSI_EXEC_MASK_I
82 #define TEMP_EXEC_MASK_C TGSI_EXEC_MASK_C
86 * X86 utility functions.
95 (enum x86_reg_name
) xmm
);
99 * X86 register mapping helpers.
102 static struct x86_reg
103 get_const_base( void )
110 static struct x86_reg
111 get_machine_base( void )
118 static struct x86_reg
119 get_input_base( void )
121 return x86_make_disp(
123 Offset(struct tgsi_exec_machine
, Inputs
) );
126 static struct x86_reg
127 get_output_base( void )
129 return x86_make_disp(
131 Offset(struct tgsi_exec_machine
, Outputs
) );
134 static struct x86_reg
135 get_temp_base( void )
137 return x86_make_disp(
139 Offset(struct tgsi_exec_machine
, Temps
) );
142 static struct x86_reg
143 get_coef_base( void )
150 static struct x86_reg
151 get_sampler_base( void )
158 static struct x86_reg
159 get_immediate_base( void )
166 static struct x86_reg
167 get_system_value_base( void )
169 return x86_make_disp(
171 Offset(struct tgsi_exec_machine
, SystemValue
) );
176 * Data access helpers.
180 static struct x86_reg
185 return x86_make_disp(
186 get_immediate_base(),
187 (vec
* 4 + chan
) * 4 );
190 static struct x86_reg
195 return x86_make_disp(
197 (vec
* 4 + chan
) * 4 );
200 static struct x86_reg
204 return x86_make_disp(
206 unit
* sizeof( struct tgsi_sampler
* ) );
209 static struct x86_reg
214 return x86_make_disp(
216 (vec
* 4 + chan
) * 16 );
219 static struct x86_reg
224 return x86_make_disp(
226 (vec
* 4 + chan
) * 16 );
229 static struct x86_reg
234 return x86_make_disp(
236 (vec
* 4 + chan
) * 16 );
239 static struct x86_reg
244 return x86_make_disp(
245 get_system_value_base(), /* base */
246 (vec
* 4 + chan
) * 4 ); /* byte offset from base */
249 static struct x86_reg
255 return x86_make_disp(
257 ((vec
* 3 + member
) * 4 + chan
) * 4 );
263 struct x86_function
*func
)
270 * Data fetch helpers.
274 * Copy a shader constant to xmm register
275 * \param xmm the destination xmm register
276 * \param vec the src const buffer index
277 * \param chan src channel to fetch (X, Y, Z or W)
281 struct x86_function
*func
,
290 /* 'vec' is the offset from the address register's value.
291 * We're loading CONST[ADDR+vec] into an xmm register.
293 struct x86_reg r0
= get_immediate_base();
294 struct x86_reg r1
= get_coef_base();
297 assert( indirectFile
== TGSI_FILE_ADDRESS
);
298 assert( indirectIndex
== 0 );
299 assert( r0
.mod
== mod_REG
);
300 assert( r1
.mod
== mod_REG
);
302 x86_push( func
, r0
);
303 x86_push( func
, r1
);
306 * Loop over the four pixels or vertices in the quad.
307 * Get the value of the address (offset) register for pixel/vertex[i],
308 * add it to the src offset and index into the constant buffer.
309 * Note that we're working on SOA data.
310 * If any of the pixel/vertex execution channels are unused their
311 * values will be garbage. It's very important that we don't use
312 * those garbage values as indexes into the constant buffer since
313 * that'll cause segfaults.
314 * The solution is to bitwise-AND the offset with the execution mask
315 * register whose values are either 0 or ~0.
316 * The caller must setup the execution mask register to indicate
317 * which channels are valid/alive before running the shader.
318 * The execution mask will also figure into loops and conditionals
321 for (i
= 0; i
< QUAD_SIZE
; i
++) {
322 /* r1 = address register[i] */
323 x86_mov( func
, r1
, x86_make_disp( get_temp( TEMP_ADDR
, CHAN_X
), i
* 4 ) );
324 /* r0 = execution mask[i] */
325 x86_mov( func
, r0
, x86_make_disp( get_temp( TEMP_EXEC_MASK_I
, TEMP_EXEC_MASK_C
), i
* 4 ) );
327 x86_and( func
, r1
, r0
);
328 /* r0 = 'vec', the offset */
329 x86_lea( func
, r0
, get_const( vec
, chan
) );
331 /* Quick hack to multiply r1 by 16 -- need to add SHL to rtasm.
333 x86_add( func
, r1
, r1
);
334 x86_add( func
, r1
, r1
);
335 x86_add( func
, r1
, r1
);
336 x86_add( func
, r1
, r1
);
338 x86_add( func
, r0
, r1
); /* r0 = r0 + r1 */
339 x86_mov( func
, r1
, x86_deref( r0
) );
340 x86_mov( func
, x86_make_disp( get_temp( TEMP_R0
, CHAN_X
), i
* 4 ), r1
);
349 get_temp( TEMP_R0
, CHAN_X
) );
352 /* 'vec' is the index into the src register file, such as TEMP[vec] */
358 get_const( vec
, chan
) );
363 SHUF( 0, 0, 0, 0 ) );
369 struct x86_function
*func
,
377 get_immediate( vec
, chan
) );
382 SHUF( 0, 0, 0, 0 ) );
387 * Copy a shader input to xmm register
388 * \param xmm the destination xmm register
389 * \param vec the src input attrib
390 * \param chan src channel to fetch (X, Y, Z or W)
394 struct x86_function
*func
,
402 get_input( vec
, chan
) );
406 * Store an xmm register to a shader output
407 * \param xmm the source xmm register
408 * \param vec the dest output attrib
409 * \param chan src dest channel to store (X, Y, Z or W)
413 struct x86_function
*func
,
420 get_output( vec
, chan
),
425 * Copy a shader temporary to xmm register
426 * \param xmm the destination xmm register
427 * \param vec the src temp register
428 * \param chan src channel to fetch (X, Y, Z or W)
432 struct x86_function
*func
,
440 get_temp( vec
, chan
) );
444 * Copy a system value to xmm register
445 * \param xmm the destination xmm register
446 * \param vec the source system value register
447 * \param chan src channel to fetch (X, Y, Z or W)
451 struct x86_function
*func
,
459 get_system_value( vec
, chan
) );
464 SHUF( 0, 0, 0, 0 ) );
468 * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
469 * \param xmm the destination xmm register
470 * \param vec the src input/attribute coefficient index
471 * \param chan src channel to fetch (X, Y, Z or W)
472 * \param member 0=a0, 1=dadx, 2=dady
476 struct x86_function
*func
,
485 get_coef( vec
, chan
, member
) );
490 SHUF( 0, 0, 0, 0 ) );
494 * Data store helpers.
499 struct x86_function
*func
,
506 get_input( vec
, chan
),
512 struct x86_function
*func
,
519 get_temp( vec
, chan
),
525 struct x86_function
*func
,
535 vec
+ TGSI_EXEC_TEMP_ADDR
,
540 * Coefficent fetch helpers.
545 struct x86_function
*func
,
560 struct x86_function
*func
,
575 struct x86_function
*func
,
589 * Function call helpers.
593 * NOTE: In gcc, if the destination uses the SSE intrinsics, then it must be
594 * defined with __attribute__((force_align_arg_pointer)), as we do not guarantee
595 * that the stack pointer is 16 byte aligned, as expected.
599 struct x86_function
*func
,
600 unsigned xmm_save_mask
,
601 const struct x86_reg
*arg
,
603 void (PIPE_CDECL
*code
)() )
605 struct x86_reg ecx
= x86_make_reg( file_REG32
, reg_CX
);
610 x86_make_reg( file_REG32
, reg_AX
) );
613 x86_make_reg( file_REG32
, reg_CX
) );
616 x86_make_reg( file_REG32
, reg_DX
) );
618 /* Store XMM regs to the stack
620 for(i
= 0, n
= 0; i
< 8; ++i
)
621 if(xmm_save_mask
& (1 << i
))
626 x86_make_reg( file_REG32
, reg_SP
),
629 for(i
= 0, n
= 0; i
< 8; ++i
)
630 if(xmm_save_mask
& (1 << i
)) {
633 x86_make_disp( x86_make_reg( file_REG32
, reg_SP
), n
*16 ),
638 for (i
= 0; i
< nr_args
; i
++) {
639 /* Load the address of the buffer we use for passing arguments and
647 /* Push actual function arguments (currently just the pointer to
648 * the buffer above), and call the function:
650 x86_push( func
, ecx
);
653 x86_mov_reg_imm( func
, ecx
, (unsigned long) code
);
654 x86_call( func
, ecx
);
656 /* Pop the arguments (or just add an immediate to esp)
658 for (i
= 0; i
< nr_args
; i
++) {
662 /* Pop the saved XMM regs:
664 for(i
= 0, n
= 0; i
< 8; ++i
)
665 if(xmm_save_mask
& (1 << i
)) {
669 x86_make_disp( x86_make_reg( file_REG32
, reg_SP
), n
*16 ) );
675 x86_make_reg( file_REG32
, reg_SP
),
678 /* Restore GP registers in a reverse order.
682 x86_make_reg( file_REG32
, reg_DX
) );
685 x86_make_reg( file_REG32
, reg_CX
) );
688 x86_make_reg( file_REG32
, reg_AX
) );
692 emit_func_call_dst_src1(
693 struct x86_function
*func
,
697 void (PIPE_CDECL
*code
)() )
699 struct x86_reg store
= get_temp( TEMP_R0
, 0 );
700 unsigned xmm_mask
= ((1 << xmm_save
) - 1) & ~(1 << xmm_dst
);
702 /* Store our input parameters (in xmm regs) to the buffer we use
703 * for passing arguments. We will pass a pointer to this buffer as
704 * the actual function argument.
709 make_xmm( xmm_src0
) );
711 emit_func_call( func
,
725 emit_func_call_dst_src2(
726 struct x86_function
*func
,
731 void (PIPE_CDECL
*code
)() )
733 struct x86_reg store
= get_temp( TEMP_R0
, 0 );
734 unsigned xmm_mask
= ((1 << xmm_save
) - 1) & ~(1 << xmm_dst
);
736 /* Store two inputs to parameter buffer.
741 make_xmm( xmm_src0
) );
745 x86_make_disp( store
, 4 * sizeof(float) ),
746 make_xmm( xmm_src1
) );
751 emit_func_call( func
,
757 /* Retrieve the results:
769 #if defined(PIPE_ARCH_SSE)
/*
 * Fast SSE2 implementation of special math functions.
 */

/* Horner-scheme polynomial evaluation helpers: POLYn(x, c0..cn) evaluates
 * c0 + c1*x + ... + cn*x^n across all four SIMD lanes.
 */
#define POLY0(x, c0) _mm_set1_ps(c0)
#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))

/* Degree of the minimax polynomial used for each approximation. */
#define EXP_POLY_DEGREE 3
#define LOG_POLY_DEGREE 5

/**
 * Fast approximation of 2^x for all four lanes of \p x.
 * Splits x into integer and fractional parts: 2^i is built directly in the
 * float exponent field, 2^f comes from a minimax polynomial on [-0.5, 0.5[.
 * See http://www.devmaster.net/forums/showthread.php?p=43580
 *
 * NOTE(review): the signature/brace lines were missing from this copy of the
 * file; restored to match the upstream definition (`static inline` is used
 * instead of Mesa's INLINE macro — equivalent under C99).
 */
static inline __m128
exp2f4(__m128 x)
{
   __m128i ipart;
   __m128 fpart, expipart, expfpart;

   /* Clamp so that the biased exponent below cannot overflow/underflow. */
   x = _mm_min_ps(x, _mm_set1_ps( 129.00000f));
   x = _mm_max_ps(x, _mm_set1_ps(-126.99999f));

   /* ipart = int(x - 0.5) */
   ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f)));

   /* fpart = x - ipart */
   fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart));

   /* expipart = (float) (1 << ipart) -- built by writing the biased
    * exponent field of an IEEE-754 single directly. */
   expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23));

   /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
#if EXP_POLY_DEGREE == 5
   expfpart = POLY5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f);
#elif EXP_POLY_DEGREE == 4
   expfpart = POLY4(fpart, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 5.2011464e-2f, 1.3534167e-2f);
#elif EXP_POLY_DEGREE == 3
   expfpart = POLY3(fpart, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 7.8024521e-2f);
#elif EXP_POLY_DEGREE == 2
   expfpart = POLY2(fpart, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f);
#else
#error
#endif

   return _mm_mul_ps(expipart, expfpart);
}

/**
 * Fast approximation of log2(x) for all four lanes of \p x.
 * Extracts the IEEE-754 exponent directly and approximates log2 of the
 * mantissa (in [1, 2[) with a minimax polynomial.
 * See http://www.devmaster.net/forums/showthread.php?p=43580
 */
static inline __m128
log2f4(__m128 x)
{
   __m128i expmask = _mm_set1_epi32(0x7f800000);
   __m128i mantmask = _mm_set1_epi32(0x007fffff);
   __m128 one = _mm_set1_ps(1.0f);

   __m128i i = _mm_castps_si128(x);

   /* exp = (float) exponent(x) */
   __m128 exp = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, expmask), 23), _mm_set1_epi32(127)));

   /* mant = (float) mantissa(x), forced into [1, 2[ by OR-ing in 1.0f */
   __m128 mant = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, mantmask)), one);

   __m128 logmant;

   /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
    * These coefficients can be generate with
    * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
    */
#if LOG_POLY_DEGREE == 6
   logmant = POLY5(mant, 3.11578814719469302614f, -3.32419399085241980044f, 2.59883907202499966007f, -1.23152682416275988241f, 0.318212422185251071475f, -0.0344359067839062357313f);
#elif LOG_POLY_DEGREE == 5
   logmant = POLY4(mant, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
   logmant = POLY3(mant, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
   logmant = POLY2(mant, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
#else
#error
#endif

   /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0 */
   logmant = _mm_mul_ps(logmant, _mm_sub_ps(mant, one));

   return _mm_add_ps(logmant, exp);
}

/**
 * Fast approximation of x^y per lane, via x^y == 2^(y * log2(x)).
 * Only valid for x > 0 (log2f4 reads the raw float bits).
 */
static inline __m128
powf4(__m128 x, __m128 y)
{
   return exp2f4(_mm_mul_ps(log2f4(x), y));
}
872 #endif /* PIPE_ARCH_SSE */
877 * Low-level instruction translators.
882 struct x86_function
*func
,
889 TGSI_EXEC_TEMP_7FFFFFFF_I
,
890 TGSI_EXEC_TEMP_7FFFFFFF_C
) );
895 struct x86_function
*func
,
902 make_xmm( xmm_src
) );
905 static void PIPE_CDECL
909 store
[0] = cosf( store
[0] );
910 store
[1] = cosf( store
[1] );
911 store
[2] = cosf( store
[2] );
912 store
[3] = cosf( store
[3] );
917 struct x86_function
*func
,
921 emit_func_call_dst_src1(
929 static void PIPE_CDECL
930 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
931 __attribute__((force_align_arg_pointer
))
936 #if defined(PIPE_ARCH_SSE)
937 _mm_store_ps(&store
[0], exp2f4( _mm_load_ps(&store
[0]) ));
939 store
[0] = util_fast_exp2( store
[0] );
940 store
[1] = util_fast_exp2( store
[1] );
941 store
[2] = util_fast_exp2( store
[2] );
942 store
[3] = util_fast_exp2( store
[3] );
948 struct x86_function
*func
,
952 emit_func_call_dst_src1(
962 struct x86_function
*func
,
973 struct x86_function
*func
,
982 static void PIPE_CDECL
986 store
[0] = floorf( store
[0] );
987 store
[1] = floorf( store
[1] );
988 store
[2] = floorf( store
[2] );
989 store
[3] = floorf( store
[3] );
994 struct x86_function
*func
,
998 emit_func_call_dst_src1(
1006 static void PIPE_CDECL
1010 store
[0] -= floorf( store
[0] );
1011 store
[1] -= floorf( store
[1] );
1012 store
[2] -= floorf( store
[2] );
1013 store
[3] -= floorf( store
[3] );
1018 struct x86_function
*func
,
1022 emit_func_call_dst_src1(
1030 static void PIPE_CDECL
1031 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
1032 __attribute__((force_align_arg_pointer
))
1037 #if defined(PIPE_ARCH_SSE)
1038 _mm_store_ps(&store
[0], log2f4( _mm_load_ps(&store
[0]) ));
1040 store
[0] = util_fast_log2( store
[0] );
1041 store
[1] = util_fast_log2( store
[1] );
1042 store
[2] = util_fast_log2( store
[2] );
1043 store
[3] = util_fast_log2( store
[3] );
1049 struct x86_function
*func
,
1053 emit_func_call_dst_src1(
1063 struct x86_function
*func
,
1069 make_xmm( xmm_dst
),
1070 make_xmm( xmm_src
) );
1074 emit_mul (struct x86_function
*func
,
1080 make_xmm( xmm_dst
),
1081 make_xmm( xmm_src
) );
1086 struct x86_function
*func
,
1093 TGSI_EXEC_TEMP_80000000_I
,
1094 TGSI_EXEC_TEMP_80000000_C
) );
1097 static void PIPE_CDECL
1098 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
1099 __attribute__((force_align_arg_pointer
))
1104 #if defined(PIPE_ARCH_SSE)
1105 _mm_store_ps(&store
[0], powf4( _mm_load_ps(&store
[0]), _mm_load_ps(&store
[4]) ));
1107 store
[0] = util_fast_pow( store
[0], store
[4] );
1108 store
[1] = util_fast_pow( store
[1], store
[5] );
1109 store
[2] = util_fast_pow( store
[2], store
[6] );
1110 store
[3] = util_fast_pow( store
[3], store
[7] );
1116 struct x86_function
*func
,
1122 emit_func_call_dst_src2(
1133 struct x86_function
*func
,
1137 /* On Intel CPUs at least, this is only accurate to 12 bits -- not
1138 * good enough. Need to either emit a proper divide or use the
1139 * iterative technique described below in emit_rsqrt().
1143 make_xmm( xmm_dst
),
1144 make_xmm( xmm_src
) );
1147 static void PIPE_CDECL
1151 store
[0] = floorf( store
[0] + 0.5f
);
1152 store
[1] = floorf( store
[1] + 0.5f
);
1153 store
[2] = floorf( store
[2] + 0.5f
);
1154 store
[3] = floorf( store
[3] + 0.5f
);
1159 struct x86_function
*func
,
1163 emit_func_call_dst_src1(
1173 struct x86_function
*func
,
1178 /* Although rsqrtps() and rcpps() are low precision on some/all SSE
1179 * implementations, it is possible to improve its precision at
1180 * fairly low cost, using a newton/raphson step, as below:
1182 * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
1183 * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
1185 * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
1188 struct x86_reg dst
= make_xmm( xmm_dst
);
1189 struct x86_reg src
= make_xmm( xmm_src
);
1190 struct x86_reg tmp0
= make_xmm( 2 );
1191 struct x86_reg tmp1
= make_xmm( 3 );
1193 assert( xmm_dst
!= xmm_src
);
1194 assert( xmm_dst
!= 2 && xmm_dst
!= 3 );
1195 assert( xmm_src
!= 2 && xmm_src
!= 3 );
1197 sse_movaps( func
, dst
, get_temp( TGSI_EXEC_TEMP_HALF_I
, TGSI_EXEC_TEMP_HALF_C
) );
1198 sse_movaps( func
, tmp0
, get_temp( TGSI_EXEC_TEMP_THREE_I
, TGSI_EXEC_TEMP_THREE_C
) );
1199 sse_rsqrtps( func
, tmp1
, src
);
1200 sse_mulps( func
, src
, tmp1
);
1201 sse_mulps( func
, dst
, tmp1
);
1202 sse_mulps( func
, src
, tmp1
);
1203 sse_subps( func
, tmp0
, src
);
1204 sse_mulps( func
, dst
, tmp0
);
1207 /* On Intel CPUs at least, this is only accurate to 12 bits -- not
1212 make_xmm( xmm_dst
),
1213 make_xmm( xmm_src
) );
1219 struct x86_function
*func
,
1226 TGSI_EXEC_TEMP_80000000_I
,
1227 TGSI_EXEC_TEMP_80000000_C
) );
/**
 * Math callback invoked from generated code: replaces each of store[0..3]
 * with its sign: -1 for negative, +1 for positive, 0 for zero.
 *
 * NOTE(review): the function-name/brace lines were missing from this copy;
 * restored as `sgn4f` to match upstream Mesa naming.
 */
static void PIPE_CDECL
sgn4f(
   float *store )
{
   store[0] = store[0] < 0.0f ? -1.0f : store[0] > 0.0f ? 1.0f : 0.0f;
   store[1] = store[1] < 0.0f ? -1.0f : store[1] > 0.0f ? 1.0f : 0.0f;
   store[2] = store[2] < 0.0f ? -1.0f : store[2] > 0.0f ? 1.0f : 0.0f;
   store[3] = store[3] < 0.0f ? -1.0f : store[3] > 0.0f ? 1.0f : 0.0f;
}
1242 struct x86_function
*func
,
1246 emit_func_call_dst_src1(
1254 static void PIPE_CDECL
1258 store
[0] = sinf( store
[0] );
1259 store
[1] = sinf( store
[1] );
1260 store
[2] = sinf( store
[2] );
1261 store
[3] = sinf( store
[3] );
1265 emit_sin (struct x86_function
*func
,
1269 emit_func_call_dst_src1(
1279 struct x86_function
*func
,
1285 make_xmm( xmm_dst
),
1286 make_xmm( xmm_src
) );
1294 struct x86_function
*func
,
1296 const struct tgsi_full_src_register
*reg
,
1297 const unsigned chan_index
)
1299 unsigned swizzle
= tgsi_util_get_full_src_register_swizzle( reg
, chan_index
);
1302 case TGSI_SWIZZLE_X
:
1303 case TGSI_SWIZZLE_Y
:
1304 case TGSI_SWIZZLE_Z
:
1305 case TGSI_SWIZZLE_W
:
1306 switch (reg
->Register
.File
) {
1307 case TGSI_FILE_CONSTANT
:
1311 reg
->Register
.Index
,
1313 reg
->Register
.Indirect
,
1315 reg
->Indirect
.Index
);
1318 case TGSI_FILE_IMMEDIATE
:
1322 reg
->Register
.Index
,
1326 case TGSI_FILE_SYSTEM_VALUE
:
1330 reg
->Register
.Index
,
1334 case TGSI_FILE_INPUT
:
1338 reg
->Register
.Index
,
1342 case TGSI_FILE_TEMPORARY
:
1346 reg
->Register
.Index
,
1359 switch( tgsi_util_get_full_src_register_sign_mode( reg
, chan_index
) ) {
1360 case TGSI_UTIL_SIGN_CLEAR
:
1361 emit_abs( func
, xmm
);
1364 case TGSI_UTIL_SIGN_SET
:
1365 emit_setsign( func
, xmm
);
1368 case TGSI_UTIL_SIGN_TOGGLE
:
1369 emit_neg( func
, xmm
);
1372 case TGSI_UTIL_SIGN_KEEP
:
1377 #define FETCH( FUNC, INST, XMM, INDEX, CHAN )\
1378 emit_fetch( FUNC, XMM, &(INST).Src[INDEX], CHAN )
1385 struct x86_function
*func
,
1387 const struct tgsi_full_dst_register
*reg
,
1388 const struct tgsi_full_instruction
*inst
,
1389 unsigned chan_index
)
1391 switch( inst
->Instruction
.Saturate
) {
1395 case TGSI_SAT_ZERO_ONE
:
1400 TGSI_EXEC_TEMP_00000000_I
,
1401 TGSI_EXEC_TEMP_00000000_C
) );
1407 TGSI_EXEC_TEMP_ONE_I
,
1408 TGSI_EXEC_TEMP_ONE_C
) );
1411 case TGSI_SAT_MINUS_PLUS_ONE
:
1417 switch( reg
->Register
.File
) {
1418 case TGSI_FILE_OUTPUT
:
1422 reg
->Register
.Index
,
1426 case TGSI_FILE_TEMPORARY
:
1430 reg
->Register
.Index
,
1434 case TGSI_FILE_ADDRESS
:
1438 reg
->Register
.Index
,
1447 #define STORE( FUNC, INST, XMM, INDEX, CHAN )\
1448 emit_store( FUNC, XMM, &(INST).Dst[INDEX], &(INST), CHAN )
1451 static void PIPE_CDECL
1452 fetch_texel( struct tgsi_sampler
**sampler
,
1458 debug_printf("%s sampler: %p (%p) store: %p\n",
1463 for (j
= 0; j
< 4; j
++)
1464 debug_printf("sample %d texcoord %f %f %f lodbias %f\n",
1473 float rgba
[NUM_CHANNELS
][QUAD_SIZE
];
1474 (*sampler
)->get_samples(*sampler
,
1478 &store
[12], /* lodbias */
1479 tgsi_sampler_lod_bias
,
1480 rgba
); /* results */
1482 memcpy( store
, rgba
, 16 * sizeof(float));
1486 for (j
= 0; j
< 4; j
++)
1487 debug_printf("sample %d result %f %f %f %f\n",
1497 * High-level instruction translators.
1500 emit_tex( struct x86_function
*func
,
1501 const struct tgsi_full_instruction
*inst
,
1505 const uint unit
= inst
->Src
[1].Register
.Index
;
1506 struct x86_reg args
[2];
1510 assert(inst
->Instruction
.Texture
);
1511 switch (inst
->Texture
.Texture
) {
1512 case TGSI_TEXTURE_1D
:
1515 case TGSI_TEXTURE_2D
:
1516 case TGSI_TEXTURE_RECT
:
1517 case TGSI_TEXTURE_1D_ARRAY
:
1520 case TGSI_TEXTURE_SHADOW1D
:
1521 case TGSI_TEXTURE_SHADOW2D
:
1522 case TGSI_TEXTURE_SHADOWRECT
:
1523 case TGSI_TEXTURE_3D
:
1524 case TGSI_TEXTURE_CUBE
:
1525 case TGSI_TEXTURE_2D_ARRAY
:
1534 FETCH( func
, *inst
, 3, 0, 3 );
1540 TGSI_EXEC_TEMP_00000000_I
,
1541 TGSI_EXEC_TEMP_00000000_C
);
1545 /* store lodbias whether enabled or not -- fetch_texel currently
1546 * respects it always.
1549 get_temp( TEMP_R0
, 3 ),
1553 FETCH( func
, *inst
, 3, 0, 3 );
1555 emit_rcp( func
, 3, 3 );
1558 for (i
= 0; i
< count
; i
++) {
1559 FETCH( func
, *inst
, i
, 0, i
);
1568 /* Store in the argument buffer:
1572 get_temp( TEMP_R0
, i
),
1576 args
[0] = get_temp( TEMP_R0
, 0 );
1577 args
[1] = get_sampler_ptr( unit
);
1579 emit_func_call( func
,
1585 /* If all four channels are enabled, could use a pointer to
1586 * dst[0].x instead of TEMP_R0 for store?
1588 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, i
) {
1593 get_temp( TEMP_R0
, i
) );
1595 STORE( func
, *inst
, 0, 0, i
);
1602 struct x86_function
*func
,
1603 const struct tgsi_full_src_register
*reg
)
1605 unsigned uniquemask
;
1606 unsigned unique_count
= 0;
1607 unsigned chan_index
;
1610 /* This mask stores component bits that were already tested. Note that
1611 * we test if the value is less than zero, so 1.0 and 0.0 need not to be
1616 FOR_EACH_CHANNEL( chan_index
) {
1619 /* unswizzle channel */
1620 swizzle
= tgsi_util_get_full_src_register_swizzle(
1624 /* check if the component has not been already tested */
1625 if( !(uniquemask
& (1 << swizzle
)) ) {
1626 uniquemask
|= 1 << swizzle
;
1628 /* allocate register */
1639 x86_make_reg( file_REG32
, reg_AX
) );
1642 x86_make_reg( file_REG32
, reg_DX
) );
1644 for (i
= 0 ; i
< unique_count
; i
++ ) {
1645 struct x86_reg dataXMM
= make_xmm(i
);
1651 TGSI_EXEC_TEMP_00000000_I
,
1652 TGSI_EXEC_TEMP_00000000_C
),
1658 x86_make_reg( file_REG32
, reg_AX
),
1664 x86_make_reg( file_REG32
, reg_DX
),
1668 x86_make_reg( file_REG32
, reg_AX
),
1669 x86_make_reg( file_REG32
, reg_DX
) );
1676 TGSI_EXEC_TEMP_KILMASK_I
,
1677 TGSI_EXEC_TEMP_KILMASK_C
),
1678 x86_make_reg( file_REG32
, reg_AX
) );
1682 x86_make_reg( file_REG32
, reg_DX
) );
1685 x86_make_reg( file_REG32
, reg_AX
) );
1691 struct x86_function
*func
)
1693 /* XXX todo / fix me */
1699 struct x86_function
*func
,
1700 struct tgsi_full_instruction
*inst
,
1703 unsigned chan_index
;
1705 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1706 FETCH( func
, *inst
, 0, 0, chan_index
);
1707 FETCH( func
, *inst
, 1, 1, chan_index
);
1719 STORE( func
, *inst
, 0, 0, chan_index
);
1725 struct x86_function
*func
,
1726 struct tgsi_full_instruction
*inst
)
1728 unsigned chan_index
;
1730 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1731 FETCH( func
, *inst
, 0, 0, chan_index
);
1732 FETCH( func
, *inst
, 1, 1, chan_index
);
1733 FETCH( func
, *inst
, 2, 2, chan_index
);
1738 TGSI_EXEC_TEMP_00000000_I
,
1739 TGSI_EXEC_TEMP_00000000_C
),
1753 STORE( func
, *inst
, 0, 0, chan_index
);
1759 * Check if inst src/dest regs use indirect addressing into temporary,
1760 * input or output register files.
1763 indirect_reg_reference(const struct tgsi_full_instruction
*inst
)
1766 for (i
= 0; i
< inst
->Instruction
.NumSrcRegs
; i
++) {
1767 const struct tgsi_full_src_register
*reg
= &inst
->Src
[i
];
1768 if ((reg
->Register
.File
== TGSI_FILE_TEMPORARY
||
1769 reg
->Register
.File
== TGSI_FILE_INPUT
||
1770 reg
->Register
.File
== TGSI_FILE_OUTPUT
) &&
1771 reg
->Register
.Indirect
)
1774 for (i
= 0; i
< inst
->Instruction
.NumDstRegs
; i
++) {
1775 const struct tgsi_full_dst_register
*reg
= &inst
->Dst
[i
];
1776 if ((reg
->Register
.File
== TGSI_FILE_TEMPORARY
||
1777 reg
->Register
.File
== TGSI_FILE_INPUT
||
1778 reg
->Register
.File
== TGSI_FILE_OUTPUT
) &&
1779 reg
->Register
.Indirect
)
1788 struct x86_function
*func
,
1789 struct tgsi_full_instruction
*inst
)
1791 unsigned chan_index
;
1793 /* we can't handle indirect addressing into temp register file yet */
1794 if (indirect_reg_reference(inst
))
1797 switch (inst
->Instruction
.Opcode
) {
1798 case TGSI_OPCODE_ARL
:
1799 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1800 FETCH( func
, *inst
, 0, 0, chan_index
);
1801 emit_flr(func
, 0, 0);
1802 emit_f2it( func
, 0 );
1803 STORE( func
, *inst
, 0, 0, chan_index
);
1807 case TGSI_OPCODE_MOV
:
1808 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1809 FETCH( func
, *inst
, 4 + chan_index
, 0, chan_index
);
1811 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1812 STORE( func
, *inst
, 4 + chan_index
, 0, chan_index
);
1816 case TGSI_OPCODE_LIT
:
1817 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1818 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) ) {
1824 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ) {
1825 STORE( func
, *inst
, 0, 0, CHAN_X
);
1827 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) ) {
1828 STORE( func
, *inst
, 0, 0, CHAN_W
);
1831 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ||
1832 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
1833 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ) {
1834 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1839 TGSI_EXEC_TEMP_00000000_I
,
1840 TGSI_EXEC_TEMP_00000000_C
) );
1841 STORE( func
, *inst
, 0, 0, CHAN_Y
);
1843 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
1844 /* XMM[1] = SrcReg[0].yyyy */
1845 FETCH( func
, *inst
, 1, 0, CHAN_Y
);
1846 /* XMM[1] = max(XMM[1], 0) */
1851 TGSI_EXEC_TEMP_00000000_I
,
1852 TGSI_EXEC_TEMP_00000000_C
) );
1853 /* XMM[2] = SrcReg[0].wwww */
1854 FETCH( func
, *inst
, 2, 0, CHAN_W
);
1855 /* XMM[2] = min(XMM[2], 128.0) */
1860 TGSI_EXEC_TEMP_128_I
,
1861 TGSI_EXEC_TEMP_128_C
) );
1862 /* XMM[2] = max(XMM[2], -128.0) */
1867 TGSI_EXEC_TEMP_MINUS_128_I
,
1868 TGSI_EXEC_TEMP_MINUS_128_C
) );
1869 emit_pow( func
, 3, 1, 1, 2 );
1870 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1884 STORE( func
, *inst
, 2, 0, CHAN_Z
);
1889 case TGSI_OPCODE_RCP
:
1890 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1891 emit_rcp( func
, 0, 0 );
1892 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1893 STORE( func
, *inst
, 0, 0, chan_index
);
1897 case TGSI_OPCODE_RSQ
:
1898 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1899 emit_abs( func
, 0 );
1900 emit_rsqrt( func
, 1, 0 );
1901 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1902 STORE( func
, *inst
, 1, 0, chan_index
);
1906 case TGSI_OPCODE_EXP
:
1907 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1908 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ||
1909 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
1910 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1911 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1912 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
1913 emit_MOV( func
, 1, 0 );
1914 emit_flr( func
, 2, 1 );
1915 /* dst.x = ex2(floor(src.x)) */
1916 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
)) {
1917 emit_MOV( func
, 2, 1 );
1918 emit_ex2( func
, 3, 2 );
1919 STORE( func
, *inst
, 2, 0, CHAN_X
);
1921 /* dst.y = src.x - floor(src.x) */
1922 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
1923 emit_MOV( func
, 2, 0 );
1924 emit_sub( func
, 2, 1 );
1925 STORE( func
, *inst
, 2, 0, CHAN_Y
);
1928 /* dst.z = ex2(src.x) */
1929 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
1930 emit_ex2( func
, 3, 0 );
1931 STORE( func
, *inst
, 0, 0, CHAN_Z
);
1935 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
)) {
1936 emit_tempf( func
, 0, TEMP_ONE_I
, TEMP_ONE_C
);
1937 STORE( func
, *inst
, 0, 0, CHAN_W
);
1941 case TGSI_OPCODE_LOG
:
1942 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1943 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ||
1944 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
1945 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1946 emit_abs( func
, 0 );
1947 emit_MOV( func
, 1, 0 );
1948 emit_lg2( func
, 2, 1 );
1949 /* dst.z = lg2(abs(src.x)) */
1950 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
1951 STORE( func
, *inst
, 1, 0, CHAN_Z
);
1953 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1954 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
1955 emit_flr( func
, 2, 1 );
1956 /* dst.x = floor(lg2(abs(src.x))) */
1957 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
)) {
1958 STORE( func
, *inst
, 1, 0, CHAN_X
);
1960 /* dst.x = abs(src)/ex2(floor(lg2(abs(src.x)))) */
1961 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
1962 emit_ex2( func
, 2, 1 );
1963 emit_rcp( func
, 1, 1 );
1964 emit_mul( func
, 0, 1 );
1965 STORE( func
, *inst
, 0, 0, CHAN_Y
);
1970 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
)) {
1971 emit_tempf( func
, 0, TEMP_ONE_I
, TEMP_ONE_C
);
1972 STORE( func
, *inst
, 0, 0, CHAN_W
);
1976 case TGSI_OPCODE_MUL
:
1977 /* do all fetches and adds, storing results in temp regs */
1978 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1979 int r
= chan_index
+ 1;
1980 FETCH( func
, *inst
, 0, 0, chan_index
); /* load xmm[0] */
1981 FETCH( func
, *inst
, r
, 1, chan_index
); /* load xmm[r] */
1982 emit_mul( func
, r
, 0 ); /* xmm[r] = xmm[r] * xmm[0] */
1984 /* do all stores of the temp regs */
1985 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1986 int r
= chan_index
+ 1;
1987 STORE( func
, *inst
, r
, 0, chan_index
); /* store xmm[r] */
1991 case TGSI_OPCODE_ADD
:
1992 /* do all fetches and adds, storing results in temp regs */
1993 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1994 int r
= chan_index
+ 1;
1995 FETCH( func
, *inst
, 0, 0, chan_index
); /* load xmm[0] */
1996 FETCH( func
, *inst
, r
, 1, chan_index
); /* load xmm[r] */
1997 emit_add( func
, r
, 0 ); /* xmm[r] = xmm[r] + xmm[0] */
1999 /* do all stores of the temp regs */
2000 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2001 int r
= chan_index
+ 1;
2002 STORE( func
, *inst
, r
, 0, chan_index
); /* store xmm[r] */
2006 case TGSI_OPCODE_DP3
:
2007 FETCH( func
, *inst
, 0, 0, CHAN_X
);
2008 FETCH( func
, *inst
, 1, 1, CHAN_X
);
2009 emit_mul( func
, 0, 1 );
2010 FETCH( func
, *inst
, 1, 0, CHAN_Y
);
2011 FETCH( func
, *inst
, 2, 1, CHAN_Y
);
2012 emit_mul( func
, 1, 2 );
2013 emit_add( func
, 0, 1 );
2014 FETCH( func
, *inst
, 1, 0, CHAN_Z
);
2015 FETCH( func
, *inst
, 2, 1, CHAN_Z
);
2016 emit_mul( func
, 1, 2 );
2017 emit_add( func
, 0, 1 );
2018 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2019 STORE( func
, *inst
, 0, 0, chan_index
);
2023 case TGSI_OPCODE_DP4
:
2024 FETCH( func
, *inst
, 0, 0, CHAN_X
);
2025 FETCH( func
, *inst
, 1, 1, CHAN_X
);
2026 emit_mul( func
, 0, 1 );
2027 FETCH( func
, *inst
, 1, 0, CHAN_Y
);
2028 FETCH( func
, *inst
, 2, 1, CHAN_Y
);
2029 emit_mul( func
, 1, 2 );
2030 emit_add( func
, 0, 1 );
2031 FETCH( func
, *inst
, 1, 0, CHAN_Z
);
2032 FETCH( func
, *inst
, 2, 1, CHAN_Z
);
2033 emit_mul(func
, 1, 2 );
2034 emit_add(func
, 0, 1 );
2035 FETCH( func
, *inst
, 1, 0, CHAN_W
);
2036 FETCH( func
, *inst
, 2, 1, CHAN_W
);
2037 emit_mul( func
, 1, 2 );
2038 emit_add( func
, 0, 1 );
2039 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2040 STORE( func
, *inst
, 0, 0, chan_index
);
2044 case TGSI_OPCODE_DST
:
2045 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) {
2051 STORE( func
, *inst
, 0, 0, CHAN_X
);
2053 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) {
2054 FETCH( func
, *inst
, 0, 0, CHAN_Y
);
2055 FETCH( func
, *inst
, 1, 1, CHAN_Y
);
2056 emit_mul( func
, 0, 1 );
2057 STORE( func
, *inst
, 0, 0, CHAN_Y
);
2059 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) {
2060 FETCH( func
, *inst
, 0, 0, CHAN_Z
);
2061 STORE( func
, *inst
, 0, 0, CHAN_Z
);
2063 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) {
2064 FETCH( func
, *inst
, 0, 1, CHAN_W
);
2065 STORE( func
, *inst
, 0, 0, CHAN_W
);
2069 case TGSI_OPCODE_MIN
:
2070 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2071 FETCH( func
, *inst
, 0, 0, chan_index
);
2072 FETCH( func
, *inst
, 1, 1, chan_index
);
2077 STORE( func
, *inst
, 0, 0, chan_index
);
2081 case TGSI_OPCODE_MAX
:
2082 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2083 FETCH( func
, *inst
, 0, 0, chan_index
);
2084 FETCH( func
, *inst
, 1, 1, chan_index
);
2089 STORE( func
, *inst
, 0, 0, chan_index
);
2093 case TGSI_OPCODE_SLT
:
2094 emit_setcc( func
, inst
, cc_LessThan
);
2097 case TGSI_OPCODE_SGE
:
2098 emit_setcc( func
, inst
, cc_NotLessThan
);
2101 case TGSI_OPCODE_MAD
:
2102 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2103 FETCH( func
, *inst
, 0, 0, chan_index
);
2104 FETCH( func
, *inst
, 1, 1, chan_index
);
2105 FETCH( func
, *inst
, 2, 2, chan_index
);
2106 emit_mul( func
, 0, 1 );
2107 emit_add( func
, 0, 2 );
2108 STORE( func
, *inst
, 0, 0, chan_index
);
2112 case TGSI_OPCODE_SUB
:
2113 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2114 FETCH( func
, *inst
, 0, 0, chan_index
);
2115 FETCH( func
, *inst
, 1, 1, chan_index
);
2116 emit_sub( func
, 0, 1 );
2117 STORE( func
, *inst
, 0, 0, chan_index
);
2121 case TGSI_OPCODE_LRP
:
2122 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2123 FETCH( func
, *inst
, 0, 0, chan_index
);
2124 FETCH( func
, *inst
, 1, 1, chan_index
);
2125 FETCH( func
, *inst
, 2, 2, chan_index
);
2126 emit_sub( func
, 1, 2 );
2127 emit_mul( func
, 0, 1 );
2128 emit_add( func
, 0, 2 );
2129 STORE( func
, *inst
, 0, 0, chan_index
);
2133 case TGSI_OPCODE_CND
:
2137 case TGSI_OPCODE_DP2A
:
2138 FETCH( func
, *inst
, 0, 0, CHAN_X
); /* xmm0 = src[0].x */
2139 FETCH( func
, *inst
, 1, 1, CHAN_X
); /* xmm1 = src[1].x */
2140 emit_mul( func
, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
2141 FETCH( func
, *inst
, 1, 0, CHAN_Y
); /* xmm1 = src[0].y */
2142 FETCH( func
, *inst
, 2, 1, CHAN_Y
); /* xmm2 = src[1].y */
2143 emit_mul( func
, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
2144 emit_add( func
, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2145 FETCH( func
, *inst
, 1, 2, CHAN_X
); /* xmm1 = src[2].x */
2146 emit_add( func
, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2147 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2148 STORE( func
, *inst
, 0, 0, chan_index
); /* dest[ch] = xmm0 */
2152 case TGSI_OPCODE_FRC
:
2153 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2154 FETCH( func
, *inst
, 0, 0, chan_index
);
2155 emit_frc( func
, 0, 0 );
2156 STORE( func
, *inst
, 0, 0, chan_index
);
2160 case TGSI_OPCODE_CLAMP
:
2164 case TGSI_OPCODE_FLR
:
2165 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2166 FETCH( func
, *inst
, 0, 0, chan_index
);
2167 emit_flr( func
, 0, 0 );
2168 STORE( func
, *inst
, 0, 0, chan_index
);
2172 case TGSI_OPCODE_ROUND
:
2173 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2174 FETCH( func
, *inst
, 0, 0, chan_index
);
2175 emit_rnd( func
, 0, 0 );
2176 STORE( func
, *inst
, 0, 0, chan_index
);
2180 case TGSI_OPCODE_EX2
:
2181 FETCH( func
, *inst
, 0, 0, CHAN_X
);
2182 emit_ex2( func
, 0, 0 );
2183 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2184 STORE( func
, *inst
, 0, 0, chan_index
);
2188 case TGSI_OPCODE_LG2
:
2189 FETCH( func
, *inst
, 0, 0, CHAN_X
);
2190 emit_lg2( func
, 0, 0 );
2191 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2192 STORE( func
, *inst
, 0, 0, chan_index
);
2196 case TGSI_OPCODE_POW
:
2197 FETCH( func
, *inst
, 0, 0, CHAN_X
);
2198 FETCH( func
, *inst
, 1, 1, CHAN_X
);
2199 emit_pow( func
, 0, 0, 0, 1 );
2200 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2201 STORE( func
, *inst
, 0, 0, chan_index
);
2205 case TGSI_OPCODE_XPD
:
2206 /* Note: we do all stores after all operands have been fetched
2207 * to avoid src/dst register aliasing issues for an instruction
2208 * such as: XPD TEMP[2].xyz, TEMP[0], TEMP[2];
2210 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
2211 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ) {
2212 FETCH( func
, *inst
, 1, 1, CHAN_Z
); /* xmm[1] = src[1].z */
2213 FETCH( func
, *inst
, 3, 0, CHAN_Z
); /* xmm[3] = src[0].z */
2215 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
2216 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
2217 FETCH( func
, *inst
, 0, 0, CHAN_Y
); /* xmm[0] = src[0].y */
2218 FETCH( func
, *inst
, 4, 1, CHAN_Y
); /* xmm[4] = src[1].y */
2220 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) {
2221 emit_MOV( func
, 7, 0 ); /* xmm[7] = xmm[0] */
2222 emit_mul( func
, 7, 1 ); /* xmm[7] = xmm[2] * xmm[1] */
2223 emit_MOV( func
, 5, 3 ); /* xmm[5] = xmm[3] */
2224 emit_mul( func
, 5, 4 ); /* xmm[5] = xmm[5] * xmm[4] */
2225 emit_sub( func
, 7, 5 ); /* xmm[7] = xmm[2] - xmm[5] */
2226 /* store xmm[7] in dst.x below */
2228 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ||
2229 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
2230 FETCH( func
, *inst
, 2, 1, CHAN_X
); /* xmm[2] = src[1].x */
2231 FETCH( func
, *inst
, 5, 0, CHAN_X
); /* xmm[5] = src[0].x */
2233 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) {
2234 emit_mul( func
, 3, 2 ); /* xmm[3] = xmm[3] * xmm[2] */
2235 emit_mul( func
, 1, 5 ); /* xmm[1] = xmm[1] * xmm[5] */
2236 emit_sub( func
, 3, 1 ); /* xmm[3] = xmm[3] - xmm[1] */
2237 /* store xmm[3] in dst.y below */
2239 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) {
2240 emit_mul( func
, 5, 4 ); /* xmm[5] = xmm[5] * xmm[4] */
2241 emit_mul( func
, 0, 2 ); /* xmm[0] = xmm[0] * xmm[2] */
2242 emit_sub( func
, 5, 0 ); /* xmm[5] = xmm[5] - xmm[0] */
2243 STORE( func
, *inst
, 5, 0, CHAN_Z
); /* dst.z = xmm[5] */
2245 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) {
2246 STORE( func
, *inst
, 7, 0, CHAN_X
); /* dst.x = xmm[7] */
2248 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) {
2249 STORE( func
, *inst
, 3, 0, CHAN_Y
); /* dst.y = xmm[3] */
2251 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) {
2257 STORE( func
, *inst
, 0, 0, CHAN_W
);
2261 case TGSI_OPCODE_ABS
:
2262 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2263 FETCH( func
, *inst
, 0, 0, chan_index
);
2264 emit_abs( func
, 0) ;
2266 STORE( func
, *inst
, 0, 0, chan_index
);
2270 case TGSI_OPCODE_RCC
:
2274 case TGSI_OPCODE_DPH
:
2275 FETCH( func
, *inst
, 0, 0, CHAN_X
);
2276 FETCH( func
, *inst
, 1, 1, CHAN_X
);
2277 emit_mul( func
, 0, 1 );
2278 FETCH( func
, *inst
, 1, 0, CHAN_Y
);
2279 FETCH( func
, *inst
, 2, 1, CHAN_Y
);
2280 emit_mul( func
, 1, 2 );
2281 emit_add( func
, 0, 1 );
2282 FETCH( func
, *inst
, 1, 0, CHAN_Z
);
2283 FETCH( func
, *inst
, 2, 1, CHAN_Z
);
2284 emit_mul( func
, 1, 2 );
2285 emit_add( func
, 0, 1 );
2286 FETCH( func
, *inst
, 1, 1, CHAN_W
);
2287 emit_add( func
, 0, 1 );
2288 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2289 STORE( func
, *inst
, 0, 0, chan_index
);
2293 case TGSI_OPCODE_COS
:
2294 FETCH( func
, *inst
, 0, 0, CHAN_X
);
2295 emit_cos( func
, 0, 0 );
2296 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2297 STORE( func
, *inst
, 0, 0, chan_index
);
2301 case TGSI_OPCODE_DDX
:
2305 case TGSI_OPCODE_DDY
:
2309 case TGSI_OPCODE_KILP
:
2310 /* predicated kill */
2312 return 0; /* XXX fix me */
2315 case TGSI_OPCODE_KIL
:
2316 /* conditional kill */
2317 emit_kil( func
, &inst
->Src
[0] );
2320 case TGSI_OPCODE_PK2H
:
2324 case TGSI_OPCODE_PK2US
:
2328 case TGSI_OPCODE_PK4B
:
2332 case TGSI_OPCODE_PK4UB
:
2336 case TGSI_OPCODE_RFL
:
2340 case TGSI_OPCODE_SEQ
:
2341 emit_setcc( func
, inst
, cc_Equal
);
2344 case TGSI_OPCODE_SFL
:
2348 case TGSI_OPCODE_SGT
:
2349 emit_setcc( func
, inst
, cc_NotLessThanEqual
);
2352 case TGSI_OPCODE_SIN
:
2353 FETCH( func
, *inst
, 0, 0, CHAN_X
);
2354 emit_sin( func
, 0, 0 );
2355 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2356 STORE( func
, *inst
, 0, 0, chan_index
);
2360 case TGSI_OPCODE_SLE
:
2361 emit_setcc( func
, inst
, cc_LessThanEqual
);
2364 case TGSI_OPCODE_SNE
:
2365 emit_setcc( func
, inst
, cc_NotEqual
);
2368 case TGSI_OPCODE_STR
:
2372 case TGSI_OPCODE_TEX
:
2373 emit_tex( func
, inst
, FALSE
, FALSE
);
2376 case TGSI_OPCODE_TXD
:
2380 case TGSI_OPCODE_UP2H
:
2384 case TGSI_OPCODE_UP2US
:
2388 case TGSI_OPCODE_UP4B
:
2392 case TGSI_OPCODE_UP4UB
:
2396 case TGSI_OPCODE_X2D
:
2400 case TGSI_OPCODE_ARA
:
2404 case TGSI_OPCODE_ARR
:
2405 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2406 FETCH( func
, *inst
, 0, 0, chan_index
);
2407 emit_rnd( func
, 0, 0 );
2408 emit_f2it( func
, 0 );
2409 STORE( func
, *inst
, 0, 0, chan_index
);
2413 case TGSI_OPCODE_BRA
:
2417 case TGSI_OPCODE_CAL
:
2421 case TGSI_OPCODE_RET
:
2425 case TGSI_OPCODE_END
:
2428 case TGSI_OPCODE_SSG
:
2429 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2430 FETCH( func
, *inst
, 0, 0, chan_index
);
2431 emit_sgn( func
, 0, 0 );
2432 STORE( func
, *inst
, 0, 0, chan_index
);
2436 case TGSI_OPCODE_CMP
:
2437 emit_cmp (func
, inst
);
2440 case TGSI_OPCODE_SCS
:
2441 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) {
2442 FETCH( func
, *inst
, 0, 0, CHAN_X
);
2443 emit_cos( func
, 0, 0 );
2444 STORE( func
, *inst
, 0, 0, CHAN_X
);
2446 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) {
2447 FETCH( func
, *inst
, 0, 0, CHAN_X
);
2448 emit_sin( func
, 0, 0 );
2449 STORE( func
, *inst
, 0, 0, CHAN_Y
);
2451 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) {
2455 TGSI_EXEC_TEMP_00000000_I
,
2456 TGSI_EXEC_TEMP_00000000_C
);
2457 STORE( func
, *inst
, 0, 0, CHAN_Z
);
2459 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) {
2465 STORE( func
, *inst
, 0, 0, CHAN_W
);
2469 case TGSI_OPCODE_TXB
:
2470 emit_tex( func
, inst
, TRUE
, FALSE
);
2473 case TGSI_OPCODE_NRM
:
2475 case TGSI_OPCODE_NRM4
:
2476 /* 3 or 4-component normalization */
2478 uint dims
= (inst
->Instruction
.Opcode
== TGSI_OPCODE_NRM
) ? 3 : 4;
2480 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_X
) ||
2481 IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_Y
) ||
2482 IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_Z
) ||
2483 (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_W
) && dims
== 4)) {
2485 /* NOTE: Cannot use xmm regs 2/3 here (see emit_rsqrt() above). */
2488 /* xmm0 = src.x * src.x */
2489 FETCH(func
, *inst
, 0, 0, CHAN_X
);
2490 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_X
)) {
2491 emit_MOV(func
, 4, 0);
2493 emit_mul(func
, 0, 0);
2496 /* xmm0 = xmm0 + src.y * src.y */
2497 FETCH(func
, *inst
, 1, 0, CHAN_Y
);
2498 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_Y
)) {
2499 emit_MOV(func
, 5, 1);
2501 emit_mul(func
, 1, 1);
2502 emit_add(func
, 0, 1);
2505 /* xmm0 = xmm0 + src.z * src.z */
2506 FETCH(func
, *inst
, 1, 0, CHAN_Z
);
2507 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_Z
)) {
2508 emit_MOV(func
, 6, 1);
2510 emit_mul(func
, 1, 1);
2511 emit_add(func
, 0, 1);
2515 /* xmm0 = xmm0 + src.w * src.w */
2516 FETCH(func
, *inst
, 1, 0, CHAN_W
);
2517 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_W
)) {
2518 emit_MOV(func
, 7, 1);
2520 emit_mul(func
, 1, 1);
2521 emit_add(func
, 0, 1);
2524 /* xmm1 = 1 / sqrt(xmm0) */
2525 emit_rsqrt(func
, 1, 0);
2527 /* dst.x = xmm1 * src.x */
2528 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_X
)) {
2529 emit_mul(func
, 4, 1);
2530 STORE(func
, *inst
, 4, 0, CHAN_X
);
2533 /* dst.y = xmm1 * src.y */
2534 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_Y
)) {
2535 emit_mul(func
, 5, 1);
2536 STORE(func
, *inst
, 5, 0, CHAN_Y
);
2539 /* dst.z = xmm1 * src.z */
2540 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_Z
)) {
2541 emit_mul(func
, 6, 1);
2542 STORE(func
, *inst
, 6, 0, CHAN_Z
);
2545 /* dst.w = xmm1 * src.w */
2546 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_X
) && dims
== 4) {
2547 emit_mul(func
, 7, 1);
2548 STORE(func
, *inst
, 7, 0, CHAN_W
);
2553 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_W
) && dims
== 3) {
2554 emit_tempf(func
, 0, TEMP_ONE_I
, TEMP_ONE_C
);
2555 STORE(func
, *inst
, 0, 0, CHAN_W
);
2560 case TGSI_OPCODE_DIV
:
2564 case TGSI_OPCODE_DP2
:
2565 FETCH( func
, *inst
, 0, 0, CHAN_X
); /* xmm0 = src[0].x */
2566 FETCH( func
, *inst
, 1, 1, CHAN_X
); /* xmm1 = src[1].x */
2567 emit_mul( func
, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
2568 FETCH( func
, *inst
, 1, 0, CHAN_Y
); /* xmm1 = src[0].y */
2569 FETCH( func
, *inst
, 2, 1, CHAN_Y
); /* xmm2 = src[1].y */
2570 emit_mul( func
, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
2571 emit_add( func
, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2572 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2573 STORE( func
, *inst
, 0, 0, chan_index
); /* dest[ch] = xmm0 */
2577 case TGSI_OPCODE_TXL
:
2581 case TGSI_OPCODE_TXP
:
2582 emit_tex( func
, inst
, FALSE
, TRUE
);
2585 case TGSI_OPCODE_BRK
:
2589 case TGSI_OPCODE_IF
:
2593 case TGSI_OPCODE_ELSE
:
2597 case TGSI_OPCODE_ENDIF
:
2601 case TGSI_OPCODE_PUSHA
:
2605 case TGSI_OPCODE_POPA
:
2609 case TGSI_OPCODE_CEIL
:
2613 case TGSI_OPCODE_I2F
:
2617 case TGSI_OPCODE_NOT
:
2621 case TGSI_OPCODE_TRUNC
:
2622 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2623 FETCH( func
, *inst
, 0, 0, chan_index
);
2624 emit_f2it( func
, 0 );
2625 emit_i2f( func
, 0 );
2626 STORE( func
, *inst
, 0, 0, chan_index
);
2630 case TGSI_OPCODE_SHL
:
2634 case TGSI_OPCODE_ISHR
:
2638 case TGSI_OPCODE_AND
:
2642 case TGSI_OPCODE_OR
:
2646 case TGSI_OPCODE_MOD
:
2650 case TGSI_OPCODE_XOR
:
2654 case TGSI_OPCODE_SAD
:
2658 case TGSI_OPCODE_TXF
:
2662 case TGSI_OPCODE_TXQ
:
2666 case TGSI_OPCODE_CONT
:
2670 case TGSI_OPCODE_EMIT
:
2674 case TGSI_OPCODE_ENDPRIM
:
2687 struct x86_function
*func
,
2688 struct tgsi_full_declaration
*decl
)
2690 if( decl
->Declaration
.File
== TGSI_FILE_INPUT
) {
2691 unsigned first
, last
, mask
;
2694 first
= decl
->Range
.First
;
2695 last
= decl
->Range
.Last
;
2696 mask
= decl
->Declaration
.UsageMask
;
2698 for( i
= first
; i
<= last
; i
++ ) {
2699 for( j
= 0; j
< NUM_CHANNELS
; j
++ ) {
2700 if( mask
& (1 << j
) ) {
2701 switch( decl
->Declaration
.Interpolate
) {
2702 case TGSI_INTERPOLATE_CONSTANT
:
2703 emit_coef_a0( func
, 0, i
, j
);
2704 emit_inputs( func
, 0, i
, j
);
2707 case TGSI_INTERPOLATE_LINEAR
:
2708 emit_tempf( func
, 0, 0, TGSI_SWIZZLE_X
);
2709 emit_coef_dadx( func
, 1, i
, j
);
2710 emit_tempf( func
, 2, 0, TGSI_SWIZZLE_Y
);
2711 emit_coef_dady( func
, 3, i
, j
);
2712 emit_mul( func
, 0, 1 ); /* x * dadx */
2713 emit_coef_a0( func
, 4, i
, j
);
2714 emit_mul( func
, 2, 3 ); /* y * dady */
2715 emit_add( func
, 0, 4 ); /* x * dadx + a0 */
2716 emit_add( func
, 0, 2 ); /* x * dadx + y * dady + a0 */
2717 emit_inputs( func
, 0, i
, j
);
2720 case TGSI_INTERPOLATE_PERSPECTIVE
:
2721 emit_tempf( func
, 0, 0, TGSI_SWIZZLE_X
);
2722 emit_coef_dadx( func
, 1, i
, j
);
2723 emit_tempf( func
, 2, 0, TGSI_SWIZZLE_Y
);
2724 emit_coef_dady( func
, 3, i
, j
);
2725 emit_mul( func
, 0, 1 ); /* x * dadx */
2726 emit_tempf( func
, 4, 0, TGSI_SWIZZLE_W
);
2727 emit_coef_a0( func
, 5, i
, j
);
2728 emit_rcp( func
, 4, 4 ); /* 1.0 / w */
2729 emit_mul( func
, 2, 3 ); /* y * dady */
2730 emit_add( func
, 0, 5 ); /* x * dadx + a0 */
2731 emit_add( func
, 0, 2 ); /* x * dadx + y * dady + a0 */
2732 emit_mul( func
, 0, 4 ); /* (x * dadx + y * dady + a0) / w */
2733 emit_inputs( func
, 0, i
, j
);
2746 static void aos_to_soa( struct x86_function
*func
,
2752 struct x86_reg soa_input
= x86_make_reg( file_REG32
, reg_AX
);
2753 struct x86_reg aos_input
= x86_make_reg( file_REG32
, reg_BX
);
2754 struct x86_reg num_inputs
= x86_make_reg( file_REG32
, reg_CX
);
2755 struct x86_reg stride
= x86_make_reg( file_REG32
, reg_DX
);
2756 int loop_top
, loop_exit_fixup
;
2759 x86_push( func
, x86_make_reg( file_REG32
, reg_BX
) );
2761 x86_mov( func
, aos_input
, x86_fn_arg( func
, arg_aos
) );
2762 x86_mov( func
, soa_input
, x86_fn_arg( func
, arg_machine
) );
2763 x86_lea( func
, soa_input
,
2764 x86_make_disp( soa_input
,
2765 Offset(struct tgsi_exec_machine
, Inputs
) ) );
2766 x86_mov( func
, num_inputs
, x86_fn_arg( func
, arg_num
) );
2767 x86_mov( func
, stride
, x86_fn_arg( func
, arg_stride
) );
2769 /* while (num_inputs != 0) */
2770 loop_top
= x86_get_label( func
);
2771 x86_cmp_imm( func
, num_inputs
, 0 );
2772 loop_exit_fixup
= x86_jcc_forward( func
, cc_E
);
2775 x86_push( func
, aos_input
);
2776 sse_movlps( func
, make_xmm( 0 ), x86_make_disp( aos_input
, 0 ) );
2777 sse_movlps( func
, make_xmm( 3 ), x86_make_disp( aos_input
, 8 ) );
2778 x86_add( func
, aos_input
, stride
);
2779 sse_movhps( func
, make_xmm( 0 ), x86_make_disp( aos_input
, 0 ) );
2780 sse_movhps( func
, make_xmm( 3 ), x86_make_disp( aos_input
, 8 ) );
2781 x86_add( func
, aos_input
, stride
);
2782 sse_movlps( func
, make_xmm( 1 ), x86_make_disp( aos_input
, 0 ) );
2783 sse_movlps( func
, make_xmm( 4 ), x86_make_disp( aos_input
, 8 ) );
2784 x86_add( func
, aos_input
, stride
);
2785 sse_movhps( func
, make_xmm( 1 ), x86_make_disp( aos_input
, 0 ) );
2786 sse_movhps( func
, make_xmm( 4 ), x86_make_disp( aos_input
, 8 ) );
2787 x86_pop( func
, aos_input
);
2789 sse_movaps( func
, make_xmm( 2 ), make_xmm( 0 ) );
2790 sse_movaps( func
, make_xmm( 5 ), make_xmm( 3 ) );
2791 sse_shufps( func
, make_xmm( 0 ), make_xmm( 1 ), 0x88 );
2792 sse_shufps( func
, make_xmm( 2 ), make_xmm( 1 ), 0xdd );
2793 sse_shufps( func
, make_xmm( 3 ), make_xmm( 4 ), 0x88 );
2794 sse_shufps( func
, make_xmm( 5 ), make_xmm( 4 ), 0xdd );
2796 sse_movups( func
, x86_make_disp( soa_input
, 0 ), make_xmm( 0 ) );
2797 sse_movups( func
, x86_make_disp( soa_input
, 16 ), make_xmm( 2 ) );
2798 sse_movups( func
, x86_make_disp( soa_input
, 32 ), make_xmm( 3 ) );
2799 sse_movups( func
, x86_make_disp( soa_input
, 48 ), make_xmm( 5 ) );
2801 /* Advance to next input */
2802 x86_lea( func
, aos_input
, x86_make_disp(aos_input
, 16) );
2803 x86_lea( func
, soa_input
, x86_make_disp(soa_input
, 64) );
2806 x86_dec( func
, num_inputs
);
2807 x86_jmp( func
, loop_top
);
2808 x86_fixup_fwd_jump( func
, loop_exit_fixup
);
2811 x86_pop( func
, x86_make_reg( file_REG32
, reg_BX
) );
2814 static void soa_to_aos( struct x86_function
*func
,
2820 struct x86_reg soa_output
= x86_make_reg( file_REG32
, reg_AX
);
2821 struct x86_reg aos_output
= x86_make_reg( file_REG32
, reg_BX
);
2822 struct x86_reg num_outputs
= x86_make_reg( file_REG32
, reg_CX
);
2823 struct x86_reg temp
= x86_make_reg( file_REG32
, reg_DX
);
2827 x86_push( func
, x86_make_reg( file_REG32
, reg_BX
) );
2829 x86_mov( func
, aos_output
, x86_fn_arg( func
, arg_aos
) );
2830 x86_mov( func
, soa_output
, x86_fn_arg( func
, arg_machine
) );
2831 x86_lea( func
, soa_output
,
2832 x86_make_disp( soa_output
,
2833 Offset(struct tgsi_exec_machine
, Outputs
) ) );
2834 x86_mov( func
, num_outputs
, x86_fn_arg( func
, arg_num
) );
2837 inner_loop
= x86_get_label( func
);
2839 sse_movups( func
, make_xmm( 0 ), x86_make_disp( soa_output
, 0 ) );
2840 sse_movups( func
, make_xmm( 1 ), x86_make_disp( soa_output
, 16 ) );
2841 sse_movups( func
, make_xmm( 3 ), x86_make_disp( soa_output
, 32 ) );
2842 sse_movups( func
, make_xmm( 4 ), x86_make_disp( soa_output
, 48 ) );
2844 sse_movaps( func
, make_xmm( 2 ), make_xmm( 0 ) );
2845 sse_movaps( func
, make_xmm( 5 ), make_xmm( 3 ) );
2846 sse_unpcklps( func
, make_xmm( 0 ), make_xmm( 1 ) );
2847 sse_unpckhps( func
, make_xmm( 2 ), make_xmm( 1 ) );
2848 sse_unpcklps( func
, make_xmm( 3 ), make_xmm( 4 ) );
2849 sse_unpckhps( func
, make_xmm( 5 ), make_xmm( 4 ) );
2851 x86_mov( func
, temp
, x86_fn_arg( func
, arg_stride
) );
2852 x86_push( func
, aos_output
);
2853 sse_movlps( func
, x86_make_disp( aos_output
, 0 ), make_xmm( 0 ) );
2854 sse_movlps( func
, x86_make_disp( aos_output
, 8 ), make_xmm( 3 ) );
2855 x86_add( func
, aos_output
, temp
);
2856 sse_movhps( func
, x86_make_disp( aos_output
, 0 ), make_xmm( 0 ) );
2857 sse_movhps( func
, x86_make_disp( aos_output
, 8 ), make_xmm( 3 ) );
2858 x86_add( func
, aos_output
, temp
);
2859 sse_movlps( func
, x86_make_disp( aos_output
, 0 ), make_xmm( 2 ) );
2860 sse_movlps( func
, x86_make_disp( aos_output
, 8 ), make_xmm( 5 ) );
2861 x86_add( func
, aos_output
, temp
);
2862 sse_movhps( func
, x86_make_disp( aos_output
, 0 ), make_xmm( 2 ) );
2863 sse_movhps( func
, x86_make_disp( aos_output
, 8 ), make_xmm( 5 ) );
2864 x86_pop( func
, aos_output
);
2866 /* Advance to next output */
2867 x86_lea( func
, aos_output
, x86_make_disp(aos_output
, 16) );
2868 x86_lea( func
, soa_output
, x86_make_disp(soa_output
, 64) );
2870 /* while --num_outputs */
2871 x86_dec( func
, num_outputs
);
2872 x86_jcc( func
, cc_NE
, inner_loop
);
2875 x86_pop( func
, x86_make_reg( file_REG32
, reg_BX
) );
2880 * Check if the instructions dst register is the same as any src
2881 * register and warn if there's a posible SOA dependency.
2884 check_soa_dependencies(const struct tgsi_full_instruction
*inst
)
2886 uint opcode
= inst
->Instruction
.Opcode
;
2888 /* XXX: we only handle src/dst aliasing in a few opcodes currently.
2889 * Need to use an additional temporay to hold the result in the
2890 * cases where the code is too opaque to fix.
2894 case TGSI_OPCODE_ADD
:
2895 case TGSI_OPCODE_MOV
:
2896 case TGSI_OPCODE_MUL
:
2897 case TGSI_OPCODE_RCP
:
2898 case TGSI_OPCODE_RSQ
:
2899 case TGSI_OPCODE_EXP
:
2900 case TGSI_OPCODE_LOG
:
2901 case TGSI_OPCODE_DP3
:
2902 case TGSI_OPCODE_DP4
:
2903 case TGSI_OPCODE_DP2A
:
2904 case TGSI_OPCODE_EX2
:
2905 case TGSI_OPCODE_LG2
:
2906 case TGSI_OPCODE_POW
:
2907 case TGSI_OPCODE_XPD
:
2908 case TGSI_OPCODE_DPH
:
2909 case TGSI_OPCODE_COS
:
2910 case TGSI_OPCODE_SIN
:
2911 case TGSI_OPCODE_TEX
:
2912 case TGSI_OPCODE_TXB
:
2913 case TGSI_OPCODE_TXP
:
2914 case TGSI_OPCODE_NRM
:
2915 case TGSI_OPCODE_NRM4
:
2916 case TGSI_OPCODE_DP2
:
2917 /* OK - these opcodes correctly handle SOA dependencies */
2920 if (!tgsi_check_soa_dependencies(inst
))
2923 debug_printf("Warning: src/dst aliasing in instruction"
2924 " is not handled:\n");
2925 debug_printf("Warning: ");
2926 tgsi_dump_instruction(inst
, 1);
2934 * Translate a TGSI vertex/fragment shader to SSE2 code.
2935 * Slightly different things are done for vertex vs. fragment shaders.
2937 * \param tokens the TGSI input shader
2938 * \param func the output SSE code/function
2939 * \param immediates buffer to place immediates, later passed to SSE func
2940 * \param return 1 for success, 0 if translation failed
2944 const struct tgsi_token
*tokens
,
2945 struct x86_function
*func
,
2946 float (*immediates
)[4],
2947 boolean do_swizzles
)
2949 struct tgsi_parse_context parse
;
2951 uint num_immediates
= 0;
2955 func
->csr
= func
->store
;
2957 tgsi_parse_init( &parse
, tokens
);
2959 /* Can't just use EDI, EBX without save/restoring them:
2961 x86_push( func
, x86_make_reg( file_REG32
, reg_BX
) );
2962 x86_push( func
, x86_make_reg( file_REG32
, reg_DI
) );
2965 * Different function args for vertex/fragment shaders:
2967 if (parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_VERTEX
) {
2973 6 ); /* input_stride */
2979 x86_fn_arg( func
, 1 ) );
2983 x86_fn_arg( func
, 2 ) );
2986 get_immediate_base(),
2987 x86_fn_arg( func
, 3 ) );
2989 if (parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_FRAGMENT
) {
2993 x86_fn_arg( func
, 4 ) );
2999 x86_make_disp( get_machine_base(),
3000 Offset( struct tgsi_exec_machine
, Samplers
) ) );
3002 while( !tgsi_parse_end_of_tokens( &parse
) && ok
) {
3003 tgsi_parse_token( &parse
);
3005 switch( parse
.FullToken
.Token
.Type
) {
3006 case TGSI_TOKEN_TYPE_DECLARATION
:
3007 if (parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_FRAGMENT
) {
3010 &parse
.FullToken
.FullDeclaration
);
3014 case TGSI_TOKEN_TYPE_INSTRUCTION
:
3015 ok
= emit_instruction(
3017 &parse
.FullToken
.FullInstruction
);
3020 uint opcode
= parse
.FullToken
.FullInstruction
.Instruction
.Opcode
;
3021 uint proc
= parse
.FullHeader
.Processor
.Processor
;
3022 debug_printf("failed to translate tgsi opcode %d (%s) to SSE (%s)\n",
3024 tgsi_get_opcode_name(opcode
),
3025 tgsi_get_processor_name(proc
));
3029 ok
= check_soa_dependencies(&parse
.FullToken
.FullInstruction
);
3032 case TGSI_TOKEN_TYPE_IMMEDIATE
:
3033 /* simply copy the immediate values into the next immediates[] slot */
3035 const uint size
= parse
.FullToken
.FullImmediate
.Immediate
.NrTokens
- 1;
3038 assert(num_immediates
< TGSI_EXEC_NUM_IMMEDIATES
);
3039 for( i
= 0; i
< size
; i
++ ) {
3040 immediates
[num_immediates
][i
] =
3041 parse
.FullToken
.FullImmediate
.u
[i
].Float
;
3044 debug_printf("SSE FS immediate[%d] = %f %f %f %f\n",
3046 immediates
[num_immediates
][0],
3047 immediates
[num_immediates
][1],
3048 immediates
[num_immediates
][2],
3049 immediates
[num_immediates
][3]);
3054 case TGSI_TOKEN_TYPE_PROPERTY
:
3055 /* we just ignore them for now */
3064 if (parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_VERTEX
) {
3069 8, /* num_outputs */
3070 9 ); /* output_stride */
3073 /* Can't just use EBX, EDI without save/restoring them:
3075 x86_pop( func
, x86_make_reg( file_REG32
, reg_DI
) );
3076 x86_pop( func
, x86_make_reg( file_REG32
, reg_BX
) );
3080 tgsi_parse_free( &parse
);
3085 #endif /* PIPE_ARCH_X86 */