1 /**************************************************************************
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **************************************************************************/
28 #include "pipe/p_config.h"
30 #if defined(PIPE_ARCH_X86)
32 #include "util/u_debug.h"
33 #include "pipe/p_shader_tokens.h"
34 #include "util/u_math.h"
35 #include "util/u_memory.h"
36 #if defined(PIPE_ARCH_SSE)
37 #include "util/u_sse.h"
39 #include "tgsi/tgsi_parse.h"
40 #include "tgsi/tgsi_util.h"
41 #include "tgsi_exec.h"
42 #include "tgsi_sse2.h"
44 #include "rtasm/rtasm_x86sse.h"
48 * This costs about 100fps (close to 10%) in gears:
50 #define HIGH_PRECISION 1
/* Iterate CHAN over all four SOA channel indices (X, Y, Z, W). */
55 #define FOR_EACH_CHANNEL( CHAN )\
56 for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)
/* Nonzero when channel CHAN is set in dst register 0's writemask. */
58 #define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
59 ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))
/* Guard the following statement on dst0's writemask bit for CHAN. */
61 #define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
62 if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))
/* Loop over only those channels enabled in dst register 0's writemask. */
64 #define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
65 FOR_EACH_CHANNEL( CHAN )\
66 IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )
/* Short local aliases for the well-known tgsi_exec_machine temporary
 * register slots (index/channel pairs) used throughout this file.
 */
73 #define TEMP_ONE_I TGSI_EXEC_TEMP_ONE_I
74 #define TEMP_ONE_C TGSI_EXEC_TEMP_ONE_C
76 #define TEMP_R0 TGSI_EXEC_TEMP_R0
77 #define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
78 #define TEMP_EXEC_MASK_I TGSI_EXEC_MASK_I
79 #define TEMP_EXEC_MASK_C TGSI_EXEC_MASK_C
83 * X86 utility functions.
92 (enum x86_reg_name
) xmm
);
96 * X86 register mapping helpers.
100 get_const_base( void )
107 static struct x86_reg
108 get_machine_base( void )
115 static struct x86_reg
116 get_input_base( void )
118 return x86_make_disp(
120 Offset(struct tgsi_exec_machine
, Inputs
) );
123 static struct x86_reg
124 get_output_base( void )
126 return x86_make_disp(
128 Offset(struct tgsi_exec_machine
, Outputs
) );
131 static struct x86_reg
132 get_temp_base( void )
134 return x86_make_disp(
136 Offset(struct tgsi_exec_machine
, Temps
) );
139 static struct x86_reg
140 get_coef_base( void )
147 static struct x86_reg
148 get_sampler_base( void )
155 static struct x86_reg
156 get_immediate_base( void )
165 * Data access helpers.
169 static struct x86_reg
174 return x86_make_disp(
175 get_immediate_base(),
176 (vec
* 4 + chan
) * 4 );
179 static struct x86_reg
184 return x86_make_disp(
186 (vec
* 4 + chan
) * 4 );
189 static struct x86_reg
193 return x86_make_disp(
195 unit
* sizeof( struct tgsi_sampler
* ) );
198 static struct x86_reg
203 return x86_make_disp(
205 (vec
* 4 + chan
) * 16 );
208 static struct x86_reg
213 return x86_make_disp(
215 (vec
* 4 + chan
) * 16 );
218 static struct x86_reg
223 return x86_make_disp(
225 (vec
* 4 + chan
) * 16 );
228 static struct x86_reg
234 return x86_make_disp(
236 ((vec
* 3 + member
) * 4 + chan
) * 4 );
242 struct x86_function
*func
)
249 * Data fetch helpers.
253 * Copy a shader constant to xmm register
254 * \param xmm the destination xmm register
255 * \param vec the src const buffer index
256 * \param chan src channel to fetch (X, Y, Z or W)
260 struct x86_function
*func
,
269 /* 'vec' is the offset from the address register's value.
270 * We're loading CONST[ADDR+vec] into an xmm register.
272 struct x86_reg r0
= get_input_base();
273 struct x86_reg r1
= get_output_base();
276 assert( indirectFile
== TGSI_FILE_ADDRESS
);
277 assert( indirectIndex
== 0 );
279 x86_push( func
, r0
);
280 x86_push( func
, r1
);
283 * Loop over the four pixels or vertices in the quad.
284 * Get the value of the address (offset) register for pixel/vertex[i],
285 * add it to the src offset and index into the constant buffer.
286 * Note that we're working on SOA data.
287 * If any of the pixel/vertex execution channels are unused their
288 * values will be garbage. It's very important that we don't use
289 * those garbage values as indexes into the constant buffer since
290 * that'll cause segfaults.
291 * The solution is to bitwise-AND the offset with the execution mask
292 * register whose values are either 0 or ~0.
293 * The caller must setup the execution mask register to indicate
294 * which channels are valid/alive before running the shader.
295 * The execution mask will also figure into loops and conditionals
298 for (i
= 0; i
< QUAD_SIZE
; i
++) {
299 /* r1 = address register[i] */
300 x86_mov( func
, r1
, x86_make_disp( get_temp( TEMP_ADDR
, CHAN_X
), i
* 4 ) );
301 /* r0 = execution mask[i] */
302 x86_mov( func
, r0
, x86_make_disp( get_temp( TEMP_EXEC_MASK_I
, TEMP_EXEC_MASK_C
), i
* 4 ) );
304 x86_and( func
, r1
, r0
);
305 /* r0 = 'vec', the offset */
306 x86_lea( func
, r0
, get_const( vec
, chan
) );
308 /* Quick hack to multiply r1 by 16 -- need to add SHL to rtasm.
310 x86_add( func
, r1
, r1
);
311 x86_add( func
, r1
, r1
);
312 x86_add( func
, r1
, r1
);
313 x86_add( func
, r1
, r1
);
315 x86_add( func
, r0
, r1
); /* r0 = r0 + r1 */
316 x86_mov( func
, r1
, x86_deref( r0
) );
317 x86_mov( func
, x86_make_disp( get_temp( TEMP_R0
, CHAN_X
), i
* 4 ), r1
);
326 get_temp( TEMP_R0
, CHAN_X
) );
329 /* 'vec' is the index into the src register file, such as TEMP[vec] */
335 get_const( vec
, chan
) );
340 SHUF( 0, 0, 0, 0 ) );
346 struct x86_function
*func
,
354 get_immediate( vec
, chan
) );
359 SHUF( 0, 0, 0, 0 ) );
364 * Copy a shader input to xmm register
365 * \param xmm the destination xmm register
366 * \param vec the src input attrib
367 * \param chan src channel to fetch (X, Y, Z or W)
371 struct x86_function
*func
,
379 get_input( vec
, chan
) );
383 * Store an xmm register to a shader output
384 * \param xmm the source xmm register
385 * \param vec the dest output attrib
386 * \param chan src dest channel to store (X, Y, Z or W)
390 struct x86_function
*func
,
397 get_output( vec
, chan
),
402 * Copy a shader temporary to xmm register
403 * \param xmm the destination xmm register
404 * \param vec the src temp register
405 * \param chan src channel to fetch (X, Y, Z or W)
409 struct x86_function
*func
,
417 get_temp( vec
, chan
) );
421 * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
422 * \param xmm the destination xmm register
423 * \param vec the src input/attribute coefficient index
424 * \param chan src channel to fetch (X, Y, Z or W)
425 * \param member 0=a0, 1=dadx, 2=dady
429 struct x86_function
*func
,
438 get_coef( vec
, chan
, member
) );
443 SHUF( 0, 0, 0, 0 ) );
447 * Data store helpers.
452 struct x86_function
*func
,
459 get_input( vec
, chan
),
465 struct x86_function
*func
,
472 get_temp( vec
, chan
),
478 struct x86_function
*func
,
488 vec
+ TGSI_EXEC_TEMP_ADDR
,
493 * Coefficient fetch helpers.
498 struct x86_function
*func
,
513 struct x86_function
*func
,
528 struct x86_function
*func
,
542 * Function call helpers.
546 * NOTE: In gcc, if the destination uses the SSE intrinsics, then it must be
547 * defined with __attribute__((force_align_arg_pointer)), as we do not guarantee
548 * that the stack pointer is 16 byte aligned, as expected.
552 struct x86_function
*func
,
553 unsigned xmm_save_mask
,
554 const struct x86_reg
*arg
,
556 void (PIPE_CDECL
*code
)() )
558 struct x86_reg ecx
= x86_make_reg( file_REG32
, reg_CX
);
563 x86_make_reg( file_REG32
, reg_AX
) );
566 x86_make_reg( file_REG32
, reg_CX
) );
569 x86_make_reg( file_REG32
, reg_DX
) );
571 /* Store XMM regs to the stack
573 for(i
= 0, n
= 0; i
< 8; ++i
)
574 if(xmm_save_mask
& (1 << i
))
579 x86_make_reg( file_REG32
, reg_SP
),
582 for(i
= 0, n
= 0; i
< 8; ++i
)
583 if(xmm_save_mask
& (1 << i
)) {
586 x86_make_disp( x86_make_reg( file_REG32
, reg_SP
), n
*16 ),
591 for (i
= 0; i
< nr_args
; i
++) {
592 /* Load the address of the buffer we use for passing arguments and
600 /* Push actual function arguments (currently just the pointer to
601 * the buffer above), and call the function:
603 x86_push( func
, ecx
);
606 x86_mov_reg_imm( func
, ecx
, (unsigned long) code
);
607 x86_call( func
, ecx
);
609 /* Pop the arguments (or just add an immediate to esp)
611 for (i
= 0; i
< nr_args
; i
++) {
615 /* Pop the saved XMM regs:
617 for(i
= 0, n
= 0; i
< 8; ++i
)
618 if(xmm_save_mask
& (1 << i
)) {
622 x86_make_disp( x86_make_reg( file_REG32
, reg_SP
), n
*16 ) );
628 x86_make_reg( file_REG32
, reg_SP
),
631 /* Restore GP registers in a reverse order.
635 x86_make_reg( file_REG32
, reg_DX
) );
638 x86_make_reg( file_REG32
, reg_CX
) );
641 x86_make_reg( file_REG32
, reg_AX
) );
645 emit_func_call_dst_src1(
646 struct x86_function
*func
,
650 void (PIPE_CDECL
*code
)() )
652 struct x86_reg store
= get_temp( TEMP_R0
, 0 );
653 unsigned xmm_mask
= ((1 << xmm_save
) - 1) & ~(1 << xmm_dst
);
655 /* Store our input parameters (in xmm regs) to the buffer we use
656 * for passing arguments. We will pass a pointer to this buffer as
657 * the actual function argument.
662 make_xmm( xmm_src0
) );
664 emit_func_call( func
,
678 emit_func_call_dst_src2(
679 struct x86_function
*func
,
684 void (PIPE_CDECL
*code
)() )
686 struct x86_reg store
= get_temp( TEMP_R0
, 0 );
687 unsigned xmm_mask
= ((1 << xmm_save
) - 1) & ~(1 << xmm_dst
);
689 /* Store two inputs to parameter buffer.
694 make_xmm( xmm_src0
) );
698 x86_make_disp( store
, 4 * sizeof(float) ),
699 make_xmm( xmm_src1
) );
704 emit_func_call( func
,
710 /* Retrieve the results:
722 #if defined(PIPE_ARCH_SSE)
725 * Fast SSE2 implementation of special math functions.
/* Degree-N polynomial evaluation in SSE, expanded via Horner's scheme:
 * POLYn(x, c0..cn) computes c0 + x*(c1 + x*(... + x*cn)) on all four
 * lanes of an __m128.  Used below for the exp2/log2 minimax fits.
 */
728 #define POLY0(x, c0) _mm_set1_ps(c0)
729 #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
730 #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
731 #define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
732 #define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
733 #define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
/* Polynomial degrees selected for the exp2f4()/log2f4() approximations
 * below; higher degree = more accuracy at more multiply/add cost.
 */
735 #define EXP_POLY_DEGREE 3
736 #define LOG_POLY_DEGREE 5
739 * See http://www.devmaster.net/forums/showthread.php?p=43580
745 __m128 fpart
, expipart
, expfpart
;
747 x
= _mm_min_ps(x
, _mm_set1_ps( 129.00000f
));
748 x
= _mm_max_ps(x
, _mm_set1_ps(-126.99999f
));
750 /* ipart = int(x - 0.5) */
751 ipart
= _mm_cvtps_epi32(_mm_sub_ps(x
, _mm_set1_ps(0.5f
)));
753 /* fpart = x - ipart */
754 fpart
= _mm_sub_ps(x
, _mm_cvtepi32_ps(ipart
));
756 /* expipart = (float) (1 << ipart) */
757 expipart
= _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart
, _mm_set1_epi32(127)), 23));
759 /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
760 #if EXP_POLY_DEGREE == 5
761 expfpart
= POLY5(fpart
, 9.9999994e-1f
, 6.9315308e-1f
, 2.4015361e-1f
, 5.5826318e-2f
, 8.9893397e-3f
, 1.8775767e-3f
);
762 #elif EXP_POLY_DEGREE == 4
763 expfpart
= POLY4(fpart
, 1.0000026f
, 6.9300383e-1f
, 2.4144275e-1f
, 5.2011464e-2f
, 1.3534167e-2f
);
764 #elif EXP_POLY_DEGREE == 3
765 expfpart
= POLY3(fpart
, 9.9992520e-1f
, 6.9583356e-1f
, 2.2606716e-1f
, 7.8024521e-2f
);
766 #elif EXP_POLY_DEGREE == 2
767 expfpart
= POLY2(fpart
, 1.0017247f
, 6.5763628e-1f
, 3.3718944e-1f
);
772 return _mm_mul_ps(expipart
, expfpart
);
777 * See http://www.devmaster.net/forums/showthread.php?p=43580
782 __m128i expmask
= _mm_set1_epi32(0x7f800000);
783 __m128i mantmask
= _mm_set1_epi32(0x007fffff);
784 __m128 one
= _mm_set1_ps(1.0f
);
786 __m128i i
= _mm_castps_si128(x
);
788 /* exp = (float) exponent(x) */
789 __m128 exp
= _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i
, expmask
), 23), _mm_set1_epi32(127)));
791 /* mant = (float) mantissa(x) */
792 __m128 mant
= _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i
, mantmask
)), one
);
796 /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
797 * These coefficients can be generated with
798 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
800 #if LOG_POLY_DEGREE == 6
801 logmant
= POLY5(mant
, 3.11578814719469302614f
, -3.32419399085241980044f
, 2.59883907202499966007f
, -1.23152682416275988241f
, 0.318212422185251071475f
, -0.0344359067839062357313f
);
802 #elif LOG_POLY_DEGREE == 5
803 logmant
= POLY4(mant
, 2.8882704548164776201f
, -2.52074962577807006663f
, 1.48116647521213171641f
, -0.465725644288844778798f
, 0.0596515482674574969533f
);
804 #elif LOG_POLY_DEGREE == 4
805 logmant
= POLY3(mant
, 2.61761038894603480148f
, -1.75647175389045657003f
, 0.688243882994381274313f
, -0.107254423828329604454f
);
806 #elif LOG_POLY_DEGREE == 3
807 logmant
= POLY2(mant
, 2.28330284476918490682f
, -1.04913055217340124191f
, 0.204446009836232697516f
);
812 /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
813 logmant
= _mm_mul_ps(logmant
, _mm_sub_ps(mant
, one
));
815 return _mm_add_ps(logmant
, exp
);
820 powf4(__m128 x
, __m128 y
)
822 return exp2f4(_mm_mul_ps(log2f4(x
), y
));
825 #endif /* PIPE_ARCH_SSE */
830 * Low-level instruction translators.
835 struct x86_function
*func
,
842 TGSI_EXEC_TEMP_7FFFFFFF_I
,
843 TGSI_EXEC_TEMP_7FFFFFFF_C
) );
848 struct x86_function
*func
,
855 make_xmm( xmm_src
) );
858 static void PIPE_CDECL
862 store
[0] = cosf( store
[0] );
863 store
[1] = cosf( store
[1] );
864 store
[2] = cosf( store
[2] );
865 store
[3] = cosf( store
[3] );
870 struct x86_function
*func
,
874 emit_func_call_dst_src1(
882 static void PIPE_CDECL
883 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
884 __attribute__((force_align_arg_pointer
))
889 #if defined(PIPE_ARCH_SSE)
890 _mm_store_ps(&store
[0], exp2f4( _mm_load_ps(&store
[0]) ));
892 store
[0] = util_fast_exp2( store
[0] );
893 store
[1] = util_fast_exp2( store
[1] );
894 store
[2] = util_fast_exp2( store
[2] );
895 store
[3] = util_fast_exp2( store
[3] );
901 struct x86_function
*func
,
905 emit_func_call_dst_src1(
915 struct x86_function
*func
,
926 struct x86_function
*func
,
935 static void PIPE_CDECL
939 store
[0] = floorf( store
[0] );
940 store
[1] = floorf( store
[1] );
941 store
[2] = floorf( store
[2] );
942 store
[3] = floorf( store
[3] );
947 struct x86_function
*func
,
951 emit_func_call_dst_src1(
959 static void PIPE_CDECL
963 store
[0] -= floorf( store
[0] );
964 store
[1] -= floorf( store
[1] );
965 store
[2] -= floorf( store
[2] );
966 store
[3] -= floorf( store
[3] );
971 struct x86_function
*func
,
975 emit_func_call_dst_src1(
983 static void PIPE_CDECL
984 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
985 __attribute__((force_align_arg_pointer
))
990 #if defined(PIPE_ARCH_SSE)
991 _mm_store_ps(&store
[0], log2f4( _mm_load_ps(&store
[0]) ));
993 store
[0] = util_fast_log2( store
[0] );
994 store
[1] = util_fast_log2( store
[1] );
995 store
[2] = util_fast_log2( store
[2] );
996 store
[3] = util_fast_log2( store
[3] );
1002 struct x86_function
*func
,
1006 emit_func_call_dst_src1(
1016 struct x86_function
*func
,
1022 make_xmm( xmm_dst
),
1023 make_xmm( xmm_src
) );
1027 emit_mul (struct x86_function
*func
,
1033 make_xmm( xmm_dst
),
1034 make_xmm( xmm_src
) );
1039 struct x86_function
*func
,
1046 TGSI_EXEC_TEMP_80000000_I
,
1047 TGSI_EXEC_TEMP_80000000_C
) );
1050 static void PIPE_CDECL
1051 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
1052 __attribute__((force_align_arg_pointer
))
1057 #if defined(PIPE_ARCH_SSE)
1058 _mm_store_ps(&store
[0], powf4( _mm_load_ps(&store
[0]), _mm_load_ps(&store
[4]) ));
1060 store
[0] = util_fast_pow( store
[0], store
[4] );
1061 store
[1] = util_fast_pow( store
[1], store
[5] );
1062 store
[2] = util_fast_pow( store
[2], store
[6] );
1063 store
[3] = util_fast_pow( store
[3], store
[7] );
1069 struct x86_function
*func
,
1075 emit_func_call_dst_src2(
1086 struct x86_function
*func
,
1090 /* On Intel CPUs at least, this is only accurate to 12 bits -- not
1091 * good enough. Need to either emit a proper divide or use the
1092 * iterative technique described below in emit_rsqrt().
1096 make_xmm( xmm_dst
),
1097 make_xmm( xmm_src
) );
1100 static void PIPE_CDECL
1104 store
[0] = floorf( store
[0] + 0.5f
);
1105 store
[1] = floorf( store
[1] + 0.5f
);
1106 store
[2] = floorf( store
[2] + 0.5f
);
1107 store
[3] = floorf( store
[3] + 0.5f
);
1112 struct x86_function
*func
,
1116 emit_func_call_dst_src1(
1126 struct x86_function
*func
,
1131 /* Although rsqrtps() and rcpps() are low precision on some/all SSE
1132 * implementations, it is possible to improve its precision at
1133 * fairly low cost, using a newton/raphson step, as below:
1135 * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
1136 * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
1138 * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
1141 struct x86_reg dst
= make_xmm( xmm_dst
);
1142 struct x86_reg src
= make_xmm( xmm_src
);
1143 struct x86_reg tmp0
= make_xmm( 2 );
1144 struct x86_reg tmp1
= make_xmm( 3 );
1146 assert( xmm_dst
!= xmm_src
);
1147 assert( xmm_dst
!= 2 && xmm_dst
!= 3 );
1148 assert( xmm_src
!= 2 && xmm_src
!= 3 );
1150 sse_movaps( func
, dst
, get_temp( TGSI_EXEC_TEMP_HALF_I
, TGSI_EXEC_TEMP_HALF_C
) );
1151 sse_movaps( func
, tmp0
, get_temp( TGSI_EXEC_TEMP_THREE_I
, TGSI_EXEC_TEMP_THREE_C
) );
1152 sse_rsqrtps( func
, tmp1
, src
);
1153 sse_mulps( func
, src
, tmp1
);
1154 sse_mulps( func
, dst
, tmp1
);
1155 sse_mulps( func
, src
, tmp1
);
1156 sse_subps( func
, tmp0
, src
);
1157 sse_mulps( func
, dst
, tmp0
);
1160 /* On Intel CPUs at least, this is only accurate to 12 bits -- not
1165 make_xmm( xmm_dst
),
1166 make_xmm( xmm_src
) );
1172 struct x86_function
*func
,
1179 TGSI_EXEC_TEMP_80000000_I
,
1180 TGSI_EXEC_TEMP_80000000_C
) );
1183 static void PIPE_CDECL
1187 store
[0] = store
[0] < 0.0f
? -1.0f
: store
[0] > 0.0f
? 1.0f
: 0.0f
;
1188 store
[1] = store
[1] < 0.0f
? -1.0f
: store
[1] > 0.0f
? 1.0f
: 0.0f
;
1189 store
[2] = store
[2] < 0.0f
? -1.0f
: store
[2] > 0.0f
? 1.0f
: 0.0f
;
1190 store
[3] = store
[3] < 0.0f
? -1.0f
: store
[3] > 0.0f
? 1.0f
: 0.0f
;
1195 struct x86_function
*func
,
1199 emit_func_call_dst_src1(
1207 static void PIPE_CDECL
1211 store
[0] = sinf( store
[0] );
1212 store
[1] = sinf( store
[1] );
1213 store
[2] = sinf( store
[2] );
1214 store
[3] = sinf( store
[3] );
1218 emit_sin (struct x86_function
*func
,
1222 emit_func_call_dst_src1(
1232 struct x86_function
*func
,
1238 make_xmm( xmm_dst
),
1239 make_xmm( xmm_src
) );
1254 struct x86_function
*func
,
1256 const struct tgsi_full_src_register
*reg
,
1257 const unsigned chan_index
)
1259 unsigned swizzle
= tgsi_util_get_full_src_register_extswizzle( reg
, chan_index
);
1262 case TGSI_EXTSWIZZLE_X
:
1263 case TGSI_EXTSWIZZLE_Y
:
1264 case TGSI_EXTSWIZZLE_Z
:
1265 case TGSI_EXTSWIZZLE_W
:
1266 switch (reg
->SrcRegister
.File
) {
1267 case TGSI_FILE_CONSTANT
:
1271 reg
->SrcRegister
.Index
,
1273 reg
->SrcRegister
.Indirect
,
1274 reg
->SrcRegisterInd
.File
,
1275 reg
->SrcRegisterInd
.Index
);
1278 case TGSI_FILE_IMMEDIATE
:
1282 reg
->SrcRegister
.Index
,
1286 case TGSI_FILE_INPUT
:
1290 reg
->SrcRegister
.Index
,
1294 case TGSI_FILE_TEMPORARY
:
1298 reg
->SrcRegister
.Index
,
1307 case TGSI_EXTSWIZZLE_ZERO
:
1311 TGSI_EXEC_TEMP_00000000_I
,
1312 TGSI_EXEC_TEMP_00000000_C
);
1315 case TGSI_EXTSWIZZLE_ONE
:
1327 switch( tgsi_util_get_full_src_register_sign_mode( reg
, chan_index
) ) {
1328 case TGSI_UTIL_SIGN_CLEAR
:
1329 emit_abs( func
, xmm
);
1332 case TGSI_UTIL_SIGN_SET
:
1333 emit_setsign( func
, xmm
);
1336 case TGSI_UTIL_SIGN_TOGGLE
:
1337 emit_neg( func
, xmm
);
1340 case TGSI_UTIL_SIGN_KEEP
:
/* Load src operand INDEX's channel CHAN of instruction INST into xmm
 * register XMM (wraps emit_fetch with the full src register struct).
 */
1345 #define FETCH( FUNC, INST, XMM, INDEX, CHAN )\
1346 emit_fetch( FUNC, XMM, &(INST).FullSrcRegisters[INDEX], CHAN )
1354 struct x86_function
*func
,
1356 const struct tgsi_full_dst_register
*reg
,
1357 const struct tgsi_full_instruction
*inst
,
1358 unsigned chan_index
)
1360 switch( reg
->DstRegister
.File
) {
1361 case TGSI_FILE_OUTPUT
:
1365 reg
->DstRegister
.Index
,
1369 case TGSI_FILE_TEMPORARY
:
1373 reg
->DstRegister
.Index
,
1377 case TGSI_FILE_ADDRESS
:
1381 reg
->DstRegister
.Index
,
1389 switch( inst
->Instruction
.Saturate
) {
1393 case TGSI_SAT_ZERO_ONE
:
1397 case TGSI_SAT_MINUS_PLUS_ONE
:
/* Store xmm register XMM to dst operand INDEX's channel CHAN of
 * instruction INST (wraps emit_store; INST is also passed so saturate
 * modes can be applied).
 */
1403 #define STORE( FUNC, INST, XMM, INDEX, CHAN )\
1404 emit_store( FUNC, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN )
1407 static void PIPE_CDECL
1408 fetch_texel( struct tgsi_sampler
**sampler
,
1414 debug_printf("%s sampler: %p (%p) store: %p\n",
1419 debug_printf("lodbias %f\n", store
[12]);
1421 for (j
= 0; j
< 4; j
++)
1422 debug_printf("sample %d texcoord %f %f\n",
1429 float rgba
[NUM_CHANNELS
][QUAD_SIZE
];
1430 (*sampler
)->get_samples(*sampler
,
1434 0.0f
, /*store[12], lodbias */
1437 memcpy( store
, rgba
, 16 * sizeof(float));
1441 for (j
= 0; j
< 4; j
++)
1442 debug_printf("sample %d result %f %f %f %f\n",
1452 * High-level instruction translators.
1456 emit_tex( struct x86_function
*func
,
1457 const struct tgsi_full_instruction
*inst
,
1461 const uint unit
= inst
->FullSrcRegisters
[1].SrcRegister
.Index
;
1462 struct x86_reg args
[2];
1466 switch (inst
->InstructionExtTexture
.Texture
) {
1467 case TGSI_TEXTURE_1D
:
1468 case TGSI_TEXTURE_SHADOW1D
:
1471 case TGSI_TEXTURE_2D
:
1472 case TGSI_TEXTURE_RECT
:
1473 case TGSI_TEXTURE_SHADOW2D
:
1474 case TGSI_TEXTURE_SHADOWRECT
:
1477 case TGSI_TEXTURE_3D
:
1478 case TGSI_TEXTURE_CUBE
:
1487 FETCH( func
, *inst
, 3, 0, 3 );
1493 TGSI_EXEC_TEMP_00000000_I
,
1494 TGSI_EXEC_TEMP_00000000_C
);
1498 /* store lodbias whether enabled or not -- fetch_texel currently
1499 * respects it always.
1502 get_temp( TEMP_R0
, 3 ),
1507 FETCH( func
, *inst
, 3, 0, 3 );
1509 emit_rcp( func
, 3, 3 );
1512 for (i
= 0; i
< count
; i
++) {
1513 FETCH( func
, *inst
, i
, 0, i
);
1522 /* Store in the argument buffer:
1526 get_temp( TEMP_R0
, i
),
1530 args
[0] = get_temp( TEMP_R0
, 0 );
1531 args
[1] = get_sampler_ptr( unit
);
1534 emit_func_call( func
,
1540 /* If all four channels are enabled, could use a pointer to
1541 * dst[0].x instead of TEMP_R0 for store?
1543 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, i
) {
1548 get_temp( TEMP_R0
, i
) );
1550 STORE( func
, *inst
, 0, 0, i
);
1557 struct x86_function
*func
,
1558 const struct tgsi_full_src_register
*reg
)
1560 unsigned uniquemask
;
1561 unsigned unique_count
= 0;
1562 unsigned chan_index
;
1565 /* This mask stores component bits that were already tested. Note that
1566 * we test if the value is less than zero, so 1.0 and 0.0 need not to be
1568 uniquemask
= (1 << TGSI_EXTSWIZZLE_ZERO
) | (1 << TGSI_EXTSWIZZLE_ONE
);
1570 FOR_EACH_CHANNEL( chan_index
) {
1573 /* unswizzle channel */
1574 swizzle
= tgsi_util_get_full_src_register_extswizzle(
1578 /* check if the component has not been already tested */
1579 if( !(uniquemask
& (1 << swizzle
)) ) {
1580 uniquemask
|= 1 << swizzle
;
1582 /* allocate register */
1593 x86_make_reg( file_REG32
, reg_AX
) );
1596 x86_make_reg( file_REG32
, reg_DX
) );
1598 for (i
= 0 ; i
< unique_count
; i
++ ) {
1599 struct x86_reg dataXMM
= make_xmm(i
);
1605 TGSI_EXEC_TEMP_00000000_I
,
1606 TGSI_EXEC_TEMP_00000000_C
),
1612 x86_make_reg( file_REG32
, reg_AX
),
1618 x86_make_reg( file_REG32
, reg_DX
),
1622 x86_make_reg( file_REG32
, reg_AX
),
1623 x86_make_reg( file_REG32
, reg_DX
) );
1630 TGSI_EXEC_TEMP_KILMASK_I
,
1631 TGSI_EXEC_TEMP_KILMASK_C
),
1632 x86_make_reg( file_REG32
, reg_AX
) );
1636 x86_make_reg( file_REG32
, reg_DX
) );
1639 x86_make_reg( file_REG32
, reg_AX
) );
1645 struct x86_function
*func
)
1647 /* XXX todo / fix me */
1653 struct x86_function
*func
,
1654 struct tgsi_full_instruction
*inst
,
1657 unsigned chan_index
;
1659 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1660 FETCH( func
, *inst
, 0, 0, chan_index
);
1661 FETCH( func
, *inst
, 1, 1, chan_index
);
1673 STORE( func
, *inst
, 0, 0, chan_index
);
1679 struct x86_function
*func
,
1680 struct tgsi_full_instruction
*inst
)
1682 unsigned chan_index
;
1684 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1685 FETCH( func
, *inst
, 0, 0, chan_index
);
1686 FETCH( func
, *inst
, 1, 1, chan_index
);
1687 FETCH( func
, *inst
, 2, 2, chan_index
);
1692 TGSI_EXEC_TEMP_00000000_I
,
1693 TGSI_EXEC_TEMP_00000000_C
),
1707 STORE( func
, *inst
, 0, 0, chan_index
);
1713 * Check if inst src/dest regs use indirect addressing into temporary
1717 indirect_temp_reference(const struct tgsi_full_instruction
*inst
)
1720 for (i
= 0; i
< inst
->Instruction
.NumSrcRegs
; i
++) {
1721 const struct tgsi_full_src_register
*reg
= &inst
->FullSrcRegisters
[i
];
1722 if (reg
->SrcRegister
.File
== TGSI_FILE_TEMPORARY
&&
1723 reg
->SrcRegister
.Indirect
)
1726 for (i
= 0; i
< inst
->Instruction
.NumDstRegs
; i
++) {
1727 const struct tgsi_full_dst_register
*reg
= &inst
->FullDstRegisters
[i
];
1728 if (reg
->DstRegister
.File
== TGSI_FILE_TEMPORARY
&&
1729 reg
->DstRegister
.Indirect
)
1738 struct x86_function
*func
,
1739 struct tgsi_full_instruction
*inst
)
1741 unsigned chan_index
;
1743 /* we can't handle indirect addressing into temp register file yet */
1744 if (indirect_temp_reference(inst
))
1747 switch (inst
->Instruction
.Opcode
) {
1748 case TGSI_OPCODE_ARL
:
1749 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1750 FETCH( func
, *inst
, 0, 0, chan_index
);
1751 emit_flr(func
, 0, 0);
1752 emit_f2it( func
, 0 );
1753 STORE( func
, *inst
, 0, 0, chan_index
);
1757 case TGSI_OPCODE_MOV
:
1758 case TGSI_OPCODE_SWZ
:
1759 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1760 FETCH( func
, *inst
, 0, 0, chan_index
);
1761 STORE( func
, *inst
, 0, 0, chan_index
);
1765 case TGSI_OPCODE_LIT
:
1766 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1767 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) ) {
1773 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ) {
1774 STORE( func
, *inst
, 0, 0, CHAN_X
);
1776 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) ) {
1777 STORE( func
, *inst
, 0, 0, CHAN_W
);
1780 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ||
1781 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
1782 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ) {
1783 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1788 TGSI_EXEC_TEMP_00000000_I
,
1789 TGSI_EXEC_TEMP_00000000_C
) );
1790 STORE( func
, *inst
, 0, 0, CHAN_Y
);
1792 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
1793 /* XMM[1] = SrcReg[0].yyyy */
1794 FETCH( func
, *inst
, 1, 0, CHAN_Y
);
1795 /* XMM[1] = max(XMM[1], 0) */
1800 TGSI_EXEC_TEMP_00000000_I
,
1801 TGSI_EXEC_TEMP_00000000_C
) );
1802 /* XMM[2] = SrcReg[0].wwww */
1803 FETCH( func
, *inst
, 2, 0, CHAN_W
);
1804 /* XMM[2] = min(XMM[2], 128.0) */
1809 TGSI_EXEC_TEMP_128_I
,
1810 TGSI_EXEC_TEMP_128_C
) );
1811 /* XMM[2] = max(XMM[2], -128.0) */
1816 TGSI_EXEC_TEMP_MINUS_128_I
,
1817 TGSI_EXEC_TEMP_MINUS_128_C
) );
1818 emit_pow( func
, 3, 1, 1, 2 );
1819 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1833 STORE( func
, *inst
, 2, 0, CHAN_Z
);
1838 case TGSI_OPCODE_RCP
:
1839 /* TGSI_OPCODE_RECIP */
1840 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1841 emit_rcp( func
, 0, 0 );
1842 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1843 STORE( func
, *inst
, 0, 0, chan_index
);
1847 case TGSI_OPCODE_RSQ
:
1848 /* TGSI_OPCODE_RECIPSQRT */
1849 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1850 emit_abs( func
, 0 );
1851 emit_rsqrt( func
, 1, 0 );
1852 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1853 STORE( func
, *inst
, 1, 0, chan_index
);
1857 case TGSI_OPCODE_EXP
:
1858 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1859 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ||
1860 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
1861 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1862 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1863 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
1864 emit_MOV( func
, 1, 0 );
1865 emit_flr( func
, 2, 1 );
1866 /* dst.x = ex2(floor(src.x)) */
1867 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
)) {
1868 emit_MOV( func
, 2, 1 );
1869 emit_ex2( func
, 3, 2 );
1870 STORE( func
, *inst
, 2, 0, CHAN_X
);
1872 /* dst.y = src.x - floor(src.x) */
1873 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
1874 emit_MOV( func
, 2, 0 );
1875 emit_sub( func
, 2, 1 );
1876 STORE( func
, *inst
, 2, 0, CHAN_Y
);
1879 /* dst.z = ex2(src.x) */
1880 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
1881 emit_ex2( func
, 3, 0 );
1882 STORE( func
, *inst
, 0, 0, CHAN_Z
);
1886 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
)) {
1887 emit_tempf( func
, 0, TEMP_ONE_I
, TEMP_ONE_C
);
1888 STORE( func
, *inst
, 0, 0, CHAN_W
);
1892 case TGSI_OPCODE_LOG
:
1893 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1894 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ||
1895 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
1896 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1897 emit_abs( func
, 0 );
1898 emit_MOV( func
, 1, 0 );
1899 emit_lg2( func
, 2, 1 );
1900 /* dst.z = lg2(abs(src.x)) */
1901 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
1902 STORE( func
, *inst
, 1, 0, CHAN_Z
);
1904 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1905 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
1906 emit_flr( func
, 2, 1 );
1907 /* dst.x = floor(lg2(abs(src.x))) */
1908 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
)) {
1909 STORE( func
, *inst
, 1, 0, CHAN_X
);
1911 /* dst.x = abs(src)/ex2(floor(lg2(abs(src.x)))) */
1912 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
1913 emit_ex2( func
, 2, 1 );
1914 emit_rcp( func
, 1, 1 );
1915 emit_mul( func
, 0, 1 );
1916 STORE( func
, *inst
, 0, 0, CHAN_Y
);
1921 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
)) {
1922 emit_tempf( func
, 0, TEMP_ONE_I
, TEMP_ONE_C
);
1923 STORE( func
, *inst
, 0, 0, CHAN_W
);
1927 case TGSI_OPCODE_MUL
:
1928 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1929 FETCH( func
, *inst
, 0, 0, chan_index
);
1930 FETCH( func
, *inst
, 1, 1, chan_index
);
1931 emit_mul( func
, 0, 1 );
1932 STORE( func
, *inst
, 0, 0, chan_index
);
1936 case TGSI_OPCODE_ADD
:
1937 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1938 FETCH( func
, *inst
, 0, 0, chan_index
);
1939 FETCH( func
, *inst
, 1, 1, chan_index
);
1940 emit_add( func
, 0, 1 );
1941 STORE( func
, *inst
, 0, 0, chan_index
);
1945 case TGSI_OPCODE_DP3
:
1946 /* TGSI_OPCODE_DOT3 */
1947 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1948 FETCH( func
, *inst
, 1, 1, CHAN_X
);
1949 emit_mul( func
, 0, 1 );
1950 FETCH( func
, *inst
, 1, 0, CHAN_Y
);
1951 FETCH( func
, *inst
, 2, 1, CHAN_Y
);
1952 emit_mul( func
, 1, 2 );
1953 emit_add( func
, 0, 1 );
1954 FETCH( func
, *inst
, 1, 0, CHAN_Z
);
1955 FETCH( func
, *inst
, 2, 1, CHAN_Z
);
1956 emit_mul( func
, 1, 2 );
1957 emit_add( func
, 0, 1 );
1958 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1959 STORE( func
, *inst
, 0, 0, chan_index
);
1963 case TGSI_OPCODE_DP4
:
1964 /* TGSI_OPCODE_DOT4 */
1965 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1966 FETCH( func
, *inst
, 1, 1, CHAN_X
);
1967 emit_mul( func
, 0, 1 );
1968 FETCH( func
, *inst
, 1, 0, CHAN_Y
);
1969 FETCH( func
, *inst
, 2, 1, CHAN_Y
);
1970 emit_mul( func
, 1, 2 );
1971 emit_add( func
, 0, 1 );
1972 FETCH( func
, *inst
, 1, 0, CHAN_Z
);
1973 FETCH( func
, *inst
, 2, 1, CHAN_Z
);
1974 emit_mul(func
, 1, 2 );
1975 emit_add(func
, 0, 1 );
1976 FETCH( func
, *inst
, 1, 0, CHAN_W
);
1977 FETCH( func
, *inst
, 2, 1, CHAN_W
);
1978 emit_mul( func
, 1, 2 );
1979 emit_add( func
, 0, 1 );
1980 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1981 STORE( func
, *inst
, 0, 0, chan_index
);
1985 case TGSI_OPCODE_DST
:
1986 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) {
1992 STORE( func
, *inst
, 0, 0, CHAN_X
);
1994 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) {
1995 FETCH( func
, *inst
, 0, 0, CHAN_Y
);
1996 FETCH( func
, *inst
, 1, 1, CHAN_Y
);
1997 emit_mul( func
, 0, 1 );
1998 STORE( func
, *inst
, 0, 0, CHAN_Y
);
2000 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) {
2001 FETCH( func
, *inst
, 0, 0, CHAN_Z
);
2002 STORE( func
, *inst
, 0, 0, CHAN_Z
);
2004 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) {
2005 FETCH( func
, *inst
, 0, 1, CHAN_W
);
2006 STORE( func
, *inst
, 0, 0, CHAN_W
);
2010 case TGSI_OPCODE_MIN
:
2011 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2012 FETCH( func
, *inst
, 0, 0, chan_index
);
2013 FETCH( func
, *inst
, 1, 1, chan_index
);
2018 STORE( func
, *inst
, 0, 0, chan_index
);
2022 case TGSI_OPCODE_MAX
:
2023 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2024 FETCH( func
, *inst
, 0, 0, chan_index
);
2025 FETCH( func
, *inst
, 1, 1, chan_index
);
2030 STORE( func
, *inst
, 0, 0, chan_index
);
2034 case TGSI_OPCODE_SLT
:
2035 /* TGSI_OPCODE_SETLT */
2036 emit_setcc( func
, inst
, cc_LessThan
);
2039 case TGSI_OPCODE_SGE
:
2040 /* TGSI_OPCODE_SETGE */
2041 emit_setcc( func
, inst
, cc_NotLessThan
);
2044 case TGSI_OPCODE_MAD
:
2045 /* TGSI_OPCODE_MADD */
2046 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2047 FETCH( func
, *inst
, 0, 0, chan_index
);
2048 FETCH( func
, *inst
, 1, 1, chan_index
);
2049 FETCH( func
, *inst
, 2, 2, chan_index
);
2050 emit_mul( func
, 0, 1 );
2051 emit_add( func
, 0, 2 );
2052 STORE( func
, *inst
, 0, 0, chan_index
);
2056 case TGSI_OPCODE_SUB
:
2057 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2058 FETCH( func
, *inst
, 0, 0, chan_index
);
2059 FETCH( func
, *inst
, 1, 1, chan_index
);
2060 emit_sub( func
, 0, 1 );
2061 STORE( func
, *inst
, 0, 0, chan_index
);
2065 case TGSI_OPCODE_LERP
:
2066 /* TGSI_OPCODE_LRP */
2067 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2068 FETCH( func
, *inst
, 0, 0, chan_index
);
2069 FETCH( func
, *inst
, 1, 1, chan_index
);
2070 FETCH( func
, *inst
, 2, 2, chan_index
);
2071 emit_sub( func
, 1, 2 );
2072 emit_mul( func
, 0, 1 );
2073 emit_add( func
, 0, 2 );
2074 STORE( func
, *inst
, 0, 0, chan_index
);
2078 case TGSI_OPCODE_CND
:
2082 case TGSI_OPCODE_CND0
:
2086 case TGSI_OPCODE_DOT2ADD
:
2087 /* TGSI_OPCODE_DP2A */
2088 FETCH( func
, *inst
, 0, 0, CHAN_X
); /* xmm0 = src[0].x */
2089 FETCH( func
, *inst
, 1, 1, CHAN_X
); /* xmm1 = src[1].x */
2090 emit_mul( func
, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
2091 FETCH( func
, *inst
, 1, 0, CHAN_Y
); /* xmm1 = src[0].y */
2092 FETCH( func
, *inst
, 2, 1, CHAN_Y
); /* xmm2 = src[1].y */
2093 emit_mul( func
, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
2094 emit_add( func
, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2095 FETCH( func
, *inst
, 1, 2, CHAN_X
); /* xmm1 = src[2].x */
2096 emit_add( func
, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2097 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2098 STORE( func
, *inst
, 0, 0, chan_index
); /* dest[ch] = xmm0 */
2102 case TGSI_OPCODE_INDEX
:
2106 case TGSI_OPCODE_NEGATE
:
2110 case TGSI_OPCODE_FRAC
:
2111 /* TGSI_OPCODE_FRC */
2112 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2113 FETCH( func
, *inst
, 0, 0, chan_index
);
2114 emit_frc( func
, 0, 0 );
2115 STORE( func
, *inst
, 0, 0, chan_index
);
2119 case TGSI_OPCODE_CLAMP
:
2123 case TGSI_OPCODE_FLOOR
:
2124 /* TGSI_OPCODE_FLR */
2125 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2126 FETCH( func
, *inst
, 0, 0, chan_index
);
2127 emit_flr( func
, 0, 0 );
2128 STORE( func
, *inst
, 0, 0, chan_index
);
2132 case TGSI_OPCODE_ROUND
:
2133 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2134 FETCH( func
, *inst
, 0, 0, chan_index
);
2135 emit_rnd( func
, 0, 0 );
2136 STORE( func
, *inst
, 0, 0, chan_index
);
2140 case TGSI_OPCODE_EXPBASE2
:
2141 /* TGSI_OPCODE_EX2 */
2142 FETCH( func
, *inst
, 0, 0, CHAN_X
);
2143 emit_ex2( func
, 0, 0 );
2144 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2145 STORE( func
, *inst
, 0, 0, chan_index
);
2149 case TGSI_OPCODE_LOGBASE2
:
2150 /* TGSI_OPCODE_LG2 */
2151 FETCH( func
, *inst
, 0, 0, CHAN_X
);
2152 emit_lg2( func
, 0, 0 );
2153 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2154 STORE( func
, *inst
, 0, 0, chan_index
);
2158 case TGSI_OPCODE_POWER
:
2159 /* TGSI_OPCODE_POW */
2160 FETCH( func
, *inst
, 0, 0, CHAN_X
);
2161 FETCH( func
, *inst
, 1, 1, CHAN_X
);
2162 emit_pow( func
, 0, 0, 0, 1 );
2163 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2164 STORE( func
, *inst
, 0, 0, chan_index
);
2168 case TGSI_OPCODE_CROSSPRODUCT
:
2169 /* TGSI_OPCODE_XPD */
2170 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
2171 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ) {
2172 FETCH( func
, *inst
, 1, 1, CHAN_Z
);
2173 FETCH( func
, *inst
, 3, 0, CHAN_Z
);
2175 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
2176 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
2177 FETCH( func
, *inst
, 0, 0, CHAN_Y
);
2178 FETCH( func
, *inst
, 4, 1, CHAN_Y
);
2180 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) {
2181 emit_MOV( func
, 2, 0 );
2182 emit_mul( func
, 2, 1 );
2183 emit_MOV( func
, 5, 3 );
2184 emit_mul( func
, 5, 4 );
2185 emit_sub( func
, 2, 5 );
2186 STORE( func
, *inst
, 2, 0, CHAN_X
);
2188 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ||
2189 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
2190 FETCH( func
, *inst
, 2, 1, CHAN_X
);
2191 FETCH( func
, *inst
, 5, 0, CHAN_X
);
2193 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) {
2194 emit_mul( func
, 3, 2 );
2195 emit_mul( func
, 1, 5 );
2196 emit_sub( func
, 3, 1 );
2197 STORE( func
, *inst
, 3, 0, CHAN_Y
);
2199 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) {
2200 emit_mul( func
, 5, 4 );
2201 emit_mul( func
, 0, 2 );
2202 emit_sub( func
, 5, 0 );
2203 STORE( func
, *inst
, 5, 0, CHAN_Z
);
2205 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) {
2211 STORE( func
, *inst
, 0, 0, CHAN_W
);
2215 case TGSI_OPCODE_MULTIPLYMATRIX
:
2219 case TGSI_OPCODE_ABS
:
2220 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2221 FETCH( func
, *inst
, 0, 0, chan_index
);
2222 emit_abs( func
, 0) ;
2224 STORE( func
, *inst
, 0, 0, chan_index
);
2228 case TGSI_OPCODE_RCC
:
2232 case TGSI_OPCODE_DPH
:
2233 FETCH( func
, *inst
, 0, 0, CHAN_X
);
2234 FETCH( func
, *inst
, 1, 1, CHAN_X
);
2235 emit_mul( func
, 0, 1 );
2236 FETCH( func
, *inst
, 1, 0, CHAN_Y
);
2237 FETCH( func
, *inst
, 2, 1, CHAN_Y
);
2238 emit_mul( func
, 1, 2 );
2239 emit_add( func
, 0, 1 );
2240 FETCH( func
, *inst
, 1, 0, CHAN_Z
);
2241 FETCH( func
, *inst
, 2, 1, CHAN_Z
);
2242 emit_mul( func
, 1, 2 );
2243 emit_add( func
, 0, 1 );
2244 FETCH( func
, *inst
, 1, 1, CHAN_W
);
2245 emit_add( func
, 0, 1 );
2246 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2247 STORE( func
, *inst
, 0, 0, chan_index
);
2251 case TGSI_OPCODE_COS
:
2252 FETCH( func
, *inst
, 0, 0, CHAN_X
);
2253 emit_cos( func
, 0, 0 );
2254 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2255 STORE( func
, *inst
, 0, 0, chan_index
);
2259 case TGSI_OPCODE_DDX
:
2263 case TGSI_OPCODE_DDY
:
2267 case TGSI_OPCODE_KILP
:
2268 /* predicated kill */
2270 return 0; /* XXX fix me */
2273 case TGSI_OPCODE_KIL
:
2274 /* conditional kill */
2275 emit_kil( func
, &inst
->FullSrcRegisters
[0] );
2278 case TGSI_OPCODE_PK2H
:
2282 case TGSI_OPCODE_PK2US
:
2286 case TGSI_OPCODE_PK4B
:
2290 case TGSI_OPCODE_PK4UB
:
2294 case TGSI_OPCODE_RFL
:
2298 case TGSI_OPCODE_SEQ
:
2302 case TGSI_OPCODE_SFL
:
2306 case TGSI_OPCODE_SGT
:
2310 case TGSI_OPCODE_SIN
:
2311 FETCH( func
, *inst
, 0, 0, CHAN_X
);
2312 emit_sin( func
, 0, 0 );
2313 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2314 STORE( func
, *inst
, 0, 0, chan_index
);
2318 case TGSI_OPCODE_SLE
:
2322 case TGSI_OPCODE_SNE
:
2326 case TGSI_OPCODE_STR
:
2330 case TGSI_OPCODE_TEX
:
2331 emit_tex( func
, inst
, FALSE
, FALSE
);
2334 case TGSI_OPCODE_TXD
:
2338 case TGSI_OPCODE_UP2H
:
2342 case TGSI_OPCODE_UP2US
:
2346 case TGSI_OPCODE_UP4B
:
2350 case TGSI_OPCODE_UP4UB
:
2354 case TGSI_OPCODE_X2D
:
2358 case TGSI_OPCODE_ARA
:
2362 case TGSI_OPCODE_ARR
:
2363 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2364 FETCH( func
, *inst
, 0, 0, chan_index
);
2365 emit_rnd( func
, 0, 0 );
2366 emit_f2it( func
, 0 );
2367 STORE( func
, *inst
, 0, 0, chan_index
);
2371 case TGSI_OPCODE_BRA
:
2375 case TGSI_OPCODE_CAL
:
2379 case TGSI_OPCODE_RET
:
2383 case TGSI_OPCODE_END
:
2386 case TGSI_OPCODE_SSG
:
2387 /* TGSI_OPCODE_SGN */
2388 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2389 FETCH( func
, *inst
, 0, 0, chan_index
);
2390 emit_sgn( func
, 0, 0 );
2391 STORE( func
, *inst
, 0, 0, chan_index
);
2395 case TGSI_OPCODE_CMP
:
2396 emit_cmp (func
, inst
);
2399 case TGSI_OPCODE_SCS
:
2400 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) {
2401 FETCH( func
, *inst
, 0, 0, CHAN_X
);
2402 emit_cos( func
, 0, 0 );
2403 STORE( func
, *inst
, 0, 0, CHAN_X
);
2405 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) {
2406 FETCH( func
, *inst
, 0, 0, CHAN_X
);
2407 emit_sin( func
, 0, 0 );
2408 STORE( func
, *inst
, 0, 0, CHAN_Y
);
2410 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) {
2414 TGSI_EXEC_TEMP_00000000_I
,
2415 TGSI_EXEC_TEMP_00000000_C
);
2416 STORE( func
, *inst
, 0, 0, CHAN_Z
);
2418 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) {
2424 STORE( func
, *inst
, 0, 0, CHAN_W
);
2428 case TGSI_OPCODE_TXB
:
2429 emit_tex( func
, inst
, TRUE
, FALSE
);
2432 case TGSI_OPCODE_NRM
:
2434 case TGSI_OPCODE_NRM4
:
2435 /* 3 or 4-component normalization */
2437 uint dims
= (inst
->Instruction
.Opcode
== TGSI_OPCODE_NRM
) ? 3 : 4;
2439 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_X
) ||
2440 IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_Y
) ||
2441 IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_Z
) ||
2442 (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_W
) && dims
== 4)) {
2444 /* NOTE: Cannot use xmm regs 2/3 here (see emit_rsqrt() above). */
2447 /* xmm0 = src.x * src.x */
2448 FETCH(func
, *inst
, 0, 0, CHAN_X
);
2449 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_X
)) {
2450 emit_MOV(func
, 4, 0);
2452 emit_mul(func
, 0, 0);
2455 /* xmm0 = xmm0 + src.y * src.y */
2456 FETCH(func
, *inst
, 1, 0, CHAN_Y
);
2457 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_Y
)) {
2458 emit_MOV(func
, 5, 1);
2460 emit_mul(func
, 1, 1);
2461 emit_add(func
, 0, 1);
2464 /* xmm0 = xmm0 + src.z * src.z */
2465 FETCH(func
, *inst
, 1, 0, CHAN_Z
);
2466 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_Z
)) {
2467 emit_MOV(func
, 6, 1);
2469 emit_mul(func
, 1, 1);
2470 emit_add(func
, 0, 1);
2474 /* xmm0 = xmm0 + src.w * src.w */
2475 FETCH(func
, *inst
, 1, 0, CHAN_W
);
2476 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_W
)) {
2477 emit_MOV(func
, 7, 1);
2479 emit_mul(func
, 1, 1);
2480 emit_add(func
, 0, 1);
2483 /* xmm1 = 1 / sqrt(xmm0) */
2484 emit_rsqrt(func
, 1, 0);
2486 /* dst.x = xmm1 * src.x */
2487 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_X
)) {
2488 emit_mul(func
, 4, 1);
2489 STORE(func
, *inst
, 4, 0, CHAN_X
);
2492 /* dst.y = xmm1 * src.y */
2493 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_Y
)) {
2494 emit_mul(func
, 5, 1);
2495 STORE(func
, *inst
, 5, 0, CHAN_Y
);
2498 /* dst.z = xmm1 * src.z */
2499 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_Z
)) {
2500 emit_mul(func
, 6, 1);
2501 STORE(func
, *inst
, 6, 0, CHAN_Z
);
2504 /* dst.w = xmm1 * src.w */
2505 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_X
) && dims
== 4) {
2506 emit_mul(func
, 7, 1);
2507 STORE(func
, *inst
, 7, 0, CHAN_W
);
2512 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_W
) && dims
== 3) {
2513 emit_tempf(func
, 0, TEMP_ONE_I
, TEMP_ONE_C
);
2514 STORE(func
, *inst
, 0, 0, CHAN_W
);
2519 case TGSI_OPCODE_DIV
:
2523 case TGSI_OPCODE_DP2
:
2524 FETCH( func
, *inst
, 0, 0, CHAN_X
); /* xmm0 = src[0].x */
2525 FETCH( func
, *inst
, 1, 1, CHAN_X
); /* xmm1 = src[1].x */
2526 emit_mul( func
, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
2527 FETCH( func
, *inst
, 1, 0, CHAN_Y
); /* xmm1 = src[0].y */
2528 FETCH( func
, *inst
, 2, 1, CHAN_Y
); /* xmm2 = src[1].y */
2529 emit_mul( func
, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
2530 emit_add( func
, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2531 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2532 STORE( func
, *inst
, 0, 0, chan_index
); /* dest[ch] = xmm0 */
2536 case TGSI_OPCODE_TXL
:
2537 emit_tex( func
, inst
, TRUE
, FALSE
);
2540 case TGSI_OPCODE_TXP
:
2541 emit_tex( func
, inst
, FALSE
, TRUE
);
2544 case TGSI_OPCODE_BRK
:
2548 case TGSI_OPCODE_IF
:
2552 case TGSI_OPCODE_LOOP
:
2556 case TGSI_OPCODE_REP
:
2560 case TGSI_OPCODE_ELSE
:
2564 case TGSI_OPCODE_ENDIF
:
2568 case TGSI_OPCODE_ENDLOOP
:
2572 case TGSI_OPCODE_ENDREP
:
2576 case TGSI_OPCODE_PUSHA
:
2580 case TGSI_OPCODE_POPA
:
2584 case TGSI_OPCODE_CEIL
:
2588 case TGSI_OPCODE_I2F
:
2592 case TGSI_OPCODE_NOT
:
2596 case TGSI_OPCODE_TRUNC
:
2597 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2598 FETCH( func
, *inst
, 0, 0, chan_index
);
2599 emit_f2it( func
, 0 );
2600 emit_i2f( func
, 0 );
2601 STORE( func
, *inst
, 0, 0, chan_index
);
2605 case TGSI_OPCODE_SHL
:
2609 case TGSI_OPCODE_SHR
:
2613 case TGSI_OPCODE_AND
:
2617 case TGSI_OPCODE_OR
:
2621 case TGSI_OPCODE_MOD
:
2625 case TGSI_OPCODE_XOR
:
2629 case TGSI_OPCODE_SAD
:
2633 case TGSI_OPCODE_TXF
:
2637 case TGSI_OPCODE_TXQ
:
2641 case TGSI_OPCODE_CONT
:
2645 case TGSI_OPCODE_EMIT
:
2649 case TGSI_OPCODE_ENDPRIM
:
2662 struct x86_function
*func
,
2663 struct tgsi_full_declaration
*decl
)
2665 if( decl
->Declaration
.File
== TGSI_FILE_INPUT
) {
2666 unsigned first
, last
, mask
;
2669 first
= decl
->DeclarationRange
.First
;
2670 last
= decl
->DeclarationRange
.Last
;
2671 mask
= decl
->Declaration
.UsageMask
;
2673 for( i
= first
; i
<= last
; i
++ ) {
2674 for( j
= 0; j
< NUM_CHANNELS
; j
++ ) {
2675 if( mask
& (1 << j
) ) {
2676 switch( decl
->Declaration
.Interpolate
) {
2677 case TGSI_INTERPOLATE_CONSTANT
:
2678 emit_coef_a0( func
, 0, i
, j
);
2679 emit_inputs( func
, 0, i
, j
);
2682 case TGSI_INTERPOLATE_LINEAR
:
2683 emit_tempf( func
, 0, 0, TGSI_SWIZZLE_X
);
2684 emit_coef_dadx( func
, 1, i
, j
);
2685 emit_tempf( func
, 2, 0, TGSI_SWIZZLE_Y
);
2686 emit_coef_dady( func
, 3, i
, j
);
2687 emit_mul( func
, 0, 1 ); /* x * dadx */
2688 emit_coef_a0( func
, 4, i
, j
);
2689 emit_mul( func
, 2, 3 ); /* y * dady */
2690 emit_add( func
, 0, 4 ); /* x * dadx + a0 */
2691 emit_add( func
, 0, 2 ); /* x * dadx + y * dady + a0 */
2692 emit_inputs( func
, 0, i
, j
);
2695 case TGSI_INTERPOLATE_PERSPECTIVE
:
2696 emit_tempf( func
, 0, 0, TGSI_SWIZZLE_X
);
2697 emit_coef_dadx( func
, 1, i
, j
);
2698 emit_tempf( func
, 2, 0, TGSI_SWIZZLE_Y
);
2699 emit_coef_dady( func
, 3, i
, j
);
2700 emit_mul( func
, 0, 1 ); /* x * dadx */
2701 emit_tempf( func
, 4, 0, TGSI_SWIZZLE_W
);
2702 emit_coef_a0( func
, 5, i
, j
);
2703 emit_rcp( func
, 4, 4 ); /* 1.0 / w */
2704 emit_mul( func
, 2, 3 ); /* y * dady */
2705 emit_add( func
, 0, 5 ); /* x * dadx + a0 */
2706 emit_add( func
, 0, 2 ); /* x * dadx + y * dady + a0 */
2707 emit_mul( func
, 0, 4 ); /* (x * dadx + y * dady + a0) / w */
2708 emit_inputs( func
, 0, i
, j
);
2721 static void aos_to_soa( struct x86_function
*func
,
2727 struct x86_reg soa_input
= x86_make_reg( file_REG32
, reg_AX
);
2728 struct x86_reg aos_input
= x86_make_reg( file_REG32
, reg_BX
);
2729 struct x86_reg num_inputs
= x86_make_reg( file_REG32
, reg_CX
);
2730 struct x86_reg stride
= x86_make_reg( file_REG32
, reg_DX
);
2735 x86_push( func
, x86_make_reg( file_REG32
, reg_BX
) );
2737 x86_mov( func
, aos_input
, x86_fn_arg( func
, arg_aos
) );
2738 x86_mov( func
, soa_input
, x86_fn_arg( func
, arg_machine
) );
2739 x86_lea( func
, soa_input
,
2740 x86_make_disp( soa_input
,
2741 Offset(struct tgsi_exec_machine
, Inputs
) ) );
2742 x86_mov( func
, num_inputs
, x86_fn_arg( func
, arg_num
) );
2743 x86_mov( func
, stride
, x86_fn_arg( func
, arg_stride
) );
2746 inner_loop
= x86_get_label( func
);
2748 x86_push( func
, aos_input
);
2749 sse_movlps( func
, make_xmm( 0 ), x86_make_disp( aos_input
, 0 ) );
2750 sse_movlps( func
, make_xmm( 3 ), x86_make_disp( aos_input
, 8 ) );
2751 x86_add( func
, aos_input
, stride
);
2752 sse_movhps( func
, make_xmm( 0 ), x86_make_disp( aos_input
, 0 ) );
2753 sse_movhps( func
, make_xmm( 3 ), x86_make_disp( aos_input
, 8 ) );
2754 x86_add( func
, aos_input
, stride
);
2755 sse_movlps( func
, make_xmm( 1 ), x86_make_disp( aos_input
, 0 ) );
2756 sse_movlps( func
, make_xmm( 4 ), x86_make_disp( aos_input
, 8 ) );
2757 x86_add( func
, aos_input
, stride
);
2758 sse_movhps( func
, make_xmm( 1 ), x86_make_disp( aos_input
, 0 ) );
2759 sse_movhps( func
, make_xmm( 4 ), x86_make_disp( aos_input
, 8 ) );
2760 x86_pop( func
, aos_input
);
2762 sse_movaps( func
, make_xmm( 2 ), make_xmm( 0 ) );
2763 sse_movaps( func
, make_xmm( 5 ), make_xmm( 3 ) );
2764 sse_shufps( func
, make_xmm( 0 ), make_xmm( 1 ), 0x88 );
2765 sse_shufps( func
, make_xmm( 2 ), make_xmm( 1 ), 0xdd );
2766 sse_shufps( func
, make_xmm( 3 ), make_xmm( 4 ), 0x88 );
2767 sse_shufps( func
, make_xmm( 5 ), make_xmm( 4 ), 0xdd );
2769 sse_movups( func
, x86_make_disp( soa_input
, 0 ), make_xmm( 0 ) );
2770 sse_movups( func
, x86_make_disp( soa_input
, 16 ), make_xmm( 2 ) );
2771 sse_movups( func
, x86_make_disp( soa_input
, 32 ), make_xmm( 3 ) );
2772 sse_movups( func
, x86_make_disp( soa_input
, 48 ), make_xmm( 5 ) );
2774 /* Advance to next input */
2775 x86_lea( func
, aos_input
, x86_make_disp(aos_input
, 16) );
2776 x86_lea( func
, soa_input
, x86_make_disp(soa_input
, 64) );
2778 /* while --num_inputs */
2779 x86_dec( func
, num_inputs
);
2780 x86_jcc( func
, cc_NE
, inner_loop
);
2783 x86_pop( func
, x86_make_reg( file_REG32
, reg_BX
) );
2786 static void soa_to_aos( struct x86_function
*func
,
2792 struct x86_reg soa_output
= x86_make_reg( file_REG32
, reg_AX
);
2793 struct x86_reg aos_output
= x86_make_reg( file_REG32
, reg_BX
);
2794 struct x86_reg num_outputs
= x86_make_reg( file_REG32
, reg_CX
);
2795 struct x86_reg temp
= x86_make_reg( file_REG32
, reg_DX
);
2799 x86_push( func
, x86_make_reg( file_REG32
, reg_BX
) );
2801 x86_mov( func
, aos_output
, x86_fn_arg( func
, arg_aos
) );
2802 x86_mov( func
, soa_output
, x86_fn_arg( func
, arg_machine
) );
2803 x86_lea( func
, soa_output
,
2804 x86_make_disp( soa_output
,
2805 Offset(struct tgsi_exec_machine
, Outputs
) ) );
2806 x86_mov( func
, num_outputs
, x86_fn_arg( func
, arg_num
) );
2809 inner_loop
= x86_get_label( func
);
2811 sse_movups( func
, make_xmm( 0 ), x86_make_disp( soa_output
, 0 ) );
2812 sse_movups( func
, make_xmm( 1 ), x86_make_disp( soa_output
, 16 ) );
2813 sse_movups( func
, make_xmm( 3 ), x86_make_disp( soa_output
, 32 ) );
2814 sse_movups( func
, make_xmm( 4 ), x86_make_disp( soa_output
, 48 ) );
2816 sse_movaps( func
, make_xmm( 2 ), make_xmm( 0 ) );
2817 sse_movaps( func
, make_xmm( 5 ), make_xmm( 3 ) );
2818 sse_unpcklps( func
, make_xmm( 0 ), make_xmm( 1 ) );
2819 sse_unpckhps( func
, make_xmm( 2 ), make_xmm( 1 ) );
2820 sse_unpcklps( func
, make_xmm( 3 ), make_xmm( 4 ) );
2821 sse_unpckhps( func
, make_xmm( 5 ), make_xmm( 4 ) );
2823 x86_mov( func
, temp
, x86_fn_arg( func
, arg_stride
) );
2824 x86_push( func
, aos_output
);
2825 sse_movlps( func
, x86_make_disp( aos_output
, 0 ), make_xmm( 0 ) );
2826 sse_movlps( func
, x86_make_disp( aos_output
, 8 ), make_xmm( 3 ) );
2827 x86_add( func
, aos_output
, temp
);
2828 sse_movhps( func
, x86_make_disp( aos_output
, 0 ), make_xmm( 0 ) );
2829 sse_movhps( func
, x86_make_disp( aos_output
, 8 ), make_xmm( 3 ) );
2830 x86_add( func
, aos_output
, temp
);
2831 sse_movlps( func
, x86_make_disp( aos_output
, 0 ), make_xmm( 2 ) );
2832 sse_movlps( func
, x86_make_disp( aos_output
, 8 ), make_xmm( 5 ) );
2833 x86_add( func
, aos_output
, temp
);
2834 sse_movhps( func
, x86_make_disp( aos_output
, 0 ), make_xmm( 2 ) );
2835 sse_movhps( func
, x86_make_disp( aos_output
, 8 ), make_xmm( 5 ) );
2836 x86_pop( func
, aos_output
);
2838 /* Advance to next output */
2839 x86_lea( func
, aos_output
, x86_make_disp(aos_output
, 16) );
2840 x86_lea( func
, soa_output
, x86_make_disp(soa_output
, 64) );
2842 /* while --num_outputs */
2843 x86_dec( func
, num_outputs
);
2844 x86_jcc( func
, cc_NE
, inner_loop
);
2847 x86_pop( func
, x86_make_reg( file_REG32
, reg_BX
) );
2851 * Translate a TGSI vertex/fragment shader to SSE2 code.
2852 * Slightly different things are done for vertex vs. fragment shaders.
2854 * \param tokens the TGSI input shader
2855 * \param func the output SSE code/function
2856 * \param immediates buffer to place immediates, later passed to SSE func
2857 * \param return 1 for success, 0 if translation failed
2861 const struct tgsi_token
*tokens
,
2862 struct x86_function
*func
,
2863 float (*immediates
)[4],
2864 boolean do_swizzles
)
2866 struct tgsi_parse_context parse
;
2868 uint num_immediates
= 0;
2872 func
->csr
= func
->store
;
2874 tgsi_parse_init( &parse
, tokens
);
2876 /* Can't just use EDI, EBX without save/restoring them:
2878 x86_push( func
, x86_make_reg( file_REG32
, reg_BX
) );
2879 x86_push( func
, x86_make_reg( file_REG32
, reg_DI
) );
2882 * Different function args for vertex/fragment shaders:
2884 if (parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_VERTEX
) {
2890 6 ); /* input_stride */
2896 x86_fn_arg( func
, 1 ) );
2900 x86_fn_arg( func
, 2 ) );
2903 get_immediate_base(),
2904 x86_fn_arg( func
, 3 ) );
2906 if (parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_FRAGMENT
) {
2910 x86_fn_arg( func
, 4 ) );
2915 x86_make_disp( get_machine_base(),
2916 Offset( struct tgsi_exec_machine
, Samplers
) ) );
2920 while( !tgsi_parse_end_of_tokens( &parse
) && ok
) {
2921 tgsi_parse_token( &parse
);
2923 switch( parse
.FullToken
.Token
.Type
) {
2924 case TGSI_TOKEN_TYPE_DECLARATION
:
2925 if (parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_FRAGMENT
) {
2928 &parse
.FullToken
.FullDeclaration
);
2932 case TGSI_TOKEN_TYPE_INSTRUCTION
:
2933 ok
= emit_instruction(
2935 &parse
.FullToken
.FullInstruction
);
2938 debug_printf("failed to translate tgsi opcode %d to SSE (%s)\n",
2939 parse
.FullToken
.FullInstruction
.Instruction
.Opcode
,
2940 parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_VERTEX
?
2941 "vertex shader" : "fragment shader");
2945 case TGSI_TOKEN_TYPE_IMMEDIATE
:
2946 /* simply copy the immediate values into the next immediates[] slot */
2948 const uint size
= parse
.FullToken
.FullImmediate
.Immediate
.NrTokens
- 1;
2951 assert(num_immediates
< TGSI_EXEC_NUM_IMMEDIATES
);
2952 for( i
= 0; i
< size
; i
++ ) {
2953 immediates
[num_immediates
][i
] =
2954 parse
.FullToken
.FullImmediate
.u
.ImmediateFloat32
[i
].Float
;
2957 debug_printf("SSE FS immediate[%d] = %f %f %f %f\n",
2959 immediates
[num_immediates
][0],
2960 immediates
[num_immediates
][1],
2961 immediates
[num_immediates
][2],
2962 immediates
[num_immediates
][3]);
2974 if (parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_VERTEX
) {
2979 8, /* num_outputs */
2980 9 ); /* output_stride */
2983 /* Can't just use EBX, EDI without save/restoring them:
2985 x86_pop( func
, x86_make_reg( file_REG32
, reg_DI
) );
2986 x86_pop( func
, x86_make_reg( file_REG32
, reg_BX
) );
2990 tgsi_parse_free( &parse
);
2995 #endif /* PIPE_ARCH_X86 */