1 /**************************************************************************
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **************************************************************************/
28 #include "pipe/p_config.h"
30 #if defined(PIPE_ARCH_X86)
32 #include "util/u_debug.h"
33 #include "pipe/p_shader_tokens.h"
34 #include "util/u_math.h"
35 #include "util/u_memory.h"
36 #if defined(PIPE_ARCH_SSE)
37 #include "util/u_sse.h"
39 #include "tgsi/tgsi_parse.h"
40 #include "tgsi/tgsi_util.h"
41 #include "tgsi_exec.h"
42 #include "tgsi_sse2.h"
44 #include "rtasm/rtasm_x86sse.h"
48 * This costs about 100fps (close to 10%) in gears:
50 #define HIGH_PRECISION 1
55 #define FOR_EACH_CHANNEL( CHAN )\
56 for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)
58 #define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
59 ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))
61 #define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
62 if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))
64 #define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
65 FOR_EACH_CHANNEL( CHAN )\
66 IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )
73 #define TEMP_ONE_I TGSI_EXEC_TEMP_ONE_I
74 #define TEMP_ONE_C TGSI_EXEC_TEMP_ONE_C
76 #define TEMP_R0 TGSI_EXEC_TEMP_R0
77 #define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
78 #define TEMP_EXEC_MASK_I TGSI_EXEC_MASK_I
79 #define TEMP_EXEC_MASK_C TGSI_EXEC_MASK_C
83 * X86 utility functions.
92 (enum x86_reg_name
) xmm
);
96 * X86 register mapping helpers.
100 get_const_base( void )
107 static struct x86_reg
108 get_machine_base( void )
115 static struct x86_reg
116 get_input_base( void )
118 return x86_make_disp(
120 Offset(struct tgsi_exec_machine
, Inputs
) );
123 static struct x86_reg
124 get_output_base( void )
126 return x86_make_disp(
128 Offset(struct tgsi_exec_machine
, Outputs
) );
131 static struct x86_reg
132 get_temp_base( void )
134 return x86_make_disp(
136 Offset(struct tgsi_exec_machine
, Temps
) );
139 static struct x86_reg
140 get_coef_base( void )
147 static struct x86_reg
148 get_immediate_base( void )
157 * Data access helpers.
161 static struct x86_reg
166 return x86_make_disp(
167 get_immediate_base(),
168 (vec
* 4 + chan
) * 4 );
171 static struct x86_reg
176 return x86_make_disp(
178 (vec
* 4 + chan
) * 4 );
181 static struct x86_reg
186 return x86_make_disp(
188 (vec
* 4 + chan
) * 16 );
191 static struct x86_reg
196 return x86_make_disp(
198 (vec
* 4 + chan
) * 16 );
201 static struct x86_reg
206 return x86_make_disp(
208 (vec
* 4 + chan
) * 16 );
211 static struct x86_reg
217 return x86_make_disp(
219 ((vec
* 3 + member
) * 4 + chan
) * 4 );
225 struct x86_function
*func
)
232 * Data fetch helpers.
236 * Copy a shader constant to xmm register
237 * \param xmm the destination xmm register
238 * \param vec the src const buffer index
239 * \param chan src channel to fetch (X, Y, Z or W)
243 struct x86_function
*func
,
252 /* 'vec' is the offset from the address register's value.
253 * We're loading CONST[ADDR+vec] into an xmm register.
255 struct x86_reg r0
= get_input_base();
256 struct x86_reg r1
= get_output_base();
259 assert( indirectFile
== TGSI_FILE_ADDRESS
);
260 assert( indirectIndex
== 0 );
262 x86_push( func
, r0
);
263 x86_push( func
, r1
);
266 * Loop over the four pixels or vertices in the quad.
267 * Get the value of the address (offset) register for pixel/vertex[i],
268 * add it to the src offset and index into the constant buffer.
269 * Note that we're working on SOA data.
270 * If any of the pixel/vertex execution channels are unused their
271 * values will be garbage. It's very important that we don't use
272 * those garbage values as indexes into the constant buffer since
273 * that'll cause segfaults.
274 * The solution is to bitwise-AND the offset with the execution mask
275 * register whose values are either 0 or ~0.
276 * The caller must setup the execution mask register to indicate
277 * which channels are valid/alive before running the shader.
278 * The execution mask will also figure into loops and conditionals
281 for (i
= 0; i
< QUAD_SIZE
; i
++) {
282 /* r1 = address register[i] */
283 x86_mov( func
, r1
, x86_make_disp( get_temp( TEMP_ADDR
, CHAN_X
), i
* 4 ) );
284 /* r0 = execution mask[i] */
285 x86_mov( func
, r0
, x86_make_disp( get_temp( TEMP_EXEC_MASK_I
, TEMP_EXEC_MASK_C
), i
* 4 ) );
287 x86_and( func
, r1
, r0
);
288 /* r0 = 'vec', the offset */
289 x86_lea( func
, r0
, get_const( vec
, chan
) );
291 /* Quick hack to multiply r1 by 16 -- need to add SHL to rtasm.
293 x86_add( func
, r1
, r1
);
294 x86_add( func
, r1
, r1
);
295 x86_add( func
, r1
, r1
);
296 x86_add( func
, r1
, r1
);
298 x86_add( func
, r0
, r1
); /* r0 = r0 + r1 */
299 x86_mov( func
, r1
, x86_deref( r0
) );
300 x86_mov( func
, x86_make_disp( get_temp( TEMP_R0
, CHAN_X
), i
* 4 ), r1
);
309 get_temp( TEMP_R0
, CHAN_X
) );
312 /* 'vec' is the index into the src register file, such as TEMP[vec] */
318 get_const( vec
, chan
) );
323 SHUF( 0, 0, 0, 0 ) );
329 struct x86_function
*func
,
337 get_immediate( vec
, chan
) );
342 SHUF( 0, 0, 0, 0 ) );
347 * Copy a shader input to xmm register
348 * \param xmm the destination xmm register
349 * \param vec the src input attrib
350 * \param chan src channel to fetch (X, Y, Z or W)
354 struct x86_function
*func
,
362 get_input( vec
, chan
) );
366 * Store an xmm register to a shader output
367 * \param xmm the source xmm register
368 * \param vec the dest output attrib
369 * \param chan src dest channel to store (X, Y, Z or W)
373 struct x86_function
*func
,
380 get_output( vec
, chan
),
385 * Copy a shader temporary to xmm register
386 * \param xmm the destination xmm register
387 * \param vec the src temp register
388 * \param chan src channel to fetch (X, Y, Z or W)
392 struct x86_function
*func
,
400 get_temp( vec
, chan
) );
404 * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
405 * \param xmm the destination xmm register
406 * \param vec the src input/attribute coefficient index
407 * \param chan src channel to fetch (X, Y, Z or W)
408 * \param member 0=a0, 1=dadx, 2=dady
412 struct x86_function
*func
,
421 get_coef( vec
, chan
, member
) );
426 SHUF( 0, 0, 0, 0 ) );
430 * Data store helpers.
435 struct x86_function
*func
,
442 get_input( vec
, chan
),
448 struct x86_function
*func
,
455 get_temp( vec
, chan
),
461 struct x86_function
*func
,
471 vec
+ TGSI_EXEC_TEMP_ADDR
,
476 * Coefficent fetch helpers.
481 struct x86_function
*func
,
496 struct x86_function
*func
,
511 struct x86_function
*func
,
525 * Function call helpers.
529 * NOTE: In gcc, if the destination uses the SSE intrinsics, then it must be
530 * defined with __attribute__((force_align_arg_pointer)), as we do not guarantee
531 * that the stack pointer is 16 byte aligned, as expected.
535 struct x86_function
*func
,
538 void (PIPE_CDECL
*code
)() )
540 struct x86_reg ecx
= x86_make_reg( file_REG32
, reg_CX
);
544 /* Bitmask of the xmm registers to save */
545 xmm_mask
= (1 << xmm_save
) - 1;
546 xmm_mask
&= ~(1 << xmm_dst
);
550 x86_make_reg( file_REG32
, reg_AX
) );
553 x86_make_reg( file_REG32
, reg_CX
) );
556 x86_make_reg( file_REG32
, reg_DX
) );
558 /* Store XMM regs to the stack
560 for(i
= 0, n
= 0; i
< 8; ++i
)
561 if(xmm_mask
& (1 << i
))
566 x86_make_reg( file_REG32
, reg_SP
),
569 for(i
= 0, n
= 0; i
< 8; ++i
)
570 if(xmm_mask
& (1 << i
)) {
573 x86_make_disp( x86_make_reg( file_REG32
, reg_SP
), n
*16 ),
578 /* Load the address of the buffer we use for passing arguments and
584 get_temp( TEMP_R0
, 0 ) );
586 /* Push actual function arguments (currently just the pointer to
587 * the buffer above), and call the function:
589 x86_push( func
, ecx
);
590 x86_mov_reg_imm( func
, ecx
, (unsigned long) code
);
591 x86_call( func
, ecx
);
595 /* Pop the saved XMM regs:
597 for(i
= 0, n
= 0; i
< 8; ++i
)
598 if(xmm_mask
& (1 << i
)) {
602 x86_make_disp( x86_make_reg( file_REG32
, reg_SP
), n
*16 ) );
608 x86_make_reg( file_REG32
, reg_SP
),
611 /* Restore GP registers in a reverse order.
615 x86_make_reg( file_REG32
, reg_DX
) );
618 x86_make_reg( file_REG32
, reg_CX
) );
621 x86_make_reg( file_REG32
, reg_AX
) );
626 emit_func_call_dst_src1(
627 struct x86_function
*func
,
631 void (PIPE_CDECL
*code
)() )
633 /* Store our input parameters (in xmm regs) to the buffer we use
634 * for passing arguments. We will pass a pointer to this buffer as
635 * the actual function argument.
639 get_temp( TEMP_R0
, 0 ),
640 make_xmm( xmm_src0
) );
651 get_temp( TEMP_R0
, 0 ) );
656 emit_func_call_dst_src2(
657 struct x86_function
*func
,
662 void (PIPE_CDECL
*code
)() )
664 /* Store two inputs to parameter buffer.
668 get_temp( TEMP_R0
, 0 ),
669 make_xmm( xmm_src0
) );
673 get_temp( TEMP_R0
, 1 ),
674 make_xmm( xmm_src1
) );
685 /* Retrieve the results:
690 get_temp( TEMP_R0
, 0 ) );
697 #if defined(PIPE_ARCH_SSE)
700 * Fast SSE2 implementation of special math functions.
703 #define POLY0(x, c0) _mm_set1_ps(c0)
704 #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
705 #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
706 #define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
707 #define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
708 #define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
710 #define EXP_POLY_DEGREE 3
711 #define LOG_POLY_DEGREE 5
714 * See http://www.devmaster.net/forums/showthread.php?p=43580
720 __m128 fpart
, expipart
, expfpart
;
722 x
= _mm_min_ps(x
, _mm_set1_ps( 129.00000f
));
723 x
= _mm_max_ps(x
, _mm_set1_ps(-126.99999f
));
725 /* ipart = int(x - 0.5) */
726 ipart
= _mm_cvtps_epi32(_mm_sub_ps(x
, _mm_set1_ps(0.5f
)));
728 /* fpart = x - ipart */
729 fpart
= _mm_sub_ps(x
, _mm_cvtepi32_ps(ipart
));
731 /* expipart = (float) (1 << ipart) */
732 expipart
= _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart
, _mm_set1_epi32(127)), 23));
734 /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
735 #if EXP_POLY_DEGREE == 5
736 expfpart
= POLY5(fpart
, 9.9999994e-1f
, 6.9315308e-1f
, 2.4015361e-1f
, 5.5826318e-2f
, 8.9893397e-3f
, 1.8775767e-3f
);
737 #elif EXP_POLY_DEGREE == 4
738 expfpart
= POLY4(fpart
, 1.0000026f
, 6.9300383e-1f
, 2.4144275e-1f
, 5.2011464e-2f
, 1.3534167e-2f
);
739 #elif EXP_POLY_DEGREE == 3
740 expfpart
= POLY3(fpart
, 9.9992520e-1f
, 6.9583356e-1f
, 2.2606716e-1f
, 7.8024521e-2f
);
741 #elif EXP_POLY_DEGREE == 2
742 expfpart
= POLY2(fpart
, 1.0017247f
, 6.5763628e-1f
, 3.3718944e-1f
);
747 return _mm_mul_ps(expipart
, expfpart
);
752 * See http://www.devmaster.net/forums/showthread.php?p=43580
757 __m128i expmask
= _mm_set1_epi32(0x7f800000);
758 __m128i mantmask
= _mm_set1_epi32(0x007fffff);
759 __m128 one
= _mm_set1_ps(1.0f
);
761 __m128i i
= _mm_castps_si128(x
);
763 /* exp = (float) exponent(x) */
764 __m128 exp
= _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i
, expmask
), 23), _mm_set1_epi32(127)));
766 /* mant = (float) mantissa(x) */
767 __m128 mant
= _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i
, mantmask
)), one
);
771 /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
772 * These coefficients can be generate with
773 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
775 #if LOG_POLY_DEGREE == 6
776 logmant
= POLY5(mant
, 3.11578814719469302614f
, -3.32419399085241980044f
, 2.59883907202499966007f
, -1.23152682416275988241f
, 0.318212422185251071475f
, -0.0344359067839062357313f
);
777 #elif LOG_POLY_DEGREE == 5
778 logmant
= POLY4(mant
, 2.8882704548164776201f
, -2.52074962577807006663f
, 1.48116647521213171641f
, -0.465725644288844778798f
, 0.0596515482674574969533f
);
779 #elif LOG_POLY_DEGREE == 4
780 logmant
= POLY3(mant
, 2.61761038894603480148f
, -1.75647175389045657003f
, 0.688243882994381274313f
, -0.107254423828329604454f
);
781 #elif LOG_POLY_DEGREE == 3
782 logmant
= POLY2(mant
, 2.28330284476918490682f
, -1.04913055217340124191f
, 0.204446009836232697516f
);
787 /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
788 logmant
= _mm_mul_ps(logmant
, _mm_sub_ps(mant
, one
));
790 return _mm_add_ps(logmant
, exp
);
795 powf4(__m128 x
, __m128 y
)
797 return exp2f4(_mm_mul_ps(log2f4(x
), y
));
800 #endif /* PIPE_ARCH_SSE */
805 * Low-level instruction translators.
810 struct x86_function
*func
,
817 TGSI_EXEC_TEMP_7FFFFFFF_I
,
818 TGSI_EXEC_TEMP_7FFFFFFF_C
) );
823 struct x86_function
*func
,
830 make_xmm( xmm_src
) );
833 static void PIPE_CDECL
837 store
[0] = cosf( store
[0] );
838 store
[1] = cosf( store
[1] );
839 store
[2] = cosf( store
[2] );
840 store
[3] = cosf( store
[3] );
845 struct x86_function
*func
,
849 emit_func_call_dst_src1(
857 static void PIPE_CDECL
858 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
859 __attribute__((force_align_arg_pointer
))
864 #if defined(PIPE_ARCH_SSE)
865 _mm_store_ps(&store
[0], exp2f4( _mm_load_ps(&store
[0]) ));
867 store
[0] = util_fast_exp2( store
[0] );
868 store
[1] = util_fast_exp2( store
[1] );
869 store
[2] = util_fast_exp2( store
[2] );
870 store
[3] = util_fast_exp2( store
[3] );
876 struct x86_function
*func
,
880 emit_func_call_dst_src1(
890 struct x86_function
*func
,
901 struct x86_function
*func
,
910 static void PIPE_CDECL
914 store
[0] = floorf( store
[0] );
915 store
[1] = floorf( store
[1] );
916 store
[2] = floorf( store
[2] );
917 store
[3] = floorf( store
[3] );
922 struct x86_function
*func
,
926 emit_func_call_dst_src1(
934 static void PIPE_CDECL
938 store
[0] -= floorf( store
[0] );
939 store
[1] -= floorf( store
[1] );
940 store
[2] -= floorf( store
[2] );
941 store
[3] -= floorf( store
[3] );
946 struct x86_function
*func
,
950 emit_func_call_dst_src1(
958 static void PIPE_CDECL
959 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
960 __attribute__((force_align_arg_pointer
))
965 #if defined(PIPE_ARCH_SSE)
966 _mm_store_ps(&store
[0], log2f4( _mm_load_ps(&store
[0]) ));
968 store
[0] = util_fast_log2( store
[0] );
969 store
[1] = util_fast_log2( store
[1] );
970 store
[2] = util_fast_log2( store
[2] );
971 store
[3] = util_fast_log2( store
[3] );
977 struct x86_function
*func
,
981 emit_func_call_dst_src1(
991 struct x86_function
*func
,
998 make_xmm( xmm_src
) );
1002 emit_mul (struct x86_function
*func
,
1008 make_xmm( xmm_dst
),
1009 make_xmm( xmm_src
) );
1014 struct x86_function
*func
,
1021 TGSI_EXEC_TEMP_80000000_I
,
1022 TGSI_EXEC_TEMP_80000000_C
) );
1025 static void PIPE_CDECL
1026 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
1027 __attribute__((force_align_arg_pointer
))
1032 #if defined(PIPE_ARCH_SSE)
1033 _mm_store_ps(&store
[0], powf4( _mm_load_ps(&store
[0]), _mm_load_ps(&store
[4]) ));
1035 store
[0] = util_fast_pow( store
[0], store
[4] );
1036 store
[1] = util_fast_pow( store
[1], store
[5] );
1037 store
[2] = util_fast_pow( store
[2], store
[6] );
1038 store
[3] = util_fast_pow( store
[3], store
[7] );
1044 struct x86_function
*func
,
1050 emit_func_call_dst_src2(
1061 struct x86_function
*func
,
1065 /* On Intel CPUs at least, this is only accurate to 12 bits -- not
1066 * good enough. Need to either emit a proper divide or use the
1067 * iterative technique described below in emit_rsqrt().
1071 make_xmm( xmm_dst
),
1072 make_xmm( xmm_src
) );
1075 static void PIPE_CDECL
1079 store
[0] = floorf( store
[0] + 0.5f
);
1080 store
[1] = floorf( store
[1] + 0.5f
);
1081 store
[2] = floorf( store
[2] + 0.5f
);
1082 store
[3] = floorf( store
[3] + 0.5f
);
1087 struct x86_function
*func
,
1091 emit_func_call_dst_src1(
1101 struct x86_function
*func
,
1106 /* Although rsqrtps() and rcpps() are low precision on some/all SSE
1107 * implementations, it is possible to improve its precision at
1108 * fairly low cost, using a newton/raphson step, as below:
1110 * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
1111 * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
1113 * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
1116 struct x86_reg dst
= make_xmm( xmm_dst
);
1117 struct x86_reg src
= make_xmm( xmm_src
);
1118 struct x86_reg tmp0
= make_xmm( 2 );
1119 struct x86_reg tmp1
= make_xmm( 3 );
1121 assert( xmm_dst
!= xmm_src
);
1122 assert( xmm_dst
!= 2 && xmm_dst
!= 3 );
1123 assert( xmm_src
!= 2 && xmm_src
!= 3 );
1125 sse_movaps( func
, dst
, get_temp( TGSI_EXEC_TEMP_HALF_I
, TGSI_EXEC_TEMP_HALF_C
) );
1126 sse_movaps( func
, tmp0
, get_temp( TGSI_EXEC_TEMP_THREE_I
, TGSI_EXEC_TEMP_THREE_C
) );
1127 sse_rsqrtps( func
, tmp1
, src
);
1128 sse_mulps( func
, src
, tmp1
);
1129 sse_mulps( func
, dst
, tmp1
);
1130 sse_mulps( func
, src
, tmp1
);
1131 sse_subps( func
, tmp0
, src
);
1132 sse_mulps( func
, dst
, tmp0
);
1135 /* On Intel CPUs at least, this is only accurate to 12 bits -- not
1140 make_xmm( xmm_dst
),
1141 make_xmm( xmm_src
) );
1147 struct x86_function
*func
,
1154 TGSI_EXEC_TEMP_80000000_I
,
1155 TGSI_EXEC_TEMP_80000000_C
) );
1158 static void PIPE_CDECL
1162 store
[0] = store
[0] < 0.0f
? -1.0f
: store
[0] > 0.0f
? 1.0f
: 0.0f
;
1163 store
[1] = store
[1] < 0.0f
? -1.0f
: store
[1] > 0.0f
? 1.0f
: 0.0f
;
1164 store
[2] = store
[2] < 0.0f
? -1.0f
: store
[2] > 0.0f
? 1.0f
: 0.0f
;
1165 store
[3] = store
[3] < 0.0f
? -1.0f
: store
[3] > 0.0f
? 1.0f
: 0.0f
;
1170 struct x86_function
*func
,
1174 emit_func_call_dst_src1(
1182 static void PIPE_CDECL
1186 store
[0] = sinf( store
[0] );
1187 store
[1] = sinf( store
[1] );
1188 store
[2] = sinf( store
[2] );
1189 store
[3] = sinf( store
[3] );
1193 emit_sin (struct x86_function
*func
,
1197 emit_func_call_dst_src1(
1207 struct x86_function
*func
,
1213 make_xmm( xmm_dst
),
1214 make_xmm( xmm_src
) );
1223 struct x86_function
*func
,
1225 const struct tgsi_full_src_register
*reg
,
1226 const unsigned chan_index
)
1228 unsigned swizzle
= tgsi_util_get_full_src_register_extswizzle( reg
, chan_index
);
1231 case TGSI_EXTSWIZZLE_X
:
1232 case TGSI_EXTSWIZZLE_Y
:
1233 case TGSI_EXTSWIZZLE_Z
:
1234 case TGSI_EXTSWIZZLE_W
:
1235 switch (reg
->SrcRegister
.File
) {
1236 case TGSI_FILE_CONSTANT
:
1240 reg
->SrcRegister
.Index
,
1242 reg
->SrcRegister
.Indirect
,
1243 reg
->SrcRegisterInd
.File
,
1244 reg
->SrcRegisterInd
.Index
);
1247 case TGSI_FILE_IMMEDIATE
:
1251 reg
->SrcRegister
.Index
,
1255 case TGSI_FILE_INPUT
:
1259 reg
->SrcRegister
.Index
,
1263 case TGSI_FILE_TEMPORARY
:
1267 reg
->SrcRegister
.Index
,
1276 case TGSI_EXTSWIZZLE_ZERO
:
1280 TGSI_EXEC_TEMP_00000000_I
,
1281 TGSI_EXEC_TEMP_00000000_C
);
1284 case TGSI_EXTSWIZZLE_ONE
:
1296 switch( tgsi_util_get_full_src_register_sign_mode( reg
, chan_index
) ) {
1297 case TGSI_UTIL_SIGN_CLEAR
:
1298 emit_abs( func
, xmm
);
1301 case TGSI_UTIL_SIGN_SET
:
1302 emit_setsign( func
, xmm
);
1305 case TGSI_UTIL_SIGN_TOGGLE
:
1306 emit_neg( func
, xmm
);
1309 case TGSI_UTIL_SIGN_KEEP
:
1314 #define FETCH( FUNC, INST, XMM, INDEX, CHAN )\
1315 emit_fetch( FUNC, XMM, &(INST).FullSrcRegisters[INDEX], CHAN )
1323 struct x86_function
*func
,
1325 const struct tgsi_full_dst_register
*reg
,
1326 const struct tgsi_full_instruction
*inst
,
1327 unsigned chan_index
)
1329 switch( reg
->DstRegister
.File
) {
1330 case TGSI_FILE_OUTPUT
:
1334 reg
->DstRegister
.Index
,
1338 case TGSI_FILE_TEMPORARY
:
1342 reg
->DstRegister
.Index
,
1346 case TGSI_FILE_ADDRESS
:
1350 reg
->DstRegister
.Index
,
1358 switch( inst
->Instruction
.Saturate
) {
1362 case TGSI_SAT_ZERO_ONE
:
1366 case TGSI_SAT_MINUS_PLUS_ONE
:
1372 #define STORE( FUNC, INST, XMM, INDEX, CHAN )\
1373 emit_store( FUNC, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN )
1376 * High-level instruction translators.
1381 struct x86_function
*func
,
1382 const struct tgsi_full_src_register
*reg
)
1384 unsigned uniquemask
;
1385 unsigned registers
[4];
1386 unsigned nextregister
= 0;
1387 unsigned firstchan
= ~0;
1388 unsigned chan_index
;
1390 /* This mask stores component bits that were already tested. Note that
1391 * we test if the value is less than zero, so 1.0 and 0.0 need not to be
1393 uniquemask
= (1 << TGSI_EXTSWIZZLE_ZERO
) | (1 << TGSI_EXTSWIZZLE_ONE
);
1395 FOR_EACH_CHANNEL( chan_index
) {
1398 /* unswizzle channel */
1399 swizzle
= tgsi_util_get_full_src_register_extswizzle(
1403 /* check if the component has not been already tested */
1404 if( !(uniquemask
& (1 << swizzle
)) ) {
1405 uniquemask
|= 1 << swizzle
;
1407 /* allocate register */
1408 registers
[chan_index
] = nextregister
;
1416 /* mark the first channel used */
1417 if( firstchan
== ~0 ) {
1418 firstchan
= chan_index
;
1425 x86_make_reg( file_REG32
, reg_AX
) );
1428 x86_make_reg( file_REG32
, reg_DX
) );
1430 FOR_EACH_CHANNEL( chan_index
) {
1431 if( uniquemask
& (1 << chan_index
) ) {
1434 make_xmm( registers
[chan_index
] ),
1436 TGSI_EXEC_TEMP_00000000_I
,
1437 TGSI_EXEC_TEMP_00000000_C
),
1440 if( chan_index
== firstchan
) {
1443 x86_make_reg( file_REG32
, reg_AX
),
1444 make_xmm( registers
[chan_index
] ) );
1449 x86_make_reg( file_REG32
, reg_DX
),
1450 make_xmm( registers
[chan_index
] ) );
1453 x86_make_reg( file_REG32
, reg_AX
),
1454 x86_make_reg( file_REG32
, reg_DX
) );
1462 TGSI_EXEC_TEMP_KILMASK_I
,
1463 TGSI_EXEC_TEMP_KILMASK_C
),
1464 x86_make_reg( file_REG32
, reg_AX
) );
1468 x86_make_reg( file_REG32
, reg_DX
) );
1471 x86_make_reg( file_REG32
, reg_AX
) );
1477 struct x86_function
*func
)
1479 /* XXX todo / fix me */
1485 struct x86_function
*func
,
1486 struct tgsi_full_instruction
*inst
,
1489 unsigned chan_index
;
1491 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1492 FETCH( func
, *inst
, 0, 0, chan_index
);
1493 FETCH( func
, *inst
, 1, 1, chan_index
);
1505 STORE( func
, *inst
, 0, 0, chan_index
);
1511 struct x86_function
*func
,
1512 struct tgsi_full_instruction
*inst
)
1514 unsigned chan_index
;
1516 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1517 FETCH( func
, *inst
, 0, 0, chan_index
);
1518 FETCH( func
, *inst
, 1, 1, chan_index
);
1519 FETCH( func
, *inst
, 2, 2, chan_index
);
1524 TGSI_EXEC_TEMP_00000000_I
,
1525 TGSI_EXEC_TEMP_00000000_C
),
1539 STORE( func
, *inst
, 0, 0, chan_index
);
1545 * Check if inst src/dest regs use indirect addressing into temporary
1549 indirect_temp_reference(const struct tgsi_full_instruction
*inst
)
1552 for (i
= 0; i
< inst
->Instruction
.NumSrcRegs
; i
++) {
1553 const struct tgsi_full_src_register
*reg
= &inst
->FullSrcRegisters
[i
];
1554 if (reg
->SrcRegister
.File
== TGSI_FILE_TEMPORARY
&&
1555 reg
->SrcRegister
.Indirect
)
1558 for (i
= 0; i
< inst
->Instruction
.NumDstRegs
; i
++) {
1559 const struct tgsi_full_dst_register
*reg
= &inst
->FullDstRegisters
[i
];
1560 if (reg
->DstRegister
.File
== TGSI_FILE_TEMPORARY
&&
1561 reg
->DstRegister
.Indirect
)
1570 struct x86_function
*func
,
1571 struct tgsi_full_instruction
*inst
)
1573 unsigned chan_index
;
1575 /* we can't handle indirect addressing into temp register file yet */
1576 if (indirect_temp_reference(inst
))
1579 switch (inst
->Instruction
.Opcode
) {
1580 case TGSI_OPCODE_ARL
:
1581 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1582 FETCH( func
, *inst
, 0, 0, chan_index
);
1583 emit_flr(func
, 0, 0);
1584 emit_f2it( func
, 0 );
1585 STORE( func
, *inst
, 0, 0, chan_index
);
1589 case TGSI_OPCODE_MOV
:
1590 case TGSI_OPCODE_SWZ
:
1591 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1592 FETCH( func
, *inst
, 0, 0, chan_index
);
1593 STORE( func
, *inst
, 0, 0, chan_index
);
1597 case TGSI_OPCODE_LIT
:
1598 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1599 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) ) {
1605 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ) {
1606 STORE( func
, *inst
, 0, 0, CHAN_X
);
1608 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) ) {
1609 STORE( func
, *inst
, 0, 0, CHAN_W
);
1612 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ||
1613 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
1614 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ) {
1615 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1620 TGSI_EXEC_TEMP_00000000_I
,
1621 TGSI_EXEC_TEMP_00000000_C
) );
1622 STORE( func
, *inst
, 0, 0, CHAN_Y
);
1624 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
1625 /* XMM[1] = SrcReg[0].yyyy */
1626 FETCH( func
, *inst
, 1, 0, CHAN_Y
);
1627 /* XMM[1] = max(XMM[1], 0) */
1632 TGSI_EXEC_TEMP_00000000_I
,
1633 TGSI_EXEC_TEMP_00000000_C
) );
1634 /* XMM[2] = SrcReg[0].wwww */
1635 FETCH( func
, *inst
, 2, 0, CHAN_W
);
1636 /* XMM[2] = min(XMM[2], 128.0) */
1641 TGSI_EXEC_TEMP_128_I
,
1642 TGSI_EXEC_TEMP_128_C
) );
1643 /* XMM[2] = max(XMM[2], -128.0) */
1648 TGSI_EXEC_TEMP_MINUS_128_I
,
1649 TGSI_EXEC_TEMP_MINUS_128_C
) );
1650 emit_pow( func
, 3, 1, 1, 2 );
1651 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1665 STORE( func
, *inst
, 2, 0, CHAN_Z
);
1670 case TGSI_OPCODE_RCP
:
1671 /* TGSI_OPCODE_RECIP */
1672 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1673 emit_rcp( func
, 0, 0 );
1674 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1675 STORE( func
, *inst
, 0, 0, chan_index
);
1679 case TGSI_OPCODE_RSQ
:
1680 /* TGSI_OPCODE_RECIPSQRT */
1681 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1682 emit_abs( func
, 0 );
1683 emit_rsqrt( func
, 1, 0 );
1684 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1685 STORE( func
, *inst
, 1, 0, chan_index
);
1689 case TGSI_OPCODE_EXP
:
1690 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1691 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ||
1692 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
1693 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1694 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1695 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
1696 emit_MOV( func
, 1, 0 );
1697 emit_flr( func
, 2, 1 );
1698 /* dst.x = ex2(floor(src.x)) */
1699 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
)) {
1700 emit_MOV( func
, 2, 1 );
1701 emit_ex2( func
, 3, 2 );
1702 STORE( func
, *inst
, 2, 0, CHAN_X
);
1704 /* dst.y = src.x - floor(src.x) */
1705 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
1706 emit_MOV( func
, 2, 0 );
1707 emit_sub( func
, 2, 1 );
1708 STORE( func
, *inst
, 2, 0, CHAN_Y
);
1711 /* dst.z = ex2(src.x) */
1712 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
1713 emit_ex2( func
, 3, 0 );
1714 STORE( func
, *inst
, 0, 0, CHAN_Z
);
1718 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
)) {
1719 emit_tempf( func
, 0, TEMP_ONE_I
, TEMP_ONE_C
);
1720 STORE( func
, *inst
, 0, 0, CHAN_W
);
1724 case TGSI_OPCODE_LOG
:
1725 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1726 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ||
1727 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
1728 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1729 emit_abs( func
, 0 );
1730 emit_MOV( func
, 1, 0 );
1731 emit_lg2( func
, 2, 1 );
1732 /* dst.z = lg2(abs(src.x)) */
1733 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
1734 STORE( func
, *inst
, 1, 0, CHAN_Z
);
1736 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1737 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
1738 emit_flr( func
, 2, 1 );
1739 /* dst.x = floor(lg2(abs(src.x))) */
1740 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
)) {
1741 STORE( func
, *inst
, 1, 0, CHAN_X
);
1743 /* dst.x = abs(src)/ex2(floor(lg2(abs(src.x)))) */
1744 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
1745 emit_ex2( func
, 2, 1 );
1746 emit_rcp( func
, 1, 1 );
1747 emit_mul( func
, 0, 1 );
1748 STORE( func
, *inst
, 0, 0, CHAN_Y
);
1753 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
)) {
1754 emit_tempf( func
, 0, TEMP_ONE_I
, TEMP_ONE_C
);
1755 STORE( func
, *inst
, 0, 0, CHAN_W
);
1759 case TGSI_OPCODE_MUL
:
1760 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1761 FETCH( func
, *inst
, 0, 0, chan_index
);
1762 FETCH( func
, *inst
, 1, 1, chan_index
);
1763 emit_mul( func
, 0, 1 );
1764 STORE( func
, *inst
, 0, 0, chan_index
);
1768 case TGSI_OPCODE_ADD
:
1769 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1770 FETCH( func
, *inst
, 0, 0, chan_index
);
1771 FETCH( func
, *inst
, 1, 1, chan_index
);
1772 emit_add( func
, 0, 1 );
1773 STORE( func
, *inst
, 0, 0, chan_index
);
1777 case TGSI_OPCODE_DP3
:
1778 /* TGSI_OPCODE_DOT3 */
1779 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1780 FETCH( func
, *inst
, 1, 1, CHAN_X
);
1781 emit_mul( func
, 0, 1 );
1782 FETCH( func
, *inst
, 1, 0, CHAN_Y
);
1783 FETCH( func
, *inst
, 2, 1, CHAN_Y
);
1784 emit_mul( func
, 1, 2 );
1785 emit_add( func
, 0, 1 );
1786 FETCH( func
, *inst
, 1, 0, CHAN_Z
);
1787 FETCH( func
, *inst
, 2, 1, CHAN_Z
);
1788 emit_mul( func
, 1, 2 );
1789 emit_add( func
, 0, 1 );
1790 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1791 STORE( func
, *inst
, 0, 0, chan_index
);
1795 case TGSI_OPCODE_DP4
:
1796 /* TGSI_OPCODE_DOT4 */
1797 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1798 FETCH( func
, *inst
, 1, 1, CHAN_X
);
1799 emit_mul( func
, 0, 1 );
1800 FETCH( func
, *inst
, 1, 0, CHAN_Y
);
1801 FETCH( func
, *inst
, 2, 1, CHAN_Y
);
1802 emit_mul( func
, 1, 2 );
1803 emit_add( func
, 0, 1 );
1804 FETCH( func
, *inst
, 1, 0, CHAN_Z
);
1805 FETCH( func
, *inst
, 2, 1, CHAN_Z
);
1806 emit_mul(func
, 1, 2 );
1807 emit_add(func
, 0, 1 );
1808 FETCH( func
, *inst
, 1, 0, CHAN_W
);
1809 FETCH( func
, *inst
, 2, 1, CHAN_W
);
1810 emit_mul( func
, 1, 2 );
1811 emit_add( func
, 0, 1 );
1812 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1813 STORE( func
, *inst
, 0, 0, chan_index
);
1817 case TGSI_OPCODE_DST
:
1818 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) {
1824 STORE( func
, *inst
, 0, 0, CHAN_X
);
1826 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) {
1827 FETCH( func
, *inst
, 0, 0, CHAN_Y
);
1828 FETCH( func
, *inst
, 1, 1, CHAN_Y
);
1829 emit_mul( func
, 0, 1 );
1830 STORE( func
, *inst
, 0, 0, CHAN_Y
);
1832 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) {
1833 FETCH( func
, *inst
, 0, 0, CHAN_Z
);
1834 STORE( func
, *inst
, 0, 0, CHAN_Z
);
1836 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) {
1837 FETCH( func
, *inst
, 0, 1, CHAN_W
);
1838 STORE( func
, *inst
, 0, 0, CHAN_W
);
1842 case TGSI_OPCODE_MIN
:
1843 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1844 FETCH( func
, *inst
, 0, 0, chan_index
);
1845 FETCH( func
, *inst
, 1, 1, chan_index
);
1850 STORE( func
, *inst
, 0, 0, chan_index
);
1854 case TGSI_OPCODE_MAX
:
1855 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1856 FETCH( func
, *inst
, 0, 0, chan_index
);
1857 FETCH( func
, *inst
, 1, 1, chan_index
);
1862 STORE( func
, *inst
, 0, 0, chan_index
);
1866 case TGSI_OPCODE_SLT
:
1867 /* TGSI_OPCODE_SETLT */
1868 emit_setcc( func
, inst
, cc_LessThan
);
1871 case TGSI_OPCODE_SGE
:
1872 /* TGSI_OPCODE_SETGE */
1873 emit_setcc( func
, inst
, cc_NotLessThan
);
1876 case TGSI_OPCODE_MAD
:
1877 /* TGSI_OPCODE_MADD */
1878 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1879 FETCH( func
, *inst
, 0, 0, chan_index
);
1880 FETCH( func
, *inst
, 1, 1, chan_index
);
1881 FETCH( func
, *inst
, 2, 2, chan_index
);
1882 emit_mul( func
, 0, 1 );
1883 emit_add( func
, 0, 2 );
1884 STORE( func
, *inst
, 0, 0, chan_index
);
1888 case TGSI_OPCODE_SUB
:
1889 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1890 FETCH( func
, *inst
, 0, 0, chan_index
);
1891 FETCH( func
, *inst
, 1, 1, chan_index
);
1892 emit_sub( func
, 0, 1 );
1893 STORE( func
, *inst
, 0, 0, chan_index
);
1897 case TGSI_OPCODE_LERP
:
1898 /* TGSI_OPCODE_LRP */
1899 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1900 FETCH( func
, *inst
, 0, 0, chan_index
);
1901 FETCH( func
, *inst
, 1, 1, chan_index
);
1902 FETCH( func
, *inst
, 2, 2, chan_index
);
1903 emit_sub( func
, 1, 2 );
1904 emit_mul( func
, 0, 1 );
1905 emit_add( func
, 0, 2 );
1906 STORE( func
, *inst
, 0, 0, chan_index
);
1910 case TGSI_OPCODE_CND
:
1914 case TGSI_OPCODE_CND0
:
1918 case TGSI_OPCODE_DOT2ADD
:
1919 /* TGSI_OPCODE_DP2A */
1920 FETCH( func
, *inst
, 0, 0, CHAN_X
); /* xmm0 = src[0].x */
1921 FETCH( func
, *inst
, 1, 1, CHAN_X
); /* xmm1 = src[1].x */
1922 emit_mul( func
, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
1923 FETCH( func
, *inst
, 1, 0, CHAN_Y
); /* xmm1 = src[0].y */
1924 FETCH( func
, *inst
, 2, 1, CHAN_Y
); /* xmm2 = src[1].y */
1925 emit_mul( func
, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
1926 emit_add( func
, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
1927 FETCH( func
, *inst
, 1, 2, CHAN_X
); /* xmm1 = src[2].x */
1928 emit_add( func
, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
1929 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1930 STORE( func
, *inst
, 0, 0, chan_index
); /* dest[ch] = xmm0 */
1934 case TGSI_OPCODE_INDEX
:
1938 case TGSI_OPCODE_NEGATE
:
1942 case TGSI_OPCODE_FRAC
:
1943 /* TGSI_OPCODE_FRC */
1944 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1945 FETCH( func
, *inst
, 0, 0, chan_index
);
1946 emit_frc( func
, 0, 0 );
1947 STORE( func
, *inst
, 0, 0, chan_index
);
1951 case TGSI_OPCODE_CLAMP
:
1955 case TGSI_OPCODE_FLOOR
:
1956 /* TGSI_OPCODE_FLR */
1957 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1958 FETCH( func
, *inst
, 0, 0, chan_index
);
1959 emit_flr( func
, 0, 0 );
1960 STORE( func
, *inst
, 0, 0, chan_index
);
1964 case TGSI_OPCODE_ROUND
:
1965 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1966 FETCH( func
, *inst
, 0, 0, chan_index
);
1967 emit_rnd( func
, 0, 0 );
1968 STORE( func
, *inst
, 0, 0, chan_index
);
1972 case TGSI_OPCODE_EXPBASE2
:
1973 /* TGSI_OPCODE_EX2 */
1974 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1975 emit_ex2( func
, 0, 0 );
1976 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1977 STORE( func
, *inst
, 0, 0, chan_index
);
1981 case TGSI_OPCODE_LOGBASE2
:
1982 /* TGSI_OPCODE_LG2 */
1983 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1984 emit_lg2( func
, 0, 0 );
1985 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1986 STORE( func
, *inst
, 0, 0, chan_index
);
1990 case TGSI_OPCODE_POWER
:
1991 /* TGSI_OPCODE_POW */
1992 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1993 FETCH( func
, *inst
, 1, 1, CHAN_X
);
1994 emit_pow( func
, 0, 0, 0, 1 );
1995 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1996 STORE( func
, *inst
, 0, 0, chan_index
);
2000 case TGSI_OPCODE_CROSSPRODUCT
:
2001 /* TGSI_OPCODE_XPD */
2002 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
2003 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ) {
2004 FETCH( func
, *inst
, 1, 1, CHAN_Z
);
2005 FETCH( func
, *inst
, 3, 0, CHAN_Z
);
2007 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
2008 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
2009 FETCH( func
, *inst
, 0, 0, CHAN_Y
);
2010 FETCH( func
, *inst
, 4, 1, CHAN_Y
);
2012 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) {
2013 emit_MOV( func
, 2, 0 );
2014 emit_mul( func
, 2, 1 );
2015 emit_MOV( func
, 5, 3 );
2016 emit_mul( func
, 5, 4 );
2017 emit_sub( func
, 2, 5 );
2018 STORE( func
, *inst
, 2, 0, CHAN_X
);
2020 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ||
2021 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
2022 FETCH( func
, *inst
, 2, 1, CHAN_X
);
2023 FETCH( func
, *inst
, 5, 0, CHAN_X
);
2025 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) {
2026 emit_mul( func
, 3, 2 );
2027 emit_mul( func
, 1, 5 );
2028 emit_sub( func
, 3, 1 );
2029 STORE( func
, *inst
, 3, 0, CHAN_Y
);
2031 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) {
2032 emit_mul( func
, 5, 4 );
2033 emit_mul( func
, 0, 2 );
2034 emit_sub( func
, 5, 0 );
2035 STORE( func
, *inst
, 5, 0, CHAN_Z
);
2037 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) {
2043 STORE( func
, *inst
, 0, 0, CHAN_W
);
2047 case TGSI_OPCODE_MULTIPLYMATRIX
:
2051 case TGSI_OPCODE_ABS
:
2052 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2053 FETCH( func
, *inst
, 0, 0, chan_index
);
2054 emit_abs( func
, 0) ;
2056 STORE( func
, *inst
, 0, 0, chan_index
);
2060 case TGSI_OPCODE_RCC
:
2064 case TGSI_OPCODE_DPH
:
2065 FETCH( func
, *inst
, 0, 0, CHAN_X
);
2066 FETCH( func
, *inst
, 1, 1, CHAN_X
);
2067 emit_mul( func
, 0, 1 );
2068 FETCH( func
, *inst
, 1, 0, CHAN_Y
);
2069 FETCH( func
, *inst
, 2, 1, CHAN_Y
);
2070 emit_mul( func
, 1, 2 );
2071 emit_add( func
, 0, 1 );
2072 FETCH( func
, *inst
, 1, 0, CHAN_Z
);
2073 FETCH( func
, *inst
, 2, 1, CHAN_Z
);
2074 emit_mul( func
, 1, 2 );
2075 emit_add( func
, 0, 1 );
2076 FETCH( func
, *inst
, 1, 1, CHAN_W
);
2077 emit_add( func
, 0, 1 );
2078 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2079 STORE( func
, *inst
, 0, 0, chan_index
);
2083 case TGSI_OPCODE_COS
:
2084 FETCH( func
, *inst
, 0, 0, CHAN_X
);
2085 emit_cos( func
, 0, 0 );
2086 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2087 STORE( func
, *inst
, 0, 0, chan_index
);
2091 case TGSI_OPCODE_DDX
:
2095 case TGSI_OPCODE_DDY
:
2099 case TGSI_OPCODE_KILP
:
2100 /* predicated kill */
2102 return 0; /* XXX fix me */
2105 case TGSI_OPCODE_KIL
:
2106 /* conditional kill */
2107 emit_kil( func
, &inst
->FullSrcRegisters
[0] );
2110 case TGSI_OPCODE_PK2H
:
2114 case TGSI_OPCODE_PK2US
:
2118 case TGSI_OPCODE_PK4B
:
2122 case TGSI_OPCODE_PK4UB
:
2126 case TGSI_OPCODE_RFL
:
2130 case TGSI_OPCODE_SEQ
:
2134 case TGSI_OPCODE_SFL
:
2138 case TGSI_OPCODE_SGT
:
2142 case TGSI_OPCODE_SIN
:
2143 FETCH( func
, *inst
, 0, 0, CHAN_X
);
2144 emit_sin( func
, 0, 0 );
2145 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2146 STORE( func
, *inst
, 0, 0, chan_index
);
2150 case TGSI_OPCODE_SLE
:
2154 case TGSI_OPCODE_SNE
:
2158 case TGSI_OPCODE_STR
:
2162 case TGSI_OPCODE_TEX
:
2164 /* Disable dummy texture code:
2171 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2172 STORE( func
, *inst
, 0, 0, chan_index
);
2180 case TGSI_OPCODE_TXD
:
2184 case TGSI_OPCODE_UP2H
:
2188 case TGSI_OPCODE_UP2US
:
2192 case TGSI_OPCODE_UP4B
:
2196 case TGSI_OPCODE_UP4UB
:
2200 case TGSI_OPCODE_X2D
:
2204 case TGSI_OPCODE_ARA
:
2208 case TGSI_OPCODE_ARR
:
2209 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2210 FETCH( func
, *inst
, 0, 0, chan_index
);
2211 emit_rnd( func
, 0, 0 );
2212 emit_f2it( func
, 0 );
2213 STORE( func
, *inst
, 0, 0, chan_index
);
2217 case TGSI_OPCODE_BRA
:
2221 case TGSI_OPCODE_CAL
:
2225 case TGSI_OPCODE_RET
:
2229 case TGSI_OPCODE_END
:
2232 case TGSI_OPCODE_SSG
:
2233 /* TGSI_OPCODE_SGN */
2234 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2235 FETCH( func
, *inst
, 0, 0, chan_index
);
2236 emit_sgn( func
, 0, 0 );
2237 STORE( func
, *inst
, 0, 0, chan_index
);
2241 case TGSI_OPCODE_CMP
:
2242 emit_cmp (func
, inst
);
2245 case TGSI_OPCODE_SCS
:
2246 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) {
2247 FETCH( func
, *inst
, 0, 0, CHAN_X
);
2248 emit_cos( func
, 0, 0 );
2249 STORE( func
, *inst
, 0, 0, CHAN_X
);
2251 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) {
2252 FETCH( func
, *inst
, 0, 0, CHAN_X
);
2253 emit_sin( func
, 0, 0 );
2254 STORE( func
, *inst
, 0, 0, CHAN_Y
);
2256 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) {
2260 TGSI_EXEC_TEMP_00000000_I
,
2261 TGSI_EXEC_TEMP_00000000_C
);
2262 STORE( func
, *inst
, 0, 0, CHAN_Z
);
2264 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) {
2270 STORE( func
, *inst
, 0, 0, CHAN_W
);
2274 case TGSI_OPCODE_TXB
:
2278 case TGSI_OPCODE_NRM
:
2280 case TGSI_OPCODE_NRM4
:
2281 /* 3 or 4-component normalization */
2283 uint dims
= (inst
->Instruction
.Opcode
== TGSI_OPCODE_NRM
) ? 3 : 4;
2285 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_X
) ||
2286 IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_Y
) ||
2287 IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_Z
) ||
2288 (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_W
) && dims
== 4)) {
2290 /* NOTE: Cannot use xmm regs 2/3 here (see emit_rsqrt() above). */
2293 /* xmm0 = src.x * src.x */
2294 FETCH(func
, *inst
, 0, 0, CHAN_X
);
2295 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_X
)) {
2296 emit_MOV(func
, 4, 0);
2298 emit_mul(func
, 0, 0);
2301 /* xmm0 = xmm0 + src.y * src.y */
2302 FETCH(func
, *inst
, 1, 0, CHAN_Y
);
2303 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_Y
)) {
2304 emit_MOV(func
, 5, 1);
2306 emit_mul(func
, 1, 1);
2307 emit_add(func
, 0, 1);
2310 /* xmm0 = xmm0 + src.z * src.z */
2311 FETCH(func
, *inst
, 1, 0, CHAN_Z
);
2312 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_Z
)) {
2313 emit_MOV(func
, 6, 1);
2315 emit_mul(func
, 1, 1);
2316 emit_add(func
, 0, 1);
2320 /* xmm0 = xmm0 + src.w * src.w */
2321 FETCH(func
, *inst
, 1, 0, CHAN_W
);
2322 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_W
)) {
2323 emit_MOV(func
, 7, 1);
2325 emit_mul(func
, 1, 1);
2326 emit_add(func
, 0, 1);
2329 /* xmm1 = 1 / sqrt(xmm0) */
2330 emit_rsqrt(func
, 1, 0);
2332 /* dst.x = xmm1 * src.x */
2333 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_X
)) {
2334 emit_mul(func
, 4, 1);
2335 STORE(func
, *inst
, 4, 0, CHAN_X
);
2338 /* dst.y = xmm1 * src.y */
2339 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_Y
)) {
2340 emit_mul(func
, 5, 1);
2341 STORE(func
, *inst
, 5, 0, CHAN_Y
);
2344 /* dst.z = xmm1 * src.z */
2345 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_Z
)) {
2346 emit_mul(func
, 6, 1);
2347 STORE(func
, *inst
, 6, 0, CHAN_Z
);
2350 /* dst.w = xmm1 * src.w */
2351 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_X
) && dims
== 4) {
2352 emit_mul(func
, 7, 1);
2353 STORE(func
, *inst
, 7, 0, CHAN_W
);
2358 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_W
) && dims
== 3) {
2359 emit_tempf(func
, 0, TEMP_ONE_I
, TEMP_ONE_C
);
2360 STORE(func
, *inst
, 0, 0, CHAN_W
);
2365 case TGSI_OPCODE_DIV
:
2369 case TGSI_OPCODE_DP2
:
2370 FETCH( func
, *inst
, 0, 0, CHAN_X
); /* xmm0 = src[0].x */
2371 FETCH( func
, *inst
, 1, 1, CHAN_X
); /* xmm1 = src[1].x */
2372 emit_mul( func
, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
2373 FETCH( func
, *inst
, 1, 0, CHAN_Y
); /* xmm1 = src[0].y */
2374 FETCH( func
, *inst
, 2, 1, CHAN_Y
); /* xmm2 = src[1].y */
2375 emit_mul( func
, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
2376 emit_add( func
, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2377 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2378 STORE( func
, *inst
, 0, 0, chan_index
); /* dest[ch] = xmm0 */
2382 case TGSI_OPCODE_TXL
:
2386 case TGSI_OPCODE_BRK
:
2390 case TGSI_OPCODE_IF
:
2394 case TGSI_OPCODE_LOOP
:
2398 case TGSI_OPCODE_REP
:
2402 case TGSI_OPCODE_ELSE
:
2406 case TGSI_OPCODE_ENDIF
:
2410 case TGSI_OPCODE_ENDLOOP
:
2414 case TGSI_OPCODE_ENDREP
:
2418 case TGSI_OPCODE_PUSHA
:
2422 case TGSI_OPCODE_POPA
:
2426 case TGSI_OPCODE_CEIL
:
2430 case TGSI_OPCODE_I2F
:
2434 case TGSI_OPCODE_NOT
:
2438 case TGSI_OPCODE_TRUNC
:
2439 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2440 FETCH( func
, *inst
, 0, 0, chan_index
);
2441 emit_f2it( func
, 0 );
2442 emit_i2f( func
, 0 );
2443 STORE( func
, *inst
, 0, 0, chan_index
);
2447 case TGSI_OPCODE_SHL
:
2451 case TGSI_OPCODE_SHR
:
2455 case TGSI_OPCODE_AND
:
2459 case TGSI_OPCODE_OR
:
2463 case TGSI_OPCODE_MOD
:
2467 case TGSI_OPCODE_XOR
:
2471 case TGSI_OPCODE_SAD
:
2475 case TGSI_OPCODE_TXF
:
2479 case TGSI_OPCODE_TXQ
:
2483 case TGSI_OPCODE_CONT
:
2487 case TGSI_OPCODE_EMIT
:
2491 case TGSI_OPCODE_ENDPRIM
:
2504 struct x86_function
*func
,
2505 struct tgsi_full_declaration
*decl
)
2507 if( decl
->Declaration
.File
== TGSI_FILE_INPUT
) {
2508 unsigned first
, last
, mask
;
2511 first
= decl
->DeclarationRange
.First
;
2512 last
= decl
->DeclarationRange
.Last
;
2513 mask
= decl
->Declaration
.UsageMask
;
2515 for( i
= first
; i
<= last
; i
++ ) {
2516 for( j
= 0; j
< NUM_CHANNELS
; j
++ ) {
2517 if( mask
& (1 << j
) ) {
2518 switch( decl
->Declaration
.Interpolate
) {
2519 case TGSI_INTERPOLATE_CONSTANT
:
2520 emit_coef_a0( func
, 0, i
, j
);
2521 emit_inputs( func
, 0, i
, j
);
2524 case TGSI_INTERPOLATE_LINEAR
:
2525 emit_tempf( func
, 0, 0, TGSI_SWIZZLE_X
);
2526 emit_coef_dadx( func
, 1, i
, j
);
2527 emit_tempf( func
, 2, 0, TGSI_SWIZZLE_Y
);
2528 emit_coef_dady( func
, 3, i
, j
);
2529 emit_mul( func
, 0, 1 ); /* x * dadx */
2530 emit_coef_a0( func
, 4, i
, j
);
2531 emit_mul( func
, 2, 3 ); /* y * dady */
2532 emit_add( func
, 0, 4 ); /* x * dadx + a0 */
2533 emit_add( func
, 0, 2 ); /* x * dadx + y * dady + a0 */
2534 emit_inputs( func
, 0, i
, j
);
2537 case TGSI_INTERPOLATE_PERSPECTIVE
:
2538 emit_tempf( func
, 0, 0, TGSI_SWIZZLE_X
);
2539 emit_coef_dadx( func
, 1, i
, j
);
2540 emit_tempf( func
, 2, 0, TGSI_SWIZZLE_Y
);
2541 emit_coef_dady( func
, 3, i
, j
);
2542 emit_mul( func
, 0, 1 ); /* x * dadx */
2543 emit_tempf( func
, 4, 0, TGSI_SWIZZLE_W
);
2544 emit_coef_a0( func
, 5, i
, j
);
2545 emit_rcp( func
, 4, 4 ); /* 1.0 / w */
2546 emit_mul( func
, 2, 3 ); /* y * dady */
2547 emit_add( func
, 0, 5 ); /* x * dadx + a0 */
2548 emit_add( func
, 0, 2 ); /* x * dadx + y * dady + a0 */
2549 emit_mul( func
, 0, 4 ); /* (x * dadx + y * dady + a0) / w */
2550 emit_inputs( func
, 0, i
, j
);
2563 static void aos_to_soa( struct x86_function
*func
,
2569 struct x86_reg soa_input
= x86_make_reg( file_REG32
, reg_AX
);
2570 struct x86_reg aos_input
= x86_make_reg( file_REG32
, reg_BX
);
2571 struct x86_reg num_inputs
= x86_make_reg( file_REG32
, reg_CX
);
2572 struct x86_reg stride
= x86_make_reg( file_REG32
, reg_DX
);
2577 x86_push( func
, x86_make_reg( file_REG32
, reg_BX
) );
2579 x86_mov( func
, aos_input
, x86_fn_arg( func
, arg_aos
) );
2580 x86_mov( func
, soa_input
, x86_fn_arg( func
, arg_machine
) );
2581 x86_lea( func
, soa_input
,
2582 x86_make_disp( soa_input
,
2583 Offset(struct tgsi_exec_machine
, Inputs
) ) );
2584 x86_mov( func
, num_inputs
, x86_fn_arg( func
, arg_num
) );
2585 x86_mov( func
, stride
, x86_fn_arg( func
, arg_stride
) );
2588 inner_loop
= x86_get_label( func
);
2590 x86_push( func
, aos_input
);
2591 sse_movlps( func
, make_xmm( 0 ), x86_make_disp( aos_input
, 0 ) );
2592 sse_movlps( func
, make_xmm( 3 ), x86_make_disp( aos_input
, 8 ) );
2593 x86_add( func
, aos_input
, stride
);
2594 sse_movhps( func
, make_xmm( 0 ), x86_make_disp( aos_input
, 0 ) );
2595 sse_movhps( func
, make_xmm( 3 ), x86_make_disp( aos_input
, 8 ) );
2596 x86_add( func
, aos_input
, stride
);
2597 sse_movlps( func
, make_xmm( 1 ), x86_make_disp( aos_input
, 0 ) );
2598 sse_movlps( func
, make_xmm( 4 ), x86_make_disp( aos_input
, 8 ) );
2599 x86_add( func
, aos_input
, stride
);
2600 sse_movhps( func
, make_xmm( 1 ), x86_make_disp( aos_input
, 0 ) );
2601 sse_movhps( func
, make_xmm( 4 ), x86_make_disp( aos_input
, 8 ) );
2602 x86_pop( func
, aos_input
);
2604 sse_movaps( func
, make_xmm( 2 ), make_xmm( 0 ) );
2605 sse_movaps( func
, make_xmm( 5 ), make_xmm( 3 ) );
2606 sse_shufps( func
, make_xmm( 0 ), make_xmm( 1 ), 0x88 );
2607 sse_shufps( func
, make_xmm( 2 ), make_xmm( 1 ), 0xdd );
2608 sse_shufps( func
, make_xmm( 3 ), make_xmm( 4 ), 0x88 );
2609 sse_shufps( func
, make_xmm( 5 ), make_xmm( 4 ), 0xdd );
2611 sse_movups( func
, x86_make_disp( soa_input
, 0 ), make_xmm( 0 ) );
2612 sse_movups( func
, x86_make_disp( soa_input
, 16 ), make_xmm( 2 ) );
2613 sse_movups( func
, x86_make_disp( soa_input
, 32 ), make_xmm( 3 ) );
2614 sse_movups( func
, x86_make_disp( soa_input
, 48 ), make_xmm( 5 ) );
2616 /* Advance to next input */
2617 x86_lea( func
, aos_input
, x86_make_disp(aos_input
, 16) );
2618 x86_lea( func
, soa_input
, x86_make_disp(soa_input
, 64) );
2620 /* while --num_inputs */
2621 x86_dec( func
, num_inputs
);
2622 x86_jcc( func
, cc_NE
, inner_loop
);
2625 x86_pop( func
, x86_make_reg( file_REG32
, reg_BX
) );
2628 static void soa_to_aos( struct x86_function
*func
,
2634 struct x86_reg soa_output
= x86_make_reg( file_REG32
, reg_AX
);
2635 struct x86_reg aos_output
= x86_make_reg( file_REG32
, reg_BX
);
2636 struct x86_reg num_outputs
= x86_make_reg( file_REG32
, reg_CX
);
2637 struct x86_reg temp
= x86_make_reg( file_REG32
, reg_DX
);
2641 x86_push( func
, x86_make_reg( file_REG32
, reg_BX
) );
2643 x86_mov( func
, aos_output
, x86_fn_arg( func
, arg_aos
) );
2644 x86_mov( func
, soa_output
, x86_fn_arg( func
, arg_machine
) );
2645 x86_lea( func
, soa_output
,
2646 x86_make_disp( soa_output
,
2647 Offset(struct tgsi_exec_machine
, Outputs
) ) );
2648 x86_mov( func
, num_outputs
, x86_fn_arg( func
, arg_num
) );
2651 inner_loop
= x86_get_label( func
);
2653 sse_movups( func
, make_xmm( 0 ), x86_make_disp( soa_output
, 0 ) );
2654 sse_movups( func
, make_xmm( 1 ), x86_make_disp( soa_output
, 16 ) );
2655 sse_movups( func
, make_xmm( 3 ), x86_make_disp( soa_output
, 32 ) );
2656 sse_movups( func
, make_xmm( 4 ), x86_make_disp( soa_output
, 48 ) );
2658 sse_movaps( func
, make_xmm( 2 ), make_xmm( 0 ) );
2659 sse_movaps( func
, make_xmm( 5 ), make_xmm( 3 ) );
2660 sse_unpcklps( func
, make_xmm( 0 ), make_xmm( 1 ) );
2661 sse_unpckhps( func
, make_xmm( 2 ), make_xmm( 1 ) );
2662 sse_unpcklps( func
, make_xmm( 3 ), make_xmm( 4 ) );
2663 sse_unpckhps( func
, make_xmm( 5 ), make_xmm( 4 ) );
2665 x86_mov( func
, temp
, x86_fn_arg( func
, arg_stride
) );
2666 x86_push( func
, aos_output
);
2667 sse_movlps( func
, x86_make_disp( aos_output
, 0 ), make_xmm( 0 ) );
2668 sse_movlps( func
, x86_make_disp( aos_output
, 8 ), make_xmm( 3 ) );
2669 x86_add( func
, aos_output
, temp
);
2670 sse_movhps( func
, x86_make_disp( aos_output
, 0 ), make_xmm( 0 ) );
2671 sse_movhps( func
, x86_make_disp( aos_output
, 8 ), make_xmm( 3 ) );
2672 x86_add( func
, aos_output
, temp
);
2673 sse_movlps( func
, x86_make_disp( aos_output
, 0 ), make_xmm( 2 ) );
2674 sse_movlps( func
, x86_make_disp( aos_output
, 8 ), make_xmm( 5 ) );
2675 x86_add( func
, aos_output
, temp
);
2676 sse_movhps( func
, x86_make_disp( aos_output
, 0 ), make_xmm( 2 ) );
2677 sse_movhps( func
, x86_make_disp( aos_output
, 8 ), make_xmm( 5 ) );
2678 x86_pop( func
, aos_output
);
2680 /* Advance to next output */
2681 x86_lea( func
, aos_output
, x86_make_disp(aos_output
, 16) );
2682 x86_lea( func
, soa_output
, x86_make_disp(soa_output
, 64) );
2684 /* while --num_outputs */
2685 x86_dec( func
, num_outputs
);
2686 x86_jcc( func
, cc_NE
, inner_loop
);
2689 x86_pop( func
, x86_make_reg( file_REG32
, reg_BX
) );
2693 * Translate a TGSI vertex/fragment shader to SSE2 code.
2694 * Slightly different things are done for vertex vs. fragment shaders.
2696 * \param tokens the TGSI input shader
2697 * \param func the output SSE code/function
2698 * \param immediates buffer to place immediates, later passed to SSE func
2699 * \return 1 for success, 0 if translation failed
2703 const struct tgsi_token
*tokens
,
2704 struct x86_function
*func
,
2705 float (*immediates
)[4],
2706 boolean do_swizzles
)
2708 struct tgsi_parse_context parse
;
2710 uint num_immediates
= 0;
2714 func
->csr
= func
->store
;
2716 tgsi_parse_init( &parse
, tokens
);
2718 /* Can't just use EDI, EBX without save/restoring them:
2720 x86_push( func
, x86_make_reg( file_REG32
, reg_BX
) );
2721 x86_push( func
, x86_make_reg( file_REG32
, reg_DI
) );
2724 * Different function args for vertex/fragment shaders:
2726 if (parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_VERTEX
) {
2732 6 ); /* input_stride */
2738 x86_fn_arg( func
, 1 ) );
2742 x86_fn_arg( func
, 2 ) );
2745 get_immediate_base(),
2746 x86_fn_arg( func
, 3 ) );
2748 if (parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_FRAGMENT
) {
2752 x86_fn_arg( func
, 4 ) );
2756 while( !tgsi_parse_end_of_tokens( &parse
) && ok
) {
2757 tgsi_parse_token( &parse
);
2759 switch( parse
.FullToken
.Token
.Type
) {
2760 case TGSI_TOKEN_TYPE_DECLARATION
:
2761 if (parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_FRAGMENT
) {
2764 &parse
.FullToken
.FullDeclaration
);
2768 case TGSI_TOKEN_TYPE_INSTRUCTION
:
2769 ok
= emit_instruction(
2771 &parse
.FullToken
.FullInstruction
);
2774 debug_printf("failed to translate tgsi opcode %d to SSE (%s)\n",
2775 parse
.FullToken
.FullInstruction
.Instruction
.Opcode
,
2776 parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_VERTEX
?
2777 "vertex shader" : "fragment shader");
2781 case TGSI_TOKEN_TYPE_IMMEDIATE
:
2782 /* simply copy the immediate values into the next immediates[] slot */
2784 const uint size
= parse
.FullToken
.FullImmediate
.Immediate
.NrTokens
- 1;
2787 assert(num_immediates
< TGSI_EXEC_NUM_IMMEDIATES
);
2788 for( i
= 0; i
< size
; i
++ ) {
2789 immediates
[num_immediates
][i
] =
2790 parse
.FullToken
.FullImmediate
.u
.ImmediateFloat32
[i
].Float
;
2793 debug_printf("SSE FS immediate[%d] = %f %f %f %f\n",
2795 immediates
[num_immediates
][0],
2796 immediates
[num_immediates
][1],
2797 immediates
[num_immediates
][2],
2798 immediates
[num_immediates
][3]);
2810 if (parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_VERTEX
) {
2815 8, /* num_outputs */
2816 9 ); /* output_stride */
2819 /* Can't just use EBX, EDI without save/restoring them:
2821 x86_pop( func
, x86_make_reg( file_REG32
, reg_DI
) );
2822 x86_pop( func
, x86_make_reg( file_REG32
, reg_BX
) );
2826 tgsi_parse_free( &parse
);
2831 #endif /* PIPE_ARCH_X86 */