1 /**************************************************************************
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **************************************************************************/
28 #include "pipe/p_config.h"
30 #if defined(PIPE_ARCH_X86)
32 #include "util/u_debug.h"
33 #include "pipe/p_shader_tokens.h"
34 #include "util/u_math.h"
35 #include "util/u_memory.h"
36 #if defined(PIPE_ARCH_SSE)
37 #include "util/u_sse.h"
39 #include "tgsi/tgsi_parse.h"
40 #include "tgsi/tgsi_util.h"
41 #include "tgsi_exec.h"
42 #include "tgsi_sse2.h"
44 #include "rtasm/rtasm_x86sse.h"
48 * This costs about 100fps (close to 10%) in gears:
50 #define HIGH_PRECISION 1
/* Iterate CHAN over every vector channel (X, Y, Z, W). */
55 #define FOR_EACH_CHANNEL( CHAN )\
56 for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)
/* Nonzero if channel CHAN is set in instruction INST's dest-0 writemask. */
58 #define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
59 ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))
/* Guard the following statement so it runs only when CHAN is write-enabled. */
61 #define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
62 if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))
/* Loop over only those channels enabled in dest-0's writemask. */
64 #define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
65 FOR_EACH_CHANNEL( CHAN )\
66 IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )
/* Short local aliases for well-known tgsi_exec machine temporaries.
 * NOTE(review): exact semantics (TEMP_ONE_* = constant 1.0, TEMP_R0 =
 * scratch, TEMP_ADDR = address register, TEMP_EXEC_MASK_* = per-quad
 * execution mask) are inferred from the TGSI_EXEC_* names — confirm
 * against tgsi_exec.h. */
73 #define TEMP_ONE_I TGSI_EXEC_TEMP_ONE_I
74 #define TEMP_ONE_C TGSI_EXEC_TEMP_ONE_C
76 #define TEMP_R0 TGSI_EXEC_TEMP_R0
77 #define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
78 #define TEMP_EXEC_MASK_I TGSI_EXEC_MASK_I
79 #define TEMP_EXEC_MASK_C TGSI_EXEC_MASK_C
83 * X86 utility functions.
92 (enum x86_reg_name
) xmm
);
96 * X86 register mapping helpers.
100 get_const_base( void )
107 static struct x86_reg
108 get_machine_base( void )
115 static struct x86_reg
116 get_input_base( void )
118 return x86_make_disp(
120 Offset(struct tgsi_exec_machine
, Inputs
) );
123 static struct x86_reg
124 get_output_base( void )
126 return x86_make_disp(
128 Offset(struct tgsi_exec_machine
, Outputs
) );
131 static struct x86_reg
132 get_temp_base( void )
134 return x86_make_disp(
136 Offset(struct tgsi_exec_machine
, Temps
) );
139 static struct x86_reg
140 get_coef_base( void )
147 static struct x86_reg
148 get_sampler_base( void )
155 static struct x86_reg
156 get_immediate_base( void )
165 * Data access helpers.
169 static struct x86_reg
174 return x86_make_disp(
175 get_immediate_base(),
176 (vec
* 4 + chan
) * 4 );
179 static struct x86_reg
184 return x86_make_disp(
186 (vec
* 4 + chan
) * 4 );
189 static struct x86_reg
193 return x86_make_disp(
195 unit
* sizeof( struct tgsi_sampler
* ) );
198 static struct x86_reg
203 return x86_make_disp(
205 (vec
* 4 + chan
) * 16 );
208 static struct x86_reg
213 return x86_make_disp(
215 (vec
* 4 + chan
) * 16 );
218 static struct x86_reg
223 return x86_make_disp(
225 (vec
* 4 + chan
) * 16 );
228 static struct x86_reg
234 return x86_make_disp(
236 ((vec
* 3 + member
) * 4 + chan
) * 4 );
242 struct x86_function
*func
)
249 * Data fetch helpers.
253 * Copy a shader constant to xmm register
254 * \param xmm the destination xmm register
255 * \param vec the src const buffer index
256 * \param chan src channel to fetch (X, Y, Z or W)
260 struct x86_function
*func
,
269 /* 'vec' is the offset from the address register's value.
270 * We're loading CONST[ADDR+vec] into an xmm register.
272 struct x86_reg r0
= get_immediate_base();
273 struct x86_reg r1
= get_coef_base();
276 assert( indirectFile
== TGSI_FILE_ADDRESS
);
277 assert( indirectIndex
== 0 );
278 assert( r0
.mod
== mod_REG
);
279 assert( r1
.mod
== mod_REG
);
281 x86_push( func
, r0
);
282 x86_push( func
, r1
);
285 * Loop over the four pixels or vertices in the quad.
286 * Get the value of the address (offset) register for pixel/vertex[i],
287 * add it to the src offset and index into the constant buffer.
288 * Note that we're working on SOA data.
289 * If any of the pixel/vertex execution channels are unused their
290 * values will be garbage. It's very important that we don't use
291 * those garbage values as indexes into the constant buffer since
292 * that'll cause segfaults.
293 * The solution is to bitwise-AND the offset with the execution mask
294 * register whose values are either 0 or ~0.
295 * The caller must setup the execution mask register to indicate
296 * which channels are valid/alive before running the shader.
297 * The execution mask will also figure into loops and conditionals
300 for (i
= 0; i
< QUAD_SIZE
; i
++) {
301 /* r1 = address register[i] */
302 x86_mov( func
, r1
, x86_make_disp( get_temp( TEMP_ADDR
, CHAN_X
), i
* 4 ) );
303 /* r0 = execution mask[i] */
304 x86_mov( func
, r0
, x86_make_disp( get_temp( TEMP_EXEC_MASK_I
, TEMP_EXEC_MASK_C
), i
* 4 ) );
306 x86_and( func
, r1
, r0
);
307 /* r0 = 'vec', the offset */
308 x86_lea( func
, r0
, get_const( vec
, chan
) );
310 /* Quick hack to multiply r1 by 16 -- need to add SHL to rtasm.
312 x86_add( func
, r1
, r1
);
313 x86_add( func
, r1
, r1
);
314 x86_add( func
, r1
, r1
);
315 x86_add( func
, r1
, r1
);
317 x86_add( func
, r0
, r1
); /* r0 = r0 + r1 */
318 x86_mov( func
, r1
, x86_deref( r0
) );
319 x86_mov( func
, x86_make_disp( get_temp( TEMP_R0
, CHAN_X
), i
* 4 ), r1
);
328 get_temp( TEMP_R0
, CHAN_X
) );
331 /* 'vec' is the index into the src register file, such as TEMP[vec] */
337 get_const( vec
, chan
) );
342 SHUF( 0, 0, 0, 0 ) );
348 struct x86_function
*func
,
356 get_immediate( vec
, chan
) );
361 SHUF( 0, 0, 0, 0 ) );
366 * Copy a shader input to xmm register
367 * \param xmm the destination xmm register
368 * \param vec the src input attrib
369 * \param chan src channel to fetch (X, Y, Z or W)
373 struct x86_function
*func
,
381 get_input( vec
, chan
) );
385 * Store an xmm register to a shader output
386 * \param xmm the source xmm register
387 * \param vec the dest output attrib
388 * \param chan src dest channel to store (X, Y, Z or W)
392 struct x86_function
*func
,
399 get_output( vec
, chan
),
404 * Copy a shader temporary to xmm register
405 * \param xmm the destination xmm register
406 * \param vec the src temp register
407 * \param chan src channel to fetch (X, Y, Z or W)
411 struct x86_function
*func
,
419 get_temp( vec
, chan
) );
423 * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
424 * \param xmm the destination xmm register
425 * \param vec the src input/attribute coefficient index
426 * \param chan src channel to fetch (X, Y, Z or W)
427 * \param member 0=a0, 1=dadx, 2=dady
431 struct x86_function
*func
,
440 get_coef( vec
, chan
, member
) );
445 SHUF( 0, 0, 0, 0 ) );
449 * Data store helpers.
454 struct x86_function
*func
,
461 get_input( vec
, chan
),
467 struct x86_function
*func
,
474 get_temp( vec
, chan
),
480 struct x86_function
*func
,
490 vec
+ TGSI_EXEC_TEMP_ADDR
,
495 * Coefficent fetch helpers.
500 struct x86_function
*func
,
515 struct x86_function
*func
,
530 struct x86_function
*func
,
544 * Function call helpers.
548 * NOTE: In gcc, if the destination uses the SSE intrinsics, then it must be
549 * defined with __attribute__((force_align_arg_pointer)), as we do not guarantee
550 * that the stack pointer is 16 byte aligned, as expected.
554 struct x86_function
*func
,
555 unsigned xmm_save_mask
,
556 const struct x86_reg
*arg
,
558 void (PIPE_CDECL
*code
)() )
560 struct x86_reg ecx
= x86_make_reg( file_REG32
, reg_CX
);
565 x86_make_reg( file_REG32
, reg_AX
) );
568 x86_make_reg( file_REG32
, reg_CX
) );
571 x86_make_reg( file_REG32
, reg_DX
) );
573 /* Store XMM regs to the stack
575 for(i
= 0, n
= 0; i
< 8; ++i
)
576 if(xmm_save_mask
& (1 << i
))
581 x86_make_reg( file_REG32
, reg_SP
),
584 for(i
= 0, n
= 0; i
< 8; ++i
)
585 if(xmm_save_mask
& (1 << i
)) {
588 x86_make_disp( x86_make_reg( file_REG32
, reg_SP
), n
*16 ),
593 for (i
= 0; i
< nr_args
; i
++) {
594 /* Load the address of the buffer we use for passing arguments and
602 /* Push actual function arguments (currently just the pointer to
603 * the buffer above), and call the function:
605 x86_push( func
, ecx
);
608 x86_mov_reg_imm( func
, ecx
, (unsigned long) code
);
609 x86_call( func
, ecx
);
611 /* Pop the arguments (or just add an immediate to esp)
613 for (i
= 0; i
< nr_args
; i
++) {
617 /* Pop the saved XMM regs:
619 for(i
= 0, n
= 0; i
< 8; ++i
)
620 if(xmm_save_mask
& (1 << i
)) {
624 x86_make_disp( x86_make_reg( file_REG32
, reg_SP
), n
*16 ) );
630 x86_make_reg( file_REG32
, reg_SP
),
633 /* Restore GP registers in a reverse order.
637 x86_make_reg( file_REG32
, reg_DX
) );
640 x86_make_reg( file_REG32
, reg_CX
) );
643 x86_make_reg( file_REG32
, reg_AX
) );
647 emit_func_call_dst_src1(
648 struct x86_function
*func
,
652 void (PIPE_CDECL
*code
)() )
654 struct x86_reg store
= get_temp( TEMP_R0
, 0 );
655 unsigned xmm_mask
= ((1 << xmm_save
) - 1) & ~(1 << xmm_dst
);
657 /* Store our input parameters (in xmm regs) to the buffer we use
658 * for passing arguments. We will pass a pointer to this buffer as
659 * the actual function argument.
664 make_xmm( xmm_src0
) );
666 emit_func_call( func
,
680 emit_func_call_dst_src2(
681 struct x86_function
*func
,
686 void (PIPE_CDECL
*code
)() )
688 struct x86_reg store
= get_temp( TEMP_R0
, 0 );
689 unsigned xmm_mask
= ((1 << xmm_save
) - 1) & ~(1 << xmm_dst
);
691 /* Store two inputs to parameter buffer.
696 make_xmm( xmm_src0
) );
700 x86_make_disp( store
, 4 * sizeof(float) ),
701 make_xmm( xmm_src1
) );
706 emit_func_call( func
,
712 /* Retrieve the results:
724 #if defined(PIPE_ARCH_SSE)
727 * Fast SSE2 implementation of special math functions.
/* Horner-scheme polynomial evaluators over SSE vectors:
 * POLYn(x, c0..cn) computes c0 + x*(c1 + x*(c2 + ...)), each degree
 * defined recursively in terms of the next-lower one. */
730 #define POLY0(x, c0) _mm_set1_ps(c0)
731 #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
732 #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
733 #define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
734 #define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
735 #define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
/* Polynomial degrees used by the exp2/log2 approximations below
 * (accuracy vs. speed trade-off; selected via #if chains). */
737 #define EXP_POLY_DEGREE 3
738 #define LOG_POLY_DEGREE 5
741 * See http://www.devmaster.net/forums/showthread.php?p=43580
747 __m128 fpart
, expipart
, expfpart
;
749 x
= _mm_min_ps(x
, _mm_set1_ps( 129.00000f
));
750 x
= _mm_max_ps(x
, _mm_set1_ps(-126.99999f
));
752 /* ipart = int(x - 0.5) */
753 ipart
= _mm_cvtps_epi32(_mm_sub_ps(x
, _mm_set1_ps(0.5f
)));
755 /* fpart = x - ipart */
756 fpart
= _mm_sub_ps(x
, _mm_cvtepi32_ps(ipart
));
758 /* expipart = (float) (1 << ipart) */
759 expipart
= _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart
, _mm_set1_epi32(127)), 23));
761 /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
762 #if EXP_POLY_DEGREE == 5
763 expfpart
= POLY5(fpart
, 9.9999994e-1f
, 6.9315308e-1f
, 2.4015361e-1f
, 5.5826318e-2f
, 8.9893397e-3f
, 1.8775767e-3f
);
764 #elif EXP_POLY_DEGREE == 4
765 expfpart
= POLY4(fpart
, 1.0000026f
, 6.9300383e-1f
, 2.4144275e-1f
, 5.2011464e-2f
, 1.3534167e-2f
);
766 #elif EXP_POLY_DEGREE == 3
767 expfpart
= POLY3(fpart
, 9.9992520e-1f
, 6.9583356e-1f
, 2.2606716e-1f
, 7.8024521e-2f
);
768 #elif EXP_POLY_DEGREE == 2
769 expfpart
= POLY2(fpart
, 1.0017247f
, 6.5763628e-1f
, 3.3718944e-1f
);
774 return _mm_mul_ps(expipart
, expfpart
);
779 * See http://www.devmaster.net/forums/showthread.php?p=43580
784 __m128i expmask
= _mm_set1_epi32(0x7f800000);
785 __m128i mantmask
= _mm_set1_epi32(0x007fffff);
786 __m128 one
= _mm_set1_ps(1.0f
);
788 __m128i i
= _mm_castps_si128(x
);
790 /* exp = (float) exponent(x) */
791 __m128 exp
= _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i
, expmask
), 23), _mm_set1_epi32(127)));
793 /* mant = (float) mantissa(x) */
794 __m128 mant
= _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i
, mantmask
)), one
);
798 /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
799 * These coefficients can be generate with
800 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
802 #if LOG_POLY_DEGREE == 6
803 logmant
= POLY5(mant
, 3.11578814719469302614f
, -3.32419399085241980044f
, 2.59883907202499966007f
, -1.23152682416275988241f
, 0.318212422185251071475f
, -0.0344359067839062357313f
);
804 #elif LOG_POLY_DEGREE == 5
805 logmant
= POLY4(mant
, 2.8882704548164776201f
, -2.52074962577807006663f
, 1.48116647521213171641f
, -0.465725644288844778798f
, 0.0596515482674574969533f
);
806 #elif LOG_POLY_DEGREE == 4
807 logmant
= POLY3(mant
, 2.61761038894603480148f
, -1.75647175389045657003f
, 0.688243882994381274313f
, -0.107254423828329604454f
);
808 #elif LOG_POLY_DEGREE == 3
809 logmant
= POLY2(mant
, 2.28330284476918490682f
, -1.04913055217340124191f
, 0.204446009836232697516f
);
814 /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
815 logmant
= _mm_mul_ps(logmant
, _mm_sub_ps(mant
, one
));
817 return _mm_add_ps(logmant
, exp
);
822 powf4(__m128 x
, __m128 y
)
824 return exp2f4(_mm_mul_ps(log2f4(x
), y
));
827 #endif /* PIPE_ARCH_SSE */
832 * Low-level instruction translators.
837 struct x86_function
*func
,
844 TGSI_EXEC_TEMP_7FFFFFFF_I
,
845 TGSI_EXEC_TEMP_7FFFFFFF_C
) );
850 struct x86_function
*func
,
857 make_xmm( xmm_src
) );
860 static void PIPE_CDECL
864 store
[0] = cosf( store
[0] );
865 store
[1] = cosf( store
[1] );
866 store
[2] = cosf( store
[2] );
867 store
[3] = cosf( store
[3] );
872 struct x86_function
*func
,
876 emit_func_call_dst_src1(
884 static void PIPE_CDECL
885 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
886 __attribute__((force_align_arg_pointer
))
891 #if defined(PIPE_ARCH_SSE)
892 _mm_store_ps(&store
[0], exp2f4( _mm_load_ps(&store
[0]) ));
894 store
[0] = util_fast_exp2( store
[0] );
895 store
[1] = util_fast_exp2( store
[1] );
896 store
[2] = util_fast_exp2( store
[2] );
897 store
[3] = util_fast_exp2( store
[3] );
903 struct x86_function
*func
,
907 emit_func_call_dst_src1(
917 struct x86_function
*func
,
928 struct x86_function
*func
,
937 static void PIPE_CDECL
941 store
[0] = floorf( store
[0] );
942 store
[1] = floorf( store
[1] );
943 store
[2] = floorf( store
[2] );
944 store
[3] = floorf( store
[3] );
949 struct x86_function
*func
,
953 emit_func_call_dst_src1(
961 static void PIPE_CDECL
965 store
[0] -= floorf( store
[0] );
966 store
[1] -= floorf( store
[1] );
967 store
[2] -= floorf( store
[2] );
968 store
[3] -= floorf( store
[3] );
973 struct x86_function
*func
,
977 emit_func_call_dst_src1(
985 static void PIPE_CDECL
986 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
987 __attribute__((force_align_arg_pointer
))
992 #if defined(PIPE_ARCH_SSE)
993 _mm_store_ps(&store
[0], log2f4( _mm_load_ps(&store
[0]) ));
995 store
[0] = util_fast_log2( store
[0] );
996 store
[1] = util_fast_log2( store
[1] );
997 store
[2] = util_fast_log2( store
[2] );
998 store
[3] = util_fast_log2( store
[3] );
1004 struct x86_function
*func
,
1008 emit_func_call_dst_src1(
1018 struct x86_function
*func
,
1024 make_xmm( xmm_dst
),
1025 make_xmm( xmm_src
) );
1029 emit_mul (struct x86_function
*func
,
1035 make_xmm( xmm_dst
),
1036 make_xmm( xmm_src
) );
1041 struct x86_function
*func
,
1048 TGSI_EXEC_TEMP_80000000_I
,
1049 TGSI_EXEC_TEMP_80000000_C
) );
1052 static void PIPE_CDECL
1053 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
1054 __attribute__((force_align_arg_pointer
))
1059 #if defined(PIPE_ARCH_SSE)
1060 _mm_store_ps(&store
[0], powf4( _mm_load_ps(&store
[0]), _mm_load_ps(&store
[4]) ));
1062 store
[0] = util_fast_pow( store
[0], store
[4] );
1063 store
[1] = util_fast_pow( store
[1], store
[5] );
1064 store
[2] = util_fast_pow( store
[2], store
[6] );
1065 store
[3] = util_fast_pow( store
[3], store
[7] );
1071 struct x86_function
*func
,
1077 emit_func_call_dst_src2(
1088 struct x86_function
*func
,
1092 /* On Intel CPUs at least, this is only accurate to 12 bits -- not
1093 * good enough. Need to either emit a proper divide or use the
1094 * iterative technique described below in emit_rsqrt().
1098 make_xmm( xmm_dst
),
1099 make_xmm( xmm_src
) );
1102 static void PIPE_CDECL
1106 store
[0] = floorf( store
[0] + 0.5f
);
1107 store
[1] = floorf( store
[1] + 0.5f
);
1108 store
[2] = floorf( store
[2] + 0.5f
);
1109 store
[3] = floorf( store
[3] + 0.5f
);
1114 struct x86_function
*func
,
1118 emit_func_call_dst_src1(
1128 struct x86_function
*func
,
1133 /* Although rsqrtps() and rcpps() are low precision on some/all SSE
1134 * implementations, it is possible to improve its precision at
1135 * fairly low cost, using a newton/raphson step, as below:
1137 * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
1138 * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
1140 * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
1143 struct x86_reg dst
= make_xmm( xmm_dst
);
1144 struct x86_reg src
= make_xmm( xmm_src
);
1145 struct x86_reg tmp0
= make_xmm( 2 );
1146 struct x86_reg tmp1
= make_xmm( 3 );
1148 assert( xmm_dst
!= xmm_src
);
1149 assert( xmm_dst
!= 2 && xmm_dst
!= 3 );
1150 assert( xmm_src
!= 2 && xmm_src
!= 3 );
1152 sse_movaps( func
, dst
, get_temp( TGSI_EXEC_TEMP_HALF_I
, TGSI_EXEC_TEMP_HALF_C
) );
1153 sse_movaps( func
, tmp0
, get_temp( TGSI_EXEC_TEMP_THREE_I
, TGSI_EXEC_TEMP_THREE_C
) );
1154 sse_rsqrtps( func
, tmp1
, src
);
1155 sse_mulps( func
, src
, tmp1
);
1156 sse_mulps( func
, dst
, tmp1
);
1157 sse_mulps( func
, src
, tmp1
);
1158 sse_subps( func
, tmp0
, src
);
1159 sse_mulps( func
, dst
, tmp0
);
1162 /* On Intel CPUs at least, this is only accurate to 12 bits -- not
1167 make_xmm( xmm_dst
),
1168 make_xmm( xmm_src
) );
1174 struct x86_function
*func
,
1181 TGSI_EXEC_TEMP_80000000_I
,
1182 TGSI_EXEC_TEMP_80000000_C
) );
1185 static void PIPE_CDECL
1189 store
[0] = store
[0] < 0.0f
? -1.0f
: store
[0] > 0.0f
? 1.0f
: 0.0f
;
1190 store
[1] = store
[1] < 0.0f
? -1.0f
: store
[1] > 0.0f
? 1.0f
: 0.0f
;
1191 store
[2] = store
[2] < 0.0f
? -1.0f
: store
[2] > 0.0f
? 1.0f
: 0.0f
;
1192 store
[3] = store
[3] < 0.0f
? -1.0f
: store
[3] > 0.0f
? 1.0f
: 0.0f
;
1197 struct x86_function
*func
,
1201 emit_func_call_dst_src1(
1209 static void PIPE_CDECL
1213 store
[0] = sinf( store
[0] );
1214 store
[1] = sinf( store
[1] );
1215 store
[2] = sinf( store
[2] );
1216 store
[3] = sinf( store
[3] );
1220 emit_sin (struct x86_function
*func
,
1224 emit_func_call_dst_src1(
1234 struct x86_function
*func
,
1240 make_xmm( xmm_dst
),
1241 make_xmm( xmm_src
) );
1256 struct x86_function
*func
,
1258 const struct tgsi_full_src_register
*reg
,
1259 const unsigned chan_index
)
1261 unsigned swizzle
= tgsi_util_get_full_src_register_extswizzle( reg
, chan_index
);
1264 case TGSI_EXTSWIZZLE_X
:
1265 case TGSI_EXTSWIZZLE_Y
:
1266 case TGSI_EXTSWIZZLE_Z
:
1267 case TGSI_EXTSWIZZLE_W
:
1268 switch (reg
->SrcRegister
.File
) {
1269 case TGSI_FILE_CONSTANT
:
1273 reg
->SrcRegister
.Index
,
1275 reg
->SrcRegister
.Indirect
,
1276 reg
->SrcRegisterInd
.File
,
1277 reg
->SrcRegisterInd
.Index
);
1280 case TGSI_FILE_IMMEDIATE
:
1284 reg
->SrcRegister
.Index
,
1288 case TGSI_FILE_INPUT
:
1292 reg
->SrcRegister
.Index
,
1296 case TGSI_FILE_TEMPORARY
:
1300 reg
->SrcRegister
.Index
,
1309 case TGSI_EXTSWIZZLE_ZERO
:
1313 TGSI_EXEC_TEMP_00000000_I
,
1314 TGSI_EXEC_TEMP_00000000_C
);
1317 case TGSI_EXTSWIZZLE_ONE
:
1329 switch( tgsi_util_get_full_src_register_sign_mode( reg
, chan_index
) ) {
1330 case TGSI_UTIL_SIGN_CLEAR
:
1331 emit_abs( func
, xmm
);
1334 case TGSI_UTIL_SIGN_SET
:
1335 emit_setsign( func
, xmm
);
1338 case TGSI_UTIL_SIGN_TOGGLE
:
1339 emit_neg( func
, xmm
);
1342 case TGSI_UTIL_SIGN_KEEP
:
1347 #define FETCH( FUNC, INST, XMM, INDEX, CHAN )\
1348 emit_fetch( FUNC, XMM, &(INST).FullSrcRegisters[INDEX], CHAN )
1356 struct x86_function
*func
,
1358 const struct tgsi_full_dst_register
*reg
,
1359 const struct tgsi_full_instruction
*inst
,
1360 unsigned chan_index
)
1362 switch( reg
->DstRegister
.File
) {
1363 case TGSI_FILE_OUTPUT
:
1367 reg
->DstRegister
.Index
,
1371 case TGSI_FILE_TEMPORARY
:
1375 reg
->DstRegister
.Index
,
1379 case TGSI_FILE_ADDRESS
:
1383 reg
->DstRegister
.Index
,
1391 switch( inst
->Instruction
.Saturate
) {
1395 case TGSI_SAT_ZERO_ONE
:
1399 case TGSI_SAT_MINUS_PLUS_ONE
:
1405 #define STORE( FUNC, INST, XMM, INDEX, CHAN )\
1406 emit_store( FUNC, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN )
1409 static void PIPE_CDECL
1410 fetch_texel( struct tgsi_sampler
**sampler
,
1416 debug_printf("%s sampler: %p (%p) store: %p\n",
1421 debug_printf("lodbias %f\n", store
[12]);
1423 for (j
= 0; j
< 4; j
++)
1424 debug_printf("sample %d texcoord %f %f\n",
1431 float rgba
[NUM_CHANNELS
][QUAD_SIZE
];
1432 (*sampler
)->get_samples(*sampler
,
1436 0.0f
, /*store[12], lodbias */
1439 memcpy( store
, rgba
, 16 * sizeof(float));
1443 for (j
= 0; j
< 4; j
++)
1444 debug_printf("sample %d result %f %f %f %f\n",
1454 * High-level instruction translators.
1458 emit_tex( struct x86_function
*func
,
1459 const struct tgsi_full_instruction
*inst
,
1463 const uint unit
= inst
->FullSrcRegisters
[1].SrcRegister
.Index
;
1464 struct x86_reg args
[2];
1468 switch (inst
->InstructionExtTexture
.Texture
) {
1469 case TGSI_TEXTURE_1D
:
1472 case TGSI_TEXTURE_2D
:
1473 case TGSI_TEXTURE_RECT
:
1476 case TGSI_TEXTURE_SHADOW1D
:
1477 case TGSI_TEXTURE_SHADOW2D
:
1478 case TGSI_TEXTURE_SHADOWRECT
:
1479 case TGSI_TEXTURE_3D
:
1480 case TGSI_TEXTURE_CUBE
:
1489 FETCH( func
, *inst
, 3, 0, 3 );
1495 TGSI_EXEC_TEMP_00000000_I
,
1496 TGSI_EXEC_TEMP_00000000_C
);
1500 /* store lodbias whether enabled or not -- fetch_texel currently
1501 * respects it always.
1504 get_temp( TEMP_R0
, 3 ),
1509 FETCH( func
, *inst
, 3, 0, 3 );
1511 emit_rcp( func
, 3, 3 );
1514 for (i
= 0; i
< count
; i
++) {
1515 FETCH( func
, *inst
, i
, 0, i
);
1524 /* Store in the argument buffer:
1528 get_temp( TEMP_R0
, i
),
1532 args
[0] = get_temp( TEMP_R0
, 0 );
1533 args
[1] = get_sampler_ptr( unit
);
1536 emit_func_call( func
,
1542 /* If all four channels are enabled, could use a pointer to
1543 * dst[0].x instead of TEMP_R0 for store?
1545 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, i
) {
1550 get_temp( TEMP_R0
, i
) );
1552 STORE( func
, *inst
, 0, 0, i
);
1559 struct x86_function
*func
,
1560 const struct tgsi_full_src_register
*reg
)
1562 unsigned uniquemask
;
1563 unsigned unique_count
= 0;
1564 unsigned chan_index
;
1567 /* This mask stores component bits that were already tested. Note that
1568 * we test if the value is less than zero, so 1.0 and 0.0 need not to be
1570 uniquemask
= (1 << TGSI_EXTSWIZZLE_ZERO
) | (1 << TGSI_EXTSWIZZLE_ONE
);
1572 FOR_EACH_CHANNEL( chan_index
) {
1575 /* unswizzle channel */
1576 swizzle
= tgsi_util_get_full_src_register_extswizzle(
1580 /* check if the component has not been already tested */
1581 if( !(uniquemask
& (1 << swizzle
)) ) {
1582 uniquemask
|= 1 << swizzle
;
1584 /* allocate register */
1595 x86_make_reg( file_REG32
, reg_AX
) );
1598 x86_make_reg( file_REG32
, reg_DX
) );
1600 for (i
= 0 ; i
< unique_count
; i
++ ) {
1601 struct x86_reg dataXMM
= make_xmm(i
);
1607 TGSI_EXEC_TEMP_00000000_I
,
1608 TGSI_EXEC_TEMP_00000000_C
),
1614 x86_make_reg( file_REG32
, reg_AX
),
1620 x86_make_reg( file_REG32
, reg_DX
),
1624 x86_make_reg( file_REG32
, reg_AX
),
1625 x86_make_reg( file_REG32
, reg_DX
) );
1632 TGSI_EXEC_TEMP_KILMASK_I
,
1633 TGSI_EXEC_TEMP_KILMASK_C
),
1634 x86_make_reg( file_REG32
, reg_AX
) );
1638 x86_make_reg( file_REG32
, reg_DX
) );
1641 x86_make_reg( file_REG32
, reg_AX
) );
1647 struct x86_function
*func
)
1649 /* XXX todo / fix me */
1655 struct x86_function
*func
,
1656 struct tgsi_full_instruction
*inst
,
1659 unsigned chan_index
;
1661 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1662 FETCH( func
, *inst
, 0, 0, chan_index
);
1663 FETCH( func
, *inst
, 1, 1, chan_index
);
1675 STORE( func
, *inst
, 0, 0, chan_index
);
1681 struct x86_function
*func
,
1682 struct tgsi_full_instruction
*inst
)
1684 unsigned chan_index
;
1686 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1687 FETCH( func
, *inst
, 0, 0, chan_index
);
1688 FETCH( func
, *inst
, 1, 1, chan_index
);
1689 FETCH( func
, *inst
, 2, 2, chan_index
);
1694 TGSI_EXEC_TEMP_00000000_I
,
1695 TGSI_EXEC_TEMP_00000000_C
),
1709 STORE( func
, *inst
, 0, 0, chan_index
);
1715 * Check if inst src/dest regs use indirect addressing into temporary
1719 indirect_temp_reference(const struct tgsi_full_instruction
*inst
)
1722 for (i
= 0; i
< inst
->Instruction
.NumSrcRegs
; i
++) {
1723 const struct tgsi_full_src_register
*reg
= &inst
->FullSrcRegisters
[i
];
1724 if (reg
->SrcRegister
.File
== TGSI_FILE_TEMPORARY
&&
1725 reg
->SrcRegister
.Indirect
)
1728 for (i
= 0; i
< inst
->Instruction
.NumDstRegs
; i
++) {
1729 const struct tgsi_full_dst_register
*reg
= &inst
->FullDstRegisters
[i
];
1730 if (reg
->DstRegister
.File
== TGSI_FILE_TEMPORARY
&&
1731 reg
->DstRegister
.Indirect
)
1740 struct x86_function
*func
,
1741 struct tgsi_full_instruction
*inst
)
1743 unsigned chan_index
;
1745 /* we can't handle indirect addressing into temp register file yet */
1746 if (indirect_temp_reference(inst
))
1749 switch (inst
->Instruction
.Opcode
) {
1750 case TGSI_OPCODE_ARL
:
1751 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1752 FETCH( func
, *inst
, 0, 0, chan_index
);
1753 emit_flr(func
, 0, 0);
1754 emit_f2it( func
, 0 );
1755 STORE( func
, *inst
, 0, 0, chan_index
);
1759 case TGSI_OPCODE_MOV
:
1760 case TGSI_OPCODE_SWZ
:
1761 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1762 FETCH( func
, *inst
, 0, 0, chan_index
);
1763 STORE( func
, *inst
, 0, 0, chan_index
);
1767 case TGSI_OPCODE_LIT
:
1768 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1769 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) ) {
1775 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ) {
1776 STORE( func
, *inst
, 0, 0, CHAN_X
);
1778 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) ) {
1779 STORE( func
, *inst
, 0, 0, CHAN_W
);
1782 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ||
1783 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
1784 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ) {
1785 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1790 TGSI_EXEC_TEMP_00000000_I
,
1791 TGSI_EXEC_TEMP_00000000_C
) );
1792 STORE( func
, *inst
, 0, 0, CHAN_Y
);
1794 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
1795 /* XMM[1] = SrcReg[0].yyyy */
1796 FETCH( func
, *inst
, 1, 0, CHAN_Y
);
1797 /* XMM[1] = max(XMM[1], 0) */
1802 TGSI_EXEC_TEMP_00000000_I
,
1803 TGSI_EXEC_TEMP_00000000_C
) );
1804 /* XMM[2] = SrcReg[0].wwww */
1805 FETCH( func
, *inst
, 2, 0, CHAN_W
);
1806 /* XMM[2] = min(XMM[2], 128.0) */
1811 TGSI_EXEC_TEMP_128_I
,
1812 TGSI_EXEC_TEMP_128_C
) );
1813 /* XMM[2] = max(XMM[2], -128.0) */
1818 TGSI_EXEC_TEMP_MINUS_128_I
,
1819 TGSI_EXEC_TEMP_MINUS_128_C
) );
1820 emit_pow( func
, 3, 1, 1, 2 );
1821 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1835 STORE( func
, *inst
, 2, 0, CHAN_Z
);
1840 case TGSI_OPCODE_RCP
:
1841 /* TGSI_OPCODE_RECIP */
1842 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1843 emit_rcp( func
, 0, 0 );
1844 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1845 STORE( func
, *inst
, 0, 0, chan_index
);
1849 case TGSI_OPCODE_RSQ
:
1850 /* TGSI_OPCODE_RECIPSQRT */
1851 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1852 emit_abs( func
, 0 );
1853 emit_rsqrt( func
, 1, 0 );
1854 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1855 STORE( func
, *inst
, 1, 0, chan_index
);
1859 case TGSI_OPCODE_EXP
:
1860 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1861 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ||
1862 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
1863 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1864 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1865 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
1866 emit_MOV( func
, 1, 0 );
1867 emit_flr( func
, 2, 1 );
1868 /* dst.x = ex2(floor(src.x)) */
1869 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
)) {
1870 emit_MOV( func
, 2, 1 );
1871 emit_ex2( func
, 3, 2 );
1872 STORE( func
, *inst
, 2, 0, CHAN_X
);
1874 /* dst.y = src.x - floor(src.x) */
1875 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
1876 emit_MOV( func
, 2, 0 );
1877 emit_sub( func
, 2, 1 );
1878 STORE( func
, *inst
, 2, 0, CHAN_Y
);
1881 /* dst.z = ex2(src.x) */
1882 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
1883 emit_ex2( func
, 3, 0 );
1884 STORE( func
, *inst
, 0, 0, CHAN_Z
);
1888 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
)) {
1889 emit_tempf( func
, 0, TEMP_ONE_I
, TEMP_ONE_C
);
1890 STORE( func
, *inst
, 0, 0, CHAN_W
);
1894 case TGSI_OPCODE_LOG
:
1895 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1896 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ||
1897 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
1898 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1899 emit_abs( func
, 0 );
1900 emit_MOV( func
, 1, 0 );
1901 emit_lg2( func
, 2, 1 );
1902 /* dst.z = lg2(abs(src.x)) */
1903 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
1904 STORE( func
, *inst
, 1, 0, CHAN_Z
);
1906 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1907 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
1908 emit_flr( func
, 2, 1 );
1909 /* dst.x = floor(lg2(abs(src.x))) */
1910 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
)) {
1911 STORE( func
, *inst
, 1, 0, CHAN_X
);
1913 /* dst.x = abs(src)/ex2(floor(lg2(abs(src.x)))) */
1914 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
1915 emit_ex2( func
, 2, 1 );
1916 emit_rcp( func
, 1, 1 );
1917 emit_mul( func
, 0, 1 );
1918 STORE( func
, *inst
, 0, 0, CHAN_Y
);
1923 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
)) {
1924 emit_tempf( func
, 0, TEMP_ONE_I
, TEMP_ONE_C
);
1925 STORE( func
, *inst
, 0, 0, CHAN_W
);
1929 case TGSI_OPCODE_MUL
:
1930 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1931 FETCH( func
, *inst
, 0, 0, chan_index
);
1932 FETCH( func
, *inst
, 1, 1, chan_index
);
1933 emit_mul( func
, 0, 1 );
1934 STORE( func
, *inst
, 0, 0, chan_index
);
1938 case TGSI_OPCODE_ADD
:
1939 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1940 FETCH( func
, *inst
, 0, 0, chan_index
);
1941 FETCH( func
, *inst
, 1, 1, chan_index
);
1942 emit_add( func
, 0, 1 );
1943 STORE( func
, *inst
, 0, 0, chan_index
);
1947 case TGSI_OPCODE_DP3
:
1948 /* TGSI_OPCODE_DOT3 */
1949 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1950 FETCH( func
, *inst
, 1, 1, CHAN_X
);
1951 emit_mul( func
, 0, 1 );
1952 FETCH( func
, *inst
, 1, 0, CHAN_Y
);
1953 FETCH( func
, *inst
, 2, 1, CHAN_Y
);
1954 emit_mul( func
, 1, 2 );
1955 emit_add( func
, 0, 1 );
1956 FETCH( func
, *inst
, 1, 0, CHAN_Z
);
1957 FETCH( func
, *inst
, 2, 1, CHAN_Z
);
1958 emit_mul( func
, 1, 2 );
1959 emit_add( func
, 0, 1 );
1960 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1961 STORE( func
, *inst
, 0, 0, chan_index
);
1965 case TGSI_OPCODE_DP4
:
1966 /* TGSI_OPCODE_DOT4 */
1967 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1968 FETCH( func
, *inst
, 1, 1, CHAN_X
);
1969 emit_mul( func
, 0, 1 );
1970 FETCH( func
, *inst
, 1, 0, CHAN_Y
);
1971 FETCH( func
, *inst
, 2, 1, CHAN_Y
);
1972 emit_mul( func
, 1, 2 );
1973 emit_add( func
, 0, 1 );
1974 FETCH( func
, *inst
, 1, 0, CHAN_Z
);
1975 FETCH( func
, *inst
, 2, 1, CHAN_Z
);
1976 emit_mul(func
, 1, 2 );
1977 emit_add(func
, 0, 1 );
1978 FETCH( func
, *inst
, 1, 0, CHAN_W
);
1979 FETCH( func
, *inst
, 2, 1, CHAN_W
);
1980 emit_mul( func
, 1, 2 );
1981 emit_add( func
, 0, 1 );
1982 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1983 STORE( func
, *inst
, 0, 0, chan_index
);
1987 case TGSI_OPCODE_DST
:
1988 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) {
1994 STORE( func
, *inst
, 0, 0, CHAN_X
);
1996 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) {
1997 FETCH( func
, *inst
, 0, 0, CHAN_Y
);
1998 FETCH( func
, *inst
, 1, 1, CHAN_Y
);
1999 emit_mul( func
, 0, 1 );
2000 STORE( func
, *inst
, 0, 0, CHAN_Y
);
2002 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) {
2003 FETCH( func
, *inst
, 0, 0, CHAN_Z
);
2004 STORE( func
, *inst
, 0, 0, CHAN_Z
);
2006 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) {
2007 FETCH( func
, *inst
, 0, 1, CHAN_W
);
2008 STORE( func
, *inst
, 0, 0, CHAN_W
);
2012 case TGSI_OPCODE_MIN
:
2013 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2014 FETCH( func
, *inst
, 0, 0, chan_index
);
2015 FETCH( func
, *inst
, 1, 1, chan_index
);
2020 STORE( func
, *inst
, 0, 0, chan_index
);
2024 case TGSI_OPCODE_MAX
:
2025 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2026 FETCH( func
, *inst
, 0, 0, chan_index
);
2027 FETCH( func
, *inst
, 1, 1, chan_index
);
2032 STORE( func
, *inst
, 0, 0, chan_index
);
2036 case TGSI_OPCODE_SLT
:
2037 /* TGSI_OPCODE_SETLT */
2038 emit_setcc( func
, inst
, cc_LessThan
);
2041 case TGSI_OPCODE_SGE
:
2042 /* TGSI_OPCODE_SETGE */
2043 emit_setcc( func
, inst
, cc_NotLessThan
);
2046 case TGSI_OPCODE_MAD
:
2047 /* TGSI_OPCODE_MADD */
2048 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2049 FETCH( func
, *inst
, 0, 0, chan_index
);
2050 FETCH( func
, *inst
, 1, 1, chan_index
);
2051 FETCH( func
, *inst
, 2, 2, chan_index
);
2052 emit_mul( func
, 0, 1 );
2053 emit_add( func
, 0, 2 );
2054 STORE( func
, *inst
, 0, 0, chan_index
);
2058 case TGSI_OPCODE_SUB
:
2059 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2060 FETCH( func
, *inst
, 0, 0, chan_index
);
2061 FETCH( func
, *inst
, 1, 1, chan_index
);
2062 emit_sub( func
, 0, 1 );
2063 STORE( func
, *inst
, 0, 0, chan_index
);
2067 case TGSI_OPCODE_LRP
:
2068 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2069 FETCH( func
, *inst
, 0, 0, chan_index
);
2070 FETCH( func
, *inst
, 1, 1, chan_index
);
2071 FETCH( func
, *inst
, 2, 2, chan_index
);
2072 emit_sub( func
, 1, 2 );
2073 emit_mul( func
, 0, 1 );
2074 emit_add( func
, 0, 2 );
2075 STORE( func
, *inst
, 0, 0, chan_index
);
2079 case TGSI_OPCODE_CND
:
2083 case TGSI_OPCODE_CND0
:
2087 case TGSI_OPCODE_DP2A
:
2088 FETCH( func
, *inst
, 0, 0, CHAN_X
); /* xmm0 = src[0].x */
2089 FETCH( func
, *inst
, 1, 1, CHAN_X
); /* xmm1 = src[1].x */
2090 emit_mul( func
, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
2091 FETCH( func
, *inst
, 1, 0, CHAN_Y
); /* xmm1 = src[0].y */
2092 FETCH( func
, *inst
, 2, 1, CHAN_Y
); /* xmm2 = src[1].y */
2093 emit_mul( func
, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
2094 emit_add( func
, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2095 FETCH( func
, *inst
, 1, 2, CHAN_X
); /* xmm1 = src[2].x */
2096 emit_add( func
, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2097 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2098 STORE( func
, *inst
, 0, 0, chan_index
); /* dest[ch] = xmm0 */
2102 case TGSI_OPCODE_FRC
:
2103 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2104 FETCH( func
, *inst
, 0, 0, chan_index
);
2105 emit_frc( func
, 0, 0 );
2106 STORE( func
, *inst
, 0, 0, chan_index
);
2110 case TGSI_OPCODE_CLAMP
:
2114 case TGSI_OPCODE_FLR
:
2115 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2116 FETCH( func
, *inst
, 0, 0, chan_index
);
2117 emit_flr( func
, 0, 0 );
2118 STORE( func
, *inst
, 0, 0, chan_index
);
2122 case TGSI_OPCODE_ROUND
:
2123 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2124 FETCH( func
, *inst
, 0, 0, chan_index
);
2125 emit_rnd( func
, 0, 0 );
2126 STORE( func
, *inst
, 0, 0, chan_index
);
2130 case TGSI_OPCODE_EX2
:
2131 FETCH( func
, *inst
, 0, 0, CHAN_X
);
2132 emit_ex2( func
, 0, 0 );
2133 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2134 STORE( func
, *inst
, 0, 0, chan_index
);
2138 case TGSI_OPCODE_LG2
:
2139 FETCH( func
, *inst
, 0, 0, CHAN_X
);
2140 emit_lg2( func
, 0, 0 );
2141 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2142 STORE( func
, *inst
, 0, 0, chan_index
);
2146 case TGSI_OPCODE_POW
:
2147 FETCH( func
, *inst
, 0, 0, CHAN_X
);
2148 FETCH( func
, *inst
, 1, 1, CHAN_X
);
2149 emit_pow( func
, 0, 0, 0, 1 );
2150 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2151 STORE( func
, *inst
, 0, 0, chan_index
);
2155 case TGSI_OPCODE_XPD
:
2156 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
2157 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ) {
2158 FETCH( func
, *inst
, 1, 1, CHAN_Z
);
2159 FETCH( func
, *inst
, 3, 0, CHAN_Z
);
2161 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
2162 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
2163 FETCH( func
, *inst
, 0, 0, CHAN_Y
);
2164 FETCH( func
, *inst
, 4, 1, CHAN_Y
);
2166 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) {
2167 emit_MOV( func
, 2, 0 );
2168 emit_mul( func
, 2, 1 );
2169 emit_MOV( func
, 5, 3 );
2170 emit_mul( func
, 5, 4 );
2171 emit_sub( func
, 2, 5 );
2172 STORE( func
, *inst
, 2, 0, CHAN_X
);
2174 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ||
2175 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
2176 FETCH( func
, *inst
, 2, 1, CHAN_X
);
2177 FETCH( func
, *inst
, 5, 0, CHAN_X
);
2179 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) {
2180 emit_mul( func
, 3, 2 );
2181 emit_mul( func
, 1, 5 );
2182 emit_sub( func
, 3, 1 );
2183 STORE( func
, *inst
, 3, 0, CHAN_Y
);
2185 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) {
2186 emit_mul( func
, 5, 4 );
2187 emit_mul( func
, 0, 2 );
2188 emit_sub( func
, 5, 0 );
2189 STORE( func
, *inst
, 5, 0, CHAN_Z
);
2191 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) {
2197 STORE( func
, *inst
, 0, 0, CHAN_W
);
2201 case TGSI_OPCODE_ABS
:
2202 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2203 FETCH( func
, *inst
, 0, 0, chan_index
);
2204 emit_abs( func
, 0) ;
2206 STORE( func
, *inst
, 0, 0, chan_index
);
2210 case TGSI_OPCODE_RCC
:
2214 case TGSI_OPCODE_DPH
:
2215 FETCH( func
, *inst
, 0, 0, CHAN_X
);
2216 FETCH( func
, *inst
, 1, 1, CHAN_X
);
2217 emit_mul( func
, 0, 1 );
2218 FETCH( func
, *inst
, 1, 0, CHAN_Y
);
2219 FETCH( func
, *inst
, 2, 1, CHAN_Y
);
2220 emit_mul( func
, 1, 2 );
2221 emit_add( func
, 0, 1 );
2222 FETCH( func
, *inst
, 1, 0, CHAN_Z
);
2223 FETCH( func
, *inst
, 2, 1, CHAN_Z
);
2224 emit_mul( func
, 1, 2 );
2225 emit_add( func
, 0, 1 );
2226 FETCH( func
, *inst
, 1, 1, CHAN_W
);
2227 emit_add( func
, 0, 1 );
2228 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2229 STORE( func
, *inst
, 0, 0, chan_index
);
2233 case TGSI_OPCODE_COS
:
2234 FETCH( func
, *inst
, 0, 0, CHAN_X
);
2235 emit_cos( func
, 0, 0 );
2236 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2237 STORE( func
, *inst
, 0, 0, chan_index
);
2241 case TGSI_OPCODE_DDX
:
2245 case TGSI_OPCODE_DDY
:
2249 case TGSI_OPCODE_KILP
:
2250 /* predicated kill */
2252 return 0; /* XXX fix me */
2255 case TGSI_OPCODE_KIL
:
2256 /* conditional kill */
2257 emit_kil( func
, &inst
->FullSrcRegisters
[0] );
2260 case TGSI_OPCODE_PK2H
:
2264 case TGSI_OPCODE_PK2US
:
2268 case TGSI_OPCODE_PK4B
:
2272 case TGSI_OPCODE_PK4UB
:
2276 case TGSI_OPCODE_RFL
:
2280 case TGSI_OPCODE_SEQ
:
2284 case TGSI_OPCODE_SFL
:
2288 case TGSI_OPCODE_SGT
:
2292 case TGSI_OPCODE_SIN
:
2293 FETCH( func
, *inst
, 0, 0, CHAN_X
);
2294 emit_sin( func
, 0, 0 );
2295 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2296 STORE( func
, *inst
, 0, 0, chan_index
);
2300 case TGSI_OPCODE_SLE
:
2304 case TGSI_OPCODE_SNE
:
2308 case TGSI_OPCODE_STR
:
2312 case TGSI_OPCODE_TEX
:
2313 emit_tex( func
, inst
, FALSE
, FALSE
);
2316 case TGSI_OPCODE_TXD
:
2320 case TGSI_OPCODE_UP2H
:
2324 case TGSI_OPCODE_UP2US
:
2328 case TGSI_OPCODE_UP4B
:
2332 case TGSI_OPCODE_UP4UB
:
2336 case TGSI_OPCODE_X2D
:
2340 case TGSI_OPCODE_ARA
:
2344 case TGSI_OPCODE_ARR
:
2345 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2346 FETCH( func
, *inst
, 0, 0, chan_index
);
2347 emit_rnd( func
, 0, 0 );
2348 emit_f2it( func
, 0 );
2349 STORE( func
, *inst
, 0, 0, chan_index
);
2353 case TGSI_OPCODE_BRA
:
2357 case TGSI_OPCODE_CAL
:
2361 case TGSI_OPCODE_RET
:
2365 case TGSI_OPCODE_END
:
2368 case TGSI_OPCODE_SSG
:
2369 /* TGSI_OPCODE_SGN */
2370 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2371 FETCH( func
, *inst
, 0, 0, chan_index
);
2372 emit_sgn( func
, 0, 0 );
2373 STORE( func
, *inst
, 0, 0, chan_index
);
2377 case TGSI_OPCODE_CMP
:
2378 emit_cmp (func
, inst
);
2381 case TGSI_OPCODE_SCS
:
2382 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) {
2383 FETCH( func
, *inst
, 0, 0, CHAN_X
);
2384 emit_cos( func
, 0, 0 );
2385 STORE( func
, *inst
, 0, 0, CHAN_X
);
2387 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) {
2388 FETCH( func
, *inst
, 0, 0, CHAN_X
);
2389 emit_sin( func
, 0, 0 );
2390 STORE( func
, *inst
, 0, 0, CHAN_Y
);
2392 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) {
2396 TGSI_EXEC_TEMP_00000000_I
,
2397 TGSI_EXEC_TEMP_00000000_C
);
2398 STORE( func
, *inst
, 0, 0, CHAN_Z
);
2400 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) {
2406 STORE( func
, *inst
, 0, 0, CHAN_W
);
2410 case TGSI_OPCODE_TXB
:
2411 emit_tex( func
, inst
, TRUE
, FALSE
);
2414 case TGSI_OPCODE_NRM
:
2416 case TGSI_OPCODE_NRM4
:
2417 /* 3 or 4-component normalization */
2419 uint dims
= (inst
->Instruction
.Opcode
== TGSI_OPCODE_NRM
) ? 3 : 4;
2421 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_X
) ||
2422 IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_Y
) ||
2423 IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_Z
) ||
2424 (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_W
) && dims
== 4)) {
2426 /* NOTE: Cannot use xmm regs 2/3 here (see emit_rsqrt() above). */
2429 /* xmm0 = src.x * src.x */
2430 FETCH(func
, *inst
, 0, 0, CHAN_X
);
2431 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_X
)) {
2432 emit_MOV(func
, 4, 0);
2434 emit_mul(func
, 0, 0);
2437 /* xmm0 = xmm0 + src.y * src.y */
2438 FETCH(func
, *inst
, 1, 0, CHAN_Y
);
2439 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_Y
)) {
2440 emit_MOV(func
, 5, 1);
2442 emit_mul(func
, 1, 1);
2443 emit_add(func
, 0, 1);
2446 /* xmm0 = xmm0 + src.z * src.z */
2447 FETCH(func
, *inst
, 1, 0, CHAN_Z
);
2448 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_Z
)) {
2449 emit_MOV(func
, 6, 1);
2451 emit_mul(func
, 1, 1);
2452 emit_add(func
, 0, 1);
2456 /* xmm0 = xmm0 + src.w * src.w */
2457 FETCH(func
, *inst
, 1, 0, CHAN_W
);
2458 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_W
)) {
2459 emit_MOV(func
, 7, 1);
2461 emit_mul(func
, 1, 1);
2462 emit_add(func
, 0, 1);
2465 /* xmm1 = 1 / sqrt(xmm0) */
2466 emit_rsqrt(func
, 1, 0);
2468 /* dst.x = xmm1 * src.x */
2469 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_X
)) {
2470 emit_mul(func
, 4, 1);
2471 STORE(func
, *inst
, 4, 0, CHAN_X
);
2474 /* dst.y = xmm1 * src.y */
2475 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_Y
)) {
2476 emit_mul(func
, 5, 1);
2477 STORE(func
, *inst
, 5, 0, CHAN_Y
);
2480 /* dst.z = xmm1 * src.z */
2481 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_Z
)) {
2482 emit_mul(func
, 6, 1);
2483 STORE(func
, *inst
, 6, 0, CHAN_Z
);
2486 /* dst.w = xmm1 * src.w */
2487 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_X
) && dims
== 4) {
2488 emit_mul(func
, 7, 1);
2489 STORE(func
, *inst
, 7, 0, CHAN_W
);
2494 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_W
) && dims
== 3) {
2495 emit_tempf(func
, 0, TEMP_ONE_I
, TEMP_ONE_C
);
2496 STORE(func
, *inst
, 0, 0, CHAN_W
);
2501 case TGSI_OPCODE_DIV
:
2505 case TGSI_OPCODE_DP2
:
2506 FETCH( func
, *inst
, 0, 0, CHAN_X
); /* xmm0 = src[0].x */
2507 FETCH( func
, *inst
, 1, 1, CHAN_X
); /* xmm1 = src[1].x */
2508 emit_mul( func
, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
2509 FETCH( func
, *inst
, 1, 0, CHAN_Y
); /* xmm1 = src[0].y */
2510 FETCH( func
, *inst
, 2, 1, CHAN_Y
); /* xmm2 = src[1].y */
2511 emit_mul( func
, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
2512 emit_add( func
, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2513 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2514 STORE( func
, *inst
, 0, 0, chan_index
); /* dest[ch] = xmm0 */
2518 case TGSI_OPCODE_TXL
:
2519 emit_tex( func
, inst
, TRUE
, FALSE
);
2522 case TGSI_OPCODE_TXP
:
2523 emit_tex( func
, inst
, FALSE
, TRUE
);
2526 case TGSI_OPCODE_BRK
:
2530 case TGSI_OPCODE_IF
:
2534 case TGSI_OPCODE_LOOP
:
2538 case TGSI_OPCODE_REP
:
2542 case TGSI_OPCODE_ELSE
:
2546 case TGSI_OPCODE_ENDIF
:
2550 case TGSI_OPCODE_ENDLOOP
:
2554 case TGSI_OPCODE_ENDREP
:
2558 case TGSI_OPCODE_PUSHA
:
2562 case TGSI_OPCODE_POPA
:
2566 case TGSI_OPCODE_CEIL
:
2570 case TGSI_OPCODE_I2F
:
2574 case TGSI_OPCODE_NOT
:
2578 case TGSI_OPCODE_TRUNC
:
2579 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2580 FETCH( func
, *inst
, 0, 0, chan_index
);
2581 emit_f2it( func
, 0 );
2582 emit_i2f( func
, 0 );
2583 STORE( func
, *inst
, 0, 0, chan_index
);
2587 case TGSI_OPCODE_SHL
:
2591 case TGSI_OPCODE_SHR
:
2595 case TGSI_OPCODE_AND
:
2599 case TGSI_OPCODE_OR
:
2603 case TGSI_OPCODE_MOD
:
2607 case TGSI_OPCODE_XOR
:
2611 case TGSI_OPCODE_SAD
:
2615 case TGSI_OPCODE_TXF
:
2619 case TGSI_OPCODE_TXQ
:
2623 case TGSI_OPCODE_CONT
:
2627 case TGSI_OPCODE_EMIT
:
2631 case TGSI_OPCODE_ENDPRIM
:
2644 struct x86_function
*func
,
2645 struct tgsi_full_declaration
*decl
)
2647 if( decl
->Declaration
.File
== TGSI_FILE_INPUT
) {
2648 unsigned first
, last
, mask
;
2651 first
= decl
->DeclarationRange
.First
;
2652 last
= decl
->DeclarationRange
.Last
;
2653 mask
= decl
->Declaration
.UsageMask
;
2655 for( i
= first
; i
<= last
; i
++ ) {
2656 for( j
= 0; j
< NUM_CHANNELS
; j
++ ) {
2657 if( mask
& (1 << j
) ) {
2658 switch( decl
->Declaration
.Interpolate
) {
2659 case TGSI_INTERPOLATE_CONSTANT
:
2660 emit_coef_a0( func
, 0, i
, j
);
2661 emit_inputs( func
, 0, i
, j
);
2664 case TGSI_INTERPOLATE_LINEAR
:
2665 emit_tempf( func
, 0, 0, TGSI_SWIZZLE_X
);
2666 emit_coef_dadx( func
, 1, i
, j
);
2667 emit_tempf( func
, 2, 0, TGSI_SWIZZLE_Y
);
2668 emit_coef_dady( func
, 3, i
, j
);
2669 emit_mul( func
, 0, 1 ); /* x * dadx */
2670 emit_coef_a0( func
, 4, i
, j
);
2671 emit_mul( func
, 2, 3 ); /* y * dady */
2672 emit_add( func
, 0, 4 ); /* x * dadx + a0 */
2673 emit_add( func
, 0, 2 ); /* x * dadx + y * dady + a0 */
2674 emit_inputs( func
, 0, i
, j
);
2677 case TGSI_INTERPOLATE_PERSPECTIVE
:
2678 emit_tempf( func
, 0, 0, TGSI_SWIZZLE_X
);
2679 emit_coef_dadx( func
, 1, i
, j
);
2680 emit_tempf( func
, 2, 0, TGSI_SWIZZLE_Y
);
2681 emit_coef_dady( func
, 3, i
, j
);
2682 emit_mul( func
, 0, 1 ); /* x * dadx */
2683 emit_tempf( func
, 4, 0, TGSI_SWIZZLE_W
);
2684 emit_coef_a0( func
, 5, i
, j
);
2685 emit_rcp( func
, 4, 4 ); /* 1.0 / w */
2686 emit_mul( func
, 2, 3 ); /* y * dady */
2687 emit_add( func
, 0, 5 ); /* x * dadx + a0 */
2688 emit_add( func
, 0, 2 ); /* x * dadx + y * dady + a0 */
2689 emit_mul( func
, 0, 4 ); /* (x * dadx + y * dady + a0) / w */
2690 emit_inputs( func
, 0, i
, j
);
2703 static void aos_to_soa( struct x86_function
*func
,
2709 struct x86_reg soa_input
= x86_make_reg( file_REG32
, reg_AX
);
2710 struct x86_reg aos_input
= x86_make_reg( file_REG32
, reg_BX
);
2711 struct x86_reg num_inputs
= x86_make_reg( file_REG32
, reg_CX
);
2712 struct x86_reg stride
= x86_make_reg( file_REG32
, reg_DX
);
2717 x86_push( func
, x86_make_reg( file_REG32
, reg_BX
) );
2719 x86_mov( func
, aos_input
, x86_fn_arg( func
, arg_aos
) );
2720 x86_mov( func
, soa_input
, x86_fn_arg( func
, arg_machine
) );
2721 x86_lea( func
, soa_input
,
2722 x86_make_disp( soa_input
,
2723 Offset(struct tgsi_exec_machine
, Inputs
) ) );
2724 x86_mov( func
, num_inputs
, x86_fn_arg( func
, arg_num
) );
2725 x86_mov( func
, stride
, x86_fn_arg( func
, arg_stride
) );
2728 inner_loop
= x86_get_label( func
);
2730 x86_push( func
, aos_input
);
2731 sse_movlps( func
, make_xmm( 0 ), x86_make_disp( aos_input
, 0 ) );
2732 sse_movlps( func
, make_xmm( 3 ), x86_make_disp( aos_input
, 8 ) );
2733 x86_add( func
, aos_input
, stride
);
2734 sse_movhps( func
, make_xmm( 0 ), x86_make_disp( aos_input
, 0 ) );
2735 sse_movhps( func
, make_xmm( 3 ), x86_make_disp( aos_input
, 8 ) );
2736 x86_add( func
, aos_input
, stride
);
2737 sse_movlps( func
, make_xmm( 1 ), x86_make_disp( aos_input
, 0 ) );
2738 sse_movlps( func
, make_xmm( 4 ), x86_make_disp( aos_input
, 8 ) );
2739 x86_add( func
, aos_input
, stride
);
2740 sse_movhps( func
, make_xmm( 1 ), x86_make_disp( aos_input
, 0 ) );
2741 sse_movhps( func
, make_xmm( 4 ), x86_make_disp( aos_input
, 8 ) );
2742 x86_pop( func
, aos_input
);
2744 sse_movaps( func
, make_xmm( 2 ), make_xmm( 0 ) );
2745 sse_movaps( func
, make_xmm( 5 ), make_xmm( 3 ) );
2746 sse_shufps( func
, make_xmm( 0 ), make_xmm( 1 ), 0x88 );
2747 sse_shufps( func
, make_xmm( 2 ), make_xmm( 1 ), 0xdd );
2748 sse_shufps( func
, make_xmm( 3 ), make_xmm( 4 ), 0x88 );
2749 sse_shufps( func
, make_xmm( 5 ), make_xmm( 4 ), 0xdd );
2751 sse_movups( func
, x86_make_disp( soa_input
, 0 ), make_xmm( 0 ) );
2752 sse_movups( func
, x86_make_disp( soa_input
, 16 ), make_xmm( 2 ) );
2753 sse_movups( func
, x86_make_disp( soa_input
, 32 ), make_xmm( 3 ) );
2754 sse_movups( func
, x86_make_disp( soa_input
, 48 ), make_xmm( 5 ) );
2756 /* Advance to next input */
2757 x86_lea( func
, aos_input
, x86_make_disp(aos_input
, 16) );
2758 x86_lea( func
, soa_input
, x86_make_disp(soa_input
, 64) );
2760 /* while --num_inputs */
2761 x86_dec( func
, num_inputs
);
2762 x86_jcc( func
, cc_NE
, inner_loop
);
2765 x86_pop( func
, x86_make_reg( file_REG32
, reg_BX
) );
2768 static void soa_to_aos( struct x86_function
*func
,
2774 struct x86_reg soa_output
= x86_make_reg( file_REG32
, reg_AX
);
2775 struct x86_reg aos_output
= x86_make_reg( file_REG32
, reg_BX
);
2776 struct x86_reg num_outputs
= x86_make_reg( file_REG32
, reg_CX
);
2777 struct x86_reg temp
= x86_make_reg( file_REG32
, reg_DX
);
2781 x86_push( func
, x86_make_reg( file_REG32
, reg_BX
) );
2783 x86_mov( func
, aos_output
, x86_fn_arg( func
, arg_aos
) );
2784 x86_mov( func
, soa_output
, x86_fn_arg( func
, arg_machine
) );
2785 x86_lea( func
, soa_output
,
2786 x86_make_disp( soa_output
,
2787 Offset(struct tgsi_exec_machine
, Outputs
) ) );
2788 x86_mov( func
, num_outputs
, x86_fn_arg( func
, arg_num
) );
2791 inner_loop
= x86_get_label( func
);
2793 sse_movups( func
, make_xmm( 0 ), x86_make_disp( soa_output
, 0 ) );
2794 sse_movups( func
, make_xmm( 1 ), x86_make_disp( soa_output
, 16 ) );
2795 sse_movups( func
, make_xmm( 3 ), x86_make_disp( soa_output
, 32 ) );
2796 sse_movups( func
, make_xmm( 4 ), x86_make_disp( soa_output
, 48 ) );
2798 sse_movaps( func
, make_xmm( 2 ), make_xmm( 0 ) );
2799 sse_movaps( func
, make_xmm( 5 ), make_xmm( 3 ) );
2800 sse_unpcklps( func
, make_xmm( 0 ), make_xmm( 1 ) );
2801 sse_unpckhps( func
, make_xmm( 2 ), make_xmm( 1 ) );
2802 sse_unpcklps( func
, make_xmm( 3 ), make_xmm( 4 ) );
2803 sse_unpckhps( func
, make_xmm( 5 ), make_xmm( 4 ) );
2805 x86_mov( func
, temp
, x86_fn_arg( func
, arg_stride
) );
2806 x86_push( func
, aos_output
);
2807 sse_movlps( func
, x86_make_disp( aos_output
, 0 ), make_xmm( 0 ) );
2808 sse_movlps( func
, x86_make_disp( aos_output
, 8 ), make_xmm( 3 ) );
2809 x86_add( func
, aos_output
, temp
);
2810 sse_movhps( func
, x86_make_disp( aos_output
, 0 ), make_xmm( 0 ) );
2811 sse_movhps( func
, x86_make_disp( aos_output
, 8 ), make_xmm( 3 ) );
2812 x86_add( func
, aos_output
, temp
);
2813 sse_movlps( func
, x86_make_disp( aos_output
, 0 ), make_xmm( 2 ) );
2814 sse_movlps( func
, x86_make_disp( aos_output
, 8 ), make_xmm( 5 ) );
2815 x86_add( func
, aos_output
, temp
);
2816 sse_movhps( func
, x86_make_disp( aos_output
, 0 ), make_xmm( 2 ) );
2817 sse_movhps( func
, x86_make_disp( aos_output
, 8 ), make_xmm( 5 ) );
2818 x86_pop( func
, aos_output
);
2820 /* Advance to next output */
2821 x86_lea( func
, aos_output
, x86_make_disp(aos_output
, 16) );
2822 x86_lea( func
, soa_output
, x86_make_disp(soa_output
, 64) );
2824 /* while --num_outputs */
2825 x86_dec( func
, num_outputs
);
2826 x86_jcc( func
, cc_NE
, inner_loop
);
2829 x86_pop( func
, x86_make_reg( file_REG32
, reg_BX
) );
2833 * Translate a TGSI vertex/fragment shader to SSE2 code.
2834 * Slightly different things are done for vertex vs. fragment shaders.
2836 * \param tokens the TGSI input shader
2837 * \param func the output SSE code/function
2838 * \param immediates buffer to place immediates, later passed to SSE func
2839 * \param return 1 for success, 0 if translation failed
2843 const struct tgsi_token
*tokens
,
2844 struct x86_function
*func
,
2845 float (*immediates
)[4],
2846 boolean do_swizzles
)
2848 struct tgsi_parse_context parse
;
2850 uint num_immediates
= 0;
2854 func
->csr
= func
->store
;
2856 tgsi_parse_init( &parse
, tokens
);
2858 /* Can't just use EDI, EBX without save/restoring them:
2860 x86_push( func
, x86_make_reg( file_REG32
, reg_BX
) );
2861 x86_push( func
, x86_make_reg( file_REG32
, reg_DI
) );
2864 * Different function args for vertex/fragment shaders:
2866 if (parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_VERTEX
) {
2872 6 ); /* input_stride */
2878 x86_fn_arg( func
, 1 ) );
2882 x86_fn_arg( func
, 2 ) );
2885 get_immediate_base(),
2886 x86_fn_arg( func
, 3 ) );
2888 if (parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_FRAGMENT
) {
2892 x86_fn_arg( func
, 4 ) );
2898 x86_make_disp( get_machine_base(),
2899 Offset( struct tgsi_exec_machine
, Samplers
) ) );
2902 while( !tgsi_parse_end_of_tokens( &parse
) && ok
) {
2903 tgsi_parse_token( &parse
);
2905 switch( parse
.FullToken
.Token
.Type
) {
2906 case TGSI_TOKEN_TYPE_DECLARATION
:
2907 if (parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_FRAGMENT
) {
2910 &parse
.FullToken
.FullDeclaration
);
2914 case TGSI_TOKEN_TYPE_INSTRUCTION
:
2915 ok
= emit_instruction(
2917 &parse
.FullToken
.FullInstruction
);
2920 debug_printf("failed to translate tgsi opcode %d to SSE (%s)\n",
2921 parse
.FullToken
.FullInstruction
.Instruction
.Opcode
,
2922 parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_VERTEX
?
2923 "vertex shader" : "fragment shader");
2927 case TGSI_TOKEN_TYPE_IMMEDIATE
:
2928 /* simply copy the immediate values into the next immediates[] slot */
2930 const uint size
= parse
.FullToken
.FullImmediate
.Immediate
.NrTokens
- 1;
2933 assert(num_immediates
< TGSI_EXEC_NUM_IMMEDIATES
);
2934 for( i
= 0; i
< size
; i
++ ) {
2935 immediates
[num_immediates
][i
] =
2936 parse
.FullToken
.FullImmediate
.u
[i
].Float
;
2939 debug_printf("SSE FS immediate[%d] = %f %f %f %f\n",
2941 immediates
[num_immediates
][0],
2942 immediates
[num_immediates
][1],
2943 immediates
[num_immediates
][2],
2944 immediates
[num_immediates
][3]);
2956 if (parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_VERTEX
) {
2961 8, /* num_outputs */
2962 9 ); /* output_stride */
2965 /* Can't just use EBX, EDI without save/restoring them:
2967 x86_pop( func
, x86_make_reg( file_REG32
, reg_DI
) );
2968 x86_pop( func
, x86_make_reg( file_REG32
, reg_BX
) );
2972 tgsi_parse_free( &parse
);
2977 #endif /* PIPE_ARCH_X86 */