1 /**************************************************************************
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **************************************************************************/
28 #include "pipe/p_config.h"
30 #if defined(PIPE_ARCH_X86)
32 #include "util/u_debug.h"
33 #include "pipe/p_shader_tokens.h"
34 #include "util/u_math.h"
35 #include "util/u_memory.h"
36 #if defined(PIPE_ARCH_SSE)
37 #include "util/u_sse.h"
39 #include "tgsi/tgsi_info.h"
40 #include "tgsi/tgsi_parse.h"
41 #include "tgsi/tgsi_util.h"
42 #include "tgsi/tgsi_dump.h"
43 #include "tgsi/tgsi_exec.h"
44 #include "tgsi/tgsi_sse2.h"
46 #include "rtasm/rtasm_x86sse.h"
50 * This costs about 100fps (close to 10%) in gears:
52 #define HIGH_PRECISION 1
/* Iterate CHAN over every vector channel index in [0, NUM_CHANNELS). */
57 #define FOR_EACH_CHANNEL( CHAN )\
58 for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)
/* True if channel CHAN is enabled in instruction INST's dst-reg-0 write mask. */
60 #define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
61 ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))
/* Guard the following statement so it runs only for enabled dst-0 channels. */
63 #define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
64 if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))
/* Loop over all channels, executing the body only for write-enabled ones. */
66 #define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
67 FOR_EACH_CHANNEL( CHAN )\
68 IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )
/* Short local aliases for well-known temp-register slots declared by
 * tgsi_exec.h: the constant 1.0 vector, the scratch register R0, the
 * address register, and the per-quad execution mask (I = index, C = channel).
 */
75 #define TEMP_ONE_I TGSI_EXEC_TEMP_ONE_I
76 #define TEMP_ONE_C TGSI_EXEC_TEMP_ONE_C
78 #define TEMP_R0 TGSI_EXEC_TEMP_R0
79 #define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
80 #define TEMP_EXEC_MASK_I TGSI_EXEC_MASK_I
81 #define TEMP_EXEC_MASK_C TGSI_EXEC_MASK_C
85 * X86 utility functions.
94 (enum x86_reg_name
) xmm
);
98 * X86 register mapping helpers.
101 static struct x86_reg
102 get_const_base( void )
109 static struct x86_reg
110 get_machine_base( void )
117 static struct x86_reg
118 get_input_base( void )
120 return x86_make_disp(
122 Offset(struct tgsi_exec_machine
, Inputs
) );
125 static struct x86_reg
126 get_output_base( void )
128 return x86_make_disp(
130 Offset(struct tgsi_exec_machine
, Outputs
) );
133 static struct x86_reg
134 get_temp_base( void )
136 return x86_make_disp(
138 Offset(struct tgsi_exec_machine
, Temps
) );
141 static struct x86_reg
142 get_coef_base( void )
149 static struct x86_reg
150 get_sampler_base( void )
157 static struct x86_reg
158 get_immediate_base( void )
167 * Data access helpers.
171 static struct x86_reg
176 return x86_make_disp(
177 get_immediate_base(),
178 (vec
* 4 + chan
) * 4 );
181 static struct x86_reg
186 return x86_make_disp(
188 (vec
* 4 + chan
) * 4 );
191 static struct x86_reg
195 return x86_make_disp(
197 unit
* sizeof( struct tgsi_sampler
* ) );
200 static struct x86_reg
205 return x86_make_disp(
207 (vec
* 4 + chan
) * 16 );
210 static struct x86_reg
215 return x86_make_disp(
217 (vec
* 4 + chan
) * 16 );
220 static struct x86_reg
225 return x86_make_disp(
227 (vec
* 4 + chan
) * 16 );
230 static struct x86_reg
236 return x86_make_disp(
238 ((vec
* 3 + member
) * 4 + chan
) * 4 );
244 struct x86_function
*func
)
251 * Data fetch helpers.
255 * Copy a shader constant to xmm register
256 * \param xmm the destination xmm register
257 * \param vec the src const buffer index
258 * \param chan src channel to fetch (X, Y, Z or W)
262 struct x86_function
*func
,
271 /* 'vec' is the offset from the address register's value.
272 * We're loading CONST[ADDR+vec] into an xmm register.
274 struct x86_reg r0
= get_immediate_base();
275 struct x86_reg r1
= get_coef_base();
278 assert( indirectFile
== TGSI_FILE_ADDRESS
);
279 assert( indirectIndex
== 0 );
280 assert( r0
.mod
== mod_REG
);
281 assert( r1
.mod
== mod_REG
);
283 x86_push( func
, r0
);
284 x86_push( func
, r1
);
287 * Loop over the four pixels or vertices in the quad.
288 * Get the value of the address (offset) register for pixel/vertex[i],
289 * add it to the src offset and index into the constant buffer.
290 * Note that we're working on SOA data.
291 * If any of the pixel/vertex execution channels are unused their
292 * values will be garbage. It's very important that we don't use
293 * those garbage values as indexes into the constant buffer since
294 * that'll cause segfaults.
295 * The solution is to bitwise-AND the offset with the execution mask
296 * register whose values are either 0 or ~0.
297 * The caller must setup the execution mask register to indicate
298 * which channels are valid/alive before running the shader.
299 * The execution mask will also figure into loops and conditionals
302 for (i
= 0; i
< QUAD_SIZE
; i
++) {
303 /* r1 = address register[i] */
304 x86_mov( func
, r1
, x86_make_disp( get_temp( TEMP_ADDR
, CHAN_X
), i
* 4 ) );
305 /* r0 = execution mask[i] */
306 x86_mov( func
, r0
, x86_make_disp( get_temp( TEMP_EXEC_MASK_I
, TEMP_EXEC_MASK_C
), i
* 4 ) );
308 x86_and( func
, r1
, r0
);
309 /* r0 = 'vec', the offset */
310 x86_lea( func
, r0
, get_const( vec
, chan
) );
312 /* Quick hack to multiply r1 by 16 -- need to add SHL to rtasm.
314 x86_add( func
, r1
, r1
);
315 x86_add( func
, r1
, r1
);
316 x86_add( func
, r1
, r1
);
317 x86_add( func
, r1
, r1
);
319 x86_add( func
, r0
, r1
); /* r0 = r0 + r1 */
320 x86_mov( func
, r1
, x86_deref( r0
) );
321 x86_mov( func
, x86_make_disp( get_temp( TEMP_R0
, CHAN_X
), i
* 4 ), r1
);
330 get_temp( TEMP_R0
, CHAN_X
) );
333 /* 'vec' is the index into the src register file, such as TEMP[vec] */
339 get_const( vec
, chan
) );
344 SHUF( 0, 0, 0, 0 ) );
350 struct x86_function
*func
,
358 get_immediate( vec
, chan
) );
363 SHUF( 0, 0, 0, 0 ) );
368 * Copy a shader input to xmm register
369 * \param xmm the destination xmm register
370 * \param vec the src input attrib
371 * \param chan src channel to fetch (X, Y, Z or W)
375 struct x86_function
*func
,
383 get_input( vec
, chan
) );
387 * Store an xmm register to a shader output
388 * \param xmm the source xmm register
389 * \param vec the dest output attrib
390 * \param chan src dest channel to store (X, Y, Z or W)
394 struct x86_function
*func
,
401 get_output( vec
, chan
),
406 * Copy a shader temporary to xmm register
407 * \param xmm the destination xmm register
408 * \param vec the src temp register
409 * \param chan src channel to fetch (X, Y, Z or W)
413 struct x86_function
*func
,
421 get_temp( vec
, chan
) );
425 * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
426 * \param xmm the destination xmm register
427 * \param vec the src input/attribute coefficient index
428 * \param chan src channel to fetch (X, Y, Z or W)
429 * \param member 0=a0, 1=dadx, 2=dady
433 struct x86_function
*func
,
442 get_coef( vec
, chan
, member
) );
447 SHUF( 0, 0, 0, 0 ) );
451 * Data store helpers.
456 struct x86_function
*func
,
463 get_input( vec
, chan
),
469 struct x86_function
*func
,
476 get_temp( vec
, chan
),
482 struct x86_function
*func
,
492 vec
+ TGSI_EXEC_TEMP_ADDR
,
497 * Coefficent fetch helpers.
502 struct x86_function
*func
,
517 struct x86_function
*func
,
532 struct x86_function
*func
,
546 * Function call helpers.
550 * NOTE: In gcc, if the destination uses the SSE intrinsics, then it must be
551 * defined with __attribute__((force_align_arg_pointer)), as we do not guarantee
552 * that the stack pointer is 16 byte aligned, as expected.
556 struct x86_function
*func
,
557 unsigned xmm_save_mask
,
558 const struct x86_reg
*arg
,
560 void (PIPE_CDECL
*code
)() )
562 struct x86_reg ecx
= x86_make_reg( file_REG32
, reg_CX
);
567 x86_make_reg( file_REG32
, reg_AX
) );
570 x86_make_reg( file_REG32
, reg_CX
) );
573 x86_make_reg( file_REG32
, reg_DX
) );
575 /* Store XMM regs to the stack
577 for(i
= 0, n
= 0; i
< 8; ++i
)
578 if(xmm_save_mask
& (1 << i
))
583 x86_make_reg( file_REG32
, reg_SP
),
586 for(i
= 0, n
= 0; i
< 8; ++i
)
587 if(xmm_save_mask
& (1 << i
)) {
590 x86_make_disp( x86_make_reg( file_REG32
, reg_SP
), n
*16 ),
595 for (i
= 0; i
< nr_args
; i
++) {
596 /* Load the address of the buffer we use for passing arguments and
604 /* Push actual function arguments (currently just the pointer to
605 * the buffer above), and call the function:
607 x86_push( func
, ecx
);
610 x86_mov_reg_imm( func
, ecx
, (unsigned long) code
);
611 x86_call( func
, ecx
);
613 /* Pop the arguments (or just add an immediate to esp)
615 for (i
= 0; i
< nr_args
; i
++) {
619 /* Pop the saved XMM regs:
621 for(i
= 0, n
= 0; i
< 8; ++i
)
622 if(xmm_save_mask
& (1 << i
)) {
626 x86_make_disp( x86_make_reg( file_REG32
, reg_SP
), n
*16 ) );
632 x86_make_reg( file_REG32
, reg_SP
),
635 /* Restore GP registers in a reverse order.
639 x86_make_reg( file_REG32
, reg_DX
) );
642 x86_make_reg( file_REG32
, reg_CX
) );
645 x86_make_reg( file_REG32
, reg_AX
) );
649 emit_func_call_dst_src1(
650 struct x86_function
*func
,
654 void (PIPE_CDECL
*code
)() )
656 struct x86_reg store
= get_temp( TEMP_R0
, 0 );
657 unsigned xmm_mask
= ((1 << xmm_save
) - 1) & ~(1 << xmm_dst
);
659 /* Store our input parameters (in xmm regs) to the buffer we use
660 * for passing arguments. We will pass a pointer to this buffer as
661 * the actual function argument.
666 make_xmm( xmm_src0
) );
668 emit_func_call( func
,
682 emit_func_call_dst_src2(
683 struct x86_function
*func
,
688 void (PIPE_CDECL
*code
)() )
690 struct x86_reg store
= get_temp( TEMP_R0
, 0 );
691 unsigned xmm_mask
= ((1 << xmm_save
) - 1) & ~(1 << xmm_dst
);
693 /* Store two inputs to parameter buffer.
698 make_xmm( xmm_src0
) );
702 x86_make_disp( store
, 4 * sizeof(float) ),
703 make_xmm( xmm_src1
) );
708 emit_func_call( func
,
714 /* Retrieve the results:
726 #if defined(PIPE_ARCH_SSE)
729 * Fast SSE2 implementation of special math functions.
/* Horner-scheme polynomial evaluators for 4-wide SSE vectors:
 * POLYn(x, c0..cn) computes c0 + c1*x + ... + cn*x^n, with each POLYn
 * built by one fused multiply-add step on top of POLY(n-1).
 */
732 #define POLY0(x, c0) _mm_set1_ps(c0)
733 #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
734 #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
735 #define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
736 #define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
737 #define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
/* Degree of the minimax polynomial used for exp2/log2 approximation;
 * selected below via #if/#elif on these values (higher = more accurate).
 */
739 #define EXP_POLY_DEGREE 3
740 #define LOG_POLY_DEGREE 5
743 * See http://www.devmaster.net/forums/showthread.php?p=43580
749 __m128 fpart
, expipart
, expfpart
;
751 x
= _mm_min_ps(x
, _mm_set1_ps( 129.00000f
));
752 x
= _mm_max_ps(x
, _mm_set1_ps(-126.99999f
));
754 /* ipart = int(x - 0.5) */
755 ipart
= _mm_cvtps_epi32(_mm_sub_ps(x
, _mm_set1_ps(0.5f
)));
757 /* fpart = x - ipart */
758 fpart
= _mm_sub_ps(x
, _mm_cvtepi32_ps(ipart
));
760 /* expipart = (float) (1 << ipart) */
761 expipart
= _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart
, _mm_set1_epi32(127)), 23));
763 /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
764 #if EXP_POLY_DEGREE == 5
765 expfpart
= POLY5(fpart
, 9.9999994e-1f
, 6.9315308e-1f
, 2.4015361e-1f
, 5.5826318e-2f
, 8.9893397e-3f
, 1.8775767e-3f
);
766 #elif EXP_POLY_DEGREE == 4
767 expfpart
= POLY4(fpart
, 1.0000026f
, 6.9300383e-1f
, 2.4144275e-1f
, 5.2011464e-2f
, 1.3534167e-2f
);
768 #elif EXP_POLY_DEGREE == 3
769 expfpart
= POLY3(fpart
, 9.9992520e-1f
, 6.9583356e-1f
, 2.2606716e-1f
, 7.8024521e-2f
);
770 #elif EXP_POLY_DEGREE == 2
771 expfpart
= POLY2(fpart
, 1.0017247f
, 6.5763628e-1f
, 3.3718944e-1f
);
776 return _mm_mul_ps(expipart
, expfpart
);
781 * See http://www.devmaster.net/forums/showthread.php?p=43580
786 __m128i expmask
= _mm_set1_epi32(0x7f800000);
787 __m128i mantmask
= _mm_set1_epi32(0x007fffff);
788 __m128 one
= _mm_set1_ps(1.0f
);
790 __m128i i
= _mm_castps_si128(x
);
792 /* exp = (float) exponent(x) */
793 __m128 exp
= _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i
, expmask
), 23), _mm_set1_epi32(127)));
795 /* mant = (float) mantissa(x) */
796 __m128 mant
= _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i
, mantmask
)), one
);
800 /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
801 * These coefficients can be generate with
802 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
804 #if LOG_POLY_DEGREE == 6
805 logmant
= POLY5(mant
, 3.11578814719469302614f
, -3.32419399085241980044f
, 2.59883907202499966007f
, -1.23152682416275988241f
, 0.318212422185251071475f
, -0.0344359067839062357313f
);
806 #elif LOG_POLY_DEGREE == 5
807 logmant
= POLY4(mant
, 2.8882704548164776201f
, -2.52074962577807006663f
, 1.48116647521213171641f
, -0.465725644288844778798f
, 0.0596515482674574969533f
);
808 #elif LOG_POLY_DEGREE == 4
809 logmant
= POLY3(mant
, 2.61761038894603480148f
, -1.75647175389045657003f
, 0.688243882994381274313f
, -0.107254423828329604454f
);
810 #elif LOG_POLY_DEGREE == 3
811 logmant
= POLY2(mant
, 2.28330284476918490682f
, -1.04913055217340124191f
, 0.204446009836232697516f
);
816 /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
817 logmant
= _mm_mul_ps(logmant
, _mm_sub_ps(mant
, one
));
819 return _mm_add_ps(logmant
, exp
);
824 powf4(__m128 x
, __m128 y
)
826 return exp2f4(_mm_mul_ps(log2f4(x
), y
));
829 #endif /* PIPE_ARCH_SSE */
834 * Low-level instruction translators.
839 struct x86_function
*func
,
846 TGSI_EXEC_TEMP_7FFFFFFF_I
,
847 TGSI_EXEC_TEMP_7FFFFFFF_C
) );
852 struct x86_function
*func
,
859 make_xmm( xmm_src
) );
862 static void PIPE_CDECL
866 store
[0] = cosf( store
[0] );
867 store
[1] = cosf( store
[1] );
868 store
[2] = cosf( store
[2] );
869 store
[3] = cosf( store
[3] );
874 struct x86_function
*func
,
878 emit_func_call_dst_src1(
886 static void PIPE_CDECL
887 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
888 __attribute__((force_align_arg_pointer
))
893 #if defined(PIPE_ARCH_SSE)
894 _mm_store_ps(&store
[0], exp2f4( _mm_load_ps(&store
[0]) ));
896 store
[0] = util_fast_exp2( store
[0] );
897 store
[1] = util_fast_exp2( store
[1] );
898 store
[2] = util_fast_exp2( store
[2] );
899 store
[3] = util_fast_exp2( store
[3] );
905 struct x86_function
*func
,
909 emit_func_call_dst_src1(
919 struct x86_function
*func
,
930 struct x86_function
*func
,
939 static void PIPE_CDECL
943 store
[0] = floorf( store
[0] );
944 store
[1] = floorf( store
[1] );
945 store
[2] = floorf( store
[2] );
946 store
[3] = floorf( store
[3] );
951 struct x86_function
*func
,
955 emit_func_call_dst_src1(
963 static void PIPE_CDECL
967 store
[0] -= floorf( store
[0] );
968 store
[1] -= floorf( store
[1] );
969 store
[2] -= floorf( store
[2] );
970 store
[3] -= floorf( store
[3] );
975 struct x86_function
*func
,
979 emit_func_call_dst_src1(
987 static void PIPE_CDECL
988 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
989 __attribute__((force_align_arg_pointer
))
994 #if defined(PIPE_ARCH_SSE)
995 _mm_store_ps(&store
[0], log2f4( _mm_load_ps(&store
[0]) ));
997 store
[0] = util_fast_log2( store
[0] );
998 store
[1] = util_fast_log2( store
[1] );
999 store
[2] = util_fast_log2( store
[2] );
1000 store
[3] = util_fast_log2( store
[3] );
1006 struct x86_function
*func
,
1010 emit_func_call_dst_src1(
1020 struct x86_function
*func
,
1026 make_xmm( xmm_dst
),
1027 make_xmm( xmm_src
) );
1031 emit_mul (struct x86_function
*func
,
1037 make_xmm( xmm_dst
),
1038 make_xmm( xmm_src
) );
1043 struct x86_function
*func
,
1050 TGSI_EXEC_TEMP_80000000_I
,
1051 TGSI_EXEC_TEMP_80000000_C
) );
1054 static void PIPE_CDECL
1055 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
1056 __attribute__((force_align_arg_pointer
))
1061 #if defined(PIPE_ARCH_SSE)
1062 _mm_store_ps(&store
[0], powf4( _mm_load_ps(&store
[0]), _mm_load_ps(&store
[4]) ));
1064 store
[0] = util_fast_pow( store
[0], store
[4] );
1065 store
[1] = util_fast_pow( store
[1], store
[5] );
1066 store
[2] = util_fast_pow( store
[2], store
[6] );
1067 store
[3] = util_fast_pow( store
[3], store
[7] );
1073 struct x86_function
*func
,
1079 emit_func_call_dst_src2(
1090 struct x86_function
*func
,
1094 /* On Intel CPUs at least, this is only accurate to 12 bits -- not
1095 * good enough. Need to either emit a proper divide or use the
1096 * iterative technique described below in emit_rsqrt().
1100 make_xmm( xmm_dst
),
1101 make_xmm( xmm_src
) );
1104 static void PIPE_CDECL
1108 store
[0] = floorf( store
[0] + 0.5f
);
1109 store
[1] = floorf( store
[1] + 0.5f
);
1110 store
[2] = floorf( store
[2] + 0.5f
);
1111 store
[3] = floorf( store
[3] + 0.5f
);
1116 struct x86_function
*func
,
1120 emit_func_call_dst_src1(
1130 struct x86_function
*func
,
1135 /* Although rsqrtps() and rcpps() are low precision on some/all SSE
1136 * implementations, it is possible to improve its precision at
1137 * fairly low cost, using a newton/raphson step, as below:
1139 * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
1140 * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
1142 * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
1145 struct x86_reg dst
= make_xmm( xmm_dst
);
1146 struct x86_reg src
= make_xmm( xmm_src
);
1147 struct x86_reg tmp0
= make_xmm( 2 );
1148 struct x86_reg tmp1
= make_xmm( 3 );
1150 assert( xmm_dst
!= xmm_src
);
1151 assert( xmm_dst
!= 2 && xmm_dst
!= 3 );
1152 assert( xmm_src
!= 2 && xmm_src
!= 3 );
1154 sse_movaps( func
, dst
, get_temp( TGSI_EXEC_TEMP_HALF_I
, TGSI_EXEC_TEMP_HALF_C
) );
1155 sse_movaps( func
, tmp0
, get_temp( TGSI_EXEC_TEMP_THREE_I
, TGSI_EXEC_TEMP_THREE_C
) );
1156 sse_rsqrtps( func
, tmp1
, src
);
1157 sse_mulps( func
, src
, tmp1
);
1158 sse_mulps( func
, dst
, tmp1
);
1159 sse_mulps( func
, src
, tmp1
);
1160 sse_subps( func
, tmp0
, src
);
1161 sse_mulps( func
, dst
, tmp0
);
1164 /* On Intel CPUs at least, this is only accurate to 12 bits -- not
1169 make_xmm( xmm_dst
),
1170 make_xmm( xmm_src
) );
1176 struct x86_function
*func
,
1183 TGSI_EXEC_TEMP_80000000_I
,
1184 TGSI_EXEC_TEMP_80000000_C
) );
1187 static void PIPE_CDECL
1191 store
[0] = store
[0] < 0.0f
? -1.0f
: store
[0] > 0.0f
? 1.0f
: 0.0f
;
1192 store
[1] = store
[1] < 0.0f
? -1.0f
: store
[1] > 0.0f
? 1.0f
: 0.0f
;
1193 store
[2] = store
[2] < 0.0f
? -1.0f
: store
[2] > 0.0f
? 1.0f
: 0.0f
;
1194 store
[3] = store
[3] < 0.0f
? -1.0f
: store
[3] > 0.0f
? 1.0f
: 0.0f
;
1199 struct x86_function
*func
,
1203 emit_func_call_dst_src1(
1211 static void PIPE_CDECL
1215 store
[0] = sinf( store
[0] );
1216 store
[1] = sinf( store
[1] );
1217 store
[2] = sinf( store
[2] );
1218 store
[3] = sinf( store
[3] );
1222 emit_sin (struct x86_function
*func
,
1226 emit_func_call_dst_src1(
1236 struct x86_function
*func
,
1242 make_xmm( xmm_dst
),
1243 make_xmm( xmm_src
) );
1258 struct x86_function
*func
,
1260 const struct tgsi_full_src_register
*reg
,
1261 const unsigned chan_index
)
1263 unsigned swizzle
= tgsi_util_get_full_src_register_extswizzle( reg
, chan_index
);
1266 case TGSI_EXTSWIZZLE_X
:
1267 case TGSI_EXTSWIZZLE_Y
:
1268 case TGSI_EXTSWIZZLE_Z
:
1269 case TGSI_EXTSWIZZLE_W
:
1270 switch (reg
->SrcRegister
.File
) {
1271 case TGSI_FILE_CONSTANT
:
1275 reg
->SrcRegister
.Index
,
1277 reg
->SrcRegister
.Indirect
,
1278 reg
->SrcRegisterInd
.File
,
1279 reg
->SrcRegisterInd
.Index
);
1282 case TGSI_FILE_IMMEDIATE
:
1286 reg
->SrcRegister
.Index
,
1290 case TGSI_FILE_INPUT
:
1294 reg
->SrcRegister
.Index
,
1298 case TGSI_FILE_TEMPORARY
:
1302 reg
->SrcRegister
.Index
,
1311 case TGSI_EXTSWIZZLE_ZERO
:
1315 TGSI_EXEC_TEMP_00000000_I
,
1316 TGSI_EXEC_TEMP_00000000_C
);
1319 case TGSI_EXTSWIZZLE_ONE
:
1331 switch( tgsi_util_get_full_src_register_sign_mode( reg
, chan_index
) ) {
1332 case TGSI_UTIL_SIGN_CLEAR
:
1333 emit_abs( func
, xmm
);
1336 case TGSI_UTIL_SIGN_SET
:
1337 emit_setsign( func
, xmm
);
1340 case TGSI_UTIL_SIGN_TOGGLE
:
1341 emit_neg( func
, xmm
);
1344 case TGSI_UTIL_SIGN_KEEP
:
/* Load src operand INDEX's channel CHAN of instruction INST into xmm XMM,
 * applying the register's swizzle/sign modes via emit_fetch().
 */
1349 #define FETCH( FUNC, INST, XMM, INDEX, CHAN )\
1350 emit_fetch( FUNC, XMM, &(INST).FullSrcRegisters[INDEX], CHAN )
1358 struct x86_function
*func
,
1360 const struct tgsi_full_dst_register
*reg
,
1361 const struct tgsi_full_instruction
*inst
,
1362 unsigned chan_index
)
1364 switch( inst
->Instruction
.Saturate
) {
1368 case TGSI_SAT_ZERO_ONE
:
1373 TGSI_EXEC_TEMP_00000000_I
,
1374 TGSI_EXEC_TEMP_00000000_C
) );
1380 TGSI_EXEC_TEMP_ONE_I
,
1381 TGSI_EXEC_TEMP_ONE_C
) );
1384 case TGSI_SAT_MINUS_PLUS_ONE
:
1390 switch( reg
->DstRegister
.File
) {
1391 case TGSI_FILE_OUTPUT
:
1395 reg
->DstRegister
.Index
,
1399 case TGSI_FILE_TEMPORARY
:
1403 reg
->DstRegister
.Index
,
1407 case TGSI_FILE_ADDRESS
:
1411 reg
->DstRegister
.Index
,
/* Store xmm XMM to dst operand INDEX's channel CHAN of instruction INST,
 * applying the instruction's saturate mode via emit_store().
 */
1420 #define STORE( FUNC, INST, XMM, INDEX, CHAN )\
1421 emit_store( FUNC, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN )
1424 static void PIPE_CDECL
1425 fetch_texel( struct tgsi_sampler
**sampler
,
1431 debug_printf("%s sampler: %p (%p) store: %p\n",
1436 debug_printf("lodbias %f\n", store
[12]);
1438 for (j
= 0; j
< 4; j
++)
1439 debug_printf("sample %d texcoord %f %f\n",
1446 float rgba
[NUM_CHANNELS
][QUAD_SIZE
];
1447 (*sampler
)->get_samples(*sampler
,
1451 0.0f
, /*store[12], lodbias */
1454 memcpy( store
, rgba
, 16 * sizeof(float));
1458 for (j
= 0; j
< 4; j
++)
1459 debug_printf("sample %d result %f %f %f %f\n",
1469 * High-level instruction translators.
1473 emit_tex( struct x86_function
*func
,
1474 const struct tgsi_full_instruction
*inst
,
1478 const uint unit
= inst
->FullSrcRegisters
[1].SrcRegister
.Index
;
1479 struct x86_reg args
[2];
1483 switch (inst
->InstructionExtTexture
.Texture
) {
1484 case TGSI_TEXTURE_1D
:
1487 case TGSI_TEXTURE_2D
:
1488 case TGSI_TEXTURE_RECT
:
1491 case TGSI_TEXTURE_SHADOW1D
:
1492 case TGSI_TEXTURE_SHADOW2D
:
1493 case TGSI_TEXTURE_SHADOWRECT
:
1494 case TGSI_TEXTURE_3D
:
1495 case TGSI_TEXTURE_CUBE
:
1504 FETCH( func
, *inst
, 3, 0, 3 );
1510 TGSI_EXEC_TEMP_00000000_I
,
1511 TGSI_EXEC_TEMP_00000000_C
);
1515 /* store lodbias whether enabled or not -- fetch_texel currently
1516 * respects it always.
1519 get_temp( TEMP_R0
, 3 ),
1524 FETCH( func
, *inst
, 3, 0, 3 );
1526 emit_rcp( func
, 3, 3 );
1529 for (i
= 0; i
< count
; i
++) {
1530 FETCH( func
, *inst
, i
, 0, i
);
1539 /* Store in the argument buffer:
1543 get_temp( TEMP_R0
, i
),
1547 args
[0] = get_temp( TEMP_R0
, 0 );
1548 args
[1] = get_sampler_ptr( unit
);
1551 emit_func_call( func
,
1557 /* If all four channels are enabled, could use a pointer to
1558 * dst[0].x instead of TEMP_R0 for store?
1560 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, i
) {
1565 get_temp( TEMP_R0
, i
) );
1567 STORE( func
, *inst
, 0, 0, i
);
1574 struct x86_function
*func
,
1575 const struct tgsi_full_src_register
*reg
)
1577 unsigned uniquemask
;
1578 unsigned unique_count
= 0;
1579 unsigned chan_index
;
1582 /* This mask stores component bits that were already tested. Note that
1583 * we test if the value is less than zero, so 1.0 and 0.0 need not to be
1585 uniquemask
= (1 << TGSI_EXTSWIZZLE_ZERO
) | (1 << TGSI_EXTSWIZZLE_ONE
);
1587 FOR_EACH_CHANNEL( chan_index
) {
1590 /* unswizzle channel */
1591 swizzle
= tgsi_util_get_full_src_register_extswizzle(
1595 /* check if the component has not been already tested */
1596 if( !(uniquemask
& (1 << swizzle
)) ) {
1597 uniquemask
|= 1 << swizzle
;
1599 /* allocate register */
1610 x86_make_reg( file_REG32
, reg_AX
) );
1613 x86_make_reg( file_REG32
, reg_DX
) );
1615 for (i
= 0 ; i
< unique_count
; i
++ ) {
1616 struct x86_reg dataXMM
= make_xmm(i
);
1622 TGSI_EXEC_TEMP_00000000_I
,
1623 TGSI_EXEC_TEMP_00000000_C
),
1629 x86_make_reg( file_REG32
, reg_AX
),
1635 x86_make_reg( file_REG32
, reg_DX
),
1639 x86_make_reg( file_REG32
, reg_AX
),
1640 x86_make_reg( file_REG32
, reg_DX
) );
1647 TGSI_EXEC_TEMP_KILMASK_I
,
1648 TGSI_EXEC_TEMP_KILMASK_C
),
1649 x86_make_reg( file_REG32
, reg_AX
) );
1653 x86_make_reg( file_REG32
, reg_DX
) );
1656 x86_make_reg( file_REG32
, reg_AX
) );
1662 struct x86_function
*func
)
1664 /* XXX todo / fix me */
1670 struct x86_function
*func
,
1671 struct tgsi_full_instruction
*inst
,
1674 unsigned chan_index
;
1676 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1677 FETCH( func
, *inst
, 0, 0, chan_index
);
1678 FETCH( func
, *inst
, 1, 1, chan_index
);
1690 STORE( func
, *inst
, 0, 0, chan_index
);
1696 struct x86_function
*func
,
1697 struct tgsi_full_instruction
*inst
)
1699 unsigned chan_index
;
1701 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1702 FETCH( func
, *inst
, 0, 0, chan_index
);
1703 FETCH( func
, *inst
, 1, 1, chan_index
);
1704 FETCH( func
, *inst
, 2, 2, chan_index
);
1709 TGSI_EXEC_TEMP_00000000_I
,
1710 TGSI_EXEC_TEMP_00000000_C
),
1724 STORE( func
, *inst
, 0, 0, chan_index
);
1730 * Check if inst src/dest regs use indirect addressing into temporary
1734 indirect_temp_reference(const struct tgsi_full_instruction
*inst
)
1737 for (i
= 0; i
< inst
->Instruction
.NumSrcRegs
; i
++) {
1738 const struct tgsi_full_src_register
*reg
= &inst
->FullSrcRegisters
[i
];
1739 if (reg
->SrcRegister
.File
== TGSI_FILE_TEMPORARY
&&
1740 reg
->SrcRegister
.Indirect
)
1743 for (i
= 0; i
< inst
->Instruction
.NumDstRegs
; i
++) {
1744 const struct tgsi_full_dst_register
*reg
= &inst
->FullDstRegisters
[i
];
1745 if (reg
->DstRegister
.File
== TGSI_FILE_TEMPORARY
&&
1746 reg
->DstRegister
.Indirect
)
1755 struct x86_function
*func
,
1756 struct tgsi_full_instruction
*inst
)
1758 unsigned chan_index
;
1760 /* we can't handle indirect addressing into temp register file yet */
1761 if (indirect_temp_reference(inst
))
1764 switch (inst
->Instruction
.Opcode
) {
1765 case TGSI_OPCODE_ARL
:
1766 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1767 FETCH( func
, *inst
, 0, 0, chan_index
);
1768 emit_flr(func
, 0, 0);
1769 emit_f2it( func
, 0 );
1770 STORE( func
, *inst
, 0, 0, chan_index
);
1774 case TGSI_OPCODE_MOV
:
1775 case TGSI_OPCODE_SWZ
:
1776 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1777 FETCH( func
, *inst
, 4 + chan_index
, 0, chan_index
);
1779 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1780 STORE( func
, *inst
, 4 + chan_index
, 0, chan_index
);
1784 case TGSI_OPCODE_LIT
:
1785 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1786 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) ) {
1792 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ) {
1793 STORE( func
, *inst
, 0, 0, CHAN_X
);
1795 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) ) {
1796 STORE( func
, *inst
, 0, 0, CHAN_W
);
1799 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ||
1800 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
1801 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ) {
1802 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1807 TGSI_EXEC_TEMP_00000000_I
,
1808 TGSI_EXEC_TEMP_00000000_C
) );
1809 STORE( func
, *inst
, 0, 0, CHAN_Y
);
1811 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
1812 /* XMM[1] = SrcReg[0].yyyy */
1813 FETCH( func
, *inst
, 1, 0, CHAN_Y
);
1814 /* XMM[1] = max(XMM[1], 0) */
1819 TGSI_EXEC_TEMP_00000000_I
,
1820 TGSI_EXEC_TEMP_00000000_C
) );
1821 /* XMM[2] = SrcReg[0].wwww */
1822 FETCH( func
, *inst
, 2, 0, CHAN_W
);
1823 /* XMM[2] = min(XMM[2], 128.0) */
1828 TGSI_EXEC_TEMP_128_I
,
1829 TGSI_EXEC_TEMP_128_C
) );
1830 /* XMM[2] = max(XMM[2], -128.0) */
1835 TGSI_EXEC_TEMP_MINUS_128_I
,
1836 TGSI_EXEC_TEMP_MINUS_128_C
) );
1837 emit_pow( func
, 3, 1, 1, 2 );
1838 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1852 STORE( func
, *inst
, 2, 0, CHAN_Z
);
1857 case TGSI_OPCODE_RCP
:
1858 /* TGSI_OPCODE_RECIP */
1859 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1860 emit_rcp( func
, 0, 0 );
1861 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1862 STORE( func
, *inst
, 0, 0, chan_index
);
1866 case TGSI_OPCODE_RSQ
:
1867 /* TGSI_OPCODE_RECIPSQRT */
1868 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1869 emit_abs( func
, 0 );
1870 emit_rsqrt( func
, 1, 0 );
1871 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1872 STORE( func
, *inst
, 1, 0, chan_index
);
1876 case TGSI_OPCODE_EXP
:
1877 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1878 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ||
1879 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
1880 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1881 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1882 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
1883 emit_MOV( func
, 1, 0 );
1884 emit_flr( func
, 2, 1 );
1885 /* dst.x = ex2(floor(src.x)) */
1886 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
)) {
1887 emit_MOV( func
, 2, 1 );
1888 emit_ex2( func
, 3, 2 );
1889 STORE( func
, *inst
, 2, 0, CHAN_X
);
1891 /* dst.y = src.x - floor(src.x) */
1892 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
1893 emit_MOV( func
, 2, 0 );
1894 emit_sub( func
, 2, 1 );
1895 STORE( func
, *inst
, 2, 0, CHAN_Y
);
1898 /* dst.z = ex2(src.x) */
1899 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
1900 emit_ex2( func
, 3, 0 );
1901 STORE( func
, *inst
, 0, 0, CHAN_Z
);
1905 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
)) {
1906 emit_tempf( func
, 0, TEMP_ONE_I
, TEMP_ONE_C
);
1907 STORE( func
, *inst
, 0, 0, CHAN_W
);
1911 case TGSI_OPCODE_LOG
:
1912 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1913 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ||
1914 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
1915 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1916 emit_abs( func
, 0 );
1917 emit_MOV( func
, 1, 0 );
1918 emit_lg2( func
, 2, 1 );
1919 /* dst.z = lg2(abs(src.x)) */
1920 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
1921 STORE( func
, *inst
, 1, 0, CHAN_Z
);
1923 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1924 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
1925 emit_flr( func
, 2, 1 );
1926 /* dst.x = floor(lg2(abs(src.x))) */
1927 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
)) {
1928 STORE( func
, *inst
, 1, 0, CHAN_X
);
1930 /* dst.x = abs(src)/ex2(floor(lg2(abs(src.x)))) */
1931 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
1932 emit_ex2( func
, 2, 1 );
1933 emit_rcp( func
, 1, 1 );
1934 emit_mul( func
, 0, 1 );
1935 STORE( func
, *inst
, 0, 0, CHAN_Y
);
1940 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
)) {
1941 emit_tempf( func
, 0, TEMP_ONE_I
, TEMP_ONE_C
);
1942 STORE( func
, *inst
, 0, 0, CHAN_W
);
1946 case TGSI_OPCODE_MUL
:
1947 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1948 FETCH( func
, *inst
, 0, 0, chan_index
);
1949 FETCH( func
, *inst
, 1, 1, chan_index
);
1950 emit_mul( func
, 0, 1 );
1951 STORE( func
, *inst
, 0, 0, chan_index
);
1955 case TGSI_OPCODE_ADD
:
1956 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1957 FETCH( func
, *inst
, 0, 0, chan_index
);
1958 FETCH( func
, *inst
, 1, 1, chan_index
);
1959 emit_add( func
, 0, 1 );
1960 STORE( func
, *inst
, 0, 0, chan_index
);
1964 case TGSI_OPCODE_DP3
:
1965 /* TGSI_OPCODE_DOT3 */
1966 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1967 FETCH( func
, *inst
, 1, 1, CHAN_X
);
1968 emit_mul( func
, 0, 1 );
1969 FETCH( func
, *inst
, 1, 0, CHAN_Y
);
1970 FETCH( func
, *inst
, 2, 1, CHAN_Y
);
1971 emit_mul( func
, 1, 2 );
1972 emit_add( func
, 0, 1 );
1973 FETCH( func
, *inst
, 1, 0, CHAN_Z
);
1974 FETCH( func
, *inst
, 2, 1, CHAN_Z
);
1975 emit_mul( func
, 1, 2 );
1976 emit_add( func
, 0, 1 );
1977 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1978 STORE( func
, *inst
, 0, 0, chan_index
);
1982 case TGSI_OPCODE_DP4
:
1983 /* TGSI_OPCODE_DOT4 */
1984 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1985 FETCH( func
, *inst
, 1, 1, CHAN_X
);
1986 emit_mul( func
, 0, 1 );
1987 FETCH( func
, *inst
, 1, 0, CHAN_Y
);
1988 FETCH( func
, *inst
, 2, 1, CHAN_Y
);
1989 emit_mul( func
, 1, 2 );
1990 emit_add( func
, 0, 1 );
1991 FETCH( func
, *inst
, 1, 0, CHAN_Z
);
1992 FETCH( func
, *inst
, 2, 1, CHAN_Z
);
1993 emit_mul(func
, 1, 2 );
1994 emit_add(func
, 0, 1 );
1995 FETCH( func
, *inst
, 1, 0, CHAN_W
);
1996 FETCH( func
, *inst
, 2, 1, CHAN_W
);
1997 emit_mul( func
, 1, 2 );
1998 emit_add( func
, 0, 1 );
1999 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2000 STORE( func
, *inst
, 0, 0, chan_index
);
2004 case TGSI_OPCODE_DST
:
2005 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) {
2011 STORE( func
, *inst
, 0, 0, CHAN_X
);
2013 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) {
2014 FETCH( func
, *inst
, 0, 0, CHAN_Y
);
2015 FETCH( func
, *inst
, 1, 1, CHAN_Y
);
2016 emit_mul( func
, 0, 1 );
2017 STORE( func
, *inst
, 0, 0, CHAN_Y
);
2019 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) {
2020 FETCH( func
, *inst
, 0, 0, CHAN_Z
);
2021 STORE( func
, *inst
, 0, 0, CHAN_Z
);
2023 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) {
2024 FETCH( func
, *inst
, 0, 1, CHAN_W
);
2025 STORE( func
, *inst
, 0, 0, CHAN_W
);
2029 case TGSI_OPCODE_MIN
:
2030 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2031 FETCH( func
, *inst
, 0, 0, chan_index
);
2032 FETCH( func
, *inst
, 1, 1, chan_index
);
2037 STORE( func
, *inst
, 0, 0, chan_index
);
2041 case TGSI_OPCODE_MAX
:
2042 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2043 FETCH( func
, *inst
, 0, 0, chan_index
);
2044 FETCH( func
, *inst
, 1, 1, chan_index
);
2049 STORE( func
, *inst
, 0, 0, chan_index
);
2053 case TGSI_OPCODE_SLT
:
2054 /* TGSI_OPCODE_SETLT */
/* Set-on-less-than: delegated to the shared compare helper with the
 * cc_LessThan condition code (presumably producing the TGSI 1.0/0.0
 * result — emit_setcc's body is not visible in this chunk).
 */
2055 emit_setcc( func
, inst
, cc_LessThan
);
2058 case TGSI_OPCODE_SGE
:
2059 /* TGSI_OPCODE_SETGE */
/* Set-on-greater-equal: >= is expressed as NOT(<), hence cc_NotLessThan.
 * NOTE(review): original lines 2056-2057 (the break of the SLT arm) were
 * elided by the extraction.
 */
2060 emit_setcc( func
, inst
, cc_NotLessThan
);
2063 case TGSI_OPCODE_MAD
:
2064 /* TGSI_OPCODE_MADD */
2065 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2066 FETCH( func
, *inst
, 0, 0, chan_index
);
2067 FETCH( func
, *inst
, 1, 1, chan_index
);
2068 FETCH( func
, *inst
, 2, 2, chan_index
);
2069 emit_mul( func
, 0, 1 );
2070 emit_add( func
, 0, 2 );
2071 STORE( func
, *inst
, 0, 0, chan_index
);
2075 case TGSI_OPCODE_SUB
:
2076 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2077 FETCH( func
, *inst
, 0, 0, chan_index
);
2078 FETCH( func
, *inst
, 1, 1, chan_index
);
2079 emit_sub( func
, 0, 1 );
2080 STORE( func
, *inst
, 0, 0, chan_index
);
2084 case TGSI_OPCODE_LRP
:
2085 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2086 FETCH( func
, *inst
, 0, 0, chan_index
);
2087 FETCH( func
, *inst
, 1, 1, chan_index
);
2088 FETCH( func
, *inst
, 2, 2, chan_index
);
2089 emit_sub( func
, 1, 2 );
2090 emit_mul( func
, 0, 1 );
2091 emit_add( func
, 0, 2 );
2092 STORE( func
, *inst
, 0, 0, chan_index
);
2096 case TGSI_OPCODE_CND
:
2100 case TGSI_OPCODE_DP2A
:
2101 FETCH( func
, *inst
, 0, 0, CHAN_X
); /* xmm0 = src[0].x */
2102 FETCH( func
, *inst
, 1, 1, CHAN_X
); /* xmm1 = src[1].x */
2103 emit_mul( func
, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
2104 FETCH( func
, *inst
, 1, 0, CHAN_Y
); /* xmm1 = src[0].y */
2105 FETCH( func
, *inst
, 2, 1, CHAN_Y
); /* xmm2 = src[1].y */
2106 emit_mul( func
, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
2107 emit_add( func
, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2108 FETCH( func
, *inst
, 1, 2, CHAN_X
); /* xmm1 = src[2].x */
2109 emit_add( func
, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2110 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2111 STORE( func
, *inst
, 0, 0, chan_index
); /* dest[ch] = xmm0 */
2115 case TGSI_OPCODE_FRC
:
2116 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2117 FETCH( func
, *inst
, 0, 0, chan_index
);
2118 emit_frc( func
, 0, 0 );
2119 STORE( func
, *inst
, 0, 0, chan_index
);
2123 case TGSI_OPCODE_CLAMP
:
2127 case TGSI_OPCODE_FLR
:
2128 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2129 FETCH( func
, *inst
, 0, 0, chan_index
);
2130 emit_flr( func
, 0, 0 );
2131 STORE( func
, *inst
, 0, 0, chan_index
);
2135 case TGSI_OPCODE_ROUND
:
2136 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2137 FETCH( func
, *inst
, 0, 0, chan_index
);
2138 emit_rnd( func
, 0, 0 );
2139 STORE( func
, *inst
, 0, 0, chan_index
);
2143 case TGSI_OPCODE_EX2
:
2144 FETCH( func
, *inst
, 0, 0, CHAN_X
);
2145 emit_ex2( func
, 0, 0 );
2146 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2147 STORE( func
, *inst
, 0, 0, chan_index
);
2151 case TGSI_OPCODE_LG2
:
2152 FETCH( func
, *inst
, 0, 0, CHAN_X
);
2153 emit_lg2( func
, 0, 0 );
2154 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2155 STORE( func
, *inst
, 0, 0, chan_index
);
2159 case TGSI_OPCODE_POW
:
2160 FETCH( func
, *inst
, 0, 0, CHAN_X
);
2161 FETCH( func
, *inst
, 1, 1, CHAN_X
);
2162 emit_pow( func
, 0, 0, 0, 1 );
2163 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2164 STORE( func
, *inst
, 0, 0, chan_index
);
2168 case TGSI_OPCODE_XPD
:
2169 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
2170 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ) {
2171 FETCH( func
, *inst
, 1, 1, CHAN_Z
);
2172 FETCH( func
, *inst
, 3, 0, CHAN_Z
);
2174 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
2175 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
2176 FETCH( func
, *inst
, 0, 0, CHAN_Y
);
2177 FETCH( func
, *inst
, 4, 1, CHAN_Y
);
2179 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) {
2180 emit_MOV( func
, 2, 0 );
2181 emit_mul( func
, 2, 1 );
2182 emit_MOV( func
, 5, 3 );
2183 emit_mul( func
, 5, 4 );
2184 emit_sub( func
, 2, 5 );
2185 STORE( func
, *inst
, 2, 0, CHAN_X
);
2187 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ||
2188 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
2189 FETCH( func
, *inst
, 2, 1, CHAN_X
);
2190 FETCH( func
, *inst
, 5, 0, CHAN_X
);
2192 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) {
2193 emit_mul( func
, 3, 2 );
2194 emit_mul( func
, 1, 5 );
2195 emit_sub( func
, 3, 1 );
2196 STORE( func
, *inst
, 3, 0, CHAN_Y
);
2198 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) {
2199 emit_mul( func
, 5, 4 );
2200 emit_mul( func
, 0, 2 );
2201 emit_sub( func
, 5, 0 );
2202 STORE( func
, *inst
, 5, 0, CHAN_Z
);
2204 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) {
2210 STORE( func
, *inst
, 0, 0, CHAN_W
);
2214 case TGSI_OPCODE_ABS
:
2215 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2216 FETCH( func
, *inst
, 0, 0, chan_index
);
2217 emit_abs( func
, 0) ;
2219 STORE( func
, *inst
, 0, 0, chan_index
);
2223 case TGSI_OPCODE_RCC
:
2227 case TGSI_OPCODE_DPH
:
2228 FETCH( func
, *inst
, 0, 0, CHAN_X
);
2229 FETCH( func
, *inst
, 1, 1, CHAN_X
);
2230 emit_mul( func
, 0, 1 );
2231 FETCH( func
, *inst
, 1, 0, CHAN_Y
);
2232 FETCH( func
, *inst
, 2, 1, CHAN_Y
);
2233 emit_mul( func
, 1, 2 );
2234 emit_add( func
, 0, 1 );
2235 FETCH( func
, *inst
, 1, 0, CHAN_Z
);
2236 FETCH( func
, *inst
, 2, 1, CHAN_Z
);
2237 emit_mul( func
, 1, 2 );
2238 emit_add( func
, 0, 1 );
2239 FETCH( func
, *inst
, 1, 1, CHAN_W
);
2240 emit_add( func
, 0, 1 );
2241 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2242 STORE( func
, *inst
, 0, 0, chan_index
);
2246 case TGSI_OPCODE_COS
:
2247 FETCH( func
, *inst
, 0, 0, CHAN_X
);
2248 emit_cos( func
, 0, 0 );
2249 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2250 STORE( func
, *inst
, 0, 0, chan_index
);
2254 case TGSI_OPCODE_DDX
:
2258 case TGSI_OPCODE_DDY
:
2262 case TGSI_OPCODE_KILP
:
2263 /* predicated kill */
2265 return 0; /* XXX fix me */
2268 case TGSI_OPCODE_KIL
:
2269 /* conditional kill */
2270 emit_kil( func
, &inst
->FullSrcRegisters
[0] );
2273 case TGSI_OPCODE_PK2H
:
2277 case TGSI_OPCODE_PK2US
:
2281 case TGSI_OPCODE_PK4B
:
2285 case TGSI_OPCODE_PK4UB
:
2289 case TGSI_OPCODE_RFL
:
2293 case TGSI_OPCODE_SEQ
:
2297 case TGSI_OPCODE_SFL
:
2301 case TGSI_OPCODE_SGT
:
2305 case TGSI_OPCODE_SIN
:
2306 FETCH( func
, *inst
, 0, 0, CHAN_X
);
2307 emit_sin( func
, 0, 0 );
2308 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2309 STORE( func
, *inst
, 0, 0, chan_index
);
2313 case TGSI_OPCODE_SLE
:
2317 case TGSI_OPCODE_SNE
:
2321 case TGSI_OPCODE_STR
:
2325 case TGSI_OPCODE_TEX
:
2326 emit_tex( func
, inst
, FALSE
, FALSE
);
2329 case TGSI_OPCODE_TXD
:
2333 case TGSI_OPCODE_UP2H
:
2337 case TGSI_OPCODE_UP2US
:
2341 case TGSI_OPCODE_UP4B
:
2345 case TGSI_OPCODE_UP4UB
:
2349 case TGSI_OPCODE_X2D
:
2353 case TGSI_OPCODE_ARA
:
2357 case TGSI_OPCODE_ARR
:
2358 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2359 FETCH( func
, *inst
, 0, 0, chan_index
);
2360 emit_rnd( func
, 0, 0 );
2361 emit_f2it( func
, 0 );
2362 STORE( func
, *inst
, 0, 0, chan_index
);
2366 case TGSI_OPCODE_BRA
:
2370 case TGSI_OPCODE_CAL
:
2374 case TGSI_OPCODE_RET
:
2378 case TGSI_OPCODE_END
:
2381 case TGSI_OPCODE_SSG
:
2382 /* TGSI_OPCODE_SGN */
2383 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2384 FETCH( func
, *inst
, 0, 0, chan_index
);
2385 emit_sgn( func
, 0, 0 );
2386 STORE( func
, *inst
, 0, 0, chan_index
);
2390 case TGSI_OPCODE_CMP
:
2391 emit_cmp (func
, inst
);
2394 case TGSI_OPCODE_SCS
:
2395 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) {
2396 FETCH( func
, *inst
, 0, 0, CHAN_X
);
2397 emit_cos( func
, 0, 0 );
2398 STORE( func
, *inst
, 0, 0, CHAN_X
);
2400 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) {
2401 FETCH( func
, *inst
, 0, 0, CHAN_X
);
2402 emit_sin( func
, 0, 0 );
2403 STORE( func
, *inst
, 0, 0, CHAN_Y
);
2405 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) {
2409 TGSI_EXEC_TEMP_00000000_I
,
2410 TGSI_EXEC_TEMP_00000000_C
);
2411 STORE( func
, *inst
, 0, 0, CHAN_Z
);
2413 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) {
2419 STORE( func
, *inst
, 0, 0, CHAN_W
);
2423 case TGSI_OPCODE_TXB
:
2424 emit_tex( func
, inst
, TRUE
, FALSE
);
2427 case TGSI_OPCODE_NRM
:
2429 case TGSI_OPCODE_NRM4
:
2430 /* 3 or 4-component normalization */
2432 uint dims
= (inst
->Instruction
.Opcode
== TGSI_OPCODE_NRM
) ? 3 : 4;
2434 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_X
) ||
2435 IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_Y
) ||
2436 IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_Z
) ||
2437 (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_W
) && dims
== 4)) {
2439 /* NOTE: Cannot use xmm regs 2/3 here (see emit_rsqrt() above). */
2442 /* xmm0 = src.x * src.x */
2443 FETCH(func
, *inst
, 0, 0, CHAN_X
);
2444 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_X
)) {
2445 emit_MOV(func
, 4, 0);
2447 emit_mul(func
, 0, 0);
2450 /* xmm0 = xmm0 + src.y * src.y */
2451 FETCH(func
, *inst
, 1, 0, CHAN_Y
);
2452 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_Y
)) {
2453 emit_MOV(func
, 5, 1);
2455 emit_mul(func
, 1, 1);
2456 emit_add(func
, 0, 1);
2459 /* xmm0 = xmm0 + src.z * src.z */
2460 FETCH(func
, *inst
, 1, 0, CHAN_Z
);
2461 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_Z
)) {
2462 emit_MOV(func
, 6, 1);
2464 emit_mul(func
, 1, 1);
2465 emit_add(func
, 0, 1);
2469 /* xmm0 = xmm0 + src.w * src.w */
2470 FETCH(func
, *inst
, 1, 0, CHAN_W
);
2471 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_W
)) {
2472 emit_MOV(func
, 7, 1);
2474 emit_mul(func
, 1, 1);
2475 emit_add(func
, 0, 1);
2478 /* xmm1 = 1 / sqrt(xmm0) */
2479 emit_rsqrt(func
, 1, 0);
2481 /* dst.x = xmm1 * src.x */
2482 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_X
)) {
2483 emit_mul(func
, 4, 1);
2484 STORE(func
, *inst
, 4, 0, CHAN_X
);
2487 /* dst.y = xmm1 * src.y */
2488 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_Y
)) {
2489 emit_mul(func
, 5, 1);
2490 STORE(func
, *inst
, 5, 0, CHAN_Y
);
2493 /* dst.z = xmm1 * src.z */
2494 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_Z
)) {
2495 emit_mul(func
, 6, 1);
2496 STORE(func
, *inst
, 6, 0, CHAN_Z
);
2499 /* dst.w = xmm1 * src.w */
/* Fix: guard on CHAN_W — the channel actually written below — not CHAN_X.
 * Every other store in this NRM/NRM4 sequence tests the same channel it
 * writes (CHAN_X at 2482, CHAN_Y at 2488, CHAN_Z at 2494), and the
 * companion dims==3 branch at 2507 correctly tests CHAN_W.  Testing
 * CHAN_X here would skip the w write whenever dst.x is disabled, and
 * emit it even when dst.w was not requested.
 */
2500 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_W
) && dims
== 4) {
2501 emit_mul(func
, 7, 1);
2502 STORE(func
, *inst
, 7, 0, CHAN_W
);
2507 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_W
) && dims
== 3) {
2508 emit_tempf(func
, 0, TEMP_ONE_I
, TEMP_ONE_C
);
2509 STORE(func
, *inst
, 0, 0, CHAN_W
);
2514 case TGSI_OPCODE_DIV
:
2518 case TGSI_OPCODE_DP2
:
2519 FETCH( func
, *inst
, 0, 0, CHAN_X
); /* xmm0 = src[0].x */
2520 FETCH( func
, *inst
, 1, 1, CHAN_X
); /* xmm1 = src[1].x */
2521 emit_mul( func
, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
2522 FETCH( func
, *inst
, 1, 0, CHAN_Y
); /* xmm1 = src[0].y */
2523 FETCH( func
, *inst
, 2, 1, CHAN_Y
); /* xmm2 = src[1].y */
2524 emit_mul( func
, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
2525 emit_add( func
, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2526 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2527 STORE( func
, *inst
, 0, 0, chan_index
); /* dest[ch] = xmm0 */
2531 case TGSI_OPCODE_TXL
:
2532 emit_tex( func
, inst
, TRUE
, FALSE
);
2535 case TGSI_OPCODE_TXP
:
2536 emit_tex( func
, inst
, FALSE
, TRUE
);
2539 case TGSI_OPCODE_BRK
:
2543 case TGSI_OPCODE_IF
:
2547 case TGSI_OPCODE_BGNFOR
:
2551 case TGSI_OPCODE_REP
:
2555 case TGSI_OPCODE_ELSE
:
2559 case TGSI_OPCODE_ENDIF
:
2563 case TGSI_OPCODE_ENDFOR
:
2567 case TGSI_OPCODE_ENDREP
:
2571 case TGSI_OPCODE_PUSHA
:
2575 case TGSI_OPCODE_POPA
:
2579 case TGSI_OPCODE_CEIL
:
2583 case TGSI_OPCODE_I2F
:
2587 case TGSI_OPCODE_NOT
:
2591 case TGSI_OPCODE_TRUNC
:
2592 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2593 FETCH( func
, *inst
, 0, 0, chan_index
);
2594 emit_f2it( func
, 0 );
2595 emit_i2f( func
, 0 );
2596 STORE( func
, *inst
, 0, 0, chan_index
);
2600 case TGSI_OPCODE_SHL
:
2604 case TGSI_OPCODE_SHR
:
2608 case TGSI_OPCODE_AND
:
2612 case TGSI_OPCODE_OR
:
2616 case TGSI_OPCODE_MOD
:
2620 case TGSI_OPCODE_XOR
:
2624 case TGSI_OPCODE_SAD
:
2628 case TGSI_OPCODE_TXF
:
2632 case TGSI_OPCODE_TXQ
:
2636 case TGSI_OPCODE_CONT
:
2640 case TGSI_OPCODE_EMIT
:
2644 case TGSI_OPCODE_ENDPRIM
:
2657 struct x86_function
*func
,
2658 struct tgsi_full_declaration
*decl
)
2660 if( decl
->Declaration
.File
== TGSI_FILE_INPUT
) {
2661 unsigned first
, last
, mask
;
2664 first
= decl
->DeclarationRange
.First
;
2665 last
= decl
->DeclarationRange
.Last
;
2666 mask
= decl
->Declaration
.UsageMask
;
2668 for( i
= first
; i
<= last
; i
++ ) {
2669 for( j
= 0; j
< NUM_CHANNELS
; j
++ ) {
2670 if( mask
& (1 << j
) ) {
2671 switch( decl
->Declaration
.Interpolate
) {
2672 case TGSI_INTERPOLATE_CONSTANT
:
2673 emit_coef_a0( func
, 0, i
, j
);
2674 emit_inputs( func
, 0, i
, j
);
2677 case TGSI_INTERPOLATE_LINEAR
:
2678 emit_tempf( func
, 0, 0, TGSI_SWIZZLE_X
);
2679 emit_coef_dadx( func
, 1, i
, j
);
2680 emit_tempf( func
, 2, 0, TGSI_SWIZZLE_Y
);
2681 emit_coef_dady( func
, 3, i
, j
);
2682 emit_mul( func
, 0, 1 ); /* x * dadx */
2683 emit_coef_a0( func
, 4, i
, j
);
2684 emit_mul( func
, 2, 3 ); /* y * dady */
2685 emit_add( func
, 0, 4 ); /* x * dadx + a0 */
2686 emit_add( func
, 0, 2 ); /* x * dadx + y * dady + a0 */
2687 emit_inputs( func
, 0, i
, j
);
2690 case TGSI_INTERPOLATE_PERSPECTIVE
:
2691 emit_tempf( func
, 0, 0, TGSI_SWIZZLE_X
);
2692 emit_coef_dadx( func
, 1, i
, j
);
2693 emit_tempf( func
, 2, 0, TGSI_SWIZZLE_Y
);
2694 emit_coef_dady( func
, 3, i
, j
);
2695 emit_mul( func
, 0, 1 ); /* x * dadx */
2696 emit_tempf( func
, 4, 0, TGSI_SWIZZLE_W
);
2697 emit_coef_a0( func
, 5, i
, j
);
2698 emit_rcp( func
, 4, 4 ); /* 1.0 / w */
2699 emit_mul( func
, 2, 3 ); /* y * dady */
2700 emit_add( func
, 0, 5 ); /* x * dadx + a0 */
2701 emit_add( func
, 0, 2 ); /* x * dadx + y * dady + a0 */
2702 emit_mul( func
, 0, 4 ); /* (x * dadx + y * dady + a0) / w */
2703 emit_inputs( func
, 0, i
, j
);
/* Emit x86/SSE code that repacks per-vertex AoS input data into the SoA
 * layout of tgsi_exec_machine::Inputs.  Per loop iteration it gathers
 * four vertices (stride bytes apart) with movlps/movhps, transposes the
 * resulting 4x4 float matrix with shufps, and stores the x/y/z/w planes
 * at soa offsets 0/16/32/48.
 * NOTE(review): the extraction elided the remaining parameters of the
 * signature (orig. lines 2717-2721) and the declaration of inner_loop /
 * the arg_* indices (orig. 2726-2729) — confirm against the full file.
 */
2716 static void aos_to_soa( struct x86_function
*func
,
/* Register allocation for the generated loop: AX/BX/CX/DX. */
2722 struct x86_reg soa_input
= x86_make_reg( file_REG32
, reg_AX
);
2723 struct x86_reg aos_input
= x86_make_reg( file_REG32
, reg_BX
);
2724 struct x86_reg num_inputs
= x86_make_reg( file_REG32
, reg_CX
);
2725 struct x86_reg stride
= x86_make_reg( file_REG32
, reg_DX
);
/* EBX is callee-saved in the generated code, so preserve it around use. */
2730 x86_push( func
, x86_make_reg( file_REG32
, reg_BX
) );
/* Load the function arguments: aos pointer, machine pointer (biased to
 * its Inputs member), input count, and per-vertex stride. */
2732 x86_mov( func
, aos_input
, x86_fn_arg( func
, arg_aos
) );
2733 x86_mov( func
, soa_input
, x86_fn_arg( func
, arg_machine
) );
2734 x86_lea( func
, soa_input
,
2735 x86_make_disp( soa_input
,
2736 Offset(struct tgsi_exec_machine
, Inputs
) ) );
2737 x86_mov( func
, num_inputs
, x86_fn_arg( func
, arg_num
) );
2738 x86_mov( func
, stride
, x86_fn_arg( func
, arg_stride
) );
/* Top of the per-attribute loop. */
2741 inner_loop
= x86_get_label( func
);
/* Save aos_input; it is advanced by stride three times below to reach
 * the four vertices being gathered, then restored. */
2743 x86_push( func
, aos_input
);
/* Gather: xmm0/xmm1 collect the xy pairs of vertices 0-3, xmm3/xmm4
 * their zw pairs (movlps = low half, movhps = high half). */
2744 sse_movlps( func
, make_xmm( 0 ), x86_make_disp( aos_input
, 0 ) );
2745 sse_movlps( func
, make_xmm( 3 ), x86_make_disp( aos_input
, 8 ) );
2746 x86_add( func
, aos_input
, stride
);
2747 sse_movhps( func
, make_xmm( 0 ), x86_make_disp( aos_input
, 0 ) );
2748 sse_movhps( func
, make_xmm( 3 ), x86_make_disp( aos_input
, 8 ) );
2749 x86_add( func
, aos_input
, stride
);
2750 sse_movlps( func
, make_xmm( 1 ), x86_make_disp( aos_input
, 0 ) );
2751 sse_movlps( func
, make_xmm( 4 ), x86_make_disp( aos_input
, 8 ) );
2752 x86_add( func
, aos_input
, stride
);
2753 sse_movhps( func
, make_xmm( 1 ), x86_make_disp( aos_input
, 0 ) );
2754 sse_movhps( func
, make_xmm( 4 ), x86_make_disp( aos_input
, 8 ) );
2755 x86_pop( func
, aos_input
);
/* Transpose: shufps 0x88 selects the even lanes (x resp. z plane),
 * 0xdd the odd lanes (y resp. w plane). */
2757 sse_movaps( func
, make_xmm( 2 ), make_xmm( 0 ) );
2758 sse_movaps( func
, make_xmm( 5 ), make_xmm( 3 ) );
2759 sse_shufps( func
, make_xmm( 0 ), make_xmm( 1 ), 0x88 );
2760 sse_shufps( func
, make_xmm( 2 ), make_xmm( 1 ), 0xdd );
2761 sse_shufps( func
, make_xmm( 3 ), make_xmm( 4 ), 0x88 );
2762 sse_shufps( func
, make_xmm( 5 ), make_xmm( 4 ), 0xdd );
/* Store the four SoA planes contiguously (x,y,z,w at +0,+16,+32,+48). */
2764 sse_movups( func
, x86_make_disp( soa_input
, 0 ), make_xmm( 0 ) );
2765 sse_movups( func
, x86_make_disp( soa_input
, 16 ), make_xmm( 2 ) );
2766 sse_movups( func
, x86_make_disp( soa_input
, 32 ), make_xmm( 3 ) );
2767 sse_movups( func
, x86_make_disp( soa_input
, 48 ), make_xmm( 5 ) );
2769 /* Advance to next input */
2770 x86_lea( func
, aos_input
, x86_make_disp(aos_input
, 16) );
2771 x86_lea( func
, soa_input
, x86_make_disp(soa_input
, 64) );
2773 /* while --num_inputs */
2774 x86_dec( func
, num_inputs
);
2775 x86_jcc( func
, cc_NE
, inner_loop
);
/* Restore the callee-saved EBX pushed at entry. */
2778 x86_pop( func
, x86_make_reg( file_REG32
, reg_BX
) );
/* Emit x86/SSE code for the inverse of aos_to_soa: read the x/y/z/w SoA
 * planes of tgsi_exec_machine::Outputs, transpose them back into four
 * per-vertex AoS records with unpcklps/unpckhps, and scatter the records
 * to destinations "stride" bytes apart.
 * NOTE(review): the extraction elided the remaining signature parameters
 * (orig. lines 2782-2786) and the inner_loop / arg_* declarations
 * (orig. 2791-2793) — confirm against the full file.
 */
2781 static void soa_to_aos( struct x86_function
*func
,
/* Register allocation mirrors aos_to_soa: AX/BX/CX/DX. */
2787 struct x86_reg soa_output
= x86_make_reg( file_REG32
, reg_AX
);
2788 struct x86_reg aos_output
= x86_make_reg( file_REG32
, reg_BX
);
2789 struct x86_reg num_outputs
= x86_make_reg( file_REG32
, reg_CX
);
2790 struct x86_reg temp
= x86_make_reg( file_REG32
, reg_DX
);
/* Preserve callee-saved EBX for the duration of the generated code. */
2794 x86_push( func
, x86_make_reg( file_REG32
, reg_BX
) );
/* Load arguments: aos destination, machine pointer (biased to its
 * Outputs member), and the output count. */
2796 x86_mov( func
, aos_output
, x86_fn_arg( func
, arg_aos
) );
2797 x86_mov( func
, soa_output
, x86_fn_arg( func
, arg_machine
) );
2798 x86_lea( func
, soa_output
,
2799 x86_make_disp( soa_output
,
2800 Offset(struct tgsi_exec_machine
, Outputs
) ) );
2801 x86_mov( func
, num_outputs
, x86_fn_arg( func
, arg_num
) );
/* Top of the per-attribute loop. */
2804 inner_loop
= x86_get_label( func
);
/* Load the four SoA planes (x,y,z,w at +0,+16,+32,+48). */
2806 sse_movups( func
, make_xmm( 0 ), x86_make_disp( soa_output
, 0 ) );
2807 sse_movups( func
, make_xmm( 1 ), x86_make_disp( soa_output
, 16 ) );
2808 sse_movups( func
, make_xmm( 3 ), x86_make_disp( soa_output
, 32 ) );
2809 sse_movups( func
, make_xmm( 4 ), x86_make_disp( soa_output
, 48 ) );
/* Transpose back to AoS: unpcklps interleaves the low halves (vertices
 * 0/1), unpckhps the high halves (vertices 2/3). */
2811 sse_movaps( func
, make_xmm( 2 ), make_xmm( 0 ) );
2812 sse_movaps( func
, make_xmm( 5 ), make_xmm( 3 ) );
2813 sse_unpcklps( func
, make_xmm( 0 ), make_xmm( 1 ) );
2814 sse_unpckhps( func
, make_xmm( 2 ), make_xmm( 1 ) );
2815 sse_unpcklps( func
, make_xmm( 3 ), make_xmm( 4 ) );
2816 sse_unpckhps( func
, make_xmm( 5 ), make_xmm( 4 ) );
/* temp = stride; aos_output is saved, advanced past the four scattered
 * vertex records below, then restored. */
2818 x86_mov( func
, temp
, x86_fn_arg( func
, arg_stride
) );
2819 x86_push( func
, aos_output
);
/* Scatter xy (movlps/movhps of xmm0, xmm2) and zw (xmm3, xmm5) halves
 * of each of the four vertices, stepping by stride between vertices. */
2820 sse_movlps( func
, x86_make_disp( aos_output
, 0 ), make_xmm( 0 ) );
2821 sse_movlps( func
, x86_make_disp( aos_output
, 8 ), make_xmm( 3 ) );
2822 x86_add( func
, aos_output
, temp
);
2823 sse_movhps( func
, x86_make_disp( aos_output
, 0 ), make_xmm( 0 ) );
2824 sse_movhps( func
, x86_make_disp( aos_output
, 8 ), make_xmm( 3 ) );
2825 x86_add( func
, aos_output
, temp
);
2826 sse_movlps( func
, x86_make_disp( aos_output
, 0 ), make_xmm( 2 ) );
2827 sse_movlps( func
, x86_make_disp( aos_output
, 8 ), make_xmm( 5 ) );
2828 x86_add( func
, aos_output
, temp
);
2829 sse_movhps( func
, x86_make_disp( aos_output
, 0 ), make_xmm( 2 ) );
2830 sse_movhps( func
, x86_make_disp( aos_output
, 8 ), make_xmm( 5 ) );
2831 x86_pop( func
, aos_output
);
2833 /* Advance to next output */
2834 x86_lea( func
, aos_output
, x86_make_disp(aos_output
, 16) );
2835 x86_lea( func
, soa_output
, x86_make_disp(soa_output
, 64) );
2837 /* while --num_outputs */
2838 x86_dec( func
, num_outputs
);
2839 x86_jcc( func
, cc_NE
, inner_loop
);
/* Restore the callee-saved EBX pushed at entry. */
2842 x86_pop( func
, x86_make_reg( file_REG32
, reg_BX
) );
2846 * Translate a TGSI vertex/fragment shader to SSE2 code.
2847 * Slightly different things are done for vertex vs. fragment shaders.
2849 * \param tokens the TGSI input shader
2850 * \param func the output SSE code/function
2851 * \param immediates buffer to place immediates, later passed to SSE func
2852 * \param return 1 for success, 0 if translation failed
2856 const struct tgsi_token
*tokens
,
2857 struct x86_function
*func
,
2858 float (*immediates
)[4],
2859 boolean do_swizzles
)
2861 struct tgsi_parse_context parse
;
2863 uint num_immediates
= 0;
2867 func
->csr
= func
->store
;
2869 tgsi_parse_init( &parse
, tokens
);
2871 /* Can't just use EDI, EBX without save/restoring them:
2873 x86_push( func
, x86_make_reg( file_REG32
, reg_BX
) );
2874 x86_push( func
, x86_make_reg( file_REG32
, reg_DI
) );
2877 * Different function args for vertex/fragment shaders:
2879 if (parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_VERTEX
) {
2885 6 ); /* input_stride */
2891 x86_fn_arg( func
, 1 ) );
2895 x86_fn_arg( func
, 2 ) );
2898 get_immediate_base(),
2899 x86_fn_arg( func
, 3 ) );
2901 if (parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_FRAGMENT
) {
2905 x86_fn_arg( func
, 4 ) );
2911 x86_make_disp( get_machine_base(),
2912 Offset( struct tgsi_exec_machine
, Samplers
) ) );
2915 while( !tgsi_parse_end_of_tokens( &parse
) && ok
) {
2916 tgsi_parse_token( &parse
);
2918 switch( parse
.FullToken
.Token
.Type
) {
2919 case TGSI_TOKEN_TYPE_DECLARATION
:
2920 if (parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_FRAGMENT
) {
2923 &parse
.FullToken
.FullDeclaration
);
2927 case TGSI_TOKEN_TYPE_INSTRUCTION
:
2928 ok
= emit_instruction(
2930 &parse
.FullToken
.FullInstruction
);
2933 uint opcode
= parse
.FullToken
.FullInstruction
.Instruction
.Opcode
;
2934 debug_printf("failed to translate tgsi opcode %d (%s) to SSE (%s)\n",
2936 tgsi_get_opcode_name(opcode
),
2937 parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_VERTEX
?
2938 "vertex shader" : "fragment shader");
2941 if (tgsi_check_soa_dependencies(&parse
.FullToken
.FullInstruction
)) {
2942 uint opcode
= parse
.FullToken
.FullInstruction
.Instruction
.Opcode
;
2944 /* XXX: we only handle src/dst aliasing in a few opcodes
2945 * currently. Need to use an additional temporay to hold
2946 * the result in the cases where the code is too opaque to
2949 if (opcode
!= TGSI_OPCODE_MOV
&&
2950 opcode
!= TGSI_OPCODE_SWZ
) {
2951 debug_printf("Warning: src/dst aliasing in instruction"
2952 " is not handled:\n");
2953 tgsi_dump_instruction(&parse
.FullToken
.FullInstruction
, 1);
2958 case TGSI_TOKEN_TYPE_IMMEDIATE
:
2959 /* simply copy the immediate values into the next immediates[] slot */
2961 const uint size
= parse
.FullToken
.FullImmediate
.Immediate
.NrTokens
- 1;
2964 assert(num_immediates
< TGSI_EXEC_NUM_IMMEDIATES
);
2965 for( i
= 0; i
< size
; i
++ ) {
2966 immediates
[num_immediates
][i
] =
2967 parse
.FullToken
.FullImmediate
.u
[i
].Float
;
2970 debug_printf("SSE FS immediate[%d] = %f %f %f %f\n",
2972 immediates
[num_immediates
][0],
2973 immediates
[num_immediates
][1],
2974 immediates
[num_immediates
][2],
2975 immediates
[num_immediates
][3]);
2987 if (parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_VERTEX
) {
2992 8, /* num_outputs */
2993 9 ); /* output_stride */
2996 /* Can't just use EBX, EDI without save/restoring them:
2998 x86_pop( func
, x86_make_reg( file_REG32
, reg_DI
) );
2999 x86_pop( func
, x86_make_reg( file_REG32
, reg_BX
) );
3003 tgsi_parse_free( &parse
);
3008 #endif /* PIPE_ARCH_X86 */