*
**************************************************************************/
-#include "pipe/p_debug.h"
+#include "pipe/p_config.h"
+
+#if defined(PIPE_ARCH_X86)
+
+#include "util/u_debug.h"
#include "pipe/p_shader_tokens.h"
#include "util/u_math.h"
+#include "util/u_memory.h"
+#if defined(PIPE_ARCH_SSE)
+#include "util/u_sse.h"
+#endif
#include "tgsi/tgsi_parse.h"
#include "tgsi/tgsi_util.h"
#include "tgsi_exec.h"
#include "rtasm/rtasm_x86sse.h"
-#ifdef PIPE_ARCH_X86
-
/* for 1/sqrt()
*
* This costs about 100fps (close to 10%) in gears:
#define TEMP_R0 TGSI_EXEC_TEMP_R0
#define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
+#define TEMP_EXEC_MASK_I TGSI_EXEC_MASK_I
+#define TEMP_EXEC_MASK_C TGSI_EXEC_MASK_C
+
/**
* X86 utility functions.
{
return x86_make_reg(
file_REG32,
- reg_CX );
+ reg_AX );
}
static struct x86_reg
-get_input_base( void )
+get_machine_base( void )
{
return x86_make_reg(
file_REG32,
- reg_AX );
+ reg_CX );
+}
+
+static struct x86_reg
+get_input_base( void )
+{
+ return x86_make_disp(
+ get_machine_base(),
+ Offset(struct tgsi_exec_machine, Inputs) );
}
static struct x86_reg
get_output_base( void )
{
- return x86_make_reg(
- file_REG32,
- reg_DX );
+ return x86_make_disp(
+ get_machine_base(),
+ Offset(struct tgsi_exec_machine, Outputs) );
}
static struct x86_reg
get_temp_base( void )
+{
+ return x86_make_disp(
+ get_machine_base(),
+ Offset(struct tgsi_exec_machine, Temps) );
+}
+
+static struct x86_reg
+get_coef_base( void )
{
return x86_make_reg(
file_REG32,
}
static struct x86_reg
-get_coef_base( void )
+get_sampler_base( void )
{
- return get_output_base();
+ return x86_make_reg(
+ file_REG32,
+ reg_DI );
}
static struct x86_reg
{
return x86_make_reg(
file_REG32,
- reg_DI );
+ reg_DX );
}
(vec * 4 + chan) * 4 );
}
+/**
+ * Build the memory operand for the sampler-pointer slot of texture unit
+ * 'unit': get_sampler_base() displaced by 'unit' pointer-sized entries.
+ */
+static struct x86_reg
+get_sampler_ptr(
+   unsigned unit )
+{
+   struct x86_reg base = get_sampler_base();
+
+   return x86_make_disp( base, unit * sizeof( struct tgsi_sampler * ) );
+}
+
static struct x86_reg
get_input(
unsigned vec,
int indirectIndex )
{
if (indirect) {
- struct x86_reg r0 = get_input_base();
- struct x86_reg r1 = get_output_base();
+ /* 'vec' is the offset from the address register's value.
+ * We're loading CONST[ADDR+vec] into an xmm register.
+ */
+ struct x86_reg r0 = get_immediate_base();
+ struct x86_reg r1 = get_coef_base();
uint i;
assert( indirectFile == TGSI_FILE_ADDRESS );
assert( indirectIndex == 0 );
+ assert( r0.mod == mod_REG );
+ assert( r1.mod == mod_REG );
x86_push( func, r0 );
x86_push( func, r1 );
+ /*
+ * Loop over the four pixels or vertices in the quad.
+ * Get the value of the address (offset) register for pixel/vertex[i],
+ * add it to the src offset and index into the constant buffer.
+ * Note that we're working on SOA data.
+ * If any of the pixel/vertex execution channels are unused their
+ * values will be garbage. It's very important that we don't use
+ * those garbage values as indexes into the constant buffer since
+ * that'll cause segfaults.
+ * The solution is to bitwise-AND the offset with the execution mask
+ * register whose values are either 0 or ~0.
+ * The caller must set up the execution mask register to indicate
+ * which channels are valid/alive before running the shader.
+ * The execution mask will also figure into loops and conditionals
+ * someday.
+ */
for (i = 0; i < QUAD_SIZE; i++) {
- x86_lea( func, r0, get_const( vec, chan ) );
+ /* r1 = address register[i] */
x86_mov( func, r1, x86_make_disp( get_temp( TEMP_ADDR, CHAN_X ), i * 4 ) );
+ /* r0 = execution mask[i] */
+ x86_mov( func, r0, x86_make_disp( get_temp( TEMP_EXEC_MASK_I, TEMP_EXEC_MASK_C ), i * 4 ) );
+ /* r1 = r1 & r0 */
+ x86_and( func, r1, r0 );
+ /* r0 = 'vec', the offset */
+ x86_lea( func, r0, get_const( vec, chan ) );
- /* Quick hack to multiply by 16 -- need to add SHL to rtasm.
+ /* Quick hack to multiply r1 by 16 -- need to add SHL to rtasm.
*/
x86_add( func, r1, r1 );
x86_add( func, r1, r1 );
x86_add( func, r1, r1 );
x86_add( func, r1, r1 );
- x86_add( func, r0, r1 );
+ x86_add( func, r0, r1 ); /* r0 = r0 + r1 */
x86_mov( func, r1, x86_deref( r0 ) );
x86_mov( func, x86_make_disp( get_temp( TEMP_R0, CHAN_X ), i * 4 ), r1 );
}
get_temp( TEMP_R0, CHAN_X ) );
}
else {
+ /* 'vec' is the index into the src register file, such as TEMP[vec] */
assert( vec >= 0 );
sse_movss(
* Function call helpers.
*/
+/**
+ * NOTE: In gcc, if the destination uses the SSE intrinsics, then it must be
+ * defined with __attribute__((force_align_arg_pointer)), as we do not guarantee
+ * that the stack pointer is 16 byte aligned, as expected.
+ */
static void
-emit_push_gp(
- struct x86_function *func )
+emit_func_call(
+ struct x86_function *func,
+ unsigned xmm_save_mask,
+ const struct x86_reg *arg,
+ unsigned nr_args,
+ void (PIPE_CDECL *code)() )
{
+ struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
+ unsigned i, n;
+
x86_push(
func,
x86_make_reg( file_REG32, reg_AX) );
x86_push(
func,
x86_make_reg( file_REG32, reg_DX) );
-}
+
+ /* Store XMM regs to the stack
+ */
+ for(i = 0, n = 0; i < 8; ++i)
+ if(xmm_save_mask & (1 << i))
+ ++n;
+
+ x86_sub_imm(
+ func,
+ x86_make_reg( file_REG32, reg_SP ),
+ n*16);
+
+ for(i = 0, n = 0; i < 8; ++i)
+ if(xmm_save_mask & (1 << i)) {
+ sse_movups(
+ func,
+ x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ),
+ make_xmm( i ) );
+ ++n;
+ }
+
+ for (i = 0; i < nr_args; i++) {
+ /* Load the address of the buffer we use for passing arguments and
+ * receiving results:
+ */
+ x86_lea(
+ func,
+ ecx,
+ arg[i] );
+
+ /* Push actual function arguments (currently just the pointer to
+ * the buffer above), and call the function:
+ */
+ x86_push( func, ecx );
+ }
+
+ x86_mov_reg_imm( func, ecx, (unsigned long) code );
+ x86_call( func, ecx );
+
+ /* Pop the arguments (or just add an immediate to esp)
+ */
+ for (i = 0; i < nr_args; i++) {
+ x86_pop(func, ecx );
+ }
+
+ /* Pop the saved XMM regs:
+ */
+ for(i = 0, n = 0; i < 8; ++i)
+ if(xmm_save_mask & (1 << i)) {
+ sse_movups(
+ func,
+ make_xmm( i ),
+ x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ) );
+ ++n;
+ }
+
+ x86_add_imm(
+ func,
+ x86_make_reg( file_REG32, reg_SP ),
+ n*16);
-static void
-x86_pop_gp(
- struct x86_function *func )
-{
/* Restore GP registers in a reverse order.
*/
x86_pop(
}
static void
-emit_func_call_dst(
+emit_func_call_dst_src1(
struct x86_function *func,
+ unsigned xmm_save,
unsigned xmm_dst,
+ unsigned xmm_src0,
void (PIPE_CDECL *code)() )
{
+ struct x86_reg store = get_temp( TEMP_R0, 0 );
+ unsigned xmm_mask = ((1 << xmm_save) - 1) & ~(1 << xmm_dst);
+
+ /* Store our input parameters (in xmm regs) to the buffer we use
+ * for passing arguments. We will pass a pointer to this buffer as
+ * the actual function argument.
+ */
sse_movaps(
func,
- get_temp( TEMP_R0, 0 ),
- make_xmm( xmm_dst ) );
-
- emit_push_gp(
- func );
+ store,
+ make_xmm( xmm_src0 ) );
- {
- struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
-
- x86_lea(
- func,
- ecx,
- get_temp( TEMP_R0, 0 ) );
-
- x86_push( func, ecx );
- x86_mov_reg_imm( func, ecx, (unsigned long) code );
- x86_call( func, ecx );
- x86_pop(func, ecx );
- }
-
-
- x86_pop_gp(
- func );
+ emit_func_call( func,
+ xmm_mask,
+ &store,
+ 1,
+ code );
sse_movaps(
func,
make_xmm( xmm_dst ),
- get_temp( TEMP_R0, 0 ) );
+ store );
}
+
static void
-emit_func_call_dst_src(
+emit_func_call_dst_src2(
struct x86_function *func,
+ unsigned xmm_save,
unsigned xmm_dst,
- unsigned xmm_src,
+ unsigned xmm_src0,
+ unsigned xmm_src1,
void (PIPE_CDECL *code)() )
{
+ struct x86_reg store = get_temp( TEMP_R0, 0 );
+ unsigned xmm_mask = ((1 << xmm_save) - 1) & ~(1 << xmm_dst);
+
+ /* Store two inputs to parameter buffer.
+ */
sse_movaps(
func,
- get_temp( TEMP_R0, 1 ),
- make_xmm( xmm_src ) );
+ store,
+ make_xmm( xmm_src0 ) );
- emit_func_call_dst(
+ sse_movaps(
func,
- xmm_dst,
- code );
+ x86_make_disp( store, 4 * sizeof(float) ),
+ make_xmm( xmm_src1 ) );
+
+
+ /* Emit the call
+ */
+ emit_func_call( func,
+ xmm_mask,
+ &store,
+ 1,
+ code );
+
+ /* Retrieve the results:
+ */
+ sse_movaps(
+ func,
+ make_xmm( xmm_dst ),
+ store );
+}
+
+
+
+
+
+#if defined(PIPE_ARCH_SSE)
+
+/*
+ * Fast SSE2 implementation of special math functions.
+ */
+
+#define POLY0(x, c0) _mm_set1_ps(c0)
+#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
+#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
+#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
+#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
+#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
+
+#define EXP_POLY_DEGREE 3
+#define LOG_POLY_DEGREE 5
+
+/**
+ * Approximate 2**x for four packed floats.
+ *
+ * The input is clamped, split into an integer part and a remainder, and the
+ * result is the product of a power of two built from the integer part and a
+ * polynomial (degree selected by EXP_POLY_DEGREE) evaluated on the remainder.
+ *
+ * See http://www.devmaster.net/forums/showthread.php?p=43580
+ */
+static INLINE __m128
+exp2f4(__m128 x)
+{
+ __m128i ipart;
+ __m128 fpart, expipart, expfpart;
+
+ /* clamp x to [-126.99999, 129] before the float -> int conversion */
+ x = _mm_min_ps(x, _mm_set1_ps( 129.00000f));
+ x = _mm_max_ps(x, _mm_set1_ps(-126.99999f));
+
+ /* ipart = int(x - 0.5) */
+ ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f)));
+
+ /* fpart = x - ipart */
+ fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart));
+
+ /* expipart = (float) (1 << ipart)
+  * Built by adding 127 to ipart, shifting the sum left by 23 bits and
+  * reinterpreting the resulting integer bits as a float.
+  */
+ expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23));
+
+ /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
+#if EXP_POLY_DEGREE == 5
+ expfpart = POLY5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f);
+#elif EXP_POLY_DEGREE == 4
+ expfpart = POLY4(fpart, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 5.2011464e-2f, 1.3534167e-2f);
+#elif EXP_POLY_DEGREE == 3
+ expfpart = POLY3(fpart, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 7.8024521e-2f);
+#elif EXP_POLY_DEGREE == 2
+ expfpart = POLY2(fpart, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f);
+#else
+#error
+#endif
+
+ /* 2**x = 2**ipart * 2**fpart */
+ return _mm_mul_ps(expipart, expfpart);
+}
+
+
+/**
+ * Approximate log2(x) for four packed floats.
+ *
+ * The exponent and mantissa bit fields of each float are separated; the
+ * result is the unbiased exponent plus a polynomial (degree selected by
+ * LOG_POLY_DEGREE) evaluated on the mantissa. The sign bit of the input is
+ * masked away by both field masks and does not influence the result.
+ *
+ * See http://www.devmaster.net/forums/showthread.php?p=43580
+ */
+static INLINE __m128
+log2f4(__m128 x)
+{
+ /* bit masks for the exponent and mantissa fields of a 32-bit float */
+ __m128i expmask = _mm_set1_epi32(0x7f800000);
+ __m128i mantmask = _mm_set1_epi32(0x007fffff);
+ __m128 one = _mm_set1_ps(1.0f);
+
+ /* reinterpret the float bits as integers */
+ __m128i i = _mm_castps_si128(x);
+
+ /* exp = (float) exponent(x) */
+ __m128 exp = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, expmask), 23), _mm_set1_epi32(127)));
+
+ /* mant = (float) mantissa(x)
+  * The mantissa bits are OR'ed with the bit pattern of 1.0f.
+  */
+ __m128 mant = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, mantmask)), one);
+
+ __m128 logmant;
+
+ /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
+ * These coefficients can be generated with
+ * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
+ */
+#if LOG_POLY_DEGREE == 6
+ logmant = POLY5(mant, 3.11578814719469302614f, -3.32419399085241980044f, 2.59883907202499966007f, -1.23152682416275988241f, 0.318212422185251071475f, -0.0344359067839062357313f);
+#elif LOG_POLY_DEGREE == 5
+ logmant = POLY4(mant, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
+#elif LOG_POLY_DEGREE == 4
+ logmant = POLY3(mant, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
+#elif LOG_POLY_DEGREE == 3
+ logmant = POLY2(mant, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
+#else
+#error
+#endif
+
+ /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0 */
+ logmant = _mm_mul_ps(logmant, _mm_sub_ps(mant, one));
+
+ /* log2(x) = log2(mantissa) + exponent */
+ return _mm_add_ps(logmant, exp);
+}
+
+
+/** Approximate x**y for four packed floats as exp2(log2(x) * y). */
+static INLINE __m128
+powf4(__m128 x, __m128 y)
+{
+   const __m128 log2_x = log2f4(x);
+   const __m128 exponent = _mm_mul_ps(log2_x, y);
+
+   return exp2f4(exponent);
}
+#endif /* PIPE_ARCH_SSE */
+
+
+
/**
* Low-level instruction translators.
*/
static void
emit_cos(
struct x86_function *func,
+ unsigned xmm_save,
unsigned xmm_dst )
{
- emit_func_call_dst(
+ emit_func_call_dst_src1(
func,
+ xmm_save,
+ xmm_dst,
xmm_dst,
cos4f );
}
static void PIPE_CDECL
+#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
+__attribute__((force_align_arg_pointer))
+#endif
ex24f(
float *store )
{
-#if FAST_MATH
+#if defined(PIPE_ARCH_SSE)
+ _mm_store_ps(&store[0], exp2f4( _mm_load_ps(&store[0]) ));
+#else
store[0] = util_fast_exp2( store[0] );
store[1] = util_fast_exp2( store[1] );
store[2] = util_fast_exp2( store[2] );
store[3] = util_fast_exp2( store[3] );
-#else
- store[0] = powf( 2.0f, store[0] );
- store[1] = powf( 2.0f, store[1] );
- store[2] = powf( 2.0f, store[2] );
- store[3] = powf( 2.0f, store[3] );
#endif
}
static void
emit_ex2(
struct x86_function *func,
+ unsigned xmm_save,
unsigned xmm_dst )
{
- emit_func_call_dst(
+ emit_func_call_dst_src1(
func,
+ xmm_save,
+ xmm_dst,
xmm_dst,
ex24f );
}
static void
emit_flr(
struct x86_function *func,
+ unsigned xmm_save,
unsigned xmm_dst )
{
- emit_func_call_dst(
+ emit_func_call_dst_src1(
func,
+ xmm_save,
+ xmm_dst,
xmm_dst,
flr4f );
}
static void
emit_frc(
struct x86_function *func,
+ unsigned xmm_save,
unsigned xmm_dst )
{
- emit_func_call_dst(
+ emit_func_call_dst_src1(
func,
+ xmm_save,
+ xmm_dst,
xmm_dst,
frc4f );
}
static void PIPE_CDECL
+#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
+__attribute__((force_align_arg_pointer))
+#endif
lg24f(
float *store )
{
+#if defined(PIPE_ARCH_SSE)
+ _mm_store_ps(&store[0], log2f4( _mm_load_ps(&store[0]) ));
+#else
store[0] = util_fast_log2( store[0] );
store[1] = util_fast_log2( store[1] );
store[2] = util_fast_log2( store[2] );
store[3] = util_fast_log2( store[3] );
+#endif
}
static void
emit_lg2(
struct x86_function *func,
+ unsigned xmm_save,
unsigned xmm_dst )
{
- emit_func_call_dst(
+ emit_func_call_dst_src1(
func,
+ xmm_save,
+ xmm_dst,
xmm_dst,
lg24f );
}
}
static void PIPE_CDECL
+#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
+__attribute__((force_align_arg_pointer))
+#endif
pow4f(
float *store )
{
-#if FAST_MATH
+#if defined(PIPE_ARCH_SSE)
+ _mm_store_ps(&store[0], powf4( _mm_load_ps(&store[0]), _mm_load_ps(&store[4]) ));
+#else
store[0] = util_fast_pow( store[0], store[4] );
store[1] = util_fast_pow( store[1], store[5] );
store[2] = util_fast_pow( store[2], store[6] );
store[3] = util_fast_pow( store[3], store[7] );
-#else
- store[0] = powf( store[0], store[4] );
- store[1] = powf( store[1], store[5] );
- store[2] = powf( store[2], store[6] );
- store[3] = powf( store[3], store[7] );
#endif
}
static void
emit_pow(
struct x86_function *func,
+ unsigned xmm_save,
unsigned xmm_dst,
- unsigned xmm_src )
+ unsigned xmm_src0,
+ unsigned xmm_src1 )
{
- emit_func_call_dst_src(
+ emit_func_call_dst_src2(
func,
+ xmm_save,
xmm_dst,
- xmm_src,
+ xmm_src0,
+ xmm_src1,
pow4f );
}
make_xmm( xmm_src ) );
}
+/**
+ * Round each of the four floats in 'store' in place, computed as
+ * floorf(value + 0.5f).
+ *
+ * \param store  four input values, overwritten with the results
+ */
+static void PIPE_CDECL
+rnd4f(
+   float *store )
+{
+   unsigned lane;
+
+   for (lane = 0; lane < 4; lane++) {
+      store[lane] = floorf( store[lane] + 0.5f );
+   }
+}
+
+/**
+ * Emit a call to rnd4f() that takes its operand from xmm_dst and returns the
+ * result in the same register.
+ */
+static void
+emit_rnd(
+   struct x86_function *func,
+   unsigned xmm_save,
+   unsigned xmm_dst )
+{
+   /* the operation is in place: source register == destination register */
+   const unsigned xmm_src = xmm_dst;
+
+   emit_func_call_dst_src1( func, xmm_save, xmm_dst, xmm_src, rnd4f );
+}
+
static void
emit_rsqrt(
struct x86_function *func,
TGSI_EXEC_TEMP_80000000_C ) );
}
+/**
+ * Replace each of the four floats in 'store' by its sign: -1.0f when the
+ * value compares less than zero, 1.0f when it compares greater than zero,
+ * and 0.0f otherwise.
+ *
+ * \param store  four input values, overwritten with the results
+ */
+static void PIPE_CDECL
+sgn4f(
+   float *store )
+{
+   unsigned lane;
+
+   for (lane = 0; lane < 4; lane++) {
+      const float value = store[lane];
+
+      if (value < 0.0f) {
+         store[lane] = -1.0f;
+      }
+      else if (value > 0.0f) {
+         store[lane] = 1.0f;
+      }
+      else {
+         store[lane] = 0.0f;
+      }
+   }
+}
+
+/**
+ * Emit a call to sgn4f() that takes its operand from xmm_dst and returns the
+ * result in the same register.
+ */
+static void
+emit_sgn(
+   struct x86_function *func,
+   unsigned xmm_save,
+   unsigned xmm_dst )
+{
+   /* the operation is in place: source register == destination register */
+   const unsigned xmm_src = xmm_dst;
+
+   emit_func_call_dst_src1( func, xmm_save, xmm_dst, xmm_src, sgn4f );
+}
+
static void PIPE_CDECL
sin4f(
float *store )
static void
emit_sin (struct x86_function *func,
+ unsigned xmm_save,
unsigned xmm_dst)
{
- emit_func_call_dst(
+ emit_func_call_dst_src1(
func,
+ xmm_save,
+ xmm_dst,
xmm_dst,
sin4f );
}
make_xmm( xmm_src ) );
}
+
+
+
+
+
+
/**
* Register fetch.
*/
#define STORE( FUNC, INST, XMM, INDEX, CHAN )\
emit_store( FUNC, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN )
+
+/**
+ * Sample a texture through '*sampler'.
+ *
+ * On entry 'store' supplies the coordinate arrays at store[0], store[4] and
+ * store[8]; the lod bias passed to the sampler is always 0.0f. On return the
+ * first 16 floats of 'store' hold the sampled values copied from the
+ * sampler's output array.
+ */
+static void PIPE_CDECL
+fetch_texel( struct tgsi_sampler **sampler,
+             float *store )
+{
+   float rgba[NUM_CHANNELS][QUAD_SIZE];
+   struct tgsi_sampler *smp;
+#if 0
+   uint j;
+
+   debug_printf("%s sampler: %p (%p) store: %p\n",
+                __FUNCTION__,
+                sampler, *sampler,
+                store );
+
+   debug_printf("lodbias %f\n", store[12]);
+
+   for (j = 0; j < 4; j++)
+      debug_printf("sample %d texcoord %f %f\n",
+                   j,
+                   store[0+j],
+                   store[4+j]);
+#endif
+
+   smp = *sampler;
+   smp->get_samples( smp,
+                     store,
+                     store + 4,
+                     store + 8,
+                     0.0f, /*store[12], lodbias */
+                     rgba );
+
+   /* hand the results back through the argument buffer */
+   memcpy( store, rgba, 16 * sizeof(float));
+
+#if 0
+   for (j = 0; j < 4; j++)
+      debug_printf("sample %d result %f %f %f %f\n",
+                   j,
+                   store[0+j],
+                   store[4+j],
+                   store[8+j],
+                   store[12+j]);
+#endif
+}
+
/**
* High-level instruction translators.
*/
+/**
+ * Emit code for a texture sampling instruction.
+ *
+ * The number of coordinates is chosen from the instruction's texture type.
+ * The coordinates (optionally multiplied by a reciprocal when 'projected'
+ * is set) are written to the TEMP_R0 argument buffer, fetch_texel() is
+ * called with that buffer and the sampler slot for the unit taken from
+ * source register 1, and the results are then stored to every enabled
+ * destination channel.
+ *
+ * \param func       function being generated
+ * \param inst       the texture instruction
+ * \param lodbias    if set, fetch a value into xmm3 for the lod bias slot;
+ *                   otherwise zero is used
+ * \param projected  if set, scale each coordinate by the reciprocal of the
+ *                   value fetched into xmm3
+ */
+static void
+emit_tex( struct x86_function *func,
+ const struct tgsi_full_instruction *inst,
+ boolean lodbias,
+ boolean projected)
+{
+ const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index;
+ struct x86_reg args[2];
+ unsigned count;
+ unsigned i;
+
+ /* number of texture coordinates to fetch for this texture type */
+ switch (inst->InstructionExtTexture.Texture) {
+ case TGSI_TEXTURE_1D:
+ case TGSI_TEXTURE_SHADOW1D:
+ count = 1;
+ break;
+ case TGSI_TEXTURE_2D:
+ case TGSI_TEXTURE_RECT:
+ case TGSI_TEXTURE_SHADOW2D:
+ case TGSI_TEXTURE_SHADOWRECT:
+ count = 2;
+ break;
+ case TGSI_TEXTURE_3D:
+ case TGSI_TEXTURE_CUBE:
+ count = 3;
+ break;
+ default:
+ assert(0);
+ return;
+ }
+
+ /* xmm3 = lod bias, or zero when lodbias is not requested.
+  * Presumably FETCH(.., 3, 0, 3) loads channel 3 of source register 0 into
+  * xmm3, by analogy with the STORE macro's argument order -- TODO confirm.
+  */
+ if (lodbias) {
+ FETCH( func, *inst, 3, 0, 3 );
+ }
+ else {
+ emit_tempf(
+ func,
+ 3,
+ TGSI_EXEC_TEMP_00000000_I,
+ TGSI_EXEC_TEMP_00000000_C );
+
+ }
+
+ /* store lodbias whether enabled or not -- fetch_texel currently
+ * respects it always.
+ * NOTE(review): fetch_texel() as visible in this file passes 0.0f to
+ * get_samples() with the store[12] argument commented out -- confirm
+ * which of the two comments is current.
+ */
+ sse_movaps( func,
+ get_temp( TEMP_R0, 3 ),
+ make_xmm( 3 ) );
+
+
+ /* xmm3 = reciprocal of the value fetched for projection */
+ if (projected) {
+ FETCH( func, *inst, 3, 0, 3 );
+
+ emit_rcp( func, 3, 3 );
+ }
+
+ for (i = 0; i < count; i++) {
+ FETCH( func, *inst, i, 0, i );
+
+ if (projected) {
+ sse_mulps(
+ func,
+ make_xmm( i ),
+ make_xmm( 3 ) );
+ }
+
+ /* Store in the argument buffer:
+ */
+ sse_movaps(
+ func,
+ get_temp( TEMP_R0, i ),
+ make_xmm( i ) );
+ }
+
+ /* emit_func_call() pushes args[] in index order, so args[1] (the sampler
+  * slot) is pushed last.
+  */
+ args[0] = get_temp( TEMP_R0, 0 );
+ args[1] = get_sampler_ptr( unit );
+
+
+ emit_func_call( func,
+ 0,
+ args,
+ Elements(args),
+ fetch_texel );
+
+ /* If all four channels are enabled, could use a pointer to
+ * dst[0].x instead of TEMP_R0 for store?
+ */
+ FOR_EACH_DST0_ENABLED_CHANNEL( *inst, i ) {
+
+ sse_movaps(
+ func,
+ make_xmm( 0 ),
+ get_temp( TEMP_R0, i ) );
+
+ STORE( func, *inst, 0, 0, i );
+ }
+}
+
+
static void
emit_kil(
struct x86_function *func,
const struct tgsi_full_src_register *reg )
{
unsigned uniquemask;
- unsigned registers[4];
- unsigned nextregister = 0;
- unsigned firstchan = ~0;
+ unsigned unique_count = 0;
unsigned chan_index;
+ unsigned i;
/* This mask stores component bits that were already tested. Note that
* we test if the value is less than zero, so 1.0 and 0.0 need not to be
uniquemask |= 1 << swizzle;
/* allocate register */
- registers[chan_index] = nextregister;
emit_fetch(
func,
- nextregister,
+ unique_count++,
reg,
chan_index );
- nextregister++;
-
- /* mark the first channel used */
- if( firstchan == ~0 ) {
- firstchan = chan_index;
- }
}
}
func,
x86_make_reg( file_REG32, reg_DX ) );
- FOR_EACH_CHANNEL( chan_index ) {
- if( uniquemask & (1 << chan_index) ) {
- sse_cmpps(
+ for (i = 0 ; i < unique_count; i++ ) {
+ struct x86_reg dataXMM = make_xmm(i);
+
+ sse_cmpps(
+ func,
+ dataXMM,
+ get_temp(
+ TGSI_EXEC_TEMP_00000000_I,
+ TGSI_EXEC_TEMP_00000000_C ),
+ cc_LessThan );
+
+ if( i == 0 ) {
+ sse_movmskps(
func,
- make_xmm( registers[chan_index] ),
- get_temp(
- TGSI_EXEC_TEMP_00000000_I,
- TGSI_EXEC_TEMP_00000000_C ),
- cc_LessThan );
-
- if( chan_index == firstchan ) {
- sse_pmovmskb(
- func,
- x86_make_reg( file_REG32, reg_AX ),
- make_xmm( registers[chan_index] ) );
- }
- else {
- sse_pmovmskb(
- func,
- x86_make_reg( file_REG32, reg_DX ),
- make_xmm( registers[chan_index] ) );
- x86_or(
- func,
- x86_make_reg( file_REG32, reg_AX ),
- x86_make_reg( file_REG32, reg_DX ) );
- }
+ x86_make_reg( file_REG32, reg_AX ),
+ dataXMM );
+ }
+ else {
+ sse_movmskps(
+ func,
+ x86_make_reg( file_REG32, reg_DX ),
+ dataXMM );
+ x86_or(
+ func,
+ x86_make_reg( file_REG32, reg_AX ),
+ x86_make_reg( file_REG32, reg_DX ) );
}
}
}
}
+
+/**
+ * Report whether any source or destination operand of 'inst' addresses the
+ * temporary register file indirectly.
+ *
+ * \param inst  the instruction to inspect
+ * \return TRUE if such an operand exists, FALSE otherwise
+ */
+static boolean
+indirect_temp_reference(const struct tgsi_full_instruction *inst)
+{
+   uint idx;
+
+   /* source operands */
+   for (idx = 0; idx < inst->Instruction.NumSrcRegs; idx++) {
+      const struct tgsi_full_src_register *src = &inst->FullSrcRegisters[idx];
+
+      if (src->SrcRegister.File != TGSI_FILE_TEMPORARY) {
+         continue;
+      }
+      if (src->SrcRegister.Indirect) {
+         return TRUE;
+      }
+   }
+
+   /* destination operands */
+   for (idx = 0; idx < inst->Instruction.NumDstRegs; idx++) {
+      const struct tgsi_full_dst_register *dst = &inst->FullDstRegisters[idx];
+
+      if (dst->DstRegister.File != TGSI_FILE_TEMPORARY) {
+         continue;
+      }
+      if (dst->DstRegister.Indirect) {
+         return TRUE;
+      }
+   }
+
+   return FALSE;
+}
+
+
static int
emit_instruction(
struct x86_function *func,
{
unsigned chan_index;
+ /* we can't handle indirect addressing into temp register file yet */
+ if (indirect_temp_reference(inst))
+ return FALSE;
+
switch (inst->Instruction.Opcode) {
case TGSI_OPCODE_ARL:
FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH( func, *inst, 0, 0, chan_index );
+ emit_flr(func, 0, 0);
emit_f2it( func, 0 );
STORE( func, *inst, 0, 0, chan_index );
}
get_temp(
TGSI_EXEC_TEMP_MINUS_128_I,
TGSI_EXEC_TEMP_MINUS_128_C ) );
- emit_pow( func, 1, 2 );
+ emit_pow( func, 3, 1, 1, 2 );
FETCH( func, *inst, 0, 0, CHAN_X );
sse_xorps(
func,
func,
make_xmm( 2 ),
make_xmm( 0 ),
- cc_LessThanEqual );
+ cc_LessThan );
sse_andps(
func,
make_xmm( 2 ),
case TGSI_OPCODE_RSQ:
/* TGSI_OPCODE_RECIPSQRT */
FETCH( func, *inst, 0, 0, CHAN_X );
+ emit_abs( func, 0 );
emit_rsqrt( func, 1, 0 );
FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
STORE( func, *inst, 1, 0, chan_index );
if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
emit_MOV( func, 1, 0 );
- emit_flr( func, 1 );
+ emit_flr( func, 2, 1 );
/* dst.x = ex2(floor(src.x)) */
if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
emit_MOV( func, 2, 1 );
- emit_ex2( func, 2 );
+ emit_ex2( func, 3, 2 );
STORE( func, *inst, 2, 0, CHAN_X );
}
/* dst.y = src.x - floor(src.x) */
}
/* dst.z = ex2(src.x) */
if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
- emit_ex2( func, 0 );
+ emit_ex2( func, 3, 0 );
STORE( func, *inst, 0, 0, CHAN_Z );
}
}
FETCH( func, *inst, 0, 0, CHAN_X );
emit_abs( func, 0 );
emit_MOV( func, 1, 0 );
- emit_lg2( func, 1 );
+ emit_lg2( func, 2, 1 );
/* dst.z = lg2(abs(src.x)) */
if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
STORE( func, *inst, 1, 0, CHAN_Z );
}
if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
- emit_flr( func, 1 );
+ emit_flr( func, 2, 1 );
/* dst.x = floor(lg2(abs(src.x))) */
if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
STORE( func, *inst, 1, 0, CHAN_X );
}
/* dst.x = abs(src)/ex2(floor(lg2(abs(src.x)))) */
if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
- emit_ex2( func, 1 );
+ emit_ex2( func, 2, 1 );
emit_rcp( func, 1, 1 );
emit_mul( func, 0, 1 );
STORE( func, *inst, 0, 0, CHAN_Y );
case TGSI_OPCODE_DOT2ADD:
/* TGSI_OPCODE_DP2A */
- return 0;
+ FETCH( func, *inst, 0, 0, CHAN_X ); /* xmm0 = src[0].x */
+ FETCH( func, *inst, 1, 1, CHAN_X ); /* xmm1 = src[1].x */
+ emit_mul( func, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
+ FETCH( func, *inst, 1, 0, CHAN_Y ); /* xmm1 = src[0].y */
+ FETCH( func, *inst, 2, 1, CHAN_Y ); /* xmm2 = src[1].y */
+ emit_mul( func, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
+ emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
+ FETCH( func, *inst, 1, 2, CHAN_X ); /* xmm1 = src[2].x */
+ emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
+ FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+ STORE( func, *inst, 0, 0, chan_index ); /* dest[ch] = xmm0 */
+ }
break;
case TGSI_OPCODE_INDEX:
/* TGSI_OPCODE_FRC */
FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH( func, *inst, 0, 0, chan_index );
- emit_frc( func, 0 );
+ emit_frc( func, 0, 0 );
STORE( func, *inst, 0, 0, chan_index );
}
break;
/* TGSI_OPCODE_FLR */
FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH( func, *inst, 0, 0, chan_index );
- emit_flr( func, 0 );
+ emit_flr( func, 0, 0 );
STORE( func, *inst, 0, 0, chan_index );
}
break;
case TGSI_OPCODE_ROUND:
- return 0;
+ FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+ FETCH( func, *inst, 0, 0, chan_index );
+ emit_rnd( func, 0, 0 );
+ STORE( func, *inst, 0, 0, chan_index );
+ }
break;
case TGSI_OPCODE_EXPBASE2:
/* TGSI_OPCODE_EX2 */
FETCH( func, *inst, 0, 0, CHAN_X );
- emit_ex2( func, 0 );
+ emit_ex2( func, 0, 0 );
FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
STORE( func, *inst, 0, 0, chan_index );
}
case TGSI_OPCODE_LOGBASE2:
/* TGSI_OPCODE_LG2 */
FETCH( func, *inst, 0, 0, CHAN_X );
- emit_lg2( func, 0 );
+ emit_lg2( func, 0, 0 );
FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
STORE( func, *inst, 0, 0, chan_index );
}
/* TGSI_OPCODE_POW */
FETCH( func, *inst, 0, 0, CHAN_X );
FETCH( func, *inst, 1, 1, CHAN_X );
- emit_pow( func, 0, 1 );
+ emit_pow( func, 0, 0, 0, 1 );
FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
STORE( func, *inst, 0, 0, chan_index );
}
case TGSI_OPCODE_COS:
FETCH( func, *inst, 0, 0, CHAN_X );
- emit_cos( func, 0 );
+ emit_cos( func, 0, 0 );
FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
STORE( func, *inst, 0, 0, chan_index );
}
case TGSI_OPCODE_SIN:
FETCH( func, *inst, 0, 0, CHAN_X );
- emit_sin( func, 0 );
+ emit_sin( func, 0, 0 );
FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
STORE( func, *inst, 0, 0, chan_index );
}
break;
case TGSI_OPCODE_TEX:
- if (0) {
- /* Disable dummy texture code:
- */
- emit_tempf(
- func,
- 0,
- TEMP_ONE_I,
- TEMP_ONE_C );
- FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
- STORE( func, *inst, 0, 0, chan_index );
- }
- }
- else {
- return 0;
- }
+ emit_tex( func, inst, FALSE, FALSE );
break;
case TGSI_OPCODE_TXD:
break;
case TGSI_OPCODE_ARR:
- return 0;
+ FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+ FETCH( func, *inst, 0, 0, chan_index );
+ emit_rnd( func, 0, 0 );
+ emit_f2it( func, 0 );
+ STORE( func, *inst, 0, 0, chan_index );
+ }
break;
case TGSI_OPCODE_BRA:
break;
case TGSI_OPCODE_SSG:
- return 0;
+ /* TGSI_OPCODE_SGN */
+ FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+ FETCH( func, *inst, 0, 0, chan_index );
+ emit_sgn( func, 0, 0 );
+ STORE( func, *inst, 0, 0, chan_index );
+ }
break;
case TGSI_OPCODE_CMP:
case TGSI_OPCODE_SCS:
IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
FETCH( func, *inst, 0, 0, CHAN_X );
- emit_cos( func, 0 );
+ emit_cos( func, 0, 0 );
STORE( func, *inst, 0, 0, CHAN_X );
}
IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
FETCH( func, *inst, 0, 0, CHAN_X );
- emit_sin( func, 0 );
+ emit_sin( func, 0, 0 );
STORE( func, *inst, 0, 0, CHAN_Y );
}
IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
break;
case TGSI_OPCODE_TXB:
- return 0;
+ emit_tex( func, inst, TRUE, FALSE );
break;
case TGSI_OPCODE_NRM:
- return 0;
+ /* fall-through */
+ case TGSI_OPCODE_NRM4:
+ /* 3 or 4-component normalization */
+ {
+ uint dims = (inst->Instruction.Opcode == TGSI_OPCODE_NRM) ? 3 : 4;
+
+ if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) ||
+ IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y) ||
+ IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z) ||
+ (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 4)) {
+
+ /* NOTE: Cannot use xmm regs 2/3 here (see emit_rsqrt() above). */
+
+ /* xmm4 = src.x */
+ /* xmm0 = src.x * src.x */
+ FETCH(func, *inst, 0, 0, CHAN_X);
+ if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
+ emit_MOV(func, 4, 0);
+ }
+ emit_mul(func, 0, 0);
+
+ /* xmm5 = src.y */
+ /* xmm0 = xmm0 + src.y * src.y */
+ FETCH(func, *inst, 1, 0, CHAN_Y);
+ if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
+ emit_MOV(func, 5, 1);
+ }
+ emit_mul(func, 1, 1);
+ emit_add(func, 0, 1);
+
+ /* xmm6 = src.z */
+ /* xmm0 = xmm0 + src.z * src.z */
+ FETCH(func, *inst, 1, 0, CHAN_Z);
+ if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
+ emit_MOV(func, 6, 1);
+ }
+ emit_mul(func, 1, 1);
+ emit_add(func, 0, 1);
+
+ if (dims == 4) {
+ /* xmm7 = src.w */
+ /* xmm0 = xmm0 + src.w * src.w */
+ FETCH(func, *inst, 1, 0, CHAN_W);
+ if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W)) {
+ emit_MOV(func, 7, 1);
+ }
+ emit_mul(func, 1, 1);
+ emit_add(func, 0, 1);
+ }
+
+ /* xmm1 = 1 / sqrt(xmm0) */
+ emit_rsqrt(func, 1, 0);
+
+ /* dst.x = xmm1 * src.x */
+ if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
+ emit_mul(func, 4, 1);
+ STORE(func, *inst, 4, 0, CHAN_X);
+ }
+
+ /* dst.y = xmm1 * src.y */
+ if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
+ emit_mul(func, 5, 1);
+ STORE(func, *inst, 5, 0, CHAN_Y);
+ }
+
+ /* dst.z = xmm1 * src.z */
+ if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
+ emit_mul(func, 6, 1);
+ STORE(func, *inst, 6, 0, CHAN_Z);
+ }
+
+ /* dst.w = xmm1 * src.w
+ * The guard must test CHAN_W: xmm7 is only loaded with src.w above
+ * when the W channel is enabled, and the store below writes CHAN_W.
+ */
+ if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 4) {
+ emit_mul(func, 7, 1);
+ STORE(func, *inst, 7, 0, CHAN_W);
+ }
+ }
+
+ /* dst0.w = 1.0 */
+ if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 3) {
+ emit_tempf(func, 0, TEMP_ONE_I, TEMP_ONE_C);
+ STORE(func, *inst, 0, 0, CHAN_W);
+ }
+ }
break;
case TGSI_OPCODE_DIV:
break;
case TGSI_OPCODE_DP2:
- return 0;
+ FETCH( func, *inst, 0, 0, CHAN_X ); /* xmm0 = src[0].x */
+ FETCH( func, *inst, 1, 1, CHAN_X ); /* xmm1 = src[1].x */
+ emit_mul( func, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
+ FETCH( func, *inst, 1, 0, CHAN_Y ); /* xmm1 = src[0].y */
+ FETCH( func, *inst, 2, 1, CHAN_Y ); /* xmm2 = src[1].y */
+ emit_mul( func, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
+ emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
+ FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+ STORE( func, *inst, 0, 0, chan_index ); /* dest[ch] = xmm0 */
+ }
break;
case TGSI_OPCODE_TXL:
- return 0;
+ emit_tex( func, inst, TRUE, FALSE );
break;
+ case TGSI_OPCODE_TXP:
+ emit_tex( func, inst, FALSE, TRUE );
+ break;
+
case TGSI_OPCODE_BRK:
return 0;
break;
static void aos_to_soa( struct x86_function *func,
uint arg_aos,
- uint arg_soa,
+ uint arg_machine,
uint arg_num,
uint arg_stride )
{
x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
x86_mov( func, aos_input, x86_fn_arg( func, arg_aos ) );
- x86_mov( func, soa_input, x86_fn_arg( func, arg_soa ) );
+ x86_mov( func, soa_input, x86_fn_arg( func, arg_machine ) );
+ x86_lea( func, soa_input,
+ x86_make_disp( soa_input,
+ Offset(struct tgsi_exec_machine, Inputs) ) );
x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) );
x86_mov( func, stride, x86_fn_arg( func, arg_stride ) );
x86_jcc( func, cc_NE, inner_loop );
/* Restore EBX */
- x86_pop( func, aos_input );
+ x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
}
-static void soa_to_aos( struct x86_function *func, uint aos, uint soa, uint num, uint stride )
+static void soa_to_aos( struct x86_function *func, /* the arg_* parameters below are argument indices handed to x86_fn_arg(), not the values themselves */
+ uint arg_aos,
+ uint arg_machine, /* index of the function argument holding the tgsi_exec_machine pointer; replaces the former direct SoA-array argument */
+ uint arg_num,
+ uint arg_stride )
{
- struct x86_reg soa_output;
- struct x86_reg aos_output;
- struct x86_reg num_outputs;
- struct x86_reg temp;
+ struct x86_reg soa_output = x86_make_reg( file_REG32, reg_AX ); /* register bindings now happen at declaration; the mapping itself is unchanged */
+ struct x86_reg aos_output = x86_make_reg( file_REG32, reg_BX ); /* EBX is pushed below before it is overwritten */
+ struct x86_reg num_outputs = x86_make_reg( file_REG32, reg_CX );
+ struct x86_reg temp = x86_make_reg( file_REG32, reg_DX );
int inner_loop;
- soa_output = x86_make_reg( file_REG32, reg_AX );
- aos_output = x86_make_reg( file_REG32, reg_BX );
- num_outputs = x86_make_reg( file_REG32, reg_CX );
- temp = x86_make_reg( file_REG32, reg_DX );
-
/* Save EBX */
- x86_push( func, aos_output );
+ x86_push( func, x86_make_reg( file_REG32, reg_BX ) ); /* same register as aos_output, but named explicitly so the save does not depend on the alias */
- x86_mov( func, soa_output, x86_fn_arg( func, soa ) );
- x86_mov( func, aos_output, x86_fn_arg( func, aos ) );
- x86_mov( func, num_outputs, x86_fn_arg( func, num ) );
+ x86_mov( func, aos_output, x86_fn_arg( func, arg_aos ) ); /* aos_output = destination pointer read from argument arg_aos */
+ x86_mov( func, soa_output, x86_fn_arg( func, arg_machine ) ); /* soa_output = machine pointer read from argument arg_machine */
+ x86_lea( func, soa_output, /* rebase soa_output in place ... */
+ x86_make_disp( soa_output,
+ Offset(struct tgsi_exec_machine, Outputs) ) ); /* ... so it holds the address of the machine's Outputs member */
+ x86_mov( func, num_outputs, x86_fn_arg( func, arg_num ) ); /* num_outputs = value of argument arg_num */
/* do */
inner_loop = x86_get_label( func );
sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) );
sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) );
- x86_mov( func, temp, x86_fn_arg( func, stride ) );
+ x86_mov( func, temp, x86_fn_arg( func, arg_stride ) ); /* temp (EDX) = stride argument; emitted after the inner_loop label, so it is reloaded on each pass */
x86_push( func, aos_output );
sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
x86_jcc( func, cc_NE, inner_loop );
/* Restore EBX */
- x86_pop( func, aos_output );
+ x86_pop( func, x86_make_reg( file_REG32, reg_BX ) ); /* pairs with the explicit EBX push in the "Save EBX" step */
}
/**
* Translate a TGSI vertex/fragment shader to SSE2 code.
* Slightly different things are done for vertex vs. fragment shaders.
*
- * Note that fragment shaders are responsible for interpolating shader
- * inputs. Because on x86 we have only 4 GP registers, and here we
- * have 5 shader arguments (input, output, const, temp and coef), the
- * code is split into two phases -- DECLARATION and INSTRUCTION phase.
- * GP register holding the output argument is aliased with the coeff
- * argument, as outputs are not needed in the DECLARATION phase.
- *
* \param tokens the TGSI input shader
* \param func the output SSE code/function
* \param immediates buffer to place immediates, later passed to SSE func
boolean do_swizzles )
{
struct tgsi_parse_context parse;
- boolean instruction_phase = FALSE;
unsigned ok = 1;
uint num_immediates = 0;
/* Can't just use EDI, EBX without save/restoring them:
*/
- x86_push(
- func,
- get_immediate_base() );
-
- x86_push(
- func,
- get_temp_base() );
-
+ x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
+ x86_push( func, x86_make_reg( file_REG32, reg_DI ) );
/*
* Different function args for vertex/fragment shaders:
*/
- if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
- /* DECLARATION phase, do not load output argument. */
- x86_mov(
- func,
- get_input_base(),
- x86_fn_arg( func, 1 ) );
- /* skipping outputs argument here */
- x86_mov(
- func,
- get_const_base(),
- x86_fn_arg( func, 3 ) );
- x86_mov(
- func,
- get_temp_base(),
- x86_fn_arg( func, 4 ) );
- x86_mov(
- func,
- get_coef_base(),
- x86_fn_arg( func, 5 ) );
- x86_mov(
- func,
- get_immediate_base(),
- x86_fn_arg( func, 6 ) );
- }
- else {
- assert(parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX);
-
+ if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
if (do_swizzles)
aos_to_soa( func,
- 6, /* aos_input */
- 1, /* machine->input */
- 7, /* num_inputs */
- 8 ); /* input_stride */
+ 4, /* aos_input */
+ 1, /* machine */
+ 5, /* num_inputs */
+ 6 ); /* input_stride */
+ }
+ x86_mov(
+ func,
+ get_machine_base(),
+ x86_fn_arg( func, 1 ) );
+ x86_mov(
+ func,
+ get_const_base(),
+ x86_fn_arg( func, 2 ) );
+ x86_mov(
+ func,
+ get_immediate_base(),
+ x86_fn_arg( func, 3 ) );
+
+ if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
x86_mov(
- func,
- get_input_base(),
- x86_fn_arg( func, 1 ) );
- x86_mov(
- func,
- get_output_base(),
- x86_fn_arg( func, 2 ) );
- x86_mov(
- func,
- get_const_base(),
- x86_fn_arg( func, 3 ) );
- x86_mov(
- func,
- get_temp_base(),
- x86_fn_arg( func, 4 ) );
+ func,
+ get_coef_base(),
+ x86_fn_arg( func, 4 ) );
+
x86_mov(
- func,
- get_immediate_base(),
- x86_fn_arg( func, 5 ) );
+ func,
+ get_sampler_base(),
+ x86_make_disp( get_machine_base(),
+ Offset( struct tgsi_exec_machine, Samplers ) ) );
}
+
while( !tgsi_parse_end_of_tokens( &parse ) && ok ) {
tgsi_parse_token( &parse );
break;
case TGSI_TOKEN_TYPE_INSTRUCTION:
- if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
- if( !instruction_phase ) {
- /* INSTRUCTION phase, overwrite coeff with output. */
- instruction_phase = TRUE;
- x86_mov(
- func,
- get_output_base(),
- x86_fn_arg( func, 2 ) );
- }
- }
-
ok = emit_instruction(
func,
&parse.FullToken.FullInstruction );
case TGSI_TOKEN_TYPE_IMMEDIATE:
/* simply copy the immediate values into the next immediates[] slot */
{
- const uint size = parse.FullToken.FullImmediate.Immediate.Size - 1;
+ const uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
uint i;
assert(size <= 4);
assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES);
if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
if (do_swizzles)
- soa_to_aos( func, 9, 2, 10, 11 );
+ soa_to_aos( func,
+ 7, /* aos_output */
+ 1, /* machine */
+ 8, /* num_outputs */
+ 9 ); /* output_stride */
}
/* Can't just use EBX, EDI without save/restoring them:
*/
- x86_pop(
- func,
- get_temp_base() );
-
- x86_pop(
- func,
- get_immediate_base() );
+ x86_pop( func, x86_make_reg( file_REG32, reg_DI ) );
+ x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
emit_ret( func );