union tgsi_exec_channel *dst,
const union tgsi_exec_channel *src )
{
- dst->f[0] = (float) fabs( (double) src->f[0] );
- dst->f[1] = (float) fabs( (double) src->f[1] );
- dst->f[2] = (float) fabs( (double) src->f[2] );
- dst->f[3] = (float) fabs( (double) src->f[3] );
+ dst->f[0] = fabsf( src->f[0] );
+ dst->f[1] = fabsf( src->f[1] );
+ dst->f[2] = fabsf( src->f[2] );
+ dst->f[3] = fabsf( src->f[3] );
}
static void
union tgsi_exec_channel *dst,
const union tgsi_exec_channel *src )
{
- dst->f[0] = (float) ceil( (double) src->f[0] );
- dst->f[1] = (float) ceil( (double) src->f[1] );
- dst->f[2] = (float) ceil( (double) src->f[2] );
- dst->f[3] = (float) ceil( (double) src->f[3] );
+ dst->f[0] = ceilf( src->f[0] );
+ dst->f[1] = ceilf( src->f[1] );
+ dst->f[2] = ceilf( src->f[2] );
+ dst->f[3] = ceilf( src->f[3] );
}
static void
union tgsi_exec_channel *dst,
const union tgsi_exec_channel *src )
{
- dst->f[0] = (float) cos( (double) src->f[0] );
- dst->f[1] = (float) cos( (double) src->f[1] );
- dst->f[2] = (float) cos( (double) src->f[2] );
- dst->f[3] = (float) cos( (double) src->f[3] );
+ dst->f[0] = cosf( src->f[0] );
+ dst->f[1] = cosf( src->f[1] );
+ dst->f[2] = cosf( src->f[2] );
+ dst->f[3] = cosf( src->f[3] );
}
static void
union tgsi_exec_channel *dst,
const union tgsi_exec_channel *src)
{
- dst->f[0] = (float) pow( 2.0, (double) src->f[0] );
- dst->f[1] = (float) pow( 2.0, (double) src->f[1] );
- dst->f[2] = (float) pow( 2.0, (double) src->f[2] );
- dst->f[3] = (float) pow( 2.0, (double) src->f[3] );
+ dst->f[0] = powf( 2.0f, src->f[0] );
+ dst->f[1] = powf( 2.0f, src->f[1] );
+ dst->f[2] = powf( 2.0f, src->f[2] );
+ dst->f[3] = powf( 2.0f, src->f[3] );
}
static void
union tgsi_exec_channel *dst,
const union tgsi_exec_channel *src )
{
- dst->f[0] = (float) floor( (double) src->f[0] );
- dst->f[1] = (float) floor( (double) src->f[1] );
- dst->f[2] = (float) floor( (double) src->f[2] );
- dst->f[3] = (float) floor( (double) src->f[3] );
+ dst->f[0] = floorf( src->f[0] );
+ dst->f[1] = floorf( src->f[1] );
+ dst->f[2] = floorf( src->f[2] );
+ dst->f[3] = floorf( src->f[3] );
}
static void
union tgsi_exec_channel *dst,
const union tgsi_exec_channel *src )
{
- dst->f[0] = src->f[0] - (float) floor( (double) src->f[0] );
- dst->f[1] = src->f[1] - (float) floor( (double) src->f[1] );
- dst->f[2] = src->f[2] - (float) floor( (double) src->f[2] );
- dst->f[3] = src->f[3] - (float) floor( (double) src->f[3] );
+ dst->f[0] = src->f[0] - floorf( src->f[0] );
+ dst->f[1] = src->f[1] - floorf( src->f[1] );
+ dst->f[2] = src->f[2] - floorf( src->f[2] );
+ dst->f[3] = src->f[3] - floorf( src->f[3] );
}
static void
union tgsi_exec_channel *dst,
const union tgsi_exec_channel *src )
{
- dst->f[0] = (float) log( (double) src->f[0] ) * 1.442695f;
- dst->f[1] = (float) log( (double) src->f[1] ) * 1.442695f;
- dst->f[2] = (float) log( (double) src->f[2] ) * 1.442695f;
- dst->f[3] = (float) log( (double) src->f[3] ) * 1.442695f;
+ dst->f[0] = logf( src->f[0] ) * 1.442695f;
+ dst->f[1] = logf( src->f[1] ) * 1.442695f;
+ dst->f[2] = logf( src->f[2] ) * 1.442695f;
+ dst->f[3] = logf( src->f[3] ) * 1.442695f;
}
static void
const union tgsi_exec_channel *src0,
const union tgsi_exec_channel *src1 )
{
- dst->f[0] = (float) pow( (double) src0->f[0], (double) src1->f[0] );
- dst->f[1] = (float) pow( (double) src0->f[1], (double) src1->f[1] );
- dst->f[2] = (float) pow( (double) src0->f[2], (double) src1->f[2] );
- dst->f[3] = (float) pow( (double) src0->f[3], (double) src1->f[3] );
+ dst->f[0] = powf( src0->f[0], src1->f[0] );
+ dst->f[1] = powf( src0->f[1], src1->f[1] );
+ dst->f[2] = powf( src0->f[2], src1->f[2] );
+ dst->f[3] = powf( src0->f[3], src1->f[3] );
}
static void
union tgsi_exec_channel *dst,
const union tgsi_exec_channel *src )
{
- dst->f[0] = (float) floor( (double) (src->f[0] + 0.5f) );
- dst->f[1] = (float) floor( (double) (src->f[1] + 0.5f) );
- dst->f[2] = (float) floor( (double) (src->f[2] + 0.5f) );
- dst->f[3] = (float) floor( (double) (src->f[3] + 0.5f) );
+ dst->f[0] = floorf( src->f[0] + 0.5f );
+ dst->f[1] = floorf( src->f[1] + 0.5f );
+ dst->f[2] = floorf( src->f[2] + 0.5f );
+ dst->f[3] = floorf( src->f[3] + 0.5f );
}
static void
union tgsi_exec_channel *dst,
const union tgsi_exec_channel *src )
{
- dst->f[0] = (float) sin( (double) src->f[0] );
- dst->f[1] = (float) sin( (double) src->f[1] );
- dst->f[2] = (float) sin( (double) src->f[2] );
- dst->f[3] = (float) sin( (double) src->f[3] );
+ dst->f[0] = sinf( src->f[0] );
+ dst->f[1] = sinf( src->f[1] );
+ dst->f[2] = sinf( src->f[2] );
+ dst->f[3] = sinf( src->f[3] );
}
static void
micro_sqrt( union tgsi_exec_channel *dst,
const union tgsi_exec_channel *src )
{
- dst->f[0] = (float) sqrt( (double) src->f[0] );
- dst->f[1] = (float) sqrt( (double) src->f[1] );
- dst->f[2] = (float) sqrt( (double) src->f[2] );
- dst->f[3] = (float) sqrt( (double) src->f[3] );
+ dst->f[0] = sqrtf( src->f[0] );
+ dst->f[1] = sqrtf( src->f[1] );
+ dst->f[2] = sqrtf( src->f[2] );
+ dst->f[3] = sqrtf( src->f[3] );
}
static void
#define HIGH_PRECISION 1 /* for 1/sqrt() */
-#define DUMP_SSE 0
-
-#if DUMP_SSE
-
-static void
-_print_reg(
- struct x86_reg reg )
-{
- if (reg.mod != mod_REG)
- debug_printf( "[" );
-
- switch( reg.file ) {
- case file_REG32:
- switch( reg.idx ) {
- case reg_AX:
- debug_printf( "EAX" );
- break;
- case reg_CX:
- debug_printf( "ECX" );
- break;
- case reg_DX:
- debug_printf( "EDX" );
- break;
- case reg_BX:
- debug_printf( "EBX" );
- break;
- case reg_SP:
- debug_printf( "ESP" );
- break;
- case reg_BP:
- debug_printf( "EBP" );
- break;
- case reg_SI:
- debug_printf( "ESI" );
- break;
- case reg_DI:
- debug_printf( "EDI" );
- break;
- }
- break;
- case file_MMX:
- assert( 0 );
- break;
- case file_XMM:
- debug_printf( "XMM%u", reg.idx );
- break;
- case file_x87:
- assert( 0 );
- break;
- }
-
- if (reg.mod == mod_DISP8 ||
- reg.mod == mod_DISP32)
- debug_printf("+%d", reg.disp);
-
- if (reg.mod != mod_REG)
- debug_printf( "]" );
-}
-
-static void
-_fill(
- const char *op )
-{
- unsigned count = 10 - strlen( op );
-
- while( count-- ) {
- debug_printf( " " );
- }
-}
-
-#define DUMP_START() debug_printf( "\nsse-dump start ----------------" )
-#define DUMP_END() debug_printf( "\nsse-dump end ----------------\n" )
-#define DUMP( OP ) debug_printf( "\n%s", OP )
-#define DUMP_I( OP, I ) do {\
- debug_printf( "\n%s", OP );\
- _fill( OP );\
- debug_printf( "%u", I ); } while( 0 )
-#define DUMP_R( OP, R0 ) do {\
- debug_printf( "\n%s", OP );\
- _fill( OP );\
- _print_reg( R0 ); } while( 0 )
-#define DUMP_RR( OP, R0, R1 ) do {\
- debug_printf( "\n%s", OP );\
- _fill( OP );\
- _print_reg( R0 );\
- debug_printf( ", " );\
- _print_reg( R1 ); } while( 0 )
-#define DUMP_RRI( OP, R0, R1, I ) do {\
- debug_printf( "\n%s", OP );\
- _fill( OP );\
- _print_reg( R0 );\
- debug_printf( ", " );\
- _print_reg( R1 );\
- debug_printf( ", " );\
- debug_printf( "%u", I ); } while( 0 )
-
-#else
-
-#define DUMP_START()
-#define DUMP_END()
-#define DUMP( OP )
-#define DUMP_I( OP, I )
-#define DUMP_R( OP, R0 )
-#define DUMP_RR( OP, R0, R1 )
-#define DUMP_RRI( OP, R0, R1, I )
-
-#endif
#define FOR_EACH_CHANNEL( CHAN )\
for( CHAN = 0; CHAN < 4; CHAN++ )
((vec * 3 + member) * 4 + chan) * 4 );
}
-/**
- * X86 rtasm wrappers.
- */
-
-static void
-emit_addps(
- struct x86_function *func,
- struct x86_reg dst,
- struct x86_reg src )
-{
- DUMP_RR( "ADDPS", dst, src );
- sse_addps( func, dst, src );
-}
-
-static void
-emit_andnps(
- struct x86_function *func,
- struct x86_reg dst,
- struct x86_reg src )
-{
- DUMP_RR( "ANDNPS", dst, src );
- sse_andnps( func, dst, src );
-}
-
-static void
-emit_andps(
- struct x86_function *func,
- struct x86_reg dst,
- struct x86_reg src )
-{
- DUMP_RR( "ANDPS", dst, src );
- sse_andps( func, dst, src );
-}
-
-static void
-emit_call(
- struct x86_function *func,
- void (* addr)() )
-{
- struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
-
- DUMP_I( "CALL", addr );
- x86_mov_reg_imm( func, ecx, (unsigned long) addr );
- x86_call( func, ecx );
-}
-
-static void
-emit_cmpps(
- struct x86_function *func,
- struct x86_reg dst,
- struct x86_reg src,
- enum sse_cc cc )
-{
- DUMP_RRI( "CMPPS", dst, src, cc );
- sse_cmpps( func, dst, src, cc );
-}
-
-static void
-emit_cvttps2dq(
- struct x86_function *func,
- struct x86_reg dst,
- struct x86_reg src )
-{
- DUMP_RR( "CVTTPS2DQ", dst, src );
- sse2_cvttps2dq( func, dst, src );
-}
-
-static void
-emit_maxps(
- struct x86_function *func,
- struct x86_reg dst,
- struct x86_reg src )
-{
- DUMP_RR( "MAXPS", dst, src );
- sse_maxps( func, dst, src );
-}
-
-static void
-emit_minps(
- struct x86_function *func,
- struct x86_reg dst,
- struct x86_reg src )
-{
- DUMP_RR( "MINPS", dst, src );
- sse_minps( func, dst, src );
-}
-
-static void
-emit_mov(
- struct x86_function *func,
- struct x86_reg dst,
- struct x86_reg src )
-{
- DUMP_RR( "MOV", dst, src );
- x86_mov( func, dst, src );
-}
-
-static void
-emit_movaps(
- struct x86_function *func,
- struct x86_reg dst,
- struct x86_reg src )
-{
- DUMP_RR( "MOVAPS", dst, src );
- sse_movaps( func, dst, src );
-}
-
-static void
-emit_movss(
- struct x86_function *func,
- struct x86_reg dst,
- struct x86_reg src )
-{
- DUMP_RR( "MOVSS", dst, src );
- sse_movss( func, dst, src );
-}
-
-static void
-emit_movups(
- struct x86_function *func,
- struct x86_reg dst,
- struct x86_reg src )
-{
- DUMP_RR( "MOVUPS", dst, src );
- sse_movups( func, dst, src );
-}
-
-static void
-emit_mulps(
- struct x86_function *func,
- struct x86_reg dst,
- struct x86_reg src )
-{
- DUMP_RR( "MULPS", dst, src );
- sse_mulps( func, dst, src );
-}
-
-static void
-emit_or(
- struct x86_function *func,
- struct x86_reg dst,
- struct x86_reg src )
-{
- DUMP_RR( "OR", dst, src );
- x86_or( func, dst, src );
-}
-
-static void
-emit_orps(
- struct x86_function *func,
- struct x86_reg dst,
- struct x86_reg src )
-{
- DUMP_RR( "ORPS", dst, src );
- sse_orps( func, dst, src );
-}
-
-static void
-emit_pmovmskb(
- struct x86_function *func,
- struct x86_reg dst,
- struct x86_reg src )
-{
- DUMP_RR( "PMOVMSKB", dst, src );
- sse_pmovmskb( func, dst, src );
-}
-
-static void
-emit_pop(
- struct x86_function *func,
- struct x86_reg dst )
-{
- DUMP_R( "POP", dst );
- x86_pop( func, dst );
-}
-
-static void
-emit_push(
- struct x86_function *func,
- struct x86_reg dst )
-{
- DUMP_R( "PUSH", dst );
- x86_push( func, dst );
-}
-
-static void
-emit_rcpps(
- struct x86_function *func,
- struct x86_reg dst,
- struct x86_reg src )
-{
- DUMP_RR( "RCPPS", dst, src );
- sse2_rcpps( func, dst, src );
-}
#ifdef WIN32
static void
struct x86_function *func,
unsigned size )
{
- DUMP_I( "RET", size );
x86_retw( func, size );
}
#else
emit_ret(
struct x86_function *func )
{
- DUMP( "RET" );
x86_ret( func );
}
#endif
-static void
-emit_rsqrtps(
- struct x86_function *func,
- struct x86_reg dst,
- struct x86_reg src )
-{
- DUMP_RR( "RSQRTPS", dst, src );
- sse_rsqrtps( func, dst, src );
-}
-
-static void
-emit_shufps(
- struct x86_function *func,
- struct x86_reg dst,
- struct x86_reg src,
- unsigned char shuf )
-{
- DUMP_RRI( "SHUFPS", dst, src, shuf );
- sse_shufps( func, dst, src, shuf );
-}
-
-static void
-emit_subps(
- struct x86_function *func,
- struct x86_reg dst,
- struct x86_reg src )
-{
- DUMP_RR( "SUBPS", dst, src );
- sse_subps( func, dst, src );
-}
-
-static void
-emit_xorps(
- struct x86_function *func,
- struct x86_reg dst,
- struct x86_reg src )
-{
- DUMP_RR( "XORPS", dst, src );
- sse_xorps( func, dst, src );
-}
/**
* Data fetch helpers.
unsigned vec,
unsigned chan )
{
- emit_movss(
+ sse_movss(
func,
make_xmm( xmm ),
get_const( vec, chan ) );
- emit_shufps(
+ sse_shufps(
func,
make_xmm( xmm ),
make_xmm( xmm ),
unsigned vec,
unsigned chan )
{
- emit_movss(
+ sse_movss(
func,
make_xmm( xmm ),
get_immediate( vec, chan ) );
- emit_shufps(
+ sse_shufps(
func,
make_xmm( xmm ),
make_xmm( xmm ),
unsigned vec,
unsigned chan )
{
- emit_movups(
+ sse_movups(
func,
make_xmm( xmm ),
get_input( vec, chan ) );
unsigned vec,
unsigned chan )
{
- emit_movups(
+ sse_movups(
func,
get_output( vec, chan ),
make_xmm( xmm ) );
unsigned vec,
unsigned chan )
{
- emit_movaps(
+ sse_movaps(
func,
make_xmm( xmm ),
get_temp( vec, chan ) );
unsigned chan,
unsigned member )
{
- emit_movss(
+ sse_movss(
func,
make_xmm( xmm ),
get_coef( vec, chan, member ) );
- emit_shufps(
+ sse_shufps(
func,
make_xmm( xmm ),
make_xmm( xmm ),
unsigned vec,
unsigned chan )
{
- emit_movups(
+ sse_movups(
func,
get_input( vec, chan ),
make_xmm( xmm ) );
unsigned vec,
unsigned chan )
{
- emit_movaps(
+ sse_movaps(
func,
get_temp( vec, chan ),
make_xmm( xmm ) );
emit_push_gp(
struct x86_function *func )
{
- emit_push(
+ x86_push(
func,
get_const_base() );
- emit_push(
+ x86_push(
func,
get_input_base() );
- emit_push(
+ x86_push(
func,
get_output_base() );
/* It is important on non-win32 platforms that temp base is pushed last.
*/
- emit_push(
+ x86_push(
func,
get_temp_base() );
}
static void
-emit_pop_gp(
+x86_pop_gp(
struct x86_function *func )
{
/* Restore GP registers in a reverse order.
*/
- emit_pop(
+ x86_pop(
func,
get_temp_base() );
- emit_pop(
+ x86_pop(
func,
get_output_base() );
- emit_pop(
+ x86_pop(
func,
get_input_base() );
- emit_pop(
+ x86_pop(
func,
get_const_base() );
}
unsigned xmm_dst,
void (*code)() )
{
- emit_movaps(
+ sse_movaps(
func,
get_temp( TEMP_R0, 0 ),
make_xmm( xmm_dst ) );
func );
#ifdef WIN32
- emit_push(
+ x86_push(
func,
get_temp( TEMP_R0, 0 ) );
#endif
- emit_call(
- func,
- code );
+ {
+ struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
+
+ x86_mov_reg_imm( func, ecx, (unsigned long) code );
+ x86_call( func, ecx );
+ }
- emit_pop_gp(
+ x86_pop_gp(
func );
- emit_movaps(
+ sse_movaps(
func,
make_xmm( xmm_dst ),
get_temp( TEMP_R0, 0 ) );
unsigned xmm_src,
void (*code)() )
{
- emit_movaps(
+ sse_movaps(
func,
get_temp( TEMP_R0, 1 ),
make_xmm( xmm_src ) );
struct x86_function *func,
unsigned xmm )
{
- emit_andps(
+ sse_andps(
func,
make_xmm( xmm ),
get_temp(
unsigned xmm_dst,
unsigned xmm_src )
{
- emit_addps(
+ sse_addps(
func,
make_xmm( xmm_dst ),
make_xmm( xmm_src ) );
float *store )
{
#ifdef WIN32
- store[0] = (float) cos( (double) store[0] );
- store[1] = (float) cos( (double) store[1] );
- store[2] = (float) cos( (double) store[2] );
- store[3] = (float) cos( (double) store[3] );
+ const unsigned X = 0;
#else
const unsigned X = TEMP_R0 * 16;
+#endif
+
store[X + 0] = cosf( store[X + 0] );
store[X + 1] = cosf( store[X + 1] );
store[X + 2] = cosf( store[X + 2] );
store[X + 3] = cosf( store[X + 3] );
-#endif
}
static void
float *store )
{
#ifdef WIN32
- store[0] = (float) pow( 2.0, (double) store[0] );
- store[1] = (float) pow( 2.0, (double) store[1] );
- store[2] = (float) pow( 2.0, (double) store[2] );
- store[3] = (float) pow( 2.0, (double) store[3] );
+ const unsigned X = 0;
#else
const unsigned X = TEMP_R0 * 16;
+#endif
store[X + 0] = powf( 2.0f, store[X + 0] );
store[X + 1] = powf( 2.0f, store[X + 1] );
store[X + 2] = powf( 2.0f, store[X + 2] );
store[X + 3] = powf( 2.0f, store[X + 3] );
-#endif
}
static void
struct x86_function *func,
unsigned xmm )
{
- emit_cvttps2dq(
+ sse2_cvttps2dq(
func,
make_xmm( xmm ),
make_xmm( xmm ) );
#else
const unsigned X = TEMP_R0 * 16;
#endif
- store[X + 0] = (float) floor( (double) store[X + 0] );
- store[X + 1] = (float) floor( (double) store[X + 1] );
- store[X + 2] = (float) floor( (double) store[X + 2] );
- store[X + 3] = (float) floor( (double) store[X + 3] );
+ store[X + 0] = floorf( store[X + 0] );
+ store[X + 1] = floorf( store[X + 1] );
+ store[X + 2] = floorf( store[X + 2] );
+ store[X + 3] = floorf( store[X + 3] );
}
static void
#else
const unsigned X = TEMP_R0 * 16;
#endif
- store[X + 0] -= (float) floor( (double) store[X + 0] );
- store[X + 1] -= (float) floor( (double) store[X + 1] );
- store[X + 2] -= (float) floor( (double) store[X + 2] );
- store[X + 3] -= (float) floor( (double) store[X + 3] );
+ store[X + 0] -= floorf( store[X + 0] );
+ store[X + 1] -= floorf( store[X + 1] );
+ store[X + 2] -= floorf( store[X + 2] );
+ store[X + 3] -= floorf( store[X + 3] );
}
static void
unsigned xmm_dst,
unsigned xmm_src )
{
- emit_movups(
+ sse_movups(
func,
make_xmm( xmm_dst ),
make_xmm( xmm_src ) );
unsigned xmm_dst,
unsigned xmm_src)
{
- emit_mulps(
+ sse_mulps(
func,
make_xmm( xmm_dst ),
make_xmm( xmm_src ) );
struct x86_function *func,
unsigned xmm )
{
- emit_xorps(
+ sse_xorps(
func,
make_xmm( xmm ),
get_temp(
float *store )
{
#ifdef WIN32
- store[0] = (float) pow( (double) store[0], (double) store[4] );
- store[1] = (float) pow( (double) store[1], (double) store[5] );
- store[2] = (float) pow( (double) store[2], (double) store[6] );
- store[3] = (float) pow( (double) store[3], (double) store[7] );
+ const unsigned X = 0;
#else
const unsigned X = TEMP_R0 * 16;
+#endif
store[X + 0] = powf( store[X + 0], store[X + 4] );
store[X + 1] = powf( store[X + 1], store[X + 5] );
store[X + 2] = powf( store[X + 2], store[X + 6] );
store[X + 3] = powf( store[X + 3], store[X + 7] );
-#endif
}
static void
unsigned xmm_dst,
unsigned xmm_src )
{
- emit_rcpps(
+ /* On Intel CPUs at least, this is only accurate to 12 bits -- not
+ * good enough. Need to either emit a proper divide or use the
+ * iterative technique described below in emit_rsqrt().
+ */
+ sse2_rcpps(
func,
make_xmm( xmm_dst ),
make_xmm( xmm_src ) );
float *store )
{
#ifdef WIN32
- store[0] = 1.0F / (float) sqrt( (double) store[0] );
- store[1] = 1.0F / (float) sqrt( (double) store[1] );
- store[2] = 1.0F / (float) sqrt( (double) store[2] );
- store[3] = 1.0F / (float) sqrt( (double) store[3] );
+ const unsigned X = 0;
#else
const unsigned X = TEMP_R0 * 16;
- store[X + 0] = 1.0F / sqrt( store[X + 0] );
- store[X + 1] = 1.0F / sqrt( store[X + 1] );
- store[X + 2] = 1.0F / sqrt( store[X + 2] );
- store[X + 3] = 1.0F / sqrt( store[X + 3] );
#endif
+ store[X + 0] = 1.0F / sqrtf( store[X + 0] );
+ store[X + 1] = 1.0F / sqrtf( store[X + 1] );
+ store[X + 2] = 1.0F / sqrtf( store[X + 2] );
+ store[X + 3] = 1.0F / sqrtf( store[X + 3] );
}
#endif
unsigned xmm_src )
{
#if HIGH_PRECISION
+#if 1
emit_func_call_dst_src(
func,
xmm_dst,
xmm_src,
rsqrt4f );
#else
+ /* Although rsqrtps() and rcpps() are low precision on some/all SSE
+ * implementations, it is possible to improve their precision at
+ * fairly low cost, using a Newton-Raphson step, as below:
+ *
+ * 1/a:       x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
+ * 1/sqrt(a): x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
+ *
+ * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
+ */
+ /* This is some code that would do the above for a scalar 'a'. We
+ * obviously are interested in a vector version:
+ *
+ * movss xmm3, a;
+ * movss xmm1, half;
+ * movss xmm2, three;
+ * rsqrtss xmm0, xmm3;
+ * mulss xmm3, xmm0;
+ * mulss xmm1, xmm0;
+ * mulss xmm3, xmm0;
+ * subss xmm2, xmm3;
+ * mulss xmm1, xmm2;
+ * movss x, xmm1;
+ */
+#endif
+#else
+ /* On Intel CPUs at least, this is only accurate to 12 bits -- not
+ * good enough.
+ */
emit_rsqrtps(
func,
make_xmm( xmm_dst ),
struct x86_function *func,
unsigned xmm )
{
- emit_orps(
+ sse_orps(
func,
make_xmm( xmm ),
get_temp(
float *store )
{
#ifdef WIN32
- store[0] = (float) sin( (double) store[0] );
- store[1] = (float) sin( (double) store[1] );
- store[2] = (float) sin( (double) store[2] );
- store[3] = (float) sin( (double) store[3] );
+ const unsigned X = 0;
#else
const unsigned X = TEMP_R0 * 16;
+#endif
store[X + 0] = sinf( store[X + 0] );
store[X + 1] = sinf( store[X + 1] );
store[X + 2] = sinf( store[X + 2] );
store[X + 3] = sinf( store[X + 3] );
-#endif
}
static void
unsigned xmm_dst,
unsigned xmm_src )
{
- emit_subps(
+ sse_subps(
func,
make_xmm( xmm_dst ),
make_xmm( xmm_src ) );
}
}
- emit_push(
+ x86_push(
func,
x86_make_reg( file_REG32, reg_AX ) );
- emit_push(
+ x86_push(
func,
x86_make_reg( file_REG32, reg_DX ) );
FOR_EACH_CHANNEL( chan_index ) {
if( uniquemask & (1 << chan_index) ) {
- emit_cmpps(
+ sse_cmpps(
func,
make_xmm( registers[chan_index] ),
get_temp(
cc_LessThan );
if( chan_index == firstchan ) {
- emit_pmovmskb(
+ sse_pmovmskb(
func,
x86_make_reg( file_REG32, reg_AX ),
make_xmm( registers[chan_index] ) );
}
else {
- emit_pmovmskb(
+ sse_pmovmskb(
func,
x86_make_reg( file_REG32, reg_DX ),
make_xmm( registers[chan_index] ) );
- emit_or(
+ x86_or(
func,
x86_make_reg( file_REG32, reg_AX ),
x86_make_reg( file_REG32, reg_DX ) );
}
}
- emit_or(
+ x86_or(
func,
get_temp(
TGSI_EXEC_TEMP_KILMASK_I,
TGSI_EXEC_TEMP_KILMASK_C ),
x86_make_reg( file_REG32, reg_AX ) );
- emit_pop(
+ x86_pop(
func,
x86_make_reg( file_REG32, reg_DX ) );
- emit_pop(
+ x86_pop(
func,
x86_make_reg( file_REG32, reg_AX ) );
}
FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH( func, *inst, 0, 0, chan_index );
FETCH( func, *inst, 1, 1, chan_index );
- emit_cmpps(
+ sse_cmpps(
func,
make_xmm( 0 ),
make_xmm( 1 ),
cc );
- emit_andps(
+ sse_andps(
func,
make_xmm( 0 ),
get_temp(
FETCH( func, *inst, 0, 0, chan_index );
FETCH( func, *inst, 1, 1, chan_index );
FETCH( func, *inst, 2, 2, chan_index );
- emit_cmpps(
+ sse_cmpps(
func,
make_xmm( 0 ),
get_temp(
TGSI_EXEC_TEMP_00000000_I,
TGSI_EXEC_TEMP_00000000_C ),
cc_LessThan );
- emit_andps(
+ sse_andps(
func,
make_xmm( 1 ),
make_xmm( 0 ) );
- emit_andnps(
+ sse_andnps(
func,
make_xmm( 0 ),
make_xmm( 2 ) );
- emit_orps(
+ sse_orps(
func,
make_xmm( 0 ),
make_xmm( 1 ) );
IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
FETCH( func, *inst, 0, 0, CHAN_X );
- emit_maxps(
+ sse_maxps(
func,
make_xmm( 0 ),
get_temp(
/* XMM[1] = SrcReg[0].yyyy */
FETCH( func, *inst, 1, 0, CHAN_Y );
/* XMM[1] = max(XMM[1], 0) */
- emit_maxps(
+ sse_maxps(
func,
make_xmm( 1 ),
get_temp(
/* XMM[2] = SrcReg[0].wwww */
FETCH( func, *inst, 2, 0, CHAN_W );
/* XMM[2] = min(XMM[2], 128.0) */
- emit_minps(
+ sse_minps(
func,
make_xmm( 2 ),
get_temp(
TGSI_EXEC_TEMP_128_I,
TGSI_EXEC_TEMP_128_C ) );
/* XMM[2] = max(XMM[2], -128.0) */
- emit_maxps(
+ sse_maxps(
func,
make_xmm( 2 ),
get_temp(
TGSI_EXEC_TEMP_MINUS_128_C ) );
emit_pow( func, 1, 2 );
FETCH( func, *inst, 0, 0, CHAN_X );
- emit_xorps(
+ sse_xorps(
func,
make_xmm( 2 ),
make_xmm( 2 ) );
- emit_cmpps(
+ sse_cmpps(
func,
make_xmm( 2 ),
make_xmm( 0 ),
cc_LessThanEqual );
- emit_andps(
+ sse_andps(
func,
make_xmm( 2 ),
make_xmm( 1 ) );
FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH( func, *inst, 0, 0, chan_index );
FETCH( func, *inst, 1, 1, chan_index );
- emit_minps(
+ sse_minps(
func,
make_xmm( 0 ),
make_xmm( 1 ) );
FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH( func, *inst, 0, 0, chan_index );
FETCH( func, *inst, 1, 1, chan_index );
- emit_maxps(
+ sse_maxps(
func,
make_xmm( 0 ),
make_xmm( 1 ) );
unsigned ok = 1;
uint num_immediates = 0;
- DUMP_START();
-
func->csr = func->store;
tgsi_parse_init( &parse, tokens );
*/
if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
/* DECLARATION phase, do not load output argument. */
- emit_mov(
+ x86_mov(
func,
get_input_base(),
get_argument( 0 ) );
/* skipping outputs argument here */
- emit_mov(
+ x86_mov(
func,
get_const_base(),
get_argument( 2 ) );
- emit_mov(
+ x86_mov(
func,
get_temp_base(),
get_argument( 3 ) );
- emit_mov(
+ x86_mov(
func,
get_coef_base(),
get_argument( 4 ) );
- emit_mov(
+ x86_mov(
func,
get_immediate_base(),
get_argument( 5 ) );
else {
assert(parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX);
- emit_mov(
+ x86_mov(
func,
get_input_base(),
get_argument( 0 ) );
- emit_mov(
+ x86_mov(
func,
get_output_base(),
get_argument( 1 ) );
- emit_mov(
+ x86_mov(
func,
get_const_base(),
get_argument( 2 ) );
- emit_mov(
+ x86_mov(
func,
get_temp_base(),
get_argument( 3 ) );
- emit_mov(
+ x86_mov(
func,
get_immediate_base(),
get_argument( 4 ) );
if( !instruction_phase ) {
/* INSTRUCTION phase, overwrite coeff with output. */
instruction_phase = TRUE;
- emit_mov(
+ x86_mov(
func,
get_output_base(),
get_argument( 1 ) );
&parse.FullToken.FullInstruction );
if (!ok) {
- debug_printf("failed to translate tgsi opcode %d to SSE\n",
- parse.FullToken.FullInstruction.Instruction.Opcode );
+ debug_printf("failed to translate tgsi opcode %d to SSE (%s)\n",
+ parse.FullToken.FullInstruction.Instruction.Opcode,
+ parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX ?
+ "vertex shader" : "fragment shader");
}
break;
tgsi_parse_free( &parse );
- DUMP_END();
-
return ok;
}