#include "pipe/p_config.h"
-#if defined(PIPE_ARCH_X86) && defined(PIPE_ARCH_SSE)
+#if defined(PIPE_ARCH_X86)
#include "pipe/p_debug.h"
#include "pipe/p_shader_tokens.h"
#include "util/u_math.h"
+#if defined(PIPE_ARCH_SSE)
#include "util/u_sse.h"
+#endif
#include "tgsi/tgsi_parse.h"
#include "tgsi/tgsi_util.h"
#include "tgsi_exec.h"
void (PIPE_CDECL *code)() )
{
struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
- unsigned i, n, xmm;
+ unsigned i, n;
unsigned xmm_mask;
/* Bitmask of the xmm registers to save */
sse_movups(
func,
x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ),
- make_xmm( xmm ) );
+ make_xmm( i ) );
++n;
}
if(xmm_mask & (1 << i)) {
sse_movups(
func,
- make_xmm( xmm ),
+ make_xmm( i ),
x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ) );
++n;
}
code );
}
+
+#if defined(PIPE_ARCH_SSE)
+
/*
* Fast SSE2 implementation of special math functions.
*/
return _mm_mul_ps(expipart, expfpart);
}
+
/**
* See http://www.devmaster.net/forums/showthread.php?p=43580
*/
return _mm_add_ps(logmant, exp);
}
+
/**
 * Four-wide power helper: feeds y * log2f4(x) into exp2f4(), lane by lane.
 *
 * \param x  bases, one per lane
 * \param y  exponents, one per lane
 * \return   exp2f4(log2f4(x) * y)
 */
static INLINE __m128
powf4(__m128 x, __m128 y)
{
   const __m128 log2_x = log2f4(x);
   const __m128 scaled = _mm_mul_ps(log2_x, y);

   return exp2f4(scaled);
}
+#endif /* PIPE_ARCH_SSE */
+
+
/**
* Low-level instruction translators.
}
/**
 * Run the four floats at store[0..3] through the exp2 helper, in place.
 *
 * With PIPE_ARCH_SSE all four values are handled at once by exp2f4();
 * otherwise util_fast_exp2() is applied to each element separately.
 * (Presumably both are fast 2^x approximations, going by their names.)
 *
 * On GCC builds with SSE the function is tagged force_align_arg_pointer.
 * NOTE(review): presumably so the stack is realigned for SSE code when
 * this is called from generated code -- confirm.
 *
 * \param store  in/out array of at least four floats.
 *               NOTE(review): _mm_load_ps/_mm_store_ps are the aligned
 *               variants, so the SSE path expects a 16-byte aligned
 *               address -- verify against the caller.
 */
static void PIPE_CDECL
-#if defined(PIPE_CC_GCC)
+#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
__attribute__((force_align_arg_pointer))
#endif
ex24f(
float *store )
{
+#if defined(PIPE_ARCH_SSE)
_mm_store_ps(&store[0], exp2f4( _mm_load_ps(&store[0]) )); /* 4 lanes in one go */
+#else /* no SSE intrinsics: one scalar call per component */
+ store[0] = util_fast_exp2( store[0] );
+ store[1] = util_fast_exp2( store[1] );
+ store[2] = util_fast_exp2( store[2] );
+ store[3] = util_fast_exp2( store[3] );
+#endif
}
static void
}
/**
 * Run the four floats at store[0..3] through the log2 helper, in place.
 *
 * With PIPE_ARCH_SSE all four values are handled at once by log2f4();
 * otherwise util_fast_log2() is applied to each element separately.
 * (Presumably both are fast log2(x) approximations, going by their names.)
 *
 * On GCC builds with SSE the function is tagged force_align_arg_pointer.
 * NOTE(review): presumably so the stack is realigned for SSE code when
 * this is called from generated code -- confirm.
 *
 * \param store  in/out array of at least four floats.
 *               NOTE(review): _mm_load_ps/_mm_store_ps are the aligned
 *               variants, so the SSE path expects a 16-byte aligned
 *               address -- verify against the caller.
 */
static void PIPE_CDECL
-#if defined(PIPE_CC_GCC)
+#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
__attribute__((force_align_arg_pointer))
#endif
lg24f(
float *store )
{
+#if defined(PIPE_ARCH_SSE)
_mm_store_ps(&store[0], log2f4( _mm_load_ps(&store[0]) )); /* 4 lanes in one go */
+#else /* no SSE intrinsics: one scalar call per component */
+ store[0] = util_fast_log2( store[0] );
+ store[1] = util_fast_log2( store[1] );
+ store[2] = util_fast_log2( store[2] );
+ store[3] = util_fast_log2( store[3] );
+#endif
}
static void
}
/**
 * Four-wide power: store[i] = pow(store[i], store[4 + i]) for i = 0..3.
 *
 * The bases are read from store[0..3] and the exponents from store[4..7];
 * the results overwrite the bases. With PIPE_ARCH_SSE the work is done by
 * powf4() on all four lanes at once; otherwise util_fast_pow() is called
 * once per component.
 *
 * On GCC builds with SSE the function is tagged force_align_arg_pointer.
 * NOTE(review): presumably so the stack is realigned for SSE code when
 * this is called from generated code -- confirm.
 *
 * \param store  in/out array of at least eight floats (4 bases followed
 *               by 4 exponents).
 *               NOTE(review): _mm_load_ps/_mm_store_ps are the aligned
 *               variants, so the SSE path expects a 16-byte aligned
 *               address -- verify against the caller.
 */
static void PIPE_CDECL
-#if defined(PIPE_CC_GCC)
+#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
__attribute__((force_align_arg_pointer))
#endif
pow4f(
float *store )
{
-#if 1
+#if defined(PIPE_ARCH_SSE)
_mm_store_ps(&store[0], powf4( _mm_load_ps(&store[0]), _mm_load_ps(&store[4]) )); /* bases at [0], exponents at [4] */
#else /* no SSE intrinsics: one scalar call per component */
- store[0] = powf( store[0], store[4] );
- store[1] = powf( store[1], store[5] );
- store[2] = powf( store[2], store[6] );
- store[3] = powf( store[3], store[7] );
+ store[0] = util_fast_pow( store[0], store[4] );
+ store[1] = util_fast_pow( store[1], store[5] );
+ store[2] = util_fast_pow( store[2], store[6] );
+ store[3] = util_fast_pow( store[3], store[7] );
#endif
}
break;
case TGSI_OPCODE_NRM:
- return 0;
+ /* fall-through */
+ case TGSI_OPCODE_NRM4:
+ /* 3 or 4-component normalization:
+  * accumulate the sum of squared source components in xmm0, take its
+  * reciprocal square root into xmm1, then scale each enabled component.
+  */
+ {
+ uint dims = (inst->Instruction.Opcode == TGSI_OPCODE_NRM) ? 3 : 4;
+ /* note: cannot use xmm regs 2/3 here (see emit_rsqrt() above) */
+ FETCH( func, *inst, 4, 0, CHAN_X ); /* xmm4 = src[0].x */
+ FETCH( func, *inst, 5, 0, CHAN_Y ); /* xmm5 = src[0].y */
+ FETCH( func, *inst, 6, 0, CHAN_Z ); /* xmm6 = src[0].z */
+ if (dims == 4) {
+ FETCH( func, *inst, 7, 0, CHAN_W ); /* xmm7 = src[0].w */
+ }
+ emit_MOV( func, 0, 4 ); /* xmm0 = xmm4 */
+ emit_mul( func, 0, 4 ); /* xmm0 *= xmm4 */
+ emit_MOV( func, 1, 5 ); /* xmm1 = xmm5 */
+ emit_mul( func, 1, 5 ); /* xmm1 *= xmm5 */
+ emit_add( func, 0, 1 ); /* xmm0 += xmm1 */
+ emit_MOV( func, 1, 6 ); /* xmm1 = xmm6 */
+ emit_mul( func, 1, 6 ); /* xmm1 *= xmm6 */
+ emit_add( func, 0, 1 ); /* xmm0 += xmm1 */
+ if (dims == 4) {
+ emit_MOV( func, 1, 7 ); /* xmm1 = xmm7 */
+ emit_mul( func, 1, 7 ); /* xmm1 *= xmm7 */
+ /* Add the w*w term held in xmm1; adding xmm0 to itself would
+  * double the xyz sum and drop w*w from the length.
+  */
+ emit_add( func, 0, 1 ); /* xmm0 += xmm1 */
+ }
+ emit_rsqrt( func, 1, 0 ); /* xmm1 = 1/sqrt(xmm0) */
+ /* NOTE(review): for NRM (dims == 3) an enabled W channel fails the
+  * chan_index < dims test below and is left unwritten -- confirm
+  * that this is the intended result for dst.w.
+  */
+ FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+ if (chan_index < dims) {
+ emit_mul( func, 4+chan_index, 1); /* xmm[4+ch] *= xmm1 */
+ STORE( func, *inst, 4+chan_index, 0, chan_index );
+ }
+ }
+ }
break;
case TGSI_OPCODE_DIV: