X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fgallium%2Fauxiliary%2Ftgsi%2Ftgsi_sse2.c;h=a4b86aba98660fe24281811b597487fc63ec0981;hb=27a19be8d1c59c64240198261af348b868b101e4;hp=3df0c5db3fa9c125660c320d09fab82de95512ce;hpb=c417a2c3f37a6a28947db5dc5aa240473d29dd19;p=mesa.git

diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
index 3df0c5db3fa..a4b86aba986 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_sse2.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
@@ -27,12 +27,15 @@
 
 #include "pipe/p_config.h"
 
-#if defined(PIPE_ARCH_X86) && defined(PIPE_ARCH_SSE)
+#if defined(PIPE_ARCH_X86)
 
-#include "pipe/p_debug.h"
+#include "util/u_debug.h"
 #include "pipe/p_shader_tokens.h"
 #include "util/u_math.h"
+#include "util/u_memory.h"
+#if defined(PIPE_ARCH_SSE)
 #include "util/u_sse.h"
+#endif
 #include "tgsi/tgsi_parse.h"
 #include "tgsi/tgsi_util.h"
 #include "tgsi_exec.h"
@@ -98,27 +101,43 @@ get_const_base( void )
 {
    return x86_make_reg(
       file_REG32,
-      reg_CX );
+      reg_AX );
 }
 
 static struct x86_reg
-get_input_base( void )
+get_machine_base( void )
 {
    return x86_make_reg(
       file_REG32,
-      reg_AX );
+      reg_CX );
+}
+
+static struct x86_reg
+get_input_base( void )
+{
+   return x86_make_disp(
+      get_machine_base(),
+      Offset(struct tgsi_exec_machine, Inputs) );
 }
 
 static struct x86_reg
 get_output_base( void )
 {
-   return x86_make_reg(
-      file_REG32,
-      reg_DX );
+   return x86_make_disp(
+      get_machine_base(),
+      Offset(struct tgsi_exec_machine, Outputs) );
 }
 
 static struct x86_reg
 get_temp_base( void )
+{
+   return x86_make_disp(
+      get_machine_base(),
+      Offset(struct tgsi_exec_machine, Temps) );
+}
+
+static struct x86_reg
+get_coef_base( void )
 {
    return x86_make_reg(
       file_REG32,
@@ -126,9 +145,11 @@ get_temp_base( void )
 }
 
 static struct x86_reg
-get_coef_base( void )
+get_sampler_base( void )
 {
-   return get_output_base();
+   return x86_make_reg(
+      file_REG32,
+      reg_DI );
 }
 
 static struct x86_reg
@@ -136,7 +157,7 @@ get_immediate_base( void )
 {
    return x86_make_reg(
       file_REG32,
-      reg_DI );
+      reg_DX );
 }
 
 
@@ -165,6 +186,15 @@ get_const(
       (vec * 4 + chan) * 4 );
 }
 
+static struct x86_reg
+get_sampler_ptr(
+   unsigned unit )
+{
+   /* Operand: sampler base register + 'unit' pointer-sized slots. */
+   struct x86_reg base = get_sampler_base();
+   return x86_make_disp( base, unit * sizeof( struct tgsi_sampler * ) );
+}
+
 static struct x86_reg
 get_input(
    unsigned vec,
@@ -239,12 +269,14 @@ emit_const(
       /* 'vec' is the offset from the address register's value.
        * We're loading CONST[ADDR+vec] into an xmm register.
        */
-      struct x86_reg r0 = get_input_base();
-      struct x86_reg r1 = get_output_base();
+      struct x86_reg r0 = get_immediate_base();
+      struct x86_reg r1 = get_coef_base();
       uint i;
 
       assert( indirectFile == TGSI_FILE_ADDRESS );
       assert( indirectIndex == 0 );
+      assert( r0.mod == mod_REG );
+      assert( r1.mod == mod_REG );
 
       x86_push( func, r0 );
       x86_push( func, r1 );
@@ -518,24 +550,15 @@ emit_coef_dady(
  * that the stack pointer is 16 byte aligned, as expected.
  */
 static void
-emit_func_call_dst(
+emit_func_call(
    struct x86_function *func,
-   unsigned xmm_save,
-   unsigned xmm_dst,
+   unsigned xmm_save_mask,
+   const struct x86_reg *arg,
+   unsigned nr_args,
    void (PIPE_CDECL *code)() )
 {
    struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
-   unsigned i, n, xmm;
-   unsigned xmm_mask;
-   
-   /* Bitmask of the xmm registers to save */
-   xmm_mask = (1 << xmm_save) - 1;
-   xmm_mask &= ~(1 << xmm_dst);
-
-   sse_movaps(
-      func,
-      get_temp( TEMP_R0, 0 ),
-      make_xmm( xmm_dst ) );
+   unsigned i, n;
 
    x86_push(
       func,
@@ -547,8 +570,10 @@ emit_func_call_dst(
       func,
       x86_make_reg( file_REG32, reg_DX) );
    
+   /* Store XMM regs to the stack
+    */
    for(i = 0, n = 0; i < 8; ++i)
-      if(xmm_mask & (1 << i))
+      if(xmm_save_mask & (1 << i))
          ++n;
    
    x86_sub_imm(
@@ -557,29 +582,45 @@ emit_func_call_dst(
       n*16);
 
    for(i = 0, n = 0; i < 8; ++i)
-      if(xmm_mask & (1 << i)) {
+      if(xmm_save_mask & (1 << i)) {
          sse_movups(
             func,
             x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ),
-            make_xmm( xmm ) );
+            make_xmm( i ) );
          ++n;
       }
+
+   for (i = 0; i < nr_args; i++) {
+      /* Load the address of the buffer we use for passing arguments and
+       * receiving results:
+       */
+      x86_lea(
+	 func,
+	 ecx,
+	 arg[i] );
    
-   x86_lea(
-      func,
-      ecx,
-      get_temp( TEMP_R0, 0 ) );
-   
-   x86_push( func, ecx );
+      /* Push the address as a function argument.  Arguments are pushed
+       * in arg[] order; the call itself is emitted after the loop:
+       */
+      x86_push( func, ecx );
+   }
+
    x86_mov_reg_imm( func, ecx, (unsigned long) code );
    x86_call( func, ecx );
-   x86_pop(func, ecx );
-   
+
+   /* Pop the arguments (or just add an immediate to esp)
+    */
+   for (i = 0; i < nr_args; i++) {
+      x86_pop(func, ecx );
+   }
+
+   /* Pop the saved XMM regs:
+    */
    for(i = 0, n = 0; i < 8; ++i)
-      if(xmm_mask & (1 << i)) {
+      if(xmm_save_mask & (1 << i)) {
          sse_movups(
             func,
-            make_xmm( xmm ),
+            make_xmm( i ),
             x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ) );
          ++n;
       }
@@ -600,33 +641,88 @@ emit_func_call_dst(
    x86_pop(
       func,
       x86_make_reg( file_REG32, reg_AX) );
+}
+
+static void
+emit_func_call_dst_src1(
+   struct x86_function *func,
+   unsigned xmm_save, 
+   unsigned xmm_dst,
+   unsigned xmm_src0,
+   void (PIPE_CDECL *code)() )
+{
+   struct x86_reg store = get_temp( TEMP_R0, 0 );
+   unsigned xmm_mask = ((1 << xmm_save) - 1) & ~(1 << xmm_dst);
+   
+   /* Store our input parameters (in xmm regs) to the buffer we use
+    * for passing arguments.  We will pass a pointer to this buffer as
+    * the actual function argument.
+    */
+   sse_movaps(
+      func,
+      store,
+      make_xmm( xmm_src0 ) );
+
+   emit_func_call( func,
+                   xmm_mask,
+                   &store,
+                   1,
+                   code );
 
    sse_movaps(
       func,
       make_xmm( xmm_dst ),
-      get_temp( TEMP_R0, 0 ) );
+      store );
 }
 
+
 static void
-emit_func_call_dst_src(
+emit_func_call_dst_src2(
    struct x86_function *func,
    unsigned xmm_save, 
    unsigned xmm_dst,
-   unsigned xmm_src,
+   unsigned xmm_src0,
+   unsigned xmm_src1,
    void (PIPE_CDECL *code)() )
 {
+   struct x86_reg store = get_temp( TEMP_R0, 0 );
+   unsigned xmm_mask = ((1 << xmm_save) - 1) & ~(1 << xmm_dst);
+
+   /* Store two inputs to parameter buffer.
+    */
    sse_movaps(
       func,
-      get_temp( TEMP_R0, 1 ),
-      make_xmm( xmm_src ) );
+      store,
+      make_xmm( xmm_src0 ) );
 
-   emit_func_call_dst(
+   sse_movaps(
       func,
-      xmm_save,
-      xmm_dst,
-      code );
+      x86_make_disp( store, 4 * sizeof(float) ),
+      make_xmm( xmm_src1 ) );
+
+
+   /* Emit the call
+    */
+   emit_func_call( func,
+                   xmm_mask,
+                   &store,
+                   1,
+                   code );
+
+   /* Retrieve the results:
+    */
+   sse_movaps(
+      func,
+      make_xmm( xmm_dst ),
+      store );
 }
 
+
+
+
+
+#if defined(PIPE_ARCH_SSE)
+
 /*
  * Fast SSE2 implementation of special math functions.
  */
@@ -678,6 +774,7 @@ exp2f4(__m128 x)
    return _mm_mul_ps(expipart, expfpart);
 }
 
+
 /**
  * See http://www.devmaster.net/forums/showthread.php?p=43580
  */
@@ -720,12 +817,16 @@ log2f4(__m128 x)
    return _mm_add_ps(logmant, exp);
 }
 
+
 static INLINE __m128
 powf4(__m128 x, __m128 y)
 {
    return exp2f4(_mm_mul_ps(log2f4(x), y));
 }
 
+#endif /* PIPE_ARCH_SSE */
+
+
 
 /**
  * Low-level instruction translators.
@@ -772,21 +873,29 @@ emit_cos(
    unsigned xmm_save, 
    unsigned xmm_dst )
 {
-   emit_func_call_dst(
+   emit_func_call_dst_src1(
       func,
       xmm_save, 
       xmm_dst,
+      xmm_dst,
       cos4f );
 }
 
 static void PIPE_CDECL
-#if defined(PIPE_CC_GCC)
+#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
 __attribute__((force_align_arg_pointer))
 #endif
 ex24f(
    float *store )
 {
+#if defined(PIPE_ARCH_SSE)
    _mm_store_ps(&store[0], exp2f4( _mm_load_ps(&store[0]) ));
+#else
+   store[0] = util_fast_exp2( store[0] );
+   store[1] = util_fast_exp2( store[1] );
+   store[2] = util_fast_exp2( store[2] );
+   store[3] = util_fast_exp2( store[3] );
+#endif
 }
 
 static void
@@ -795,10 +904,11 @@ emit_ex2(
    unsigned xmm_save, 
    unsigned xmm_dst )
 {
-   emit_func_call_dst(
+   emit_func_call_dst_src1(
       func,
       xmm_save,
       xmm_dst,
+      xmm_dst,
       ex24f );
 }
 
@@ -840,10 +950,11 @@ emit_flr(
    unsigned xmm_save, 
    unsigned xmm_dst )
 {
-   emit_func_call_dst(
+   emit_func_call_dst_src1(
       func,
       xmm_save,
       xmm_dst,
+      xmm_dst,
       flr4f );
 }
 
@@ -863,21 +974,29 @@ emit_frc(
    unsigned xmm_save, 
    unsigned xmm_dst )
 {
-   emit_func_call_dst(
+   emit_func_call_dst_src1(
       func,
       xmm_save,
       xmm_dst,
+      xmm_dst,
       frc4f );
 }
 
 static void PIPE_CDECL
-#if defined(PIPE_CC_GCC)
+#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
 __attribute__((force_align_arg_pointer))
 #endif
 lg24f(
    float *store )
 {
+#if defined(PIPE_ARCH_SSE)
    _mm_store_ps(&store[0], log2f4( _mm_load_ps(&store[0]) ));
+#else
+   store[0] = util_fast_log2( store[0] );
+   store[1] = util_fast_log2( store[1] );
+   store[2] = util_fast_log2( store[2] );
+   store[3] = util_fast_log2( store[3] );
+#endif
 }
 
 static void
@@ -886,10 +1005,11 @@ emit_lg2(
    unsigned xmm_save, 
    unsigned xmm_dst )
 {
-   emit_func_call_dst(
+   emit_func_call_dst_src1(
       func,
       xmm_save,
       xmm_dst,
+      xmm_dst,
       lg24f );
 }
 
@@ -930,19 +1050,19 @@ emit_neg(
 }
 
 static void PIPE_CDECL
-#if defined(PIPE_CC_GCC)
+#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
 __attribute__((force_align_arg_pointer))
 #endif
 pow4f(
    float *store )
 {
-#if 1
+#if defined(PIPE_ARCH_SSE)
    _mm_store_ps(&store[0], powf4( _mm_load_ps(&store[0]), _mm_load_ps(&store[4]) ));
 #else
-   store[0] = powf( store[0], store[4] );
-   store[1] = powf( store[1], store[5] );
-   store[2] = powf( store[2], store[6] );
-   store[3] = powf( store[3], store[7] );
+   store[0] = util_fast_pow( store[0], store[4] );
+   store[1] = util_fast_pow( store[1], store[5] );
+   store[2] = util_fast_pow( store[2], store[6] );
+   store[3] = util_fast_pow( store[3], store[7] );
 #endif
 }
 
@@ -951,13 +1071,15 @@ emit_pow(
    struct x86_function *func,
    unsigned xmm_save, 
    unsigned xmm_dst,
-   unsigned xmm_src )
+   unsigned xmm_src0,
+   unsigned xmm_src1 )
 {
-   emit_func_call_dst_src(
+   emit_func_call_dst_src2(
       func,
       xmm_save,
       xmm_dst,
-      xmm_src,
+      xmm_src0,
+      xmm_src1,
       pow4f );
 }
 
@@ -977,6 +1099,30 @@ emit_rcp (
       make_xmm( xmm_src ) );
 }
 
+static void PIPE_CDECL
+rnd4f(
+   float *store )
+{
+   unsigned chan;  /* index into the 4-float buffer */
+   /* Round every element in place: floorf( x + 0.5f ). */
+   for (chan = 0; chan < 4; chan++)
+      store[chan] = floorf( store[chan] + 0.5f );
+}
+
+static void
+emit_rnd(
+   struct x86_function *func,
+   unsigned xmm_save, 
+   unsigned xmm_dst )
+{
+   /* Emit a call to rnd4f that rounds xmm_dst in place: the same
+    * register is passed as both destination and source, and
+    * xmm_save is forwarded unchanged to select the registers that
+    * emit_func_call_dst_src1 preserves across the call.
+    */
+   emit_func_call_dst_src1( func, xmm_save, xmm_dst, xmm_dst, rnd4f );
+}
+
 static void
 emit_rsqrt(
    struct x86_function *func,
@@ -1036,6 +1182,30 @@ emit_setsign(
          TGSI_EXEC_TEMP_80000000_C ) );
 }
 
+static void PIPE_CDECL
+sgn4f(
+   float *store )
+{
+   unsigned chan;  /* index into the 4-float buffer */
+   /* In-place sign: -1 if negative, +1 if positive, otherwise 0. */
+   for (chan = 0; chan < 4; chan++)
+      store[chan] = store[chan] < 0.0f ? -1.0f : store[chan] > 0.0f ? 1.0f : 0.0f;
+}
+
+static void
+emit_sgn(
+   struct x86_function *func,
+   unsigned xmm_save, 
+   unsigned xmm_dst )
+{
+   /* Emit a call to sgn4f that replaces xmm_dst with its sign: the
+    * same register is passed as both destination and source, and
+    * xmm_save is forwarded unchanged to select the registers that
+    * emit_func_call_dst_src1 preserves across the call.
+    */
+   emit_func_call_dst_src1( func, xmm_save, xmm_dst, xmm_dst, sgn4f );
+}
+
 static void PIPE_CDECL
 sin4f(
    float *store )
@@ -1051,10 +1221,11 @@ emit_sin (struct x86_function *func,
           unsigned xmm_save, 
           unsigned xmm_dst)
 {
-   emit_func_call_dst(
+   emit_func_call_dst_src1(
       func,
       xmm_save,
       xmm_dst,
+      xmm_dst,
       sin4f );
 }
 
@@ -1070,6 +1241,12 @@ emit_sub(
       make_xmm( xmm_src ) );
 }
 
+
+
+
+
+
+
 /**
  * Register fetch.
  */
@@ -1228,20 +1405,164 @@ emit_store(
 #define STORE( FUNC, INST, XMM, INDEX, CHAN )\
    emit_store( FUNC, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN )
 
+/* Texture-fetch helper handed to emit_func_call() by emit_tex(); reads and overwrites store[]. */
+static void PIPE_CDECL
+fetch_texel( struct tgsi_sampler **sampler,
+             float *store )
+{
+#if 0
+   uint j;
+
+   debug_printf("%s sampler: %p (%p) store: %p\n", 
+                __FUNCTION__,
+                sampler, *sampler,
+                store );
+
+   debug_printf("lodbias %f\n", store[12]);
+
+   for (j = 0; j < 4; j++)
+      debug_printf("sample %d texcoord %f %f\n", 
+                   j, 
+                   store[0+j],
+                   store[4+j]);
+#endif
+   /* Inputs are the vectors at store[0], store[4], store[8]; the lodbias argument is forced to 0.0f. */
+   {
+      float rgba[NUM_CHANNELS][QUAD_SIZE];
+      (*sampler)->get_samples(*sampler, 
+                              &store[0], 
+                              &store[4], 
+                              &store[8], 
+                              0.0f, /*store[12],  lodbias */
+                              rgba);
+      /* Overwrite the buffer with 16 result floats (assumes NUM_CHANNELS * QUAD_SIZE == 16 -- TODO confirm). */
+      memcpy( store, rgba, 16 * sizeof(float));
+   }
+
+#if 0
+   for (j = 0; j < 4; j++)
+      debug_printf("sample %d result %f %f %f %f\n", 
+                   j, 
+                   store[0+j],
+                   store[4+j],
+                   store[8+j],
+                   store[12+j]);
+#endif
+}
+
 /**
  * High-level instruction translators.
  */
 
+static void
+emit_tex( struct x86_function *func,
+          const struct tgsi_full_instruction *inst,
+          boolean lodbias,
+          boolean projected)
+{
+   const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index;
+   struct x86_reg args[2];
+   unsigned count;
+   unsigned i;
+   /* Emits a fetch_texel() call; count = channels fetched from src0 for this texture target. */
+   switch (inst->InstructionExtTexture.Texture) {
+   case TGSI_TEXTURE_1D:
+   case TGSI_TEXTURE_SHADOW1D:
+      count = 1;
+      break;
+   case TGSI_TEXTURE_2D:
+   case TGSI_TEXTURE_RECT:
+   case TGSI_TEXTURE_SHADOW2D:
+   case TGSI_TEXTURE_SHADOWRECT:
+      count = 2;
+      break;
+   case TGSI_TEXTURE_3D:
+   case TGSI_TEXTURE_CUBE:
+      count = 3;
+      break;
+   default:
+      assert(0);
+      return;
+   }
+   /* xmm3 = lod bias: src0 channel 3 when requested, otherwise the 00000000 temp constant */
+   if (lodbias) {
+      FETCH( func, *inst, 3, 0, 3 );
+   }
+   else {
+      emit_tempf(
+         func,
+         3,
+         TGSI_EXEC_TEMP_00000000_I,
+         TGSI_EXEC_TEMP_00000000_C );
+
+   }
+
+   /* store lodbias in slot 3 of the argument buffer whether enabled
+    * or not -- NOTE: fetch_texel currently passes 0.0f and ignores it.
+    */
+   sse_movaps( func,
+               get_temp( TEMP_R0, 3 ),
+               make_xmm( 3 ) );
+
+   /* projected: xmm3 = emit_rcp of src0 channel 3; each fetched channel is multiplied by it below */
+   if (projected) {
+      FETCH( func, *inst, 3, 0, 3 );
+
+      emit_rcp( func, 3, 3 );
+   }
+
+   for (i = 0; i < count; i++) {
+      FETCH( func, *inst, i, 0, i );
+
+      if (projected) {
+         sse_mulps(
+            func,
+            make_xmm( i ),
+            make_xmm( 3 ) );
+      }
+      
+      /* Store in the argument buffer:
+       */
+      sse_movaps(
+         func,
+         get_temp( TEMP_R0, i ),
+         make_xmm( i ) );
+   }
+
+   args[0] = get_temp( TEMP_R0, 0 );
+   args[1] = get_sampler_ptr( unit );
+   /* emit_func_call pushes args[] in index order, so args[1] (sampler) is
+    * pushed last, matching the fetch_texel( sampler, store ) signature. */
+   emit_func_call( func,
+                   0,
+                   args,
+                   Elements(args),
+                   fetch_texel );
+
+   /* If all four channels are enabled, could use a pointer to
+    * dst[0].x instead of TEMP_R0 for store?
+    */
+   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, i ) {
+      /* reload TEMP_R0 slot i, which fetch_texel overwrote with its results */
+      sse_movaps(
+         func,
+         make_xmm( 0 ),
+         get_temp( TEMP_R0, i ) );
+
+      STORE( func, *inst, 0, 0, i );
+   }
+}
+
+
 static void
 emit_kil(
    struct x86_function *func,
    const struct tgsi_full_src_register *reg )
 {
    unsigned uniquemask;
-   unsigned registers[4];
-   unsigned nextregister = 0;
-   unsigned firstchan = ~0;
+   unsigned unique_count = 0;
    unsigned chan_index;
+   unsigned i;
 
    /* This mask stores component bits that were already tested. Note that
     * we test if the value is less than zero, so 1.0 and 0.0 need not to be
@@ -1261,18 +1582,11 @@ emit_kil(
          uniquemask |= 1 << swizzle;
 
          /* allocate register */
-         registers[chan_index] = nextregister;
          emit_fetch(
             func,
-            nextregister,
+            unique_count++,
             reg,
             chan_index );
-         nextregister++;
-
-         /* mark the first channel used */
-         if( firstchan == ~0 ) {
-            firstchan = chan_index;
-         }
       }
    }
 
@@ -1283,32 +1597,32 @@ emit_kil(
       func,
       x86_make_reg( file_REG32, reg_DX ) );
 
-   FOR_EACH_CHANNEL( chan_index ) {
-      if( uniquemask & (1 << chan_index) ) {
-         sse_cmpps(
+   for (i = 0 ; i < unique_count; i++ ) {
+      struct x86_reg dataXMM = make_xmm(i);
+
+      sse_cmpps(
+         func,
+         dataXMM,
+         get_temp(
+            TGSI_EXEC_TEMP_00000000_I,
+            TGSI_EXEC_TEMP_00000000_C ),
+         cc_LessThan );
+      
+      if( i == 0 ) {
+         sse_movmskps(
             func,
-            make_xmm( registers[chan_index] ),
-            get_temp(
-               TGSI_EXEC_TEMP_00000000_I,
-               TGSI_EXEC_TEMP_00000000_C ),
-            cc_LessThan );
-
-         if( chan_index == firstchan ) {
-            sse_pmovmskb(
-               func,
-               x86_make_reg( file_REG32, reg_AX ),
-               make_xmm( registers[chan_index] ) );
-         }
-         else {
-            sse_pmovmskb(
-               func,
-               x86_make_reg( file_REG32, reg_DX ),
-               make_xmm( registers[chan_index] ) );
-            x86_or(
-               func,
-               x86_make_reg( file_REG32, reg_AX ),
-               x86_make_reg( file_REG32, reg_DX ) );
-         }
+            x86_make_reg( file_REG32, reg_AX ),
+            dataXMM );
+      }
+      else {
+         sse_movmskps(
+            func,
+            x86_make_reg( file_REG32, reg_DX ),
+            dataXMM );
+         x86_or(
+            func,
+            x86_make_reg( file_REG32, reg_AX ),
+            x86_make_reg( file_REG32, reg_DX ) );
       }
    }
 
@@ -1396,6 +1710,31 @@ emit_cmp(
    }
 }
 
+
+/**
+ * Report whether any source or destination operand of \p inst is an
+ * indirectly addressed TGSI_FILE_TEMPORARY register (TRUE) or not (FALSE).
+ */
+static boolean
+indirect_temp_reference(const struct tgsi_full_instruction *inst)
+{
+   const uint num_src = inst->Instruction.NumSrcRegs;
+   const uint num_dst = inst->Instruction.NumDstRegs;
+   uint s, d;
+   for (s = 0; s < num_src; s++) {
+      const struct tgsi_full_src_register *src = &inst->FullSrcRegisters[s];
+      if (src->SrcRegister.Indirect && src->SrcRegister.File == TGSI_FILE_TEMPORARY)
+         return TRUE;
+   }
+   for (d = 0; d < num_dst; d++) {
+      const struct tgsi_full_dst_register *dst = &inst->FullDstRegisters[d];
+      if (dst->DstRegister.Indirect && dst->DstRegister.File == TGSI_FILE_TEMPORARY)
+         return TRUE;
+   }
+   return FALSE;
+}
+
+
 static int
 emit_instruction(
    struct x86_function *func,
@@ -1403,10 +1742,15 @@ emit_instruction(
 {
    unsigned chan_index;
 
+   /* we can't handle indirect addressing into temp register file yet */
+   if (indirect_temp_reference(inst))
+      return FALSE;
+
    switch (inst->Instruction.Opcode) {
    case TGSI_OPCODE_ARL:
       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( func, *inst, 0, 0, chan_index );
+         emit_flr(func, 0, 0);
          emit_f2it( func, 0 );
          STORE( func, *inst, 0, 0, chan_index );
       }
@@ -1473,7 +1817,7 @@ emit_instruction(
                get_temp(
                   TGSI_EXEC_TEMP_MINUS_128_I,
                   TGSI_EXEC_TEMP_MINUS_128_C ) );
-            emit_pow( func, 3, 1, 2 );
+            emit_pow( func, 3, 1, 1, 2 );
             FETCH( func, *inst, 0, 0, CHAN_X );
             sse_xorps(
                func,
@@ -1483,7 +1827,7 @@ emit_instruction(
                func,
                make_xmm( 2 ),
                make_xmm( 0 ),
-               cc_LessThanEqual );
+               cc_LessThan );
             sse_andps(
                func,
                make_xmm( 2 ),
@@ -1505,6 +1849,7 @@ emit_instruction(
    case TGSI_OPCODE_RSQ:
    /* TGSI_OPCODE_RECIPSQRT */
       FETCH( func, *inst, 0, 0, CHAN_X );
+      emit_abs( func, 0 );
       emit_rsqrt( func, 1, 0 );
       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
          STORE( func, *inst, 1, 0, chan_index );
@@ -1742,7 +2087,18 @@ emit_instruction(
 
    case TGSI_OPCODE_DOT2ADD:
    /* TGSI_OPCODE_DP2A */
-      return 0;
+      FETCH( func, *inst, 0, 0, CHAN_X );  /* xmm0 = src[0].x */
+      FETCH( func, *inst, 1, 1, CHAN_X );  /* xmm1 = src[1].x */
+      emit_mul( func, 0, 1 );              /* xmm0 = xmm0 * xmm1 */
+      FETCH( func, *inst, 1, 0, CHAN_Y );  /* xmm1 = src[0].y */
+      FETCH( func, *inst, 2, 1, CHAN_Y );  /* xmm2 = src[1].y */
+      emit_mul( func, 1, 2 );              /* xmm1 = xmm1 * xmm2 */
+      emit_add( func, 0, 1 );              /* xmm0 = xmm0 + xmm1 */
+      FETCH( func, *inst, 1, 2, CHAN_X );  /* xmm1 = src[2].x */
+      emit_add( func, 0, 1 );              /* xmm0 = xmm0 + xmm1 */
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( func, *inst, 0, 0, chan_index );  /* dest[ch] = xmm0 */
+      }
       break;
 
    case TGSI_OPCODE_INDEX:
@@ -1776,7 +2132,11 @@ emit_instruction(
       break;
 
    case TGSI_OPCODE_ROUND:
-      return 0;
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         emit_rnd( func, 0, 0 );
+         STORE( func, *inst, 0, 0, chan_index );
+      }
       break;
 
    case TGSI_OPCODE_EXPBASE2:
@@ -1801,7 +2161,7 @@ emit_instruction(
    /* TGSI_OPCODE_POW */
       FETCH( func, *inst, 0, 0, CHAN_X );
       FETCH( func, *inst, 1, 1, CHAN_X );
-      emit_pow( func, 0, 0, 1 );
+      emit_pow( func, 0, 0, 0, 1 );
       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
          STORE( func, *inst, 0, 0, chan_index );
       }
@@ -1970,21 +2330,7 @@ emit_instruction(
       break;
 
    case TGSI_OPCODE_TEX:
-      if (0) {
-	 /* Disable dummy texture code: 
-	  */
-	 emit_tempf(
-	    func,
-	    0,
-	    TEMP_ONE_I,
-	    TEMP_ONE_C );
-	 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-	    STORE( func, *inst, 0, 0, chan_index );
-	 }
-      }
-      else {
-	 return 0;
-      }
+      emit_tex( func, inst, FALSE, FALSE );
       break;
 
    case TGSI_OPCODE_TXD:
@@ -2016,7 +2362,12 @@ emit_instruction(
       break;
 
    case TGSI_OPCODE_ARR:
-      return 0;
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         emit_rnd( func, 0, 0 );
+         emit_f2it( func, 0 );
+         STORE( func, *inst, 0, 0, chan_index );
+      }
       break;
 
    case TGSI_OPCODE_BRA:
@@ -2035,7 +2386,12 @@ emit_instruction(
       break;
 
    case TGSI_OPCODE_SSG:
-      return 0;
+   /* TGSI_OPCODE_SGN */
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         emit_sgn( func, 0, 0 );
+         STORE( func, *inst, 0, 0, chan_index );
+      }
       break;
 
    case TGSI_OPCODE_CMP:
@@ -2072,11 +2428,94 @@ emit_instruction(
       break;
 
    case TGSI_OPCODE_TXB:
-      return 0;
+      emit_tex( func, inst, TRUE, FALSE );
       break;
 
    case TGSI_OPCODE_NRM:
-      return 0;
+      /* fall-through */
+   case TGSI_OPCODE_NRM4:
+      /* 3 or 4-component normalization */
+      {
+         uint dims = (inst->Instruction.Opcode == TGSI_OPCODE_NRM) ? 3 : 4;
+
+         if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) ||
+             IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y) ||
+             IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z) ||
+             (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 4)) {
+
+            /* NOTE: Cannot use xmm regs 2/3 here (see emit_rsqrt() above). */
+
+            /* xmm4 = src.x */
+            /* xmm0 = src.x * src.x */
+            FETCH(func, *inst, 0, 0, CHAN_X);
+            if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
+               emit_MOV(func, 4, 0);
+            }
+            emit_mul(func, 0, 0);
+
+            /* xmm5 = src.y */
+            /* xmm0 = xmm0 + src.y * src.y */
+            FETCH(func, *inst, 1, 0, CHAN_Y);
+            if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
+               emit_MOV(func, 5, 1);
+            }
+            emit_mul(func, 1, 1);
+            emit_add(func, 0, 1);
+
+            /* xmm6 = src.z */
+            /* xmm0 = xmm0 + src.z * src.z */
+            FETCH(func, *inst, 1, 0, CHAN_Z);
+            if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
+               emit_MOV(func, 6, 1);
+            }
+            emit_mul(func, 1, 1);
+            emit_add(func, 0, 1);
+
+            if (dims == 4) {
+               /* xmm7 = src.w */
+               /* xmm0 = xmm0 + src.w * src.w */
+               FETCH(func, *inst, 1, 0, CHAN_W);
+               if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W)) {
+                  emit_MOV(func, 7, 1);
+               }
+               emit_mul(func, 1, 1);
+               emit_add(func, 0, 1);
+            }
+
+            /* xmm1 = 1 / sqrt(xmm0) */
+            emit_rsqrt(func, 1, 0);
+
+            /* dst.x = xmm1 * src.x */
+            if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
+               emit_mul(func, 4, 1);
+               STORE(func, *inst, 4, 0, CHAN_X);
+            }
+
+            /* dst.y = xmm1 * src.y */
+            if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
+               emit_mul(func, 5, 1);
+               STORE(func, *inst, 5, 0, CHAN_Y);
+            }
+
+            /* dst.z = xmm1 * src.z */
+            if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
+               emit_mul(func, 6, 1);
+               STORE(func, *inst, 6, 0, CHAN_Z);
+            }
+
+            /* dst.w = xmm1 * src.w (xmm7 is only loaded when W is enabled) */
+            if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 4) {
+               emit_mul(func, 7, 1);
+               STORE(func, *inst, 7, 0, CHAN_W);
+            }
+         }
+
+         /* dst0.w = 1.0 */
+         if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 3) {
+            emit_tempf(func, 0, TEMP_ONE_I, TEMP_ONE_C);
+            STORE(func, *inst, 0, 0, CHAN_W);
+         }
+      }
       break;
 
    case TGSI_OPCODE_DIV:
@@ -2084,13 +2523,26 @@ emit_instruction(
       break;
 
    case TGSI_OPCODE_DP2:
-      return 0;
+      FETCH( func, *inst, 0, 0, CHAN_X );  /* xmm0 = src[0].x */
+      FETCH( func, *inst, 1, 1, CHAN_X );  /* xmm1 = src[1].x */
+      emit_mul( func, 0, 1 );              /* xmm0 = xmm0 * xmm1 */
+      FETCH( func, *inst, 1, 0, CHAN_Y );  /* xmm1 = src[0].y */
+      FETCH( func, *inst, 2, 1, CHAN_Y );  /* xmm2 = src[1].y */
+      emit_mul( func, 1, 2 );              /* xmm1 = xmm1 * xmm2 */
+      emit_add( func, 0, 1 );              /* xmm0 = xmm0 + xmm1 */
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( func, *inst, 0, 0, chan_index );  /* dest[ch] = xmm0 */
+      }
       break;
 
    case TGSI_OPCODE_TXL:
-      return 0;
+      emit_tex( func, inst, TRUE, FALSE );
       break;
 
+   case TGSI_OPCODE_TXP:
+      emit_tex( func, inst, FALSE, TRUE );
+      break;
+      
    case TGSI_OPCODE_BRK:
       return 0;
       break;
@@ -2270,7 +2722,7 @@ emit_declaration(
 
 static void aos_to_soa( struct x86_function *func, 
                         uint arg_aos,
-                        uint arg_soa, 
+                        uint arg_machine, 
                         uint arg_num, 
                         uint arg_stride )
 {
@@ -2285,7 +2737,10 @@ static void aos_to_soa( struct x86_function *func,
    x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
 
    x86_mov( func, aos_input,  x86_fn_arg( func, arg_aos ) );
-   x86_mov( func, soa_input,  x86_fn_arg( func, arg_soa ) );
+   x86_mov( func, soa_input,  x86_fn_arg( func, arg_machine ) );
+   x86_lea( func, soa_input,  
+	    x86_make_disp( soa_input, 
+			   Offset(struct tgsi_exec_machine, Inputs) ) );
    x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) );
    x86_mov( func, stride,     x86_fn_arg( func, arg_stride ) );
 
@@ -2327,28 +2782,30 @@ static void aos_to_soa( struct x86_function *func,
    x86_jcc( func, cc_NE, inner_loop );
 
    /* Restore EBX */
-   x86_pop( func, aos_input );
+   x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
 }
 
-static void soa_to_aos( struct x86_function *func, uint aos, uint soa, uint num, uint stride )
+static void soa_to_aos( struct x86_function *func, 
+			uint arg_aos, 
+			uint arg_machine, 
+			uint arg_num, 
+			uint arg_stride )
 {
-   struct x86_reg soa_output;
-   struct x86_reg aos_output;
-   struct x86_reg num_outputs;
-   struct x86_reg temp;
+   struct x86_reg soa_output = x86_make_reg( file_REG32, reg_AX );
+   struct x86_reg aos_output = x86_make_reg( file_REG32, reg_BX );
+   struct x86_reg num_outputs = x86_make_reg( file_REG32, reg_CX );
+   struct x86_reg temp = x86_make_reg( file_REG32, reg_DX );
    int inner_loop;
 
-   soa_output = x86_make_reg( file_REG32, reg_AX );
-   aos_output = x86_make_reg( file_REG32, reg_BX );
-   num_outputs = x86_make_reg( file_REG32, reg_CX );
-   temp = x86_make_reg( file_REG32, reg_DX );
-
    /* Save EBX */
-   x86_push( func, aos_output );
+   x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
 
-   x86_mov( func, soa_output, x86_fn_arg( func, soa ) );
-   x86_mov( func, aos_output, x86_fn_arg( func, aos ) );
-   x86_mov( func, num_outputs, x86_fn_arg( func, num ) );
+   x86_mov( func, aos_output, x86_fn_arg( func, arg_aos ) );
+   x86_mov( func, soa_output, x86_fn_arg( func, arg_machine ) );
+   x86_lea( func, soa_output, 
+	    x86_make_disp( soa_output, 
+			   Offset(struct tgsi_exec_machine, Outputs) ) );
+   x86_mov( func, num_outputs, x86_fn_arg( func, arg_num ) );
 
    /* do */
    inner_loop = x86_get_label( func );
@@ -2365,7 +2822,7 @@ static void soa_to_aos( struct x86_function *func, uint aos, uint soa, uint num,
       sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) );
       sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) );
 
-      x86_mov( func, temp, x86_fn_arg( func, stride ) );
+      x86_mov( func, temp, x86_fn_arg( func, arg_stride ) );
       x86_push( func, aos_output );
       sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
       sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
@@ -2389,20 +2846,13 @@ static void soa_to_aos( struct x86_function *func, uint aos, uint soa, uint num,
    x86_jcc( func, cc_NE, inner_loop );
 
    /* Restore EBX */
-   x86_pop( func, aos_output );
+   x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
 }
 
 /**
  * Translate a TGSI vertex/fragment shader to SSE2 code.
  * Slightly different things are done for vertex vs. fragment shaders.
  *
- * Note that fragment shaders are responsible for interpolating shader
- * inputs. Because on x86 we have only 4 GP registers, and here we
- * have 5 shader arguments (input, output, const, temp and coef), the
- * code is split into two phases -- DECLARATION and INSTRUCTION phase.
- * GP register holding the output argument is aliased with the coeff
- * argument, as outputs are not needed in the DECLARATION phase.
- *
  * \param tokens  the TGSI input shader
  * \param func  the output SSE code/function
  * \param immediates  buffer to place immediates, later passed to SSE func
@@ -2416,7 +2866,6 @@ tgsi_emit_sse2(
    boolean do_swizzles )
 {
    struct tgsi_parse_context parse;
-   boolean instruction_phase = FALSE;
    unsigned ok = 1;
    uint num_immediates = 0;
 
@@ -2428,74 +2877,48 @@ tgsi_emit_sse2(
 
    /* Can't just use EDI, EBX without save/restoring them:
     */
-   x86_push(
-      func,
-      get_immediate_base() );
-
-   x86_push(
-      func,
-      get_temp_base() );
-
+   x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
+   x86_push( func, x86_make_reg( file_REG32, reg_DI ) );
 
    /*
     * Different function args for vertex/fragment shaders:
     */
-   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
-      /* DECLARATION phase, do not load output argument. */
-      x86_mov(
-         func,
-         get_input_base(),
-         x86_fn_arg( func, 1 ) );
-      /* skipping outputs argument here */
-      x86_mov(
-         func,
-         get_const_base(),
-         x86_fn_arg( func, 3 ) );
-      x86_mov(
-         func,
-         get_temp_base(),
-         x86_fn_arg( func, 4 ) );
-      x86_mov(
-         func,
-         get_coef_base(),
-         x86_fn_arg( func, 5 ) );
-      x86_mov(
-         func,
-         get_immediate_base(),
-         x86_fn_arg( func, 6 ) );
-   }
-   else {
-      assert(parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX);
-
+   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
       if (do_swizzles)
          aos_to_soa( func, 
-                     6,         /* aos_input */
-                     1,         /* machine->input */
-                     7,         /* num_inputs */
-                     8 );       /* input_stride */
+                     4,         /* aos_input */
+                     1,         /* machine */
+                     5,         /* num_inputs */
+                     6 );       /* input_stride */
+   }
 
+   x86_mov(
+      func,
+      get_machine_base(),
+      x86_fn_arg( func, 1 ) );
+   x86_mov(
+      func,
+      get_const_base(),
+      x86_fn_arg( func, 2 ) );
+   x86_mov(
+      func,
+      get_immediate_base(),
+      x86_fn_arg( func, 3 ) );
+
+   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
       x86_mov(
-         func,
-         get_input_base(),
-         x86_fn_arg( func, 1 ) );
-      x86_mov(
-         func,
-         get_output_base(),
-         x86_fn_arg( func, 2 ) );
-      x86_mov(
-         func,
-         get_const_base(),
-         x86_fn_arg( func, 3 ) );
-      x86_mov(
-         func,
-         get_temp_base(),
-         x86_fn_arg( func, 4 ) );
+	 func,
+	 get_coef_base(),
+	 x86_fn_arg( func, 4 ) );
+
       x86_mov(
-         func,
-         get_immediate_base(),
-         x86_fn_arg( func, 5 ) );
+	 func,
+	 get_sampler_base(),
+	 x86_make_disp( get_machine_base(),
+                        Offset( struct tgsi_exec_machine, Samplers ) ) );
    }
 
+
    while( !tgsi_parse_end_of_tokens( &parse ) && ok ) {
       tgsi_parse_token( &parse );
 
@@ -2509,17 +2932,6 @@ tgsi_emit_sse2(
          break;
 
       case TGSI_TOKEN_TYPE_INSTRUCTION:
-         if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
-            if( !instruction_phase ) {
-               /* INSTRUCTION phase, overwrite coeff with output. */
-               instruction_phase = TRUE;
-               x86_mov(
-                  func,
-                  get_output_base(),
-                  x86_fn_arg( func, 2 ) );
-            }
-         }
-
          ok = emit_instruction(
             func,
             &parse.FullToken.FullInstruction );
@@ -2535,7 +2947,7 @@ tgsi_emit_sse2(
       case TGSI_TOKEN_TYPE_IMMEDIATE:
          /* simply copy the immediate values into the next immediates[] slot */
          {
-            const uint size = parse.FullToken.FullImmediate.Immediate.Size - 1;
+            const uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
             uint i;
             assert(size <= 4);
             assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES);
@@ -2563,18 +2975,17 @@ tgsi_emit_sse2(
 
    if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
       if (do_swizzles)
-         soa_to_aos( func, 9, 2, 10, 11 );
+         soa_to_aos( func, 
+		     7, 	/* aos_output */
+		     1, 	/* machine */
+		     8, 	/* num_outputs */
+		     9 );	/* output_stride */
    }
 
    /* Can't just use EBX, EDI without save/restoring them:
     */
-   x86_pop(
-      func,
-      get_temp_base() );
-
-   x86_pop(
-      func,
-      get_immediate_base() );
+   x86_pop( func, x86_make_reg( file_REG32, reg_DI ) );
+   x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
 
    emit_ret( func );