tgsi: fix regression in indexed const lookups

[mesa.git] / src / gallium / auxiliary / tgsi / tgsi_sse2.c
diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/tgsi_sse2.c

index c115956c5d788389c3b75765b08effb010a7fc78..a4b86aba98660fe24281811b597487fc63ec0981 100644 (file)
--- a/src/gallium/auxiliary/tgsi/tgsi_sse2.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
@@ -25,9 +25,17 @@
   * 
   **************************************************************************/
  
-#include "pipe/p_debug.h"
+#include "pipe/p_config.h"
+
+#if defined(PIPE_ARCH_X86)
+
+#include "util/u_debug.h"
  #include "pipe/p_shader_tokens.h"
  #include "util/u_math.h"
+#include "util/u_memory.h"
+#if defined(PIPE_ARCH_SSE)
+#include "util/u_sse.h"
+#endif
  #include "tgsi/tgsi_parse.h"
  #include "tgsi/tgsi_util.h"
  #include "tgsi_exec.h"
@@ -35,8 +43,6 @@
  
  #include "rtasm/rtasm_x86sse.h"
  
-#ifdef PIPE_ARCH_X86
-
  /* for 1/sqrt()
   *
   * This costs about 100fps (close to 10%) in gears:
@@ -69,6 +75,9 @@
  
  #define TEMP_R0   TGSI_EXEC_TEMP_R0
  #define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
+#define TEMP_EXEC_MASK_I TGSI_EXEC_MASK_I
+#define TEMP_EXEC_MASK_C TGSI_EXEC_MASK_C
+
  
  /**
   * X86 utility functions.
@@ -92,27 +101,43 @@ get_const_base( void )
  {
     return x86_make_reg(
        file_REG32,
-      reg_CX );
+      reg_AX );
  }
  
  static struct x86_reg
-get_input_base( void )
+get_machine_base( void )
  {
     return x86_make_reg(
        file_REG32,
-      reg_AX );
+      reg_CX );
+}
+
+static struct x86_reg
+get_input_base( void )
+{
+   return x86_make_disp(
+      get_machine_base(),
+      Offset(struct tgsi_exec_machine, Inputs) );
  }
  
  static struct x86_reg
  get_output_base( void )
  {
-   return x86_make_reg(
-      file_REG32,
-      reg_DX );
+   return x86_make_disp(
+      get_machine_base(),
+      Offset(struct tgsi_exec_machine, Outputs) );
  }
  
  static struct x86_reg
  get_temp_base( void )
+{
+   return x86_make_disp(
+      get_machine_base(),
+      Offset(struct tgsi_exec_machine, Temps) );
+}
+
+static struct x86_reg
+get_coef_base( void )
  {
     return x86_make_reg(
        file_REG32,
@@ -120,9 +145,11 @@ get_temp_base( void )
  }
  
  static struct x86_reg
-get_coef_base( void )
+get_sampler_base( void )
  {
-   return get_output_base();
+   return x86_make_reg(
+      file_REG32,
+      reg_DI );
  }
  
  static struct x86_reg
@@ -130,7 +157,7 @@ get_immediate_base( void )
  {
     return x86_make_reg(
        file_REG32,
-      reg_DI );
+      reg_DX );
  }
  
  
@@ -159,6 +186,15 @@ get_const(
        (vec * 4 + chan) * 4 );
  }
  
+static struct x86_reg
+get_sampler_ptr(
+   unsigned unit )
+{
+   return x86_make_disp(
+      get_sampler_base(),
+      unit * sizeof( struct tgsi_sampler * ) );
+}
+
  static struct x86_reg
  get_input(
     unsigned vec,
@@ -230,28 +266,55 @@ emit_const(
     int indirectIndex )
  {
     if (indirect) {
-      struct x86_reg r0 = get_input_base();
-      struct x86_reg r1 = get_output_base();
+      /* 'vec' is the offset from the address register's value.
+       * We're loading CONST[ADDR+vec] into an xmm register.
+       */
+      struct x86_reg r0 = get_immediate_base();
+      struct x86_reg r1 = get_coef_base();
        uint i;
  
        assert( indirectFile == TGSI_FILE_ADDRESS );
        assert( indirectIndex == 0 );
+      assert( r0.mod == mod_REG );
+      assert( r1.mod == mod_REG );
  
        x86_push( func, r0 );
        x86_push( func, r1 );
  
+      /*
+       * Loop over the four pixels or vertices in the quad.
+       * Get the value of the address (offset) register for pixel/vertex[i],
+       * add it to the src offset and index into the constant buffer.
+       * Note that we're working on SOA data.
+       * If any of the pixel/vertex execution channels are unused their
+       * values will be garbage.  It's very important that we don't use
+       * those garbage values as indexes into the constant buffer since
+       * that'll cause segfaults.
+       * The solution is to bitwise-AND the offset with the execution mask
+       * register whose values are either 0 or ~0.
+       * The caller must set up the execution mask register to indicate
+       * which channels are valid/alive before running the shader.
+       * The execution mask will also figure into loops and conditionals
+       * someday.
+       */
        for (i = 0; i < QUAD_SIZE; i++) {
-         x86_lea( func, r0, get_const( vec, chan ) );
+         /* r1 = address register[i] */
           x86_mov( func, r1, x86_make_disp( get_temp( TEMP_ADDR, CHAN_X ), i * 4 ) );
+         /* r0 = execution mask[i] */
+         x86_mov( func, r0, x86_make_disp( get_temp( TEMP_EXEC_MASK_I, TEMP_EXEC_MASK_C ), i * 4 ) );
+         /* r1 = r1 & r0 */
+         x86_and( func, r1, r0 );
+         /* r0 = address of CONST[vec].chan (the base before indexing) */
+         x86_lea( func, r0, get_const( vec, chan ) );
  
-         /* Quick hack to multiply by 16 -- need to add SHL to rtasm.
+         /* Quick hack to multiply r1 by 16 -- need to add SHL to rtasm.
            */
           x86_add( func, r1, r1 );
           x86_add( func, r1, r1 );
           x86_add( func, r1, r1 );
           x86_add( func, r1, r1 );
  
-         x86_add( func, r0, r1 );
+         x86_add( func, r0, r1 );  /* r0 = r0 + r1 */
           x86_mov( func, r1, x86_deref( r0 ) );
           x86_mov( func, x86_make_disp( get_temp( TEMP_R0, CHAN_X ), i * 4 ), r1 );
        }
@@ -265,6 +328,7 @@ emit_const(
           get_temp( TEMP_R0, CHAN_X ) );
     }
     else {
+      /* 'vec' is the index into the src register file, such as TEMP[vec] */
        assert( vec >= 0 );
  
        sse_movss(
@@ -480,10 +544,22 @@ emit_coef_dady(
   * Function call helpers.
   */
  
+/**
+ * NOTE: In gcc, if the called function uses SSE intrinsics, then it must be
+ * defined with __attribute__((force_align_arg_pointer)), because we do not
+ * guarantee that the stack pointer is 16-byte aligned, as that code expects.
+ */
  static void
-emit_push_gp(
-   struct x86_function *func )
+emit_func_call(
+   struct x86_function *func,
+   unsigned xmm_save_mask,
+   const struct x86_reg *arg,
+   unsigned nr_args,
+   void (PIPE_CDECL *code)() )
  {
+   struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
+   unsigned i, n;
+
     x86_push(
        func,
        x86_make_reg( file_REG32, reg_AX) );
@@ -493,12 +569,67 @@ emit_push_gp(
     x86_push(
        func,
        x86_make_reg( file_REG32, reg_DX) );
-}
+   
+   /* Store XMM regs to the stack
+    */
+   for(i = 0, n = 0; i < 8; ++i)
+      if(xmm_save_mask & (1 << i))
+         ++n;
+   
+   x86_sub_imm(
+      func, 
+      x86_make_reg( file_REG32, reg_SP ),
+      n*16);
+
+   for(i = 0, n = 0; i < 8; ++i)
+      if(xmm_save_mask & (1 << i)) {
+         sse_movups(
+            func,
+            x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ),
+            make_xmm( i ) );
+         ++n;
+      }
+
+   for (i = 0; i < nr_args; i++) {
+      /* Load the address of the buffer we use for passing arguments and
+       * receiving results:
+       */
+      x86_lea(
+        func,
+        ecx,
+        arg[i] );
+   
+      /* Push actual function arguments (currently just the pointer to
+       * the buffer above), and call the function:
+       */
+      x86_push( func, ecx );
+   }
+
+   x86_mov_reg_imm( func, ecx, (unsigned long) code );
+   x86_call( func, ecx );
+
+   /* Pop the arguments (or just add an immediate to esp)
+    */
+   for (i = 0; i < nr_args; i++) {
+      x86_pop(func, ecx );
+   }
+
+   /* Pop the saved XMM regs:
+    */
+   for(i = 0, n = 0; i < 8; ++i)
+      if(xmm_save_mask & (1 << i)) {
+         sse_movups(
+            func,
+            make_xmm( i ),
+            x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ) );
+         ++n;
+      }
+   
+   x86_add_imm(
+      func, 
+      x86_make_reg( file_REG32, reg_SP ),
+      n*16);
  
-static void
-x86_pop_gp(
-   struct x86_function *func )
-{
     /* Restore GP registers in a reverse order.
      */
     x86_pop(
@@ -513,61 +644,190 @@ x86_pop_gp(
  }
  
  static void
-emit_func_call_dst(
+emit_func_call_dst_src1(
     struct x86_function *func,
+   unsigned xmm_save, 
     unsigned xmm_dst,
+   unsigned xmm_src0,
     void (PIPE_CDECL *code)() )
  {
+   struct x86_reg store = get_temp( TEMP_R0, 0 );
+   unsigned xmm_mask = ((1 << xmm_save) - 1) & ~(1 << xmm_dst);
+   
+   /* Store our input parameters (in xmm regs) to the buffer we use
+    * for passing arguments.  We will pass a pointer to this buffer as
+    * the actual function argument.
+    */
     sse_movaps(
        func,
-      get_temp( TEMP_R0, 0 ),
-      make_xmm( xmm_dst ) );
-
-   emit_push_gp(
-      func );
+      store,
+      make_xmm( xmm_src0 ) );
  
-   {
-      struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
-
-      x86_lea(
-         func,
-         ecx,
-         get_temp( TEMP_R0, 0 ) );
-
-      x86_push( func, ecx );
-      x86_mov_reg_imm( func, ecx, (unsigned long) code );
-      x86_call( func, ecx );
-      x86_pop(func, ecx ); 
-   }
-
-
-   x86_pop_gp(
-      func );
+   emit_func_call( func,
+                   xmm_mask,
+                   &store,
+                   1,
+                   code );
  
     sse_movaps(
        func,
        make_xmm( xmm_dst ),
-      get_temp( TEMP_R0, 0 ) );
+      store );
  }
  
+
  static void
-emit_func_call_dst_src(
+emit_func_call_dst_src2(
     struct x86_function *func,
+   unsigned xmm_save, 
     unsigned xmm_dst,
-   unsigned xmm_src,
+   unsigned xmm_src0,
+   unsigned xmm_src1,
     void (PIPE_CDECL *code)() )
  {
+   struct x86_reg store = get_temp( TEMP_R0, 0 );
+   unsigned xmm_mask = ((1 << xmm_save) - 1) & ~(1 << xmm_dst);
+
+   /* Store two inputs to parameter buffer.
+    */
     sse_movaps(
        func,
-      get_temp( TEMP_R0, 1 ),
-      make_xmm( xmm_src ) );
+      store,
+      make_xmm( xmm_src0 ) );
  
-   emit_func_call_dst(
+   sse_movaps(
        func,
-      xmm_dst,
-      code );
+      x86_make_disp( store, 4 * sizeof(float) ),
+      make_xmm( xmm_src1 ) );
+
+
+   /* Emit the call
+    */
+   emit_func_call( func,
+                   xmm_mask,
+                   &store,
+                   1,
+                   code );
+
+   /* Retrieve the results:
+    */
+   sse_movaps(
+      func,
+      make_xmm( xmm_dst ),
+      store );
+}
+
+
+
+
+
+#if defined(PIPE_ARCH_SSE)
+
+/*
+ * Fast SSE2 implementation of special math functions.
+ */
+
+#define POLY0(x, c0) _mm_set1_ps(c0)
+#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
+#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
+#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
+#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
+#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
+
+#define EXP_POLY_DEGREE 3
+#define LOG_POLY_DEGREE 5
+
+/**
+ * Approximate 2**x for four packed floats; see http://www.devmaster.net/forums/showthread.php?p=43580
+ */
+static INLINE __m128 
+exp2f4(__m128 x)
+{
+   __m128i ipart;
+   __m128 fpart, expipart, expfpart;
+
+   x = _mm_min_ps(x, _mm_set1_ps( 129.00000f));
+   x = _mm_max_ps(x, _mm_set1_ps(-126.99999f));
+
+   /* ipart = int(x - 0.5) */
+   ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f)));
+
+   /* fpart = x - ipart */
+   fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart));
+
+   /* expipart = (float) (1 << ipart) */
+   expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23));
+
+   /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
+#if EXP_POLY_DEGREE == 5
+   expfpart = POLY5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f);
+#elif EXP_POLY_DEGREE == 4
+   expfpart = POLY4(fpart, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 5.2011464e-2f, 1.3534167e-2f);
+#elif EXP_POLY_DEGREE == 3
+   expfpart = POLY3(fpart, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 7.8024521e-2f);
+#elif EXP_POLY_DEGREE == 2
+   expfpart = POLY2(fpart, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f);
+#else
+#error
+#endif
+
+   return _mm_mul_ps(expipart, expfpart);
+}
+
+
+/**
+ * Approximate log2(x) for four packed floats; see http://www.devmaster.net/forums/showthread.php?p=43580
+ */
+static INLINE __m128 
+log2f4(__m128 x)
+{
+   __m128i expmask = _mm_set1_epi32(0x7f800000);
+   __m128i mantmask = _mm_set1_epi32(0x007fffff);
+   __m128 one = _mm_set1_ps(1.0f);
+
+   __m128i i = _mm_castps_si128(x);
+
+   /* exp = (float) exponent(x) */
+   __m128 exp = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, expmask), 23), _mm_set1_epi32(127)));
+
+   /* mant = (float) mantissa(x) */
+   __m128 mant = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, mantmask)), one);
+
+   __m128 logmant;
+
+   /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[ 
+    * These coefficients can be generated with
+    * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
+    */
+#if LOG_POLY_DEGREE == 6
+   logmant = POLY5(mant, 3.11578814719469302614f, -3.32419399085241980044f, 2.59883907202499966007f, -1.23152682416275988241f, 0.318212422185251071475f, -0.0344359067839062357313f);
+#elif LOG_POLY_DEGREE == 5
+   logmant = POLY4(mant, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
+#elif LOG_POLY_DEGREE == 4
+   logmant = POLY3(mant, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
+#elif LOG_POLY_DEGREE == 3
+   logmant = POLY2(mant, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
+#else
+#error
+#endif
+
+   /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0 */
+   logmant = _mm_mul_ps(logmant, _mm_sub_ps(mant, one));
+
+   return _mm_add_ps(logmant, exp);
+}
+
+
+static INLINE __m128
+powf4(__m128 x, __m128 y)
+{
+   return exp2f4(_mm_mul_ps(log2f4(x), y));
  }
  
+#endif /* PIPE_ARCH_SSE */
+
+
+
  /**
   * Low-level instruction translators.
   */
@@ -610,38 +870,44 @@ cos4f(
  static void
  emit_cos(
     struct x86_function *func,
+   unsigned xmm_save, 
     unsigned xmm_dst )
  {
-   emit_func_call_dst(
+   emit_func_call_dst_src1(
        func,
+      xmm_save, 
+      xmm_dst,
        xmm_dst,
        cos4f );
  }
  
  static void PIPE_CDECL
+#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
+__attribute__((force_align_arg_pointer))
+#endif
  ex24f(
     float *store )
  {
-#if FAST_MATH
+#if defined(PIPE_ARCH_SSE)
+   _mm_store_ps(&store[0], exp2f4( _mm_load_ps(&store[0]) ));
+#else
     store[0] = util_fast_exp2( store[0] );
     store[1] = util_fast_exp2( store[1] );
     store[2] = util_fast_exp2( store[2] );
     store[3] = util_fast_exp2( store[3] );
-#else
-   store[0] = powf( 2.0f, store[0] );
-   store[1] = powf( 2.0f, store[1] );
-   store[2] = powf( 2.0f, store[2] );
-   store[3] = powf( 2.0f, store[3] );
  #endif
  }
  
  static void
  emit_ex2(
     struct x86_function *func,
+   unsigned xmm_save, 
     unsigned xmm_dst )
  {
-   emit_func_call_dst(
+   emit_func_call_dst_src1(
        func,
+      xmm_save,
+      xmm_dst,
        xmm_dst,
        ex24f );
  }
@@ -681,10 +947,13 @@ flr4f(
  static void
  emit_flr(
     struct x86_function *func,
+   unsigned xmm_save, 
     unsigned xmm_dst )
  {
-   emit_func_call_dst(
+   emit_func_call_dst_src1(
        func,
+      xmm_save,
+      xmm_dst,
        xmm_dst,
        flr4f );
  }
@@ -702,31 +971,44 @@ frc4f(
  static void
  emit_frc(
     struct x86_function *func,
+   unsigned xmm_save, 
     unsigned xmm_dst )
  {
-   emit_func_call_dst(
+   emit_func_call_dst_src1(
        func,
+      xmm_save,
+      xmm_dst,
        xmm_dst,
        frc4f );
  }
  
  static void PIPE_CDECL
+#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
+__attribute__((force_align_arg_pointer))
+#endif
  lg24f(
     float *store )
  {
+#if defined(PIPE_ARCH_SSE)
+   _mm_store_ps(&store[0], log2f4( _mm_load_ps(&store[0]) ));
+#else
     store[0] = util_fast_log2( store[0] );
     store[1] = util_fast_log2( store[1] );
     store[2] = util_fast_log2( store[2] );
     store[3] = util_fast_log2( store[3] );
+#endif
  }
  
  static void
  emit_lg2(
     struct x86_function *func,
+   unsigned xmm_save, 
     unsigned xmm_dst )
  {
-   emit_func_call_dst(
+   emit_func_call_dst_src1(
        func,
+      xmm_save,
+      xmm_dst,
        xmm_dst,
        lg24f );
  }
@@ -768,32 +1050,36 @@ emit_neg(
  }
  
  static void PIPE_CDECL
+#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
+__attribute__((force_align_arg_pointer))
+#endif
  pow4f(
     float *store )
  {
-#if FAST_MATH
+#if defined(PIPE_ARCH_SSE)
+   _mm_store_ps(&store[0], powf4( _mm_load_ps(&store[0]), _mm_load_ps(&store[4]) ));
+#else
     store[0] = util_fast_pow( store[0], store[4] );
     store[1] = util_fast_pow( store[1], store[5] );
     store[2] = util_fast_pow( store[2], store[6] );
     store[3] = util_fast_pow( store[3], store[7] );
-#else
-   store[0] = powf( store[0], store[4] );
-   store[1] = powf( store[1], store[5] );
-   store[2] = powf( store[2], store[6] );
-   store[3] = powf( store[3], store[7] );
  #endif
  }
  
  static void
  emit_pow(
     struct x86_function *func,
+   unsigned xmm_save, 
     unsigned xmm_dst,
-   unsigned xmm_src )
+   unsigned xmm_src0,
+   unsigned xmm_src1 )
  {
-   emit_func_call_dst_src(
+   emit_func_call_dst_src2(
        func,
+      xmm_save,
        xmm_dst,
-      xmm_src,
+      xmm_src0,
+      xmm_src1,
        pow4f );
  }
  
@@ -813,6 +1099,30 @@ emit_rcp (
        make_xmm( xmm_src ) );
  }
  
+static void PIPE_CDECL
+rnd4f(
+   float *store )
+{
+   store[0] = floorf( store[0] + 0.5f );
+   store[1] = floorf( store[1] + 0.5f );
+   store[2] = floorf( store[2] + 0.5f );
+   store[3] = floorf( store[3] + 0.5f );
+}
+
+static void
+emit_rnd(
+   struct x86_function *func,
+   unsigned xmm_save, 
+   unsigned xmm_dst )
+{
+   emit_func_call_dst_src1(
+      func,
+      xmm_save,
+      xmm_dst,
+      xmm_dst,
+      rnd4f );
+}
+
  static void
  emit_rsqrt(
     struct x86_function *func,
@@ -872,6 +1182,30 @@ emit_setsign(
           TGSI_EXEC_TEMP_80000000_C ) );
  }
  
+static void PIPE_CDECL
+sgn4f(
+   float *store )
+{
+   store[0] = store[0] < 0.0f ? -1.0f : store[0] > 0.0f ? 1.0f : 0.0f;
+   store[1] = store[1] < 0.0f ? -1.0f : store[1] > 0.0f ? 1.0f : 0.0f;
+   store[2] = store[2] < 0.0f ? -1.0f : store[2] > 0.0f ? 1.0f : 0.0f;
+   store[3] = store[3] < 0.0f ? -1.0f : store[3] > 0.0f ? 1.0f : 0.0f;
+}
+
+static void
+emit_sgn(
+   struct x86_function *func,
+   unsigned xmm_save, 
+   unsigned xmm_dst )
+{
+   emit_func_call_dst_src1(
+      func,
+      xmm_save,
+      xmm_dst,
+      xmm_dst,
+      sgn4f );
+}
+
  static void PIPE_CDECL
  sin4f(
     float *store )
@@ -884,10 +1218,13 @@ sin4f(
  
  static void
  emit_sin (struct x86_function *func,
+          unsigned xmm_save, 
            unsigned xmm_dst)
  {
-   emit_func_call_dst(
+   emit_func_call_dst_src1(
        func,
+      xmm_save,
+      xmm_dst,
        xmm_dst,
        sin4f );
  }
@@ -904,6 +1241,12 @@ emit_sub(
        make_xmm( xmm_src ) );
  }
  
+
+
+
+
+
+
  /**
   * Register fetch.
   */
@@ -1062,20 +1405,164 @@ emit_store(
  #define STORE( FUNC, INST, XMM, INDEX, CHAN )\
     emit_store( FUNC, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN )
  
+
+static void PIPE_CDECL
+fetch_texel( struct tgsi_sampler **sampler,
+             float *store )
+{
+#if 0
+   uint j;
+
+   debug_printf("%s sampler: %p (%p) store: %p\n", 
+                __FUNCTION__,
+                sampler, *sampler,
+                store );
+
+   debug_printf("lodbias %f\n", store[12]);
+
+   for (j = 0; j < 4; j++)
+      debug_printf("sample %d texcoord %f %f\n", 
+                   j, 
+                   store[0+j],
+                   store[4+j]);
+#endif
+
+   {
+      float rgba[NUM_CHANNELS][QUAD_SIZE];
+      (*sampler)->get_samples(*sampler, 
+                              &store[0], 
+                              &store[4], 
+                              &store[8], 
+                              0.0f, /*store[12],  lodbias */
+                              rgba);
+
+      memcpy( store, rgba, 16 * sizeof(float));
+   }
+
+#if 0
+   for (j = 0; j < 4; j++)
+      debug_printf("sample %d result %f %f %f %f\n", 
+                   j, 
+                   store[0+j],
+                   store[4+j],
+                   store[8+j],
+                   store[12+j]);
+#endif
+}
+
  /**
   * High-level instruction translators.
   */
  
+static void
+emit_tex( struct x86_function *func,
+          const struct tgsi_full_instruction *inst,
+          boolean lodbias,
+          boolean projected)
+{
+   const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index;
+   struct x86_reg args[2];
+   unsigned count;
+   unsigned i;
+
+   switch (inst->InstructionExtTexture.Texture) {
+   case TGSI_TEXTURE_1D:
+   case TGSI_TEXTURE_SHADOW1D:
+      count = 1;
+      break;
+   case TGSI_TEXTURE_2D:
+   case TGSI_TEXTURE_RECT:
+   case TGSI_TEXTURE_SHADOW2D:
+   case TGSI_TEXTURE_SHADOWRECT:
+      count = 2;
+      break;
+   case TGSI_TEXTURE_3D:
+   case TGSI_TEXTURE_CUBE:
+      count = 3;
+      break;
+   default:
+      assert(0);
+      return;
+   }
+
+   if (lodbias) {
+      FETCH( func, *inst, 3, 0, 3 );
+   }
+   else {
+      emit_tempf(
+         func,
+         3,
+         TGSI_EXEC_TEMP_00000000_I,
+         TGSI_EXEC_TEMP_00000000_C );
+
+   }
+
+   /* Store lodbias whether enabled or not.  NOTE: fetch_texel currently
+    * passes a constant 0.0f as the lod bias instead of this stored value.
+    */
+   sse_movaps( func,
+               get_temp( TEMP_R0, 3 ),
+               make_xmm( 3 ) );
+
+   
+   if (projected) {
+      FETCH( func, *inst, 3, 0, 3 );
+
+      emit_rcp( func, 3, 3 );
+   }
+
+   for (i = 0; i < count; i++) {
+      FETCH( func, *inst, i, 0, i );
+
+      if (projected) {
+         sse_mulps(
+            func,
+            make_xmm( i ),
+            make_xmm( 3 ) );
+      }
+      
+      /* Store in the argument buffer:
+       */
+      sse_movaps(
+         func,
+         get_temp( TEMP_R0, i ),
+         make_xmm( i ) );
+   }
+
+   args[0] = get_temp( TEMP_R0, 0 );
+   args[1] = get_sampler_ptr( unit );
+
+
+   emit_func_call( func,
+                   0,
+                   args,
+                   Elements(args),
+                   fetch_texel );
+
+   /* If all four channels are enabled, could use a pointer to
+    * dst[0].x instead of TEMP_R0 for store?
+    */
+   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, i ) {
+
+      sse_movaps(
+         func,
+         make_xmm( 0 ),
+         get_temp( TEMP_R0, i ) );
+
+      STORE( func, *inst, 0, 0, i );
+   }
+}
+
+
  static void
  emit_kil(
     struct x86_function *func,
     const struct tgsi_full_src_register *reg )
  {
     unsigned uniquemask;
-   unsigned registers[4];
-   unsigned nextregister = 0;
-   unsigned firstchan = ~0;
+   unsigned unique_count = 0;
     unsigned chan_index;
+   unsigned i;
  
     /* This mask stores component bits that were already tested. Note that
      * we test if the value is less than zero, so 1.0 and 0.0 need not to be
@@ -1095,18 +1582,11 @@ emit_kil(
           uniquemask |= 1 << swizzle;
  
           /* allocate register */
-         registers[chan_index] = nextregister;
           emit_fetch(
              func,
-            nextregister,
+            unique_count++,
              reg,
              chan_index );
-         nextregister++;
-
-         /* mark the first channel used */
-         if( firstchan == ~0 ) {
-            firstchan = chan_index;
-         }
        }
     }
  
@@ -1117,32 +1597,32 @@ emit_kil(
        func,
        x86_make_reg( file_REG32, reg_DX ) );
  
-   FOR_EACH_CHANNEL( chan_index ) {
-      if( uniquemask & (1 << chan_index) ) {
-         sse_cmpps(
+   for (i = 0 ; i < unique_count; i++ ) {
+      struct x86_reg dataXMM = make_xmm(i);
+
+      sse_cmpps(
+         func,
+         dataXMM,
+         get_temp(
+            TGSI_EXEC_TEMP_00000000_I,
+            TGSI_EXEC_TEMP_00000000_C ),
+         cc_LessThan );
+      
+      if( i == 0 ) {
+         sse_movmskps(
              func,
-            make_xmm( registers[chan_index] ),
-            get_temp(
-               TGSI_EXEC_TEMP_00000000_I,
-               TGSI_EXEC_TEMP_00000000_C ),
-            cc_LessThan );
-
-         if( chan_index == firstchan ) {
-            sse_pmovmskb(
-               func,
-               x86_make_reg( file_REG32, reg_AX ),
-               make_xmm( registers[chan_index] ) );
-         }
-         else {
-            sse_pmovmskb(
-               func,
-               x86_make_reg( file_REG32, reg_DX ),
-               make_xmm( registers[chan_index] ) );
-            x86_or(
-               func,
-               x86_make_reg( file_REG32, reg_AX ),
-               x86_make_reg( file_REG32, reg_DX ) );
-         }
+            x86_make_reg( file_REG32, reg_AX ),
+            dataXMM );
+      }
+      else {
+         sse_movmskps(
+            func,
+            x86_make_reg( file_REG32, reg_DX ),
+            dataXMM );
+         x86_or(
+            func,
+            x86_make_reg( file_REG32, reg_AX ),
+            x86_make_reg( file_REG32, reg_DX ) );
        }
     }
  
@@ -1230,6 +1710,31 @@ emit_cmp(
     }
  }
  
+
+/**
+ * Check if inst src/dest regs use indirect addressing into temporary
+ * register file.
+ */
+static boolean
+indirect_temp_reference(const struct tgsi_full_instruction *inst)
+{
+   uint i;
+   for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
+      const struct tgsi_full_src_register *reg = &inst->FullSrcRegisters[i];
+      if (reg->SrcRegister.File == TGSI_FILE_TEMPORARY &&
+          reg->SrcRegister.Indirect)
+         return TRUE;
+   }
+   for (i = 0; i < inst->Instruction.NumDstRegs; i++) {
+      const struct tgsi_full_dst_register *reg = &inst->FullDstRegisters[i];
+      if (reg->DstRegister.File == TGSI_FILE_TEMPORARY &&
+          reg->DstRegister.Indirect)
+         return TRUE;
+   }
+   return FALSE;
+}
+
+
  static int
  emit_instruction(
     struct x86_function *func,
@@ -1237,10 +1742,15 @@ emit_instruction(
  {
     unsigned chan_index;
  
+   /* we can't handle indirect addressing into temp register file yet */
+   if (indirect_temp_reference(inst))
+      return FALSE;
+
     switch (inst->Instruction.Opcode) {
     case TGSI_OPCODE_ARL:
        FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
           FETCH( func, *inst, 0, 0, chan_index );
+         emit_flr(func, 0, 0);
           emit_f2it( func, 0 );
           STORE( func, *inst, 0, 0, chan_index );
        }
@@ -1307,7 +1817,7 @@ emit_instruction(
                 get_temp(
                    TGSI_EXEC_TEMP_MINUS_128_I,
                    TGSI_EXEC_TEMP_MINUS_128_C ) );
-            emit_pow( func, 1, 2 );
+            emit_pow( func, 3, 1, 1, 2 );
              FETCH( func, *inst, 0, 0, CHAN_X );
              sse_xorps(
                 func,
@@ -1317,7 +1827,7 @@ emit_instruction(
                 func,
                 make_xmm( 2 ),
                 make_xmm( 0 ),
-               cc_LessThanEqual );
+               cc_LessThan );
              sse_andps(
                 func,
                 make_xmm( 2 ),
@@ -1339,6 +1849,7 @@ emit_instruction(
     case TGSI_OPCODE_RSQ:
     /* TGSI_OPCODE_RECIPSQRT */
        FETCH( func, *inst, 0, 0, CHAN_X );
+      emit_abs( func, 0 );
        emit_rsqrt( func, 1, 0 );
        FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
           STORE( func, *inst, 1, 0, chan_index );
@@ -1353,11 +1864,11 @@ emit_instruction(
           if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
               IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
              emit_MOV( func, 1, 0 );
-            emit_flr( func, 1 );
+            emit_flr( func, 2, 1 );
              /* dst.x = ex2(floor(src.x)) */
              if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
                 emit_MOV( func, 2, 1 );
-               emit_ex2( func, 2 );
+               emit_ex2( func, 3, 2 );
                 STORE( func, *inst, 2, 0, CHAN_X );
              }
              /* dst.y = src.x - floor(src.x) */
@@ -1369,7 +1880,7 @@ emit_instruction(
           }
           /* dst.z = ex2(src.x) */
           if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
-            emit_ex2( func, 0 );
+            emit_ex2( func, 3, 0 );
              STORE( func, *inst, 0, 0, CHAN_Z );
           }
        }
@@ -1387,21 +1898,21 @@ emit_instruction(
           FETCH( func, *inst, 0, 0, CHAN_X );
           emit_abs( func, 0 );
           emit_MOV( func, 1, 0 );
-         emit_lg2( func, 1 );
+         emit_lg2( func, 2, 1 );
           /* dst.z = lg2(abs(src.x)) */
           if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
              STORE( func, *inst, 1, 0, CHAN_Z );
           }
           if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
               IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
-            emit_flr( func, 1 );
+            emit_flr( func, 2, 1 );
              /* dst.x = floor(lg2(abs(src.x))) */
              if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
                 STORE( func, *inst, 1, 0, CHAN_X );
              }
              /* dst.y = abs(src.x)/ex2(floor(lg2(abs(src.x)))) */
              if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
-               emit_ex2( func, 1 );
+               emit_ex2( func, 2, 1 );
                 emit_rcp( func, 1, 1 );
                 emit_mul( func, 0, 1 );
                 STORE( func, *inst, 0, 0, CHAN_Y );
@@ -1576,7 +2087,18 @@ emit_instruction(
  
     case TGSI_OPCODE_DOT2ADD:
     /* TGSI_OPCODE_DP2A */
-      return 0;
+      FETCH( func, *inst, 0, 0, CHAN_X );  /* xmm0 = src[0].x */
+      FETCH( func, *inst, 1, 1, CHAN_X );  /* xmm1 = src[1].x */
+      emit_mul( func, 0, 1 );              /* xmm0 = xmm0 * xmm1 */
+      FETCH( func, *inst, 1, 0, CHAN_Y );  /* xmm1 = src[0].y */
+      FETCH( func, *inst, 2, 1, CHAN_Y );  /* xmm2 = src[1].y */
+      emit_mul( func, 1, 2 );              /* xmm1 = xmm1 * xmm2 */
+      emit_add( func, 0, 1 );              /* xmm0 = xmm0 + xmm1 */
+      FETCH( func, *inst, 1, 2, CHAN_X );  /* xmm1 = src[2].x */
+      emit_add( func, 0, 1 );              /* xmm0 = xmm0 + xmm1 */
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( func, *inst, 0, 0, chan_index );  /* dest[ch] = xmm0 */
+      }
        break;
  
     case TGSI_OPCODE_INDEX:
@@ -1591,7 +2113,7 @@ emit_instruction(
     /* TGSI_OPCODE_FRC */
        FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
           FETCH( func, *inst, 0, 0, chan_index );
-         emit_frc( func, 0 );
+         emit_frc( func, 0, 0 );
           STORE( func, *inst, 0, 0, chan_index );
        }
        break;
@@ -1604,19 +2126,23 @@ emit_instruction(
     /* TGSI_OPCODE_FLR */
        FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
           FETCH( func, *inst, 0, 0, chan_index );
-         emit_flr( func, 0 );
+         emit_flr( func, 0, 0 );
           STORE( func, *inst, 0, 0, chan_index );
        }
        break;
  
     case TGSI_OPCODE_ROUND:
-      return 0;
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         emit_rnd( func, 0, 0 );
+         STORE( func, *inst, 0, 0, chan_index );
+      }
        break;
  
     case TGSI_OPCODE_EXPBASE2:
     /* TGSI_OPCODE_EX2 */
        FETCH( func, *inst, 0, 0, CHAN_X );
-      emit_ex2( func, 0 );
+      emit_ex2( func, 0, 0 );
        FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
           STORE( func, *inst, 0, 0, chan_index );
        }
@@ -1625,7 +2151,7 @@ emit_instruction(
     case TGSI_OPCODE_LOGBASE2:
     /* TGSI_OPCODE_LG2 */
        FETCH( func, *inst, 0, 0, CHAN_X );
-      emit_lg2( func, 0 );
+      emit_lg2( func, 0, 0 );
        FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
           STORE( func, *inst, 0, 0, chan_index );
        }
@@ -1635,7 +2161,7 @@ emit_instruction(
     /* TGSI_OPCODE_POW */
        FETCH( func, *inst, 0, 0, CHAN_X );
        FETCH( func, *inst, 1, 1, CHAN_X );
-      emit_pow( func, 0, 1 );
+      emit_pow( func, 0, 0, 0, 1 );
        FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
           STORE( func, *inst, 0, 0, chan_index );
        }
@@ -1726,7 +2252,7 @@ emit_instruction(
  
     case TGSI_OPCODE_COS:
        FETCH( func, *inst, 0, 0, CHAN_X );
-      emit_cos( func, 0 );
+      emit_cos( func, 0, 0 );
        FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
           STORE( func, *inst, 0, 0, chan_index );
        }
@@ -1785,7 +2311,7 @@ emit_instruction(
  
     case TGSI_OPCODE_SIN:
        FETCH( func, *inst, 0, 0, CHAN_X );
-      emit_sin( func, 0 );
+      emit_sin( func, 0, 0 );
        FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
           STORE( func, *inst, 0, 0, chan_index );
        }
@@ -1804,21 +2330,7 @@ emit_instruction(
        break;
  
     case TGSI_OPCODE_TEX:
-      if (0) {
-        /* Disable dummy texture code: 
-         */
-        emit_tempf(
-           func,
-           0,
-           TEMP_ONE_I,
-           TEMP_ONE_C );
-        FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-           STORE( func, *inst, 0, 0, chan_index );
-        }
-      }
-      else {
-        return 0;
-      }
+      emit_tex( func, inst, FALSE, FALSE );
        break;
  
     case TGSI_OPCODE_TXD:
@@ -1850,7 +2362,12 @@ emit_instruction(
        break;
  
     case TGSI_OPCODE_ARR:
-      return 0;
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         emit_rnd( func, 0, 0 );
+         emit_f2it( func, 0 );
+         STORE( func, *inst, 0, 0, chan_index );
+      }
        break;
  
     case TGSI_OPCODE_BRA:
@@ -1869,7 +2386,12 @@ emit_instruction(
        break;
  
     case TGSI_OPCODE_SSG:
-      return 0;
+   /* TGSI_OPCODE_SGN */
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         emit_sgn( func, 0, 0 );
+         STORE( func, *inst, 0, 0, chan_index );
+      }
        break;
  
     case TGSI_OPCODE_CMP:
@@ -1879,12 +2401,12 @@ emit_instruction(
     case TGSI_OPCODE_SCS:
        IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
           FETCH( func, *inst, 0, 0, CHAN_X );
-         emit_cos( func, 0 );
+         emit_cos( func, 0, 0 );
           STORE( func, *inst, 0, 0, CHAN_X );
        }
        IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
           FETCH( func, *inst, 0, 0, CHAN_X );
-         emit_sin( func, 0 );
+         emit_sin( func, 0, 0 );
           STORE( func, *inst, 0, 0, CHAN_Y );
        }
        IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
@@ -1906,11 +2428,94 @@ emit_instruction(
        break;
  
     case TGSI_OPCODE_TXB:
-      return 0;
+      emit_tex( func, inst, TRUE, FALSE );
        break;
  
     case TGSI_OPCODE_NRM:
-      return 0;
+      /* fall-through */
+   case TGSI_OPCODE_NRM4:
+      /* 3 or 4-component normalization */
+      {
+         uint dims = (inst->Instruction.Opcode == TGSI_OPCODE_NRM) ? 3 : 4;
+
+         if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) ||
+             IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y) ||
+             IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z) ||
+             (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 4)) {
+
+            /* NOTE: Cannot use xmm regs 2/3 here (see emit_rsqrt() above). */
+
+            /* xmm4 = src.x */
+            /* xmm0 = src.x * src.x */
+            FETCH(func, *inst, 0, 0, CHAN_X);
+            if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
+               emit_MOV(func, 4, 0);
+            }
+            emit_mul(func, 0, 0);
+
+            /* xmm5 = src.y */
+            /* xmm0 = xmm0 + src.y * src.y */
+            FETCH(func, *inst, 1, 0, CHAN_Y);
+            if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
+               emit_MOV(func, 5, 1);
+            }
+            emit_mul(func, 1, 1);
+            emit_add(func, 0, 1);
+
+            /* xmm6 = src.z */
+            /* xmm0 = xmm0 + src.z * src.z */
+            FETCH(func, *inst, 1, 0, CHAN_Z);
+            if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
+               emit_MOV(func, 6, 1);
+            }
+            emit_mul(func, 1, 1);
+            emit_add(func, 0, 1);
+
+            if (dims == 4) {
+               /* xmm7 = src.w */
+               /* xmm0 = xmm0 + src.w * src.w */
+               FETCH(func, *inst, 1, 0, CHAN_W);
+               if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W)) {
+                  emit_MOV(func, 7, 1);
+               }
+               emit_mul(func, 1, 1);
+               emit_add(func, 0, 1);
+            }
+
+            /* xmm1 = 1 / sqrt(xmm0) */
+            emit_rsqrt(func, 1, 0);
+
+            /* dst.x = xmm1 * src.x */
+            if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
+               emit_mul(func, 4, 1);
+               STORE(func, *inst, 4, 0, CHAN_X);
+            }
+
+            /* dst.y = xmm1 * src.y */
+            if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
+               emit_mul(func, 5, 1);
+               STORE(func, *inst, 5, 0, CHAN_Y);
+            }
+
+            /* dst.z = xmm1 * src.z */
+            if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
+               emit_mul(func, 6, 1);
+               STORE(func, *inst, 6, 0, CHAN_Z);
+            }
+
+            /* dst.w = xmm1 * src.w (NRM4 only; xmm7 was loaded under the CHAN_W test) */
+            if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 4) {
+               emit_mul(func, 7, 1);
+               STORE(func, *inst, 7, 0, CHAN_W);
+            }
+         }
+
+         /* dst0.w = 1.0 */
+         if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 3) {
+            emit_tempf(func, 0, TEMP_ONE_I, TEMP_ONE_C);
+            STORE(func, *inst, 0, 0, CHAN_W);
+         }
+      }
        break;
  
     case TGSI_OPCODE_DIV:
@@ -1918,13 +2523,26 @@ emit_instruction(
        break;
  
     case TGSI_OPCODE_DP2:
-      return 0;
+      FETCH( func, *inst, 0, 0, CHAN_X );  /* xmm0 = src[0].x */
+      FETCH( func, *inst, 1, 1, CHAN_X );  /* xmm1 = src[1].x */
+      emit_mul( func, 0, 1 );              /* xmm0 = xmm0 * xmm1 */
+      FETCH( func, *inst, 1, 0, CHAN_Y );  /* xmm1 = src[0].y */
+      FETCH( func, *inst, 2, 1, CHAN_Y );  /* xmm2 = src[1].y */
+      emit_mul( func, 1, 2 );              /* xmm1 = xmm1 * xmm2 */
+      emit_add( func, 0, 1 );              /* xmm0 = xmm0 + xmm1 */
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( func, *inst, 0, 0, chan_index );  /* dest[ch] = xmm0 */
+      }
        break;
  
     case TGSI_OPCODE_TXL:
-      return 0;
+      emit_tex( func, inst, TRUE, FALSE );
        break;
  
+   case TGSI_OPCODE_TXP:
+      emit_tex( func, inst, FALSE, TRUE );
+      break;
+      
     case TGSI_OPCODE_BRK:
        return 0;
        break;
@@ -2104,7 +2722,7 @@ emit_declaration(
  
  static void aos_to_soa( struct x86_function *func, 
                          uint arg_aos,
-                        uint arg_soa, 
+                        uint arg_machine, 
                          uint arg_num, 
                          uint arg_stride )
  {
@@ -2119,7 +2737,10 @@ static void aos_to_soa( struct x86_function *func,
     x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
  
     x86_mov( func, aos_input,  x86_fn_arg( func, arg_aos ) );
-   x86_mov( func, soa_input,  x86_fn_arg( func, arg_soa ) );
+   x86_mov( func, soa_input,  x86_fn_arg( func, arg_machine ) );
+   x86_lea( func, soa_input,  
+           x86_make_disp( soa_input, 
+                          Offset(struct tgsi_exec_machine, Inputs) ) );
     x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) );
     x86_mov( func, stride,     x86_fn_arg( func, arg_stride ) );
  
@@ -2161,28 +2782,30 @@ static void aos_to_soa( struct x86_function *func,
     x86_jcc( func, cc_NE, inner_loop );
  
     /* Restore EBX */
-   x86_pop( func, aos_input );
+   x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
  }
  
-static void soa_to_aos( struct x86_function *func, uint aos, uint soa, uint num, uint stride )
+static void soa_to_aos( struct x86_function *func, 
+                       uint arg_aos, 
+                       uint arg_machine, 
+                       uint arg_num, 
+                       uint arg_stride )
  {
-   struct x86_reg soa_output;
-   struct x86_reg aos_output;
-   struct x86_reg num_outputs;
-   struct x86_reg temp;
+   struct x86_reg soa_output = x86_make_reg( file_REG32, reg_AX );
+   struct x86_reg aos_output = x86_make_reg( file_REG32, reg_BX );
+   struct x86_reg num_outputs = x86_make_reg( file_REG32, reg_CX );
+   struct x86_reg temp = x86_make_reg( file_REG32, reg_DX );
     int inner_loop;
  
-   soa_output = x86_make_reg( file_REG32, reg_AX );
-   aos_output = x86_make_reg( file_REG32, reg_BX );
-   num_outputs = x86_make_reg( file_REG32, reg_CX );
-   temp = x86_make_reg( file_REG32, reg_DX );
-
     /* Save EBX */
-   x86_push( func, aos_output );
+   x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
  
-   x86_mov( func, soa_output, x86_fn_arg( func, soa ) );
-   x86_mov( func, aos_output, x86_fn_arg( func, aos ) );
-   x86_mov( func, num_outputs, x86_fn_arg( func, num ) );
+   x86_mov( func, aos_output, x86_fn_arg( func, arg_aos ) );
+   x86_mov( func, soa_output, x86_fn_arg( func, arg_machine ) );
+   x86_lea( func, soa_output, 
+           x86_make_disp( soa_output, 
+                          Offset(struct tgsi_exec_machine, Outputs) ) );
+   x86_mov( func, num_outputs, x86_fn_arg( func, arg_num ) );
  
     /* do */
     inner_loop = x86_get_label( func );
@@ -2199,7 +2822,7 @@ static void soa_to_aos( struct x86_function *func, uint aos, uint soa, uint num,
        sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) );
        sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) );
  
-      x86_mov( func, temp, x86_fn_arg( func, stride ) );
+      x86_mov( func, temp, x86_fn_arg( func, arg_stride ) );
        x86_push( func, aos_output );
        sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
        sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
@@ -2223,20 +2846,13 @@ static void soa_to_aos( struct x86_function *func, uint aos, uint soa, uint num,
     x86_jcc( func, cc_NE, inner_loop );
  
     /* Restore EBX */
-   x86_pop( func, aos_output );
+   x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
  }
  
  /**
   * Translate a TGSI vertex/fragment shader to SSE2 code.
   * Slightly different things are done for vertex vs. fragment shaders.
   *
- * Note that fragment shaders are responsible for interpolating shader
- * inputs. Because on x86 we have only 4 GP registers, and here we
- * have 5 shader arguments (input, output, const, temp and coef), the
- * code is split into two phases -- DECLARATION and INSTRUCTION phase.
- * GP register holding the output argument is aliased with the coeff
- * argument, as outputs are not needed in the DECLARATION phase.
- *
   * \param tokens  the TGSI input shader
   * \param func  the output SSE code/function
   * \param immediates  buffer to place immediates, later passed to SSE func
@@ -2250,7 +2866,6 @@ tgsi_emit_sse2(
     boolean do_swizzles )
  {
     struct tgsi_parse_context parse;
-   boolean instruction_phase = FALSE;
     unsigned ok = 1;
     uint num_immediates = 0;
  
@@ -2262,74 +2877,48 @@ tgsi_emit_sse2(
  
     /* Can't just use EDI, EBX without save/restoring them:
      */
-   x86_push(
-      func,
-      get_immediate_base() );
-
-   x86_push(
-      func,
-      get_temp_base() );
-
+   x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
+   x86_push( func, x86_make_reg( file_REG32, reg_DI ) );
  
     /*
      * Different function args for vertex/fragment shaders:
      */
-   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
-      /* DECLARATION phase, do not load output argument. */
-      x86_mov(
-         func,
-         get_input_base(),
-         x86_fn_arg( func, 1 ) );
-      /* skipping outputs argument here */
-      x86_mov(
-         func,
-         get_const_base(),
-         x86_fn_arg( func, 3 ) );
-      x86_mov(
-         func,
-         get_temp_base(),
-         x86_fn_arg( func, 4 ) );
-      x86_mov(
-         func,
-         get_coef_base(),
-         x86_fn_arg( func, 5 ) );
-      x86_mov(
-         func,
-         get_immediate_base(),
-         x86_fn_arg( func, 6 ) );
-   }
-   else {
-      assert(parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX);
-
+   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
        if (do_swizzles)
           aos_to_soa( func, 
-                     6,         /* aos_input */
-                     1,         /* machine->input */
-                     7,         /* num_inputs */
-                     8 );       /* input_stride */
+                     4,         /* aos_input */
+                     1,         /* machine */
+                     5,         /* num_inputs */
+                     6 );       /* input_stride */
+   }
  
+   x86_mov(
+      func,
+      get_machine_base(),
+      x86_fn_arg( func, 1 ) );
+   x86_mov(
+      func,
+      get_const_base(),
+      x86_fn_arg( func, 2 ) );
+   x86_mov(
+      func,
+      get_immediate_base(),
+      x86_fn_arg( func, 3 ) );
+
+   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
        x86_mov(
-         func,
-         get_input_base(),
-         x86_fn_arg( func, 1 ) );
-      x86_mov(
-         func,
-         get_output_base(),
-         x86_fn_arg( func, 2 ) );
-      x86_mov(
-         func,
-         get_const_base(),
-         x86_fn_arg( func, 3 ) );
-      x86_mov(
-         func,
-         get_temp_base(),
-         x86_fn_arg( func, 4 ) );
+        func,
+        get_coef_base(),
+        x86_fn_arg( func, 4 ) );
+
        x86_mov(
-         func,
-         get_immediate_base(),
-         x86_fn_arg( func, 5 ) );
+        func,
+        get_sampler_base(),
+        x86_make_disp( get_machine_base(),
+                        Offset( struct tgsi_exec_machine, Samplers ) ) );
     }
  
+
     while( !tgsi_parse_end_of_tokens( &parse ) && ok ) {
        tgsi_parse_token( &parse );
  
@@ -2343,17 +2932,6 @@ tgsi_emit_sse2(
           break;
  
        case TGSI_TOKEN_TYPE_INSTRUCTION:
-         if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
-            if( !instruction_phase ) {
-               /* INSTRUCTION phase, overwrite coeff with output. */
-               instruction_phase = TRUE;
-               x86_mov(
-                  func,
-                  get_output_base(),
-                  x86_fn_arg( func, 2 ) );
-            }
-         }
-
           ok = emit_instruction(
              func,
              &parse.FullToken.FullInstruction );
@@ -2369,7 +2947,7 @@ tgsi_emit_sse2(
        case TGSI_TOKEN_TYPE_IMMEDIATE:
           /* simply copy the immediate values into the next immediates[] slot */
           {
-            const uint size = parse.FullToken.FullImmediate.Immediate.Size - 1;
+            const uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
              uint i;
              assert(size <= 4);
              assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES);
@@ -2397,18 +2975,17 @@ tgsi_emit_sse2(
  
     if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
        if (do_swizzles)
-         soa_to_aos( func, 9, 2, 10, 11 );
+         soa_to_aos( func, 
+                    7,         /* aos_output */
+                    1,         /* machine */
+                    8,         /* num_outputs */
+                    9 );       /* output_stride */
     }
  
     /* Can't just use EBX, EDI without save/restoring them:
      */
-   x86_pop(
-      func,
-      get_temp_base() );
-
-   x86_pop(
-      func,
-      get_immediate_base() );
+   x86_pop( func, x86_make_reg( file_REG32, reg_DI ) );
+   x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
  
     emit_ret( func );