i965: Share OPCODE_TXB between brw_wm_emit.c and brw_wm_glsl.c

[mesa.git] / src / mesa / drivers / dri / i965 / brw_wm_emit.c
diff --git a/src/mesa/drivers/dri/i965/brw_wm_emit.c b/src/mesa/drivers/dri/i965/brw_wm_emit.c

index 2e2885254ef615fbab6740256288498bf418c3b5..5390fd25849d16127d7e470f4f794cd25522ac65 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_wm_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_emit.c
@@ -34,8 +34,6 @@
  #include "brw_context.h"
  #include "brw_wm.h"
  
-#define SATURATE (1<<5)
-
  /* Not quite sure how correct this is - need to understand horiz
   * vs. vertical strides a little better.
   */
@@ -46,6 +44,7 @@ static INLINE struct brw_reg sechalf( struct brw_reg reg )
     return reg;
  }
  
+
  /* Payload R0:
   *
   * R0.0 -- pixel mask, one bit for each of 4 pixels in 4 tiles,
@@ -62,44 +61,50 @@ static INLINE struct brw_reg sechalf( struct brw_reg reg )
   * R1.8 -- ?
   */
  
-
-static void emit_pixel_xy(struct brw_compile *p,
-                         const struct brw_reg *dst,
-                         GLuint mask,
-                         const struct brw_reg *arg0)
+void emit_pixel_xy(struct brw_wm_compile *c,
+                  const struct brw_reg *dst,
+                  GLuint mask)
  {
+   struct brw_compile *p = &c->func;
     struct brw_reg r1 = brw_vec1_grf(1, 0);
     struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
+   struct brw_reg dst0_uw, dst1_uw;
  
+   brw_push_insn_state(p);
     brw_set_compression_control(p, BRW_COMPRESSION_NONE);
  
+   if (c->dispatch_width == 16) {
+      dst0_uw = vec16(retype(dst[0], BRW_REGISTER_TYPE_UW));
+      dst1_uw = vec16(retype(dst[1], BRW_REGISTER_TYPE_UW));
+   } else {
+      dst0_uw = vec8(retype(dst[0], BRW_REGISTER_TYPE_UW));
+      dst1_uw = vec8(retype(dst[1], BRW_REGISTER_TYPE_UW));
+   }
+
     /* Calculate pixel centers by adding 1 or 0 to each of the
      * micro-tile coordinates passed in r1.
      */
     if (mask & WRITEMASK_X) {
        brw_ADD(p,
-             vec16(retype(dst[0], BRW_REGISTER_TYPE_UW)),
+             dst0_uw,
               stride(suboffset(r1_uw, 4), 2, 4, 0),
               brw_imm_v(0x10101010));
     }
  
     if (mask & WRITEMASK_Y) {
        brw_ADD(p,
-             vec16(retype(dst[1], BRW_REGISTER_TYPE_UW)),
+             dst1_uw,
               stride(suboffset(r1_uw,5), 2, 4, 0),
               brw_imm_v(0x11001100));
     }
-
-   brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
+   brw_pop_insn_state(p);
  }
  
  
-
-static void emit_delta_xy(struct brw_compile *p,
-                         const struct brw_reg *dst,
-                         GLuint mask,
-                         const struct brw_reg *arg0,
-                         const struct brw_reg *arg1)
+void emit_delta_xy(struct brw_compile *p,
+                  const struct brw_reg *dst,
+                  GLuint mask,
+                  const struct brw_reg *arg0)
  {
     struct brw_reg r1 = brw_vec1_grf(1, 0);
  
@@ -122,10 +127,10 @@ static void emit_delta_xy(struct brw_compile *p,
     }
  }
  
-static void emit_wpos_xy(struct brw_wm_compile *c,
-                        const struct brw_reg *dst,
-                        GLuint mask,
-                        const struct brw_reg *arg0)
+void emit_wpos_xy(struct brw_wm_compile *c,
+                 const struct brw_reg *dst,
+                 GLuint mask,
+                 const struct brw_reg *arg0)
  {
     struct brw_compile *p = &c->func;
  
@@ -150,12 +155,14 @@ static void emit_wpos_xy(struct brw_wm_compile *c,
  }
  
  
-static void emit_pixel_w( struct brw_compile *p,
-                         const struct brw_reg *dst,
-                         GLuint mask,
-                         const struct brw_reg *arg0,
-                         const struct brw_reg *deltas)
+void emit_pixel_w(struct brw_wm_compile *c,
+                 const struct brw_reg *dst,
+                 GLuint mask,
+                 const struct brw_reg *arg0,
+                 const struct brw_reg *deltas)
  {
+   struct brw_compile *p = &c->func;
+
     /* Don't need this if all you are doing is interpolating color, for
      * instance.
      */
@@ -169,21 +176,29 @@ static void emit_pixel_w( struct brw_compile *p,
        brw_MAC(p, brw_message_reg(2), suboffset(interp3, 1), deltas[1]);
  
        /* Calc w */
-      brw_math_16( p, dst[3],
-                  BRW_MATH_FUNCTION_INV,
-                  BRW_MATH_SATURATE_NONE,
-                  2, brw_null_reg(),
-                  BRW_MATH_PRECISION_FULL);
+      if (c->dispatch_width == 16) {
+        brw_math_16(p, dst[3],
+                    BRW_MATH_FUNCTION_INV,
+                    BRW_MATH_SATURATE_NONE,
+                    2, brw_null_reg(),
+                    BRW_MATH_PRECISION_FULL);
+      } else {
+        brw_math(p, dst[3],
+                 BRW_MATH_FUNCTION_INV,
+                 BRW_MATH_SATURATE_NONE,
+                 2, brw_null_reg(),
+                 BRW_MATH_DATA_VECTOR,
+                 BRW_MATH_PRECISION_FULL);
+      }
     }
  }
  
  
-
-static void emit_linterp( struct brw_compile *p, 
-                        const struct brw_reg *dst,
-                        GLuint mask,
-                        const struct brw_reg *arg0,
-                        const struct brw_reg *deltas )
+void emit_linterp(struct brw_compile *p,
+                 const struct brw_reg *dst,
+                 GLuint mask,
+                 const struct brw_reg *arg0,
+                 const struct brw_reg *deltas)
  {
     struct brw_reg interp[4];
     GLuint nr = arg0[0].nr;
@@ -203,12 +218,12 @@ static void emit_linterp( struct brw_compile *p,
  }
  
  
-static void emit_pinterp( struct brw_compile *p, 
-                         const struct brw_reg *dst,
-                         GLuint mask,
-                         const struct brw_reg *arg0,
-                         const struct brw_reg *deltas,
-                         const struct brw_reg *w)
+void emit_pinterp(struct brw_compile *p,
+                 const struct brw_reg *dst,
+                 GLuint mask,
+                 const struct brw_reg *arg0,
+                 const struct brw_reg *deltas,
+                 const struct brw_reg *w)
  {
     struct brw_reg interp[4];
     GLuint nr = arg0[0].nr;
@@ -233,10 +248,10 @@ static void emit_pinterp( struct brw_compile *p,
  }
  
  
-static void emit_cinterp( struct brw_compile *p, 
-                        const struct brw_reg *dst,
-                        GLuint mask,
-                        const struct brw_reg *arg0 )
+void emit_cinterp(struct brw_compile *p,
+                 const struct brw_reg *dst,
+                 GLuint mask,
+                 const struct brw_reg *arg0)
  {
     struct brw_reg interp[4];
     GLuint nr = arg0[0].nr;
@@ -255,9 +270,9 @@ static void emit_cinterp( struct brw_compile *p,
  }
  
  /* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. */
-static void emit_frontfacing( struct brw_compile *p,
-                             const struct brw_reg *dst,
-                             GLuint mask )
+void emit_frontfacing(struct brw_compile *p,
+                     const struct brw_reg *dst,
+                     GLuint mask)
  {
     struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
     GLuint i;
@@ -283,13 +298,86 @@ static void emit_frontfacing( struct brw_compile *p,
     brw_set_predicate_control_flag_value(p, 0xff);
  }
  
-static void emit_alu1( struct brw_compile *p, 
-                      struct brw_instruction *(*func)(struct brw_compile *, 
-                                                      struct brw_reg, 
-                                                      struct brw_reg),
-                      const struct brw_reg *dst,
-                      GLuint mask,
-                      const struct brw_reg *arg0 )
+/* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
+ * looking like:
+ *
+ * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
+ *
+ * and we're trying to produce:
+ *
+ *           DDX                     DDY
+ * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
+ *      (ss0.tr - ss0.tl)     (ss0.tr - ss0.br)
+ *      (ss0.br - ss0.bl)     (ss0.tl - ss0.bl)
+ *      (ss0.br - ss0.bl)     (ss0.tr - ss0.br)
+ *      (ss1.tr - ss1.tl)     (ss1.tl - ss1.bl)
+ *      (ss1.tr - ss1.tl)     (ss1.tr - ss1.br)
+ *      (ss1.br - ss1.bl)     (ss1.tl - ss1.bl)
+ *      (ss1.br - ss1.bl)     (ss1.tr - ss1.br)
+ *
+ * and add another set of two more subspans if in 16-pixel dispatch mode.
+ *
+ * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
+ * for each pair, and vertstride = 2 jumps us 2 elements after processing a
+ * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
+ * between each other.  We could probably do it like ddx and swizzle the right
+ * order later, but bail for now and just produce
+ * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
+ */
+void emit_ddxy(struct brw_compile *p,
+              const struct brw_reg *dst,
+              GLuint mask,
+              GLboolean is_ddx,
+              const struct brw_reg *arg0)
+{
+   int i;
+   struct brw_reg src0, src1;
+
+   if (mask & SATURATE)
+      brw_set_saturate(p, 1);
+   for (i = 0; i < 4; i++ ) {
+      if (mask & (1<<i)) {
+        if (is_ddx) {
+           src0 = brw_reg(arg0[i].file, arg0[i].nr, 1,
+                          BRW_REGISTER_TYPE_F,
+                          BRW_VERTICAL_STRIDE_2,
+                          BRW_WIDTH_2,
+                          BRW_HORIZONTAL_STRIDE_0,
+                          BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
+           src1 = brw_reg(arg0[i].file, arg0[i].nr, 0,
+                          BRW_REGISTER_TYPE_F,
+                          BRW_VERTICAL_STRIDE_2,
+                          BRW_WIDTH_2,
+                          BRW_HORIZONTAL_STRIDE_0,
+                          BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
+        } else {
+           src0 = brw_reg(arg0[i].file, arg0[i].nr, 0,
+                          BRW_REGISTER_TYPE_F,
+                          BRW_VERTICAL_STRIDE_4,
+                          BRW_WIDTH_4,
+                          BRW_HORIZONTAL_STRIDE_0,
+                          BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
+           src1 = brw_reg(arg0[i].file, arg0[i].nr, 2,
+                          BRW_REGISTER_TYPE_F,
+                          BRW_VERTICAL_STRIDE_4,
+                          BRW_WIDTH_4,
+                          BRW_HORIZONTAL_STRIDE_0,
+                          BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
+        }
+        brw_ADD(p, dst[i], src0, negate(src1));
+      }
+   }
+   if (mask & SATURATE)
+      brw_set_saturate(p, 0);
+}
+
+void emit_alu1(struct brw_compile *p,
+              struct brw_instruction *(*func)(struct brw_compile *,
+                                              struct brw_reg,
+                                              struct brw_reg),
+              const struct brw_reg *dst,
+              GLuint mask,
+              const struct brw_reg *arg0)
  {
     GLuint i;
  
@@ -307,15 +395,15 @@ static void emit_alu1( struct brw_compile *p,
  }
  
  
-static void emit_alu2( struct brw_compile *p, 
-                      struct brw_instruction *(*func)(struct brw_compile *, 
-                                                      struct brw_reg, 
-                                                      struct brw_reg, 
-                                                      struct brw_reg),
-                      const struct brw_reg *dst,
-                      GLuint mask,
-                      const struct brw_reg *arg0,
-                      const struct brw_reg *arg1 )
+void emit_alu2(struct brw_compile *p,
+              struct brw_instruction *(*func)(struct brw_compile *,
+                                              struct brw_reg,
+                                              struct brw_reg,
+                                              struct brw_reg),
+              const struct brw_reg *dst,
+              GLuint mask,
+              const struct brw_reg *arg0,
+              const struct brw_reg *arg1)
  {
     GLuint i;
  
@@ -333,12 +421,12 @@ static void emit_alu2( struct brw_compile *p,
  }
  
  
-static void emit_mad( struct brw_compile *p, 
-                     const struct brw_reg *dst,
-                     GLuint mask,
-                     const struct brw_reg *arg0,
-                     const struct brw_reg *arg1,
-                     const struct brw_reg *arg2 )
+void emit_mad(struct brw_compile *p,
+             const struct brw_reg *dst,
+             GLuint mask,
+             const struct brw_reg *arg0,
+             const struct brw_reg *arg1,
+             const struct brw_reg *arg2)
  {
     GLuint i;
  
@@ -353,26 +441,12 @@ static void emit_mad( struct brw_compile *p,
     }
  }
  
-static void emit_trunc( struct brw_compile *p,
-                     const struct brw_reg *dst,
-                     GLuint mask,
-                     const struct brw_reg *arg0)
-{
-   GLuint i;
-
-   for (i = 0; i < 4; i++) {
-      if (mask & (1<<i)) {
-        brw_RNDZ(p, dst[i], arg0[i]);
-      }
-   }
-}
-
-static void emit_lrp( struct brw_compile *p, 
-                     const struct brw_reg *dst,
-                     GLuint mask,
-                     const struct brw_reg *arg0,
-                     const struct brw_reg *arg1,
-                     const struct brw_reg *arg2 )
+void emit_lrp(struct brw_compile *p,
+             const struct brw_reg *dst,
+             GLuint mask,
+             const struct brw_reg *arg0,
+             const struct brw_reg *arg1,
+             const struct brw_reg *arg2)
  {
     GLuint i;
  
@@ -392,21 +466,24 @@ static void emit_lrp( struct brw_compile *p,
     }
  }
  
-static void emit_sop( struct brw_compile *p, 
-                     const struct brw_reg *dst,
-                     GLuint mask,
-                     GLuint cond,
-                     const struct brw_reg *arg0,
-                     const struct brw_reg *arg1 )
+void emit_sop(struct brw_compile *p,
+             const struct brw_reg *dst,
+             GLuint mask,
+             GLuint cond,
+             const struct brw_reg *arg0,
+             const struct brw_reg *arg1)
  {
     GLuint i;
  
     for (i = 0; i < 4; i++) {
        if (mask & (1<<i)) {     
-        brw_MOV(p, dst[i], brw_imm_f(0));
+        brw_push_insn_state(p);
          brw_CMP(p, brw_null_reg(), cond, arg0[i], arg1[i]);
+        brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+        brw_MOV(p, dst[i], brw_imm_f(0));
+        brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
          brw_MOV(p, dst[i], brw_imm_f(1.0));
-        brw_set_predicate_control_flag_value(p, 0xff);
+        brw_pop_insn_state(p);
        }
     }
  }
@@ -490,11 +567,11 @@ static void emit_cmp( struct brw_compile *p,
     }
  }
  
-static void emit_max( struct brw_compile *p, 
-                     const struct brw_reg *dst,
-                     GLuint mask,
-                     const struct brw_reg *arg0,
-                     const struct brw_reg *arg1 )
+void emit_max(struct brw_compile *p,
+             const struct brw_reg *dst,
+             GLuint mask,
+             const struct brw_reg *arg0,
+             const struct brw_reg *arg1)
  {
     GLuint i;
  
@@ -514,11 +591,11 @@ static void emit_max( struct brw_compile *p,
     }
  }
  
-static void emit_min( struct brw_compile *p, 
-                     const struct brw_reg *dst,
-                     GLuint mask,
-                     const struct brw_reg *arg0,
-                     const struct brw_reg *arg1 )
+void emit_min(struct brw_compile *p,
+             const struct brw_reg *dst,
+             GLuint mask,
+             const struct brw_reg *arg0,
+             const struct brw_reg *arg1)
  {
     GLuint i;
  
@@ -539,73 +616,79 @@ static void emit_min( struct brw_compile *p,
  }
  
  
-static void emit_dp3( struct brw_compile *p, 
-                     const struct brw_reg *dst,
-                     GLuint mask,
-                     const struct brw_reg *arg0,
-                     const struct brw_reg *arg1 )
+void emit_dp3(struct brw_compile *p,
+             const struct brw_reg *dst,
+             GLuint mask,
+             const struct brw_reg *arg0,
+             const struct brw_reg *arg1)
  {
+   int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
+
     if (!(mask & WRITEMASK_XYZW))
        return; /* Do not emit dead code */
  
-   assert((mask & WRITEMASK_XYZW) == WRITEMASK_X);
+   assert(is_power_of_two(mask & WRITEMASK_XYZW));
  
     brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
     brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
  
     brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
-   brw_MAC(p, dst[0], arg0[2], arg1[2]);
+   brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]);
     brw_set_saturate(p, 0);
  }
  
  
-static void emit_dp4( struct brw_compile *p, 
-                     const struct brw_reg *dst,
-                     GLuint mask,
-                     const struct brw_reg *arg0,
-                     const struct brw_reg *arg1 )
+void emit_dp4(struct brw_compile *p,
+             const struct brw_reg *dst,
+             GLuint mask,
+             const struct brw_reg *arg0,
+             const struct brw_reg *arg1)
  {
+   int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
+
     if (!(mask & WRITEMASK_XYZW))
        return; /* Do not emit dead code */
  
-   assert((mask & WRITEMASK_XYZW) == WRITEMASK_X);
+   assert(is_power_of_two(mask & WRITEMASK_XYZW));
  
     brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
     brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
     brw_MAC(p, brw_null_reg(), arg0[2], arg1[2]);
  
     brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
-   brw_MAC(p, dst[0], arg0[3], arg1[3]);
+   brw_MAC(p, dst[dst_chan], arg0[3], arg1[3]);
     brw_set_saturate(p, 0);
  }
  
  
-static void emit_dph( struct brw_compile *p, 
-                     const struct brw_reg *dst,
-                     GLuint mask,
-                     const struct brw_reg *arg0,
-                     const struct brw_reg *arg1 )
+void emit_dph(struct brw_compile *p,
+             const struct brw_reg *dst,
+             GLuint mask,
+             const struct brw_reg *arg0,
+             const struct brw_reg *arg1)
  {
+   const int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
+
     if (!(mask & WRITEMASK_XYZW))
        return; /* Do not emit dead code */
  
-   assert((mask & WRITEMASK_XYZW) == WRITEMASK_X);
+   assert(is_power_of_two(mask & WRITEMASK_XYZW));
  
     brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
     brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
-   brw_MAC(p, dst[0], arg0[2], arg1[2]);
+   brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]);
  
     brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
-   brw_ADD(p, dst[0], dst[0], arg1[3]);
+   brw_ADD(p, dst[dst_chan], dst[dst_chan], arg1[3]);
     brw_set_saturate(p, 0);
  }
  
  
-static void emit_xpd( struct brw_compile *p, 
-                     const struct brw_reg *dst,
-                     GLuint mask,
-                     const struct brw_reg *arg0,
-                     const struct brw_reg *arg1 )
+void emit_xpd(struct brw_compile *p,
+             const struct brw_reg *dst,
+             GLuint mask,
+             const struct brw_reg *arg0,
+             const struct brw_reg *arg1)
  {
     GLuint i;
  
@@ -626,207 +709,303 @@ static void emit_xpd( struct brw_compile *p,
  }
  
  
-static void emit_math1( struct brw_compile *p, 
-                       GLuint function,
-                       const struct brw_reg *dst,
-                       GLuint mask,
-                       const struct brw_reg *arg0 )
+void emit_math1(struct brw_wm_compile *c,
+               GLuint function,
+               const struct brw_reg *dst,
+               GLuint mask,
+               const struct brw_reg *arg0)
  {
+   struct brw_compile *p = &c->func;
+   int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
+   GLuint saturate = ((mask & SATURATE) ?
+                     BRW_MATH_SATURATE_SATURATE :
+                     BRW_MATH_SATURATE_NONE);
+
     if (!(mask & WRITEMASK_XYZW))
        return; /* Do not emit dead code */
  
-   //assert((mask & WRITEMASK_XYZW) == WRITEMASK_X ||
-   //    function == BRW_MATH_FUNCTION_SINCOS);
-   
+   assert(is_power_of_two(mask & WRITEMASK_XYZW));
+
+   /* If compressed, this will write message reg 2,3 from arg0.x's 16
+    * channels.
+    */
     brw_MOV(p, brw_message_reg(2), arg0[0]);
  
     /* Send two messages to perform all 16 operations:
      */
-   brw_math_16(p, 
-              dst[0],
+   brw_push_insn_state(p);
+   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+   brw_math(p,
+           dst[dst_chan],
+           function,
+           saturate,
+           2,
+           brw_null_reg(),
+           BRW_MATH_DATA_VECTOR,
+           BRW_MATH_PRECISION_FULL);
+
+   if (c->dispatch_width == 16) {
+      brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
+      brw_math(p,
+              offset(dst[dst_chan],1),
                function,
-              (mask & SATURATE) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
-              2,
+              saturate,
+              3,
                brw_null_reg(),
+              BRW_MATH_DATA_VECTOR,
                BRW_MATH_PRECISION_FULL);
+   }
+   brw_pop_insn_state(p);
  }
  
  
-static void emit_math2( struct brw_compile *p, 
-                       GLuint function,
-                       const struct brw_reg *dst,
-                       GLuint mask,
-                       const struct brw_reg *arg0,
-                       const struct brw_reg *arg1)
+void emit_math2(struct brw_wm_compile *c,
+               GLuint function,
+               const struct brw_reg *dst,
+               GLuint mask,
+               const struct brw_reg *arg0,
+               const struct brw_reg *arg1)
  {
+   struct brw_compile *p = &c->func;
+   int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
+   GLuint saturate = ((mask & SATURATE) ?
+                     BRW_MATH_SATURATE_SATURATE :
+                     BRW_MATH_SATURATE_NONE);
+
     if (!(mask & WRITEMASK_XYZW))
        return; /* Do not emit dead code */
  
-   assert((mask & WRITEMASK_XYZW) == WRITEMASK_X);
+   assert(is_power_of_two(mask & WRITEMASK_XYZW));
  
     brw_push_insn_state(p);
  
     brw_set_compression_control(p, BRW_COMPRESSION_NONE);
     brw_MOV(p, brw_message_reg(2), arg0[0]);
-   brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
-   brw_MOV(p, brw_message_reg(4), sechalf(arg0[0]));
+   if (c->dispatch_width == 16) {
+      brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
+      brw_MOV(p, brw_message_reg(4), sechalf(arg0[0]));
+   }
  
     brw_set_compression_control(p, BRW_COMPRESSION_NONE);
     brw_MOV(p, brw_message_reg(3), arg1[0]);
-   brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
-   brw_MOV(p, brw_message_reg(5), sechalf(arg1[0]));
+   if (c->dispatch_width == 16) {
+      brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
+      brw_MOV(p, brw_message_reg(5), sechalf(arg1[0]));
+   }
  
-   
-   /* Send two messages to perform all 16 operations:
-    */
     brw_set_compression_control(p, BRW_COMPRESSION_NONE);
     brw_math(p, 
-           dst[0],
+           dst[dst_chan],
             function,
-           (mask & SATURATE) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
+           saturate,
             2,
             brw_null_reg(),
             BRW_MATH_DATA_VECTOR,
             BRW_MATH_PRECISION_FULL);
  
-   brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
-   brw_math(p, 
-           offset(dst[0],1),
-           function,
-           (mask & SATURATE) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
-           4,
-           brw_null_reg(),
-           BRW_MATH_DATA_VECTOR,
-           BRW_MATH_PRECISION_FULL);
-   
+   /* Send two messages to perform all 16 operations:
+    */
+   if (c->dispatch_width == 16) {
+      brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
+      brw_math(p,
+              offset(dst[dst_chan],1),
+              function,
+              saturate,
+              4,
+              brw_null_reg(),
+              BRW_MATH_DATA_VECTOR,
+              BRW_MATH_PRECISION_FULL);
+   }
     brw_pop_insn_state(p);
  }
-                    
  
  
-static void emit_tex( struct brw_wm_compile *c,
-                     const struct brw_wm_instruction *inst,
-                     struct brw_reg *dst,
-                     GLuint dst_flags,
-                     struct brw_reg *arg )
+void emit_tex(struct brw_wm_compile *c,
+             struct brw_reg *dst,
+             GLuint dst_flags,
+             struct brw_reg *arg,
+             struct brw_reg depth_payload,
+             GLuint tex_idx,
+             GLuint sampler,
+             GLboolean shadow)
  {
     struct brw_compile *p = &c->func;
-   GLuint msgLength, responseLength;
-   GLuint i, nr;
+   struct brw_reg dst_retyped;
+   GLuint cur_mrf = 2, response_length;
+   GLuint i, nr_texcoords;
     GLuint emit;
     GLuint msg_type;
+   GLuint mrf_per_channel;
+   GLuint simd_mode;
+
+   if (c->dispatch_width == 16) {
+      mrf_per_channel = 2;
+      response_length = 8;
+      dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW);
+      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
+   } else {
+      mrf_per_channel = 1;
+      response_length = 4;
+      dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW);
+      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
+   }
  
     /* How many input regs are there?
      */
-   switch (inst->tex_idx) {
+   switch (tex_idx) {
     case TEXTURE_1D_INDEX:
        emit = WRITEMASK_X;
-      nr = 1;
+      nr_texcoords = 1;
        break;
     case TEXTURE_2D_INDEX:
     case TEXTURE_RECT_INDEX:
        emit = WRITEMASK_XY;
-      nr = 2;
+      nr_texcoords = 2;
        break;
-   default:
+   case TEXTURE_3D_INDEX:
+   case TEXTURE_CUBE_INDEX:
        emit = WRITEMASK_XYZ;
-      nr = 3;
+      nr_texcoords = 3;
        break;
+   default:
+      /* unexpected target */
+      abort();
     }
  
-   if (inst->tex_shadow) {
-      nr = 4;
-      emit |= WRITEMASK_W;
-   }
+   /* Pre-Ironlake, the 8-wide sampler always took u,v,r. */
+   if (!BRW_IS_IGDNG(p->brw) && c->dispatch_width == 8)
+      nr_texcoords = 3;
  
-   msgLength = 1;
+   /* For shadow comparisons, we have to supply u,v,r. */
+   if (shadow)
+      nr_texcoords = 3;
  
-   for (i = 0; i < nr; i++) {
-      static const GLuint swz[4] = {0,1,2,2};
-      if (emit & (1<<i)) 
-        brw_MOV(p, brw_message_reg(msgLength+1), arg[swz[i]]);
+   /* Emit the texcoords. */
+   for (i = 0; i < nr_texcoords; i++) {
+      if (emit & (1<<i))
+        brw_MOV(p, brw_message_reg(cur_mrf), arg[i]);
        else
-        brw_MOV(p, brw_message_reg(msgLength+1), brw_imm_f(0));
-      msgLength += 2;
+        brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
+      cur_mrf += mrf_per_channel;
     }
  
-   responseLength = 8;         /* always */
+   /* Fill in the shadow comparison reference value. */
+   if (shadow) {
+      if (BRW_IS_IGDNG(p->brw)) {
+        /* Fill in the cube map array index value. */
+        brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
+        cur_mrf += mrf_per_channel;
+      } else if (c->dispatch_width == 8) {
+        /* Fill in the LOD bias value. */
+        brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
+        cur_mrf += mrf_per_channel;
+      }
+      brw_MOV(p, brw_message_reg(cur_mrf), arg[2]);
+      cur_mrf += mrf_per_channel;
+   }
  
     if (BRW_IS_IGDNG(p->brw)) {
-       if (inst->tex_shadow)
-           msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE_IGDNG;
-       else
-           msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_IGDNG;
+      if (shadow)
+        msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_COMPARE_IGDNG;
+      else
+        msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_IGDNG;
     } else {
-       if (inst->tex_shadow)
-           msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE;
-       else
-           msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE;
+      /* Note that G45 and older determines shadow compare and dispatch width
+       * from message length for most messages.
+       */
+      if (c->dispatch_width == 16 && shadow)
+        msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE;
+      else
+        msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE;
     }
  
-   brw_SAMPLE(p, 
-             retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW),
+   brw_SAMPLE(p,
+             dst_retyped,
               1,
-             retype(c->payload.depth[0].hw_reg, BRW_REGISTER_TYPE_UW),
-              SURF_INDEX_TEXTURE(inst->tex_unit),
-             inst->tex_unit,     /* sampler */
-             inst->writemask,
-             msg_type, 
-             responseLength,
-             msgLength,
-             0,        
+             retype(depth_payload, BRW_REGISTER_TYPE_UW),
+              SURF_INDEX_TEXTURE(sampler),
+             sampler,
+             dst_flags & WRITEMASK_XYZW,
+             msg_type,
+             response_length,
+             cur_mrf - 1,
+             0,
               1,
-             BRW_SAMPLER_SIMD_MODE_SIMD16);    
+             simd_mode);
  }
  
  
-static void emit_txb( struct brw_wm_compile *c,
-                     const struct brw_wm_instruction *inst,
-                     struct brw_reg *dst,
-                     GLuint dst_flags,
-                     struct brw_reg *arg )
+void emit_txb(struct brw_wm_compile *c,
+             struct brw_reg *dst,
+             GLuint dst_flags,
+             struct brw_reg *arg,
+             struct brw_reg depth_payload,
+             GLuint tex_idx,
+             GLuint sampler)
  {
     struct brw_compile *p = &c->func;
     GLuint msgLength;
     GLuint msg_type;
-   /* Shadow ignored for txb.
+   GLuint mrf_per_channel;
+   GLuint response_length;
+   struct brw_reg dst_retyped;
+
+   /* The G45 and older chipsets don't support 8-wide dispatch for LOD biased
+    * samples, so we'll use the 16-wide instruction, leave the second halves
+    * undefined, and trust the execution mask to keep the undefined pixels
+    * from mattering.
      */
-   switch (inst->tex_idx) {
+   if (c->dispatch_width == 16 || !BRW_IS_IGDNG(p->brw)) {
+      if (BRW_IS_IGDNG(p->brw))
+        msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_IGDNG;
+      else
+        msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
+      mrf_per_channel = 2;
+      dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW);
+      response_length = 8;
+   } else {
+      msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_IGDNG;
+      mrf_per_channel = 1;
+      dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW);
+      response_length = 4;
+   }
+
+   /* Shadow ignored for txb. */
+   switch (tex_idx) {
     case TEXTURE_1D_INDEX:
-      brw_MOV(p, brw_message_reg(2), arg[0]);
-      brw_MOV(p, brw_message_reg(4), brw_imm_f(0));
-      brw_MOV(p, brw_message_reg(6), brw_imm_f(0));
+      brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
+      brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), brw_imm_f(0));
+      brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0));
        break;
     case TEXTURE_2D_INDEX:
     case TEXTURE_RECT_INDEX:
-      brw_MOV(p, brw_message_reg(2), arg[0]);
-      brw_MOV(p, brw_message_reg(4), arg[1]);
-      brw_MOV(p, brw_message_reg(6), brw_imm_f(0));
+      brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
+      brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]);
+      brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0));
        break;
-   default:
-      brw_MOV(p, brw_message_reg(2), arg[0]);
-      brw_MOV(p, brw_message_reg(4), arg[1]);
-      brw_MOV(p, brw_message_reg(6), arg[2]);
+   case TEXTURE_3D_INDEX:
+   case TEXTURE_CUBE_INDEX:
+      brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
+      brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]);
+      brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), arg[2]);
        break;
+   default:
+      /* unexpected target */
+      abort();
     }
  
-   brw_MOV(p, brw_message_reg(8), arg[3]);
-   msgLength = 9;
-
-   if (BRW_IS_IGDNG(p->brw))
-       msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS_IGDNG;
-   else
-       msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
+   brw_MOV(p, brw_message_reg(2 + 3 * mrf_per_channel), arg[3]);
+   msgLength = 2 + 4 * mrf_per_channel - 1;
  
     brw_SAMPLE(p, 
-             retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW),
+             dst_retyped,
               1,
-             retype(c->payload.depth[0].hw_reg, BRW_REGISTER_TYPE_UW),
-              SURF_INDEX_TEXTURE(inst->tex_unit),
-             inst->tex_unit,     /* sampler */
-             inst->writemask,
+             retype(depth_payload, BRW_REGISTER_TYPE_UW),
+              SURF_INDEX_TEXTURE(sampler),
+             sampler,
+             dst_flags & WRITEMASK_XYZW,
               msg_type,
-             8,                /* responseLength */
+             response_length,
               msgLength,
               0,        
               1,
@@ -834,11 +1013,13 @@ static void emit_txb( struct brw_wm_compile *c,
  }
  
  
-static void emit_lit( struct brw_compile *p, 
-                     const struct brw_reg *dst,
-                     GLuint mask,
-                     const struct brw_reg *arg0 )
+static void emit_lit(struct brw_wm_compile *c,
+                    const struct brw_reg *dst,
+                    GLuint mask,
+                    const struct brw_reg *arg0)
  {
+   struct brw_compile *p = &c->func;
+
     assert((mask & WRITEMASK_XW) == 0);
  
     if (mask & WRITEMASK_Y) {
@@ -848,7 +1029,7 @@ static void emit_lit( struct brw_compile *p,
     }
  
     if (mask & WRITEMASK_Z) {
-      emit_math2(p, BRW_MATH_FUNCTION_POW,
+      emit_math2(c, BRW_MATH_FUNCTION_POW,
                  &dst[2],
                  WRITEMASK_X | (mask & SATURATE),
                  &arg0[1],
@@ -893,6 +1074,20 @@ static void emit_kil( struct brw_wm_compile *c,
     }
  }
  
+/* KIL_NV kills the pixels that are currently executing, not based on a test
+ * of the arguments.
+ */
+static void emit_kil_nv( struct brw_wm_compile *c )
+{
+   struct brw_compile *p = &c->func;
+   struct brw_reg r0uw = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
+
+   brw_push_insn_state(p);
+   brw_set_mask_control(p, BRW_MASK_DISABLE);
+   brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); //IMASK
+   brw_AND(p, r0uw, c->emit_mask_reg, r0uw);
+   brw_pop_insn_state(p);
+}
  
  static void fire_fb_write( struct brw_wm_compile *c,
                            GLuint base_reg,
@@ -901,7 +1096,13 @@ static void fire_fb_write( struct brw_wm_compile *c,
                            GLuint eot )
  {
     struct brw_compile *p = &c->func;
-   
+   struct brw_reg dst;
+
+   if (c->dispatch_width == 16)
+      dst = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
+   else
+      dst = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
+
     /* Pass through control information:
      */
  /*  mov (8) m1.0<1>:ud   r1.0<8;8,1>:ud   { Align1 NoMask } */
@@ -918,7 +1119,7 @@ static void fire_fb_write( struct brw_wm_compile *c,
     /* Send framebuffer write message: */
  /*  send (16) null.0<1>:uw m0               r0.0<8;8,1>:uw   0x85a04000:ud    { Align1 EOT } */
     brw_fb_WRITE(p,
-               retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW),
+               dst,
                 base_reg,
                 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
                 target,         
@@ -950,14 +1151,15 @@ static void emit_aa( struct brw_wm_compile *c,
   * \param arg1  the pass-through depth value
   * \param arg2  the shader-computed depth value
   */
-static void emit_fb_write( struct brw_wm_compile *c,
-                          struct brw_reg *arg0,
-                          struct brw_reg *arg1,
-                          struct brw_reg *arg2,
-                          GLuint target,
-                          GLuint eot)
+void emit_fb_write(struct brw_wm_compile *c,
+                  struct brw_reg *arg0,
+                  struct brw_reg *arg1,
+                  struct brw_reg *arg2,
+                  GLuint target,
+                  GLuint eot)
  {
     struct brw_compile *p = &c->func;
+   struct brw_context *brw = p->brw;
     GLuint nr = 2;
     GLuint channel;
  
@@ -969,30 +1171,37 @@ static void emit_fb_write( struct brw_wm_compile *c,
     /* I don't really understand how this achieves the color interleave
      * (ie RGBARGBA) in the result:  [Do the saturation here]
      */
-   {
-      brw_push_insn_state(p);
-      
-      for (channel = 0; channel < 4; channel++) {
+   brw_push_insn_state(p);
+
+   for (channel = 0; channel < 4; channel++) {
+      if (c->dispatch_width == 16 && (BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw))) {
+        /* By setting the high bit of the MRF register number, we indicate
+         * that we want COMPR4 mode - instead of doing the usual destination
+         * + 1 for the second half we get destination + 4.
+         */
+        brw_MOV(p,
+                brw_message_reg(nr + channel + (1 << 7)),
+                arg0[channel]);
+      } else {
          /*  mov (8) m2.0<1>:ud   r28.0<8;8,1>:ud  { Align1 } */
          /*  mov (8) m6.0<1>:ud   r29.0<8;8,1>:ud  { Align1 SecHalf } */
-
          brw_set_compression_control(p, BRW_COMPRESSION_NONE);
          brw_MOV(p,
                  brw_message_reg(nr + channel),
                  arg0[channel]);
-       
-        brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
-        brw_MOV(p,
-                brw_message_reg(nr + channel + 4),
-                sechalf(arg0[channel]));
-      }
  
-      /* skip over the regs populated above:
-       */
-      nr += 8;
-   
-      brw_pop_insn_state(p);
+        if (c->dispatch_width == 16) {
+           brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
+           brw_MOV(p,
+                   brw_message_reg(nr + channel + 4),
+                   sechalf(arg0[channel]));
+        }
+      }
     }
+   /* skip over the regs populated above:
+    */
+   nr += 8;
+   brw_pop_insn_state(p);
  
     if (c->key.source_depth_to_render_target)
     {
@@ -1042,7 +1251,7 @@ static void emit_fb_write( struct brw_wm_compile *c,
               get_element_ud(brw_vec8_grf(1,0), 6), 
               brw_imm_ud(1<<26)); 
  
-      jmp = brw_JMPI(p, ip, ip, brw_imm_d(0));
+      jmp = brw_JMPI(p, ip, ip, brw_imm_w(0));
        {
          emit_aa(c, arg1, 2);
          fire_fb_write(c, 0, nr, target, eot);
@@ -1056,7 +1265,6 @@ static void emit_fb_write( struct brw_wm_compile *c,
     }
  }
  
-
  /**
   * Move a GPR to scratch memory. 
   */
@@ -1194,11 +1402,11 @@ void brw_wm_emit( struct brw_wm_compile *c )
          /* Generated instructions for calculating triangle interpolants:
           */
        case WM_PIXELXY:
-        emit_pixel_xy(p, dst, dst_flags, args[0]);
+        emit_pixel_xy(c, dst, dst_flags);
          break;
  
        case WM_DELTAXY:
-        emit_delta_xy(p, dst, dst_flags, args[0], args[1]);
+        emit_delta_xy(p, dst, dst_flags, args[0]);
          break;
  
        case WM_WPOSXY:
@@ -1206,7 +1414,7 @@ void brw_wm_emit( struct brw_wm_compile *c )
          break;
  
        case WM_PIXELW:
-        emit_pixel_w(p, dst, dst_flags, args[0], args[1]);
+        emit_pixel_w(c, dst, dst_flags, args[0], args[1]);
          break;
  
        case WM_LINTERP:
@@ -1243,6 +1451,14 @@ void brw_wm_emit( struct brw_wm_compile *c )
          emit_alu1(p, brw_RNDD, dst, dst_flags, args[0]);
          break;
  
+      case OPCODE_DDX:
+        emit_ddxy(p, dst, dst_flags, GL_TRUE, args[0]);
+        break;
+
+      case OPCODE_DDY:
+        emit_ddxy(p, dst, dst_flags, GL_FALSE, args[0]);
+        break;
+
        case OPCODE_DP3:
          emit_dp3(p, dst, dst_flags, args[0], args[1]);
          break;
@@ -1256,7 +1472,7 @@ void brw_wm_emit( struct brw_wm_compile *c )
          break;
  
        case OPCODE_TRUNC:
-        emit_trunc(p, dst, dst_flags, args[0]);
+        emit_alu1(p, brw_RNDZ, dst, dst_flags, args[0]);
          break;
  
        case OPCODE_LRP:
@@ -1283,27 +1499,27 @@ void brw_wm_emit( struct brw_wm_compile *c )
          /* Higher math functions:
           */
        case OPCODE_RCP:
-        emit_math1(p, BRW_MATH_FUNCTION_INV, dst, dst_flags, args[0]);
+        emit_math1(c, BRW_MATH_FUNCTION_INV, dst, dst_flags, args[0]);
          break;
  
        case OPCODE_RSQ:
-        emit_math1(p, BRW_MATH_FUNCTION_RSQ, dst, dst_flags, args[0]);
+        emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, dst_flags, args[0]);
          break;
  
        case OPCODE_SIN:
-        emit_math1(p, BRW_MATH_FUNCTION_SIN, dst, dst_flags, args[0]);
+        emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, dst_flags, args[0]);
          break;
  
        case OPCODE_COS:
-        emit_math1(p, BRW_MATH_FUNCTION_COS, dst, dst_flags, args[0]);
+        emit_math1(c, BRW_MATH_FUNCTION_COS, dst, dst_flags, args[0]);
          break;
  
        case OPCODE_EX2:
-        emit_math1(p, BRW_MATH_FUNCTION_EXP, dst, dst_flags, args[0]);
+        emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, dst_flags, args[0]);
          break;
  
        case OPCODE_LG2:
-        emit_math1(p, BRW_MATH_FUNCTION_LOG, dst, dst_flags, args[0]);
+        emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, dst_flags, args[0]);
          break;
  
        case OPCODE_SCS:
@@ -1311,13 +1527,13 @@ void brw_wm_emit( struct brw_wm_compile *c )
           * fixup for 16-element execution.
           */
          if (dst_flags & WRITEMASK_X)
-           emit_math1(p, BRW_MATH_FUNCTION_COS, dst, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
+           emit_math1(c, BRW_MATH_FUNCTION_COS, dst, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
          if (dst_flags & WRITEMASK_Y)
-           emit_math1(p, BRW_MATH_FUNCTION_SIN, dst+1, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
+           emit_math1(c, BRW_MATH_FUNCTION_SIN, dst+1, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
          break;
  
        case OPCODE_POW:
-        emit_math2(p, BRW_MATH_FUNCTION_POW, dst, dst_flags, args[0], args[1]);
+        emit_math2(c, BRW_MATH_FUNCTION_POW, dst, dst_flags, args[0], args[1]);
          break;
  
          /* Comparisons:
@@ -1355,23 +1571,30 @@ void brw_wm_emit( struct brw_wm_compile *c )
         break;
  
        case OPCODE_LIT:
-        emit_lit(p, dst, dst_flags, args[0]);
+        emit_lit(c, dst, dst_flags, args[0]);
          break;
  
          /* Texturing operations:
           */
        case OPCODE_TEX:
-        emit_tex(c, inst, dst, dst_flags, args[0]);
+        emit_tex(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
+                 inst->tex_idx, inst->tex_unit,
+                 inst->tex_shadow);
          break;
  
        case OPCODE_TXB:
-        emit_txb(c, inst, dst, dst_flags, args[0]);
+        emit_txb(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
+                 inst->tex_idx, inst->tex_unit);
          break;
  
        case OPCODE_KIL:
          emit_kil(c, args[0]);
          break;
  
+      case OPCODE_KIL_NV:
+        emit_kil_nv(c);
+        break;
+
        default:
          _mesa_printf("Unsupported opcode %i (%s) in fragment shader\n",
                       inst->opcode, inst->opcode < MAX_OPCODE ?