X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fmesa%2Fdrivers%2Fdri%2Fi965%2Fbrw_wm_emit.c;h=96fecc97ee2bf1c03eae446d3b169336db7e3be8;hb=b2ddb93ff3b8c88682634ccdef247967e31fab84;hp=f316e0cda49b205a36318057d16d3d7a0619cc43;hpb=b01ffb12ed82dcf435b186e2aff35e344a8a6ebe;p=mesa.git

diff --git a/src/mesa/drivers/dri/i965/brw_wm_emit.c b/src/mesa/drivers/dri/i965/brw_wm_emit.c
index f316e0cda49..96fecc97ee2 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_emit.c
@@ -34,6 +34,23 @@
 #include "brw_context.h"
 #include "brw_wm.h"
 
+static GLboolean can_do_pln(struct intel_context *intel,
+			    const struct brw_reg *deltas)
+{
+   struct brw_context *brw = brw_context(&intel->ctx);
+
+   if (!brw->has_pln)
+      return GL_FALSE;
+
+   if (deltas[1].nr != deltas[0].nr + 1)
+      return GL_FALSE;
+
+   if (intel->gen < 6 && ((deltas[0].nr & 1) != 0))
+      return GL_FALSE;
+
+   return GL_TRUE;
+}
+
 /* Not quite sure how correct this is - need to understand horiz
  * vs. vertical strides a little better.
  */
@@ -44,8 +61,53 @@ static INLINE struct brw_reg sechalf( struct brw_reg reg )
    return reg;
 }
 
+/* Return the SrcReg index of the channels that can be immediate float operands
+ * instead of usage of PROGRAM_CONSTANT values through push/pull.
+ */
+GLboolean
+brw_wm_arg_can_be_immediate(enum prog_opcode opcode, int arg)
+{
+   int opcode_array[] = {
+      [OPCODE_ADD] = 2,
+      [OPCODE_CMP] = 3,
+      [OPCODE_DP3] = 2,
+      [OPCODE_DP4] = 2,
+      [OPCODE_DPH] = 2,
+      [OPCODE_MAX] = 2,
+      [OPCODE_MIN] = 2,
+      [OPCODE_MOV] = 1,
+      [OPCODE_MUL] = 2,
+      [OPCODE_SEQ] = 2,
+      [OPCODE_SGE] = 2,
+      [OPCODE_SGT] = 2,
+      [OPCODE_SLE] = 2,
+      [OPCODE_SLT] = 2,
+      [OPCODE_SNE] = 2,
+      [OPCODE_SWZ] = 1,
+      [OPCODE_XPD] = 2,
+   };
+
+   /* These opcodes get broken down in a way that allow two
+    * args to be immediates.
+    */
+   if (opcode == OPCODE_MAD || opcode == OPCODE_LRP) {
+      if (arg == 1 || arg == 2)
+	 return GL_TRUE;
+   }
+
+   if (opcode > ARRAY_SIZE(opcode_array))
+      return GL_FALSE;
+
+   return arg == opcode_array[opcode] - 1;
+}
 
-/* Payload R0:
+/**
+ * Computes the screen-space x,y position of the pixels.
+ *
+ * This will be used by emit_delta_xy() or emit_wpos_xy() for
+ * interpolation of attributes..
+ *
+ * Payload R0:
  *
  * R0.0 -- pixel mask, one bit for each of 4 pixels in 4 tiles,
  *         corresponding to each of the 16 execution channels.
@@ -60,7 +122,6 @@ static INLINE struct brw_reg sechalf( struct brw_reg reg )
  * R1.7 -- ?
  * R1.8 -- ?
  */
-
 void emit_pixel_xy(struct brw_wm_compile *c,
 		   const struct brw_reg *dst,
 		   GLuint mask)
@@ -100,33 +161,58 @@ void emit_pixel_xy(struct brw_wm_compile *c,
    brw_pop_insn_state(p);
 }
 
-
+/**
+ * Computes the screen-space x,y distance of the pixels from the start
+ * vertex.
+ *
+ * This will be used in linterp or pinterp with the start vertex value
+ * and the Cx, Cy, and C0 coefficients passed in from the setup engine
+ * to produce interpolated attribute values.
+ */
 void emit_delta_xy(struct brw_compile *p,
 		   const struct brw_reg *dst,
 		   GLuint mask,
 		   const struct brw_reg *arg0)
 {
+   struct intel_context *intel = &p->brw->intel;
    struct brw_reg r1 = brw_vec1_grf(1, 0);
 
-   /* Calc delta X,Y by subtracting origin in r1 from the pixel
-    * centers.
-    */
-   if (mask & WRITEMASK_X) {
-      brw_ADD(p,
-	      dst[0],
-	      retype(arg0[0], BRW_REGISTER_TYPE_UW),
-	      negate(r1));
-   }
-
-   if (mask & WRITEMASK_Y) {
-      brw_ADD(p,
-	      dst[1],
-	      retype(arg0[1], BRW_REGISTER_TYPE_UW),
-	      negate(suboffset(r1,1)));
+   if (mask == 0)
+      return;
 
+   assert(mask == WRITEMASK_XY);
+
+   if (intel->gen >= 6) {
+       /* XXX Gen6 WM doesn't have Xstart/Ystart in payload r1.0/r1.1.
+	  Just add them with 0.0 for dst reg.. */
+       r1 = brw_imm_v(0x00000000);
+       brw_ADD(p,
+	       dst[0],
+	       retype(arg0[0], BRW_REGISTER_TYPE_UW),
+	       r1);
+       brw_ADD(p,
+	       dst[1],
+	       retype(arg0[1], BRW_REGISTER_TYPE_UW),
+	       r1);
+       return;
    }
+
+   /* Calc delta X,Y by subtracting origin in r1 from the pixel
+    * centers produced by emit_pixel_xy().
+    */
+   brw_ADD(p,
+	   dst[0],
+	   retype(arg0[0], BRW_REGISTER_TYPE_UW),
+	   negate(r1));
+   brw_ADD(p,
+	   dst[1],
+	   retype(arg0[1], BRW_REGISTER_TYPE_UW),
+	   negate(suboffset(r1,1)));
 }
 
+/**
+ * Computes the pixel offset from the window origin for gl_FragCoord().
+ */
 void emit_wpos_xy(struct brw_wm_compile *c,
 		  const struct brw_reg *dst,
 		  GLuint mask,
@@ -134,23 +220,44 @@ void emit_wpos_xy(struct brw_wm_compile *c,
 {
    struct brw_compile *p = &c->func;
 
-   /* Calculate the pixel offset from window bottom left into destination
-    * X and Y channels.
-    */
    if (mask & WRITEMASK_X) {
-      /* X' = X - origin */
-      brw_ADD(p,
-	      dst[0],
-	      retype(arg0[0], BRW_REGISTER_TYPE_W),
-	      brw_imm_d(0 - c->key.origin_x));
+      if (c->fp->program.PixelCenterInteger) {
+	 /* X' = X */
+	 brw_MOV(p,
+		 dst[0],
+		 retype(arg0[0], BRW_REGISTER_TYPE_W));
+      } else {
+	 /* X' = X + 0.5 */
+	 brw_ADD(p,
+		 dst[0],
+		 retype(arg0[0], BRW_REGISTER_TYPE_W),
+		 brw_imm_f(0.5));
+      }
    }
 
    if (mask & WRITEMASK_Y) {
-      /* Y' = height - (Y - origin_y) = height + origin_y - Y */
-      brw_ADD(p,
-	      dst[1],
-	      negate(retype(arg0[1], BRW_REGISTER_TYPE_W)),
-	      brw_imm_d(c->key.origin_y + c->key.drawable_height - 1));
+      if (c->fp->program.OriginUpperLeft) {
+	 if (c->fp->program.PixelCenterInteger) {
+	    /* Y' = Y */
+	    brw_MOV(p,
+		    dst[1],
+		    retype(arg0[1], BRW_REGISTER_TYPE_W));
+	 } else {
+	    /* Y' = Y + 0.5 */
+	    brw_ADD(p,
+		    dst[1],
+		    retype(arg0[1], BRW_REGISTER_TYPE_W),
+		    brw_imm_f(0.5));
+	 }
+      } else {
+	 float center_offset = c->fp->program.PixelCenterInteger ? 0.0 : 0.5;
+
+	 /* Y' = (height - 1) - Y + center */
+	 brw_ADD(p,
+		 dst[1],
+		 negate(retype(arg0[1], BRW_REGISTER_TYPE_W)),
+		 brw_imm_f(c->key.drawable_height - 1 + center_offset));
+      }
    }
 }
 
@@ -162,6 +269,16 @@ void emit_pixel_w(struct brw_wm_compile *c,
 		  const struct brw_reg *deltas)
 {
    struct brw_compile *p = &c->func;
+   struct intel_context *intel = &p->brw->intel;
+   struct brw_reg src;
+   struct brw_reg temp_dst;
+
+   if (intel->gen >= 6)
+	temp_dst = dst[3];
+   else
+	temp_dst = brw_message_reg(2);
+
+   assert(intel->gen < 6);
 
    /* Don't need this if all you are doing is interpolating color, for
     * instance.
@@ -172,34 +289,43 @@ void emit_pixel_w(struct brw_wm_compile *c,
       /* Calc 1/w - just linterp wpos[3] optimized by putting the
        * result straight into a message reg.
        */
-      brw_LINE(p, brw_null_reg(), interp3, deltas[0]);
-      brw_MAC(p, brw_message_reg(2), suboffset(interp3, 1), deltas[1]);
+      if (can_do_pln(intel, deltas)) {
+	 brw_PLN(p, temp_dst, interp3, deltas[0]);
+      } else {
+	 brw_LINE(p, brw_null_reg(), interp3, deltas[0]);
+	 brw_MAC(p, temp_dst, suboffset(interp3, 1), deltas[1]);
+      }
 
       /* Calc w */
+      if (intel->gen >= 6)
+	 src = temp_dst;
+      else
+	 src = brw_null_reg();
+
       if (c->dispatch_width == 16) {
 	 brw_math_16(p, dst[3],
 		     BRW_MATH_FUNCTION_INV,
 		     BRW_MATH_SATURATE_NONE,
-		     2, brw_null_reg(),
+		     2, src,
 		     BRW_MATH_PRECISION_FULL);
       } else {
 	 brw_math(p, dst[3],
 		  BRW_MATH_FUNCTION_INV,
 		  BRW_MATH_SATURATE_NONE,
-		  2, brw_null_reg(),
+		  2, src,
 		  BRW_MATH_DATA_VECTOR,
 		  BRW_MATH_PRECISION_FULL);
       }
    }
 }
 
-
 void emit_linterp(struct brw_compile *p,
 		  const struct brw_reg *dst,
 		  GLuint mask,
 		  const struct brw_reg *arg0,
 		  const struct brw_reg *deltas)
 {
+   struct intel_context *intel = &p->brw->intel;
    struct brw_reg interp[4];
    GLuint nr = arg0[0].nr;
    GLuint i;
@@ -211,8 +337,14 @@ void emit_linterp(struct brw_compile *p,
 
    for (i = 0; i < 4; i++) {
       if (mask & (1<<i)) {
-	 brw_LINE(p, brw_null_reg(), interp[i], deltas[0]);
-	 brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]);
+	 if (intel->gen >= 6) {
+	    brw_PLN(p, dst[i], interp[i], brw_vec8_grf(2, 0));
+	 } else if (can_do_pln(intel, deltas)) {
+	    brw_PLN(p, dst[i], interp[i], deltas[0]);
+	 } else {
+	    brw_LINE(p, brw_null_reg(), interp[i], deltas[0]);
+	    brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]);
+	 }
       }
    }
 }
@@ -225,10 +357,16 @@ void emit_pinterp(struct brw_compile *p,
 		  const struct brw_reg *deltas,
 		  const struct brw_reg *w)
 {
+   struct intel_context *intel = &p->brw->intel;
    struct brw_reg interp[4];
    GLuint nr = arg0[0].nr;
    GLuint i;
 
+   if (intel->gen >= 6) {
+      emit_linterp(p, dst, mask, arg0, interp);
+      return;
+   }
+
    interp[0] = brw_vec1_grf(nr, 0);
    interp[1] = brw_vec1_grf(nr, 4);
    interp[2] = brw_vec1_grf(nr+1, 0);
@@ -236,8 +374,12 @@ void emit_pinterp(struct brw_compile *p,
 
    for (i = 0; i < 4; i++) {
       if (mask & (1<<i)) {
-	 brw_LINE(p, brw_null_reg(), interp[i], deltas[0]);
-	 brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]);
+	 if (can_do_pln(intel, deltas)) {
+	    brw_PLN(p, dst[i], interp[i], deltas[0]);
+	 } else {
+	    brw_LINE(p, brw_null_reg(), interp[i], deltas[0]);
+	    brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]);
+	 }
       }
    }
    for (i = 0; i < 4; i++) {
@@ -542,31 +684,49 @@ static void emit_sne( struct brw_compile *p,
    emit_sop(p, dst, mask, BRW_CONDITIONAL_NEQ, arg0, arg1);
 }
 
-static void emit_cmp( struct brw_compile *p, 
-		      const struct brw_reg *dst,
-		      GLuint mask,
-		      const struct brw_reg *arg0,
-		      const struct brw_reg *arg1,
-		      const struct brw_reg *arg2 )
+void emit_cmp(struct brw_compile *p,
+	      const struct brw_reg *dst,
+	      GLuint mask,
+	      const struct brw_reg *arg0,
+	      const struct brw_reg *arg1,
+	      const struct brw_reg *arg2)
 {
    GLuint i;
 
    for (i = 0; i < 4; i++) {
       if (mask & (1<<i)) {	
-	 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
-	 brw_MOV(p, dst[i], arg2[i]);
-	 brw_set_saturate(p, 0);
-
 	 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], brw_imm_f(0));
 
 	 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
-	 brw_MOV(p, dst[i], arg1[i]);
+	 brw_SEL(p, dst[i], arg1[i], arg2[i]);
 	 brw_set_saturate(p, 0);
 	 brw_set_predicate_control_flag_value(p, 0xff);
       }
    }
 }
 
+void emit_sign(struct brw_compile *p,
+	       const struct brw_reg *dst,
+	       GLuint mask,
+	       const struct brw_reg *arg0)
+{
+   GLuint i;
+
+   for (i = 0; i < 4; i++) {
+      if (mask & (1<<i)) {
+	 brw_MOV(p, dst[i], brw_imm_f(0.0));
+
+	 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], brw_imm_f(0));
+	 brw_MOV(p, dst[i], brw_imm_f(-1.0));
+	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+
+	 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, arg0[i], brw_imm_f(0));
+	 brw_MOV(p, dst[i], brw_imm_f(1.0));
+	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+      }
+   }
+}
+
 void emit_max(struct brw_compile *p,
 	      const struct brw_reg *dst,
 	      GLuint mask,
@@ -577,14 +737,10 @@ void emit_max(struct brw_compile *p,
 
    for (i = 0; i < 4; i++) {
       if (mask & (1<<i)) {	
-	 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
-	 brw_MOV(p, dst[i], arg0[i]);
-	 brw_set_saturate(p, 0);
-
-	 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], arg1[i]);
+	 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0[i], arg1[i]);
 
 	 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
-	 brw_MOV(p, dst[i], arg1[i]);
+	 brw_SEL(p, dst[i], arg0[i], arg1[i]);
 	 brw_set_saturate(p, 0);
 	 brw_set_predicate_control_flag_value(p, 0xff);
       }
@@ -601,14 +757,10 @@ void emit_min(struct brw_compile *p,
 
    for (i = 0; i < 4; i++) {
       if (mask & (1<<i)) {	
-	 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
-	 brw_MOV(p, dst[i], arg1[i]);
-	 brw_set_saturate(p, 0);
-
 	 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], arg1[i]);
 
 	 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
-	 brw_MOV(p, dst[i], arg0[i]);
+	 brw_SEL(p, dst[i], arg0[i], arg1[i]);
 	 brw_set_saturate(p, 0);
 	 brw_set_predicate_control_flag_value(p, 0xff);
       }
@@ -616,6 +768,27 @@ void emit_min(struct brw_compile *p,
 }
 
 
+void emit_dp2(struct brw_compile *p,
+	      const struct brw_reg *dst,
+	      GLuint mask,
+	      const struct brw_reg *arg0,
+	      const struct brw_reg *arg1)
+{
+   int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
+
+   if (!(mask & WRITEMASK_XYZW))
+      return; /* Do not emit dead code */
+
+   assert(is_power_of_two(mask & WRITEMASK_XYZW));
+
+   brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
+
+   brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
+   brw_MAC(p, dst[dst_chan], arg0[1], arg1[1]);
+   brw_set_saturate(p, 0);
+}
+
+
 void emit_dp3(struct brw_compile *p,
 	      const struct brw_reg *dst,
 	      GLuint mask,
@@ -716,21 +889,29 @@ void emit_math1(struct brw_wm_compile *c,
 		const struct brw_reg *arg0)
 {
    struct brw_compile *p = &c->func;
+   struct intel_context *intel = &p->brw->intel;
    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
    GLuint saturate = ((mask & SATURATE) ?
 		      BRW_MATH_SATURATE_SATURATE :
 		      BRW_MATH_SATURATE_NONE);
+   struct brw_reg src;
+
+   if (intel->gen >= 6 && (arg0[0].hstride == BRW_HORIZONTAL_STRIDE_0 ||
+			   arg0[0].file != BRW_GENERAL_REGISTER_FILE)) {
+      /* Gen6 math requires that source and dst horizontal stride be 1,
+       * and that the argument be in the GRF.
+       */
+      src = dst[dst_chan];
+      brw_MOV(p, src, arg0[0]);
+   } else {
+      src = arg0[0];
+   }
 
    if (!(mask & WRITEMASK_XYZW))
       return; /* Do not emit dead code */
 
    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 
-   /* If compressed, this will write message reg 2,3 from arg0.x's 16
-    * channels.
-    */
-   brw_MOV(p, brw_message_reg(2), arg0[0]);
-
    /* Send two messages to perform all 16 operations:
     */
    brw_push_insn_state(p);
@@ -740,7 +921,7 @@ void emit_math1(struct brw_wm_compile *c,
 	    function,
 	    saturate,
 	    2,
-	    brw_null_reg(),
+	    src,
 	    BRW_MATH_DATA_VECTOR,
 	    BRW_MATH_PRECISION_FULL);
 
@@ -751,7 +932,7 @@ void emit_math1(struct brw_wm_compile *c,
 	       function,
 	       saturate,
 	       3,
-	       brw_null_reg(),
+	       sechalf(src),
 	       BRW_MATH_DATA_VECTOR,
 	       BRW_MATH_PRECISION_FULL);
    }
@@ -767,10 +948,8 @@ void emit_math2(struct brw_wm_compile *c,
 		const struct brw_reg *arg1)
 {
    struct brw_compile *p = &c->func;
+   struct intel_context *intel = &p->brw->intel;
    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
-   GLuint saturate = ((mask & SATURATE) ?
-		      BRW_MATH_SATURATE_SATURATE :
-		      BRW_MATH_SATURATE_NONE);
 
    if (!(mask & WRITEMASK_XYZW))
       return; /* Do not emit dead code */
@@ -779,42 +958,103 @@ void emit_math2(struct brw_wm_compile *c,
 
    brw_push_insn_state(p);
 
-   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
-   brw_MOV(p, brw_message_reg(2), arg0[0]);
-   if (c->dispatch_width == 16) {
-      brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
-      brw_MOV(p, brw_message_reg(4), sechalf(arg0[0]));
-   }
+   /* math can only operate on up to a vec8 at a time, so in
+    * dispatch_width==16 we have to do the second half manually.
+    */
+   if (intel->gen >= 6) {
+      struct brw_reg src0 = arg0[0];
+      struct brw_reg src1 = arg1[0];
+      struct brw_reg temp_dst = dst[dst_chan];
+
+      if (arg0[0].hstride == BRW_HORIZONTAL_STRIDE_0) {
+	 if (arg1[0].hstride == BRW_HORIZONTAL_STRIDE_0) {
+	    /* Both scalar arguments.  Do scalar calc. */
+	    src0.hstride = BRW_HORIZONTAL_STRIDE_1;
+	    src1.hstride = BRW_HORIZONTAL_STRIDE_1;
+	    temp_dst.hstride = BRW_HORIZONTAL_STRIDE_1;
+	    temp_dst.width = BRW_WIDTH_1;
+
+	    if (arg0[0].subnr != 0) {
+	       brw_MOV(p, temp_dst, src0);
+	       src0 = temp_dst;
+
+	       /* Ouch.  We've used the temp as a dst, and we still
+		* need a temp to store arg1 in, because src and dst
+		* offsets have to be equal.  Leaving this up to
+		* glsl2-965 to handle correctly.
+		*/
+	       assert(arg1[0].subnr == 0);
+	    } else if (arg1[0].subnr != 0) {
+	       brw_MOV(p, temp_dst, src1);
+	       src1 = temp_dst;
+	    }
+	 } else {
+	    brw_MOV(p, temp_dst, src0);
+	    src0 = temp_dst;
+	 }
+      } else if (arg1[0].hstride == BRW_HORIZONTAL_STRIDE_0) {
+	 brw_MOV(p, temp_dst, src1);
+	 src1 = temp_dst;
+      }
 
-   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
-   brw_MOV(p, brw_message_reg(3), arg1[0]);
-   if (c->dispatch_width == 16) {
-      brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
-      brw_MOV(p, brw_message_reg(5), sechalf(arg1[0]));
-   }
+      brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
+      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+      brw_math2(p,
+		temp_dst,
+		function,
+		src0,
+		src1);
+      if (c->dispatch_width == 16) {
+	 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
+	 brw_math2(p,
+		   sechalf(temp_dst),
+		   function,
+		   sechalf(src0),
+		   sechalf(src1));
+      }
 
-   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
-   brw_math(p, 
-	    dst[dst_chan],
-	    function,
-	    saturate,
-	    2,
-	    brw_null_reg(),
-	    BRW_MATH_DATA_VECTOR,
-	    BRW_MATH_PRECISION_FULL);
+      /* Splat a scalar result into all the channels. */
+      if (arg0[0].hstride == BRW_HORIZONTAL_STRIDE_0 &&
+	  arg1[0].hstride == BRW_HORIZONTAL_STRIDE_0) {
+	 temp_dst.hstride = BRW_HORIZONTAL_STRIDE_0;
+	 temp_dst.vstride = BRW_VERTICAL_STRIDE_0;
+	 brw_MOV(p, dst[dst_chan], temp_dst);
+      }
+   } else {
+      GLuint saturate = ((mask & SATURATE) ?
+			 BRW_MATH_SATURATE_SATURATE :
+			 BRW_MATH_SATURATE_NONE);
 
-   /* Send two messages to perform all 16 operations:
-    */
-   if (c->dispatch_width == 16) {
-      brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
+      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+      brw_MOV(p, brw_message_reg(3), arg1[0]);
+      if (c->dispatch_width == 16) {
+	 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
+	 brw_MOV(p, brw_message_reg(5), sechalf(arg1[0]));
+      }
+
+      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
       brw_math(p,
-	       offset(dst[dst_chan],1),
+	       dst[dst_chan],
 	       function,
 	       saturate,
-	       4,
-	       brw_null_reg(),
+	       2,
+	       arg0[0],
 	       BRW_MATH_DATA_VECTOR,
 	       BRW_MATH_PRECISION_FULL);
+
+      /* Send two messages to perform all 16 operations:
+       */
+      if (c->dispatch_width == 16) {
+	 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
+	 brw_math(p,
+		  offset(dst[dst_chan],1),
+		  function,
+		  saturate,
+		  4,
+		  sechalf(arg0[0]),
+		  BRW_MATH_DATA_VECTOR,
+		  BRW_MATH_PRECISION_FULL);
+      }
    }
    brw_pop_insn_state(p);
 }
@@ -874,7 +1114,7 @@ void emit_tex(struct brw_wm_compile *c,
    }
 
    /* Pre-Ironlake, the 8-wide sampler always took u,v,r. */
-   if (!intel->is_ironlake && c->dispatch_width == 8)
+   if (intel->gen < 5 && c->dispatch_width == 8)
       nr_texcoords = 3;
 
    /* For shadow comparisons, we have to supply u,v,r. */
@@ -892,7 +1132,7 @@ void emit_tex(struct brw_wm_compile *c,
 
    /* Fill in the shadow comparison reference value. */
    if (shadow) {
-      if (intel->is_ironlake) {
+      if (intel->gen >= 5) {
 	 /* Fill in the cube map array index value. */
 	 brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
 	 cur_mrf += mrf_per_channel;
@@ -905,11 +1145,11 @@ void emit_tex(struct brw_wm_compile *c,
       cur_mrf += mrf_per_channel;
    }
 
-   if (intel->is_ironlake) {
+   if (intel->gen >= 5) {
       if (shadow)
-	 msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_COMPARE_IGDNG;
+	 msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_COMPARE_GEN5;
       else
-	 msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_IGDNG;
+	 msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_GEN5;
    } else {
       /* Note that G45 and older determines shadow compare and dispatch width
        * from message length for most messages.
@@ -957,16 +1197,16 @@ void emit_txb(struct brw_wm_compile *c,
     * undefined, and trust the execution mask to keep the undefined pixels
     * from mattering.
     */
-   if (c->dispatch_width == 16 || !intel->is_ironlake) {
-      if (intel->is_ironlake)
-	 msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_IGDNG;
+   if (c->dispatch_width == 16 || intel->gen < 5) {
+      if (intel->gen >= 5)
+	 msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_GEN5;
       else
 	 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
       mrf_per_channel = 2;
       dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW);
       response_length = 8;
    } else {
-      msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_IGDNG;
+      msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_GEN5;
       mrf_per_channel = 1;
       dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW);
       response_length = 4;
@@ -1062,11 +1302,19 @@ static void emit_kil( struct brw_wm_compile *c,
 {
    struct brw_compile *p = &c->func;
    struct brw_reg r0uw = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
-   GLuint i;
-   
-   /* XXX - usually won't need 4 compares!
-    */
+   GLuint i, j;
+
    for (i = 0; i < 4; i++) {
+      /* Check if we've already done the comparison for this reg
+       * -- common when someone does KIL TEMP.wwww.
+       */
+      for (j = 0; j < i; j++) {
+	 if (memcmp(&arg0[j], &arg0[i], sizeof(arg0[0])) == 0)
+	    break;
+      }
+      if (j != i)
+	 continue;
+
       brw_push_insn_state(p);
       brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0[i], brw_imm_f(0));   
       brw_set_predicate_control_flag_value(p, 0xff);
@@ -1079,7 +1327,7 @@ static void emit_kil( struct brw_wm_compile *c,
 /* KIL_NV kills the pixels that are currently executing, not based on a test
  * of the arguments.
  */
-static void emit_kil_nv( struct brw_wm_compile *c )
+void emit_kil_nv( struct brw_wm_compile *c )
 {
    struct brw_compile *p = &c->func;
    struct brw_reg r0uw = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
@@ -1098,6 +1346,7 @@ static void fire_fb_write( struct brw_wm_compile *c,
 			   GLuint eot )
 {
    struct brw_compile *p = &c->func;
+   struct intel_context *intel = &p->brw->intel;
    struct brw_reg dst;
 
    if (c->dispatch_width == 16)
@@ -1108,6 +1357,7 @@ static void fire_fb_write( struct brw_wm_compile *c,
    /* Pass through control information:
     */
 /*  mov (8) m1.0<1>:ud   r1.0<8;8,1>:ud   { Align1 NoMask } */
+   if (intel->gen < 6) /* gen6, use headerless for fb write */
    {
       brw_push_insn_state(p);
       brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
@@ -1121,6 +1371,7 @@ static void fire_fb_write( struct brw_wm_compile *c,
    /* Send framebuffer write message: */
 /*  send (16) null.0<1>:uw m0               r0.0<8;8,1>:uw   0x85a04000:ud    { Align1 EOT } */
    brw_fb_WRITE(p,
+		c->dispatch_width,
 		dst,
 		base_reg,
 		retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
@@ -1162,8 +1413,10 @@ void emit_fb_write(struct brw_wm_compile *c,
 {
    struct brw_compile *p = &c->func;
    struct brw_context *brw = p->brw;
+   struct intel_context *intel = &brw->intel;
    GLuint nr = 2;
    GLuint channel;
+   int base_reg; /* For gen6 fb write with no header, starting from color payload directly!. */
 
    /* Reserve a space for AA - may not be needed:
     */
@@ -1175,14 +1428,45 @@ void emit_fb_write(struct brw_wm_compile *c,
     */
    brw_push_insn_state(p);
 
+   if (intel->gen >= 6)
+	base_reg = nr;
+   else
+	base_reg = 0;
+
    for (channel = 0; channel < 4; channel++) {
-      if (c->dispatch_width == 16 && brw->has_compr4) {
-	 /* By setting the high bit of the MRF register number, we indicate
+      if (intel->gen >= 6) {
+	 /* gen6 SIMD16 single source DP write looks like:
+	  * m + 0: r0
+	  * m + 1: r1
+	  * m + 2: g0
+	  * m + 3: g1
+	  * m + 4: b0
+	  * m + 5: b1
+	  * m + 6: a0
+	  * m + 7: a1
+	  */
+	 if (c->dispatch_width == 16) {
+	    brw_MOV(p, brw_message_reg(nr + channel * 2), arg0[channel]);
+	 } else {
+	    brw_MOV(p, brw_message_reg(nr + channel), arg0[channel]);
+	 }
+      } else if (c->dispatch_width == 16 && brw->has_compr4) {
+	 /* pre-gen6 SIMD16 single source DP write looks like:
+	  * m + 0: r0
+	  * m + 1: g0
+	  * m + 2: b0
+	  * m + 3: a0
+	  * m + 4: r1
+	  * m + 5: g1
+	  * m + 6: b1
+	  * m + 7: a1
+	  *
+	  * By setting the high bit of the MRF register number, we indicate
 	  * that we want COMPR4 mode - instead of doing the usual destination
 	  * + 1 for the second half we get destination + 4.
 	  */
 	 brw_MOV(p,
-		 brw_message_reg(nr + channel + (1 << 7)),
+		 brw_message_reg(nr + channel + BRW_MRF_COMPR4),
 		 arg0[channel]);
       } else {
 	 /*  mov (8) m2.0<1>:ud   r28.0<8;8,1>:ud  { Align1 } */
@@ -1202,7 +1486,11 @@ void emit_fb_write(struct brw_wm_compile *c,
    }
    /* skip over the regs populated above:
     */
-   nr += 8;
+   if (c->dispatch_width == 16)
+      nr += 8;
+   else
+      nr += 4;
+
    brw_pop_insn_state(p);
 
    if (c->key.source_depth_to_render_target)
@@ -1235,11 +1523,16 @@ void emit_fb_write(struct brw_wm_compile *c,
       nr += 2;
    }
 
+   if (intel->gen >= 6) {
+      /* Subtract off the message header, since we send headerless. */
+      nr -= 2;
+   }
+
    if (!c->key.runtime_check_aads_emit) {
       if (c->key.aa_dest_stencil_reg)
 	 emit_aa(c, arg1, 2);
 
-      fire_fb_write(c, 0, nr, target, eot);
+      fire_fb_write(c, base_reg, nr, target, eot);
    }
    else {
       struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
@@ -1285,9 +1578,7 @@ static void emit_spill( struct brw_wm_compile *c,
      mov (1) r0.2<1>:d    0x00000080:d     { Align1 NoMask }
      send (16) null.0<1>:uw m1               r0.0<8;8,1>:uw   0x053003ff:ud    { Align1 }
    */
-   brw_dp_WRITE_16(p, 
-		   retype(vec16(brw_vec8_grf(0, 0)), BRW_REGISTER_TYPE_UW),
-		   slot);
+   brw_oword_block_write_scratch(p, brw_message_reg(1), 2, slot);
 }
 
 
@@ -1312,9 +1603,7 @@ static void emit_unspill( struct brw_wm_compile *c,
      send (16) r110.0<1>:uw m1               r0.0<8;8,1>:uw   0x041243ff:ud    { Align1 }
    */
 
-   brw_dp_READ_16(p,
-		  retype(vec16(reg), BRW_REGISTER_TYPE_UW),
-		  slot);
+   brw_oword_block_read(p, vec16(reg), brw_message_reg(1), 2, slot);
 }
 
 
@@ -1364,9 +1653,12 @@ static void spill_values( struct brw_wm_compile *c,
 void brw_wm_emit( struct brw_wm_compile *c )
 {
    struct brw_compile *p = &c->func;
+   struct intel_context *intel = &p->brw->intel;
    GLuint insn;
 
    brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
+   if (intel->gen >= 6)
+	brw_set_acc_write_control(p, 1);
 
    /* Check if any of the payload regs need to be spilled:
     */
@@ -1461,6 +1753,10 @@ void brw_wm_emit( struct brw_wm_compile *c )
 	 emit_ddxy(p, dst, dst_flags, GL_FALSE, args[0]);
 	 break;
 
+      case OPCODE_DP2:
+	 emit_dp2(p, dst, dst_flags, args[0], args[1]);
+	 break;
+
       case OPCODE_DP3:
 	 emit_dp3(p, dst, dst_flags, args[0], args[1]);
 	 break;
@@ -1474,7 +1770,11 @@ void brw_wm_emit( struct brw_wm_compile *c )
 	 break;
 
       case OPCODE_TRUNC:
-	 emit_alu1(p, brw_RNDZ, dst, dst_flags, args[0]);
+	 for (i = 0; i < 4; i++) {
+	    if (dst_flags & (1<<i)) {
+	       brw_RNDZ(p, dst[i], args[0][i]);
+	    }
+	 }
 	 break;
 
       case OPCODE_LRP:
@@ -1572,6 +1872,10 @@ void brw_wm_emit( struct brw_wm_compile *c )
 	 emit_sne(p, dst, dst_flags, args[0], args[1]);
 	break;
 
+      case OPCODE_SSG:
+	 emit_sign(p, dst, dst_flags, args[0]);
+	 break;
+
       case OPCODE_LIT:
 	 emit_lit(c, dst, dst_flags, args[0]);
 	 break;
@@ -1598,10 +1902,10 @@ void brw_wm_emit( struct brw_wm_compile *c )
 	 break;
 
       default:
-	 _mesa_printf("Unsupported opcode %i (%s) in fragment shader\n",
-		      inst->opcode, inst->opcode < MAX_OPCODE ?
-				    _mesa_opcode_string(inst->opcode) :
-				    "unknown");
+	 printf("Unsupported opcode %i (%s) in fragment shader\n",
+		inst->opcode, inst->opcode < MAX_OPCODE ?
+		_mesa_opcode_string(inst->opcode) :
+		"unknown");
       }
       
       for (i = 0; i < 4; i++)
@@ -1611,12 +1915,20 @@ void brw_wm_emit( struct brw_wm_compile *c )
 		      inst->dst[i]->spill_slot);
    }
 
-   if (INTEL_DEBUG & DEBUG_WM) {
+   /* Only properly tested on ILK */
+   if (p->brw->intel.gen == 5) {
+     brw_remove_duplicate_mrf_moves(p);
+     if (c->dispatch_width == 16)
+	brw_remove_grf_to_mrf_moves(p);
+   }
+
+   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
       int i;
 
-      _mesa_printf("wm-native:\n");
-      for (i = 0; i < p->nr_insn; i++)
-	 brw_disasm(stderr, &p->store[i]);
-      _mesa_printf("\n");
+     printf("wm-native:\n");
+     for (i = 0; i < p->nr_insn; i++)
+	 brw_disasm(stdout, &p->store[i], p->brw->intel.gen);
+      printf("\n");
    }
 }
+