Merge remote branch 'origin/master' into lp-setup-llvm

[mesa.git] / src / mesa / drivers / dri / i965 / brw_eu_emit.c
diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c

index 8d6ac0083966c2b48cd180197bb4e28935153067..a1fead0a4fac06431fcb40dff7a4f7e70299a8b8 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_eu_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c
@@ -75,6 +75,8 @@ static void brw_set_dest( struct brw_instruction *insn,
        else {
          insn->bits1.da16.dest_subreg_nr = dest.subnr / 16;
          insn->bits1.da16.dest_writemask = dest.dw1.bits.writemask;
+        /* even ignored in da16, still need to set as '01' */
+        insn->bits1.da16.dest_horiz_stride = 1;
        }
     }
     else {
@@ -90,6 +92,8 @@ static void brw_set_dest( struct brw_instruction *insn,
        }
        else {
          insn->bits1.ia16.dest_indirect_offset = dest.dw1.bits.indirect_offset;
+        /* even ignored in da16, still need to set as '01' */
+        insn->bits1.ia16.dest_horiz_stride = 1;
        }
     }
  
@@ -99,14 +103,92 @@ static void brw_set_dest( struct brw_instruction *insn,
     guess_execution_size(insn, dest);
  }
  
+extern int reg_type_size[];
+
+static void
+validate_reg(struct brw_instruction *insn, struct brw_reg reg)
+{
+   int hstride_for_reg[] = {0, 1, 2, 4};
+   int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32, 64, 128, 256};
+   int width_for_reg[] = {1, 2, 4, 8, 16};
+   int execsize_for_reg[] = {1, 2, 4, 8, 16};
+   int width, hstride, vstride, execsize;
+
+   if (reg.file == BRW_IMMEDIATE_VALUE) {
+      /* 3.3.6: Region Parameters.  Restriction: Immediate vectors
+       * mean the destination has to be 128-bit aligned and the
+       * destination horiz stride has to be a word.
+       */
+      if (reg.type == BRW_REGISTER_TYPE_V) {
+        assert(hstride_for_reg[insn->bits1.da1.dest_horiz_stride] *
+               reg_type_size[insn->bits1.da1.dest_reg_type] == 2);
+      }
+
+      return;
+   }
+
+   if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
+       reg.file == BRW_ARF_NULL)
+      return;
+
+   assert(reg.hstride >= 0 && reg.hstride < Elements(hstride_for_reg));
+   hstride = hstride_for_reg[reg.hstride];
+
+   if (reg.vstride == 0xf) {
+      vstride = -1;
+   } else {
+      assert(reg.vstride >= 0 && reg.vstride < Elements(vstride_for_reg));
+      vstride = vstride_for_reg[reg.vstride];
+   }
+
+   assert(reg.width >= 0 && reg.width < Elements(width_for_reg));
+   width = width_for_reg[reg.width];
+
+   assert(insn->header.execution_size >= 0 &&
+         insn->header.execution_size < Elements(execsize_for_reg));
+   execsize = execsize_for_reg[insn->header.execution_size];
+
+   /* Restrictions from 3.3.10: Register Region Restrictions. */
+   /* 3. */
+   assert(execsize >= width);
+
+   /* 4. */
+   if (execsize == width && hstride != 0) {
+      assert(vstride == -1 || vstride == width * hstride);
+   }
+
+   /* 5. */
+   if (execsize == width && hstride == 0) {
+      /* no restriction on vstride. */
+   }
+
+   /* 6. */
+   if (width == 1) {
+      assert(hstride == 0);
+   }
+
+   /* 7. */
+   if (execsize == 1 && width == 1) {
+      assert(hstride == 0);
+      assert(vstride == 0);
+   }
+
+   /* 8. */
+   if (vstride == 0 && hstride == 0) {
+      assert(width == 1);
+   }
+
+   /* 10. Check destination issues. */
+}
+
  static void brw_set_src0( struct brw_instruction *insn,
                            struct brw_reg reg )
  {
-   assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
-
     if (reg.type != BRW_ARCHITECTURE_REGISTER_FILE)
        assert(reg.nr < 128);
  
+   validate_reg(insn, reg);
+
     insn->bits1.da1.src0_reg_file = reg.file;
     insn->bits1.da1.src0_reg_type = reg.type;
     insn->bits2.da1.src0_abs = reg.abs;
@@ -182,6 +264,8 @@ void brw_set_src1( struct brw_instruction *insn,
  
     assert(reg.nr < 128);
  
+   validate_reg(insn, reg);
+
     insn->bits1.da1.src1_reg_file = reg.file;
     insn->bits1.da1.src1_reg_type = reg.type;
     insn->bits3.da1.src1_abs = reg.abs;
@@ -255,19 +339,19 @@ static void brw_set_math_message( struct brw_context *brw,
     struct intel_context *intel = &brw->intel;
     brw_set_src1(insn, brw_imm_d(0));
  
-   if (intel->is_ironlake) {
-       insn->bits3.math_igdng.function = function;
-       insn->bits3.math_igdng.int_type = integer_type;
-       insn->bits3.math_igdng.precision = low_precision;
-       insn->bits3.math_igdng.saturate = saturate;
-       insn->bits3.math_igdng.data_type = dataType;
-       insn->bits3.math_igdng.snapshot = 0;
-       insn->bits3.math_igdng.header_present = 0;
-       insn->bits3.math_igdng.response_length = response_length;
-       insn->bits3.math_igdng.msg_length = msg_length;
-       insn->bits3.math_igdng.end_of_thread = 0;
-       insn->bits2.send_igdng.sfid = BRW_MESSAGE_TARGET_MATH;
-       insn->bits2.send_igdng.end_of_thread = 0;
+   if (intel->gen == 5) {
+       insn->bits3.math_gen5.function = function;
+       insn->bits3.math_gen5.int_type = integer_type;
+       insn->bits3.math_gen5.precision = low_precision;
+       insn->bits3.math_gen5.saturate = saturate;
+       insn->bits3.math_gen5.data_type = dataType;
+       insn->bits3.math_gen5.snapshot = 0;
+       insn->bits3.math_gen5.header_present = 0;
+       insn->bits3.math_gen5.response_length = response_length;
+       insn->bits3.math_gen5.msg_length = msg_length;
+       insn->bits3.math_gen5.end_of_thread = 0;
+       insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_MATH;
+       insn->bits2.send_gen5.end_of_thread = 0;
     } else {
         insn->bits3.math.function = function;
         insn->bits3.math.int_type = integer_type;
@@ -282,31 +366,31 @@ static void brw_set_math_message( struct brw_context *brw,
  }
  
  
-static void brw_set_ff_sync_message( struct brw_context *brw,
-                                struct brw_instruction *insn,
-                                GLboolean allocate,
-                                GLboolean used,
-                                GLuint msg_length,
-                                GLuint response_length,
-                                GLboolean end_of_thread,
-                                GLboolean complete,
-                                GLuint offset,
-                                GLuint swizzle_control )
+static void brw_set_ff_sync_message(struct brw_context *brw,
+                                   struct brw_instruction *insn,
+                                   GLboolean allocate,
+                                   GLuint response_length,
+                                   GLboolean end_of_thread)
  {
+       struct intel_context *intel = &brw->intel;
         brw_set_src1(insn, brw_imm_d(0));
  
-       insn->bits3.urb_igdng.opcode = 1;
-       insn->bits3.urb_igdng.offset = offset;
-       insn->bits3.urb_igdng.swizzle_control = swizzle_control;
-       insn->bits3.urb_igdng.allocate = allocate;
-       insn->bits3.urb_igdng.used = used;
-       insn->bits3.urb_igdng.complete = complete;
-       insn->bits3.urb_igdng.header_present = 1;
-       insn->bits3.urb_igdng.response_length = response_length;
-       insn->bits3.urb_igdng.msg_length = msg_length;
-       insn->bits3.urb_igdng.end_of_thread = end_of_thread;
-       insn->bits2.send_igdng.sfid = BRW_MESSAGE_TARGET_URB;
-       insn->bits2.send_igdng.end_of_thread = end_of_thread;
+       insn->bits3.urb_gen5.opcode = 1; /* FF_SYNC */
+       insn->bits3.urb_gen5.offset = 0; /* Not used by FF_SYNC */
+       insn->bits3.urb_gen5.swizzle_control = 0; /* Not used by FF_SYNC */
+       insn->bits3.urb_gen5.allocate = allocate;
+       insn->bits3.urb_gen5.used = 0; /* Not used by FF_SYNC */
+       insn->bits3.urb_gen5.complete = 0; /* Not used by FF_SYNC */
+       insn->bits3.urb_gen5.header_present = 1;
+       insn->bits3.urb_gen5.response_length = response_length; /* may be 1 or 0 */
+       insn->bits3.urb_gen5.msg_length = 1;
+       insn->bits3.urb_gen5.end_of_thread = end_of_thread;
+       if (intel->gen >= 6) {
+          insn->header.destreg__conditionalmod = BRW_MESSAGE_TARGET_URB;
+       } else {
+          insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_URB;
+          insn->bits2.send_gen5.end_of_thread = end_of_thread;
+       }
  }
  
  static void brw_set_urb_message( struct brw_context *brw,
@@ -323,19 +407,27 @@ static void brw_set_urb_message( struct brw_context *brw,
      struct intel_context *intel = &brw->intel;
      brw_set_src1(insn, brw_imm_d(0));
  
-    if (intel->is_ironlake) {
-        insn->bits3.urb_igdng.opcode = 0;      /* ? */
-        insn->bits3.urb_igdng.offset = offset;
-        insn->bits3.urb_igdng.swizzle_control = swizzle_control;
-        insn->bits3.urb_igdng.allocate = allocate;
-        insn->bits3.urb_igdng.used = used;     /* ? */
-        insn->bits3.urb_igdng.complete = complete;
-        insn->bits3.urb_igdng.header_present = 1;
-        insn->bits3.urb_igdng.response_length = response_length;
-        insn->bits3.urb_igdng.msg_length = msg_length;
-        insn->bits3.urb_igdng.end_of_thread = end_of_thread;
-        insn->bits2.send_igdng.sfid = BRW_MESSAGE_TARGET_URB;
-        insn->bits2.send_igdng.end_of_thread = end_of_thread;
+    if (intel->gen >= 5) {
+        insn->bits3.urb_gen5.opcode = 0;       /* ? */
+        insn->bits3.urb_gen5.offset = offset;
+        insn->bits3.urb_gen5.swizzle_control = swizzle_control;
+        insn->bits3.urb_gen5.allocate = allocate;
+        insn->bits3.urb_gen5.used = used;      /* ? */
+        insn->bits3.urb_gen5.complete = complete;
+        insn->bits3.urb_gen5.header_present = 1;
+        insn->bits3.urb_gen5.response_length = response_length;
+        insn->bits3.urb_gen5.msg_length = msg_length;
+        insn->bits3.urb_gen5.end_of_thread = end_of_thread;
+       if (intel->gen >= 6) {
+          /* For SNB, the SFID bits moved to the condmod bits, and
+           * EOT stayed in bits3 above.  Does the EOT bit setting
+           * below on Ironlake even do anything?
+           */
+          insn->header.destreg__conditionalmod = BRW_MESSAGE_TARGET_URB;
+       } else {
+          insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_URB;
+          insn->bits2.send_gen5.end_of_thread = end_of_thread;
+       }
      } else {
          insn->bits3.urb.opcode = 0;    /* ? */
          insn->bits3.urb.offset = offset;
@@ -356,31 +448,47 @@ static void brw_set_dp_write_message( struct brw_context *brw,
                                       GLuint msg_control,
                                       GLuint msg_type,
                                       GLuint msg_length,
+                                     GLboolean header_present,
                                       GLuint pixel_scoreboard_clear,
                                       GLuint response_length,
-                                     GLuint end_of_thread )
+                                     GLuint end_of_thread,
+                                     GLuint send_commit_msg)
  {
     struct intel_context *intel = &brw->intel;
-   brw_set_src1(insn, brw_imm_d(0));
-
-   if (intel->is_ironlake) {
-       insn->bits3.dp_write_igdng.binding_table_index = binding_table_index;
-       insn->bits3.dp_write_igdng.msg_control = msg_control;
-       insn->bits3.dp_write_igdng.pixel_scoreboard_clear = pixel_scoreboard_clear;
-       insn->bits3.dp_write_igdng.msg_type = msg_type;
-       insn->bits3.dp_write_igdng.send_commit_msg = 0;
-       insn->bits3.dp_write_igdng.header_present = 1;
-       insn->bits3.dp_write_igdng.response_length = response_length;
-       insn->bits3.dp_write_igdng.msg_length = msg_length;
-       insn->bits3.dp_write_igdng.end_of_thread = end_of_thread;
-       insn->bits2.send_igdng.sfid = BRW_MESSAGE_TARGET_DATAPORT_WRITE;
-       insn->bits2.send_igdng.end_of_thread = end_of_thread;
+   brw_set_src1(insn, brw_imm_ud(0));
+
+   if (intel->gen >= 6) {
+       insn->bits3.dp_render_cache.binding_table_index = binding_table_index;
+       insn->bits3.dp_render_cache.msg_control = msg_control;
+       insn->bits3.dp_render_cache.pixel_scoreboard_clear = pixel_scoreboard_clear;
+       insn->bits3.dp_render_cache.msg_type = msg_type;
+       insn->bits3.dp_render_cache.send_commit_msg = send_commit_msg;
+       insn->bits3.dp_render_cache.header_present = header_present;
+       insn->bits3.dp_render_cache.response_length = response_length;
+       insn->bits3.dp_render_cache.msg_length = msg_length;
+       insn->bits3.dp_render_cache.end_of_thread = end_of_thread;
+       insn->header.destreg__conditionalmod = BRW_MESSAGE_TARGET_DATAPORT_WRITE;
+       /* XXX really need below? */
+       insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_DATAPORT_WRITE;
+       insn->bits2.send_gen5.end_of_thread = end_of_thread;
+   } else if (intel->gen == 5) {
+       insn->bits3.dp_write_gen5.binding_table_index = binding_table_index;
+       insn->bits3.dp_write_gen5.msg_control = msg_control;
+       insn->bits3.dp_write_gen5.pixel_scoreboard_clear = pixel_scoreboard_clear;
+       insn->bits3.dp_write_gen5.msg_type = msg_type;
+       insn->bits3.dp_write_gen5.send_commit_msg = send_commit_msg;
+       insn->bits3.dp_write_gen5.header_present = header_present;
+       insn->bits3.dp_write_gen5.response_length = response_length;
+       insn->bits3.dp_write_gen5.msg_length = msg_length;
+       insn->bits3.dp_write_gen5.end_of_thread = end_of_thread;
+       insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_DATAPORT_WRITE;
+       insn->bits2.send_gen5.end_of_thread = end_of_thread;
     } else {
         insn->bits3.dp_write.binding_table_index = binding_table_index;
         insn->bits3.dp_write.msg_control = msg_control;
         insn->bits3.dp_write.pixel_scoreboard_clear = pixel_scoreboard_clear;
         insn->bits3.dp_write.msg_type = msg_type;
-       insn->bits3.dp_write.send_commit_msg = 0;
+       insn->bits3.dp_write.send_commit_msg = send_commit_msg;
         insn->bits3.dp_write.response_length = response_length;
         insn->bits3.dp_write.msg_length = msg_length;
         insn->bits3.dp_write.msg_target = BRW_MESSAGE_TARGET_DATAPORT_WRITE;
@@ -401,18 +509,18 @@ static void brw_set_dp_read_message( struct brw_context *brw,
     struct intel_context *intel = &brw->intel;
     brw_set_src1(insn, brw_imm_d(0));
  
-   if (intel->is_ironlake) {
-       insn->bits3.dp_read_igdng.binding_table_index = binding_table_index;
-       insn->bits3.dp_read_igdng.msg_control = msg_control;
-       insn->bits3.dp_read_igdng.msg_type = msg_type;
-       insn->bits3.dp_read_igdng.target_cache = target_cache;
-       insn->bits3.dp_read_igdng.header_present = 1;
-       insn->bits3.dp_read_igdng.response_length = response_length;
-       insn->bits3.dp_read_igdng.msg_length = msg_length;
-       insn->bits3.dp_read_igdng.pad1 = 0;
-       insn->bits3.dp_read_igdng.end_of_thread = end_of_thread;
-       insn->bits2.send_igdng.sfid = BRW_MESSAGE_TARGET_DATAPORT_READ;
-       insn->bits2.send_igdng.end_of_thread = end_of_thread;
+   if (intel->gen == 5) {
+       insn->bits3.dp_read_gen5.binding_table_index = binding_table_index;
+       insn->bits3.dp_read_gen5.msg_control = msg_control;
+       insn->bits3.dp_read_gen5.msg_type = msg_type;
+       insn->bits3.dp_read_gen5.target_cache = target_cache;
+       insn->bits3.dp_read_gen5.header_present = 1;
+       insn->bits3.dp_read_gen5.response_length = response_length;
+       insn->bits3.dp_read_gen5.msg_length = msg_length;
+       insn->bits3.dp_read_gen5.pad1 = 0;
+       insn->bits3.dp_read_gen5.end_of_thread = end_of_thread;
+       insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_DATAPORT_READ;
+       insn->bits2.send_gen5.end_of_thread = end_of_thread;
     } else {
         insn->bits3.dp_read.binding_table_index = binding_table_index; /*0:7*/
         insn->bits3.dp_read.msg_control = msg_control;  /*8:11*/
@@ -441,17 +549,21 @@ static void brw_set_sampler_message(struct brw_context *brw,
     assert(eot == 0);
     brw_set_src1(insn, brw_imm_d(0));
  
-   if (intel->is_ironlake) {
-      insn->bits3.sampler_igdng.binding_table_index = binding_table_index;
-      insn->bits3.sampler_igdng.sampler = sampler;
-      insn->bits3.sampler_igdng.msg_type = msg_type;
-      insn->bits3.sampler_igdng.simd_mode = simd_mode;
-      insn->bits3.sampler_igdng.header_present = header_present;
-      insn->bits3.sampler_igdng.response_length = response_length;
-      insn->bits3.sampler_igdng.msg_length = msg_length;
-      insn->bits3.sampler_igdng.end_of_thread = eot;
-      insn->bits2.send_igdng.sfid = BRW_MESSAGE_TARGET_SAMPLER;
-      insn->bits2.send_igdng.end_of_thread = eot;
+   if (intel->gen >= 5) {
+      insn->bits3.sampler_gen5.binding_table_index = binding_table_index;
+      insn->bits3.sampler_gen5.sampler = sampler;
+      insn->bits3.sampler_gen5.msg_type = msg_type;
+      insn->bits3.sampler_gen5.simd_mode = simd_mode;
+      insn->bits3.sampler_gen5.header_present = header_present;
+      insn->bits3.sampler_gen5.response_length = response_length;
+      insn->bits3.sampler_gen5.msg_length = msg_length;
+      insn->bits3.sampler_gen5.end_of_thread = eot;
+      if (intel->gen >= 6)
+         insn->header.destreg__conditionalmod = BRW_MESSAGE_TARGET_SAMPLER;
+      else {
+         insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_SAMPLER;
+         insn->bits2.send_gen5.end_of_thread = eot;
+      }
     } else if (intel->is_g4x) {
        insn->bits3.sampler_g4x.binding_table_index = binding_table_index;
        insn->bits3.sampler_g4x.sampler = sampler;
@@ -542,6 +654,26 @@ struct brw_instruction *brw_##OP(struct brw_compile *p,    \
     return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);      \
  }
  
+/* Rounding operations (other than RNDD) require two instructions - the first
+ * stores a rounded value (possibly the wrong way) in the dest register, but
+ * also sets a per-channel "increment bit" in the flag register.  A predicated
+ * add of 1.0 fixes dest to contain the desired result.
+ */
+#define ROUND(OP)                                                            \
+void brw_##OP(struct brw_compile *p,                                         \
+             struct brw_reg dest,                                            \
+             struct brw_reg src)                                             \
+{                                                                            \
+   struct brw_instruction *rnd, *add;                                        \
+   rnd = next_insn(p, BRW_OPCODE_##OP);                                              \
+   brw_set_dest(rnd, dest);                                                  \
+   brw_set_src0(rnd, src);                                                   \
+   rnd->header.destreg__conditionalmod = 0x7; /* turn on round-increments */  \
+                                                                             \
+   add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));                            \
+   add->header.predicate_control = BRW_PREDICATE_NORMAL;                     \
+}
+
  
  ALU1(MOV)
  ALU2(SEL)
@@ -554,11 +686,8 @@ ALU2(SHL)
  ALU2(RSR)
  ALU2(RSL)
  ALU2(ASR)
-ALU2(ADD)
-ALU2(MUL)
  ALU1(FRC)
  ALU1(RNDD)
-ALU1(RNDZ)
  ALU2(MAC)
  ALU2(MACH)
  ALU1(LZD)
@@ -567,8 +696,70 @@ ALU2(DPH)
  ALU2(DP3)
  ALU2(DP2)
  ALU2(LINE)
+ALU2(PLN)
+
+
+ROUND(RNDZ)
+ROUND(RNDE)
+
+
+struct brw_instruction *brw_ADD(struct brw_compile *p,
+                               struct brw_reg dest,
+                               struct brw_reg src0,
+                               struct brw_reg src1)
+{
+   /* 6.2.2: add */
+   if (src0.type == BRW_REGISTER_TYPE_F ||
+       (src0.file == BRW_IMMEDIATE_VALUE &&
+       src0.type == BRW_REGISTER_TYPE_VF)) {
+      assert(src1.type != BRW_REGISTER_TYPE_UD);
+      assert(src1.type != BRW_REGISTER_TYPE_D);
+   }
+
+   if (src1.type == BRW_REGISTER_TYPE_F ||
+       (src1.file == BRW_IMMEDIATE_VALUE &&
+       src1.type == BRW_REGISTER_TYPE_VF)) {
+      assert(src0.type != BRW_REGISTER_TYPE_UD);
+      assert(src0.type != BRW_REGISTER_TYPE_D);
+   }
+
+   return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
+}
  
+struct brw_instruction *brw_MUL(struct brw_compile *p,
+                               struct brw_reg dest,
+                               struct brw_reg src0,
+                               struct brw_reg src1)
+{
+   /* 6.32.38: mul */
+   if (src0.type == BRW_REGISTER_TYPE_D ||
+       src0.type == BRW_REGISTER_TYPE_UD ||
+       src1.type == BRW_REGISTER_TYPE_D ||
+       src1.type == BRW_REGISTER_TYPE_UD) {
+      assert(dest.type != BRW_REGISTER_TYPE_F);
+   }
  
+   if (src0.type == BRW_REGISTER_TYPE_F ||
+       (src0.file == BRW_IMMEDIATE_VALUE &&
+       src0.type == BRW_REGISTER_TYPE_VF)) {
+      assert(src1.type != BRW_REGISTER_TYPE_UD);
+      assert(src1.type != BRW_REGISTER_TYPE_D);
+   }
+
+   if (src1.type == BRW_REGISTER_TYPE_F ||
+       (src1.file == BRW_IMMEDIATE_VALUE &&
+       src1.type == BRW_REGISTER_TYPE_VF)) {
+      assert(src0.type != BRW_REGISTER_TYPE_UD);
+      assert(src0.type != BRW_REGISTER_TYPE_D);
+   }
+
+   assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
+         src0.nr != BRW_ARF_ACCUMULATOR);
+   assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
+         src1.nr != BRW_ARF_ACCUMULATOR);
+
+   return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
+}
  
  
  void brw_NOP(struct brw_compile *p)
@@ -620,6 +811,7 @@ struct brw_instruction *brw_JMPI(struct brw_compile *p,
   */
  struct brw_instruction *brw_IF(struct brw_compile *p, GLuint execute_size)
  {
+   struct intel_context *intel = &p->brw->intel;
     struct brw_instruction *insn;
  
     if (p->single_program_flow) {
@@ -633,9 +825,15 @@ struct brw_instruction *brw_IF(struct brw_compile *p, GLuint execute_size)
  
     /* Override the defaults for this instruction:
      */
-   brw_set_dest(insn, brw_ip_reg());
-   brw_set_src0(insn, brw_ip_reg());
-   brw_set_src1(insn, brw_imm_d(0x0));
+   if (intel->gen < 6) {
+      brw_set_dest(insn, brw_ip_reg());
+      brw_set_src0(insn, brw_ip_reg());
+      brw_set_src1(insn, brw_imm_d(0x0));
+   } else {
+      brw_set_dest(insn, brw_imm_w(0));
+      brw_set_src0(insn, brw_null_reg());
+      brw_set_src1(insn, brw_null_reg());
+   }
  
     insn->header.execution_size = execute_size;
     insn->header.compression_control = BRW_COMPRESSION_NONE;
@@ -657,7 +855,9 @@ struct brw_instruction *brw_ELSE(struct brw_compile *p,
     struct brw_instruction *insn;
     GLuint br = 1;
  
-   if (intel->is_ironlake)
+   /* jump count is for 64bit data chunk each, so one 128bit
+      instruction requires 2 chunks. */
+   if (intel->gen >= 5)
        br = 2;
  
     if (p->single_program_flow) {
@@ -666,9 +866,15 @@ struct brw_instruction *brw_ELSE(struct brw_compile *p,
        insn = next_insn(p, BRW_OPCODE_ELSE);
     }
  
-   brw_set_dest(insn, brw_ip_reg());
-   brw_set_src0(insn, brw_ip_reg());
-   brw_set_src1(insn, brw_imm_d(0x0));
+   if (intel->gen < 6) {
+      brw_set_dest(insn, brw_ip_reg());
+      brw_set_src0(insn, brw_ip_reg());
+      brw_set_src1(insn, brw_imm_d(0x0));
+   } else {
+      brw_set_dest(insn, brw_imm_w(0));
+      brw_set_src0(insn, brw_null_reg());
+      brw_set_src1(insn, brw_null_reg());
+   }
  
     insn->header.compression_control = BRW_COMPRESSION_NONE;
     insn->header.execution_size = if_insn->header.execution_size;
@@ -685,9 +891,13 @@ struct brw_instruction *brw_ELSE(struct brw_compile *p,
     } else {
        assert(if_insn->header.opcode == BRW_OPCODE_IF);
  
-      if_insn->bits3.if_else.jump_count = br * (insn - if_insn);
-      if_insn->bits3.if_else.pop_count = 0;
-      if_insn->bits3.if_else.pad0 = 0;
+      if (intel->gen < 6) {
+        if_insn->bits3.if_else.jump_count = br * (insn - if_insn);
+        if_insn->bits3.if_else.pop_count = 0;
+        if_insn->bits3.if_else.pad0 = 0;
+      } else {
+        if_insn->bits1.branch_gen6.jump_count = br * (insn - if_insn + 1);
+      }
     }
  
     return insn;
@@ -699,7 +909,7 @@ void brw_ENDIF(struct brw_compile *p,
     struct intel_context *intel = &p->brw->intel;
     GLuint br = 1;
  
-   if (intel->is_ironlake)
+   if (intel->gen >= 5)
        br = 2; 
   
     if (p->single_program_flow) {
@@ -715,9 +925,15 @@ void brw_ENDIF(struct brw_compile *p,
     } else {
        struct brw_instruction *insn = next_insn(p, BRW_OPCODE_ENDIF);
  
-      brw_set_dest(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
-      brw_set_src0(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
-      brw_set_src1(insn, brw_imm_d(0x0));
+      if (intel->gen < 6) {
+        brw_set_dest(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
+        brw_set_src0(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
+        brw_set_src1(insn, brw_imm_d(0x0));
+      } else {
+        brw_set_dest(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_W));
+        brw_set_src0(insn, brw_null_reg());
+        brw_set_src1(insn, brw_null_reg());
+      }
  
        insn->header.compression_control = BRW_COMPRESSION_NONE;
        insn->header.execution_size = patch_insn->header.execution_size;
@@ -730,29 +946,46 @@ void brw_ENDIF(struct brw_compile *p,
         * instruction respectively.
         */
        if (patch_insn->header.opcode == BRW_OPCODE_IF) {
-        /* Automagically turn it into an IFF:
-         */
-        patch_insn->header.opcode = BRW_OPCODE_IFF;
-        patch_insn->bits3.if_else.jump_count = br * (insn - patch_insn + 1);
-        patch_insn->bits3.if_else.pop_count = 0;
-        patch_insn->bits3.if_else.pad0 = 0;
-      } else if (patch_insn->header.opcode == BRW_OPCODE_ELSE) {
-        patch_insn->bits3.if_else.jump_count = br * (insn - patch_insn + 1);
-        patch_insn->bits3.if_else.pop_count = 1;
-        patch_insn->bits3.if_else.pad0 = 0;
+        if (intel->gen < 6) {
+           /* Turn it into an IFF, which means no mask stack operations for
+            * all-false and jumping past the ENDIF.
+            */
+           patch_insn->header.opcode = BRW_OPCODE_IFF;
+           patch_insn->bits3.if_else.jump_count = br * (insn - patch_insn + 1);
+           patch_insn->bits3.if_else.pop_count = 0;
+           patch_insn->bits3.if_else.pad0 = 0;
+        } else {
+           /* As of gen6, there is no IFF and IF must point to the ENDIF. */
+           patch_insn->bits1.branch_gen6.jump_count = br * (insn - patch_insn);
+        }
        } else {
-        assert(0);
+        assert(patch_insn->header.opcode == BRW_OPCODE_ELSE);
+        if (intel->gen < 6) {
+           /* BRW_OPCODE_ELSE pre-gen6 should point just past the
+            * matching ENDIF.
+            */
+           patch_insn->bits3.if_else.jump_count = br * (insn - patch_insn + 1);
+           patch_insn->bits3.if_else.pop_count = 1;
+           patch_insn->bits3.if_else.pad0 = 0;
+        } else {
+           /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
+           patch_insn->bits1.branch_gen6.jump_count = br * (insn - patch_insn);
+        }
        }
  
        /* Also pop item off the stack in the endif instruction:
         */
-      insn->bits3.if_else.jump_count = 0;
-      insn->bits3.if_else.pop_count = 1;
-      insn->bits3.if_else.pad0 = 0;
+      if (intel->gen < 6) {
+        insn->bits3.if_else.jump_count = 0;
+        insn->bits3.if_else.pop_count = 1;
+        insn->bits3.if_else.pad0 = 0;
+      } else {
+        insn->bits1.branch_gen6.jump_count = 2;
+      }
     }
  }
  
-struct brw_instruction *brw_BREAK(struct brw_compile *p)
+struct brw_instruction *brw_BREAK(struct brw_compile *p, int pop_count)
  {
     struct brw_instruction *insn;
     insn = next_insn(p, BRW_OPCODE_BREAK);
@@ -763,10 +996,11 @@ struct brw_instruction *brw_BREAK(struct brw_compile *p)
     insn->header.execution_size = BRW_EXECUTE_8;
     /* insn->header.mask_control = BRW_MASK_DISABLE; */
     insn->bits3.if_else.pad0 = 0;
+   insn->bits3.if_else.pop_count = pop_count;
     return insn;
  }
  
-struct brw_instruction *brw_CONT(struct brw_compile *p)
+struct brw_instruction *brw_CONT(struct brw_compile *p, int pop_count)
  {
     struct brw_instruction *insn;
     insn = next_insn(p, BRW_OPCODE_CONTINUE);
@@ -777,6 +1011,7 @@ struct brw_instruction *brw_CONT(struct brw_compile *p)
     insn->header.execution_size = BRW_EXECUTE_8;
     /* insn->header.mask_control = BRW_MASK_DISABLE; */
     insn->bits3.if_else.pad0 = 0;
+   insn->bits3.if_else.pop_count = pop_count;
     return insn;
  }
  
@@ -814,7 +1049,7 @@ struct brw_instruction *brw_WHILE(struct brw_compile *p,
     struct brw_instruction *insn;
     GLuint br = 1;
  
-   if (intel->is_ironlake)
+   if (intel->gen >= 5)
        br = 2;
  
     if (p->single_program_flow)
@@ -858,7 +1093,7 @@ void brw_land_fwd_jump(struct brw_compile *p,
     struct brw_instruction *landing = &p->store[p->nr_insn];
     GLuint jmpi = 1;
  
-   if (intel->is_ironlake)
+   if (intel->gen >= 5)
         jmpi = 2;
  
     assert(jmp_insn->header.opcode == BRW_OPCODE_JMPI);
@@ -900,6 +1135,20 @@ void brw_CMP(struct brw_compile *p,
     }
  }
  
+/* Issue 'wait' instruction for n1, host could program MMIO
+   to wake up thread. */
+void brw_WAIT (struct brw_compile *p)
+{
+   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_WAIT);
+   struct brw_reg src = brw_notification_1_reg();
+
+   brw_set_dest(insn, src);
+   brw_set_src0(insn, src);
+   brw_set_src1(insn, brw_null_reg());
+   insn->header.execution_size = 0; /* must */
+   insn->header.predicate_control = 0;
+   insn->header.compression_control = 0;
+}
  
  
  /***********************************************************************
@@ -917,26 +1166,90 @@ void brw_math( struct brw_compile *p,
                GLuint data_type,
                GLuint precision )
  {
-   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
-   GLuint msg_length = (function == BRW_MATH_FUNCTION_POW) ? 2 : 1; 
-   GLuint response_length = (function == BRW_MATH_FUNCTION_SINCOS) ? 2 : 1; 
+   struct intel_context *intel = &p->brw->intel;
+
+   if (intel->gen >= 6) {
+      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);
+
+      assert(dest.file == BRW_GENERAL_REGISTER_FILE);
+      assert(src.file == BRW_GENERAL_REGISTER_FILE);
+
+      assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
+      assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
+
+      if (function != BRW_MATH_FUNCTION_INT_DIV_QUOTIENT &&
+         function != BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
+        assert(src.type == BRW_REGISTER_TYPE_F);
+      }
+
+      /* Math is the same ISA format as other opcodes, except that CondModifier
+       * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
+       */
+      insn->header.destreg__conditionalmod = function;
+
+      brw_set_dest(insn, dest);
+      brw_set_src0(insn, src);
+      brw_set_src1(insn, brw_null_reg());
+   } else {
+      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
+      GLuint msg_length = (function == BRW_MATH_FUNCTION_POW) ? 2 : 1;
+      GLuint response_length = (function == BRW_MATH_FUNCTION_SINCOS) ? 2 : 1;
+      /* Example code doesn't set predicate_control for send
+       * instructions.
+       */
+      insn->header.predicate_control = 0;
+      insn->header.destreg__conditionalmod = msg_reg_nr;
+
+      brw_set_dest(insn, dest);
+      brw_set_src0(insn, src);
+      brw_set_math_message(p->brw,
+                          insn,
+                          msg_length, response_length,
+                          function,
+                          BRW_MATH_INTEGER_UNSIGNED,
+                          precision,
+                          saturate,
+                          data_type);
+   }
+}
+
+/** Extended math function, float[8].
+ */
+void brw_math2(struct brw_compile *p,
+              struct brw_reg dest,
+              GLuint function,
+              struct brw_reg src0,
+              struct brw_reg src1)
+{
+   struct intel_context *intel = &p->brw->intel;
+   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);
  
-   /* Example code doesn't set predicate_control for send
-    * instructions.
+   assert(intel->gen >= 6);
+   (void) intel;
+
+
+   assert(dest.file == BRW_GENERAL_REGISTER_FILE);
+   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
+   assert(src1.file == BRW_GENERAL_REGISTER_FILE);
+
+   assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
+   assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
+   assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
+
+   if (function != BRW_MATH_FUNCTION_INT_DIV_QUOTIENT &&
+       function != BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
+      assert(src0.type == BRW_REGISTER_TYPE_F);
+      assert(src1.type == BRW_REGISTER_TYPE_F);
+   }
+
+   /* Math is the same ISA format as other opcodes, except that CondModifier
+    * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
      */
-   insn->header.predicate_control = 0; 
-   insn->header.destreg__conditionalmod = msg_reg_nr;
+   insn->header.destreg__conditionalmod = function;
  
     brw_set_dest(insn, dest);
-   brw_set_src0(insn, src);
-   brw_set_math_message(p->brw,
-                       insn, 
-                       msg_length, response_length, 
-                       function,
-                       BRW_MATH_INTEGER_UNSIGNED,
-                       precision,
-                       saturate,
-                       data_type);
+   brw_set_src0(insn, src0);
+   brw_set_src1(insn, src1);
  }
  
  /**
@@ -951,10 +1264,25 @@ void brw_math_16( struct brw_compile *p,
                   struct brw_reg src,
                   GLuint precision )
  {
+   struct intel_context *intel = &p->brw->intel;
     struct brw_instruction *insn;
     GLuint msg_length = (function == BRW_MATH_FUNCTION_POW) ? 2 : 1; 
     GLuint response_length = (function == BRW_MATH_FUNCTION_SINCOS) ? 2 : 1; 
  
+   if (intel->gen >= 6) {
+      insn = next_insn(p, BRW_OPCODE_MATH);
+
+      /* Math is the same ISA format as other opcodes, except that CondModifier
+       * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
+       */
+      insn->header.destreg__conditionalmod = function;
+
+      brw_set_dest(insn, dest);
+      brw_set_src0(insn, src);
+      brw_set_src1(insn, brw_null_reg());
+      return;
+   }
+
     /* First instruction:
      */
     brw_push_insn_state(p);
@@ -1005,6 +1333,7 @@ void brw_dp_WRITE_16( struct brw_compile *p,
                       struct brw_reg src,
                       GLuint scratch_offset )
  {
+   struct intel_context *intel = &p->brw->intel;
     GLuint msg_reg_nr = 1;
     {
        brw_push_insn_state(p);
@@ -1021,13 +1350,32 @@ void brw_dp_WRITE_16( struct brw_compile *p,
  
     {
        GLuint msg_length = 3;
-      struct brw_reg dest = retype(brw_null_reg(), BRW_REGISTER_TYPE_UW);
+      struct brw_reg dest;
        struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
-   
+      int send_commit_msg;
+
        insn->header.predicate_control = 0; /* XXX */
        insn->header.compression_control = BRW_COMPRESSION_NONE; 
        insn->header.destreg__conditionalmod = msg_reg_nr;
-  
+
+      /* Until gen6, writes followed by reads from the same location
+       * are not guaranteed to be ordered unless write_commit is set.
+       * If set, then a no-op write is issued to the destination
+       * register to set a dependency, and a read from the destination
+       * can be used to ensure the ordering.
+       *
+       * For gen6, only writes between different threads need ordering
+       * protection.  Our use of DP writes is all about register
+       * spilling within a thread.
+       */
+      if (intel->gen >= 6) {
+        dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
+        send_commit_msg = 0;
+      } else {
+        dest = brw_uw16_grf(0, 0);
+        send_commit_msg = 1;
+      }
+
        brw_set_dest(insn, dest);
        brw_set_src0(insn, src);
  
@@ -1037,9 +1385,11 @@ void brw_dp_WRITE_16( struct brw_compile *p,
                                BRW_DATAPORT_OWORD_BLOCK_4_OWORDS, /* msg_control */
                                BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE, /* msg_type */
                                msg_length,
+                              GL_TRUE, /* header_present */
                                0, /* pixel scoreboard */
-                              0, /* response_length */
-                              0); /* eot */
+                              send_commit_msg, /* response_length */
+                              0, /* eot */
+                              send_commit_msg);
     }
  }
  
@@ -1080,7 +1430,7 @@ void brw_dp_READ_16( struct brw_compile *p,
        brw_set_dp_read_message(p->brw,
                               insn,
                               255, /* binding table index (255=stateless) */
-                             3,  /* msg_control (3 means 4 Owords) */
+                             BRW_DATAPORT_OWORD_BLOCK_4_OWORDS,
                               BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
                               1, /* target cache (render/scratch) */
                               1, /* msg_length */
@@ -1155,73 +1505,113 @@ void brw_dp_READ_4( struct brw_compile *p,
   */
  void brw_dp_READ_4_vs(struct brw_compile *p,
                        struct brw_reg dest,
-                      GLuint oword,
-                      GLboolean relAddr,
-                      struct brw_reg addrReg,
                        GLuint location,
                        GLuint bind_table_index)
  {
+   struct brw_instruction *insn;
     GLuint msg_reg_nr = 1;
+   struct brw_reg b;
  
-   assert(oword < 2);
     /*
     printf("vs const read msg, location %u, msg_reg_nr %d\n",
            location, msg_reg_nr);
     */
  
     /* Setup MRF[1] with location/offset into const buffer */
-   {
-      struct brw_reg b;
+   brw_push_insn_state(p);
+   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+   brw_set_mask_control(p, BRW_MASK_DISABLE);
+   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
  
-      brw_push_insn_state(p);
-      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
-      brw_set_mask_control(p, BRW_MASK_DISABLE);
-      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-      /*brw_set_access_mode(p, BRW_ALIGN_16);*/
+   /* XXX I think we're setting all the dwords of MRF[1] to 'location'.
+    * when the docs say only dword[2] should be set.  Hmmm.  But it works.
+    */
+   b = brw_message_reg(msg_reg_nr);
+   b = retype(b, BRW_REGISTER_TYPE_UD);
+   /*b = get_element_ud(b, 2);*/
+   brw_MOV(p, b, brw_imm_ud(location));
  
-      /* XXX I think we're setting all the dwords of MRF[1] to 'location'.
-       * when the docs say only dword[2] should be set.  Hmmm.  But it works.
-       */
-      b = brw_message_reg(msg_reg_nr);
-      b = retype(b, BRW_REGISTER_TYPE_UD);
-      /*b = get_element_ud(b, 2);*/
-      if (relAddr) {
-         brw_ADD(p, b, addrReg, brw_imm_ud(location));
-      }
-      else {
-         brw_MOV(p, b, brw_imm_ud(location));
-      }
+   brw_pop_insn_state(p);
  
-      brw_pop_insn_state(p);
-   }
+   insn = next_insn(p, BRW_OPCODE_SEND);
  
-   {
-      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
-   
-      insn->header.predicate_control = BRW_PREDICATE_NONE;
-      insn->header.compression_control = BRW_COMPRESSION_NONE; 
-      insn->header.destreg__conditionalmod = msg_reg_nr;
-      insn->header.mask_control = BRW_MASK_DISABLE;
-      /*insn->header.access_mode = BRW_ALIGN_16;*/
-  
-      brw_set_dest(insn, dest);
-      brw_set_src0(insn, brw_null_reg());
+   insn->header.predicate_control = BRW_PREDICATE_NONE;
+   insn->header.compression_control = BRW_COMPRESSION_NONE;
+   insn->header.destreg__conditionalmod = msg_reg_nr;
+   insn->header.mask_control = BRW_MASK_DISABLE;
  
-      brw_set_dp_read_message(p->brw,
-                             insn,
-                             bind_table_index,
-                             oword,  /* 0 = lower Oword, 1 = upper Oword */
-                             BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
-                             0, /* source cache = data cache */
-                             1, /* msg_length */
-                             1, /* response_length (1 Oword) */
-                             0); /* eot */
-   }
+   brw_set_dest(insn, dest);
+   brw_set_src0(insn, brw_null_reg());
+
+   brw_set_dp_read_message(p->brw,
+                          insn,
+                          bind_table_index,
+                          0,
+                          BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
+                          0, /* source cache = data cache */
+                          1, /* msg_length */
+                          1, /* response_length (1 Oword) */
+                          0); /* eot */
+}
+
+/**
+ * Read a float[4] constant per vertex from VS constant buffer, with
+ * relative addressing.
+ */
+void brw_dp_READ_4_vs_relative(struct brw_compile *p,
+                              struct brw_reg dest,
+                              struct brw_reg addr_reg,
+                              GLuint offset,
+                              GLuint bind_table_index)
+{
+   struct intel_context *intel = &p->brw->intel;
+   int msg_type;
+
+   /* Setup MRF[1] with offset into const buffer */
+   brw_push_insn_state(p);
+   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+   brw_set_mask_control(p, BRW_MASK_DISABLE);
+   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+
+   /* M1.0 is block offset 0, M1.4 is block offset 1, all other
+    * fields ignored.
+    */
+   brw_ADD(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD),
+          addr_reg, brw_imm_d(offset));
+   brw_pop_insn_state(p);
+
+   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
+
+   insn->header.predicate_control = BRW_PREDICATE_NONE;
+   insn->header.compression_control = BRW_COMPRESSION_NONE;
+   insn->header.destreg__conditionalmod = 0;
+   insn->header.mask_control = BRW_MASK_DISABLE;
+
+   brw_set_dest(insn, dest);
+   brw_set_src0(insn, brw_vec8_grf(0, 0));
+
+   if (intel->gen == 6)
+      msg_type = GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
+   else if (intel->gen == 5 || intel->is_g4x)
+      msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
+   else
+      msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
+
+   brw_set_dp_read_message(p->brw,
+                          insn,
+                          bind_table_index,
+                          BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
+                          msg_type,
+                          0, /* source cache = data cache */
+                          2, /* msg_length */
+                          1, /* response_length */
+                          0); /* eot */
  }
  
  
  
  void brw_fb_WRITE(struct brw_compile *p,
+                 int dispatch_width,
                    struct brw_reg dest,
                    GLuint msg_reg_nr,
                    struct brw_reg src0,
@@ -1230,23 +1620,47 @@ void brw_fb_WRITE(struct brw_compile *p,
                    GLuint response_length,
                    GLboolean eot)
  {
-   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
-   
+   struct intel_context *intel = &p->brw->intel;
+   struct brw_instruction *insn;
+   GLuint msg_control, msg_type;
+   GLboolean header_present = GL_TRUE;
+
+   insn = next_insn(p, BRW_OPCODE_SEND);
     insn->header.predicate_control = 0; /* XXX */
-   insn->header.compression_control = BRW_COMPRESSION_NONE; 
-   insn->header.destreg__conditionalmod = msg_reg_nr;
-  
+   insn->header.compression_control = BRW_COMPRESSION_NONE;
+
+   if (intel->gen >= 6) {
+      if (msg_length == 4)
+        header_present = GL_FALSE;
+
+       /* headerless version, just submit color payload */
+       src0 = brw_message_reg(msg_reg_nr);
+
+       msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE_GEN6;
+   } else {
+      insn->header.destreg__conditionalmod = msg_reg_nr;
+
+      msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
+   }
+
+   if (dispatch_width == 16)
+      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
+   else
+      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
+
     brw_set_dest(insn, dest);
     brw_set_src0(insn, src0);
     brw_set_dp_write_message(p->brw,
                             insn,
                             binding_table_index,
-                           BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE, /* msg_control */
-                           BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE, /* msg_type */
+                           msg_control,
+                           msg_type,
                             msg_length,
+                           header_present,
                             1,  /* pixel scoreboard */
-                           response_length, 
-                           eot);
+                           response_length,
+                           eot,
+                           0 /* send_commit_msg */);
  }
  
  
@@ -1269,10 +1683,11 @@ void brw_SAMPLE(struct brw_compile *p,
                 GLuint header_present,
                 GLuint simd_mode)
  {
+   struct intel_context *intel = &p->brw->intel;
     GLboolean need_stall = 0;
-   
+
     if (writemask == 0) {
-      /*_mesa_printf("%s: zero writemask??\n", __FUNCTION__); */
+      /*printf("%s: zero writemask??\n", __FUNCTION__); */
        return;
     }
     
@@ -1304,11 +1719,17 @@ void brw_SAMPLE(struct brw_compile *p,
  
        if (newmask != writemask) {
          need_stall = 1;
-         /* _mesa_printf("need stall %x %x\n", newmask , writemask); */
+         /* printf("need stall %x %x\n", newmask , writemask); */
        }
        else {
+        GLboolean dispatch_16 = GL_FALSE;
+
          struct brw_reg m1 = brw_message_reg(msg_reg_nr);
-        
+
+        guess_execution_size(p->current, dest);
+        if (p->current->header.execution_size == BRW_EXECUTE_16)
+           dispatch_16 = GL_TRUE;
+
          newmask = ~newmask & WRITEMASK_XYZW;
  
          brw_push_insn_state(p);
@@ -1323,16 +1744,36 @@ void brw_SAMPLE(struct brw_compile *p,
  
          src0 = retype(brw_null_reg(), BRW_REGISTER_TYPE_UW); 
          dest = offset(dest, dst_offset);
-        response_length = len * 2;
+
+        /* For 16-wide dispatch, masked channels are skipped in the
+         * response.  For 8-wide, masked channels still take up slots,
+         * and are just not written to.
+         */
+        if (dispatch_16)
+           response_length = len * 2;
        }
     }
  
     {
-      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
+      struct brw_instruction *insn;
     
+      /* Sandybridge doesn't have the implied move for SENDs,
+       * and the first message register index comes from src0.
+       */
+      if (intel->gen >= 6) {
+         brw_push_insn_state(p);
+         brw_set_mask_control( p, BRW_MASK_DISABLE );
+         /* m1 contains header? */
+         brw_MOV(p, brw_message_reg(msg_reg_nr), src0);
+         brw_pop_insn_state(p);
+         src0 = brw_message_reg(msg_reg_nr);
+      }
+
+      insn = next_insn(p, BRW_OPCODE_SEND);
        insn->header.predicate_control = 0; /* XXX */
        insn->header.compression_control = BRW_COMPRESSION_NONE;
-      insn->header.destreg__conditionalmod = msg_reg_nr;
+      if (intel->gen < 6)
+         insn->header.destreg__conditionalmod = msg_reg_nr;
  
        brw_set_dest(insn, dest);
        brw_set_src0(insn, src0);
@@ -1377,7 +1818,21 @@ void brw_urb_WRITE(struct brw_compile *p,
                    GLuint offset,
                    GLuint swizzle)
  {
-   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
+   struct intel_context *intel = &p->brw->intel;
+   struct brw_instruction *insn;
+
+   /* Sandybridge doesn't have the implied move for SENDs,
+    * and the first message register index comes from src0.
+    */
+   if (intel->gen >= 6) {
+      brw_push_insn_state(p);
+      brw_set_mask_control( p, BRW_MASK_DISABLE );
+      brw_MOV(p, brw_message_reg(msg_reg_nr), src0);
+      brw_pop_insn_state(p);
+      src0 = brw_message_reg(msg_reg_nr);
+   }
+
+   insn = next_insn(p, BRW_OPCODE_SEND);
  
     assert(msg_length < BRW_MAX_MRF);
  
@@ -1385,7 +1840,8 @@ void brw_urb_WRITE(struct brw_compile *p,
     brw_set_src0(insn, src0);
     brw_set_src1(insn, brw_imm_d(0));
  
-   insn->header.destreg__conditionalmod = msg_reg_nr;
+   if (intel->gen < 6)
+      insn->header.destreg__conditionalmod = msg_reg_nr;
  
     brw_set_urb_message(p->brw,
                        insn,
@@ -1404,32 +1860,35 @@ void brw_ff_sync(struct brw_compile *p,
                    GLuint msg_reg_nr,
                    struct brw_reg src0,
                    GLboolean allocate,
-                  GLboolean used,
-                  GLuint msg_length,
                    GLuint response_length,
-                  GLboolean eot,
-                  GLboolean writes_complete,
-                  GLuint offset,
-                  GLuint swizzle)
+                  GLboolean eot)
  {
-   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
+   struct intel_context *intel = &p->brw->intel;
+   struct brw_instruction *insn;
  
-   assert(msg_length < 16);
+   /* Sandybridge doesn't have the implied move for SENDs,
+    * and the first message register index comes from src0.
+    */
+   if (intel->gen >= 6) {
+      brw_push_insn_state(p);
+      brw_set_mask_control( p, BRW_MASK_DISABLE );
+      brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
+             retype(src0, BRW_REGISTER_TYPE_UD));
+      brw_pop_insn_state(p);
+      src0 = brw_message_reg(msg_reg_nr);
+   }
  
+   insn = next_insn(p, BRW_OPCODE_SEND);
     brw_set_dest(insn, dest);
     brw_set_src0(insn, src0);
     brw_set_src1(insn, brw_imm_d(0));
  
-   insn->header.destreg__conditionalmod = msg_reg_nr;
+   if (intel->gen < 6)
+       insn->header.destreg__conditionalmod = msg_reg_nr;
  
     brw_set_ff_sync_message(p->brw,
-                      insn,
-                      allocate,
-                      used,
-                      msg_length,
-                      response_length, 
-                      eot, 
-                      writes_complete, 
-                      offset,
-                      swizzle);
+                          insn,
+                          allocate,
+                          response_length,
+                          eot);
  }