i965: Add HiZ operation state to brw_context
[mesa.git] / src / mesa / drivers / dri / i965 / brw_eu_emit.c
index 13c925d8227222fcda2e0d7cecbbb12c4c2853f1..dbb42f4ea76e161de0f7735d23c76df5c2ee35f8 100644 (file)
@@ -34,7 +34,7 @@
 #include "brw_defines.h"
 #include "brw_eu.h"
 
-#include "../glsl/ralloc.h"
+#include "glsl/ralloc.h"
 
 /***********************************************************************
  * Internal helper for constructing instructions
@@ -58,13 +58,13 @@ static void guess_execution_size(struct brw_compile *p,
  * On Sandybridge, this is no longer the case.  This function performs the
  * explicit move; it should be called before emitting a SEND instruction.
  */
-static void
+void
 gen6_resolve_implied_move(struct brw_compile *p,
                          struct brw_reg *src,
                          GLuint msg_reg_nr)
 {
    struct intel_context *intel = &p->brw->intel;
-   if (intel->gen != 6)
+   if (intel->gen < 6)
       return;
 
    if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
@@ -78,15 +78,27 @@ gen6_resolve_implied_move(struct brw_compile *p,
    *src = brw_message_reg(msg_reg_nr);
 }
 
+static void /* gen7 only: rewrites a message-register operand, in place, as a general register */
+gen7_convert_mrf_to_grf(struct brw_compile *p, struct brw_reg *reg)
+{
+   struct intel_context *intel = &p->brw->intel;
+   if (intel->gen == 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) { /* anything else is left untouched */
+      reg->file = BRW_GENERAL_REGISTER_FILE;
+      reg->nr += 111; /* NOTE(review): unexplained magic offset; presumably the GRF base standing in for MRFs on gen7 -- confirm */
+   }
+}
 
-static void brw_set_dest(struct brw_compile *p,
-                        struct brw_instruction *insn,
-                        struct brw_reg dest)
+
+void
+brw_set_dest(struct brw_compile *p, struct brw_instruction *insn,
+            struct brw_reg dest)
 {
    if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE &&
        dest.file != BRW_MESSAGE_REGISTER_FILE)
       assert(dest.nr < 128);
 
+   gen7_convert_mrf_to_grf(p, &dest);
+
    insn->bits1.da1.dest_reg_file = dest.file;
    insn->bits1.da1.dest_reg_type = dest.type;
    insn->bits1.da1.dest_address_mode = dest.address_mode;
@@ -209,13 +221,15 @@ validate_reg(struct brw_instruction *insn, struct brw_reg reg)
    /* 10. Check destination issues. */
 }
 
-static void brw_set_src0(struct brw_compile *p,
-                        struct brw_instruction *insn,
-                        struct brw_reg reg)
+void
+brw_set_src0(struct brw_compile *p, struct brw_instruction *insn,
+            struct brw_reg reg)
 {
    if (reg.type != BRW_ARCHITECTURE_REGISTER_FILE)
       assert(reg.nr < 128);
 
+   gen7_convert_mrf_to_grf(p, &reg);
+
    validate_reg(insn, reg);
 
    insn->bits1.da1.src0_reg_file = reg.file;
@@ -294,6 +308,8 @@ void brw_set_src1(struct brw_compile *p,
 
    assert(reg.nr < 128);
 
+   gen7_convert_mrf_to_grf(p, &reg);
+
    validate_reg(insn, reg);
 
    insn->bits1.da1.src1_reg_file = reg.file;
@@ -354,182 +370,222 @@ void brw_set_src1(struct brw_compile *p,
    }
 }
 
+/**
+ * Set the Message Descriptor and Extended Message Descriptor fields
+ * for SEND messages (SFID, lengths, EOT; header_present on gen5+ only).
+ *
+ * \note This zeroes out the Function Control bits, so it must be called
+ *       \b before filling out any message-specific data.  Callers can
+ *       choose not to fill in irrelevant bits; they will be zero.
+ */
+static void
+brw_set_message_descriptor(struct brw_compile *p,
+                          struct brw_instruction *inst,
+                          enum brw_message_target sfid,
+                          unsigned msg_length,
+                          unsigned response_length,
+                          bool header_present,
+                          bool end_of_thread)
+{
+   struct intel_context *intel = &p->brw->intel;
+
+   brw_set_src1(p, inst, brw_imm_d(0)); /* src1 = immediate 0; presumably this is what clears the descriptor bits */
+
+   if (intel->gen >= 5) {
+      inst->bits3.generic_gen5.header_present = header_present;
+      inst->bits3.generic_gen5.response_length = response_length;
+      inst->bits3.generic_gen5.msg_length = msg_length;
+      inst->bits3.generic_gen5.end_of_thread = end_of_thread;
 
+      if (intel->gen >= 6) {
+        /* On Gen6+ the SFID goes in the instruction header (bits 27:24, the destreg/condmod field) */
+        inst->header.destreg__conditionalmod = sfid;
+      } else {
+        /* Gen5: set the Extended Message Descriptor (ex_desc); EOT is written here as well as in bits3 */
+        inst->bits2.send_gen5.sfid = sfid;
+        inst->bits2.send_gen5.end_of_thread = end_of_thread;
+      }
+   } else { /* pre-gen5: SFID goes in bits3.generic.msg_target and header_present is not encoded */
+      inst->bits3.generic.response_length = response_length;
+      inst->bits3.generic.msg_length = msg_length;
+      inst->bits3.generic.msg_target = sfid;
+      inst->bits3.generic.end_of_thread = end_of_thread;
+   }
+}
 
 static void brw_set_math_message( struct brw_compile *p,
                                  struct brw_instruction *insn,
-                                 GLuint msg_length,
-                                 GLuint response_length,
                                  GLuint function,
                                  GLuint integer_type,
-                                 GLboolean low_precision,
-                                 GLboolean saturate,
+                                 bool low_precision,
+                                 bool saturate,
                                  GLuint dataType )
 {
    struct brw_context *brw = p->brw;
    struct intel_context *intel = &brw->intel;
-   brw_set_src1(p, insn, brw_imm_d(0));
+   unsigned msg_length;
+   unsigned response_length;
+
+   /* Infer message length from the function */
+   switch (function) {
+   case BRW_MATH_FUNCTION_POW:
+   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
+   case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
+   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
+      msg_length = 2;
+      break;
+   default:
+      msg_length = 1;
+      break;
+   }
 
+   /* Infer response length from the function */
+   switch (function) {
+   case BRW_MATH_FUNCTION_SINCOS:
+   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
+      response_length = 2;
+      break;
+   default:
+      response_length = 1;
+      break;
+   }
+
+   brw_set_message_descriptor(p, insn, BRW_SFID_MATH,
+                             msg_length, response_length, false, false);
    if (intel->gen == 5) {
-       insn->bits3.math_gen5.function = function;
-       insn->bits3.math_gen5.int_type = integer_type;
-       insn->bits3.math_gen5.precision = low_precision;
-       insn->bits3.math_gen5.saturate = saturate;
-       insn->bits3.math_gen5.data_type = dataType;
-       insn->bits3.math_gen5.snapshot = 0;
-       insn->bits3.math_gen5.header_present = 0;
-       insn->bits3.math_gen5.response_length = response_length;
-       insn->bits3.math_gen5.msg_length = msg_length;
-       insn->bits3.math_gen5.end_of_thread = 0;
-       insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_MATH;
-       insn->bits2.send_gen5.end_of_thread = 0;
+      insn->bits3.math_gen5.function = function;
+      insn->bits3.math_gen5.int_type = integer_type;
+      insn->bits3.math_gen5.precision = low_precision;
+      insn->bits3.math_gen5.saturate = saturate;
+      insn->bits3.math_gen5.data_type = dataType;
+      insn->bits3.math_gen5.snapshot = 0;
    } else {
-       insn->bits3.math.function = function;
-       insn->bits3.math.int_type = integer_type;
-       insn->bits3.math.precision = low_precision;
-       insn->bits3.math.saturate = saturate;
-       insn->bits3.math.data_type = dataType;
-       insn->bits3.math.response_length = response_length;
-       insn->bits3.math.msg_length = msg_length;
-       insn->bits3.math.msg_target = BRW_MESSAGE_TARGET_MATH;
-       insn->bits3.math.end_of_thread = 0;
+      insn->bits3.math.function = function;
+      insn->bits3.math.int_type = integer_type;
+      insn->bits3.math.precision = low_precision;
+      insn->bits3.math.saturate = saturate;
+      insn->bits3.math.data_type = dataType;
    }
 }
 
 
 static void brw_set_ff_sync_message(struct brw_compile *p,
                                    struct brw_instruction *insn,
-                                   GLboolean allocate,
+                                   bool allocate,
                                    GLuint response_length,
-                                   GLboolean end_of_thread)
+                                   bool end_of_thread)
 {
-       struct brw_context *brw = p->brw;
-       struct intel_context *intel = &brw->intel;
-       brw_set_src1(p, insn, brw_imm_d(0));
-
-       insn->bits3.urb_gen5.opcode = 1; /* FF_SYNC */
-       insn->bits3.urb_gen5.offset = 0; /* Not used by FF_SYNC */
-       insn->bits3.urb_gen5.swizzle_control = 0; /* Not used by FF_SYNC */
-       insn->bits3.urb_gen5.allocate = allocate;
-       insn->bits3.urb_gen5.used = 0; /* Not used by FF_SYNC */
-       insn->bits3.urb_gen5.complete = 0; /* Not used by FF_SYNC */
-       insn->bits3.urb_gen5.header_present = 1;
-       insn->bits3.urb_gen5.response_length = response_length; /* may be 1 or 0 */
-       insn->bits3.urb_gen5.msg_length = 1;
-       insn->bits3.urb_gen5.end_of_thread = end_of_thread;
-       if (intel->gen >= 6) {
-          insn->header.destreg__conditionalmod = BRW_MESSAGE_TARGET_URB;
-       } else {
-          insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_URB;
-          insn->bits2.send_gen5.end_of_thread = end_of_thread;
-       }
+   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
+                             1, response_length, true, end_of_thread);
+   insn->bits3.urb_gen5.opcode = 1; /* FF_SYNC */
+   insn->bits3.urb_gen5.offset = 0; /* Not used by FF_SYNC */
+   insn->bits3.urb_gen5.swizzle_control = 0; /* Not used by FF_SYNC */
+   insn->bits3.urb_gen5.allocate = allocate;
+   insn->bits3.urb_gen5.used = 0; /* Not used by FF_SYNC */
+   insn->bits3.urb_gen5.complete = 0; /* Not used by FF_SYNC */
 }
 
 static void brw_set_urb_message( struct brw_compile *p,
                                 struct brw_instruction *insn,
-                                GLboolean allocate,
-                                GLboolean used,
+                                bool allocate,
+                                bool used,
                                 GLuint msg_length,
                                 GLuint response_length,
-                                GLboolean end_of_thread,
-                                GLboolean complete,
+                                bool end_of_thread,
+                                bool complete,
                                 GLuint offset,
                                 GLuint swizzle_control )
 {
-    struct brw_context *brw = p->brw;
-    struct intel_context *intel = &brw->intel;
-    brw_set_src1(p, insn, brw_imm_d(0));
-
-    if (intel->gen >= 5) {
-        insn->bits3.urb_gen5.opcode = 0;       /* ? */
-        insn->bits3.urb_gen5.offset = offset;
-        insn->bits3.urb_gen5.swizzle_control = swizzle_control;
-        insn->bits3.urb_gen5.allocate = allocate;
-        insn->bits3.urb_gen5.used = used;      /* ? */
-        insn->bits3.urb_gen5.complete = complete;
-        insn->bits3.urb_gen5.header_present = 1;
-        insn->bits3.urb_gen5.response_length = response_length;
-        insn->bits3.urb_gen5.msg_length = msg_length;
-        insn->bits3.urb_gen5.end_of_thread = end_of_thread;
-       if (intel->gen >= 6) {
-          /* For SNB, the SFID bits moved to the condmod bits, and
-           * EOT stayed in bits3 above.  Does the EOT bit setting
-           * below on Ironlake even do anything?
-           */
-          insn->header.destreg__conditionalmod = BRW_MESSAGE_TARGET_URB;
-       } else {
-          insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_URB;
-          insn->bits2.send_gen5.end_of_thread = end_of_thread;
-       }
-    } else {
-        insn->bits3.urb.opcode = 0;    /* ? */
-        insn->bits3.urb.offset = offset;
-        insn->bits3.urb.swizzle_control = swizzle_control;
-        insn->bits3.urb.allocate = allocate;
-        insn->bits3.urb.used = used;   /* ? */
-        insn->bits3.urb.complete = complete;
-        insn->bits3.urb.response_length = response_length;
-        insn->bits3.urb.msg_length = msg_length;
-        insn->bits3.urb.msg_target = BRW_MESSAGE_TARGET_URB;
-        insn->bits3.urb.end_of_thread = end_of_thread;
-    }
+   struct brw_context *brw = p->brw;
+   struct intel_context *intel = &brw->intel;
+
+   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
+                             msg_length, response_length, true, end_of_thread);
+   if (intel->gen == 7) {
+      insn->bits3.urb_gen7.opcode = 0; /* URB_WRITE_HWORD */
+      insn->bits3.urb_gen7.offset = offset;
+      assert(swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
+      insn->bits3.urb_gen7.swizzle_control = swizzle_control;
+      /* per_slot_offset = 0 makes it ignore offsets in message header */
+      insn->bits3.urb_gen7.per_slot_offset = 0;
+      insn->bits3.urb_gen7.complete = complete;
+   } else if (intel->gen >= 5) {
+      insn->bits3.urb_gen5.opcode = 0; /* URB_WRITE */
+      insn->bits3.urb_gen5.offset = offset;
+      insn->bits3.urb_gen5.swizzle_control = swizzle_control;
+      insn->bits3.urb_gen5.allocate = allocate;
+      insn->bits3.urb_gen5.used = used;        /* ? */
+      insn->bits3.urb_gen5.complete = complete;
+   } else {
+      insn->bits3.urb.opcode = 0;      /* ? */
+      insn->bits3.urb.offset = offset;
+      insn->bits3.urb.swizzle_control = swizzle_control;
+      insn->bits3.urb.allocate = allocate;
+      insn->bits3.urb.used = used;     /* ? */
+      insn->bits3.urb.complete = complete;
+   }
 }
 
-static void brw_set_dp_write_message( struct brw_compile *p,
-                                     struct brw_instruction *insn,
-                                     GLuint binding_table_index,
-                                     GLuint msg_control,
-                                     GLuint msg_type,
-                                     GLuint msg_length,
-                                     GLboolean header_present,
-                                     GLuint pixel_scoreboard_clear,
-                                     GLuint response_length,
-                                     GLuint end_of_thread,
-                                     GLuint send_commit_msg)
+void
+brw_set_dp_write_message(struct brw_compile *p,
+                        struct brw_instruction *insn,
+                        GLuint binding_table_index,
+                        GLuint msg_control,
+                        GLuint msg_type,
+                        GLuint msg_length,
+                        bool header_present,
+                        GLuint last_render_target,
+                        GLuint response_length,
+                        GLuint end_of_thread,
+                        GLuint send_commit_msg)
 {
    struct brw_context *brw = p->brw;
    struct intel_context *intel = &brw->intel;
-   brw_set_src1(p, insn, brw_imm_ud(0));
+   unsigned sfid;
 
-   if (intel->gen >= 6) {
-       insn->bits3.gen6_dp.binding_table_index = binding_table_index;
-       insn->bits3.gen6_dp.msg_control = msg_control;
-       insn->bits3.gen6_dp.pixel_scoreboard_clear = pixel_scoreboard_clear;
-       insn->bits3.gen6_dp.msg_type = msg_type;
-       insn->bits3.gen6_dp.send_commit_msg = send_commit_msg;
-       insn->bits3.gen6_dp.header_present = header_present;
-       insn->bits3.gen6_dp.response_length = response_length;
-       insn->bits3.gen6_dp.msg_length = msg_length;
-       insn->bits3.gen6_dp.end_of_thread = end_of_thread;
-
-       /* We always use the render cache for write messages */
-       insn->header.destreg__conditionalmod = GEN6_MESSAGE_TARGET_DP_RENDER_CACHE;
+   if (intel->gen >= 7) {
+      /* Use the Render Cache for RT writes; otherwise use the Data Cache */
+      if (msg_type == GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE)
+        sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
+      else
+        sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
+   } else if (intel->gen == 6) {
+      /* Use the render cache for all write messages. */
+      sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
+   } else {
+      sfid = BRW_SFID_DATAPORT_WRITE;
+   }
+
+   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
+                             header_present, end_of_thread);
+
+   if (intel->gen >= 7) {
+      insn->bits3.gen7_dp.binding_table_index = binding_table_index;
+      insn->bits3.gen7_dp.msg_control = msg_control;
+      insn->bits3.gen7_dp.last_render_target = last_render_target;
+      insn->bits3.gen7_dp.msg_type = msg_type;
+   } else if (intel->gen == 6) {
+      insn->bits3.gen6_dp.binding_table_index = binding_table_index;
+      insn->bits3.gen6_dp.msg_control = msg_control;
+      insn->bits3.gen6_dp.last_render_target = last_render_target;
+      insn->bits3.gen6_dp.msg_type = msg_type;
+      insn->bits3.gen6_dp.send_commit_msg = send_commit_msg;
    } else if (intel->gen == 5) {
-       insn->bits3.dp_write_gen5.binding_table_index = binding_table_index;
-       insn->bits3.dp_write_gen5.msg_control = msg_control;
-       insn->bits3.dp_write_gen5.pixel_scoreboard_clear = pixel_scoreboard_clear;
-       insn->bits3.dp_write_gen5.msg_type = msg_type;
-       insn->bits3.dp_write_gen5.send_commit_msg = send_commit_msg;
-       insn->bits3.dp_write_gen5.header_present = header_present;
-       insn->bits3.dp_write_gen5.response_length = response_length;
-       insn->bits3.dp_write_gen5.msg_length = msg_length;
-       insn->bits3.dp_write_gen5.end_of_thread = end_of_thread;
-       insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_DATAPORT_WRITE;
-       insn->bits2.send_gen5.end_of_thread = end_of_thread;
+      insn->bits3.dp_write_gen5.binding_table_index = binding_table_index;
+      insn->bits3.dp_write_gen5.msg_control = msg_control;
+      insn->bits3.dp_write_gen5.last_render_target = last_render_target;
+      insn->bits3.dp_write_gen5.msg_type = msg_type;
+      insn->bits3.dp_write_gen5.send_commit_msg = send_commit_msg;
    } else {
-       insn->bits3.dp_write.binding_table_index = binding_table_index;
-       insn->bits3.dp_write.msg_control = msg_control;
-       insn->bits3.dp_write.pixel_scoreboard_clear = pixel_scoreboard_clear;
-       insn->bits3.dp_write.msg_type = msg_type;
-       insn->bits3.dp_write.send_commit_msg = send_commit_msg;
-       insn->bits3.dp_write.response_length = response_length;
-       insn->bits3.dp_write.msg_length = msg_length;
-       insn->bits3.dp_write.msg_target = BRW_MESSAGE_TARGET_DATAPORT_WRITE;
-       insn->bits3.dp_write.end_of_thread = end_of_thread;
+      insn->bits3.dp_write.binding_table_index = binding_table_index;
+      insn->bits3.dp_write.msg_control = msg_control;
+      insn->bits3.dp_write.last_render_target = last_render_target;
+      insn->bits3.dp_write.msg_type = msg_type;
+      insn->bits3.dp_write.send_commit_msg = send_commit_msg;
    }
 }
 
-static void
+void
 brw_set_dp_read_message(struct brw_compile *p,
                        struct brw_instruction *insn,
                        GLuint binding_table_index,
@@ -541,58 +597,48 @@ brw_set_dp_read_message(struct brw_compile *p,
 {
    struct brw_context *brw = p->brw;
    struct intel_context *intel = &brw->intel;
-   brw_set_src1(p, insn, brw_imm_d(0));
+   unsigned sfid;
 
-   if (intel->gen >= 6) {
-       uint32_t target_function;
-
-       if (target_cache == BRW_DATAPORT_READ_TARGET_DATA_CACHE)
-         target_function = GEN6_MESSAGE_TARGET_DP_SAMPLER_CACHE;
-       else
-         target_function = GEN6_MESSAGE_TARGET_DP_RENDER_CACHE;
-
-       insn->bits3.gen6_dp.binding_table_index = binding_table_index;
-       insn->bits3.gen6_dp.msg_control = msg_control;
-       insn->bits3.gen6_dp.pixel_scoreboard_clear = 0;
-       insn->bits3.gen6_dp.msg_type = msg_type;
-       insn->bits3.gen6_dp.send_commit_msg = 0;
-       insn->bits3.gen6_dp.header_present = 1;
-       insn->bits3.gen6_dp.response_length = response_length;
-       insn->bits3.gen6_dp.msg_length = msg_length;
-       insn->bits3.gen6_dp.end_of_thread = 0;
-       insn->header.destreg__conditionalmod = target_function;
+   if (intel->gen >= 7) {
+      sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
+   } else if (intel->gen == 6) {
+      if (target_cache == BRW_DATAPORT_READ_TARGET_RENDER_CACHE)
+        sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
+      else
+        sfid = GEN6_SFID_DATAPORT_SAMPLER_CACHE;
+   } else {
+      sfid = BRW_SFID_DATAPORT_READ;
+   }
+
+   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
+                             true, false);
+
+   if (intel->gen >= 7) {
+      insn->bits3.gen7_dp.binding_table_index = binding_table_index;
+      insn->bits3.gen7_dp.msg_control = msg_control;
+      insn->bits3.gen7_dp.last_render_target = 0;
+      insn->bits3.gen7_dp.msg_type = msg_type;
+   } else if (intel->gen == 6) {
+      insn->bits3.gen6_dp.binding_table_index = binding_table_index;
+      insn->bits3.gen6_dp.msg_control = msg_control;
+      insn->bits3.gen6_dp.last_render_target = 0;
+      insn->bits3.gen6_dp.msg_type = msg_type;
+      insn->bits3.gen6_dp.send_commit_msg = 0;
    } else if (intel->gen == 5) {
-       insn->bits3.dp_read_gen5.binding_table_index = binding_table_index;
-       insn->bits3.dp_read_gen5.msg_control = msg_control;
-       insn->bits3.dp_read_gen5.msg_type = msg_type;
-       insn->bits3.dp_read_gen5.target_cache = target_cache;
-       insn->bits3.dp_read_gen5.header_present = 1;
-       insn->bits3.dp_read_gen5.response_length = response_length;
-       insn->bits3.dp_read_gen5.msg_length = msg_length;
-       insn->bits3.dp_read_gen5.pad1 = 0;
-       insn->bits3.dp_read_gen5.end_of_thread = 0;
-       insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_DATAPORT_READ;
-       insn->bits2.send_gen5.end_of_thread = 0;
+      insn->bits3.dp_read_gen5.binding_table_index = binding_table_index;
+      insn->bits3.dp_read_gen5.msg_control = msg_control;
+      insn->bits3.dp_read_gen5.msg_type = msg_type;
+      insn->bits3.dp_read_gen5.target_cache = target_cache;
    } else if (intel->is_g4x) {
-       insn->bits3.dp_read_g4x.binding_table_index = binding_table_index; /*0:7*/
-       insn->bits3.dp_read_g4x.msg_control = msg_control;  /*8:10*/
-       insn->bits3.dp_read_g4x.msg_type = msg_type;  /*11:13*/
-       insn->bits3.dp_read_g4x.target_cache = target_cache;  /*14:15*/
-       insn->bits3.dp_read_g4x.response_length = response_length;  /*16:19*/
-       insn->bits3.dp_read_g4x.msg_length = msg_length;  /*20:23*/
-       insn->bits3.dp_read_g4x.msg_target = BRW_MESSAGE_TARGET_DATAPORT_READ; /*24:27*/
-       insn->bits3.dp_read_g4x.pad1 = 0;
-       insn->bits3.dp_read_g4x.end_of_thread = 0;
+      insn->bits3.dp_read_g4x.binding_table_index = binding_table_index; /*0:7*/
+      insn->bits3.dp_read_g4x.msg_control = msg_control;  /*8:10*/
+      insn->bits3.dp_read_g4x.msg_type = msg_type;  /*11:13*/
+      insn->bits3.dp_read_g4x.target_cache = target_cache;  /*14:15*/
    } else {
-       insn->bits3.dp_read.binding_table_index = binding_table_index; /*0:7*/
-       insn->bits3.dp_read.msg_control = msg_control;  /*8:11*/
-       insn->bits3.dp_read.msg_type = msg_type;  /*12:13*/
-       insn->bits3.dp_read.target_cache = target_cache;  /*14:15*/
-       insn->bits3.dp_read.response_length = response_length;  /*16:19*/
-       insn->bits3.dp_read.msg_length = msg_length;  /*20:23*/
-       insn->bits3.dp_read.msg_target = BRW_MESSAGE_TARGET_DATAPORT_READ; /*24:27*/
-       insn->bits3.dp_read.pad1 = 0;  /*28:30*/
-       insn->bits3.dp_read.end_of_thread = 0;  /*31*/
+      insn->bits3.dp_read.binding_table_index = binding_table_index; /*0:7*/
+      insn->bits3.dp_read.msg_control = msg_control;  /*8:11*/
+      insn->bits3.dp_read.msg_type = msg_type;  /*12:13*/
+      insn->bits3.dp_read.target_cache = target_cache;  /*14:15*/
    }
 }
 
@@ -603,54 +649,41 @@ static void brw_set_sampler_message(struct brw_compile *p,
                                     GLuint msg_type,
                                     GLuint response_length,
                                     GLuint msg_length,
-                                    GLboolean eot,
                                     GLuint header_present,
                                     GLuint simd_mode)
 {
    struct brw_context *brw = p->brw;
    struct intel_context *intel = &brw->intel;
-   assert(eot == 0);
-   brw_set_src1(p, insn, brw_imm_d(0));
 
-   if (intel->gen >= 5) {
+   brw_set_message_descriptor(p, insn, BRW_SFID_SAMPLER, msg_length,
+                             response_length, header_present, false);
+
+   if (intel->gen >= 7) {
+      insn->bits3.sampler_gen7.binding_table_index = binding_table_index;
+      insn->bits3.sampler_gen7.sampler = sampler;
+      insn->bits3.sampler_gen7.msg_type = msg_type;
+      insn->bits3.sampler_gen7.simd_mode = simd_mode;
+   } else if (intel->gen >= 5) {
       insn->bits3.sampler_gen5.binding_table_index = binding_table_index;
       insn->bits3.sampler_gen5.sampler = sampler;
       insn->bits3.sampler_gen5.msg_type = msg_type;
       insn->bits3.sampler_gen5.simd_mode = simd_mode;
-      insn->bits3.sampler_gen5.header_present = header_present;
-      insn->bits3.sampler_gen5.response_length = response_length;
-      insn->bits3.sampler_gen5.msg_length = msg_length;
-      insn->bits3.sampler_gen5.end_of_thread = eot;
-      if (intel->gen >= 6)
-         insn->header.destreg__conditionalmod = BRW_MESSAGE_TARGET_SAMPLER;
-      else {
-         insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_SAMPLER;
-         insn->bits2.send_gen5.end_of_thread = eot;
-      }
    } else if (intel->is_g4x) {
       insn->bits3.sampler_g4x.binding_table_index = binding_table_index;
       insn->bits3.sampler_g4x.sampler = sampler;
       insn->bits3.sampler_g4x.msg_type = msg_type;
-      insn->bits3.sampler_g4x.response_length = response_length;
-      insn->bits3.sampler_g4x.msg_length = msg_length;
-      insn->bits3.sampler_g4x.end_of_thread = eot;
-      insn->bits3.sampler_g4x.msg_target = BRW_MESSAGE_TARGET_SAMPLER;
    } else {
       insn->bits3.sampler.binding_table_index = binding_table_index;
       insn->bits3.sampler.sampler = sampler;
       insn->bits3.sampler.msg_type = msg_type;
       insn->bits3.sampler.return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
-      insn->bits3.sampler.response_length = response_length;
-      insn->bits3.sampler.msg_length = msg_length;
-      insn->bits3.sampler.end_of_thread = eot;
-      insn->bits3.sampler.msg_target = BRW_MESSAGE_TARGET_SAMPLER;
    }
 }
 
 
-
-static struct brw_instruction *next_insn( struct brw_compile *p, 
-                                         GLuint opcode )
+#define next_insn brw_next_insn
+struct brw_instruction *
+brw_next_insn(struct brw_compile *p, GLuint opcode)
 {
    struct brw_instruction *insn;
 
@@ -671,7 +704,6 @@ static struct brw_instruction *next_insn( struct brw_compile *p,
    return insn;
 }
 
-
 static struct brw_instruction *brw_alu1( struct brw_compile *p,
                                         GLuint opcode,
                                         struct brw_reg dest,
@@ -721,6 +753,8 @@ struct brw_instruction *brw_##OP(struct brw_compile *p,     \
  * stores a rounded value (possibly the wrong way) in the dest register, but
  * also sets a per-channel "increment bit" in the flag register.  A predicated
  * add of 1.0 fixes dest to contain the desired result.
+ *
+ * Sandybridge and later appear to round correctly without an ADD.
  */
 #define ROUND(OP)                                                            \
 void brw_##OP(struct brw_compile *p,                                         \
@@ -731,10 +765,13 @@ void brw_##OP(struct brw_compile *p,                                            \
    rnd = next_insn(p, BRW_OPCODE_##OP);                                              \
    brw_set_dest(p, rnd, dest);                                               \
    brw_set_src0(p, rnd, src);                                                \
-   rnd->header.destreg__conditionalmod = 0x7; /* turn on round-increments */  \
                                                                              \
-   add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));                            \
-   add->header.predicate_control = BRW_PREDICATE_NORMAL;                     \
+   if (p->brw->intel.gen < 6) {                                                      \
+      /* turn on round-increments */                                         \
+      rnd->header.destreg__conditionalmod = BRW_CONDITIONAL_R;               \
+      add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));                         \
+      add->header.predicate_control = BRW_PREDICATE_NORMAL;                  \
+   }                                                                         \
 }
 
 
@@ -897,11 +934,17 @@ brw_IF(struct brw_compile *p, GLuint execute_size)
       brw_set_dest(p, insn, brw_ip_reg());
       brw_set_src0(p, insn, brw_ip_reg());
       brw_set_src1(p, insn, brw_imm_d(0x0));
-   } else {
+   } else if (intel->gen == 6) {
       brw_set_dest(p, insn, brw_imm_w(0));
       insn->bits1.branch_gen6.jump_count = 0;
       brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
       brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+   } else {
+      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+      brw_set_src1(p, insn, brw_imm_ud(0));
+      insn->bits3.break_cont.jip = 0;
+      insn->bits3.break_cont.uip = 0;
    }
 
    insn->header.execution_size = execute_size;
@@ -909,7 +952,7 @@ brw_IF(struct brw_compile *p, GLuint execute_size)
    insn->header.predicate_control = BRW_PREDICATE_NORMAL;
    insn->header.mask_control = BRW_MASK_ENABLE;
    if (!p->single_program_flow)
-       insn->header.thread_control = BRW_THREAD_SWITCH;
+      insn->header.thread_control = BRW_THREAD_SWITCH;
 
    p->current->header.predicate_control = BRW_PREDICATE_NONE;
 
@@ -917,6 +960,9 @@ brw_IF(struct brw_compile *p, GLuint execute_size)
    return insn;
 }
 
+/* This function is only used for gen6-style IF instructions with an
+ * embedded comparison (conditional modifier).  It is not used on gen7.
+ */
 struct brw_instruction *
 gen6_IF(struct brw_compile *p, uint32_t conditional,
        struct brw_reg src0, struct brw_reg src1)
@@ -926,7 +972,11 @@ gen6_IF(struct brw_compile *p, uint32_t conditional,
    insn = next_insn(p, BRW_OPCODE_IF);
 
    brw_set_dest(p, insn, brw_imm_w(0));
-   insn->header.execution_size = BRW_EXECUTE_8;
+   if (p->compressed) {
+      insn->header.execution_size = BRW_EXECUTE_16;
+   } else {
+      insn->header.execution_size = BRW_EXECUTE_8;
+   }
    insn->bits1.branch_gen6.jump_count = 0;
    brw_set_src0(p, insn, src0);
    brw_set_src1(p, insn, src1);
@@ -936,7 +986,7 @@ gen6_IF(struct brw_compile *p, uint32_t conditional,
    insn->header.destreg__conditionalmod = conditional;
 
    if (!p->single_program_flow)
-       insn->header.thread_control = BRW_THREAD_SWITCH;
+      insn->header.thread_control = BRW_THREAD_SWITCH;
 
    push_if_stack(p, insn);
    return insn;
@@ -1018,9 +1068,12 @@ patch_IF_ELSE(struct brw_compile *p,
         if_inst->bits3.if_else.jump_count = br * (endif_inst - if_inst + 1);
         if_inst->bits3.if_else.pop_count = 0;
         if_inst->bits3.if_else.pad0 = 0;
-      } else {
+      } else if (intel->gen == 6) {
         /* As of gen6, there is no IFF and IF must point to the ENDIF. */
         if_inst->bits1.branch_gen6.jump_count = br * (endif_inst - if_inst);
+      } else {
+        if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
+        if_inst->bits3.break_cont.jip = br * (endif_inst - if_inst);
       }
    } else {
       else_inst->header.execution_size = if_inst->header.execution_size;
@@ -1042,9 +1095,15 @@ patch_IF_ELSE(struct brw_compile *p,
         else_inst->bits3.if_else.jump_count = br*(endif_inst - else_inst + 1);
         else_inst->bits3.if_else.pop_count = 1;
         else_inst->bits3.if_else.pad0 = 0;
-      } else {
+      } else if (intel->gen == 6) {
         /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
         else_inst->bits1.branch_gen6.jump_count = br*(endif_inst - else_inst);
+      } else {
+        /* The IF instruction's JIP should point just past the ELSE */
+        if_inst->bits3.break_cont.jip = br * (else_inst - if_inst + 1);
+        /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
+        if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
+        else_inst->bits3.break_cont.jip = br * (endif_inst - else_inst);
       }
    }
 }
@@ -1061,17 +1120,23 @@ brw_ELSE(struct brw_compile *p)
       brw_set_dest(p, insn, brw_ip_reg());
       brw_set_src0(p, insn, brw_ip_reg());
       brw_set_src1(p, insn, brw_imm_d(0x0));
-   } else {
+   } else if (intel->gen == 6) {
       brw_set_dest(p, insn, brw_imm_w(0));
       insn->bits1.branch_gen6.jump_count = 0;
       brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
       brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+   } else {
+      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+      brw_set_src1(p, insn, brw_imm_ud(0));
+      insn->bits3.break_cont.jip = 0;
+      insn->bits3.break_cont.uip = 0;
    }
 
    insn->header.compression_control = BRW_COMPRESSION_NONE;
    insn->header.mask_control = BRW_MASK_ENABLE;
    if (!p->single_program_flow)
-       insn->header.thread_control = BRW_THREAD_SWITCH;
+      insn->header.thread_control = BRW_THREAD_SWITCH;
 
    push_if_stack(p, insn);
 }
@@ -1104,10 +1169,14 @@ brw_ENDIF(struct brw_compile *p)
       brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
       brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
       brw_set_src1(p, insn, brw_imm_d(0x0));
-   } else {
+   } else if (intel->gen == 6) {
       brw_set_dest(p, insn, brw_imm_w(0));
       brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
       brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+   } else {
+      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+      brw_set_src1(p, insn, brw_imm_ud(0));
    }
 
    insn->header.compression_control = BRW_COMPRESSION_NONE;
@@ -1119,8 +1188,10 @@ brw_ENDIF(struct brw_compile *p)
       insn->bits3.if_else.jump_count = 0;
       insn->bits3.if_else.pop_count = 1;
       insn->bits3.if_else.pad0 = 0;
-   } else {
+   } else if (intel->gen == 6) {
       insn->bits1.branch_gen6.jump_count = 2;
+   } else {
+      insn->bits3.break_cont.jip = 2;
    }
    patch_IF_ELSE(p, if_inst, else_inst, insn);
 }
@@ -1152,7 +1223,6 @@ struct brw_instruction *gen6_CONT(struct brw_compile *p,
                                  struct brw_instruction *do_insn)
 {
    struct brw_instruction *insn;
-   int br = 2;
 
    insn = next_insn(p, BRW_OPCODE_CONTINUE);
    brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
@@ -1161,8 +1231,6 @@ struct brw_instruction *gen6_CONT(struct brw_compile *p,
    brw_set_src0(p, insn, brw_ip_reg());
    brw_set_src1(p, insn, brw_imm_d(0x0));
 
-   insn->bits3.break_cont.uip = br * (do_insn - insn);
-
    insn->header.compression_control = BRW_COMPRESSION_NONE;
    insn->header.execution_size = BRW_EXECUTE_8;
    return insn;
@@ -1236,7 +1304,16 @@ struct brw_instruction *brw_WHILE(struct brw_compile *p,
    if (intel->gen >= 5)
       br = 2;
 
-   if (intel->gen >= 6) {
+   if (intel->gen >= 7) {
+      insn = next_insn(p, BRW_OPCODE_WHILE);
+
+      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+      brw_set_src1(p, insn, brw_imm_ud(0));
+      insn->bits3.break_cont.jip = br * (do_insn - insn);
+
+      insn->header.execution_size = BRW_EXECUTE_8;
+   } else if (intel->gen == 6) {
       insn = next_insn(p, BRW_OPCODE_WHILE);
 
       brw_set_dest(p, insn, brw_imm_w(0));
@@ -1244,8 +1321,7 @@ struct brw_instruction *brw_WHILE(struct brw_compile *p,
       brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
       brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
 
-      insn->header.execution_size = do_insn->header.execution_size;
-      assert(insn->header.execution_size == BRW_EXECUTE_8);
+      insn->header.execution_size = BRW_EXECUTE_8;
    } else {
       if (p->single_program_flow) {
         insn = next_insn(p, BRW_OPCODE_ADD);
@@ -1286,7 +1362,7 @@ void brw_land_fwd_jump(struct brw_compile *p,
    GLuint jmpi = 1;
 
    if (intel->gen >= 5)
-       jmpi = 2;
+      jmpi = 2;
 
    assert(jmp_insn->header.opcode == BRW_OPCODE_JMPI);
    assert(jmp_insn->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE);
@@ -1367,14 +1443,20 @@ void brw_math( struct brw_compile *p,
       assert(src.file == BRW_GENERAL_REGISTER_FILE);
 
       assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
-      assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
+      if (intel->gen == 6)
+        assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
 
-      /* Source modifiers are ignored for extended math instructions. */
-      assert(!src.negate);
-      assert(!src.abs);
+      /* Source modifiers are ignored for extended math instructions on Gen6. */
+      if (intel->gen == 6) {
+        assert(!src.negate);
+        assert(!src.abs);
+      }
 
-      if (function != BRW_MATH_FUNCTION_INT_DIV_QUOTIENT &&
-         function != BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
+      if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
+         function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
+         function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
+        assert(src.type != BRW_REGISTER_TYPE_F);
+      } else {
         assert(src.type == BRW_REGISTER_TYPE_F);
       }
 
@@ -1389,8 +1471,7 @@ void brw_math( struct brw_compile *p,
       brw_set_src1(p, insn, brw_null_reg());
    } else {
       struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
-      GLuint msg_length = (function == BRW_MATH_FUNCTION_POW) ? 2 : 1;
-      GLuint response_length = (function == BRW_MATH_FUNCTION_SINCOS) ? 2 : 1;
+
       /* Example code doesn't set predicate_control for send
        * instructions.
        */
@@ -1401,9 +1482,8 @@ void brw_math( struct brw_compile *p,
       brw_set_src0(p, insn, src);
       brw_set_math_message(p,
                           insn,
-                          msg_length, response_length,
                           function,
-                          BRW_MATH_INTEGER_UNSIGNED,
+                          src.type == BRW_REGISTER_TYPE_D,
                           precision,
                           saturate,
                           data_type);
@@ -1430,20 +1510,28 @@ void brw_math2(struct brw_compile *p,
    assert(src1.file == BRW_GENERAL_REGISTER_FILE);
 
    assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
-   assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
-   assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
+   if (intel->gen == 6) {
+      assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
+      assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
+   }
 
-   if (function != BRW_MATH_FUNCTION_INT_DIV_QUOTIENT &&
-       function != BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
+   if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
+       function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
+       function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
+      assert(src0.type != BRW_REGISTER_TYPE_F);
+      assert(src1.type != BRW_REGISTER_TYPE_F);
+   } else {
       assert(src0.type == BRW_REGISTER_TYPE_F);
       assert(src1.type == BRW_REGISTER_TYPE_F);
    }
 
-   /* Source modifiers are ignored for extended math instructions. */
-   assert(!src0.negate);
-   assert(!src0.abs);
-   assert(!src1.negate);
-   assert(!src1.abs);
+   /* Source modifiers are ignored for extended math instructions on Gen6. */
+   if (intel->gen == 6) {
+      assert(!src0.negate);
+      assert(!src0.abs);
+      assert(!src1.negate);
+      assert(!src1.abs);
+   }
 
    /* Math is the same ISA format as other opcodes, except that CondModifier
     * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
@@ -1469,8 +1557,6 @@ void brw_math_16( struct brw_compile *p,
 {
    struct intel_context *intel = &p->brw->intel;
    struct brw_instruction *insn;
-   GLuint msg_length = (function == BRW_MATH_FUNCTION_POW) ? 2 : 1; 
-   GLuint response_length = (function == BRW_MATH_FUNCTION_SINCOS) ? 2 : 1; 
 
    if (intel->gen >= 6) {
       insn = next_insn(p, BRW_OPCODE_MATH);
@@ -1504,7 +1590,6 @@ void brw_math_16( struct brw_compile *p,
    brw_set_src0(p, insn, src);
    brw_set_math_message(p,
                        insn, 
-                       msg_length, response_length, 
                        function,
                        BRW_MATH_INTEGER_UNSIGNED,
                        precision,
@@ -1521,7 +1606,6 @@ void brw_math_16( struct brw_compile *p,
    brw_set_src0(p, insn, src);
    brw_set_math_message(p, 
                        insn, 
-                       msg_length, response_length, 
                        function,
                        BRW_MATH_INTEGER_UNSIGNED,
                        precision,
@@ -1633,8 +1717,8 @@ void brw_oword_block_write_scratch(struct brw_compile *p,
                               msg_control,
                               msg_type,
                               mlen,
-                              GL_TRUE, /* header_present */
-                              0, /* pixel scoreboard */
+                              true, /* header_present */
+                              0, /* not a render target */
                               send_commit_msg, /* response_length */
                               0, /* eot */
                               send_commit_msg);
@@ -1906,7 +1990,7 @@ void brw_dp_READ_4_vs_relative(struct brw_compile *p,
    brw_set_dest(p, insn, dest);
    brw_set_src0(p, insn, src);
 
-   if (intel->gen == 6)
+   if (intel->gen >= 6)
       msg_type = GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
    else if (intel->gen == 5 || intel->is_g4x)
       msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
@@ -1932,8 +2016,8 @@ void brw_fb_WRITE(struct brw_compile *p,
                   GLuint binding_table_index,
                   GLuint msg_length,
                   GLuint response_length,
-                  GLboolean eot,
-                  GLboolean header_present)
+                  bool eot,
+                  bool header_present)
 {
    struct intel_context *intel = &p->brw->intel;
    struct brw_instruction *insn;
@@ -1955,10 +2039,10 @@ void brw_fb_WRITE(struct brw_compile *p,
    insn->header.compression_control = BRW_COMPRESSION_NONE;
 
    if (intel->gen >= 6) {
-       /* headerless version, just submit color payload */
-       src0 = brw_message_reg(msg_reg_nr);
+      /* headerless version, just submit color payload */
+      src0 = brw_message_reg(msg_reg_nr);
 
-       msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
+      msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
    } else {
       insn->header.destreg__conditionalmod = msg_reg_nr;
 
@@ -1979,7 +2063,7 @@ void brw_fb_WRITE(struct brw_compile *p,
                            msg_type,
                            msg_length,
                            header_present,
-                           1,  /* pixel scoreboard */
+                           1, /* last render target write */
                            response_length,
                            eot,
                            0 /* send_commit_msg */);
@@ -2001,12 +2085,11 @@ void brw_SAMPLE(struct brw_compile *p,
                GLuint msg_type,
                GLuint response_length,
                GLuint msg_length,
-               GLboolean eot,
                GLuint header_present,
                GLuint simd_mode)
 {
    struct intel_context *intel = &p->brw->intel;
-   GLboolean need_stall = 0;
+   bool need_stall = 0;
 
    if (writemask == 0) {
       /*printf("%s: zero writemask??\n", __FUNCTION__); */
@@ -2044,13 +2127,13 @@ void brw_SAMPLE(struct brw_compile *p,
          /* printf("need stall %x %x\n", newmask , writemask); */
       }
       else {
-        GLboolean dispatch_16 = GL_FALSE;
+        bool dispatch_16 = false;
 
         struct brw_reg m1 = brw_message_reg(msg_reg_nr);
 
         guess_execution_size(p, p->current, dest);
         if (p->current->header.execution_size == BRW_EXECUTE_16)
-           dispatch_16 = GL_TRUE;
+           dispatch_16 = true;
 
         newmask = ~newmask & WRITEMASK_XYZW;
 
@@ -2096,7 +2179,6 @@ void brw_SAMPLE(struct brw_compile *p,
                              msg_type,
                              response_length, 
                              msg_length,
-                             eot,
                              header_present,
                              simd_mode);
    }
@@ -2123,12 +2205,12 @@ void brw_urb_WRITE(struct brw_compile *p,
                   struct brw_reg dest,
                   GLuint msg_reg_nr,
                   struct brw_reg src0,
-                  GLboolean allocate,
-                  GLboolean used,
+                  bool allocate,
+                  bool used,
                   GLuint msg_length,
                   GLuint response_length,
-                  GLboolean eot,
-                  GLboolean writes_complete,
+                  bool eot,
+                  bool writes_complete,
                   GLuint offset,
                   GLuint swizzle)
 {
@@ -2137,6 +2219,17 @@ void brw_urb_WRITE(struct brw_compile *p,
 
    gen6_resolve_implied_move(p, &src0, msg_reg_nr);
 
+   if (intel->gen == 7) {
+      /* Enable Channel Masks in the URB_WRITE_HWORD message header */
+      brw_push_insn_state(p);
+      brw_set_access_mode(p, BRW_ALIGN_1);
+      brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
+                      BRW_REGISTER_TYPE_UD),
+               retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
+               brw_imm_ud(0xff00));
+      brw_pop_insn_state(p);
+   }
+
    insn = next_insn(p, BRW_OPCODE_SEND);
 
    assert(msg_length < BRW_MAX_MRF);
@@ -2186,6 +2279,7 @@ brw_find_next_block_end(struct brw_compile *p, int start)
 static int
 brw_find_loop_end(struct brw_compile *p, int start)
 {
+   struct intel_context *intel = &p->brw->intel;
    int ip;
    int br = 2;
 
@@ -2193,7 +2287,9 @@ brw_find_loop_end(struct brw_compile *p, int start)
       struct brw_instruction *insn = &p->store[ip];
 
       if (insn->header.opcode == BRW_OPCODE_WHILE) {
-        if (ip + insn->bits1.branch_gen6.jump_count / br < start)
+        int jip = intel->gen == 6 ? insn->bits1.branch_gen6.jump_count
+                                  : insn->bits3.break_cont.jip;
+        if (ip + jip / br <= start)
            return ip;
       }
    }
@@ -2220,13 +2316,14 @@ brw_set_uip_jip(struct brw_compile *p)
       switch (insn->header.opcode) {
       case BRW_OPCODE_BREAK:
         insn->bits3.break_cont.jip = br * (brw_find_next_block_end(p, ip) - ip);
-        insn->bits3.break_cont.uip = br * (brw_find_loop_end(p, ip) - ip + 1);
+        /* Gen7 UIP points to WHILE; Gen6 points just after it */
+        insn->bits3.break_cont.uip =
+           br * (brw_find_loop_end(p, ip) - ip + (intel->gen == 6 ? 1 : 0));
         break;
       case BRW_OPCODE_CONTINUE:
-        /* JIP is set at CONTINUE emit time, since that's when we
-         * know where the start of the loop is.
-         */
         insn->bits3.break_cont.jip = br * (brw_find_next_block_end(p, ip) - ip);
+        insn->bits3.break_cont.uip = br * (brw_find_loop_end(p, ip) - ip);
+
         assert(insn->bits3.break_cont.uip != 0);
         assert(insn->bits3.break_cont.jip != 0);
         break;
@@ -2238,9 +2335,9 @@ void brw_ff_sync(struct brw_compile *p,
                   struct brw_reg dest,
                   GLuint msg_reg_nr,
                   struct brw_reg src0,
-                  GLboolean allocate,
+                  bool allocate,
                   GLuint response_length,
-                  GLboolean eot)
+                  bool eot)
 {
    struct intel_context *intel = &p->brw->intel;
    struct brw_instruction *insn;
@@ -2253,7 +2350,7 @@ void brw_ff_sync(struct brw_compile *p,
    brw_set_src1(p, insn, brw_imm_d(0));
 
    if (intel->gen < 6)
-       insn->header.destreg__conditionalmod = msg_reg_nr;
+      insn->header.destreg__conditionalmod = msg_reg_nr;
 
    brw_set_ff_sync_message(p,
                           insn,