i965/vs: Add a function for how many MRFs get written as part of a SEND.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_eu_emit.c
index 41286382d0ed164ec34c114331a12070b971cf95..f9f8d49a0d0b4f78b4430ef238e19069966990a3 100644 (file)
 #include "brw_defines.h"
 #include "brw_eu.h"
 
-
-
+#include "glsl/ralloc.h"
 
 /***********************************************************************
  * Internal helper for constructing instructions
  */
 
-static void guess_execution_size( struct brw_instruction *insn,
-                                 struct brw_reg reg )
+static void guess_execution_size(struct brw_compile *p,
+                                struct brw_instruction *insn,
+                                struct brw_reg reg)
 {
    /* Choose insn's execution size from the operand's width.  A width-8
     * register is executed 16 wide when compression is in effect (read
     * from the instruction header before this change, from p->compressed
     * after it); in every other case the width value is stored directly
     * as the execution size.  Called from brw_set_dest().
     */
-   if (reg.width == BRW_WIDTH_8 && 
-       insn->header.compression_control == BRW_COMPRESSION_COMPRESSED) 
+   if (reg.width == BRW_WIDTH_8 && p->compressed)
       insn->header.execution_size = BRW_EXECUTE_16;
    else
       insn->header.execution_size = reg.width; /* note - definitions are compatible */
 }
 
 
-static void brw_set_dest( struct brw_instruction *insn,
-                         struct brw_reg dest )
+/**
+ * Prior to Sandybridge, the SEND instruction accepted non-MRF source
+ * registers, implicitly moving the operand to a message register.
+ *
+ * On Sandybridge, this is no longer the case.  This function performs the
+ * explicit move; it should be called before emitting a SEND instruction.
+ *
+ * p is the compile state; p->brw->intel.gen decides whether anything is
+ * done at all (the function returns immediately when gen < 6).
+ *
+ * src is an in/out operand.  Unless it is the architecture-file null
+ * register, a MOV of *src into message register msg_reg_nr is emitted with
+ * both sides retyped to UD, with mask control disabled and compression
+ * turned off; those overrides are bracketed by a push/pop of the
+ * instruction state.  On gen6+ *src is then replaced by
+ * brw_message_reg(msg_reg_nr) whether or not a MOV was emitted.
+ */
+void
+gen6_resolve_implied_move(struct brw_compile *p,
+                         struct brw_reg *src,
+                         GLuint msg_reg_nr)
+{
+   struct intel_context *intel = &p->brw->intel;
+   if (intel->gen < 6)
+      return;
+
+   if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
+      brw_push_insn_state(p);
+      brw_set_mask_control(p, BRW_MASK_DISABLE);
+      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+      brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
+             retype(*src, BRW_REGISTER_TYPE_UD));
+      brw_pop_insn_state(p);
+   }
+   *src = brw_message_reg(msg_reg_nr); /* also done when the MOV was skipped */
+}
+
+static void
+gen7_convert_mrf_to_grf(struct brw_compile *p, struct brw_reg *reg)
+{  /* On gen7 only, rewrite a message-register-file operand in place as a
+    * general-register-file operand whose number is offset by 111.  Any
+    * other generation or register file leaves *reg untouched.  Called by
+    * brw_set_dest(), brw_set_src0() and brw_set_src1().
+    *
+    * NOTE(review): the constant 111 is not explained in this chunk;
+    * presumably it is the first GRF set aside to stand in for m0 -- confirm.
+    */
+   struct intel_context *intel = &p->brw->intel;
+   if (intel->gen == 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
+      reg->file = BRW_GENERAL_REGISTER_FILE;
+      reg->nr += 111;
+   }
+}
+
+
+void
+brw_set_dest(struct brw_compile *p, struct brw_instruction *insn,
+            struct brw_reg dest)
 {
    if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE &&
        dest.file != BRW_MESSAGE_REGISTER_FILE)
       assert(dest.nr < 128);
 
+   gen7_convert_mrf_to_grf(p, &dest);
+
    insn->bits1.da1.dest_reg_file = dest.file;
    insn->bits1.da1.dest_reg_type = dest.type;
    insn->bits1.da1.dest_address_mode = dest.address_mode;
@@ -100,7 +140,7 @@ static void brw_set_dest( struct brw_instruction *insn,
    /* NEW: Set the execution size based on dest.width and
     * insn->compression_control:
     */
-   guess_execution_size(insn, dest);
+   guess_execution_size(p, insn, dest);
 }
 
 extern int reg_type_size[];
@@ -181,12 +221,15 @@ validate_reg(struct brw_instruction *insn, struct brw_reg reg)
    /* 10. Check destination issues. */
 }
 
-static void brw_set_src0( struct brw_instruction *insn,
-                          struct brw_reg reg )
+void
+brw_set_src0(struct brw_compile *p, struct brw_instruction *insn,
+            struct brw_reg reg)
 {
    if (reg.type != BRW_ARCHITECTURE_REGISTER_FILE)
       assert(reg.nr < 128);
 
+   gen7_convert_mrf_to_grf(p, &reg);
+
    validate_reg(insn, reg);
 
    insn->bits1.da1.src0_reg_file = reg.file;
@@ -257,13 +300,16 @@ static void brw_set_src0( struct brw_instruction *insn,
 }
 
 
-void brw_set_src1( struct brw_instruction *insn,
-                   struct brw_reg reg )
+void brw_set_src1(struct brw_compile *p,
+                 struct brw_instruction *insn,
+                 struct brw_reg reg)
 {
    assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
 
    assert(reg.nr < 128);
 
+   gen7_convert_mrf_to_grf(p, &reg);
+
    validate_reg(insn, reg);
 
    insn->bits1.da1.src1_reg_file = reg.file;
@@ -326,7 +372,7 @@ void brw_set_src1( struct brw_instruction *insn,
 
 
 
-static void brw_set_math_message( struct brw_context *brw,
+static void brw_set_math_message( struct brw_compile *p,
                                  struct brw_instruction *insn,
                                  GLuint msg_length,
                                  GLuint response_length,
@@ -336,8 +382,9 @@ static void brw_set_math_message( struct brw_context *brw,
                                  GLboolean saturate,
                                  GLuint dataType )
 {
+   struct brw_context *brw = p->brw;
    struct intel_context *intel = &brw->intel;
-   brw_set_src1(insn, brw_imm_d(0));
+   brw_set_src1(p, insn, brw_imm_d(0));
 
    if (intel->gen == 5) {
        insn->bits3.math_gen5.function = function;
@@ -366,14 +413,15 @@ static void brw_set_math_message( struct brw_context *brw,
 }
 
 
-static void brw_set_ff_sync_message(struct brw_context *brw,
+static void brw_set_ff_sync_message(struct brw_compile *p,
                                    struct brw_instruction *insn,
                                    GLboolean allocate,
                                    GLuint response_length,
                                    GLboolean end_of_thread)
 {
+       struct brw_context *brw = p->brw;
        struct intel_context *intel = &brw->intel;
-       brw_set_src1(insn, brw_imm_d(0));
+       brw_set_src1(p, insn, brw_imm_d(0));
 
        insn->bits3.urb_gen5.opcode = 1; /* FF_SYNC */
        insn->bits3.urb_gen5.offset = 0; /* Not used by FF_SYNC */
@@ -393,7 +441,7 @@ static void brw_set_ff_sync_message(struct brw_context *brw,
        }
 }
 
-static void brw_set_urb_message( struct brw_context *brw,
+static void brw_set_urb_message( struct brw_compile *p,
                                 struct brw_instruction *insn,
                                 GLboolean allocate,
                                 GLboolean used,
@@ -404,11 +452,25 @@ static void brw_set_urb_message( struct brw_context *brw,
                                 GLuint offset,
                                 GLuint swizzle_control )
 {
+    struct brw_context *brw = p->brw;
     struct intel_context *intel = &brw->intel;
-    brw_set_src1(insn, brw_imm_d(0));
-
-    if (intel->gen >= 5) {
-        insn->bits3.urb_gen5.opcode = 0;       /* ? */
+    brw_set_src1(p, insn, brw_imm_d(0));
+
+    if (intel->gen == 7) {
+        insn->bits3.urb_gen7.opcode = 0;       /* URB_WRITE_HWORD */
+        insn->bits3.urb_gen7.offset = offset;
+        assert(swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
+        insn->bits3.urb_gen7.swizzle_control = swizzle_control;
+        /* per_slot_offset = 0 makes it ignore offsets in message header */
+        insn->bits3.urb_gen7.per_slot_offset = 0;
+        insn->bits3.urb_gen7.complete = complete;
+        insn->bits3.urb_gen7.header_present = 1;
+        insn->bits3.urb_gen7.response_length = response_length;
+        insn->bits3.urb_gen7.msg_length = msg_length;
+        insn->bits3.urb_gen7.end_of_thread = end_of_thread;
+       insn->header.destreg__conditionalmod = BRW_MESSAGE_TARGET_URB;
+    } else if (intel->gen >= 5) {
+        insn->bits3.urb_gen5.opcode = 0;       /* URB_WRITE */
         insn->bits3.urb_gen5.offset = offset;
         insn->bits3.urb_gen5.swizzle_control = swizzle_control;
         insn->bits3.urb_gen5.allocate = allocate;
@@ -442,35 +504,48 @@ static void brw_set_urb_message( struct brw_context *brw,
     }
 }
 
-static void brw_set_dp_write_message( struct brw_context *brw,
-                                     struct brw_instruction *insn,
-                                     GLuint binding_table_index,
-                                     GLuint msg_control,
-                                     GLuint msg_type,
-                                     GLuint msg_length,
-                                     GLboolean header_present,
-                                     GLuint pixel_scoreboard_clear,
-                                     GLuint response_length,
-                                     GLuint end_of_thread,
-                                     GLuint send_commit_msg)
+void
+brw_set_dp_write_message(struct brw_compile *p,
+                        struct brw_instruction *insn,
+                        GLuint binding_table_index,
+                        GLuint msg_control,
+                        GLuint msg_type,
+                        GLuint msg_length,
+                        GLboolean header_present,
+                        GLuint pixel_scoreboard_clear,
+                        GLuint response_length,
+                        GLuint end_of_thread,
+                        GLuint send_commit_msg)
 {
+   struct brw_context *brw = p->brw;
    struct intel_context *intel = &brw->intel;
-   brw_set_src1(insn, brw_imm_ud(0));
-
-   if (intel->gen >= 6) {
-       insn->bits3.dp_render_cache.binding_table_index = binding_table_index;
-       insn->bits3.dp_render_cache.msg_control = msg_control;
-       insn->bits3.dp_render_cache.pixel_scoreboard_clear = pixel_scoreboard_clear;
-       insn->bits3.dp_render_cache.msg_type = msg_type;
-       insn->bits3.dp_render_cache.send_commit_msg = send_commit_msg;
-       insn->bits3.dp_render_cache.header_present = header_present;
-       insn->bits3.dp_render_cache.response_length = response_length;
-       insn->bits3.dp_render_cache.msg_length = msg_length;
-       insn->bits3.dp_render_cache.end_of_thread = end_of_thread;
-       insn->header.destreg__conditionalmod = BRW_MESSAGE_TARGET_DATAPORT_WRITE;
-       /* XXX really need below? */
-       insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_DATAPORT_WRITE;
-       insn->bits2.send_gen5.end_of_thread = end_of_thread;
+   brw_set_src1(p, insn, brw_imm_ud(0));
+
+   if (intel->gen >= 7) {
+       insn->bits3.gen7_dp.binding_table_index = binding_table_index;
+       insn->bits3.gen7_dp.msg_control = msg_control;
+       insn->bits3.gen7_dp.pixel_scoreboard_clear = pixel_scoreboard_clear;
+       insn->bits3.gen7_dp.msg_type = msg_type;
+       insn->bits3.gen7_dp.header_present = header_present;
+       insn->bits3.gen7_dp.response_length = response_length;
+       insn->bits3.gen7_dp.msg_length = msg_length;
+       insn->bits3.gen7_dp.end_of_thread = end_of_thread;
+
+       /* We always use the render cache for write messages */
+       insn->header.destreg__conditionalmod = GEN6_MESSAGE_TARGET_DP_RENDER_CACHE;
+   } else if (intel->gen == 6) {
+       insn->bits3.gen6_dp.binding_table_index = binding_table_index;
+       insn->bits3.gen6_dp.msg_control = msg_control;
+       insn->bits3.gen6_dp.pixel_scoreboard_clear = pixel_scoreboard_clear;
+       insn->bits3.gen6_dp.msg_type = msg_type;
+       insn->bits3.gen6_dp.send_commit_msg = send_commit_msg;
+       insn->bits3.gen6_dp.header_present = header_present;
+       insn->bits3.gen6_dp.response_length = response_length;
+       insn->bits3.gen6_dp.msg_length = msg_length;
+       insn->bits3.gen6_dp.end_of_thread = end_of_thread;
+
+       /* We always use the render cache for write messages */
+       insn->header.destreg__conditionalmod = GEN6_MESSAGE_TARGET_DP_RENDER_CACHE;
    } else if (intel->gen == 5) {
        insn->bits3.dp_write_gen5.binding_table_index = binding_table_index;
        insn->bits3.dp_write_gen5.msg_control = msg_control;
@@ -496,20 +571,49 @@ static void brw_set_dp_write_message( struct brw_context *brw,
    }
 }
 
-static void brw_set_dp_read_message( struct brw_context *brw,
-                                     struct brw_instruction *insn,
-                                     GLuint binding_table_index,
-                                     GLuint msg_control,
-                                     GLuint msg_type,
-                                     GLuint target_cache,
-                                     GLuint msg_length,
-                                     GLuint response_length,
-                                     GLuint end_of_thread )
+void
+brw_set_dp_read_message(struct brw_compile *p,
+                       struct brw_instruction *insn,
+                       GLuint binding_table_index,
+                       GLuint msg_control,
+                       GLuint msg_type,
+                       GLuint target_cache,
+                       GLuint msg_length,
+                       GLuint response_length)
 {
+   struct brw_context *brw = p->brw;
    struct intel_context *intel = &brw->intel;
-   brw_set_src1(insn, brw_imm_d(0));
-
-   if (intel->gen == 5) {
+   brw_set_src1(p, insn, brw_imm_d(0));
+
+   if (intel->gen >= 7) {
+       insn->bits3.gen7_dp.binding_table_index = binding_table_index;
+       insn->bits3.gen7_dp.msg_control = msg_control;
+       insn->bits3.gen7_dp.pixel_scoreboard_clear = 0;
+       insn->bits3.gen7_dp.msg_type = msg_type;
+       insn->bits3.gen7_dp.header_present = 1;
+       insn->bits3.gen7_dp.response_length = response_length;
+       insn->bits3.gen7_dp.msg_length = msg_length;
+       insn->bits3.gen7_dp.end_of_thread = 0;
+       insn->header.destreg__conditionalmod = GEN6_MESSAGE_TARGET_DP_CONST_CACHE;
+   } else if (intel->gen == 6) {
+       uint32_t target_function;
+
+       if (target_cache == BRW_DATAPORT_READ_TARGET_DATA_CACHE)
+         target_function = GEN6_MESSAGE_TARGET_DP_SAMPLER_CACHE;
+       else
+         target_function = GEN6_MESSAGE_TARGET_DP_RENDER_CACHE;
+
+       insn->bits3.gen6_dp.binding_table_index = binding_table_index;
+       insn->bits3.gen6_dp.msg_control = msg_control;
+       insn->bits3.gen6_dp.pixel_scoreboard_clear = 0;
+       insn->bits3.gen6_dp.msg_type = msg_type;
+       insn->bits3.gen6_dp.send_commit_msg = 0;
+       insn->bits3.gen6_dp.header_present = 1;
+       insn->bits3.gen6_dp.response_length = response_length;
+       insn->bits3.gen6_dp.msg_length = msg_length;
+       insn->bits3.gen6_dp.end_of_thread = 0;
+       insn->header.destreg__conditionalmod = target_function;
+   } else if (intel->gen == 5) {
        insn->bits3.dp_read_gen5.binding_table_index = binding_table_index;
        insn->bits3.dp_read_gen5.msg_control = msg_control;
        insn->bits3.dp_read_gen5.msg_type = msg_type;
@@ -518,9 +622,19 @@ static void brw_set_dp_read_message( struct brw_context *brw,
        insn->bits3.dp_read_gen5.response_length = response_length;
        insn->bits3.dp_read_gen5.msg_length = msg_length;
        insn->bits3.dp_read_gen5.pad1 = 0;
-       insn->bits3.dp_read_gen5.end_of_thread = end_of_thread;
+       insn->bits3.dp_read_gen5.end_of_thread = 0;
        insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_DATAPORT_READ;
-       insn->bits2.send_gen5.end_of_thread = end_of_thread;
+       insn->bits2.send_gen5.end_of_thread = 0;
+   } else if (intel->is_g4x) {
+       insn->bits3.dp_read_g4x.binding_table_index = binding_table_index; /*0:7*/
+       insn->bits3.dp_read_g4x.msg_control = msg_control;  /*8:10*/
+       insn->bits3.dp_read_g4x.msg_type = msg_type;  /*11:13*/
+       insn->bits3.dp_read_g4x.target_cache = target_cache;  /*14:15*/
+       insn->bits3.dp_read_g4x.response_length = response_length;  /*16:19*/
+       insn->bits3.dp_read_g4x.msg_length = msg_length;  /*20:23*/
+       insn->bits3.dp_read_g4x.msg_target = BRW_MESSAGE_TARGET_DATAPORT_READ; /*24:27*/
+       insn->bits3.dp_read_g4x.pad1 = 0;
+       insn->bits3.dp_read_g4x.end_of_thread = 0;
    } else {
        insn->bits3.dp_read.binding_table_index = binding_table_index; /*0:7*/
        insn->bits3.dp_read.msg_control = msg_control;  /*8:11*/
@@ -530,11 +644,11 @@ static void brw_set_dp_read_message( struct brw_context *brw,
        insn->bits3.dp_read.msg_length = msg_length;  /*20:23*/
        insn->bits3.dp_read.msg_target = BRW_MESSAGE_TARGET_DATAPORT_READ; /*24:27*/
        insn->bits3.dp_read.pad1 = 0;  /*28:30*/
-       insn->bits3.dp_read.end_of_thread = end_of_thread;  /*31*/
+       insn->bits3.dp_read.end_of_thread = 0;  /*31*/
    }
 }
 
-static void brw_set_sampler_message(struct brw_context *brw,
+static void brw_set_sampler_message(struct brw_compile *p,
                                     struct brw_instruction *insn,
                                     GLuint binding_table_index,
                                     GLuint sampler,
@@ -545,11 +659,22 @@ static void brw_set_sampler_message(struct brw_context *brw,
                                     GLuint header_present,
                                     GLuint simd_mode)
 {
+   struct brw_context *brw = p->brw;
    struct intel_context *intel = &brw->intel;
    assert(eot == 0);
-   brw_set_src1(insn, brw_imm_d(0));
-
-   if (intel->gen >= 5) {
+   brw_set_src1(p, insn, brw_imm_d(0));
+
+   if (intel->gen >= 7) {
+      insn->bits3.sampler_gen7.binding_table_index = binding_table_index;
+      insn->bits3.sampler_gen7.sampler = sampler;
+      insn->bits3.sampler_gen7.msg_type = msg_type;
+      insn->bits3.sampler_gen7.simd_mode = simd_mode;
+      insn->bits3.sampler_gen7.header_present = header_present;
+      insn->bits3.sampler_gen7.response_length = response_length;
+      insn->bits3.sampler_gen7.msg_length = msg_length;
+      insn->bits3.sampler_gen7.end_of_thread = eot;
+      insn->header.destreg__conditionalmod = BRW_MESSAGE_TARGET_SAMPLER;
+   } else if (intel->gen >= 5) {
       insn->bits3.sampler_gen5.binding_table_index = binding_table_index;
       insn->bits3.sampler_gen5.sampler = sampler;
       insn->bits3.sampler_gen5.msg_type = msg_type;
@@ -585,9 +710,9 @@ static void brw_set_sampler_message(struct brw_context *brw,
 }
 
 
-
-static struct brw_instruction *next_insn( struct brw_compile *p, 
-                                         GLuint opcode )
+#define next_insn brw_next_insn
+struct brw_instruction *
+brw_next_insn(struct brw_compile *p, GLuint opcode)
 {
    struct brw_instruction *insn;
 
@@ -608,15 +733,14 @@ static struct brw_instruction *next_insn( struct brw_compile *p,
    return insn;
 }
 
-
 /* Emit a one-source instruction: obtain a new instruction for `opcode`
  * from next_insn(), program its destination and first source operand, and
  * return it.  The change in this hunk only threads `p` through to the
  * operand setters, which now take the compile state as first argument.
  */
 static struct brw_instruction *brw_alu1( struct brw_compile *p,
                                         GLuint opcode,
                                         struct brw_reg dest,
                                         struct brw_reg src )
 {
    struct brw_instruction *insn = next_insn(p, opcode);
-   brw_set_dest(insn, dest);
-   brw_set_src0(insn, src);   
+   brw_set_dest(p, insn, dest);
+   brw_set_src0(p, insn, src);
    return insn;
 }
 
@@ -627,9 +751,9 @@ static struct brw_instruction *brw_alu2(struct brw_compile *p,
                                        struct brw_reg src1 )
 {
    struct brw_instruction *insn = next_insn(p, opcode);   
-   brw_set_dest(insn, dest);
-   brw_set_src0(insn, src0);
-   brw_set_src1(insn, src1);
+   brw_set_dest(p, insn, dest);
+   brw_set_src0(p, insn, src0);
+   brw_set_src1(p, insn, src1);
    return insn;
 }
 
@@ -654,6 +778,31 @@ struct brw_instruction *brw_##OP(struct brw_compile *p,    \
    return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);      \
 }
 
+/* Rounding operations (other than RNDD) require two instructions - the first
+ * stores a rounded value (possibly the wrong way) in the dest register, but
+ * also sets a per-channel "increment bit" in the flag register.  A predicated
+ * add of 1.0 fixes dest to contain the desired result.
+ *
+ * Sandybridge and later appear to round correctly without an ADD.
+ *
+ * ROUND(OP) defines void brw_OP(p, dest, src).  The generated function
+ * always emits the OP instruction itself with the given dest and src; only
+ * when p->brw->intel.gen < 6 does it also set BRW_CONDITIONAL_R on that
+ * instruction and append an ADD of 1.0f to dest under normal predication.
+ * The macro is instantiated for RNDZ and RNDE.
+ */
+#define ROUND(OP)                                                            \
+void brw_##OP(struct brw_compile *p,                                         \
+             struct brw_reg dest,                                            \
+             struct brw_reg src)                                             \
+{                                                                            \
+   struct brw_instruction *rnd, *add;                                        \
+   rnd = next_insn(p, BRW_OPCODE_##OP);                                              \
+   brw_set_dest(p, rnd, dest);                                               \
+   brw_set_src0(p, rnd, src);                                                \
+                                                                             \
+   if (p->brw->intel.gen < 6) {                                                      \
+      /* turn on round-increments */                                         \
+      rnd->header.destreg__conditionalmod = BRW_CONDITIONAL_R;               \
+      add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));                         \
+      add->header.predicate_control = BRW_PREDICATE_NORMAL;                  \
+   }                                                                         \
+}
+
 
 ALU1(MOV)
 ALU2(SEL)
@@ -668,7 +817,6 @@ ALU2(RSL)
 ALU2(ASR)
 ALU1(FRC)
 ALU1(RNDD)
-ALU1(RNDZ)
 ALU2(MAC)
 ALU2(MACH)
 ALU1(LZD)
@@ -679,6 +827,11 @@ ALU2(DP2)
 ALU2(LINE)
 ALU2(PLN)
 
+
+ROUND(RNDZ)
+ROUND(RNDE)
+
+
 struct brw_instruction *brw_ADD(struct brw_compile *p,
                                struct brw_reg dest,
                                struct brw_reg src0,
@@ -741,9 +894,9 @@ struct brw_instruction *brw_MUL(struct brw_compile *p,
 /* Emit a NOP instruction.  Its destination and first source are both
  * brw_vec4_grf(0,0) retyped to UD, and its second source is the unsigned
  * immediate 0.  The change in this hunk only passes `p` to the operand
  * setters.
  */
 void brw_NOP(struct brw_compile *p)
 {
    struct brw_instruction *insn = next_insn(p, BRW_OPCODE_NOP);   
-   brw_set_dest(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
-   brw_set_src0(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
-   brw_set_src1(insn, brw_imm_ud(0x0));
+   brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
+   brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
+   brw_set_src1(p, insn, brw_imm_ud(0x0));
 }
 
 
@@ -770,6 +923,19 @@ struct brw_instruction *brw_JMPI(struct brw_compile *p,
    return insn;
 }
 
+static void
+push_if_stack(struct brw_compile *p, struct brw_instruction *inst)
+{  /* Record inst on the compile state's IF stack: store it at
+    * p->if_stack[p->if_stack_depth] and bump the depth.  Called by
+    * brw_IF() and gen6_IF() with the instruction they just emitted.
+    *
+    * The array is grown after the store, as soon as the new depth reaches
+    * the allocated size, so a free slot already exists for the next push.
+    *
+    * NOTE(review): the reralloc() result is stored unchecked, and doubling
+    * assumes if_stack_array_size starts out non-zero -- both depend on
+    * setup code outside this chunk; confirm.
+    */
+   p->if_stack[p->if_stack_depth] = inst;
+
+   p->if_stack_depth++;
+   if (p->if_stack_array_size <= p->if_stack_depth) {
+      p->if_stack_array_size *= 2;
+      p->if_stack = reralloc(p->mem_ctx, p->if_stack, struct brw_instruction *,
+                            p->if_stack_array_size);
+   }
+}
+
 /* EU takes the value from the flag register and pushes it onto some
  * sort of a stack (presumably merging with any flag value already on
  * the stack).  Within an if block, the flags at the top of the stack
@@ -782,33 +948,32 @@ struct brw_instruction *brw_JMPI(struct brw_compile *p,
  *
  * When the matching 'endif' instruction is reached, the flags are
  * popped off.  If the stack is now empty, normal execution resumes.
- *
- * No attempt is made to deal with stack overflow (14 elements?).
  */
-struct brw_instruction *brw_IF(struct brw_compile *p, GLuint execute_size)
+struct brw_instruction *
+brw_IF(struct brw_compile *p, GLuint execute_size)
 {
    struct intel_context *intel = &p->brw->intel;
    struct brw_instruction *insn;
 
-   if (p->single_program_flow) {
-      assert(execute_size == BRW_EXECUTE_1);
-
-      insn = next_insn(p, BRW_OPCODE_ADD);
-      insn->header.predicate_inverse = 1;
-   } else {
-      insn = next_insn(p, BRW_OPCODE_IF);
-   }
+   insn = next_insn(p, BRW_OPCODE_IF);
 
    /* Override the defaults for this instruction:
     */
    if (intel->gen < 6) {
-      brw_set_dest(insn, brw_ip_reg());
-      brw_set_src0(insn, brw_ip_reg());
-      brw_set_src1(insn, brw_imm_d(0x0));
+      brw_set_dest(p, insn, brw_ip_reg());
+      brw_set_src0(p, insn, brw_ip_reg());
+      brw_set_src1(p, insn, brw_imm_d(0x0));
+   } else if (intel->gen == 6) {
+      brw_set_dest(p, insn, brw_imm_w(0));
+      insn->bits1.branch_gen6.jump_count = 0;
+      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
    } else {
-      brw_set_dest(insn, brw_imm_w(0));
-      brw_set_src0(insn, brw_null_reg());
-      brw_set_src1(insn, brw_null_reg());
+      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+      brw_set_src1(p, insn, brw_imm_ud(0));
+      insn->bits3.break_cont.jip = 0;
+      insn->bits3.break_cont.uip = 0;
    }
 
    insn->header.execution_size = execute_size;
@@ -820,159 +985,283 @@ struct brw_instruction *brw_IF(struct brw_compile *p, GLuint execute_size)
 
    p->current->header.predicate_control = BRW_PREDICATE_NONE;
 
+   push_if_stack(p, insn);
    return insn;
 }
 
-
-struct brw_instruction *brw_ELSE(struct brw_compile *p, 
-                                struct brw_instruction *if_insn)
+/* This function is only used for gen6-style IF instructions with an
+ * embedded comparison (conditional modifier).  It is not used on gen7.
+ *
+ * Emits a BRW_OPCODE_IF whose destination is the word immediate 0 and
+ * whose sources are the two comparison operands src0 and src1;
+ * `conditional` is stored as the instruction's conditional modifier.  The
+ * execution size is 16 when p->compressed is set and 8 otherwise, and the
+ * gen6 jump count is written as 0 here.  The instruction is asserted to
+ * carry no compression control and no predicate, gets thread-switch
+ * control unless p->single_program_flow is set, is pushed on the IF stack,
+ * and is returned to the caller.
+ *
+ * NOTE(review): the zero jump count looks like a placeholder that a later
+ * patch step fills in for gen6 (patch_IF_ELSE writes the same field) --
+ * confirm against the ENDIF path, which is outside this chunk.
+ */
+struct brw_instruction *
+gen6_IF(struct brw_compile *p, uint32_t conditional,
+       struct brw_reg src0, struct brw_reg src1)
 {
-   struct intel_context *intel = &p->brw->intel;
    struct brw_instruction *insn;
-   GLuint br = 1;
 
-   /* jump count is for 64bit data chunk each, so one 128bit
-      instruction requires 2 chunks. */
-   if (intel->gen >= 5)
-      br = 2;
+   insn = next_insn(p, BRW_OPCODE_IF);
 
-   if (p->single_program_flow) {
-      insn = next_insn(p, BRW_OPCODE_ADD);
+   brw_set_dest(p, insn, brw_imm_w(0));
+   if (p->compressed) {
+      insn->header.execution_size = BRW_EXECUTE_16;
    } else {
-      insn = next_insn(p, BRW_OPCODE_ELSE);
+      insn->header.execution_size = BRW_EXECUTE_8;
    }
+   insn->bits1.branch_gen6.jump_count = 0;
+   brw_set_src0(p, insn, src0);
+   brw_set_src1(p, insn, src1);
 
-   if (intel->gen < 6) {
-      brw_set_dest(insn, brw_ip_reg());
-      brw_set_src0(insn, brw_ip_reg());
-      brw_set_src1(insn, brw_imm_d(0x0));
-   } else {
-      brw_set_dest(insn, brw_imm_w(0));
-      brw_set_src0(insn, brw_null_reg());
-      brw_set_src1(insn, brw_null_reg());
-   }
+   assert(insn->header.compression_control == BRW_COMPRESSION_NONE);
+   assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
+   insn->header.destreg__conditionalmod = conditional;
 
-   insn->header.compression_control = BRW_COMPRESSION_NONE;
-   insn->header.execution_size = if_insn->header.execution_size;
-   insn->header.mask_control = BRW_MASK_ENABLE;
    if (!p->single_program_flow)
        insn->header.thread_control = BRW_THREAD_SWITCH;
 
-   /* Patch the if instruction to point at this instruction.
+   push_if_stack(p, insn);
+   return insn;
+}
+
+/**
+ * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
+ *
+ * if_inst must be a BRW_OPCODE_IF with execution size BRW_EXECUTE_1;
+ * else_inst is either NULL or a BRW_OPCODE_ELSE.  Both are rewritten in
+ * place and no new instruction is emitted.  The jump target is taken
+ * relative to &p->store[p->nr_insn], i.e. the slot the next emitted
+ * instruction will occupy.
+ *
+ * The distances stored in bits3.ud are instruction counts multiplied by
+ * 16.  With an ELSE present, the IF's distance includes one extra
+ * instruction so that it lands just past the converted ELSE.
+ *
+ * NOTE(review): the factor 16 is presumably the size in bytes of one
+ * 128-bit instruction -- confirm.
+ */
+static void
+convert_IF_ELSE_to_ADD(struct brw_compile *p,
+                      struct brw_instruction *if_inst,
+                      struct brw_instruction *else_inst)
+{
+   /* The next instruction (where the ENDIF would be, if it existed) */
+   struct brw_instruction *next_inst = &p->store[p->nr_insn];
+
+   assert(p->single_program_flow);
+   assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
+   assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);
+   assert(if_inst->header.execution_size == BRW_EXECUTE_1);
+
+   /* Convert IF to an ADD instruction that moves the instruction pointer
+    * to the first instruction of the ELSE block.  If there is no ELSE
+    * block, point to where ENDIF would be.  Reverse the predicate.
+    *
+    * There's no need to execute an ENDIF since we don't need to do any
+    * stack operations, and if we're currently executing, we just want to
+    * continue normally.
     */
-   if (p->single_program_flow) {
-      assert(if_insn->header.opcode == BRW_OPCODE_ADD);
+   if_inst->header.opcode = BRW_OPCODE_ADD;
+   if_inst->header.predicate_inverse = 1;
 
-      if_insn->bits3.ud = (insn - if_insn + 1) * 16;
-   } else {
-      assert(if_insn->header.opcode == BRW_OPCODE_IF);
+   if (else_inst != NULL) {
+      /* Convert ELSE to an ADD instruction that points where the ENDIF
+       * would be.
+       */
+      else_inst->header.opcode = BRW_OPCODE_ADD;
 
-      if (intel->gen < 6) {
-        if_insn->bits3.if_else.jump_count = br * (insn - if_insn);
-        if_insn->bits3.if_else.pop_count = 0;
-        if_insn->bits3.if_else.pad0 = 0;
-      } else {
-        if_insn->bits1.branch_gen6.jump_count = br * (insn - if_insn + 1);
-      }
+      if_inst->bits3.ud = (else_inst - if_inst + 1) * 16;
+      else_inst->bits3.ud = (next_inst - else_inst) * 16;
+   } else {
+      if_inst->bits3.ud = (next_inst - if_inst) * 16;
    }
-
-   return insn;
 }
 
-void brw_ENDIF(struct brw_compile *p, 
-              struct brw_instruction *patch_insn)
+/**
+ * Patch IF and ELSE instructions with appropriate jump targets.
+ */
+static void
+patch_IF_ELSE(struct brw_compile *p,
+             struct brw_instruction *if_inst,
+             struct brw_instruction *else_inst,
+             struct brw_instruction *endif_inst)
 {
    struct intel_context *intel = &p->brw->intel;
-   GLuint br = 1;
 
-   if (intel->gen >= 5)
-      br = 2; 
-   if (p->single_program_flow) {
-      /* In single program flow mode, there's no need to execute an ENDIF,
-       * since we don't need to do any stack operations, and if we're executing
-       * currently, we want to just continue executing.
-       */
-      struct brw_instruction *next = &p->store[p->nr_insn];
+   assert(!p->single_program_flow);
+   assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
+   assert(endif_inst != NULL);
+   assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);
 
-      assert(patch_insn->header.opcode == BRW_OPCODE_ADD);
+   unsigned br = 1;
+   /* Jump count is for 64bit data chunk each, so one 128bit instruction
+    * requires 2 chunks.
+    */
+   if (intel->gen >= 5)
+      br = 2;
 
-      patch_insn->bits3.ud = (next - patch_insn) * 16;
-   } else {
-      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_ENDIF);
+   assert(endif_inst->header.opcode == BRW_OPCODE_ENDIF);
+   endif_inst->header.execution_size = if_inst->header.execution_size;
 
+   if (else_inst == NULL) {
+      /* Patch IF -> ENDIF */
       if (intel->gen < 6) {
-        brw_set_dest(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
-        brw_set_src0(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
-        brw_set_src1(insn, brw_imm_d(0x0));
+        /* Turn it into an IFF, which means no mask stack operations for
+         * all-false and jumping past the ENDIF.
+         */
+        if_inst->header.opcode = BRW_OPCODE_IFF;
+        if_inst->bits3.if_else.jump_count = br * (endif_inst - if_inst + 1);
+        if_inst->bits3.if_else.pop_count = 0;
+        if_inst->bits3.if_else.pad0 = 0;
+      } else if (intel->gen == 6) {
+        /* As of gen6, there is no IFF and IF must point to the ENDIF. */
+        if_inst->bits1.branch_gen6.jump_count = br * (endif_inst - if_inst);
       } else {
-        brw_set_dest(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_W));
-        brw_set_src0(insn, brw_null_reg());
-        brw_set_src1(insn, brw_null_reg());
+        if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
+        if_inst->bits3.break_cont.jip = br * (endif_inst - if_inst);
       }
+   } else {
+      else_inst->header.execution_size = if_inst->header.execution_size;
 
-      insn->header.compression_control = BRW_COMPRESSION_NONE;
-      insn->header.execution_size = patch_insn->header.execution_size;
-      insn->header.mask_control = BRW_MASK_ENABLE;
-      insn->header.thread_control = BRW_THREAD_SWITCH;
-
-      assert(patch_insn->bits3.if_else.jump_count == 0);
-
-      /* Patch the if or else instructions to point at this or the next
-       * instruction respectively.
-       */
-      if (patch_insn->header.opcode == BRW_OPCODE_IF) {
-        if (intel->gen < 6) {
-           /* Turn it into an IFF, which means no mask stack operations for
-            * all-false and jumping past the ENDIF.
-            */
-           patch_insn->header.opcode = BRW_OPCODE_IFF;
-           patch_insn->bits3.if_else.jump_count = br * (insn - patch_insn + 1);
-           patch_insn->bits3.if_else.pop_count = 0;
-           patch_insn->bits3.if_else.pad0 = 0;
-        } else {
-           /* As of gen6, there is no IFF and IF must point to the ENDIF. */
-           patch_insn->bits1.branch_gen6.jump_count = br * (insn - patch_insn);
-        }
-      } else {
-        assert(patch_insn->header.opcode == BRW_OPCODE_ELSE);
-        if (intel->gen < 6) {
-           /* BRW_OPCODE_ELSE pre-gen6 should point just past the
-            * matching ENDIF.
-            */
-           patch_insn->bits3.if_else.jump_count = br * (insn - patch_insn + 1);
-           patch_insn->bits3.if_else.pop_count = 1;
-           patch_insn->bits3.if_else.pad0 = 0;
-        } else {
-           /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
-           patch_insn->bits1.branch_gen6.jump_count = br * (insn - patch_insn);
-        }
+      /* Patch IF -> ELSE */
+      if (intel->gen < 6) {
+        if_inst->bits3.if_else.jump_count = br * (else_inst - if_inst);
+        if_inst->bits3.if_else.pop_count = 0;
+        if_inst->bits3.if_else.pad0 = 0;
+      } else if (intel->gen == 6) {
+        if_inst->bits1.branch_gen6.jump_count = br * (else_inst - if_inst + 1);
       }
 
-      /* Also pop item off the stack in the endif instruction:
-       */
+      /* Patch ELSE -> ENDIF */
       if (intel->gen < 6) {
-        insn->bits3.if_else.jump_count = 0;
-        insn->bits3.if_else.pop_count = 1;
-        insn->bits3.if_else.pad0 = 0;
+        /* BRW_OPCODE_ELSE pre-gen6 should point just past the
+         * matching ENDIF.
+         */
+        else_inst->bits3.if_else.jump_count = br*(endif_inst - else_inst + 1);
+        else_inst->bits3.if_else.pop_count = 1;
+        else_inst->bits3.if_else.pad0 = 0;
+      } else if (intel->gen == 6) {
+        /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
+        else_inst->bits1.branch_gen6.jump_count = br*(endif_inst - else_inst);
       } else {
-        insn->bits1.branch_gen6.jump_count = 2;
+        /* The IF instruction's JIP should point just past the ELSE */
+        if_inst->bits3.break_cont.jip = br * (else_inst - if_inst + 1);
+        /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
+        if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
+        else_inst->bits3.break_cont.jip = br * (endif_inst - else_inst);
       }
    }
 }
 
+void
+brw_ELSE(struct brw_compile *p)
+{
+   struct intel_context *intel = &p->brw->intel;
+   struct brw_instruction *insn;
+
+   insn = next_insn(p, BRW_OPCODE_ELSE);
+
+   if (intel->gen < 6) {
+      brw_set_dest(p, insn, brw_ip_reg());
+      brw_set_src0(p, insn, brw_ip_reg());
+      brw_set_src1(p, insn, brw_imm_d(0x0));
+   } else if (intel->gen == 6) {
+      brw_set_dest(p, insn, brw_imm_w(0));
+      insn->bits1.branch_gen6.jump_count = 0;
+      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+   } else {
+      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+      brw_set_src1(p, insn, brw_imm_ud(0));
+      insn->bits3.break_cont.jip = 0;
+      insn->bits3.break_cont.uip = 0;
+   }
+
+   insn->header.compression_control = BRW_COMPRESSION_NONE;
+   insn->header.mask_control = BRW_MASK_ENABLE;
+   if (!p->single_program_flow)
+       insn->header.thread_control = BRW_THREAD_SWITCH;
+
+   push_if_stack(p, insn);
+}
+
+void
+brw_ENDIF(struct brw_compile *p)
+{
+   struct intel_context *intel = &p->brw->intel;
+   struct brw_instruction *insn;
+   struct brw_instruction *else_inst = NULL;
+   struct brw_instruction *if_inst = NULL;
+
+   /* Pop the IF and (optional) ELSE instructions from the stack */
+   p->if_stack_depth--;
+   if (p->if_stack[p->if_stack_depth]->header.opcode == BRW_OPCODE_ELSE) {
+      else_inst = p->if_stack[p->if_stack_depth];
+      p->if_stack_depth--;
+   }
+   if_inst = p->if_stack[p->if_stack_depth];
+
+   if (p->single_program_flow) {
+      /* ENDIF is useless; don't bother emitting it. */
+      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
+      return;
+   }
+
+   insn = next_insn(p, BRW_OPCODE_ENDIF);
+
+   if (intel->gen < 6) {
+      brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
+      brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
+      brw_set_src1(p, insn, brw_imm_d(0x0));
+   } else if (intel->gen == 6) {
+      brw_set_dest(p, insn, brw_imm_w(0));
+      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+   } else {
+      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+      brw_set_src1(p, insn, brw_imm_ud(0));
+   }
+
+   insn->header.compression_control = BRW_COMPRESSION_NONE;
+   insn->header.mask_control = BRW_MASK_ENABLE;
+   insn->header.thread_control = BRW_THREAD_SWITCH;
+
+   /* Also pop item off the stack in the endif instruction: */
+   if (intel->gen < 6) {
+      insn->bits3.if_else.jump_count = 0;
+      insn->bits3.if_else.pop_count = 1;
+      insn->bits3.if_else.pad0 = 0;
+   } else if (intel->gen == 6) {
+      insn->bits1.branch_gen6.jump_count = 2;
+   } else {
+      insn->bits3.break_cont.jip = 2;
+   }
+   patch_IF_ELSE(p, if_inst, else_inst, insn);
+}
+
 struct brw_instruction *brw_BREAK(struct brw_compile *p, int pop_count)
 {
+   struct intel_context *intel = &p->brw->intel;
    struct brw_instruction *insn;
+
    insn = next_insn(p, BRW_OPCODE_BREAK);
-   brw_set_dest(insn, brw_ip_reg());
-   brw_set_src0(insn, brw_ip_reg());
-   brw_set_src1(insn, brw_imm_d(0x0));
+   if (intel->gen >= 6) {
+      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+      brw_set_src1(p, insn, brw_imm_d(0x0));
+   } else {
+      brw_set_dest(p, insn, brw_ip_reg());
+      brw_set_src0(p, insn, brw_ip_reg());
+      brw_set_src1(p, insn, brw_imm_d(0x0));
+      insn->bits3.if_else.pad0 = 0;
+      insn->bits3.if_else.pop_count = pop_count;
+   }
+   insn->header.compression_control = BRW_COMPRESSION_NONE;
+   insn->header.execution_size = BRW_EXECUTE_8;
+
+   return insn;
+}
+
+struct brw_instruction *gen6_CONT(struct brw_compile *p,
+                                 struct brw_instruction *do_insn)
+{
+   struct brw_instruction *insn;
+
+   insn = next_insn(p, BRW_OPCODE_CONTINUE);
+   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+   brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+   brw_set_dest(p, insn, brw_ip_reg());
+   brw_set_src0(p, insn, brw_ip_reg());
+   brw_set_src1(p, insn, brw_imm_d(0x0));
+
    insn->header.compression_control = BRW_COMPRESSION_NONE;
    insn->header.execution_size = BRW_EXECUTE_8;
-   /* insn->header.mask_control = BRW_MASK_DISABLE; */
-   insn->bits3.if_else.pad0 = 0;
-   insn->bits3.if_else.pop_count = pop_count;
    return insn;
 }
 
@@ -980,9 +1269,9 @@ struct brw_instruction *brw_CONT(struct brw_compile *p, int pop_count)
 {
    struct brw_instruction *insn;
    insn = next_insn(p, BRW_OPCODE_CONTINUE);
-   brw_set_dest(insn, brw_ip_reg());
-   brw_set_src0(insn, brw_ip_reg());
-   brw_set_src1(insn, brw_imm_d(0x0));
+   brw_set_dest(p, insn, brw_ip_reg());
+   brw_set_src0(p, insn, brw_ip_reg());
+   brw_set_src1(p, insn, brw_imm_d(0x0));
    insn->header.compression_control = BRW_COMPRESSION_NONE;
    insn->header.execution_size = BRW_EXECUTE_8;
    /* insn->header.mask_control = BRW_MASK_DISABLE; */
@@ -992,19 +1281,35 @@ struct brw_instruction *brw_CONT(struct brw_compile *p, int pop_count)
 }
 
 /* DO/WHILE loop:
+ *
+ * The DO/WHILE is just an unterminated loop -- break or continue are
+ * used for control within the loop.  We have a few ways they can be
+ * done.
+ *
+ * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
+ * jip and no DO instruction.
+ *
+ * For non-uniform control flow pre-gen6, there's a DO instruction to
+ * push the mask, and a WHILE to jump back, and BREAK to get out and
+ * pop the mask.
+ *
+ * For gen6 and later, there's no more mask stack, so no need for DO.  WHILE
+ * just points back to the first instruction of the loop.
  */
 struct brw_instruction *brw_DO(struct brw_compile *p, GLuint execute_size)
 {
-   if (p->single_program_flow) {
+   struct intel_context *intel = &p->brw->intel;
+
+   if (intel->gen >= 6 || p->single_program_flow) {
       return &p->store[p->nr_insn];
    } else {
       struct brw_instruction *insn = next_insn(p, BRW_OPCODE_DO);
 
       /* Override the defaults for this instruction:
        */
-      brw_set_dest(insn, brw_null_reg());
-      brw_set_src0(insn, brw_null_reg());
-      brw_set_src1(insn, brw_null_reg());
+      brw_set_dest(p, insn, brw_null_reg());
+      brw_set_src0(p, insn, brw_null_reg());
+      brw_set_src1(p, insn, brw_null_reg());
 
       insn->header.compression_control = BRW_COMPRESSION_NONE;
       insn->header.execution_size = execute_size;
@@ -1028,34 +1333,50 @@ struct brw_instruction *brw_WHILE(struct brw_compile *p,
    if (intel->gen >= 5)
       br = 2;
 
-   if (p->single_program_flow)
-      insn = next_insn(p, BRW_OPCODE_ADD);
-   else
+   if (intel->gen >= 7) {
       insn = next_insn(p, BRW_OPCODE_WHILE);
 
-   brw_set_dest(insn, brw_ip_reg());
-   brw_set_src0(insn, brw_ip_reg());
-   brw_set_src1(insn, brw_imm_d(0x0));
+      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+      brw_set_src1(p, insn, brw_imm_ud(0));
+      insn->bits3.break_cont.jip = br * (do_insn - insn);
 
-   insn->header.compression_control = BRW_COMPRESSION_NONE;
+      insn->header.execution_size = BRW_EXECUTE_8;
+   } else if (intel->gen == 6) {
+      insn = next_insn(p, BRW_OPCODE_WHILE);
 
-   if (p->single_program_flow) {
-      insn->header.execution_size = BRW_EXECUTE_1;
+      brw_set_dest(p, insn, brw_imm_w(0));
+      insn->bits1.branch_gen6.jump_count = br * (do_insn - insn);
+      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
 
-      insn->bits3.d = (do_insn - insn) * 16;
+      insn->header.execution_size = BRW_EXECUTE_8;
    } else {
-      insn->header.execution_size = do_insn->header.execution_size;
+      if (p->single_program_flow) {
+        insn = next_insn(p, BRW_OPCODE_ADD);
 
-      assert(do_insn->header.opcode == BRW_OPCODE_DO);
-      insn->bits3.if_else.jump_count = br * (do_insn - insn + 1);
-      insn->bits3.if_else.pop_count = 0;
-      insn->bits3.if_else.pad0 = 0;
-   }
+        brw_set_dest(p, insn, brw_ip_reg());
+        brw_set_src0(p, insn, brw_ip_reg());
+        brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
+        insn->header.execution_size = BRW_EXECUTE_1;
+      } else {
+        insn = next_insn(p, BRW_OPCODE_WHILE);
 
-/*    insn->header.mask_control = BRW_MASK_ENABLE; */
+        assert(do_insn->header.opcode == BRW_OPCODE_DO);
+
+        brw_set_dest(p, insn, brw_ip_reg());
+        brw_set_src0(p, insn, brw_ip_reg());
+        brw_set_src1(p, insn, brw_imm_d(0));
+
+        insn->header.execution_size = do_insn->header.execution_size;
+        insn->bits3.if_else.jump_count = br * (do_insn - insn + 1);
+        insn->bits3.if_else.pop_count = 0;
+        insn->bits3.if_else.pad0 = 0;
+      }
+   }
+   insn->header.compression_control = BRW_COMPRESSION_NONE;
+   p->current->header.predicate_control = BRW_PREDICATE_NONE;
 
-   /* insn->header.mask_control = BRW_MASK_DISABLE; */
-   p->current->header.predicate_control = BRW_PREDICATE_NONE;   
    return insn;
 }
 
@@ -1093,9 +1414,9 @@ void brw_CMP(struct brw_compile *p,
    struct brw_instruction *insn = next_insn(p, BRW_OPCODE_CMP);
 
    insn->header.destreg__conditionalmod = conditional;
-   brw_set_dest(insn, dest);
-   brw_set_src0(insn, src0);
-   brw_set_src1(insn, src1);
+   brw_set_dest(p, insn, dest);
+   brw_set_src0(p, insn, src0);
+   brw_set_src1(p, insn, src1);
 
 /*    guess_execution_size(insn, src0); */
 
@@ -1118,9 +1439,9 @@ void brw_WAIT (struct brw_compile *p)
    struct brw_instruction *insn = next_insn(p, BRW_OPCODE_WAIT);
    struct brw_reg src = brw_notification_1_reg();
 
-   brw_set_dest(insn, src);
-   brw_set_src0(insn, src);
-   brw_set_src1(insn, brw_null_reg());
+   brw_set_dest(p, insn, src);
+   brw_set_src0(p, insn, src);
+   brw_set_src1(p, insn, brw_null_reg());
    insn->header.execution_size = 0; /* must */
    insn->header.predicate_control = 0;
    insn->header.compression_control = 0;
@@ -1153,6 +1474,10 @@ void brw_math( struct brw_compile *p,
       assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
       assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
 
+      /* Source modifiers are ignored for extended math instructions. */
+      assert(!src.negate);
+      assert(!src.abs);
+
       if (function != BRW_MATH_FUNCTION_INT_DIV_QUOTIENT &&
          function != BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
         assert(src.type == BRW_REGISTER_TYPE_F);
@@ -1162,10 +1487,11 @@ void brw_math( struct brw_compile *p,
        * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
        */
       insn->header.destreg__conditionalmod = function;
+      insn->header.saturate = saturate;
 
-      brw_set_dest(insn, dest);
-      brw_set_src0(insn, src);
-      brw_set_src1(insn, brw_null_reg());
+      brw_set_dest(p, insn, dest);
+      brw_set_src0(p, insn, src);
+      brw_set_src1(p, insn, brw_null_reg());
    } else {
       struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
       GLuint msg_length = (function == BRW_MATH_FUNCTION_POW) ? 2 : 1;
@@ -1176,9 +1502,9 @@ void brw_math( struct brw_compile *p,
       insn->header.predicate_control = 0;
       insn->header.destreg__conditionalmod = msg_reg_nr;
 
-      brw_set_dest(insn, dest);
-      brw_set_src0(insn, src);
-      brw_set_math_message(p->brw,
+      brw_set_dest(p, insn, dest);
+      brw_set_src0(p, insn, src);
+      brw_set_math_message(p,
                           insn,
                           msg_length, response_length,
                           function,
@@ -1218,14 +1544,20 @@ void brw_math2(struct brw_compile *p,
       assert(src1.type == BRW_REGISTER_TYPE_F);
    }
 
+   /* Source modifiers are ignored for extended math instructions. */
+   assert(!src0.negate);
+   assert(!src0.abs);
+   assert(!src1.negate);
+   assert(!src1.abs);
+
    /* Math is the same ISA format as other opcodes, except that CondModifier
     * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
     */
    insn->header.destreg__conditionalmod = function;
 
-   brw_set_dest(insn, dest);
-   brw_set_src0(insn, src0);
-   brw_set_src1(insn, src1);
+   brw_set_dest(p, insn, dest);
+   brw_set_src0(p, insn, src0);
+   brw_set_src1(p, insn, src1);
 }
 
 /**
@@ -1252,10 +1584,15 @@ void brw_math_16( struct brw_compile *p,
        * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
        */
       insn->header.destreg__conditionalmod = function;
+      insn->header.saturate = saturate;
+
+      /* Source modifiers are ignored for extended math instructions. */
+      assert(!src.negate);
+      assert(!src.abs);
 
-      brw_set_dest(insn, dest);
-      brw_set_src0(insn, src);
-      brw_set_src1(insn, brw_null_reg());
+      brw_set_dest(p, insn, dest);
+      brw_set_src0(p, insn, src);
+      brw_set_src1(p, insn, brw_null_reg());
       return;
    }
 
@@ -1268,9 +1605,9 @@ void brw_math_16( struct brw_compile *p,
    insn = next_insn(p, BRW_OPCODE_SEND);
    insn->header.destreg__conditionalmod = msg_reg_nr;
 
-   brw_set_dest(insn, dest);
-   brw_set_src0(insn, src);
-   brw_set_math_message(p->brw,
+   brw_set_dest(p, insn, dest);
+   brw_set_src0(p, insn, src);
+   brw_set_math_message(p,
                        insn, 
                        msg_length, response_length, 
                        function,
@@ -1285,9 +1622,9 @@ void brw_math_16( struct brw_compile *p,
    insn->header.compression_control = BRW_COMPRESSION_2NDHALF;
    insn->header.destreg__conditionalmod = msg_reg_nr+1;
 
-   brw_set_dest(insn, offset(dest,1));
-   brw_set_src0(insn, src);
-   brw_set_math_message(p->brw, 
+   brw_set_dest(p, insn, offset(dest,1));
+   brw_set_src0(p, insn, src);
+   brw_set_math_message(p, 
                        insn, 
                        msg_length, response_length, 
                        function,
@@ -1301,38 +1638,69 @@ void brw_math_16( struct brw_compile *p,
 
 
 /**
- * Write block of 16 dwords/floats to the data port Render Cache scratch buffer.
- * Scratch offset should be a multiple of 64.
- * Used for register spilling.
+ * Write a block of OWORDs (half a GRF each) to the scratch buffer,
+ * using a constant offset per channel.
+ *
+ * The offset must be aligned to oword size (16 bytes).  Used for
+ * register spilling.
  */
-void brw_dp_WRITE_16( struct brw_compile *p,
-                     struct brw_reg src,
-                     GLuint scratch_offset )
+void brw_oword_block_write_scratch(struct brw_compile *p,
+                                  struct brw_reg mrf,
+                                  int num_regs,
+                                  GLuint offset)
 {
    struct intel_context *intel = &p->brw->intel;
-   GLuint msg_reg_nr = 1;
+   uint32_t msg_control, msg_type;
+   int mlen;
+
+   if (intel->gen >= 6)
+      offset /= 16;
+
+   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
+
+   if (num_regs == 1) {
+      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
+      mlen = 2;
+   } else {
+      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
+      mlen = 3;
+   }
+
+   /* Set up the message header.  This is g0, with g0.2 filled with
+    * the offset.  We don't want to leave our offset around in g0 or
+    * it'll screw up texture samples, so set it up inside the message
+    * reg.
+    */
    {
       brw_push_insn_state(p);
       brw_set_mask_control(p, BRW_MASK_DISABLE);
       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 
+      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
+
       /* set message header global offset field (reg 0, element 2) */
       brw_MOV(p,
-             retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_D),
-             brw_imm_d(scratch_offset));
+             retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
+                                 mrf.nr,
+                                 2), BRW_REGISTER_TYPE_UD),
+             brw_imm_ud(offset));
 
       brw_pop_insn_state(p);
    }
 
    {
-      GLuint msg_length = 3;
       struct brw_reg dest;
       struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
       int send_commit_msg;
+      struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
+                                        BRW_REGISTER_TYPE_UW);
 
-      insn->header.predicate_control = 0; /* XXX */
-      insn->header.compression_control = BRW_COMPRESSION_NONE; 
-      insn->header.destreg__conditionalmod = msg_reg_nr;
+      if (insn->header.compression_control != BRW_COMPRESSION_NONE) {
+        insn->header.compression_control = BRW_COMPRESSION_NONE;
+        src_header = vec16(src_header);
+      }
+      assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
+      insn->header.destreg__conditionalmod = mrf.nr;
 
       /* Until gen6, writes followed by reads from the same location
        * are not guaranteed to be ordered unless write_commit is set.
@@ -1348,19 +1716,28 @@ void brw_dp_WRITE_16( struct brw_compile *p,
         dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
         send_commit_msg = 0;
       } else {
-        dest = brw_uw16_grf(0, 0);
+        dest = src_header;
         send_commit_msg = 1;
       }
 
-      brw_set_dest(insn, dest);
-      brw_set_src0(insn, src);
+      brw_set_dest(p, insn, dest);
+      if (intel->gen >= 6) {
+        brw_set_src0(p, insn, mrf);
+      } else {
+        brw_set_src0(p, insn, brw_null_reg());
+      }
 
-      brw_set_dp_write_message(p->brw,
+      if (intel->gen >= 6)
+        msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
+      else
+        msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
+
+      brw_set_dp_write_message(p,
                               insn,
                               255, /* binding table index (255=stateless) */
-                              BRW_DATAPORT_OWORD_BLOCK_4_OWORDS, /* msg_control */
-                              BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE, /* msg_type */
-                              msg_length,
+                              msg_control,
+                              msg_type,
+                              mlen,
                               GL_TRUE, /* header_present */
                               0, /* pixel scoreboard */
                               send_commit_msg, /* response_length */
@@ -1371,109 +1748,178 @@ void brw_dp_WRITE_16( struct brw_compile *p,
 
 
 /**
- * Read block of 16 dwords/floats from the data port Render Cache scratch buffer.
- * Scratch offset should be a multiple of 64.
- * Used for register spilling.
+ * Read a block of owords (half a GRF each) from the scratch buffer
+ * using a constant index per channel.
+ *
+ * Offset must be aligned to oword size (16 bytes).  Used for register
+ * spilling.
  */
-void brw_dp_READ_16( struct brw_compile *p,
-                     struct brw_reg dest,
-                     GLuint scratch_offset )
+void
+brw_oword_block_read_scratch(struct brw_compile *p,
+                            struct brw_reg dest,
+                            struct brw_reg mrf,
+                            int num_regs,
+                            GLuint offset)
 {
-   GLuint msg_reg_nr = 1;
+   struct intel_context *intel = &p->brw->intel;
+   uint32_t msg_control;
+   int rlen;
+
+   if (intel->gen >= 6)
+      offset /= 16;
+
+   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
+   dest = retype(dest, BRW_REGISTER_TYPE_UW);
+
+   if (num_regs == 1) {
+      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
+      rlen = 1;
+   } else {
+      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
+      rlen = 2;
+   }
+
    {
       brw_push_insn_state(p);
       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
       brw_set_mask_control(p, BRW_MASK_DISABLE);
 
+      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
+
       /* set message header global offset field (reg 0, element 2) */
       brw_MOV(p,
-             retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_D),
-             brw_imm_d(scratch_offset));
+             retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
+                                 mrf.nr,
+                                 2), BRW_REGISTER_TYPE_UD),
+             brw_imm_ud(offset));
 
       brw_pop_insn_state(p);
    }
 
    {
       struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
-   
-      insn->header.predicate_control = 0; /* XXX */
-      insn->header.compression_control = BRW_COMPRESSION_NONE; 
-      insn->header.destreg__conditionalmod = msg_reg_nr;
-  
-      brw_set_dest(insn, dest);        /* UW? */
-      brw_set_src0(insn, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW));
 
-      brw_set_dp_read_message(p->brw,
+      assert(insn->header.predicate_control == 0);
+      insn->header.compression_control = BRW_COMPRESSION_NONE;
+      insn->header.destreg__conditionalmod = mrf.nr;
+
+      brw_set_dest(p, insn, dest);     /* UW? */
+      if (intel->gen >= 6) {
+        brw_set_src0(p, insn, mrf);
+      } else {
+        brw_set_src0(p, insn, brw_null_reg());
+      }
+
+      brw_set_dp_read_message(p,
                              insn,
                              255, /* binding table index (255=stateless) */
-                             BRW_DATAPORT_OWORD_BLOCK_4_OWORDS,
+                             msg_control,
                              BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
-                             1, /* target cache (render/scratch) */
+                             BRW_DATAPORT_READ_TARGET_RENDER_CACHE,
                              1, /* msg_length */
-                             2, /* response_length */
-                             0); /* eot */
+                             rlen);
    }
 }
 
-
 /**
  * Read a float[4] vector from the data port Data Cache (const buffer).
  * Location (in buffer) should be a multiple of 16.
  * Used for fetching shader constants.
- * If relAddr is true, we'll do an indirect fetch using the address register.
  */
-void brw_dp_READ_4( struct brw_compile *p,
-                    struct brw_reg dest,
-                    GLboolean relAddr,
-                    GLuint location,
-                    GLuint bind_table_index )
+void brw_oword_block_read(struct brw_compile *p,
+                         struct brw_reg dest,
+                         struct brw_reg mrf,
+                         uint32_t offset,
+                         uint32_t bind_table_index)
 {
-   /* XXX: relAddr not implemented */
-   GLuint msg_reg_nr = 1;
-   {
-      struct brw_reg b;
-      brw_push_insn_state(p);
-      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
-      brw_set_mask_control(p, BRW_MASK_DISABLE);
+   struct intel_context *intel = &p->brw->intel;
 
-   /* Setup MRF[1] with location/offset into const buffer */
-      b = brw_message_reg(msg_reg_nr);
-      b = retype(b, BRW_REGISTER_TYPE_UD);
-      /* XXX I think we're setting all the dwords of MRF[1] to 'location'.
-       * when the docs say only dword[2] should be set.  Hmmm.  But it works.
-       */
-      brw_MOV(p, b, brw_imm_ud(location));
-      brw_pop_insn_state(p);
-   }
+   /* On newer hardware, offset is in units of owords. */
+   if (intel->gen >= 6)
+      offset /= 16;
 
-   {
-      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
-   
-      insn->header.predicate_control = BRW_PREDICATE_NONE;
-      insn->header.compression_control = BRW_COMPRESSION_NONE; 
-      insn->header.destreg__conditionalmod = msg_reg_nr;
-      insn->header.mask_control = BRW_MASK_DISABLE;
-  
-      /* cast dest to a uword[8] vector */
-      dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
+   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
+
+   brw_push_insn_state(p);
+   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+   brw_set_mask_control(p, BRW_MASK_DISABLE);
 
-      brw_set_dest(insn, dest);
-      brw_set_src0(insn, brw_null_reg());
+   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
 
-      brw_set_dp_read_message(p->brw,
-                             insn,
-                             bind_table_index,
-                             0,  /* msg_control (0 means 1 Oword) */
-                             BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
-                             0, /* source cache = data cache */
-                             1, /* msg_length */
-                             1, /* response_length (1 Oword) */
-                             0); /* eot */
+   /* set message header global offset field (reg 0, element 2) */
+   brw_MOV(p,
+          retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
+                              mrf.nr,
+                              2), BRW_REGISTER_TYPE_UD),
+          brw_imm_ud(offset));
+
+   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
+   insn->header.destreg__conditionalmod = mrf.nr;
+
+   /* cast dest to a uword[8] vector */
+   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
+
+   brw_set_dest(p, insn, dest);
+   if (intel->gen >= 6) {
+      brw_set_src0(p, insn, mrf);
+   } else {
+      brw_set_src0(p, insn, brw_null_reg());
    }
+
+   brw_set_dp_read_message(p,
+                          insn,
+                          bind_table_index,
+                          BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
+                          BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
+                          BRW_DATAPORT_READ_TARGET_DATA_CACHE,
+                          1, /* msg_length */
+                          1); /* response_length (1 reg, 2 owords!) */
+
+   brw_pop_insn_state(p);
+}
+
+/**
+ * Read a set of dwords from the data port Data Cache (const buffer).
+ *
+ * Location (in buffer) appears as UD offsets in the register after
+ * the provided mrf header reg.
+ */
+void brw_dword_scattered_read(struct brw_compile *p,
+                             struct brw_reg dest,
+                             struct brw_reg mrf,
+                             uint32_t bind_table_index)
+{
+   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
+
+   brw_push_insn_state(p);
+   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+   brw_set_mask_control(p, BRW_MASK_DISABLE);
+   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
+   brw_pop_insn_state(p);
+
+   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
+   insn->header.destreg__conditionalmod = mrf.nr;
+
+   /* cast dest to a uword[8] vector */
+   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
+
+   brw_set_dest(p, insn, dest);
+   brw_set_src0(p, insn, brw_null_reg());
+
+   brw_set_dp_read_message(p,
+                          insn,
+                          bind_table_index,
+                          BRW_DATAPORT_DWORD_SCATTERED_BLOCK_8DWORDS,
+                          BRW_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ,
+                          BRW_DATAPORT_READ_TARGET_DATA_CACHE,
+                          2, /* msg_length */
+                          1); /* response_length */
 }
 
 
+
 /**
  * Read float[4] constant(s) from VS constant buffer.
  * For relative addressing, two float[4] constants will be read into 'dest'.
@@ -1484,29 +1930,22 @@ void brw_dp_READ_4_vs(struct brw_compile *p,
                       GLuint location,
                       GLuint bind_table_index)
 {
+   struct intel_context *intel = &p->brw->intel;
    struct brw_instruction *insn;
    GLuint msg_reg_nr = 1;
-   struct brw_reg b;
 
-   /*
-   printf("vs const read msg, location %u, msg_reg_nr %d\n",
-          location, msg_reg_nr);
-   */
+   if (intel->gen >= 6)
+      location /= 16;
 
    /* Setup MRF[1] with location/offset into const buffer */
    brw_push_insn_state(p);
+   brw_set_access_mode(p, BRW_ALIGN_1);
    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
    brw_set_mask_control(p, BRW_MASK_DISABLE);
    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-
-   /* XXX I think we're setting all the dwords of MRF[1] to 'location'.
-    * when the docs say only dword[2] should be set.  Hmmm.  But it works.
-    */
-   b = brw_message_reg(msg_reg_nr);
-   b = retype(b, BRW_REGISTER_TYPE_UD);
-   /*b = get_element_ud(b, 2);*/
-   brw_MOV(p, b, brw_imm_ud(location));
-
+   brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 2),
+                    BRW_REGISTER_TYPE_UD),
+          brw_imm_ud(location));
    brw_pop_insn_state(p);
 
    insn = next_insn(p, BRW_OPCODE_SEND);
@@ -1516,18 +1955,21 @@ void brw_dp_READ_4_vs(struct brw_compile *p,
    insn->header.destreg__conditionalmod = msg_reg_nr;
    insn->header.mask_control = BRW_MASK_DISABLE;
 
-   brw_set_dest(insn, dest);
-   brw_set_src0(insn, brw_null_reg());
+   brw_set_dest(p, insn, dest);
+   if (intel->gen >= 6) {
+      brw_set_src0(p, insn, brw_message_reg(msg_reg_nr));
+   } else {
+      brw_set_src0(p, insn, brw_null_reg());
+   }
 
-   brw_set_dp_read_message(p->brw,
+   brw_set_dp_read_message(p,
                           insn,
                           bind_table_index,
                           0,
                           BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
-                          0, /* source cache = data cache */
+                          BRW_DATAPORT_READ_TARGET_DATA_CACHE,
                           1, /* msg_length */
-                          1, /* response_length (1 Oword) */
-                          0); /* eot */
+                          1); /* response_length (1 Oword) */
 }
 
 /**
@@ -1541,10 +1983,12 @@ void brw_dp_READ_4_vs_relative(struct brw_compile *p,
                               GLuint bind_table_index)
 {
    struct intel_context *intel = &p->brw->intel;
+   struct brw_reg src = brw_vec8_grf(0, 0);
    int msg_type;
 
    /* Setup MRF[1] with offset into const buffer */
    brw_push_insn_state(p);
+   brw_set_access_mode(p, BRW_ALIGN_1);
    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
    brw_set_mask_control(p, BRW_MASK_DISABLE);
    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
@@ -1552,10 +1996,11 @@ void brw_dp_READ_4_vs_relative(struct brw_compile *p,
    /* M1.0 is block offset 0, M1.4 is block offset 1, all other
     * fields ignored.
     */
-   brw_ADD(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD),
+   brw_ADD(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_D),
           addr_reg, brw_imm_d(offset));
    brw_pop_insn_state(p);
 
+   gen6_resolve_implied_move(p, &src, 0);
    struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
 
    insn->header.predicate_control = BRW_PREDICATE_NONE;
@@ -1563,56 +2008,62 @@ void brw_dp_READ_4_vs_relative(struct brw_compile *p,
    insn->header.destreg__conditionalmod = 0;
    insn->header.mask_control = BRW_MASK_DISABLE;
 
-   brw_set_dest(insn, dest);
-   brw_set_src0(insn, brw_vec8_grf(0, 0));
+   brw_set_dest(p, insn, dest);
+   brw_set_src0(p, insn, src);
 
-   if (intel->gen == 6)
+   if (intel->gen >= 6)
       msg_type = GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
    else if (intel->gen == 5 || intel->is_g4x)
       msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
    else
       msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
 
-   brw_set_dp_read_message(p->brw,
+   brw_set_dp_read_message(p,
                           insn,
                           bind_table_index,
                           BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
                           msg_type,
-                          0, /* source cache = data cache */
+                          BRW_DATAPORT_READ_TARGET_DATA_CACHE,
                           2, /* msg_length */
-                          1, /* response_length */
-                          0); /* eot */
+                          1); /* response_length */
 }
 
 
 
 void brw_fb_WRITE(struct brw_compile *p,
                  int dispatch_width,
-                  struct brw_reg dest,
                   GLuint msg_reg_nr,
                   struct brw_reg src0,
                   GLuint binding_table_index,
                   GLuint msg_length,
                   GLuint response_length,
-                  GLboolean eot)
+                  GLboolean eot,
+                  GLboolean header_present)
 {
    struct intel_context *intel = &p->brw->intel;
    struct brw_instruction *insn;
    GLuint msg_control, msg_type;
-   GLboolean header_present = GL_TRUE;
+   struct brw_reg dest;
 
-   insn = next_insn(p, BRW_OPCODE_SEND);
-   insn->header.predicate_control = 0; /* XXX */
+   if (dispatch_width == 16)
+      dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
+   else
+      dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
+
+   if (intel->gen >= 6 && binding_table_index == 0) {
+      insn = next_insn(p, BRW_OPCODE_SENDC);
+   } else {
+      insn = next_insn(p, BRW_OPCODE_SEND);
+   }
+   /* The execution mask is ignored for render target writes. */
+   insn->header.predicate_control = 0;
    insn->header.compression_control = BRW_COMPRESSION_NONE;
 
    if (intel->gen >= 6) {
-      if (msg_length == 4)
-        header_present = GL_FALSE;
-
        /* headerless version, just submit color payload */
        src0 = brw_message_reg(msg_reg_nr);
 
-       msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE_GEN6;
+       msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
    } else {
       insn->header.destreg__conditionalmod = msg_reg_nr;
 
@@ -1624,9 +2075,9 @@ void brw_fb_WRITE(struct brw_compile *p,
    else
       msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
 
-   brw_set_dest(insn, dest);
-   brw_set_src0(insn, src0);
-   brw_set_dp_write_message(p->brw,
+   brw_set_dest(p, insn, dest);
+   brw_set_src0(p, insn, src0);
+   brw_set_dp_write_message(p,
                            insn,
                            binding_table_index,
                            msg_control,
@@ -1702,7 +2153,7 @@ void brw_SAMPLE(struct brw_compile *p,
 
         struct brw_reg m1 = brw_message_reg(msg_reg_nr);
 
-        guess_execution_size(p->current, dest);
+        guess_execution_size(p, p->current, dest);
         if (p->current->header.execution_size == BRW_EXECUTE_16)
            dispatch_16 = GL_TRUE;
 
@@ -1713,7 +2164,8 @@ void brw_SAMPLE(struct brw_compile *p,
         brw_set_compression_control(p, BRW_COMPRESSION_NONE);
         brw_set_mask_control(p, BRW_MASK_DISABLE);
 
-        brw_MOV(p, m1, brw_vec8_grf(0,0));      
+        brw_MOV(p, retype(m1, BRW_REGISTER_TYPE_UD),
+                retype(brw_vec8_grf(0,0), BRW_REGISTER_TYPE_UD));
         brw_MOV(p, get_element_ud(m1, 2), brw_imm_ud(newmask << 12)); 
 
         brw_pop_insn_state(p);
@@ -1733,17 +2185,7 @@ void brw_SAMPLE(struct brw_compile *p,
    {
       struct brw_instruction *insn;
    
-      /* Sandybridge doesn't have the implied move for SENDs,
-       * and the first message register index comes from src0.
-       */
-      if (intel->gen >= 6) {
-         brw_push_insn_state(p);
-         brw_set_mask_control( p, BRW_MASK_DISABLE );
-         /* m1 contains header? */
-         brw_MOV(p, brw_message_reg(msg_reg_nr), src0);
-         brw_pop_insn_state(p);
-         src0 = brw_message_reg(msg_reg_nr);
-      }
+      gen6_resolve_implied_move(p, &src0, msg_reg_nr);
 
       insn = next_insn(p, BRW_OPCODE_SEND);
       insn->header.predicate_control = 0; /* XXX */
@@ -1751,9 +2193,9 @@ void brw_SAMPLE(struct brw_compile *p,
       if (intel->gen < 6)
          insn->header.destreg__conditionalmod = msg_reg_nr;
 
-      brw_set_dest(insn, dest);
-      brw_set_src0(insn, src0);
-      brw_set_sampler_message(p->brw, insn,
+      brw_set_dest(p, insn, dest);
+      brw_set_src0(p, insn, src0);
+      brw_set_sampler_message(p, insn,
                              binding_table_index,
                              sampler,
                              msg_type,
@@ -1771,7 +2213,8 @@ void brw_SAMPLE(struct brw_compile *p,
        */
       brw_push_insn_state(p);
       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
-      brw_MOV(p, reg, reg);          
+      brw_MOV(p, retype(reg, BRW_REGISTER_TYPE_UD),
+             retype(reg, BRW_REGISTER_TYPE_UD));
       brw_pop_insn_state(p);
    }
 
@@ -1797,29 +2240,31 @@ void brw_urb_WRITE(struct brw_compile *p,
    struct intel_context *intel = &p->brw->intel;
    struct brw_instruction *insn;
 
-   /* Sandybridge doesn't have the implied move for SENDs,
-    * and the first message register index comes from src0.
-    */
-   if (intel->gen >= 6) {
+   gen6_resolve_implied_move(p, &src0, msg_reg_nr);
+
+   if (intel->gen == 7) {
+      /* Enable Channel Masks in the URB_WRITE_HWORD message header */
       brw_push_insn_state(p);
-      brw_set_mask_control( p, BRW_MASK_DISABLE );
-      brw_MOV(p, brw_message_reg(msg_reg_nr), src0);
+      brw_set_access_mode(p, BRW_ALIGN_1);
+      brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
+                      BRW_REGISTER_TYPE_UD),
+               retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
+               brw_imm_ud(0xff00));
       brw_pop_insn_state(p);
-      src0 = brw_message_reg(msg_reg_nr);
    }
 
    insn = next_insn(p, BRW_OPCODE_SEND);
 
    assert(msg_length < BRW_MAX_MRF);
 
-   brw_set_dest(insn, dest);
-   brw_set_src0(insn, src0);
-   brw_set_src1(insn, brw_imm_d(0));
+   brw_set_dest(p, insn, dest);
+   brw_set_src0(p, insn, src0);
+   brw_set_src1(p, insn, brw_imm_d(0));
 
    if (intel->gen < 6)
       insn->header.destreg__conditionalmod = msg_reg_nr;
 
-   brw_set_urb_message(p->brw,
+   brw_set_urb_message(p,
                       insn,
                       allocate,
                       used,
@@ -1831,6 +2276,84 @@ void brw_urb_WRITE(struct brw_compile *p,
                       swizzle);
 }
 
+/* Scan forward from the instruction after 'start' and return the index
+ * (into p->store) of the first ENDIF, ELSE or WHILE instruction found.
+ *
+ * The search stops at p->nr_insn.  If no such instruction exists the
+ * assert fires; when asserts are compiled out, start + 1 is returned.
+ */
+static int
+brw_find_next_block_end(struct brw_compile *p, int start)
+{
+   int ip;
+
+   for (ip = start + 1; ip < p->nr_insn; ip++) {
+      struct brw_instruction *insn = &p->store[ip];
+
+      switch (insn->header.opcode) {
+      case BRW_OPCODE_ENDIF:
+      case BRW_OPCODE_ELSE:
+      case BRW_OPCODE_WHILE:
+        return ip;
+      }
+   }
+   assert(!"not reached");
+   return start + 1;
+}
+
+/* There is no DO instruction on gen6, so to find the end of the loop
+ * we have to see if the loop is jumping back before our start
+ * instruction.
+ *
+ * Scans forward from the instruction after 'start' and returns the index
+ * (into p->store) of the first WHILE whose jump target, computed as
+ * ip + jump_count / br, lies at or before 'start'.  The jump count is read
+ * from bits1.branch_gen6.jump_count on gen6 and from bits3.break_cont.jip
+ * otherwise.
+ *
+ * NOTE(review): 'br' looks like the number of jump-count units per
+ * instruction -- confirm against the hardware documentation.
+ *
+ * If no matching WHILE exists the assert fires; when asserts are compiled
+ * out, start + 1 is returned.
+ */
+static int
+brw_find_loop_end(struct brw_compile *p, int start)
+{
+   struct intel_context *intel = &p->brw->intel;
+   int ip;
+   int br = 2;
+
+   for (ip = start + 1; ip < p->nr_insn; ip++) {
+      struct brw_instruction *insn = &p->store[ip];
+
+      if (insn->header.opcode == BRW_OPCODE_WHILE) {
+        int jip = intel->gen == 6 ? insn->bits1.branch_gen6.jump_count
+                                  : insn->bits3.break_cont.jip;
+        if (ip + jip / br <= start)
+           return ip;
+      }
+   }
+   assert(!"not reached");
+   return start + 1;
+}
+
+/* After program generation, go back and update the UIP and JIP of
+ * BREAK and CONT instructions to their correct locations.
+ *
+ * Does nothing when intel->gen < 6.  Otherwise walks every instruction in
+ * p->store and, for each BREAK or CONTINUE:
+ *   - sets JIP to br times the distance to the next ENDIF/ELSE/WHILE
+ *     (see brw_find_next_block_end()), and
+ *   - sets UIP to br times the distance to the enclosing loop's WHILE
+ *     (see brw_find_loop_end()); for BREAK on gen6 the distance is one
+ *     instruction further.
+ *
+ * Only the CONTINUE case asserts that the resulting UIP and JIP are
+ * non-zero.
+ *
+ * NOTE(review): 'br' looks like the number of jump-count units per
+ * instruction -- confirm against the hardware documentation.
+ */
+void
+brw_set_uip_jip(struct brw_compile *p)
+{
+   struct intel_context *intel = &p->brw->intel;
+   int ip;
+   int br = 2;
+
+   if (intel->gen < 6)
+      return;
+
+   for (ip = 0; ip < p->nr_insn; ip++) {
+      struct brw_instruction *insn = &p->store[ip];
+
+      switch (insn->header.opcode) {
+      case BRW_OPCODE_BREAK:
+        insn->bits3.break_cont.jip = br * (brw_find_next_block_end(p, ip) - ip);
+        /* Gen7 UIP points to WHILE; Gen6 points just after it */
+        insn->bits3.break_cont.uip =
+           br * (brw_find_loop_end(p, ip) - ip + (intel->gen == 6 ? 1 : 0));
+        break;
+      case BRW_OPCODE_CONTINUE:
+        insn->bits3.break_cont.jip = br * (brw_find_next_block_end(p, ip) - ip);
+        insn->bits3.break_cont.uip = br * (brw_find_loop_end(p, ip) - ip);
+
+        assert(insn->bits3.break_cont.uip != 0);
+        assert(insn->bits3.break_cont.jip != 0);
+        break;
+      }
+   }
+}
+
 void brw_ff_sync(struct brw_compile *p,
                   struct brw_reg dest,
                   GLuint msg_reg_nr,
@@ -1842,27 +2365,17 @@ void brw_ff_sync(struct brw_compile *p,
    struct intel_context *intel = &p->brw->intel;
    struct brw_instruction *insn;
 
-   /* Sandybridge doesn't have the implied move for SENDs,
-    * and the first message register index comes from src0.
-    */
-   if (intel->gen >= 6) {
-      brw_push_insn_state(p);
-      brw_set_mask_control( p, BRW_MASK_DISABLE );
-      brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
-             retype(src0, BRW_REGISTER_TYPE_UD));
-      brw_pop_insn_state(p);
-      src0 = brw_message_reg(msg_reg_nr);
-   }
+   gen6_resolve_implied_move(p, &src0, msg_reg_nr);
 
    insn = next_insn(p, BRW_OPCODE_SEND);
-   brw_set_dest(insn, dest);
-   brw_set_src0(insn, src0);
-   brw_set_src1(insn, brw_imm_d(0));
+   brw_set_dest(p, insn, dest);
+   brw_set_src0(p, insn, src0);
+   brw_set_src1(p, insn, brw_imm_d(0));
 
    if (intel->gen < 6)
        insn->header.destreg__conditionalmod = msg_reg_nr;
 
-   brw_set_ff_sync_message(p->brw,
+   brw_set_ff_sync_message(p,
                           insn,
                           allocate,
                           response_length,