i965/fs: add support for BRW_OPCODE_AVG in fs_generator
[mesa.git] / src / mesa / drivers / dri / i965 / brw_eu_emit.c
index b82a858d9a72fc96f77a45504d9f7204fbaaa4d0..8ab043fd63c8a63024a7d51d3445a369c634d765 100644 (file)
@@ -1,8 +1,8 @@
 /*
  Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ Intel funded Tungsten Graphics to
  develop this 3D driver.
+
  Permission is hereby granted, free of charge, to any person obtaining
  a copy of this software and associated documentation files (the
  "Software"), to deal in the Software without restriction, including
  distribute, sublicense, and/or sell copies of the Software, and to
  permit persons to whom the Software is furnished to do so, subject to
  the following conditions:
+
  The above copyright notice and this permission notice (including the
  next paragraph) shall be included in all copies or substantial
  portions of the Software.
+
  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
  **********************************************************************/
  /*
   * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
+  *   Keith Whitwell <keithw@vmware.com>
   */
-     
+
 
 #include "brw_context.h"
 #include "brw_defines.h"
@@ -61,10 +61,10 @@ static void guess_execution_size(struct brw_compile *p,
 void
 gen6_resolve_implied_move(struct brw_compile *p,
                          struct brw_reg *src,
-                         GLuint msg_reg_nr)
+                         unsigned msg_reg_nr)
 {
-   struct intel_context *intel = &p->brw->intel;
-   if (intel->gen < 6)
+   struct brw_context *brw = p->brw;
+   if (brw->gen < 6)
       return;
 
    if (src->file == BRW_MESSAGE_REGISTER_FILE)
@@ -84,7 +84,7 @@ gen6_resolve_implied_move(struct brw_compile *p,
 static void
 gen7_convert_mrf_to_grf(struct brw_compile *p, struct brw_reg *reg)
 {
-   /* From the BSpec / ISA Reference / send - [DevIVB+]:
+   /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
     * "The send with EOT should use register space R112-R127 for <src>. This is
     *  to enable loading of a new thread into the same slot while the message
     *  with EOT for current thread is pending dispatch."
@@ -92,13 +92,68 @@ gen7_convert_mrf_to_grf(struct brw_compile *p, struct brw_reg *reg)
     * Since we're pretending to have 16 MRFs anyway, we may as well use the
     * registers required for messages with EOT.
     */
-   struct intel_context *intel = &p->brw->intel;
-   if (intel->gen == 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
+   struct brw_context *brw = p->brw;
+   if (brw->gen == 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
       reg->file = BRW_GENERAL_REGISTER_FILE;
       reg->nr += GEN7_MRF_HACK_START;
    }
 }
 
+/**
+ * Convert a brw_reg_type enumeration value into the hardware representation.
+ * Immediates (file == BRW_IMMEDIATE_VALUE) and registers use separate tables;
+ * a -1 entry marks a type that has no encoding for that kind of operand.
+ */
+unsigned
+brw_reg_type_to_hw_type(const struct brw_context *brw,
+                        enum brw_reg_type type, unsigned file)
+{
+   if (file == BRW_IMMEDIATE_VALUE) {
+      static const int imm_hw_types[] = {
+         [BRW_REGISTER_TYPE_UD] = BRW_HW_REG_TYPE_UD,
+         [BRW_REGISTER_TYPE_D]  = BRW_HW_REG_TYPE_D,
+         [BRW_REGISTER_TYPE_UW] = BRW_HW_REG_TYPE_UW,
+         [BRW_REGISTER_TYPE_W]  = BRW_HW_REG_TYPE_W,
+         [BRW_REGISTER_TYPE_F]  = BRW_HW_REG_TYPE_F,
+         [BRW_REGISTER_TYPE_UB] = -1,
+         [BRW_REGISTER_TYPE_B]  = -1,
+         [BRW_REGISTER_TYPE_UV] = BRW_HW_REG_IMM_TYPE_UV,
+         [BRW_REGISTER_TYPE_VF] = BRW_HW_REG_IMM_TYPE_VF,
+         [BRW_REGISTER_TYPE_V]  = BRW_HW_REG_IMM_TYPE_V,
+         [BRW_REGISTER_TYPE_DF] = GEN8_HW_REG_IMM_TYPE_DF,
+         [BRW_REGISTER_TYPE_HF] = GEN8_HW_REG_IMM_TYPE_HF,
+         [BRW_REGISTER_TYPE_UQ] = GEN8_HW_REG_TYPE_UQ,
+         [BRW_REGISTER_TYPE_Q]  = GEN8_HW_REG_TYPE_Q,
+      };
+      assert(type < ARRAY_SIZE(imm_hw_types));
+      assert(imm_hw_types[type] != -1);
+      assert(brw->gen >= 8 || type < BRW_REGISTER_TYPE_DF);
+      return imm_hw_types[type];
+   } else {
+      /* Non-immediate registers */
+      static const int hw_types[] = {
+         [BRW_REGISTER_TYPE_UD] = BRW_HW_REG_TYPE_UD,
+         [BRW_REGISTER_TYPE_D]  = BRW_HW_REG_TYPE_D,
+         [BRW_REGISTER_TYPE_UW] = BRW_HW_REG_TYPE_UW,
+         [BRW_REGISTER_TYPE_W]  = BRW_HW_REG_TYPE_W,
+         [BRW_REGISTER_TYPE_UB] = BRW_HW_REG_NON_IMM_TYPE_UB,
+         [BRW_REGISTER_TYPE_B]  = BRW_HW_REG_NON_IMM_TYPE_B,
+         [BRW_REGISTER_TYPE_F]  = BRW_HW_REG_TYPE_F,
+         [BRW_REGISTER_TYPE_UV] = -1,
+         [BRW_REGISTER_TYPE_VF] = -1,
+         [BRW_REGISTER_TYPE_V]  = -1,
+         [BRW_REGISTER_TYPE_DF] = GEN7_HW_REG_NON_IMM_TYPE_DF,
+         [BRW_REGISTER_TYPE_HF] = GEN8_HW_REG_NON_IMM_TYPE_HF,
+         [BRW_REGISTER_TYPE_UQ] = GEN8_HW_REG_TYPE_UQ,
+         [BRW_REGISTER_TYPE_Q]  = GEN8_HW_REG_TYPE_Q,
+      };
+      assert(type < ARRAY_SIZE(hw_types));
+      assert(hw_types[type] != -1);
+      assert(brw->gen >= 7 || type < BRW_REGISTER_TYPE_DF);
+      assert(brw->gen >= 8 || type < BRW_REGISTER_TYPE_HF);
+      return hw_types[type];
+   }
+}
 
 void
 brw_set_dest(struct brw_compile *p, struct brw_instruction *insn,
@@ -111,10 +166,11 @@ brw_set_dest(struct brw_compile *p, struct brw_instruction *insn,
    gen7_convert_mrf_to_grf(p, &dest);
 
    insn->bits1.da1.dest_reg_file = dest.file;
-   insn->bits1.da1.dest_reg_type = dest.type;
+   insn->bits1.da1.dest_reg_type =
+      brw_reg_type_to_hw_type(p->brw, dest.type, dest.file);
    insn->bits1.da1.dest_address_mode = dest.address_mode;
 
-   if (dest.address_mode == BRW_ADDRESS_DIRECT) {   
+   if (dest.address_mode == BRW_ADDRESS_DIRECT) {
       insn->bits1.da1.dest_reg_nr = dest.nr;
 
       if (insn->header.access_mode == BRW_ALIGN_1) {
@@ -126,7 +182,14 @@ brw_set_dest(struct brw_compile *p, struct brw_instruction *insn,
       else {
         insn->bits1.da16.dest_subreg_nr = dest.subnr / 16;
         insn->bits1.da16.dest_writemask = dest.dw1.bits.writemask;
-        /* even ignored in da16, still need to set as '01' */
+         if (dest.file == BRW_GENERAL_REGISTER_FILE ||
+             dest.file == BRW_MESSAGE_REGISTER_FILE) {
+            assert(dest.dw1.bits.writemask != 0);
+         }
+        /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
+         *    Although Dst.HorzStride is a don't care for Align16, HW needs
+         *    this to be programmed as "01".
+         */
         insn->bits1.da16.dest_horiz_stride = 1;
       }
    }
@@ -236,28 +299,42 @@ void
 brw_set_src0(struct brw_compile *p, struct brw_instruction *insn,
             struct brw_reg reg)
 {
+   struct brw_context *brw = p->brw;
+
    if (reg.type != BRW_ARCHITECTURE_REGISTER_FILE)
       assert(reg.nr < 128);
 
    gen7_convert_mrf_to_grf(p, &reg);
 
+   if (brw->gen >= 6 && (insn->header.opcode == BRW_OPCODE_SEND ||
+                           insn->header.opcode == BRW_OPCODE_SENDC)) {
+      /* Any source modifiers or regions will be ignored, since this just
+       * identifies the MRF/GRF to start reading the message contents from.
+       * Check for some likely failures.
+       */
+      assert(!reg.negate);
+      assert(!reg.abs);
+      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
+   }
+
    validate_reg(insn, reg);
 
    insn->bits1.da1.src0_reg_file = reg.file;
-   insn->bits1.da1.src0_reg_type = reg.type;
+   insn->bits1.da1.src0_reg_type =
+      brw_reg_type_to_hw_type(brw, reg.type, reg.file);
    insn->bits2.da1.src0_abs = reg.abs;
    insn->bits2.da1.src0_negate = reg.negate;
    insn->bits2.da1.src0_address_mode = reg.address_mode;
 
    if (reg.file == BRW_IMMEDIATE_VALUE) {
       insn->bits3.ud = reg.dw1.ud;
-   
+
       /* Required to set some fields in src1 as well:
        */
       insn->bits1.da1.src1_reg_file = 0; /* arf */
-      insn->bits1.da1.src1_reg_type = reg.type;
+      insn->bits1.da1.src1_reg_type = insn->bits1.da1.src0_reg_type;
    }
-   else 
+   else
    {
       if (reg.address_mode == BRW_ADDRESS_DIRECT) {
         if (insn->header.access_mode == BRW_ALIGN_1) {
@@ -273,7 +350,7 @@ brw_set_src0(struct brw_compile *p, struct brw_instruction *insn,
         insn->bits2.ia1.src0_subreg_nr = reg.subnr;
 
         if (insn->header.access_mode == BRW_ALIGN_1) {
-           insn->bits2.ia1.src0_indirect_offset = reg.dw1.bits.indirect_offset; 
+           insn->bits2.ia1.src0_indirect_offset = reg.dw1.bits.indirect_offset;
         }
         else {
            insn->bits2.ia16.src0_subreg_nr = reg.dw1.bits.indirect_offset;
@@ -281,7 +358,7 @@ brw_set_src0(struct brw_compile *p, struct brw_instruction *insn,
       }
 
       if (insn->header.access_mode == BRW_ALIGN_1) {
-        if (reg.width == BRW_WIDTH_1 && 
+        if (reg.width == BRW_WIDTH_1 &&
             insn->header.execution_size == BRW_EXECUTE_1) {
            insn->bits2.da1.src0_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
            insn->bits2.da1.src0_width = BRW_WIDTH_1;
@@ -317,14 +394,16 @@ void brw_set_src1(struct brw_compile *p,
 {
    assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
 
-   assert(reg.nr < 128);
+   if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
+      assert(reg.nr < 128);
 
    gen7_convert_mrf_to_grf(p, &reg);
 
    validate_reg(insn, reg);
 
    insn->bits1.da1.src1_reg_file = reg.file;
-   insn->bits1.da1.src1_reg_type = reg.type;
+   insn->bits1.da1.src1_reg_type =
+      brw_reg_type_to_hw_type(p->brw, reg.type, reg.file);
    insn->bits3.da1.src1_abs = reg.abs;
    insn->bits3.da1.src1_negate = reg.negate;
 
@@ -352,7 +431,7 @@ void brw_set_src1(struct brw_compile *p,
       }
 
       if (insn->header.access_mode == BRW_ALIGN_1) {
-        if (reg.width == BRW_WIDTH_1 && 
+        if (reg.width == BRW_WIDTH_1 &&
             insn->header.execution_size == BRW_EXECUTE_1) {
            insn->bits3.da1.src1_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
            insn->bits3.da1.src1_width = BRW_WIDTH_1;
@@ -398,17 +477,17 @@ brw_set_message_descriptor(struct brw_compile *p,
                           bool header_present,
                           bool end_of_thread)
 {
-   struct intel_context *intel = &p->brw->intel;
+   struct brw_context *brw = p->brw;
 
    brw_set_src1(p, inst, brw_imm_d(0));
 
-   if (intel->gen >= 5) {
+   if (brw->gen >= 5) {
       inst->bits3.generic_gen5.header_present = header_present;
       inst->bits3.generic_gen5.response_length = response_length;
       inst->bits3.generic_gen5.msg_length = msg_length;
       inst->bits3.generic_gen5.end_of_thread = end_of_thread;
 
-      if (intel->gen >= 6) {
+      if (brw->gen >= 6) {
         /* On Gen6+ Message target/SFID goes in bits 27:24 of the header */
         inst->header.destreg__conditionalmod = sfid;
       } else {
@@ -426,13 +505,12 @@ brw_set_message_descriptor(struct brw_compile *p,
 
 static void brw_set_math_message( struct brw_compile *p,
                                  struct brw_instruction *insn,
-                                 GLuint function,
-                                 GLuint integer_type,
+                                 unsigned function,
+                                 unsigned integer_type,
                                  bool low_precision,
-                                 GLuint dataType )
+                                 unsigned dataType )
 {
    struct brw_context *brw = p->brw;
-   struct intel_context *intel = &brw->intel;
    unsigned msg_length;
    unsigned response_length;
 
@@ -463,7 +541,7 @@ static void brw_set_math_message( struct brw_compile *p,
 
    brw_set_message_descriptor(p, insn, BRW_SFID_MATH,
                              msg_length, response_length, false, false);
-   if (intel->gen == 5) {
+   if (brw->gen == 5) {
       insn->bits3.math_gen5.function = function;
       insn->bits3.math_gen5.int_type = integer_type;
       insn->bits3.math_gen5.precision = low_precision;
@@ -484,7 +562,7 @@ static void brw_set_math_message( struct brw_compile *p,
 static void brw_set_ff_sync_message(struct brw_compile *p,
                                    struct brw_instruction *insn,
                                    bool allocate,
-                                   GLuint response_length,
+                                   unsigned response_length,
                                    bool end_of_thread)
 {
    brw_set_message_descriptor(p, insn, BRW_SFID_URB,
@@ -499,69 +577,70 @@ static void brw_set_ff_sync_message(struct brw_compile *p,
 
 static void brw_set_urb_message( struct brw_compile *p,
                                 struct brw_instruction *insn,
-                                bool allocate,
-                                bool used,
-                                GLuint msg_length,
-                                GLuint response_length,
-                                bool end_of_thread,
-                                bool complete,
-                                GLuint offset,
-                                GLuint swizzle_control )
+                                 enum brw_urb_write_flags flags,
+                                unsigned msg_length,
+                                unsigned response_length,
+                                unsigned offset,
+                                unsigned swizzle_control )
 {
    struct brw_context *brw = p->brw;
-   struct intel_context *intel = &brw->intel;
 
    brw_set_message_descriptor(p, insn, BRW_SFID_URB,
-                             msg_length, response_length, true, end_of_thread);
-   if (intel->gen == 7) {
-      insn->bits3.urb_gen7.opcode = 0; /* URB_WRITE_HWORD */
+                             msg_length, response_length, true,
+                              flags & BRW_URB_WRITE_EOT);
+   if (brw->gen == 7) {
+      if (flags & BRW_URB_WRITE_OWORD) {
+         assert(msg_length == 2); /* header + one OWORD of data */
+         insn->bits3.urb_gen7.opcode = BRW_URB_OPCODE_WRITE_OWORD;
+      } else {
+         insn->bits3.urb_gen7.opcode = BRW_URB_OPCODE_WRITE_HWORD;
+      }
       insn->bits3.urb_gen7.offset = offset;
       assert(swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
       insn->bits3.urb_gen7.swizzle_control = swizzle_control;
-      /* per_slot_offset = 0 makes it ignore offsets in message header */
-      insn->bits3.urb_gen7.per_slot_offset = 0;
-      insn->bits3.urb_gen7.complete = complete;
-   } else if (intel->gen >= 5) {
+      insn->bits3.urb_gen7.per_slot_offset =
+         flags & BRW_URB_WRITE_PER_SLOT_OFFSET ? 1 : 0;
+      insn->bits3.urb_gen7.complete = flags & BRW_URB_WRITE_COMPLETE ? 1 : 0;
+   } else if (brw->gen >= 5) {
       insn->bits3.urb_gen5.opcode = 0; /* URB_WRITE */
       insn->bits3.urb_gen5.offset = offset;
       insn->bits3.urb_gen5.swizzle_control = swizzle_control;
-      insn->bits3.urb_gen5.allocate = allocate;
-      insn->bits3.urb_gen5.used = used;        /* ? */
-      insn->bits3.urb_gen5.complete = complete;
+      insn->bits3.urb_gen5.allocate = flags & BRW_URB_WRITE_ALLOCATE ? 1 : 0;
+      insn->bits3.urb_gen5.used = flags & BRW_URB_WRITE_UNUSED ? 0 : 1;
+      insn->bits3.urb_gen5.complete = flags & BRW_URB_WRITE_COMPLETE ? 1 : 0;
    } else {
       insn->bits3.urb.opcode = 0;      /* ? */
       insn->bits3.urb.offset = offset;
       insn->bits3.urb.swizzle_control = swizzle_control;
-      insn->bits3.urb.allocate = allocate;
-      insn->bits3.urb.used = used;     /* ? */
-      insn->bits3.urb.complete = complete;
+      insn->bits3.urb.allocate = flags & BRW_URB_WRITE_ALLOCATE ? 1 : 0;
+      insn->bits3.urb.used = flags & BRW_URB_WRITE_UNUSED ? 0 : 1;
+      insn->bits3.urb.complete = flags & BRW_URB_WRITE_COMPLETE ? 1 : 0;
    }
 }
 
 void
 brw_set_dp_write_message(struct brw_compile *p,
                         struct brw_instruction *insn,
-                        GLuint binding_table_index,
-                        GLuint msg_control,
-                        GLuint msg_type,
-                        GLuint msg_length,
+                        unsigned binding_table_index,
+                        unsigned msg_control,
+                        unsigned msg_type,
+                        unsigned msg_length,
                         bool header_present,
-                        GLuint last_render_target,
-                        GLuint response_length,
-                        GLuint end_of_thread,
-                        GLuint send_commit_msg)
+                        unsigned last_render_target,
+                        unsigned response_length,
+                        unsigned end_of_thread,
+                        unsigned send_commit_msg)
 {
    struct brw_context *brw = p->brw;
-   struct intel_context *intel = &brw->intel;
    unsigned sfid;
 
-   if (intel->gen >= 7) {
+   if (brw->gen >= 7) {
       /* Use the Render Cache for RT writes; otherwise use the Data Cache */
       if (msg_type == GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE)
         sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
       else
         sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
-   } else if (intel->gen == 6) {
+   } else if (brw->gen == 6) {
       /* Use the render cache for all write messages. */
       sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
    } else {
@@ -571,18 +650,18 @@ brw_set_dp_write_message(struct brw_compile *p,
    brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
                              header_present, end_of_thread);
 
-   if (intel->gen >= 7) {
+   if (brw->gen >= 7) {
       insn->bits3.gen7_dp.binding_table_index = binding_table_index;
       insn->bits3.gen7_dp.msg_control = msg_control;
       insn->bits3.gen7_dp.last_render_target = last_render_target;
       insn->bits3.gen7_dp.msg_type = msg_type;
-   } else if (intel->gen == 6) {
+   } else if (brw->gen == 6) {
       insn->bits3.gen6_dp.binding_table_index = binding_table_index;
       insn->bits3.gen6_dp.msg_control = msg_control;
       insn->bits3.gen6_dp.last_render_target = last_render_target;
       insn->bits3.gen6_dp.msg_type = msg_type;
       insn->bits3.gen6_dp.send_commit_msg = send_commit_msg;
-   } else if (intel->gen == 5) {
+   } else if (brw->gen == 5) {
       insn->bits3.dp_write_gen5.binding_table_index = binding_table_index;
       insn->bits3.dp_write_gen5.msg_control = msg_control;
       insn->bits3.dp_write_gen5.last_render_target = last_render_target;
@@ -600,20 +679,20 @@ brw_set_dp_write_message(struct brw_compile *p,
 void
 brw_set_dp_read_message(struct brw_compile *p,
                        struct brw_instruction *insn,
-                       GLuint binding_table_index,
-                       GLuint msg_control,
-                       GLuint msg_type,
-                       GLuint target_cache,
-                       GLuint msg_length,
-                       GLuint response_length)
+                       unsigned binding_table_index,
+                       unsigned msg_control,
+                       unsigned msg_type,
+                       unsigned target_cache,
+                       unsigned msg_length,
+                        bool header_present,
+                       unsigned response_length)
 {
    struct brw_context *brw = p->brw;
-   struct intel_context *intel = &brw->intel;
    unsigned sfid;
 
-   if (intel->gen >= 7) {
+   if (brw->gen >= 7) {
       sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
-   } else if (intel->gen == 6) {
+   } else if (brw->gen == 6) {
       if (target_cache == BRW_DATAPORT_READ_TARGET_RENDER_CACHE)
         sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
       else
@@ -623,25 +702,25 @@ brw_set_dp_read_message(struct brw_compile *p,
    }
 
    brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
-                             true, false);
+                             header_present, false);
 
-   if (intel->gen >= 7) {
+   if (brw->gen >= 7) {
       insn->bits3.gen7_dp.binding_table_index = binding_table_index;
       insn->bits3.gen7_dp.msg_control = msg_control;
       insn->bits3.gen7_dp.last_render_target = 0;
       insn->bits3.gen7_dp.msg_type = msg_type;
-   } else if (intel->gen == 6) {
+   } else if (brw->gen == 6) {
       insn->bits3.gen6_dp.binding_table_index = binding_table_index;
       insn->bits3.gen6_dp.msg_control = msg_control;
       insn->bits3.gen6_dp.last_render_target = 0;
       insn->bits3.gen6_dp.msg_type = msg_type;
       insn->bits3.gen6_dp.send_commit_msg = 0;
-   } else if (intel->gen == 5) {
+   } else if (brw->gen == 5) {
       insn->bits3.dp_read_gen5.binding_table_index = binding_table_index;
       insn->bits3.dp_read_gen5.msg_control = msg_control;
       insn->bits3.dp_read_gen5.msg_type = msg_type;
       insn->bits3.dp_read_gen5.target_cache = target_cache;
-   } else if (intel->is_g4x) {
+   } else if (brw->is_g4x) {
       insn->bits3.dp_read_g4x.binding_table_index = binding_table_index; /*0:7*/
       insn->bits3.dp_read_g4x.msg_control = msg_control;  /*8:10*/
       insn->bits3.dp_read_g4x.msg_type = msg_type;  /*11:13*/
@@ -657,32 +736,31 @@ brw_set_dp_read_message(struct brw_compile *p,
 void
 brw_set_sampler_message(struct brw_compile *p,
                         struct brw_instruction *insn,
-                        GLuint binding_table_index,
-                        GLuint sampler,
-                        GLuint msg_type,
-                        GLuint response_length,
-                        GLuint msg_length,
-                        GLuint header_present,
-                        GLuint simd_mode,
-                        GLuint return_format)
+                        unsigned binding_table_index,
+                        unsigned sampler,
+                        unsigned msg_type,
+                        unsigned response_length,
+                        unsigned msg_length,
+                        unsigned header_present,
+                        unsigned simd_mode,
+                        unsigned return_format)
 {
    struct brw_context *brw = p->brw;
-   struct intel_context *intel = &brw->intel;
 
    brw_set_message_descriptor(p, insn, BRW_SFID_SAMPLER, msg_length,
                              response_length, header_present, false);
 
-   if (intel->gen >= 7) {
+   if (brw->gen >= 7) {
       insn->bits3.sampler_gen7.binding_table_index = binding_table_index;
       insn->bits3.sampler_gen7.sampler = sampler;
       insn->bits3.sampler_gen7.msg_type = msg_type;
       insn->bits3.sampler_gen7.simd_mode = simd_mode;
-   } else if (intel->gen >= 5) {
+   } else if (brw->gen >= 5) {
       insn->bits3.sampler_gen5.binding_table_index = binding_table_index;
       insn->bits3.sampler_gen5.sampler = sampler;
       insn->bits3.sampler_gen5.msg_type = msg_type;
       insn->bits3.sampler_gen5.simd_mode = simd_mode;
-   } else if (intel->is_g4x) {
+   } else if (brw->is_g4x) {
       insn->bits3.sampler_g4x.binding_table_index = binding_table_index;
       insn->bits3.sampler_g4x.sampler = sampler;
       insn->bits3.sampler_g4x.msg_type = msg_type;
@@ -697,7 +775,7 @@ brw_set_sampler_message(struct brw_compile *p,
 
 #define next_insn brw_next_insn
 struct brw_instruction *
-brw_next_insn(struct brw_compile *p, GLuint opcode)
+brw_next_insn(struct brw_compile *p, unsigned opcode)
 {
    struct brw_instruction *insn;
 
@@ -711,10 +789,11 @@ brw_next_insn(struct brw_compile *p, GLuint opcode)
          assert(!"realloc eu store memeory failed");
    }
 
+   p->next_insn_offset += 16;
    insn = &p->store[p->nr_insn++];
    memcpy(insn, p->current, sizeof(*insn));
 
-   /* Reset this one-shot flag: 
+   /* Reset this one-shot flag:
     */
 
    if (p->current->header.destreg__conditionalmod) {
@@ -727,7 +806,7 @@ brw_next_insn(struct brw_compile *p, GLuint opcode)
 }
 
 static struct brw_instruction *brw_alu1( struct brw_compile *p,
-                                        GLuint opcode,
+                                        unsigned opcode,
                                         struct brw_reg dest,
                                         struct brw_reg src )
 {
@@ -738,12 +817,12 @@ static struct brw_instruction *brw_alu1( struct brw_compile *p,
 }
 
 static struct brw_instruction *brw_alu2(struct brw_compile *p,
-                                       GLuint opcode,
+                                       unsigned opcode,
                                        struct brw_reg dest,
                                        struct brw_reg src0,
                                        struct brw_reg src1 )
 {
-   struct brw_instruction *insn = next_insn(p, opcode);   
+   struct brw_instruction *insn = next_insn(p, opcode);
    brw_set_dest(p, insn, dest);
    brw_set_src0(p, insn, src0);
    brw_set_src1(p, insn, src1);
@@ -762,12 +841,13 @@ get_3src_subreg_nr(struct brw_reg reg)
 }
 
 static struct brw_instruction *brw_alu3(struct brw_compile *p,
-                                       GLuint opcode,
+                                       unsigned opcode,
                                        struct brw_reg dest,
                                        struct brw_reg src0,
                                        struct brw_reg src1,
                                        struct brw_reg src2)
 {
+   struct brw_context *brw = p->brw;
    struct brw_instruction *insn = next_insn(p, opcode);
 
    gen7_convert_mrf_to_grf(p, &dest);
@@ -778,7 +858,9 @@ static struct brw_instruction *brw_alu3(struct brw_compile *p,
          dest.file == BRW_MESSAGE_REGISTER_FILE);
    assert(dest.nr < 128);
    assert(dest.address_mode == BRW_ADDRESS_DIRECT);
-   assert(dest.type = BRW_REGISTER_TYPE_F);
+   assert(dest.type == BRW_REGISTER_TYPE_F ||
+          dest.type == BRW_REGISTER_TYPE_D ||
+          dest.type == BRW_REGISTER_TYPE_UD);
    insn->bits1.da3src.dest_reg_file = (dest.file == BRW_MESSAGE_REGISTER_FILE);
    insn->bits1.da3src.dest_reg_nr = dest.nr;
    insn->bits1.da3src.dest_subreg_nr = dest.subnr / 16;
@@ -788,7 +870,6 @@ static struct brw_instruction *brw_alu3(struct brw_compile *p,
    assert(src0.file == BRW_GENERAL_REGISTER_FILE);
    assert(src0.address_mode == BRW_ADDRESS_DIRECT);
    assert(src0.nr < 128);
-   assert(src0.type == BRW_REGISTER_TYPE_F);
    insn->bits2.da3src.src0_swizzle = src0.dw1.bits.swizzle;
    insn->bits2.da3src.src0_subreg_nr = get_3src_subreg_nr(src0);
    insn->bits2.da3src.src0_reg_nr = src0.nr;
@@ -799,7 +880,6 @@ static struct brw_instruction *brw_alu3(struct brw_compile *p,
    assert(src1.file == BRW_GENERAL_REGISTER_FILE);
    assert(src1.address_mode == BRW_ADDRESS_DIRECT);
    assert(src1.nr < 128);
-   assert(src1.type == BRW_REGISTER_TYPE_F);
    insn->bits2.da3src.src1_swizzle = src1.dw1.bits.swizzle;
    insn->bits2.da3src.src1_subreg_nr_low = get_3src_subreg_nr(src1) & 0x3;
    insn->bits3.da3src.src1_subreg_nr_high = get_3src_subreg_nr(src1) >> 2;
@@ -811,7 +891,6 @@ static struct brw_instruction *brw_alu3(struct brw_compile *p,
    assert(src2.file == BRW_GENERAL_REGISTER_FILE);
    assert(src2.address_mode == BRW_ADDRESS_DIRECT);
    assert(src2.nr < 128);
-   assert(src2.type == BRW_REGISTER_TYPE_F);
    insn->bits3.da3src.src2_swizzle = src2.dw1.bits.swizzle;
    insn->bits3.da3src.src2_subreg_nr = get_3src_subreg_nr(src2);
    insn->bits3.da3src.src2_rep_ctrl = src2.vstride == BRW_VERTICAL_STRIDE_0;
@@ -819,6 +898,29 @@ static struct brw_instruction *brw_alu3(struct brw_compile *p,
    insn->bits1.da3src.src2_abs = src2.abs;
    insn->bits1.da3src.src2_negate = src2.negate;
 
+   if (brw->gen >= 7) {
+      /* Set both the source and destination types based on dest.type,
+       * ignoring the source register types.  The MAD and LRP emitters ensure
+       * that all four types are float.  The BFE and BFI2 emitters, however,
+       * may send us mixed D and UD types and want us to ignore that and use
+       * the destination type.
+       */
+      switch (dest.type) {
+      case BRW_REGISTER_TYPE_F:
+         insn->bits1.da3src.src_type = BRW_3SRC_TYPE_F;
+         insn->bits1.da3src.dst_type = BRW_3SRC_TYPE_F;
+         break;
+      case BRW_REGISTER_TYPE_D:
+         insn->bits1.da3src.src_type = BRW_3SRC_TYPE_D;
+         insn->bits1.da3src.dst_type = BRW_3SRC_TYPE_D;
+         break;
+      case BRW_REGISTER_TYPE_UD:
+         insn->bits1.da3src.src_type = BRW_3SRC_TYPE_UD;
+         insn->bits1.da3src.dst_type = BRW_3SRC_TYPE_UD;
+         break;
+      }
+   }
+
    return insn;
 }
 
@@ -853,6 +955,20 @@ struct brw_instruction *brw_##OP(struct brw_compile *p,    \
    return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);        \
 }
 
+#define ALU3F(OP)                                               \
+struct brw_instruction *brw_##OP(struct brw_compile *p,         \
+                                 struct brw_reg dest,           \
+                                 struct brw_reg src0,           \
+                                 struct brw_reg src1,           \
+                                 struct brw_reg src2)           \
+{                                                               \
+   assert(dest.type == BRW_REGISTER_TYPE_F);                    \
+   assert(src0.type == BRW_REGISTER_TYPE_F);                    \
+   assert(src1.type == BRW_REGISTER_TYPE_F);                    \
+   assert(src2.type == BRW_REGISTER_TYPE_F);                    \
+   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
+}
+
 /* Rounding operations (other than RNDD) require two instructions - the first
  * stores a rounded value (possibly the wrong way) in the dest register, but
  * also sets a per-channel "increment bit" in the flag register.  A predicated
@@ -870,7 +986,7 @@ void brw_##OP(struct brw_compile *p,                                              \
    brw_set_dest(p, rnd, dest);                                               \
    brw_set_src0(p, rnd, src);                                                \
                                                                              \
-   if (p->brw->intel.gen < 6) {                                                      \
+   if (p->brw->gen < 6) {                                                    \
       /* turn on round-increments */                                         \
       rnd->header.destreg__conditionalmod = BRW_CONDITIONAL_R;               \
       add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));                         \
@@ -887,9 +1003,9 @@ ALU2(OR)
 ALU2(XOR)
 ALU2(SHR)
 ALU2(SHL)
-ALU2(RSR)
-ALU2(RSL)
 ALU2(ASR)
+ALU1(F32TO16)
+ALU1(F16TO32)
 ALU1(FRC)
 ALU1(RNDD)
 ALU2(MAC)
@@ -901,7 +1017,17 @@ ALU2(DP3)
 ALU2(DP2)
 ALU2(LINE)
 ALU2(PLN)
-ALU3(MAD)
+ALU3F(MAD)
+ALU3F(LRP)
+ALU1(BFREV)
+ALU3(BFE)
+ALU2(BFI1)
+ALU3(BFI2)
+ALU1(FBH)
+ALU1(FBL)
+ALU1(CBIT)
+ALU2(ADDC)
+ALU2(SUBB)
 
 ROUND(RNDZ)
 ROUND(RNDE)
@@ -990,7 +1116,7 @@ struct brw_instruction *brw_MUL(struct brw_compile *p,
 
 void brw_NOP(struct brw_compile *p)
 {
-   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_NOP);   
+   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_NOP);
    brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
    brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
    brw_set_src1(p, insn, brw_imm_ud(0x0));
@@ -1004,7 +1130,7 @@ void brw_NOP(struct brw_compile *p)
  * Comparisons, if/else/endif
  */
 
-struct brw_instruction *brw_JMPI(struct brw_compile *p, 
+struct brw_instruction *brw_JMPI(struct brw_compile *p,
                                  struct brw_reg dest,
                                  struct brw_reg src0,
                                  struct brw_reg src1)
@@ -1076,20 +1202,20 @@ get_inner_do_insn(struct brw_compile *p)
  * popped off.  If the stack is now empty, normal execution resumes.
  */
 struct brw_instruction *
-brw_IF(struct brw_compile *p, GLuint execute_size)
+brw_IF(struct brw_compile *p, unsigned execute_size)
 {
-   struct intel_context *intel = &p->brw->intel;
+   struct brw_context *brw = p->brw;
    struct brw_instruction *insn;
 
    insn = next_insn(p, BRW_OPCODE_IF);
 
    /* Override the defaults for this instruction:
     */
-   if (intel->gen < 6) {
+   if (brw->gen < 6) {
       brw_set_dest(p, insn, brw_ip_reg());
       brw_set_src0(p, insn, brw_ip_reg());
       brw_set_src1(p, insn, brw_imm_d(0x0));
-   } else if (intel->gen == 6) {
+   } else if (brw->gen == 6) {
       brw_set_dest(p, insn, brw_imm_w(0));
       insn->bits1.branch_gen6.jump_count = 0;
       brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
@@ -1197,7 +1323,7 @@ patch_IF_ELSE(struct brw_compile *p,
              struct brw_instruction *else_inst,
              struct brw_instruction *endif_inst)
 {
-   struct intel_context *intel = &p->brw->intel;
+   struct brw_context *brw = p->brw;
 
    /* We shouldn't be patching IF and ELSE instructions in single program flow
     * mode when gen < 6, because in single program flow mode on those
@@ -1211,7 +1337,7 @@ patch_IF_ELSE(struct brw_compile *p,
     * instructions to conditional ADDs.  So we do patch IF and ELSE
     * instructions in single program flow mode on those platforms.
     */
-   if (intel->gen < 6)
+   if (brw->gen < 6)
       assert(!p->single_program_flow);
 
    assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
@@ -1222,7 +1348,7 @@ patch_IF_ELSE(struct brw_compile *p,
    /* Jump count is for 64bit data chunk each, so one 128bit instruction
     * requires 2 chunks.
     */
-   if (intel->gen >= 5)
+   if (brw->gen >= 5)
       br = 2;
 
    assert(endif_inst->header.opcode == BRW_OPCODE_ENDIF);
@@ -1230,7 +1356,7 @@ patch_IF_ELSE(struct brw_compile *p,
 
    if (else_inst == NULL) {
       /* Patch IF -> ENDIF */
-      if (intel->gen < 6) {
+      if (brw->gen < 6) {
         /* Turn it into an IFF, which means no mask stack operations for
          * all-false and jumping past the ENDIF.
          */
@@ -1238,7 +1364,7 @@ patch_IF_ELSE(struct brw_compile *p,
         if_inst->bits3.if_else.jump_count = br * (endif_inst - if_inst + 1);
         if_inst->bits3.if_else.pop_count = 0;
         if_inst->bits3.if_else.pad0 = 0;
-      } else if (intel->gen == 6) {
+      } else if (brw->gen == 6) {
         /* As of gen6, there is no IFF and IF must point to the ENDIF. */
         if_inst->bits1.branch_gen6.jump_count = br * (endif_inst - if_inst);
       } else {
@@ -1249,23 +1375,23 @@ patch_IF_ELSE(struct brw_compile *p,
       else_inst->header.execution_size = if_inst->header.execution_size;
 
       /* Patch IF -> ELSE */
-      if (intel->gen < 6) {
+      if (brw->gen < 6) {
         if_inst->bits3.if_else.jump_count = br * (else_inst - if_inst);
         if_inst->bits3.if_else.pop_count = 0;
         if_inst->bits3.if_else.pad0 = 0;
-      } else if (intel->gen == 6) {
+      } else if (brw->gen == 6) {
         if_inst->bits1.branch_gen6.jump_count = br * (else_inst - if_inst + 1);
       }
 
       /* Patch ELSE -> ENDIF */
-      if (intel->gen < 6) {
+      if (brw->gen < 6) {
         /* BRW_OPCODE_ELSE pre-gen6 should point just past the
          * matching ENDIF.
          */
         else_inst->bits3.if_else.jump_count = br*(endif_inst - else_inst + 1);
         else_inst->bits3.if_else.pop_count = 1;
         else_inst->bits3.if_else.pad0 = 0;
-      } else if (intel->gen == 6) {
+      } else if (brw->gen == 6) {
         /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
         else_inst->bits1.branch_gen6.jump_count = br*(endif_inst - else_inst);
       } else {
@@ -1281,16 +1407,16 @@ patch_IF_ELSE(struct brw_compile *p,
 void
 brw_ELSE(struct brw_compile *p)
 {
-   struct intel_context *intel = &p->brw->intel;
+   struct brw_context *brw = p->brw;
    struct brw_instruction *insn;
 
    insn = next_insn(p, BRW_OPCODE_ELSE);
 
-   if (intel->gen < 6) {
+   if (brw->gen < 6) {
       brw_set_dest(p, insn, brw_ip_reg());
       brw_set_src0(p, insn, brw_ip_reg());
       brw_set_src1(p, insn, brw_imm_d(0x0));
-   } else if (intel->gen == 6) {
+   } else if (brw->gen == 6) {
       brw_set_dest(p, insn, brw_imm_w(0));
       insn->bits1.branch_gen6.jump_count = 0;
       brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
@@ -1314,7 +1440,7 @@ brw_ELSE(struct brw_compile *p)
 void
 brw_ENDIF(struct brw_compile *p)
 {
-   struct intel_context *intel = &p->brw->intel;
+   struct brw_context *brw = p->brw;
    struct brw_instruction *insn = NULL;
    struct brw_instruction *else_inst = NULL;
    struct brw_instruction *if_inst = NULL;
@@ -1333,7 +1459,7 @@ brw_ENDIF(struct brw_compile *p)
     * instructions to conditional ADDs.  So we only do this trick on Gen4 and
     * Gen5.
     */
-   if (intel->gen < 6 && p->single_program_flow)
+   if (brw->gen < 6 && p->single_program_flow)
       emit_endif = false;
 
    /*
@@ -1359,11 +1485,11 @@ brw_ENDIF(struct brw_compile *p)
       return;
    }
 
-   if (intel->gen < 6) {
+   if (brw->gen < 6) {
       brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
       brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
       brw_set_src1(p, insn, brw_imm_d(0x0));
-   } else if (intel->gen == 6) {
+   } else if (brw->gen == 6) {
       brw_set_dest(p, insn, brw_imm_w(0));
       brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
       brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
@@ -1378,11 +1504,11 @@ brw_ENDIF(struct brw_compile *p)
    insn->header.thread_control = BRW_THREAD_SWITCH;
 
    /* Also pop item off the stack in the endif instruction: */
-   if (intel->gen < 6) {
+   if (brw->gen < 6) {
       insn->bits3.if_else.jump_count = 0;
       insn->bits3.if_else.pop_count = 1;
       insn->bits3.if_else.pad0 = 0;
-   } else if (intel->gen == 6) {
+   } else if (brw->gen == 6) {
       insn->bits1.branch_gen6.jump_count = 2;
    } else {
       insn->bits3.break_cont.jip = 2;
@@ -1392,11 +1518,11 @@ brw_ENDIF(struct brw_compile *p)
 
 struct brw_instruction *brw_BREAK(struct brw_compile *p)
 {
-   struct intel_context *intel = &p->brw->intel;
+   struct brw_context *brw = p->brw;
    struct brw_instruction *insn;
 
    insn = next_insn(p, BRW_OPCODE_BREAK);
-   if (intel->gen >= 6) {
+   if (brw->gen >= 6) {
       brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
       brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
       brw_set_src1(p, insn, brw_imm_d(0x0));
@@ -1444,6 +1570,24 @@ struct brw_instruction *brw_CONT(struct brw_compile *p)
    return insn;
 }
 
+/**
+ * Emit a HALT instruction and return it.
+ *
+ * Destination and src0 are the null register retyped to D, and src1 is a
+ * zero immediate that stands in for the UIP/JIP jump fields; those are not
+ * resolved here and have to be patched in afterwards.
+ *
+ * The execution size follows p->compressed: 16 channels when it is set,
+ * otherwise compression is switched off and 8 channels are used.
+ *
+ * NOTE(review): the name suggests Gen6+ only, but no generation check is
+ * made in this function -- confirm callers guarantee it.
+ */
+struct brw_instruction *gen6_HALT(struct brw_compile *p)
+{
+   struct brw_instruction *insn;
+
+   insn = next_insn(p, BRW_OPCODE_HALT);
+   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+   brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+   brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
+
+   /* Pick the channel count to match the current compression state. */
+   if (p->compressed) {
+      insn->header.execution_size = BRW_EXECUTE_16;
+   } else {
+      insn->header.compression_control = BRW_COMPRESSION_NONE;
+      insn->header.execution_size = BRW_EXECUTE_8;
+   }
+   return insn;
+}
+
 /* DO/WHILE loop:
  *
  * The DO/WHILE is just an unterminated loop -- break or continue are
@@ -1460,11 +1604,11 @@ struct brw_instruction *brw_CONT(struct brw_compile *p)
  * For gen6, there's no more mask stack, so no need for DO.  WHILE
  * just points back to the first instruction of the loop.
  */
-struct brw_instruction *brw_DO(struct brw_compile *p, GLuint execute_size)
+struct brw_instruction *brw_DO(struct brw_compile *p, unsigned execute_size)
 {
-   struct intel_context *intel = &p->brw->intel;
+   struct brw_context *brw = p->brw;
 
-   if (intel->gen >= 6 || p->single_program_flow) {
+   if (brw->gen >= 6 || p->single_program_flow) {
       push_loop_stack(p, &p->store[p->nr_insn]);
       return &p->store[p->nr_insn];
    } else {
@@ -1498,10 +1642,10 @@ struct brw_instruction *brw_DO(struct brw_compile *p, GLuint execute_size)
 static void
 brw_patch_break_cont(struct brw_compile *p, struct brw_instruction *while_inst)
 {
-   struct intel_context *intel = &p->brw->intel;
+   struct brw_context *brw = p->brw;
    struct brw_instruction *do_inst = get_inner_do_insn(p);
    struct brw_instruction *inst;
-   int br = (intel->gen == 5) ? 2 : 1;
+   int br = (brw->gen == 5) ? 2 : 1;
 
    for (inst = while_inst - 1; inst != do_inst; inst--) {
       /* If the jump count is != 0, that means that this instruction has already
@@ -1520,14 +1664,14 @@ brw_patch_break_cont(struct brw_compile *p, struct brw_instruction *while_inst)
 
 struct brw_instruction *brw_WHILE(struct brw_compile *p)
 {
-   struct intel_context *intel = &p->brw->intel;
+   struct brw_context *brw = p->brw;
    struct brw_instruction *insn, *do_insn;
-   GLuint br = 1;
+   unsigned br = 1;
 
-   if (intel->gen >= 5)
+   if (brw->gen >= 5)
       br = 2;
 
-   if (intel->gen >= 7) {
+   if (brw->gen >= 7) {
       insn = next_insn(p, BRW_OPCODE_WHILE);
       do_insn = get_inner_do_insn(p);
 
@@ -1537,7 +1681,7 @@ struct brw_instruction *brw_WHILE(struct brw_compile *p)
       insn->bits3.break_cont.jip = br * (do_insn - insn);
 
       insn->header.execution_size = BRW_EXECUTE_8;
-   } else if (intel->gen == 6) {
+   } else if (brw->gen == 6) {
       insn = next_insn(p, BRW_OPCODE_WHILE);
       do_insn = get_inner_do_insn(p);
 
@@ -1587,11 +1731,11 @@ struct brw_instruction *brw_WHILE(struct brw_compile *p)
  */
 void brw_land_fwd_jump(struct brw_compile *p, int jmp_insn_idx)
 {
-   struct intel_context *intel = &p->brw->intel;
+   struct brw_context *brw = p->brw;
    struct brw_instruction *jmp_insn = &p->store[jmp_insn_idx];
-   GLuint jmpi = 1;
+   unsigned jmpi = 1;
 
-   if (intel->gen >= 5)
+   if (brw->gen >= 5)
       jmpi = 2;
 
    assert(jmp_insn->header.opcode == BRW_OPCODE_JMPI);
@@ -1608,10 +1752,11 @@ void brw_land_fwd_jump(struct brw_compile *p, int jmp_insn_idx)
  */
 void brw_CMP(struct brw_compile *p,
             struct brw_reg dest,
-            GLuint conditional,
+            unsigned conditional,
             struct brw_reg src0,
             struct brw_reg src1)
 {
+   struct brw_context *brw = p->brw;
    struct brw_instruction *insn = next_insn(p, BRW_OPCODE_CMP);
 
    insn->header.destreg__conditionalmod = conditional;
@@ -1624,13 +1769,27 @@ void brw_CMP(struct brw_compile *p,
 
    /* Make it so that future instructions will use the computed flag
     * value until brw_set_predicate_control_flag_value() is called
-    * again.  
+    * again.
     */
    if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
        dest.nr == 0) {
       p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
       p->flag_value = 0xff;
    }
+
+   /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
+    * page says:
+    *    "Any CMP instruction with a null destination must use a {switch}."
+    *
+    * It also applies to other Gen7 platforms (IVB, BYT) even though it isn't
+    * mentioned on their work-arounds pages.
+    */
+   if (brw->gen == 7) {
+      if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
+          dest.nr == BRW_ARF_NULL) {
+         insn->header.thread_control = BRW_THREAD_SWITCH;
+      }
+   }
 }
 
 /* Issue 'wait' instruction for n1, host could program MMIO
@@ -1657,26 +1816,27 @@ void brw_WAIT (struct brw_compile *p)
  */
 void brw_math( struct brw_compile *p,
               struct brw_reg dest,
-              GLuint function,
-              GLuint msg_reg_nr,
+              unsigned function,
+              unsigned msg_reg_nr,
               struct brw_reg src,
-              GLuint data_type,
-              GLuint precision )
+              unsigned data_type,
+              unsigned precision )
 {
-   struct intel_context *intel = &p->brw->intel;
+   struct brw_context *brw = p->brw;
 
-   if (intel->gen >= 6) {
+   if (brw->gen >= 6) {
       struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);
 
-      assert(dest.file == BRW_GENERAL_REGISTER_FILE);
+      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
+             (brw->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
       assert(src.file == BRW_GENERAL_REGISTER_FILE);
 
       assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
-      if (intel->gen == 6)
+      if (brw->gen == 6)
         assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
 
       /* Source modifiers are ignored for extended math instructions on Gen6. */
-      if (intel->gen == 6) {
+      if (brw->gen == 6) {
         assert(!src.negate);
         assert(!src.abs);
       }
@@ -1721,23 +1881,20 @@ void brw_math( struct brw_compile *p,
  */
 void brw_math2(struct brw_compile *p,
               struct brw_reg dest,
-              GLuint function,
+              unsigned function,
               struct brw_reg src0,
               struct brw_reg src1)
 {
-   struct intel_context *intel = &p->brw->intel;
+   struct brw_context *brw = p->brw;
    struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);
 
-   assert(intel->gen >= 6);
-   (void) intel;
-
-
-   assert(dest.file == BRW_GENERAL_REGISTER_FILE);
+   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
+          (brw->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
    assert(src0.file == BRW_GENERAL_REGISTER_FILE);
    assert(src1.file == BRW_GENERAL_REGISTER_FILE);
 
    assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
-   if (intel->gen == 6) {
+   if (brw->gen == 6) {
       assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
       assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
    }
@@ -1753,7 +1910,7 @@ void brw_math2(struct brw_compile *p,
    }
 
    /* Source modifiers are ignored for extended math instructions on Gen6. */
-   if (intel->gen == 6) {
+   if (brw->gen == 6) {
       assert(!src0.negate);
       assert(!src0.abs);
       assert(!src1.negate);
@@ -1770,74 +1927,6 @@ void brw_math2(struct brw_compile *p,
    brw_set_src1(p, insn, src1);
 }
 
-/**
- * Extended math function, float[16].
- * Use 2 send instructions.
- */
-void brw_math_16( struct brw_compile *p,
-                 struct brw_reg dest,
-                 GLuint function,
-                 GLuint msg_reg_nr,
-                 struct brw_reg src,
-                 GLuint precision )
-{
-   struct intel_context *intel = &p->brw->intel;
-   struct brw_instruction *insn;
-
-   if (intel->gen >= 6) {
-      insn = next_insn(p, BRW_OPCODE_MATH);
-
-      /* Math is the same ISA format as other opcodes, except that CondModifier
-       * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
-       */
-      insn->header.destreg__conditionalmod = function;
-
-      /* Source modifiers are ignored for extended math instructions. */
-      assert(!src.negate);
-      assert(!src.abs);
-
-      brw_set_dest(p, insn, dest);
-      brw_set_src0(p, insn, src);
-      brw_set_src1(p, insn, brw_null_reg());
-      return;
-   }
-
-   /* First instruction:
-    */
-   brw_push_insn_state(p);
-   brw_set_predicate_control_flag_value(p, 0xff);
-   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
-
-   insn = next_insn(p, BRW_OPCODE_SEND);
-   insn->header.destreg__conditionalmod = msg_reg_nr;
-
-   brw_set_dest(p, insn, dest);
-   brw_set_src0(p, insn, src);
-   brw_set_math_message(p,
-                       insn, 
-                       function,
-                       BRW_MATH_INTEGER_UNSIGNED,
-                       precision,
-                       BRW_MATH_DATA_VECTOR);
-
-   /* Second instruction:
-    */
-   insn = next_insn(p, BRW_OPCODE_SEND);
-   insn->header.compression_control = BRW_COMPRESSION_2NDHALF;
-   insn->header.destreg__conditionalmod = msg_reg_nr+1;
-
-   brw_set_dest(p, insn, offset(dest,1));
-   brw_set_src0(p, insn, src);
-   brw_set_math_message(p, 
-                       insn, 
-                       function,
-                       BRW_MATH_INTEGER_UNSIGNED,
-                       precision,
-                       BRW_MATH_DATA_VECTOR);
-
-   brw_pop_insn_state(p);
-}
-
 
 /**
  * Write a block of OWORDs (half a GRF each) to the scratch buffer,
@@ -1849,13 +1938,13 @@ void brw_math_16( struct brw_compile *p,
 void brw_oword_block_write_scratch(struct brw_compile *p,
                                   struct brw_reg mrf,
                                   int num_regs,
-                                  GLuint offset)
+                                  unsigned offset)
 {
-   struct intel_context *intel = &p->brw->intel;
+   struct brw_context *brw = p->brw;
    uint32_t msg_control, msg_type;
    int mlen;
 
-   if (intel->gen >= 6)
+   if (brw->gen >= 6)
       offset /= 16;
 
    mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
@@ -1914,7 +2003,7 @@ void brw_oword_block_write_scratch(struct brw_compile *p,
        * protection.  Our use of DP writes is all about register
        * spilling within a thread.
        */
-      if (intel->gen >= 6) {
+      if (brw->gen >= 6) {
         dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
         send_commit_msg = 0;
       } else {
@@ -1923,13 +2012,13 @@ void brw_oword_block_write_scratch(struct brw_compile *p,
       }
 
       brw_set_dest(p, insn, dest);
-      if (intel->gen >= 6) {
+      if (brw->gen >= 6) {
         brw_set_src0(p, insn, mrf);
       } else {
         brw_set_src0(p, insn, brw_null_reg());
       }
 
-      if (intel->gen >= 6)
+      if (brw->gen >= 6)
         msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
       else
         msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
@@ -1961,13 +2050,13 @@ brw_oword_block_read_scratch(struct brw_compile *p,
                             struct brw_reg dest,
                             struct brw_reg mrf,
                             int num_regs,
-                            GLuint offset)
+                            unsigned offset)
 {
-   struct intel_context *intel = &p->brw->intel;
+   struct brw_context *brw = p->brw;
    uint32_t msg_control;
    int rlen;
 
-   if (intel->gen >= 6)
+   if (brw->gen >= 6)
       offset /= 16;
 
    mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
@@ -2006,7 +2095,7 @@ brw_oword_block_read_scratch(struct brw_compile *p,
       insn->header.destreg__conditionalmod = mrf.nr;
 
       brw_set_dest(p, insn, dest);     /* UW? */
-      if (intel->gen >= 6) {
+      if (brw->gen >= 6) {
         brw_set_src0(p, insn, mrf);
       } else {
         brw_set_src0(p, insn, brw_null_reg());
@@ -2019,10 +2108,53 @@ brw_oword_block_read_scratch(struct brw_compile *p,
                              BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
                              BRW_DATAPORT_READ_TARGET_RENDER_CACHE,
                              1, /* msg_length */
+                              true, /* header_present */
                              rlen);
    }
 }
 
+/**
+ * Emit a SEND that reads num_regs registers from scratch space into dest
+ * using the GEN7_DATAPORT_SCRATCH_READ message on the data cache data port.
+ *
+ * g0 is sent as the single-register message header.  num_regs must be 1, 2
+ * or 4.  offset is divided by REG_SIZE before being encoded (presumably a
+ * byte offset converted to HWords -- confirm against callers), and the
+ * resulting value must fit in 12 bits.
+ */
+void
+gen7_block_read_scratch(struct brw_compile *p,
+                        struct brw_reg dest,
+                        int num_regs,
+                        unsigned offset)
+{
+   dest = retype(dest, BRW_REGISTER_TYPE_UW);
+
+   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
+
+   /* The instruction must not be predicated, and is never compressed. */
+   assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
+   insn->header.compression_control = BRW_COMPRESSION_NONE;
+
+   brw_set_dest(p, insn, dest);
+
+   /* The HW requires that the header is present; this is to get the g0.5
+    * scratch offset.
+    */
+   bool header_present = true;
+   brw_set_src0(p, insn, brw_vec8_grf(0, 0));
+
+   brw_set_message_descriptor(p, insn,
+                              GEN7_SFID_DATAPORT_DATA_CACHE,
+                              1, /* mlen: just g0 */
+                              num_regs,
+                              header_present,
+                              false);
+
+   /* The remaining descriptor fields are OR'd directly into bits3.ud on top
+    * of what brw_set_message_descriptor() wrote.
+    */
+   insn->bits3.ud |= GEN7_DATAPORT_SCRATCH_READ;
+
+   /* The register count is encoded as (num_regs - 1). */
+   assert(num_regs == 1 || num_regs == 2 || num_regs == 4);
+   insn->bits3.ud |= (num_regs - 1) << GEN7_DATAPORT_SCRATCH_NUM_REGS_SHIFT;
+
+   /* According to the docs, offset is "A 12-bit HWord offset into the memory
+    * Immediate Memory buffer as specified by binding table 0xFF."  An HWORD
+    * is 32 bytes, which happens to be the size of a register.
+    */
+   offset /= REG_SIZE;
+   assert(offset < (1 << 12));
+   insn->bits3.ud |= offset;
+}
+
 /**
  * Read a float[4] vector from the data port Data Cache (const buffer).
  * Location (in buffer) should be a multiple of 16.
@@ -2034,10 +2166,10 @@ void brw_oword_block_read(struct brw_compile *p,
                          uint32_t offset,
                          uint32_t bind_table_index)
 {
-   struct intel_context *intel = &p->brw->intel;
+   struct brw_context *brw = p->brw;
 
    /* On newer hardware, offset is in units of owords. */
-   if (intel->gen >= 6)
+   if (brw->gen >= 6)
       offset /= 16;
 
    mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
@@ -2063,7 +2195,7 @@ void brw_oword_block_read(struct brw_compile *p,
    dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
 
    brw_set_dest(p, insn, dest);
-   if (intel->gen >= 6) {
+   if (brw->gen >= 6) {
       brw_set_src0(p, insn, mrf);
    } else {
       brw_set_src0(p, insn, brw_null_reg());
@@ -2076,176 +2208,27 @@ void brw_oword_block_read(struct brw_compile *p,
                           BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
                           BRW_DATAPORT_READ_TARGET_DATA_CACHE,
                           1, /* msg_length */
+                           true, /* header_present */
                           1); /* response_length (1 reg, 2 owords!) */
 
    brw_pop_insn_state(p);
 }
 
-/**
- * Read a set of dwords from the data port Data Cache (const buffer).
- *
- * Location (in buffer) appears as UD offsets in the register after
- * the provided mrf header reg.
- */
-void brw_dword_scattered_read(struct brw_compile *p,
-                             struct brw_reg dest,
-                             struct brw_reg mrf,
-                             uint32_t bind_table_index)
-{
-   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
-
-   brw_push_insn_state(p);
-   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
-   brw_set_mask_control(p, BRW_MASK_DISABLE);
-   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
-   brw_pop_insn_state(p);
-
-   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
-   insn->header.destreg__conditionalmod = mrf.nr;
-
-   /* cast dest to a uword[8] vector */
-   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
-
-   brw_set_dest(p, insn, dest);
-   brw_set_src0(p, insn, brw_null_reg());
-
-   brw_set_dp_read_message(p,
-                          insn,
-                          bind_table_index,
-                          BRW_DATAPORT_DWORD_SCATTERED_BLOCK_8DWORDS,
-                          BRW_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ,
-                          BRW_DATAPORT_READ_TARGET_DATA_CACHE,
-                          2, /* msg_length */
-                          1); /* response_length */
-}
-
-
-
-/**
- * Read float[4] constant(s) from VS constant buffer.
- * For relative addressing, two float[4] constants will be read into 'dest'.
- * Otherwise, one float[4] constant will be read into the lower half of 'dest'.
- */
-void brw_dp_READ_4_vs(struct brw_compile *p,
-                      struct brw_reg dest,
-                      GLuint location,
-                      GLuint bind_table_index)
-{
-   struct intel_context *intel = &p->brw->intel;
-   struct brw_instruction *insn;
-   GLuint msg_reg_nr = 1;
-
-   if (intel->gen >= 6)
-      location /= 16;
-
-   /* Setup MRF[1] with location/offset into const buffer */
-   brw_push_insn_state(p);
-   brw_set_access_mode(p, BRW_ALIGN_1);
-   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
-   brw_set_mask_control(p, BRW_MASK_DISABLE);
-   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-   brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 2),
-                    BRW_REGISTER_TYPE_UD),
-          brw_imm_ud(location));
-   brw_pop_insn_state(p);
-
-   insn = next_insn(p, BRW_OPCODE_SEND);
-
-   insn->header.predicate_control = BRW_PREDICATE_NONE;
-   insn->header.compression_control = BRW_COMPRESSION_NONE;
-   insn->header.destreg__conditionalmod = msg_reg_nr;
-   insn->header.mask_control = BRW_MASK_DISABLE;
-
-   brw_set_dest(p, insn, dest);
-   if (intel->gen >= 6) {
-      brw_set_src0(p, insn, brw_message_reg(msg_reg_nr));
-   } else {
-      brw_set_src0(p, insn, brw_null_reg());
-   }
-
-   brw_set_dp_read_message(p,
-                          insn,
-                          bind_table_index,
-                          0,
-                          BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
-                          BRW_DATAPORT_READ_TARGET_DATA_CACHE,
-                          1, /* msg_length */
-                          1); /* response_length (1 Oword) */
-}
-
-/**
- * Read a float[4] constant per vertex from VS constant buffer, with
- * relative addressing.
- */
-void brw_dp_READ_4_vs_relative(struct brw_compile *p,
-                              struct brw_reg dest,
-                              struct brw_reg addr_reg,
-                              GLuint offset,
-                              GLuint bind_table_index)
-{
-   struct intel_context *intel = &p->brw->intel;
-   struct brw_reg src = brw_vec8_grf(0, 0);
-   int msg_type;
-
-   /* Setup MRF[1] with offset into const buffer */
-   brw_push_insn_state(p);
-   brw_set_access_mode(p, BRW_ALIGN_1);
-   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
-   brw_set_mask_control(p, BRW_MASK_DISABLE);
-   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-
-   /* M1.0 is block offset 0, M1.4 is block offset 1, all other
-    * fields ignored.
-    */
-   brw_ADD(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_D),
-          addr_reg, brw_imm_d(offset));
-   brw_pop_insn_state(p);
-
-   gen6_resolve_implied_move(p, &src, 0);
-   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
-
-   insn->header.predicate_control = BRW_PREDICATE_NONE;
-   insn->header.compression_control = BRW_COMPRESSION_NONE;
-   insn->header.destreg__conditionalmod = 0;
-   insn->header.mask_control = BRW_MASK_DISABLE;
-
-   brw_set_dest(p, insn, dest);
-   brw_set_src0(p, insn, src);
-
-   if (intel->gen >= 6)
-      msg_type = GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
-   else if (intel->gen == 5 || intel->is_g4x)
-      msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
-   else
-      msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
-
-   brw_set_dp_read_message(p,
-                          insn,
-                          bind_table_index,
-                          BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
-                          msg_type,
-                          BRW_DATAPORT_READ_TARGET_DATA_CACHE,
-                          2, /* msg_length */
-                          1); /* response_length */
-}
-
-
 
 void brw_fb_WRITE(struct brw_compile *p,
                  int dispatch_width,
-                  GLuint msg_reg_nr,
+                  unsigned msg_reg_nr,
                   struct brw_reg src0,
-                  GLuint msg_control,
-                  GLuint binding_table_index,
-                  GLuint msg_length,
-                  GLuint response_length,
+                  unsigned msg_control,
+                  unsigned binding_table_index,
+                  unsigned msg_length,
+                  unsigned response_length,
                   bool eot,
                   bool header_present)
 {
-   struct intel_context *intel = &p->brw->intel;
+   struct brw_context *brw = p->brw;
    struct brw_instruction *insn;
-   GLuint msg_type;
+   unsigned msg_type;
    struct brw_reg dest;
 
    if (dispatch_width == 16)
@@ -2253,7 +2236,7 @@ void brw_fb_WRITE(struct brw_compile *p,
    else
       dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
 
-   if (intel->gen >= 6) {
+   if (brw->gen >= 6) {
       insn = next_insn(p, BRW_OPCODE_SENDC);
    } else {
       insn = next_insn(p, BRW_OPCODE_SEND);
@@ -2262,7 +2245,7 @@ void brw_fb_WRITE(struct brw_compile *p,
    insn->header.predicate_control = 0;
    insn->header.compression_control = BRW_COMPRESSION_NONE;
 
-   if (intel->gen >= 6) {
+   if (brw->gen >= 6) {
       /* headerless version, just submit color payload */
       src0 = brw_message_reg(msg_reg_nr);
 
@@ -2296,126 +2279,55 @@ void brw_fb_WRITE(struct brw_compile *p,
  */
 void brw_SAMPLE(struct brw_compile *p,
                struct brw_reg dest,
-               GLuint msg_reg_nr,
+               unsigned msg_reg_nr,
                struct brw_reg src0,
-               GLuint binding_table_index,
-               GLuint sampler,
-               GLuint writemask,
-               GLuint msg_type,
-               GLuint response_length,
-               GLuint msg_length,
-               GLuint header_present,
-               GLuint simd_mode,
-               GLuint return_format)
-{
-   struct intel_context *intel = &p->brw->intel;
-   bool need_stall = 0;
-
-   if (writemask == 0) {
-      /*printf("%s: zero writemask??\n", __FUNCTION__); */
-      return;
-   }
-   
-   /* Hardware doesn't do destination dependency checking on send
-    * instructions properly.  Add a workaround which generates the
-    * dependency by other means.  In practice it seems like this bug
-    * only crops up for texture samples, and only where registers are
-    * written by the send and then written again later without being
-    * read in between.  Luckily for us, we already track that
-    * information and use it to modify the writemask for the
-    * instruction, so that is a guide for whether a workaround is
-    * needed.
-    */
-   if (writemask != WRITEMASK_XYZW) {
-      GLuint dst_offset = 0;
-      GLuint i, newmask = 0, len = 0;
-
-      for (i = 0; i < 4; i++) {
-        if (writemask & (1<<i))
-           break;
-        dst_offset += 2;
-      }
-      for (; i < 4; i++) {
-        if (!(writemask & (1<<i)))
-           break;
-        newmask |= 1<<i;
-        len++;
-      }
-
-      if (newmask != writemask) {
-        need_stall = 1;
-         /* printf("need stall %x %x\n", newmask , writemask); */
-      }
-      else {
-        bool dispatch_16 = false;
-
-        struct brw_reg m1 = brw_message_reg(msg_reg_nr);
-
-        guess_execution_size(p, p->current, dest);
-        if (p->current->header.execution_size == BRW_EXECUTE_16)
-           dispatch_16 = true;
-
-        newmask = ~newmask & WRITEMASK_XYZW;
-
-        brw_push_insn_state(p);
-
-        brw_set_compression_control(p, BRW_COMPRESSION_NONE);
-        brw_set_mask_control(p, BRW_MASK_DISABLE);
-
-        brw_MOV(p, retype(m1, BRW_REGISTER_TYPE_UD),
-                retype(brw_vec8_grf(0,0), BRW_REGISTER_TYPE_UD));
-        brw_MOV(p, get_element_ud(m1, 2), brw_imm_ud(newmask << 12)); 
-
-        brw_pop_insn_state(p);
-
-        src0 = retype(brw_null_reg(), BRW_REGISTER_TYPE_UW); 
-        dest = offset(dest, dst_offset);
-
-        /* For 16-wide dispatch, masked channels are skipped in the
-         * response.  For 8-wide, masked channels still take up slots,
-         * and are just not written to.
-         */
-        if (dispatch_16)
-           response_length = len * 2;
-      }
-   }
+               unsigned binding_table_index,
+               unsigned sampler,
+               unsigned msg_type,
+               unsigned response_length,
+               unsigned msg_length,
+               unsigned header_present,
+               unsigned simd_mode,
+               unsigned return_format)
+{
+   struct brw_context *brw = p->brw;
+   struct brw_instruction *insn;
 
-   {
-      struct brw_instruction *insn;
-   
+   if (msg_reg_nr != -1)
       gen6_resolve_implied_move(p, &src0, msg_reg_nr);
 
-      insn = next_insn(p, BRW_OPCODE_SEND);
-      insn->header.predicate_control = 0; /* XXX */
+   insn = next_insn(p, BRW_OPCODE_SEND);
+   insn->header.predicate_control = 0; /* XXX */
+
+   /* From the 965 PRM (volume 4, part 1, section 14.2.41):
+    *
+    *    "Instruction compression is not allowed for this instruction (that
+    *     is, send). The hardware behavior is undefined if this instruction is
+    *     set as compressed. However, compress control can be set to "SecHalf"
+    *     to affect the EMask generation."
+    *
+    * No similar wording is found in later PRMs, but there are examples
+    * utilizing send with SecHalf.  More importantly, SIMD8 sampler messages
+    * are allowed in SIMD16 mode and they could not work without SecHalf.  For
+    * these reasons, we allow BRW_COMPRESSION_2NDHALF here.
+    */
+   if (insn->header.compression_control != BRW_COMPRESSION_2NDHALF)
       insn->header.compression_control = BRW_COMPRESSION_NONE;
-      if (intel->gen < 6)
-         insn->header.destreg__conditionalmod = msg_reg_nr;
 
-      brw_set_dest(p, insn, dest);
-      brw_set_src0(p, insn, src0);
-      brw_set_sampler_message(p, insn,
-                             binding_table_index,
-                             sampler,
-                             msg_type,
-                             response_length, 
-                             msg_length,
-                             header_present,
-                             simd_mode,
-                             return_format);
-   }
-
-   if (need_stall) {
-      struct brw_reg reg = vec8(offset(dest, response_length-1));
-
-      /*  mov (8) r9.0<1>:f    r9.0<8;8,1>:f    { Align1 }
-       */
-      brw_push_insn_state(p);
-      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
-      brw_MOV(p, retype(reg, BRW_REGISTER_TYPE_UD),
-             retype(reg, BRW_REGISTER_TYPE_UD));
-      brw_pop_insn_state(p);
-   }
+   if (brw->gen < 6)
+      insn->header.destreg__conditionalmod = msg_reg_nr;
 
+   brw_set_dest(p, insn, dest);
+   brw_set_src0(p, insn, src0);
+   brw_set_sampler_message(p, insn,
+                           binding_table_index,
+                           sampler,
+                           msg_type,
+                           response_length,
+                           msg_length,
+                           header_present,
+                           simd_mode,
+                           return_format);
 }
 
 /* All these variables are pretty confusing - we might be better off
@@ -2424,26 +2336,24 @@ void brw_SAMPLE(struct brw_compile *p,
  */
 void brw_urb_WRITE(struct brw_compile *p,
                   struct brw_reg dest,
-                  GLuint msg_reg_nr,
+                  unsigned msg_reg_nr,
                   struct brw_reg src0,
-                  bool allocate,
-                  bool used,
-                  GLuint msg_length,
-                  GLuint response_length,
-                  bool eot,
-                  bool writes_complete,
-                  GLuint offset,
-                  GLuint swizzle)
-{
-   struct intel_context *intel = &p->brw->intel;
+                   enum brw_urb_write_flags flags,
+                  unsigned msg_length,
+                  unsigned response_length,
+                  unsigned offset,
+                  unsigned swizzle)
+{
+   struct brw_context *brw = p->brw;
    struct brw_instruction *insn;
 
    gen6_resolve_implied_move(p, &src0, msg_reg_nr);
 
-   if (intel->gen == 7) {
+   if (brw->gen == 7 && !(flags & BRW_URB_WRITE_USE_CHANNEL_MASKS)) {
       /* Enable Channel Masks in the URB_WRITE_HWORD message header */
       brw_push_insn_state(p);
       brw_set_access_mode(p, BRW_ALIGN_1);
+      brw_set_mask_control(p, BRW_MASK_DISABLE);
       brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
                       BRW_REGISTER_TYPE_UD),
                retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
@@ -2459,38 +2369,48 @@ void brw_urb_WRITE(struct brw_compile *p,
    brw_set_src0(p, insn, src0);
    brw_set_src1(p, insn, brw_imm_d(0));
 
-   if (intel->gen < 6)
+   if (brw->gen < 6)
       insn->header.destreg__conditionalmod = msg_reg_nr;
 
    brw_set_urb_message(p,
                       insn,
-                      allocate,
-                      used,
+                      flags,
                       msg_length,
-                      response_length, 
-                      eot, 
-                      writes_complete, 
+                      response_length,
                       offset,
                       swizzle);
 }
 
+static int
+next_ip(struct brw_compile *p, int ip)
+{
+   struct brw_instruction *insn = (void *)p->store + ip;
+
+   if (insn->header.cmpt_control)
+      return ip + 8;
+   else
+      return ip + 16;
+}
+
 static int
 brw_find_next_block_end(struct brw_compile *p, int start)
 {
    int ip;
+   void *store = p->store;
 
-   for (ip = start + 1; ip < p->nr_insn; ip++) {
-      struct brw_instruction *insn = &p->store[ip];
+   for (ip = next_ip(p, start); ip < p->next_insn_offset; ip = next_ip(p, ip)) {
+      struct brw_instruction *insn = store + ip;
 
       switch (insn->header.opcode) {
       case BRW_OPCODE_ENDIF:
       case BRW_OPCODE_ELSE:
       case BRW_OPCODE_WHILE:
+      case BRW_OPCODE_HALT:
         return ip;
       }
    }
-   assert(!"not reached");
-   return start + 1;
+
+   return 0;
 }
 
 /* There is no DO instruction on gen6, so to find the end of the loop
@@ -2500,67 +2420,113 @@ brw_find_next_block_end(struct brw_compile *p, int start)
 static int
 brw_find_loop_end(struct brw_compile *p, int start)
 {
-   struct intel_context *intel = &p->brw->intel;
+   struct brw_context *brw = p->brw;
    int ip;
-   int br = 2;
+   int scale = 8;
+   void *store = p->store;
 
-   for (ip = start + 1; ip < p->nr_insn; ip++) {
-      struct brw_instruction *insn = &p->store[ip];
+   /* Always start after the instruction (such as a WHILE) we're trying to fix
+    * up.
+    */
+   for (ip = next_ip(p, start); ip < p->next_insn_offset; ip = next_ip(p, ip)) {
+      struct brw_instruction *insn = store + ip;
 
       if (insn->header.opcode == BRW_OPCODE_WHILE) {
-        int jip = intel->gen == 6 ? insn->bits1.branch_gen6.jump_count
+        int jip = brw->gen == 6 ? insn->bits1.branch_gen6.jump_count
                                   : insn->bits3.break_cont.jip;
-        if (ip + jip / br <= start)
+        if (ip + jip * scale <= start)
            return ip;
       }
    }
    assert(!"not reached");
-   return start + 1;
+   return start;
 }
 
 /* After program generation, go back and update the UIP and JIP of
- * BREAK and CONT instructions to their correct locations.
+ * BREAK, CONT, and HALT instructions to their correct locations.
  */
 void
 brw_set_uip_jip(struct brw_compile *p)
 {
-   struct intel_context *intel = &p->brw->intel;
+   struct brw_context *brw = p->brw;
    int ip;
-   int br = 2;
+   int scale = 8;
+   void *store = p->store;
 
-   if (intel->gen < 6)
+   if (brw->gen < 6)
       return;
 
-   for (ip = 0; ip < p->nr_insn; ip++) {
-      struct brw_instruction *insn = &p->store[ip];
+   for (ip = 0; ip < p->next_insn_offset; ip = next_ip(p, ip)) {
+      struct brw_instruction *insn = store + ip;
+
+      if (insn->header.cmpt_control) {
+        /* Fixups for compacted BREAK/CONTINUE/HALT not supported yet. */
+        assert(insn->header.opcode != BRW_OPCODE_BREAK &&
+               insn->header.opcode != BRW_OPCODE_CONTINUE &&
+               insn->header.opcode != BRW_OPCODE_HALT);
+        continue;
+      }
 
+      int block_end_ip = brw_find_next_block_end(p, ip);
       switch (insn->header.opcode) {
       case BRW_OPCODE_BREAK:
-        insn->bits3.break_cont.jip = br * (brw_find_next_block_end(p, ip) - ip);
+         assert(block_end_ip != 0);
+        insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
         /* Gen7 UIP points to WHILE; Gen6 points just after it */
         insn->bits3.break_cont.uip =
-           br * (brw_find_loop_end(p, ip) - ip + (intel->gen == 6 ? 1 : 0));
+           (brw_find_loop_end(p, ip) - ip +
+             (brw->gen == 6 ? 16 : 0)) / scale;
         break;
       case BRW_OPCODE_CONTINUE:
-        insn->bits3.break_cont.jip = br * (brw_find_next_block_end(p, ip) - ip);
-        insn->bits3.break_cont.uip = br * (brw_find_loop_end(p, ip) - ip);
+         assert(block_end_ip != 0);
+        insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
+        insn->bits3.break_cont.uip =
+            (brw_find_loop_end(p, ip) - ip) / scale;
 
         assert(insn->bits3.break_cont.uip != 0);
         assert(insn->bits3.break_cont.jip != 0);
         break;
+
+      case BRW_OPCODE_ENDIF:
+         if (block_end_ip == 0)
+            insn->bits3.break_cont.jip = 2;
+         else
+            insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
+        break;
+
+      case BRW_OPCODE_HALT:
+        /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
+         *
+         *    "In case of the halt instruction not inside any conditional
+         *     code block, the value of <JIP> and <UIP> should be the
+         *     same. In case of the halt instruction inside conditional code
+         *     block, the <UIP> should be the end of the program, and the
+         *     <JIP> should be end of the most inner conditional code block."
+         *
+         * The uip will have already been set by whoever set up the
+         * instruction.
+         */
+        if (block_end_ip == 0) {
+           insn->bits3.break_cont.jip = insn->bits3.break_cont.uip;
+        } else {
+           insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
+        }
+        assert(insn->bits3.break_cont.uip != 0);
+        assert(insn->bits3.break_cont.jip != 0);
+        break;
       }
    }
 }
 
 void brw_ff_sync(struct brw_compile *p,
                   struct brw_reg dest,
-                  GLuint msg_reg_nr,
+                  unsigned msg_reg_nr,
                   struct brw_reg src0,
                   bool allocate,
-                  GLuint response_length,
+                  unsigned response_length,
                   bool eot)
 {
-   struct intel_context *intel = &p->brw->intel;
+   struct brw_context *brw = p->brw;
    struct brw_instruction *insn;
 
    gen6_resolve_implied_move(p, &src0, msg_reg_nr);
@@ -2570,7 +2536,7 @@ void brw_ff_sync(struct brw_compile *p,
    brw_set_src0(p, insn, src0);
    brw_set_src1(p, insn, brw_imm_d(0));
 
-   if (intel->gen < 6)
+   if (brw->gen < 6)
       insn->header.destreg__conditionalmod = msg_reg_nr;
 
    brw_set_ff_sync_message(p,
@@ -2594,9 +2560,9 @@ void brw_ff_sync(struct brw_compile *p,
 void
 brw_svb_write(struct brw_compile *p,
               struct brw_reg dest,
-              GLuint msg_reg_nr,
+              unsigned msg_reg_nr,
               struct brw_reg src0,
-              GLuint binding_table_index,
+              unsigned binding_table_index,
               bool   send_commit_msg)
 {
    struct brw_instruction *insn;
@@ -2618,3 +2584,163 @@ brw_svb_write(struct brw_compile *p,
                             0, /* end_of_thread */
                             send_commit_msg); /* send_commit_msg */
 }
+
+static void
+brw_set_dp_untyped_atomic_message(struct brw_compile *p,
+                                  struct brw_instruction *insn,
+                                  unsigned atomic_op,
+                                  unsigned bind_table_index,
+                                  unsigned msg_length,
+                                  unsigned response_length,
+                                  bool header_present)
+{
+   if (p->brw->is_haswell) {
+      brw_set_message_descriptor(p, insn, HSW_SFID_DATAPORT_DATA_CACHE_1,
+                                 msg_length, response_length,
+                                 header_present, false);
+
+
+      if (insn->header.access_mode == BRW_ALIGN_1) {
+         if (insn->header.execution_size != BRW_EXECUTE_16)
+            insn->bits3.ud |= 1 << 12; /* SIMD8 mode */
+
+         insn->bits3.gen7_dp.msg_type =
+            HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP;
+      } else {
+         insn->bits3.gen7_dp.msg_type =
+            HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2;
+      }
+
+   } else {
+      brw_set_message_descriptor(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE,
+                                 msg_length, response_length,
+                                 header_present, false);
+
+      insn->bits3.gen7_dp.msg_type = GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP;
+
+      if (insn->header.execution_size != BRW_EXECUTE_16)
+         insn->bits3.ud |= 1 << 12; /* SIMD8 mode */
+   }
+
+   if (response_length)
+      insn->bits3.ud |= 1 << 13; /* Return data expected */
+
+   insn->bits3.gen7_dp.binding_table_index = bind_table_index;
+   insn->bits3.ud |= atomic_op << 8;
+}
+
+void
+brw_untyped_atomic(struct brw_compile *p,
+                   struct brw_reg dest,
+                   struct brw_reg mrf,
+                   unsigned atomic_op,
+                   unsigned bind_table_index,
+                   unsigned msg_length,
+                   unsigned response_length) {
+   struct brw_instruction *insn = brw_next_insn(p, BRW_OPCODE_SEND);
+
+   brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UD));
+   brw_set_src0(p, insn, retype(mrf, BRW_REGISTER_TYPE_UD));
+   brw_set_src1(p, insn, brw_imm_d(0));
+   brw_set_dp_untyped_atomic_message(
+      p, insn, atomic_op, bind_table_index, msg_length, response_length,
+      insn->header.access_mode == BRW_ALIGN_1);
+}
+
+static void
+brw_set_dp_untyped_surface_read_message(struct brw_compile *p,
+                                        struct brw_instruction *insn,
+                                        unsigned bind_table_index,
+                                        unsigned msg_length,
+                                        unsigned response_length,
+                                        bool header_present)
+{
+   const unsigned dispatch_width =
+      (insn->header.execution_size == BRW_EXECUTE_16 ? 16 : 8);
+   const unsigned num_channels = response_length / (dispatch_width / 8);
+
+   if (p->brw->is_haswell) {
+      brw_set_message_descriptor(p, insn, HSW_SFID_DATAPORT_DATA_CACHE_1,
+                                 msg_length, response_length,
+                                 header_present, false);
+
+      insn->bits3.gen7_dp.msg_type = HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ;
+   } else {
+      brw_set_message_descriptor(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE,
+                                 msg_length, response_length,
+                                 header_present, false);
+
+      insn->bits3.gen7_dp.msg_type = GEN7_DATAPORT_DC_UNTYPED_SURFACE_READ;
+   }
+
+   if (insn->header.access_mode == BRW_ALIGN_1) {
+      if (dispatch_width == 16)
+         insn->bits3.ud |= 1 << 12; /* SIMD16 mode */
+      else
+         insn->bits3.ud |= 2 << 12; /* SIMD8 mode */
+   }
+
+   insn->bits3.gen7_dp.binding_table_index = bind_table_index;
+
+   /* Set mask of 32-bit channels to drop. */
+   insn->bits3.ud |= (0xf & (0xf << num_channels)) << 8;
+}
+
+void
+brw_untyped_surface_read(struct brw_compile *p,
+                         struct brw_reg dest,
+                         struct brw_reg mrf,
+                         unsigned bind_table_index,
+                         unsigned msg_length,
+                         unsigned response_length)
+{
+   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
+
+   brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UD));
+   brw_set_src0(p, insn, retype(mrf, BRW_REGISTER_TYPE_UD));
+   brw_set_dp_untyped_surface_read_message(
+      p, insn, bind_table_index, msg_length, response_length,
+      insn->header.access_mode == BRW_ALIGN_1);
+}
+
+/**
+ * This instruction is generated as a single-channel align1 instruction by
+ * both the VS and FS stages when using INTEL_DEBUG=shader_time.
+ *
+ * We can't use the typed atomic op in the FS because that has the execution
+ * mask ANDed with the pixel mask, but we just want to write the one dword for
+ * all the pixels.
+ *
+ * We don't use the SIMD4x2 atomic ops in the VS because we want to just write
+ * one u32.  So we use the same untyped atomic write message as the pixel
+ * shader.
+ *
+ * The untyped atomic operation requires a BUFFER surface type with RAW
+ * format, and is only accessible through the legacy DATA_CACHE dataport
+ * messages.
+ */
+void brw_shader_time_add(struct brw_compile *p,
+                         struct brw_reg payload,
+                         uint32_t surf_index)
+{
+   struct brw_context *brw = p->brw;
+   assert(brw->gen >= 7);
+
+   brw_push_insn_state(p);
+   brw_set_access_mode(p, BRW_ALIGN_1);
+   brw_set_mask_control(p, BRW_MASK_DISABLE);
+   struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
+   brw_pop_insn_state(p);
+
+   /* We use brw_vec1_reg operands and disable execution masking because we
+    * want to increment the given offset only once.
+    */
+   brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
+                                      BRW_ARF_NULL, 0));
+   brw_set_src0(p, send, brw_vec1_reg(payload.file,
+                                      payload.nr, 0));
+   brw_set_dp_untyped_atomic_message(p, send, BRW_AOP_ADD, surf_index,
+                                     2 /* message length */,
+                                     0 /* response length */,
+                                     false /* header present */);
+}