i965/nir/vec4: Prepare source and destination registers for ALU operations

[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp

index 936f0d19706192aa22001b5f93a7ee484c7ef4d1..15fe3648af84e9d7b5221fa039a2e3c3373a5bb4 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -686,6 +686,74 @@ fs_inst::components_read(unsigned i) const
        else
           return 1;
  
+   case SHADER_OPCODE_TEX_LOGICAL:
+   case SHADER_OPCODE_TXD_LOGICAL:
+   case SHADER_OPCODE_TXF_LOGICAL:
+   case SHADER_OPCODE_TXL_LOGICAL:
+   case SHADER_OPCODE_TXS_LOGICAL:
+   case FS_OPCODE_TXB_LOGICAL:
+   case SHADER_OPCODE_TXF_CMS_LOGICAL:
+   case SHADER_OPCODE_TXF_UMS_LOGICAL:
+   case SHADER_OPCODE_TXF_MCS_LOGICAL:
+   case SHADER_OPCODE_LOD_LOGICAL:
+   case SHADER_OPCODE_TG4_LOGICAL:
+   case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
+      assert(src[8].file == IMM && src[9].file == IMM);
+      /* Texture coordinates. */
+      if (i == 0)
+         return src[8].fixed_hw_reg.dw1.ud;
+      /* Texture derivatives. */
+      else if ((i == 2 || i == 3) && opcode == SHADER_OPCODE_TXD_LOGICAL)
+         return src[9].fixed_hw_reg.dw1.ud;
+      /* Texture offset. */
+      else if (i == 7)
+         return 2;
+      else
+         return 1;
+
+   case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
+   case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
+      assert(src[3].file == IMM);
+      /* Surface coordinates. */
+      if (i == 0)
+         return src[3].fixed_hw_reg.dw1.ud;
+      /* Surface operation source (ignored for reads). */
+      else if (i == 1)
+         return 0;
+      else
+         return 1;
+
+   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
+   case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
+      assert(src[3].file == IMM &&
+             src[4].file == IMM);
+      /* Surface coordinates. */
+      if (i == 0)
+         return src[3].fixed_hw_reg.dw1.ud;
+      /* Surface operation source. */
+      else if (i == 1)
+         return src[4].fixed_hw_reg.dw1.ud;
+      else
+         return 1;
+
+   case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
+   case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: {
+      assert(src[3].file == IMM &&
+             src[4].file == IMM);
+      const unsigned op = src[4].fixed_hw_reg.dw1.ud;
+      /* Surface coordinates. */
+      if (i == 0)
+         return src[3].fixed_hw_reg.dw1.ud;
+      /* Surface operation source. */
+      else if (i == 1 && op == BRW_AOP_CMPWR)
+         return 2;
+      else if (i == 1 && (op == BRW_AOP_INC || op == BRW_AOP_DEC ||
+                          op == BRW_AOP_PREDEC))
+         return 0;
+      else
+         return 1;
+   }
+
     default:
        return 1;
     }
@@ -872,6 +940,18 @@ fs_visitor::import_uniforms(fs_visitor *v)
     this->param_size = v->param_size;
  }
  
+void
+fs_visitor::setup_vector_uniform_values(const gl_constant_value *values, unsigned n)
+{
+   static const gl_constant_value zero = { 0 };
+
+   for (unsigned i = 0; i < n; ++i)
+      stage_prog_data->param[uniforms++] = &values[i];
+
+   for (unsigned i = n; i < 4; ++i)
+      stage_prog_data->param[uniforms++] = &zero;
+}
+
  fs_reg *
  fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
                                           bool origin_upper_left)
@@ -895,11 +975,11 @@ fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
        bld.MOV(wpos, this->pixel_y);
     } else {
        fs_reg pixel_y = this->pixel_y;
-      float offset = (pixel_center_integer ? 0.0 : 0.5);
+      float offset = (pixel_center_integer ? 0.0f : 0.5f);
  
        if (flip) {
          pixel_y.negate = true;
-        offset += key->drawable_height - 1.0;
+        offset += key->drawable_height - 1.0f;
        }
  
        bld.ADD(wpos, pixel_y, fs_reg(offset));
@@ -1844,8 +1924,7 @@ fs_visitor::demote_pull_constants()
             continue;
  
           /* Set up the annotation tracking for new generated instructions. */
-         const fs_builder ibld = bld.annotate(inst->annotation, inst->ir)
-                                    .at(block, inst);
+         const fs_builder ibld(this, block, inst);
           fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
           fs_reg dst = vgrf(glsl_type::float_type);
  
@@ -1860,8 +1939,9 @@ fs_visitor::demote_pull_constants()
              inst->src[i].reladdr = NULL;
              inst->src[i].stride = 1;
           } else {
+            const fs_builder ubld = ibld.exec_all().group(8, 0);
              fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
-            ibld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
+            ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
                        dst, surf_index, offset);
              inst->src[i].set_smear(pull_index & 3);
           }
@@ -2137,11 +2217,11 @@ fs_visitor::opt_zero_samples()
         *     "Parameter 0 is required except for the sampleinfo message, which
         *      has no parameter 0"
         */
-      while (inst->mlen > inst->header_size + dispatch_width / 8 &&
+      while (inst->mlen > inst->header_size + inst->exec_size / 8 &&
               load_payload->src[(inst->mlen - inst->header_size) /
-                               (dispatch_width / 8) +
+                               (inst->exec_size / 8) +
                                 inst->header_size - 1].is_zero()) {
-         inst->mlen -= dispatch_width / 8;
+         inst->mlen -= inst->exec_size / 8;
           progress = true;
        }
     }
@@ -2178,7 +2258,8 @@ fs_visitor::opt_sampler_eot()
        return false;
  
     /* Look for a texturing instruction immediately before the final FB_WRITE. */
-   fs_inst *fb_write = (fs_inst *) cfg->blocks[cfg->num_blocks - 1]->end();
+   bblock_t *block = cfg->blocks[cfg->num_blocks - 1];
+   fs_inst *fb_write = (fs_inst *)block->end();
     assert(fb_write->eot);
     assert(fb_write->opcode == FS_OPCODE_FB_WRITE);
  
@@ -2209,9 +2290,11 @@ fs_visitor::opt_sampler_eot()
     assert(!tex_inst->eot); /* We can't get here twice */
     assert((tex_inst->offset & (0xff << 24)) == 0);
  
+   const fs_builder ibld(this, block, tex_inst);
+
     tex_inst->offset |= fb_write->target << 24;
     tex_inst->eot = true;
-   tex_inst->dst = bld.null_reg_ud();
+   tex_inst->dst = ibld.null_reg_ud();
     fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
  
     /* If a header is present, marking the eot is sufficient. Otherwise, we need
@@ -2223,8 +2306,8 @@ fs_visitor::opt_sampler_eot()
     if (tex_inst->header_size != 0)
        return true;
  
-   fs_reg send_header = bld.vgrf(BRW_REGISTER_TYPE_F,
-                                 load_payload->sources + 1);
+   fs_reg send_header = ibld.vgrf(BRW_REGISTER_TYPE_F,
+                                  load_payload->sources + 1);
     fs_reg *new_sources =
        ralloc_array(mem_ctx, fs_reg, load_payload->sources + 1);
  
@@ -2744,7 +2827,8 @@ fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
        if (block->start() == scan_inst) {
           for (int i = 0; i < write_len; i++) {
              if (needs_dep[i])
-               DEP_RESOLVE_MOV(bld.at(block, inst), first_write_grf + i);
+               DEP_RESOLVE_MOV(fs_builder(this, block, inst),
+                               first_write_grf + i);
           }
           return;
        }
@@ -2760,7 +2844,7 @@ fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
              if (reg >= first_write_grf &&
                  reg < first_write_grf + write_len &&
                  needs_dep[reg - first_write_grf]) {
-               DEP_RESOLVE_MOV(bld.at(block, inst), reg);
+               DEP_RESOLVE_MOV(fs_builder(this, block, inst), reg);
                 needs_dep[reg - first_write_grf] = false;
                 if (scan_inst->exec_size == 16)
                    needs_dep[reg - first_write_grf + 1] = false;
@@ -2807,7 +2891,8 @@ fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_ins
        if (block->end() == scan_inst) {
           for (int i = 0; i < write_len; i++) {
              if (needs_dep[i])
-               DEP_RESOLVE_MOV(bld.at(block, scan_inst), first_write_grf + i);
+               DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),
+                               first_write_grf + i);
           }
           return;
        }
@@ -2822,7 +2907,8 @@ fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_ins
            scan_inst->dst.reg >= first_write_grf &&
            scan_inst->dst.reg < first_write_grf + write_len &&
            needs_dep[scan_inst->dst.reg - first_write_grf]) {
-         DEP_RESOLVE_MOV(bld.at(block, scan_inst), scan_inst->dst.reg);
+         DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),
+                         scan_inst->dst.reg);
           needs_dep[scan_inst->dst.reg - first_write_grf] = false;
        }
  
@@ -2957,7 +3043,8 @@ fs_visitor::lower_load_payload()
        if (dst.file == MRF)
           dst.reg = dst.reg & ~BRW_MRF_COMPR4;
  
-      const fs_builder hbld = bld.exec_all().group(8, 0).at(block, inst);
+      const fs_builder ibld(this, block, inst);
+      const fs_builder hbld = ibld.exec_all().group(8, 0);
  
        for (uint8_t i = 0; i < inst->header_size; i++) {
           if (inst->src[i].file != BAD_FILE) {
@@ -2968,10 +3055,6 @@ fs_visitor::lower_load_payload()
           dst = offset(dst, hbld, 1);
        }
  
-      const fs_builder ibld = bld.exec_all(inst->force_writemask_all)
-                                 .group(inst->exec_size, inst->force_sechalf)
-                                 .at(block, inst);
-
        if (inst->dst.file == MRF && (inst->dst.reg & BRW_MRF_COMPR4) &&
            inst->exec_size > 8) {
           /* In this case, the payload portion of the LOAD_PAYLOAD isn't
@@ -3059,7 +3142,7 @@ fs_visitor::lower_integer_multiplication()
             inst->dst.type != BRW_REGISTER_TYPE_UD))
           continue;
  
-      const fs_builder ibld = bld.at(block, inst);
+      const fs_builder ibld(this, block, inst);
  
        /* The MUL instruction isn't commutative. On Gen <= 6, only the low
         * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
@@ -3361,15 +3444,523 @@ lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst,
     inst->header_size = header_size;
  }
  
+static void
+lower_sampler_logical_send_gen4(const fs_builder &bld, fs_inst *inst, opcode op,
+                                const fs_reg &coordinate,
+                                const fs_reg &shadow_c,
+                                const fs_reg &lod, const fs_reg &lod2,
+                                const fs_reg &sampler,
+                                unsigned coord_components,
+                                unsigned grad_components)
+{
+   const bool has_lod = (op == SHADER_OPCODE_TXL || op == FS_OPCODE_TXB ||
+                         op == SHADER_OPCODE_TXF || op == SHADER_OPCODE_TXS);
+   fs_reg msg_begin(MRF, 1, BRW_REGISTER_TYPE_F);
+   fs_reg msg_end = msg_begin;
+
+   /* g0 header. */
+   msg_end = offset(msg_end, bld.group(8, 0), 1);
+
+   for (unsigned i = 0; i < coord_components; i++)
+      bld.MOV(retype(offset(msg_end, bld, i), coordinate.type),
+              offset(coordinate, bld, i));
+
+   msg_end = offset(msg_end, bld, coord_components);
+
+   /* Messages other than SAMPLE and RESINFO in SIMD16 and TXD in SIMD8
+    * require all three components to be present and zero if they are unused.
+    */
+   if (coord_components > 0 &&
+       (has_lod || shadow_c.file != BAD_FILE ||
+        (op == SHADER_OPCODE_TEX && bld.dispatch_width() == 8))) {
+      for (unsigned i = coord_components; i < 3; i++)
+         bld.MOV(offset(msg_end, bld, i), fs_reg(0.0f));
+
+      msg_end = offset(msg_end, bld, 3 - coord_components);
+   }
+
+   if (op == SHADER_OPCODE_TXD) {
+      /* TXD unsupported in SIMD16 mode. */
+      assert(bld.dispatch_width() == 8);
+
+      /* the slots for u and v are always present, but r is optional */
+      if (coord_components < 2)
+         msg_end = offset(msg_end, bld, 2 - coord_components);
+
+      /*  P   = u, v, r
+       * dPdx = dudx, dvdx, drdx
+       * dPdy = dudy, dvdy, drdy
+       *
+       * 1-arg: Does not exist.
+       *
+       * 2-arg: dudx   dvdx   dudy   dvdy
+       *        dPdx.x dPdx.y dPdy.x dPdy.y
+       *        m4     m5     m6     m7
+       *
+       * 3-arg: dudx   dvdx   drdx   dudy   dvdy   drdy
+       *        dPdx.x dPdx.y dPdx.z dPdy.x dPdy.y dPdy.z
+       *        m5     m6     m7     m8     m9     m10
+       */
+      for (unsigned i = 0; i < grad_components; i++)
+         bld.MOV(offset(msg_end, bld, i), offset(lod, bld, i));
+
+      msg_end = offset(msg_end, bld, MAX2(grad_components, 2));
+
+      for (unsigned i = 0; i < grad_components; i++)
+         bld.MOV(offset(msg_end, bld, i), offset(lod2, bld, i));
+
+      msg_end = offset(msg_end, bld, MAX2(grad_components, 2));
+   }
+
+   if (has_lod) {
+      /* Bias/LOD with shadow comparitor is unsupported in SIMD16 -- *Without*
+       * shadow comparitor (including RESINFO) it's unsupported in SIMD8 mode.
+       */
+      assert(shadow_c.file != BAD_FILE ? bld.dispatch_width() == 8 :
+             bld.dispatch_width() == 16);
+
+      const brw_reg_type type =
+         (op == SHADER_OPCODE_TXF || op == SHADER_OPCODE_TXS ?
+          BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_F);
+      bld.MOV(retype(msg_end, type), lod);
+      msg_end = offset(msg_end, bld, 1);
+   }
+
+   if (shadow_c.file != BAD_FILE) {
+      if (op == SHADER_OPCODE_TEX && bld.dispatch_width() == 8) {
+         /* There's no plain shadow compare message, so we use shadow
+          * compare with a bias of 0.0.
+          */
+         bld.MOV(msg_end, fs_reg(0.0f));
+         msg_end = offset(msg_end, bld, 1);
+      }
+
+      bld.MOV(msg_end, shadow_c);
+      msg_end = offset(msg_end, bld, 1);
+   }
+
+   inst->opcode = op;
+   inst->src[0] = reg_undef;
+   inst->src[1] = sampler;
+   inst->resize_sources(2);
+   inst->base_mrf = msg_begin.reg;
+   inst->mlen = msg_end.reg - msg_begin.reg;
+   inst->header_size = 1;
+}
+
+static void
+lower_sampler_logical_send_gen5(const fs_builder &bld, fs_inst *inst, opcode op,
+                                fs_reg coordinate,
+                                const fs_reg &shadow_c,
+                                fs_reg lod, fs_reg lod2,
+                                const fs_reg &sample_index,
+                                const fs_reg &sampler,
+                                const fs_reg &offset_value,
+                                unsigned coord_components,
+                                unsigned grad_components)
+{
+   fs_reg message(MRF, 2, BRW_REGISTER_TYPE_F);
+   fs_reg msg_coords = message;
+   unsigned header_size = 0;
+
+   if (offset_value.file != BAD_FILE) {
+      /* The offsets set up by the visitor are in the m1 header, so we can't
+       * go headerless.
+       */
+      header_size = 1;
+      message.reg--;
+   }
+
+   for (unsigned i = 0; i < coord_components; i++) {
+      bld.MOV(retype(offset(msg_coords, bld, i), coordinate.type), coordinate);
+      coordinate = offset(coordinate, bld, 1);
+   }
+   fs_reg msg_end = offset(msg_coords, bld, coord_components);
+   fs_reg msg_lod = offset(msg_coords, bld, 4);
+
+   if (shadow_c.file != BAD_FILE) {
+      fs_reg msg_shadow = msg_lod;
+      bld.MOV(msg_shadow, shadow_c);
+      msg_lod = offset(msg_shadow, bld, 1);
+      msg_end = msg_lod;
+   }
+
+   switch (op) {
+   case SHADER_OPCODE_TXL:
+   case FS_OPCODE_TXB:
+      bld.MOV(msg_lod, lod);
+      msg_end = offset(msg_lod, bld, 1);
+      break;
+   case SHADER_OPCODE_TXD:
+      /**
+       *  P   =  u,    v,    r
+       * dPdx = dudx, dvdx, drdx
+       * dPdy = dudy, dvdy, drdy
+       *
+       * Load up these values:
+       * - dudx   dudy   dvdx   dvdy   drdx   drdy
+       * - dPdx.x dPdy.x dPdx.y dPdy.y dPdx.z dPdy.z
+       */
+      msg_end = msg_lod;
+      for (unsigned i = 0; i < grad_components; i++) {
+         bld.MOV(msg_end, lod);
+         lod = offset(lod, bld, 1);
+         msg_end = offset(msg_end, bld, 1);
+
+         bld.MOV(msg_end, lod2);
+         lod2 = offset(lod2, bld, 1);
+         msg_end = offset(msg_end, bld, 1);
+      }
+      break;
+   case SHADER_OPCODE_TXS:
+      msg_lod = retype(msg_end, BRW_REGISTER_TYPE_UD);
+      bld.MOV(msg_lod, lod);
+      msg_end = offset(msg_lod, bld, 1);
+      break;
+   case SHADER_OPCODE_TXF:
+      msg_lod = offset(msg_coords, bld, 3);
+      bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), lod);
+      msg_end = offset(msg_lod, bld, 1);
+      break;
+   case SHADER_OPCODE_TXF_CMS:
+      msg_lod = offset(msg_coords, bld, 3);
+      /* lod */
+      bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), fs_reg(0u));
+      /* sample index */
+      bld.MOV(retype(offset(msg_lod, bld, 1), BRW_REGISTER_TYPE_UD), sample_index);
+      msg_end = offset(msg_lod, bld, 2);
+      break;
+   default:
+      break;
+   }
+
+   inst->opcode = op;
+   inst->src[0] = reg_undef;
+   inst->src[1] = sampler;
+   inst->resize_sources(2);
+   inst->base_mrf = message.reg;
+   inst->mlen = msg_end.reg - message.reg;
+   inst->header_size = header_size;
+
+   /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */
+   assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE);
+}
+
+static bool
+is_high_sampler(const struct brw_device_info *devinfo, const fs_reg &sampler)
+{
+   if (devinfo->gen < 8 && !devinfo->is_haswell)
+      return false;
+
+   return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
+}
+
+static void
+lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op,
+                                fs_reg coordinate,
+                                const fs_reg &shadow_c,
+                                fs_reg lod, fs_reg lod2,
+                                const fs_reg &sample_index,
+                                const fs_reg &mcs, const fs_reg &sampler,
+                                fs_reg offset_value,
+                                unsigned coord_components,
+                                unsigned grad_components)
+{
+   const brw_device_info *devinfo = bld.shader->devinfo;
+   int reg_width = bld.dispatch_width() / 8;
+   unsigned header_size = 0, length = 0;
+   fs_reg sources[MAX_SAMPLER_MESSAGE_SIZE];
+   for (unsigned i = 0; i < ARRAY_SIZE(sources); i++)
+      sources[i] = bld.vgrf(BRW_REGISTER_TYPE_F);
+
+   if (op == SHADER_OPCODE_TG4 || op == SHADER_OPCODE_TG4_OFFSET ||
+       offset_value.file != BAD_FILE ||
+       is_high_sampler(devinfo, sampler)) {
+      /* For general texture offsets (no txf workaround), we need a header to
+       * put them in.  Note that we're only reserving space for it in the
+       * message payload as it will be initialized implicitly by the
+       * generator.
+       *
+       * TG4 needs to place its channel select in the header, for interaction
+       * with ARB_texture_swizzle.  The sampler index is only 4-bits, so for
+       * larger sampler numbers we need to offset the Sampler State Pointer in
+       * the header.
+       */
+      header_size = 1;
+      sources[0] = fs_reg();
+      length++;
+   }
+
+   if (shadow_c.file != BAD_FILE) {
+      bld.MOV(sources[length], shadow_c);
+      length++;
+   }
+
+   bool coordinate_done = false;
+
+   /* The sampler can only meaningfully compute LOD for fragment shader
+    * messages. For all other stages, we change the opcode to TXL and
+    * hardcode the LOD to 0.
+    */
+   if (bld.shader->stage != MESA_SHADER_FRAGMENT &&
+       op == SHADER_OPCODE_TEX) {
+      op = SHADER_OPCODE_TXL;
+      lod = fs_reg(0.0f);
+   }
+
+   /* Set up the LOD info */
+   switch (op) {
+   case FS_OPCODE_TXB:
+   case SHADER_OPCODE_TXL:
+      bld.MOV(sources[length], lod);
+      length++;
+      break;
+   case SHADER_OPCODE_TXD:
+      /* TXD should have been lowered in SIMD16 mode. */
+      assert(bld.dispatch_width() == 8);
+
+      /* Load dPdx and the coordinate together:
+       * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
+       */
+      for (unsigned i = 0; i < coord_components; i++) {
+         bld.MOV(sources[length], coordinate);
+         coordinate = offset(coordinate, bld, 1);
+         length++;
+
+         /* For cube map array, the coordinate is (u,v,r,ai) but there are
+          * only derivatives for (u, v, r).
+          */
+         if (i < grad_components) {
+            bld.MOV(sources[length], lod);
+            lod = offset(lod, bld, 1);
+            length++;
+
+            bld.MOV(sources[length], lod2);
+            lod2 = offset(lod2, bld, 1);
+            length++;
+         }
+      }
+
+      coordinate_done = true;
+      break;
+   case SHADER_OPCODE_TXS:
+      bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), lod);
+      length++;
+      break;
+   case SHADER_OPCODE_TXF:
+      /* Unfortunately, the parameters for LD are intermixed: u, lod, v, r.
+       * On Gen9 they are u, v, lod, r
+       */
+      bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate);
+      coordinate = offset(coordinate, bld, 1);
+      length++;
+
+      if (devinfo->gen >= 9) {
+         if (coord_components >= 2) {
+            bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate);
+            coordinate = offset(coordinate, bld, 1);
+         }
+         length++;
+      }
+
+      bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), lod);
+      length++;
+
+      for (unsigned i = devinfo->gen >= 9 ? 2 : 1; i < coord_components; i++) {
+         bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate);
+         coordinate = offset(coordinate, bld, 1);
+         length++;
+      }
+
+      coordinate_done = true;
+      break;
+   case SHADER_OPCODE_TXF_CMS:
+   case SHADER_OPCODE_TXF_UMS:
+   case SHADER_OPCODE_TXF_MCS:
+      if (op == SHADER_OPCODE_TXF_UMS || op == SHADER_OPCODE_TXF_CMS) {
+         bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), sample_index);
+         length++;
+      }
+
+      if (op == SHADER_OPCODE_TXF_CMS) {
+         /* Data from the multisample control surface. */
+         bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), mcs);
+         length++;
+      }
+
+      /* There is no offsetting for this message; just copy in the integer
+       * texture coordinates.
+       */
+      for (unsigned i = 0; i < coord_components; i++) {
+         bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate);
+         coordinate = offset(coordinate, bld, 1);
+         length++;
+      }
+
+      coordinate_done = true;
+      break;
+   case SHADER_OPCODE_TG4_OFFSET:
+      /* gather4_po_c should have been lowered in SIMD16 mode. */
+      assert(bld.dispatch_width() == 8 || shadow_c.file == BAD_FILE);
+
+      /* More crazy intermixing */
+      for (unsigned i = 0; i < 2; i++) { /* u, v */
+         bld.MOV(sources[length], coordinate);
+         coordinate = offset(coordinate, bld, 1);
+         length++;
+      }
+
+      for (unsigned i = 0; i < 2; i++) { /* offu, offv */
+         bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), offset_value);
+         offset_value = offset(offset_value, bld, 1);
+         length++;
+      }
+
+      if (coord_components == 3) { /* r if present */
+         bld.MOV(sources[length], coordinate);
+         coordinate = offset(coordinate, bld, 1);
+         length++;
+      }
+
+      coordinate_done = true;
+      break;
+   default:
+      break;
+   }
+
+   /* Set up the coordinate (except for cases where it was done above) */
+   if (!coordinate_done) {
+      for (unsigned i = 0; i < coord_components; i++) {
+         bld.MOV(sources[length], coordinate);
+         coordinate = offset(coordinate, bld, 1);
+         length++;
+      }
+   }
+
+   int mlen;
+   if (reg_width == 2)
+      mlen = length * reg_width - header_size;
+   else
+      mlen = length * reg_width;
+
+   const fs_reg src_payload = fs_reg(GRF, bld.shader->alloc.allocate(mlen),
+                                     BRW_REGISTER_TYPE_F);
+   bld.LOAD_PAYLOAD(src_payload, sources, length, header_size);
+
+   /* Generate the SEND. */
+   inst->opcode = op;
+   inst->src[0] = src_payload;
+   inst->src[1] = sampler;
+   inst->resize_sources(2);
+   inst->base_mrf = -1;
+   inst->mlen = mlen;
+   inst->header_size = header_size;
+
+   /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */
+   assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE);
+}
+
+static void
+lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op)
+{
+   const brw_device_info *devinfo = bld.shader->devinfo;
+   const fs_reg &coordinate = inst->src[0];
+   const fs_reg &shadow_c = inst->src[1];
+   const fs_reg &lod = inst->src[2];
+   const fs_reg &lod2 = inst->src[3];
+   const fs_reg &sample_index = inst->src[4];
+   const fs_reg &mcs = inst->src[5];
+   const fs_reg &sampler = inst->src[6];
+   const fs_reg &offset_value = inst->src[7];
+   assert(inst->src[8].file == IMM && inst->src[9].file == IMM);
+   const unsigned coord_components = inst->src[8].fixed_hw_reg.dw1.ud;
+   const unsigned grad_components = inst->src[9].fixed_hw_reg.dw1.ud;
+
+   if (devinfo->gen >= 7) {
+      lower_sampler_logical_send_gen7(bld, inst, op, coordinate,
+                                      shadow_c, lod, lod2, sample_index,
+                                      mcs, sampler, offset_value,
+                                      coord_components, grad_components);
+   } else if (devinfo->gen >= 5) {
+      lower_sampler_logical_send_gen5(bld, inst, op, coordinate,
+                                      shadow_c, lod, lod2, sample_index,
+                                      sampler, offset_value,
+                                      coord_components, grad_components);
+   } else {
+      lower_sampler_logical_send_gen4(bld, inst, op, coordinate,
+                                      shadow_c, lod, lod2, sampler,
+                                      coord_components, grad_components);
+   }
+}
+
+/**
+ * Initialize the header present in some typed and untyped surface
+ * messages.
+ */
+static fs_reg
+emit_surface_header(const fs_builder &bld, const fs_reg &sample_mask)
+{
+   fs_builder ubld = bld.exec_all().group(8, 0);
+   const fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD);
+   ubld.MOV(dst, fs_reg(0));
+   ubld.MOV(component(dst, 7), sample_mask);
+   return dst;
+}
+
+static void
+lower_surface_logical_send(const fs_builder &bld, fs_inst *inst, opcode op,
+                           const fs_reg &sample_mask)
+{
+   /* Get the logical send arguments. */
+   const fs_reg &addr = inst->src[0];
+   const fs_reg &src = inst->src[1];
+   const fs_reg &surface = inst->src[2];
+   const UNUSED fs_reg &dims = inst->src[3];
+   const fs_reg &arg = inst->src[4];
+
+   /* Calculate the total number of components of the payload. */
+   const unsigned addr_sz = inst->components_read(0);
+   const unsigned src_sz = inst->components_read(1);
+   const unsigned header_sz = (sample_mask.file == BAD_FILE ? 0 : 1);
+   const unsigned sz = header_sz + addr_sz + src_sz;
+
+   /* Allocate space for the payload. */
+   fs_reg *const components = new fs_reg[sz];
+   const fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, sz);
+   unsigned n = 0;
+
+   /* Construct the payload. */
+   if (header_sz)
+      components[n++] = emit_surface_header(bld, sample_mask);
+
+   for (unsigned i = 0; i < addr_sz; i++)
+      components[n++] = offset(addr, bld, i);
+
+   for (unsigned i = 0; i < src_sz; i++)
+      components[n++] = offset(src, bld, i);
+
+   bld.LOAD_PAYLOAD(payload, components, sz, header_sz);
+
+   /* Update the original instruction. */
+   inst->opcode = op;
+   inst->mlen = header_sz + (addr_sz + src_sz) * inst->exec_size / 8;
+   inst->header_size = header_sz;
+
+   inst->src[0] = payload;
+   inst->src[1] = surface;
+   inst->src[2] = arg;
+   inst->resize_sources(3);
+
+   delete[] components;
+}
+
  bool
  fs_visitor::lower_logical_sends()
  {
     bool progress = false;
  
     foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
-      const fs_builder ibld = bld.exec_all(inst->force_writemask_all)
-                                 .group(inst->exec_size, inst->force_sechalf)
-                                 .at(block, inst);
+      const fs_builder ibld(this, block, inst);
  
        switch (inst->opcode) {
        case FS_OPCODE_FB_WRITE_LOGICAL:
@@ -3380,6 +3971,90 @@ fs_visitor::lower_logical_sends()
                                       payload);
           break;
  
+      case SHADER_OPCODE_TEX_LOGICAL:
+         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TEX);
+         break;
+
+      case SHADER_OPCODE_TXD_LOGICAL:
+         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXD);
+         break;
+
+      case SHADER_OPCODE_TXF_LOGICAL:
+         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF);
+         break;
+
+      case SHADER_OPCODE_TXL_LOGICAL:
+         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXL);
+         break;
+
+      case SHADER_OPCODE_TXS_LOGICAL:
+         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXS);
+         break;
+
+      case FS_OPCODE_TXB_LOGICAL:
+         lower_sampler_logical_send(ibld, inst, FS_OPCODE_TXB);
+         break;
+
+      case SHADER_OPCODE_TXF_CMS_LOGICAL:
+         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_CMS);
+         break;
+
+      case SHADER_OPCODE_TXF_UMS_LOGICAL:
+         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_UMS);
+         break;
+
+      case SHADER_OPCODE_TXF_MCS_LOGICAL:
+         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_MCS);
+         break;
+
+      case SHADER_OPCODE_LOD_LOGICAL:
+         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_LOD);
+         break;
+
+      case SHADER_OPCODE_TG4_LOGICAL:
+         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4);
+         break;
+
+      case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
+         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4_OFFSET);
+         break;
+
+      case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
+         lower_surface_logical_send(ibld, inst,
+                                    SHADER_OPCODE_UNTYPED_SURFACE_READ,
+                                    fs_reg(0xffff));
+         break;
+
+      case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
+         lower_surface_logical_send(ibld, inst,
+                                    SHADER_OPCODE_UNTYPED_SURFACE_WRITE,
+                                    ibld.sample_mask_reg());
+         break;
+
+      case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
+         lower_surface_logical_send(ibld, inst,
+                                    SHADER_OPCODE_UNTYPED_ATOMIC,
+                                    ibld.sample_mask_reg());
+         break;
+
+      case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
+         lower_surface_logical_send(ibld, inst,
+                                    SHADER_OPCODE_TYPED_SURFACE_READ,
+                                    fs_reg(0xffff));
+         break;
+
+      case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
+         lower_surface_logical_send(ibld, inst,
+                                    SHADER_OPCODE_TYPED_SURFACE_WRITE,
+                                    ibld.sample_mask_reg());
+         break;
+
+      case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
+         lower_surface_logical_send(ibld, inst,
+                                    SHADER_OPCODE_TYPED_ATOMIC,
+                                    ibld.sample_mask_reg());
+         break;
+
        default:
           continue;
        }
@@ -3413,6 +4088,44 @@ get_lowered_simd_width(const struct brw_device_info *devinfo,
        /* Dual-source FB writes are unsupported in SIMD16 mode. */
        return (inst->src[1].file != BAD_FILE ? 8 : inst->exec_size);
  
+   case SHADER_OPCODE_TXD_LOGICAL:
+      /* TXD is unsupported in SIMD16 mode. */
+      return 8;
+
+   case SHADER_OPCODE_TG4_OFFSET_LOGICAL: {
+      /* gather4_po_c is unsupported in SIMD16 mode. */
+      const fs_reg &shadow_c = inst->src[1];
+      return (shadow_c.file != BAD_FILE ? 8 : inst->exec_size);
+   }
+   case SHADER_OPCODE_TXL_LOGICAL:
+   case FS_OPCODE_TXB_LOGICAL: {
+      /* Gen4 doesn't have SIMD8 non-shadow-compare bias/LOD instructions, and
+       * Gen4-6 can't support TXL and TXB with shadow comparison in SIMD16
+       * mode because the message exceeds the maximum length of 11.
+       */
+      const fs_reg &shadow_c = inst->src[1];
+      if (devinfo->gen == 4 && shadow_c.file == BAD_FILE)
+         return 16;
+      else if (devinfo->gen < 7 && shadow_c.file != BAD_FILE)
+         return 8;
+      else
+         return inst->exec_size;
+   }
+   case SHADER_OPCODE_TXF_LOGICAL:
+   case SHADER_OPCODE_TXS_LOGICAL:
+      /* Gen4 doesn't have SIMD8 variants for the RESINFO and LD-with-LOD
+       * messages.  Use SIMD16 instead.
+       */
+      if (devinfo->gen == 4)
+         return 16;
+      else
+         return inst->exec_size;
+
+   case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
+   case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
+   case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
+      return 8;
+
     default:
        return inst->exec_size;
     }
@@ -3450,10 +4163,15 @@ fs_visitor::lower_simd_width()
        const unsigned lower_width = get_lowered_simd_width(devinfo, inst);
  
        if (lower_width != inst->exec_size) {
-         /* Builder matching the original instruction. */
+         /* Builder matching the original instruction.  We may also need to
+          * emit an instruction of width larger than the original, set the
+          * execution size of the builder to the highest of both for now so
+          * we're sure that both cases can be handled.
+          */
           const fs_builder ibld = bld.at(block, inst)
                                      .exec_all(inst->force_writemask_all)
-                                    .group(inst->exec_size, inst->force_sechalf);
+                                    .group(MAX2(inst->exec_size, lower_width),
+                                           inst->force_sechalf);
  
           /* Split the copies in chunks of the execution width of either the
            * original or the lowered instruction, whichever is lower.
@@ -3476,14 +4194,11 @@ fs_visitor::lower_simd_width()
              split_inst.exec_size = lower_width;
              split_inst.eot = inst->eot && i == n - 1;
  
-            /* Set exec_all if the lowered width is higher than the original
-             * to avoid breaking the compiler invariant that no control
-             * flow-masked instruction is wider than the shader's
-             * dispatch_width.  Then transform the sources and destination and
-             * emit the lowered instruction.
+            /* Select the correct channel enables for the i-th group, then
+             * transform the sources and destination and emit the lowered
+             * instruction.
               */
-            const fs_builder lbld = ibld.exec_all(lower_width > inst->exec_size)
-                                        .group(lower_width, i);
+            const fs_builder lbld = ibld.group(lower_width, i);
  
              for (unsigned j = 0; j < inst->sources; j++) {
                 if (inst->src[j].file != BAD_FILE &&
@@ -3956,9 +4671,11 @@ fs_visitor::optimize()
      * Ideally optimization passes wouldn't be part of the visitor so they
      * wouldn't have access to bld at all, but they do, so just in case some
      * pass forgets to ask for a location explicitly set it to NULL here to
-    * make it trip.
+    * make it trip.  The dispatch width is initialized to a bogus value to
+    * make sure that optimizations set the execution controls explicitly to
+    * match the code they are manipulating instead of relying on the defaults.
      */
-   bld = bld.at(NULL, NULL);
+   bld = fs_builder(this, 64);
  
     split_virtual_grfs();