vec4_result, surf_index, vec4_offset);
inst->size_written = 4 * vec4_result.component_size(inst->exec_size);
- fs_reg dw = offset(vec4_result, bld, (const_offset & 0xf) / 4);
- switch (type_sz(dst.type)) {
- case 2:
- shuffle_32bit_load_result_to_16bit_data(bld, dst, dw, 1);
- bld.MOV(dst, subscript(dw, dst.type, (const_offset / 2) & 1));
- break;
- case 4:
- bld.MOV(dst, retype(dw, dst.type));
- break;
- case 8:
- shuffle_32bit_load_result_to_64bit_data(bld, dst, dw, 1);
- break;
- default:
- unreachable("Unsupported bit_size");
- }
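+   /* shuffle_from_32bit_read() handles 16-, 32- and 64-bit destination
+    * types in one call, so no per-size switch is needed.
+    */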
+ shuffle_from_32bit_read(bld, dst, vec4_result,
+ (const_offset & 0xf) / type_sz(dst.type), 1);
}
/**
case FS_OPCODE_PACK_HALF_2x16_SPLIT:
/* Multiple partial writes to the destination */
return true;
+ case SHADER_OPCODE_SHUFFLE:
+ /* This instruction returns an arbitrary channel from the source and
+ * gets split into smaller instructions in the generator. It's possible
+ * that one of the instructions will read from a channel corresponding
+ * to an earlier instruction.
+ */
+ case SHADER_OPCODE_SEL_EXEC:
+ /* This is implemented as
+ *
+ * mov(16) g4<1>D 0D { align1 WE_all 1H };
+ * mov(16) g4<1>D g5<8,8,1>D { align1 1H }
+ *
+ * Because the source is only read in the second instruction, the first
+    * may stomp all over the source.
+ */
+ return true;
default:
/* The SIMD16 compressed instruction
*
stride == r.stride);
}
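+/* Return true if this register matches \p r up to the negation source
+ * modifier (see backend_reg::negative_equals) and has the same stride.
+ */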
+bool
+fs_reg::negative_equals(const fs_reg &r) const
+{
+ return (this->backend_reg::negative_equals(r) &&
+ stride == r.stride);
+}
+
bool
fs_reg::is_contiguous() const
{
case GLSL_TYPE_INT16:
case GLSL_TYPE_FLOAT16:
return DIV_ROUND_UP(type->components(), 2);
+ case GLSL_TYPE_UINT8:
+ case GLSL_TYPE_INT8:
+ return DIV_ROUND_UP(type->components(), 4);
case GLSL_TYPE_DOUBLE:
case GLSL_TYPE_UINT64:
case GLSL_TYPE_INT64:
{
switch (opcode) {
case FS_OPCODE_FB_WRITE:
+ case FS_OPCODE_REP_FB_WRITE:
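+      /* When base_mrf >= 0 the payload lives in MRFs, so src[0] holds at
+       * most the implied g0-g1 header; otherwise the whole mlen-register
+       * payload is read from the GRF.
+       */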
+ if (arg == 0) {
+ if (base_mrf >= 0)
+ return src[0].file == BAD_FILE ? 0 : 2 * REG_SIZE;
+ else
+ return mlen * REG_SIZE;
+ }
+ break;
+
case FS_OPCODE_FB_READ:
case SHADER_OPCODE_URB_WRITE_SIMD8:
case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
case SHADER_OPCODE_TYPED_ATOMIC:
case SHADER_OPCODE_TYPED_SURFACE_READ:
case SHADER_OPCODE_TYPED_SURFACE_WRITE:
+ case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
+ case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
case SHADER_OPCODE_BYTE_SCATTERED_WRITE:
case SHADER_OPCODE_BYTE_SCATTERED_READ:
fs_inst::flags_written() const
{
if ((conditional_mod && (opcode != BRW_OPCODE_SEL &&
+ opcode != BRW_OPCODE_CSEL &&
opcode != BRW_OPCODE_IF &&
opcode != BRW_OPCODE_WHILE)) ||
- opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
+ opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS ||
+ opcode == SHADER_OPCODE_FIND_LIVE_CHANNEL ||
+ opcode == FS_OPCODE_FB_WRITE) {
return flag_mask(this);
} else {
return flag_mask(dst, size_written);
case SHADER_OPCODE_SAMPLEINFO:
return 1;
case FS_OPCODE_FB_WRITE:
- return 2;
+ case FS_OPCODE_REP_FB_WRITE:
+ return inst->src[0].file == BAD_FILE ? 0 : 2;
case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
case SHADER_OPCODE_GEN4_SCRATCH_READ:
return 1;
bld.MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0)));
} else {
bld.emit(FS_OPCODE_LINTERP, wpos,
- this->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL],
- interp_reg(VARYING_SLOT_POS, 2));
+ this->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL],
+ component(interp_reg(VARYING_SLOT_POS, 2), 0));
}
wpos = offset(wpos, bld, 1);
* setup regs, now that the location of the constants has been chosen.
*/
foreach_block_and_inst(block, fs_inst, inst, cfg) {
- if (inst->opcode == FS_OPCODE_LINTERP) {
- assert(inst->src[1].file == FIXED_GRF);
- inst->src[1].nr += urb_start;
- }
-
- if (inst->opcode == FS_OPCODE_CINTERP) {
- assert(inst->src[0].file == FIXED_GRF);
- inst->src[0].nr += urb_start;
+ for (int i = 0; i < inst->sources; i++) {
+ if (inst->src[i].file == ATTR) {
+         /* ATTR regs in the FS are in units of logical scalar inputs, each
+          * of which consumes half of a GRF register.
+ */
+ assert(inst->src[i].offset < REG_SIZE / 2);
+ const unsigned grf = urb_start + inst->src[i].nr / 2;
+ const unsigned offset = (inst->src[i].nr % 2) * (REG_SIZE / 2) +
+ inst->src[i].offset;
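+            /* A stride of zero denotes a scalar, so use a width of one;
+             * otherwise limit the region to a single row of at most eight
+             * channels.
+             */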
+ const unsigned width = inst->src[i].stride == 0 ?
+ 1 : MIN2(inst->exec_size, 8);
+ struct brw_reg reg = stride(
+ byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
+ offset),
+ width * inst->src[i].stride,
+ width, inst->src[i].stride);
+ reg.abs = inst->src[i].abs;
+ reg.negate = inst->src[i].negate;
+ inst->src[i] = reg;
+ }
}
}
static void
cplx_align_assert_sane(struct cplx_align a)
{
- assert(a.mul > 0 && util_is_power_of_two(a.mul));
+ assert(a.mul > 0 && util_is_power_of_two_nonzero(a.mul));
assert(a.offset < a.mul);
}
mark_uniform_slots_read(struct uniform_slot_info *slots,
unsigned num_slots, unsigned alignment)
{
- assert(alignment > 0 && util_is_power_of_two(alignment));
+ assert(alignment > 0 && util_is_power_of_two_nonzero(alignment));
assert(alignment <= CPLX_ALIGN_MAX_MUL);
/* We can't align a slot to anything less than the slot size */
}
break;
case BRW_OPCODE_OR:
- if (inst->src[0].equals(inst->src[1])) {
+ if (inst->src[0].equals(inst->src[1]) ||
+ inst->src[1].is_zero()) {
inst->opcode = BRW_OPCODE_MOV;
inst->src[1] = reg_undef;
progress = true;
}
break;
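+      /* A shuffle of a uniform value, or one indexed by an immediate,
+       * reduces to a MOV from the source or from a single channel of it.
+       */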
+ case SHADER_OPCODE_SHUFFLE:
+ if (is_uniform(inst->src[0])) {
+ inst->opcode = BRW_OPCODE_MOV;
+ inst->sources = 1;
+ progress = true;
+ } else if (inst->src[1].file == IMM) {
+ inst->opcode = BRW_OPCODE_MOV;
+ inst->src[0] = component(inst->src[0],
+ inst->src[1].ud);
+ inst->sources = 1;
+ progress = true;
+ }
+ break;
+
default:
break;
}
return ((1 << n) - 1) << shift;
}
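+/**
+ * Fuse a flag-writing CMP or MOV with a dependent predicated SEL into a
+ * single CSEL instruction where the Gen8+ restrictions allow it.
+ */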
+bool
+fs_visitor::opt_peephole_csel()
+{
+ if (devinfo->gen < 8)
+ return false;
+
+ bool progress = false;
+
+ foreach_block_reverse(block, cfg) {
+ int ip = block->end_ip + 1;
+
+ foreach_inst_in_block_reverse_safe(fs_inst, inst, block) {
+ ip--;
+
+ if (inst->opcode != BRW_OPCODE_SEL ||
+ inst->predicate != BRW_PREDICATE_NORMAL ||
+ (inst->dst.type != BRW_REGISTER_TYPE_F &&
+ inst->dst.type != BRW_REGISTER_TYPE_D &&
+ inst->dst.type != BRW_REGISTER_TYPE_UD))
+ continue;
+
+ /* Because it is a 3-src instruction, CSEL cannot have an immediate
+ * value as a source, but we can sometimes handle zero.
+ */
+ if ((inst->src[0].file != VGRF && inst->src[0].file != ATTR &&
+ inst->src[0].file != UNIFORM) ||
+ (inst->src[1].file != VGRF && inst->src[1].file != ATTR &&
+ inst->src[1].file != UNIFORM && !inst->src[1].is_zero()))
+ continue;
+
+ foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
+ if (!scan_inst->flags_written())
+ continue;
+
+ if ((scan_inst->opcode != BRW_OPCODE_CMP &&
+ scan_inst->opcode != BRW_OPCODE_MOV) ||
+ scan_inst->predicate != BRW_PREDICATE_NONE ||
+ (scan_inst->src[0].file != VGRF &&
+ scan_inst->src[0].file != ATTR &&
+ scan_inst->src[0].file != UNIFORM) ||
+ scan_inst->src[0].type != BRW_REGISTER_TYPE_F)
+ break;
+
+ if (scan_inst->opcode == BRW_OPCODE_CMP && !scan_inst->src[1].is_zero())
+ break;
+
+ const brw::fs_builder ibld(this, block, inst);
+
+ const enum brw_conditional_mod cond =
+ inst->predicate_inverse
+ ? brw_negate_cmod(scan_inst->conditional_mod)
+ : scan_inst->conditional_mod;
+
+ fs_inst *csel_inst = NULL;
+
+ if (inst->src[1].file != IMM) {
+ csel_inst = ibld.CSEL(inst->dst,
+ inst->src[0],
+ inst->src[1],
+ scan_inst->src[0],
+ cond);
+ } else if (cond == BRW_CONDITIONAL_NZ) {
+ /* Consider the sequence
+ *
+ * cmp.nz.f0 null<1>F g3<8,8,1>F 0F
+ * (+f0) sel g124<1>UD g2<8,8,1>UD 0x00000000UD
+ *
+             * The sel will pick the immediate value 0 if g3 is ±0.0.
+ * Therefore, this sequence is equivalent:
+ *
+ * cmp.nz.f0 null<1>F g3<8,8,1>F 0F
+ * (+f0) sel g124<1>F g2<8,8,1>F (abs)g3<8,8,1>F
+ *
+             * The abs ensures that the result is 0UD when g3 is -0.0F.
+ * By normal cmp-sel merging, this is also equivalent:
+ *
+ * csel.nz g124<1>F g2<4,4,1>F (abs)g3<4,4,1>F g3<4,4,1>F
+ */
+ csel_inst = ibld.CSEL(inst->dst,
+ inst->src[0],
+ scan_inst->src[0],
+ scan_inst->src[0],
+ cond);
+
+ csel_inst->src[1].abs = true;
+ }
+
+ if (csel_inst != NULL) {
+ progress = true;
+ inst->remove(block);
+ }
+
+ break;
+ }
+ }
+ }
+
+ return progress;
+}
+
bool
fs_visitor::compute_to_mrf()
{
.MOV(vec4(brw_message_reg(color_mrf)), fs_reg(reg));
}
- fs_inst *write;
+ fs_inst *write = NULL;
if (key->nr_color_regions == 1) {
write = bld.emit(FS_OPCODE_REP_FB_WRITE);
write->saturate = key->clamp_fragment_color;
write->mlen = 1;
} else {
assume(key->nr_color_regions > 0);
+
+ struct brw_reg header =
+ retype(brw_message_reg(base_mrf), BRW_REGISTER_TYPE_UD);
+ bld.exec_all().group(16, 0)
+ .MOV(header, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
+
for (int i = 0; i < key->nr_color_regions; ++i) {
+ if (i > 0) {
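+         /* Update the render target index in header.2 so each write goes
+          * to the next render target.
+          */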
+ bld.exec_all().group(1, 0)
+ .MOV(component(header, 2), brw_imm_ud(i));
+ }
+
write = bld.emit(FS_OPCODE_REP_FB_WRITE);
write->saturate = key->clamp_fragment_color;
write->base_mrf = base_mrf;
}
}
write->eot = true;
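+   /* Mark the final FB write so the generator sets the "Last Render Target
+    * Select" bit in the message descriptor.
+    */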
+ write->last_rt = true;
calculate_cfg();
inst->dst.type != BRW_REGISTER_TYPE_UD))
continue;
- /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit
- * operation directly, but CHV/BXT cannot.
- */
- if (devinfo->gen >= 8 &&
- !devinfo->is_cherryview && !gen_device_info_is_9lp(devinfo))
+ if (devinfo->has_integer_dword_mul)
continue;
if (inst->src[1].file == IMM &&
int header_size = 2, payload_header_size;
unsigned length = 0;
- /* From the Sandy Bridge PRM, volume 4, page 198:
- *
- * "Dispatched Pixel Enables. One bit per pixel indicating
- * which pixels were originally enabled when the thread was
- * dispatched. This field is only required for the end-of-
- * thread message and on all dual-source messages."
- */
- if (devinfo->gen >= 6 &&
- (devinfo->is_haswell || devinfo->gen >= 8 || !prog_data->uses_kill) &&
- color1.file == BAD_FILE &&
- key->nr_color_regions == 1) {
- header_size = 0;
- }
+ if (devinfo->gen < 6) {
+ /* For gen4-5, we always have a header consisting of g0 and g1. We have
+ * an implied MOV from g0,g1 to the start of the message. The MOV from
+ * g0 is handled by the hardware and the MOV from g1 is provided by the
+ * generator. This is required because, on gen4-5, the generator may
+ * generate two write messages with different message lengths in order
+ * to handle AA data properly.
+ *
+ * Also, since the pixel mask goes in the g0 portion of the message and
+ * since render target writes are the last thing in the shader, we write
+ * the pixel mask directly into g0 and it will get copied as part of the
+ * implied write.
+ */
+ if (prog_data->uses_kill) {
+ bld.exec_all().group(1, 0)
+ .MOV(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW),
+ brw_flag_reg(0, 1));
+ }
- if (header_size != 0) {
- assert(header_size == 2);
- /* Allocate 2 registers for a header */
- length += 2;
+ assert(length == 0);
+ length = 2;
+ } else if ((devinfo->gen <= 7 && !devinfo->is_haswell &&
+ prog_data->uses_kill) ||
+ color1.file != BAD_FILE ||
+ key->nr_color_regions > 1) {
+ /* From the Sandy Bridge PRM, volume 4, page 198:
+ *
+ * "Dispatched Pixel Enables. One bit per pixel indicating
+ * which pixels were originally enabled when the thread was
+ * dispatched. This field is only required for the end-of-
+ * thread message and on all dual-source messages."
+ */
+ const fs_builder ubld = bld.exec_all().group(8, 0);
+
+ /* The header starts off as g0 and g1 */
+ fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
+ ubld.group(16, 0).MOV(header, retype(brw_vec8_grf(0, 0),
+ BRW_REGISTER_TYPE_UD));
+
+ uint32_t g00_bits = 0;
+
+ /* Set "Source0 Alpha Present to RenderTarget" bit in message
+ * header.
+ */
+ if (inst->target > 0 && key->replicate_alpha)
+ g00_bits |= 1 << 11;
+
+      /* Set the "computes stencil" bit in the message header. */
+ if (prog_data->computed_stencil)
+ g00_bits |= 1 << 14;
+
+ if (g00_bits) {
+ /* OR extra bits into g0.0 */
+ ubld.group(1, 0).OR(component(header, 0),
+ retype(brw_vec1_grf(0, 0),
+ BRW_REGISTER_TYPE_UD),
+ brw_imm_ud(g00_bits));
+ }
+
+ /* Set the render target index for choosing BLEND_STATE. */
+ if (inst->target > 0) {
+ ubld.group(1, 0).MOV(component(header, 2), brw_imm_ud(inst->target));
+ }
+
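+      /* Replace the dispatched pixel enables in the header with the live
+       * pixel mask from f0.1 when the shader may have discarded pixels.
+       */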
+ if (prog_data->uses_kill) {
+ ubld.group(1, 0).MOV(retype(component(header, 15),
+ BRW_REGISTER_TYPE_UW),
+ brw_flag_reg(0, 1));
+ }
+
+ assert(length == 0);
+ sources[0] = header;
+ sources[1] = horiz_offset(header, 8);
+ length = 2;
}
+ assert(length == 0 || length == 2);
+ header_size = length;
if (payload.aa_dest_stencil_reg) {
sources[length] = fs_reg(VGRF, bld.shader->alloc.allocate(1));
if (devinfo->gen < 6 && bld.dispatch_width() == 16)
load->dst.nr |= BRW_MRF_COMPR4;
- inst->resize_sources(0);
+ if (devinfo->gen < 6) {
+ /* Set up src[0] for the implied MOV from grf0-1 */
+ inst->resize_sources(1);
+ inst->src[0] = brw_vec8_grf(0, 0);
+ } else {
+ inst->resize_sources(0);
+ }
inst->base_mrf = 1;
}
op == SHADER_OPCODE_SAMPLEINFO ||
is_high_sampler(devinfo, sampler)) {
/* For general texture offsets (no txf workaround), we need a header to
- * put them in. Note that we're only reserving space for it in the
- * message payload as it will be initialized implicitly by the
- * generator.
+ * put them in.
*
* TG4 needs to place its channel select in the header, for interaction
* with ARB_texture_swizzle. The sampler index is only 4 bits, so for
* larger sampler numbers we need to offset the Sampler State Pointer in
* the header.
*/
+ fs_reg header = retype(sources[0], BRW_REGISTER_TYPE_UD);
header_size = 1;
- sources[0] = fs_reg();
length++;
/* If we're requesting fewer than four channels worth of response,
unsigned mask = ~((1 << (regs_written(inst) / reg_width)) - 1) & 0xf;
inst->offset |= mask << 12;
}
+
+ /* Build the actual header */
+ const fs_builder ubld = bld.exec_all().group(8, 0);
+ const fs_builder ubld1 = ubld.group(1, 0);
+ ubld.MOV(header, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
+ if (inst->offset) {
+ ubld1.MOV(component(header, 2), brw_imm_ud(inst->offset));
+ } else if (bld.shader->stage != MESA_SHADER_VERTEX &&
+ bld.shader->stage != MESA_SHADER_FRAGMENT) {
+ /* The vertex and fragment stages have g0.2 set to 0, so
+       * header.2 is 0 when g0 is copied. Other stages may not, so we
+ * must set it to 0 to avoid setting undesirable bits in the
+ * message.
+ */
+ ubld1.MOV(component(header, 2), brw_imm_ud(0));
+ }
+
+ if (is_high_sampler(devinfo, sampler)) {
+ if (sampler.file == BRW_IMMEDIATE_VALUE) {
+ assert(sampler.ud >= 16);
+ const int sampler_state_size = 16; /* 16 bytes */
+
+ ubld1.ADD(component(header, 3),
+ retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD),
+ brw_imm_ud(16 * (sampler.ud / 16) * sampler_state_size));
+ } else {
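+        /* Compute (sampler / 16) * 256 dynamically, matching the immediate
+         * case above: each group of 16 samplers offsets the Sampler State
+         * Pointer by 16 states of 16 bytes.
+         */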
+ fs_reg tmp = ubld1.vgrf(BRW_REGISTER_TYPE_UD);
+ ubld1.AND(tmp, sampler, brw_imm_ud(0x0f0));
+ ubld1.SHL(tmp, tmp, brw_imm_ud(4));
+ ubld1.ADD(component(header, 3),
+ retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD),
+ tmp);
+ }
+ }
}
if (shadow_c.file != BAD_FILE) {
lower_surface_logical_send(const fs_builder &bld, fs_inst *inst, opcode op,
const fs_reg &sample_mask)
{
+ const gen_device_info *devinfo = bld.shader->devinfo;
+
/* Get the logical send arguments. */
const fs_reg &addr = inst->src[0];
const fs_reg &src = inst->src[1];
/* Calculate the total number of components of the payload. */
const unsigned addr_sz = inst->components_read(0);
const unsigned src_sz = inst->components_read(1);
- const unsigned header_sz = (sample_mask.file == BAD_FILE ? 0 : 1);
+ /* From the BDW PRM Volume 7, page 147:
+ *
+ * "For the Data Cache Data Port*, the header must be present for the
+ * following message types: [...] Typed read/write/atomics"
+ *
+ * Earlier generations have a similar wording. Because of this restriction
+ * we don't attempt to implement sample masks via predication for such
+ * messages prior to Gen9, since we have to provide a header anyway. On
+ * Gen11+ the header has been removed so we can only use predication.
+ */
+ const unsigned header_sz = devinfo->gen < 9 &&
+ (op == SHADER_OPCODE_TYPED_SURFACE_READ ||
+ op == SHADER_OPCODE_TYPED_SURFACE_WRITE ||
+ op == SHADER_OPCODE_TYPED_ATOMIC) ? 1 : 0;
const unsigned sz = header_sz + addr_sz + src_sz;
/* Allocate space for the payload. */
bld.LOAD_PAYLOAD(payload, components, sz, header_sz);
+ /* Predicate the instruction on the sample mask if no header is
+ * provided.
+ */
+ if (!header_sz && sample_mask.file != BAD_FILE &&
+ sample_mask.file != IMM) {
+ const fs_builder ubld = bld.group(1, 0).exec_all();
+ if (inst->predicate) {
+ assert(inst->predicate == BRW_PREDICATE_NORMAL);
+ assert(!inst->predicate_inverse);
+ assert(inst->flag_subreg < 2);
+ /* Combine the sample mask with the existing predicate by using a
+ * vertical predication mode.
+ */
+ inst->predicate = BRW_PREDICATE_ALIGN1_ALLV;
+ ubld.MOV(retype(brw_flag_subreg(inst->flag_subreg + 2),
+ sample_mask.type),
+ sample_mask);
+ } else {
+ inst->flag_subreg = 2;
+ inst->predicate = BRW_PREDICATE_NORMAL;
+ inst->predicate_inverse = false;
+ ubld.MOV(retype(brw_flag_subreg(inst->flag_subreg), sample_mask.type),
+ sample_mask);
+ }
+ }
+
/* Update the original instruction. */
inst->opcode = op;
inst->mlen = header_sz + (addr_sz + src_sz) * inst->exec_size / 8;
case BRW_OPCODE_MAD:
case BRW_OPCODE_LRP:
case FS_OPCODE_PACK:
+ case SHADER_OPCODE_SEL_EXEC:
+ case SHADER_OPCODE_CLUSTER_BROADCAST:
return get_fpu_lowered_simd_width(devinfo, inst);
case BRW_OPCODE_CMP: {
case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
return MIN2(8, inst->exec_size);
+ case SHADER_OPCODE_QUAD_SWIZZLE:
+ return 8;
+
case SHADER_OPCODE_MOV_INDIRECT: {
/* From IVB and HSW PRMs:
*
fs_inst *inst = (fs_inst *)be_inst;
if (inst->predicate) {
- fprintf(file, "(%cf0.%d) ",
- inst->predicate_inverse ? '-' : '+',
- inst->flag_subreg);
+ fprintf(file, "(%cf%d.%d) ",
+ inst->predicate_inverse ? '-' : '+',
+ inst->flag_subreg / 2,
+ inst->flag_subreg % 2);
}
fprintf(file, "%s", brw_instruction_name(devinfo, inst->opcode));
fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
if (!inst->predicate &&
(devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
+ inst->opcode != BRW_OPCODE_CSEL &&
inst->opcode != BRW_OPCODE_IF &&
inst->opcode != BRW_OPCODE_WHILE))) {
- fprintf(file, ".f0.%d", inst->flag_subreg);
+ fprintf(file, ".f%d.%d", inst->flag_subreg / 2,
+ inst->flag_subreg % 2);
}
}
fprintf(file, "(%d) ", inst->exec_size);
bool
fs_visitor::opt_drop_redundant_mov_to_flags()
{
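+   /* One slot per flag subregister: f0.0, f0.1, f1.0 and f1.1. */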
- bool flag_mov_found[2] = {false};
+ bool flag_mov_found[4] = {false};
bool progress = false;
/* Instructions removed by this pass can only be added if this were true */
OPT(compact_virtual_grfs);
} while (progress);
+ /* Do this after cmod propagation has had every possible opportunity to
+ * propagate results into SEL instructions.
+ */
+ if (OPT(opt_peephole_csel))
+ OPT(dead_code_eliminate);
+
progress = false;
pass_num = 0;
brw_compute_barycentric_interp_modes(compiler->devinfo, shader);
cfg_t *simd8_cfg = NULL, *simd16_cfg = NULL;
- uint8_t simd8_grf_start = 0, simd16_grf_start = 0;
- unsigned simd8_grf_used = 0, simd16_grf_used = 0;
fs_visitor v8(compiler, log_data, mem_ctx, key,
&prog_data->base, prog, shader, 8,
return NULL;
} else if (likely(!(INTEL_DEBUG & DEBUG_NO8))) {
simd8_cfg = v8.cfg;
- simd8_grf_start = v8.payload.num_regs;
- simd8_grf_used = v8.grf_used;
+ prog_data->base.dispatch_grf_start_reg = v8.payload.num_regs;
+ prog_data->reg_blocks_8 = brw_register_blocks(v8.grf_used);
}
if (v8.max_dispatch_width >= 16 &&
v16.fail_msg);
} else {
simd16_cfg = v16.cfg;
- simd16_grf_start = v16.payload.num_regs;
- simd16_grf_used = v16.grf_used;
+ prog_data->dispatch_grf_start_reg_16 = v16.payload.num_regs;
+ prog_data->reg_blocks_16 = brw_register_blocks(v16.grf_used);
}
}
if (compiler->devinfo->gen < 5 && simd16_cfg)
simd8_cfg = NULL;
+ if (compiler->devinfo->gen <= 5 && !simd8_cfg) {
+    /* Ironlake and earlier only have one Dispatch GRF start field. Make
+ * the data available in the base prog data struct for convenience.
+ */
+ if (simd16_cfg) {
+ prog_data->base.dispatch_grf_start_reg =
+ prog_data->dispatch_grf_start_reg_16;
+ }
+ }
+
if (prog_data->persample_dispatch) {
/* Starting with SandyBridge (where we first get MSAA), the different
* pixel dispatch combinations are grouped into classifications A
*/
brw_compute_flat_inputs(prog_data, shader);
- fs_generator g(compiler, log_data, mem_ctx, (void *) key, &prog_data->base,
+ fs_generator g(compiler, log_data, mem_ctx, &prog_data->base,
v8.promoted_constants, v8.runtime_check_aads_emit,
MESA_SHADER_FRAGMENT);
if (simd8_cfg) {
prog_data->dispatch_8 = true;
g.generate_code(simd8_cfg, 8);
- prog_data->base.dispatch_grf_start_reg = simd8_grf_start;
- prog_data->reg_blocks_0 = brw_register_blocks(simd8_grf_used);
+ }
- if (simd16_cfg) {
- prog_data->dispatch_16 = true;
- prog_data->prog_offset_2 = g.generate_code(simd16_cfg, 16);
- prog_data->dispatch_grf_start_reg_2 = simd16_grf_start;
- prog_data->reg_blocks_2 = brw_register_blocks(simd16_grf_used);
- }
- } else if (simd16_cfg) {
+ if (simd16_cfg) {
prog_data->dispatch_16 = true;
- g.generate_code(simd16_cfg, 16);
- prog_data->base.dispatch_grf_start_reg = simd16_grf_start;
- prog_data->reg_blocks_0 = brw_register_blocks(simd16_grf_used);
+ prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
}
- return g.get_assembly(&prog_data->base.program_size);
+ return g.get_assembly();
}
fs_reg *
compile_cs_to_nir(const struct brw_compiler *compiler,
void *mem_ctx,
const struct brw_cs_prog_key *key,
- struct brw_cs_prog_data *prog_data,
const nir_shader *src_shader,
unsigned dispatch_width)
{
fs_visitor *v8 = NULL, *v16 = NULL, *v32 = NULL;
cfg_t *cfg = NULL;
const char *fail_msg = NULL;
- unsigned promoted_constants;
+ unsigned promoted_constants = 0;
/* Now the main event: Visit the shader IR and generate our CS IR for it.
*/
if (min_dispatch_width <= 8) {
nir_shader *nir8 = compile_cs_to_nir(compiler, mem_ctx, key,
- prog_data, src_shader, 8);
+ src_shader, 8);
v8 = new fs_visitor(compiler, log_data, mem_ctx, key, &prog_data->base,
NULL, /* Never used in core profile */
nir8, 8, shader_time_index);
!fail_msg && min_dispatch_width <= 16) {
/* Try a SIMD16 compile */
nir_shader *nir16 = compile_cs_to_nir(compiler, mem_ctx, key,
- prog_data, src_shader, 16);
+ src_shader, 16);
v16 = new fs_visitor(compiler, log_data, mem_ctx, key, &prog_data->base,
NULL, /* Never used in core profile */
nir16, 16, shader_time_index);
if (!fail_msg && (min_dispatch_width > 16 || (INTEL_DEBUG & DEBUG_DO32))) {
/* Try a SIMD32 compile */
nir_shader *nir32 = compile_cs_to_nir(compiler, mem_ctx, key,
- prog_data, src_shader, 32);
+ src_shader, 32);
v32 = new fs_visitor(compiler, log_data, mem_ctx, key, &prog_data->base,
NULL, /* Never used in core profile */
nir32, 32, shader_time_index);
if (error_str)
*error_str = ralloc_strdup(mem_ctx, fail_msg);
} else {
- fs_generator g(compiler, log_data, mem_ctx, (void*) key, &prog_data->base,
+ fs_generator g(compiler, log_data, mem_ctx, &prog_data->base,
promoted_constants, false, MESA_SHADER_COMPUTE);
if (INTEL_DEBUG & DEBUG_CS) {
char *name = ralloc_asprintf(mem_ctx, "%s compute shader %s",
g.generate_code(cfg, prog_data->simd_size);
- ret = g.get_assembly(&prog_data->base.program_size);
+ ret = g.get_assembly();
}
delete v8;