intel/fs: Extend thread payload layout to SIMD32

[mesa.git] / src / intel / compiler / brw_fs.cpp
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp

index ee0d1967ecc348b5a1548a017da01c94263f7992..173fc8593d35789335121b7dddcd9c6a14515142 100644 (file)
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -805,6 +805,8 @@ fs_inst::components_read(unsigned i) const
        else
           return 1;
     }
+   case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
+      return (i == 0 ? 2 : 1);
  
     default:
        return 1;
@@ -840,7 +842,6 @@ fs_inst::size_read(int arg) const
     case SHADER_OPCODE_TYPED_SURFACE_WRITE:
     case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
     case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
-   case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
     case SHADER_OPCODE_BYTE_SCATTERED_WRITE:
     case SHADER_OPCODE_BYTE_SCATTERED_READ:
        if (arg == 0)
@@ -1074,7 +1075,7 @@ fs_visitor::emit_fragcoord_interpolation(fs_reg wpos)
  
     /* gl_FragCoord.z */
     if (devinfo->gen >= 6) {
-      bld.MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0)));
+      bld.MOV(wpos, fetch_payload_reg(bld, payload.source_depth_reg));
     } else {
        bld.emit(FS_OPCODE_LINTERP, wpos,
                 this->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL],
@@ -1212,30 +1213,16 @@ fs_visitor::emit_samplepos_setup()
      * The X, Y sample positions come in as bytes in the thread payload. So,
      * read the positions using vstride=16, width=8, hstride=2.
      */
-   struct brw_reg sample_pos_reg =
-      stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
-                    BRW_REGISTER_TYPE_B), 16, 8, 2);
+   const fs_reg sample_pos_reg =
+      fetch_payload_reg(abld, payload.sample_pos_reg, BRW_REGISTER_TYPE_W);
  
-   if (dispatch_width == 8) {
-      abld.MOV(int_sample_x, fs_reg(sample_pos_reg));
-   } else {
-      abld.half(0).MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg));
-      abld.half(1).MOV(half(int_sample_x, 1),
-                       fs_reg(suboffset(sample_pos_reg, 16)));
-   }
     /* Compute gl_SamplePosition.x */
-   compute_sample_position(pos, int_sample_x);
-   pos = offset(pos, abld, 1);
-   if (dispatch_width == 8) {
-      abld.MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1)));
-   } else {
-      abld.half(0).MOV(half(int_sample_y, 0),
-                       fs_reg(suboffset(sample_pos_reg, 1)));
-      abld.half(1).MOV(half(int_sample_y, 1),
-                       fs_reg(suboffset(sample_pos_reg, 17)));
-   }
+   abld.MOV(int_sample_x, subscript(sample_pos_reg, BRW_REGISTER_TYPE_B, 0));
+   compute_sample_position(offset(pos, abld, 0), int_sample_x);
+
     /* Compute gl_SamplePosition.y */
-   compute_sample_position(pos, int_sample_y);
+   abld.MOV(int_sample_y, subscript(sample_pos_reg, BRW_REGISTER_TYPE_B, 1));
+   compute_sample_position(offset(pos, abld, 1), int_sample_y);
     return reg;
  }
  
@@ -1344,8 +1331,8 @@ fs_visitor::emit_samplemaskin_setup()
  
     fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
  
-   fs_reg coverage_mask(retype(brw_vec8_grf(payload.sample_mask_in_reg, 0),
-                               BRW_REGISTER_TYPE_D));
+   fs_reg coverage_mask =
+      fetch_payload_reg(bld, payload.sample_mask_in_reg, BRW_REGISTER_TYPE_D);
  
     if (wm_prog_data->persample_dispatch) {
        /* gl_SampleMaskIn[] comes from two sources: the input coverage mask,
@@ -2665,7 +2652,7 @@ fs_visitor::opt_sampler_eot()
  {
     brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
  
-   if (stage != MESA_SHADER_FRAGMENT)
+   if (stage != MESA_SHADER_FRAGMENT || dispatch_width > 16)
        return false;
  
     if (devinfo->gen != 9 && !devinfo->is_cherryview)
@@ -3972,6 +3959,9 @@ lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst,
     unsigned length = 0;
  
     if (devinfo->gen < 6) {
+      /* TODO: Support SIMD32 on gen4-5 */
+      assert(bld.group() < 16);
+
        /* For gen4-5, we always have a header consisting of g0 and g1.  We have
         * an implied MOV from g0,g1 to the start of the message.  The MOV from
         * g0 is handled by the hardware and the MOV from g1 is provided by the
@@ -4005,10 +3995,20 @@ lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst,
         */
        const fs_builder ubld = bld.exec_all().group(8, 0);
  
-      /* The header starts off as g0 and g1 */
        fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
-      ubld.group(16, 0).MOV(header, retype(brw_vec8_grf(0, 0),
-                                           BRW_REGISTER_TYPE_UD));
+      if (bld.group() < 16) {
+         /* The header starts off as g0 and g1 for the first half */
+         ubld.group(16, 0).MOV(header, retype(brw_vec8_grf(0, 0),
+                                              BRW_REGISTER_TYPE_UD));
+      } else {
+         /* The header starts off as g0 and g2 for the second half */
+         assert(bld.group() < 32);
+         const fs_reg header_sources[2] = {
+            retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD),
+            retype(brw_vec8_grf(2, 0), BRW_REGISTER_TYPE_UD),
+         };
+         ubld.LOAD_PAYLOAD(header, header_sources, 2, 0);
+      }
  
        uint32_t g00_bits = 0;
  
@@ -4036,6 +4036,7 @@ lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst,
        }
  
        if (prog_data->uses_kill) {
+         assert(bld.group() < 16);
           ubld.group(1, 0).MOV(retype(component(header, 15),
                                       BRW_REGISTER_TYPE_UW),
                                brw_flag_reg(0, 1));
@@ -4049,11 +4050,12 @@ lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst,
     assert(length == 0 || length == 2);
     header_size = length;
  
-   if (payload.aa_dest_stencil_reg) {
+   if (payload.aa_dest_stencil_reg[0]) {
+      assert(inst->group < 16);
        sources[length] = fs_reg(VGRF, bld.shader->alloc.allocate(1));
        bld.group(8, 0).exec_all().annotate("FB write stencil/AA alpha")
           .MOV(sources[length],
-              fs_reg(brw_vec8_grf(payload.aa_dest_stencil_reg, 0)));
+              fs_reg(brw_vec8_grf(payload.aa_dest_stencil_reg[0], 0)));
        length++;
     }
  
@@ -4073,7 +4075,7 @@ lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst,
  
        bld.exec_all().annotate("FB write oMask")
           .MOV(horiz_offset(retype(sources[length], BRW_REGISTER_TYPE_UW),
-                           inst->group),
+                           inst->group % 16),
                sample_mask);
        length++;
     }
@@ -4118,7 +4120,7 @@ lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst,
  
     if (src_stencil.file != BAD_FILE) {
        assert(devinfo->gen >= 9);
-      assert(bld.dispatch_width() != 16);
+      assert(bld.dispatch_width() == 8);
  
        /* XXX: src_stencil is only available on gen9+. dst_depth is never
         * available on gen9+. As such it's impossible to have both enabled at the
@@ -4172,12 +4174,21 @@ lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst,
  static void
  lower_fb_read_logical_send(const fs_builder &bld, fs_inst *inst)
  {
-   const fs_builder &ubld = bld.exec_all();
+   const fs_builder &ubld = bld.exec_all().group(8, 0);
     const unsigned length = 2;
-   const fs_reg header = ubld.group(8, 0).vgrf(BRW_REGISTER_TYPE_UD, length);
+   const fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD, length);
  
-   ubld.group(16, 0)
-       .MOV(header, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
+   if (bld.group() < 16) {
+      ubld.group(16, 0).MOV(header, retype(brw_vec8_grf(0, 0),
+                                           BRW_REGISTER_TYPE_UD));
+   } else {
+      assert(bld.group() < 32);
+      const fs_reg header_sources[] = {
+         retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD),
+         retype(brw_vec8_grf(2, 0), BRW_REGISTER_TYPE_UD)
+      };
+      ubld.LOAD_PAYLOAD(header, header_sources, ARRAY_SIZE(header_sources), 0);
+   }
  
     inst->resize_sources(1);
     inst->src[0] = header;
@@ -5675,16 +5686,49 @@ fs_visitor::lower_simd_width()
            * after \p inst, inst->next is a moving target and we need to save
            * it off here so that we insert the zip instructions in the right
            * place.
+          *
+          * Since we're inserting split instructions after after_inst, the
+          * instructions will end up in the reverse order that we insert them.
+          * However, certain render target writes require that the low group
+          * instructions come before the high group.  From the Ivy Bridge PRM
+          * Vol. 4, Pt. 1, Section 3.9.11:
+          *
+          *    "If multiple SIMD8 Dual Source messages are delivered by the
+          *    pixel shader thread, each SIMD8_DUALSRC_LO message must be
+          *    issued before the SIMD8_DUALSRC_HI message with the same Slot
+          *    Group Select setting."
+          *
+          * And, from Section 3.9.11.1 of the same PRM:
+          *
+          *    "When SIMD32 or SIMD16 PS threads send render target writes
+          *    with multiple SIMD8 and SIMD16 messages, the following must
+          *    hold:
+          *
+          *    All the slots (as described above) must have a corresponding
+          *    render target write irrespective of the slot's validity. A slot
+          *    is considered valid when at least one sample is enabled. For
+          *    example, a SIMD16 PS thread must send two SIMD8 render target
+          *    writes to cover all the slots.
+          *
+          *    PS thread must send SIMD render target write messages with
+          *    increasing slot numbers. For example, SIMD16 thread has
+          *    Slot[15:0] and if two SIMD8 render target writes are used, the
+          *    first SIMD8 render target write must send Slot[7:0] and the
+          *    next one must send Slot[15:8]."
+          *
+          * In order to make low group instructions come before high group
+          * instructions (this is required for some render target writes), we
+          * split from the highest group to lowest.
            */
           exec_node *const after_inst = inst->next;
-         for (unsigned i = 0; i < n; i++) {
+         for (int i = n - 1; i >= 0; i--) {
              /* Emit a copy of the original instruction with the lowered width.
              * If the EOT flag was set, throw it away except for the last
               * instruction to avoid killing the thread prematurely.
               */
              fs_inst split_inst = *inst;
              split_inst.exec_size = lower_width;
-            split_inst.eot = inst->eot && i == 0;
+            split_inst.eot = inst->eot && i == n - 1;
  
              /* Select the correct channel enables for the i-th group, then
               * transform the sources and destination and emit the lowered
@@ -6010,7 +6054,7 @@ fs_visitor::setup_fs_payload_gen6()
      */
     for (int i = 0; i < BRW_BARYCENTRIC_MODE_COUNT; ++i) {
        if (prog_data->barycentric_interp_modes & (1 << i)) {
-         payload.barycentric_coord_reg[i] = payload.num_regs;
+         payload.barycentric_coord_reg[i][0] = payload.num_regs;
           payload.num_regs += 2;
           if (dispatch_width == 16) {
              payload.num_regs += 2;
@@ -6022,7 +6066,7 @@ fs_visitor::setup_fs_payload_gen6()
     prog_data->uses_src_depth =
        (nir->info.inputs_read & (1 << VARYING_SLOT_POS)) != 0;
     if (prog_data->uses_src_depth) {
-      payload.source_depth_reg = payload.num_regs;
+      payload.source_depth_reg[0] = payload.num_regs;
        payload.num_regs++;
        if (dispatch_width == 16) {
           /* R28: interpolated depth if not SIMD8. */
@@ -6034,7 +6078,7 @@ fs_visitor::setup_fs_payload_gen6()
     prog_data->uses_src_w =
        (nir->info.inputs_read & (1 << VARYING_SLOT_POS)) != 0;
     if (prog_data->uses_src_w) {
-      payload.source_w_reg = payload.num_regs;
+      payload.source_w_reg[0] = payload.num_regs;
        payload.num_regs++;
        if (dispatch_width == 16) {
           /* R30: interpolated W if not SIMD8. */
@@ -6055,7 +6099,7 @@ fs_visitor::setup_fs_payload_gen6()
         * persample dispatch, we hard-code it to 0.5.
         */
        prog_data->uses_pos_offset = true;
-      payload.sample_pos_reg = payload.num_regs;
+      payload.sample_pos_reg[0] = payload.num_regs;
        payload.num_regs++;
     }
  
@@ -6064,7 +6108,7 @@ fs_visitor::setup_fs_payload_gen6()
        (nir->info.system_values_read & SYSTEM_BIT_SAMPLE_MASK_IN) != 0;
     if (prog_data->uses_sample_mask) {
        assert(devinfo->gen >= 7);
-      payload.sample_mask_in_reg = payload.num_regs;
+      payload.sample_mask_in_reg[0] = payload.num_regs;
        payload.num_regs++;
        if (dispatch_width == 16) {
           /* R33: input coverage mask if not SIMD8. */
@@ -7050,7 +7094,7 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
                 const nir_shader *src_shader,
                 struct gl_program *prog,
                 int shader_time_index8, int shader_time_index16,
-               bool allow_spilling,
+               int shader_time_index32, bool allow_spilling,
                 bool use_rep_send, struct brw_vue_map *vue_map,
                 char **error_str)
  {