i965/skl: Add the header for constant loads outside of the generator
authorNeil Roberts <neil@linux.intel.com>
Tue, 24 Mar 2015 15:52:20 +0000 (15:52 +0000)
committerNeil Roberts <neil@linux.intel.com>
Thu, 16 Apr 2015 12:02:26 +0000 (13:02 +0100)
Commit 5a06ee738 added a step to the generator to set up the message
header when generating the VS_OPCODE_PULL_CONSTANT_LOAD_GEN7
instruction. That pseudo opcode is implemented in terms of multiple
actual opcodes, one of which writes to one of the source registers in
order to set up the message header. This causes problems because the
scheduler isn't aware that the source register is written to and it
can end up reorganising the instructions incorrectly such that the
write to the source register overwrites a needed value from a previous
instruction. This problem was presenting itself as a rendering error
in the weapon in Enemy Territory: Quake Wars.

Since commit 588859e1 there is an additional problem that the double
register allocated to include the message header would end up being
split into two. This wasn't happening previously because the code to
split registers was explicitly avoided for instructions that are
sending from the GRF.

This patch fixes both problems by splitting the code to set up the
message header into a new pseudo opcode so that it will be done
outside of the generator. This new opcode has the header register as a
destination so the scheduler can recognise that the register is
written to. This has the additional benefit that the scheduler can
optimise the message header slightly better by moving the mov
instructions further away from the send instructions.

On Skylake it appears to fix the following three Piglit tests without
causing any regressions:

 gs-float-array-variable-index
 gs-mat3x4-row-major
 gs-mat4x3-row-major

I think we actually may need to do something similar for the fs
backend and possibly for message headers from regular texture sampling
but I'm not entirely sure.

v2: Make sure the exec-size is retained as 8 for the mov instruction
    to initialise the header from g0. This was accidentally lost
    during a rebase on top of 07c571a39fa1.
    Split the patch into two so that the helper function is a separate
    change.
    Fix emitting the MOV instruction on Gen7.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=89058
Reviewed-by: Ben Widawsky <ben@bwidawsk.net>
src/mesa/drivers/dri/i965/brw_defines.h
src/mesa/drivers/dri/i965/brw_shader.cpp
src/mesa/drivers/dri/i965/brw_vec4.h
src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp
src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp

index da6ed5b973e7b46190567a7c2243a80259a068f0..a97a9443f3ddb85fca3e8a82698d71f6a50e30f5 100644 (file)
@@ -948,6 +948,7 @@ enum opcode {
    VS_OPCODE_URB_WRITE,
    VS_OPCODE_PULL_CONSTANT_LOAD,
    VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
+   VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
    VS_OPCODE_UNPACK_FLAGS_SIMD4X2,
 
    /**
index 335a8007e12254374206feec27e82f0bee801a72..0d6ac0c9311aef99600873b2b86d1398d5f36b44 100644 (file)
@@ -568,6 +568,10 @@ brw_instruction_name(enum opcode op)
       return "pull_constant_load";
    case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
       return "pull_constant_load_gen7";
+
+   case VS_OPCODE_SET_SIMD4X2_HEADER_GEN9:
+      return "set_simd4x2_header_gen9";
+
    case VS_OPCODE_UNPACK_FLAGS_SIMD4X2:
       return "unpack_flags_simd4x2";
 
index 036392488646c4a6e4ac36be3e73e1d2c7acb186..a0ee2cc603b6611b100da9defdd5059d609ca5c1 100644 (file)
@@ -500,6 +500,8 @@ private:
                                          struct brw_reg dst,
                                          struct brw_reg surf_index,
                                          struct brw_reg offset);
+   void generate_set_simd4x2_header_gen9(vec4_instruction *inst,
+                                         struct brw_reg dst);
    void generate_unpack_flags(struct brw_reg dst);
 
    void generate_untyped_atomic(vec4_instruction *inst,
index 980e266750a45075d1e6c1a7a8b27dc79d139b4c..70d2af5e911ac2acdfdc3a1746f70dc551e5d307 100644 (file)
@@ -44,6 +44,7 @@ can_do_writemask(const struct brw_context *brw,
    case SHADER_OPCODE_GEN4_SCRATCH_READ:
    case VS_OPCODE_PULL_CONSTANT_LOAD:
    case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
+   case VS_OPCODE_SET_SIMD4X2_HEADER_GEN9:
       return false;
    default:
       /* The MATH instruction on Gen6 only executes in align1 mode, which does
index e4addf7d2fda366cff5b26cdd65839c78e86ade5..b22a5556bfc6e5123a1c963b8a729577e53db925 100644 (file)
@@ -1039,38 +1039,18 @@ vec4_generator::generate_pull_constant_load_gen7(vec4_instruction *inst,
 {
    assert(surf_index.type == BRW_REGISTER_TYPE_UD);
 
-   struct brw_reg src = offset;
-   bool header_present = false;
-   int mlen = 1;
-
-   if (brw->gen >= 9) {
-      /* Skylake requires a message header in order to use SIMD4x2 mode. */
-      src = retype(brw_vec4_grf(offset.nr - 1, 0), BRW_REGISTER_TYPE_UD);
-      mlen = 2;
-      header_present = true;
-
-      brw_push_insn_state(p);
-      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
-      brw_MOV(p, vec8(src), retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
-      brw_set_default_access_mode(p, BRW_ALIGN_1);
-
-      brw_MOV(p, get_element_ud(src, 2),
-              brw_imm_ud(GEN9_SAMPLER_SIMD_MODE_EXTENSION_SIMD4X2));
-      brw_pop_insn_state(p);
-   }
-
    if (surf_index.file == BRW_IMMEDIATE_VALUE) {
 
       brw_inst *insn = brw_next_insn(p, BRW_OPCODE_SEND);
       brw_set_dest(p, insn, dst);
-      brw_set_src0(p, insn, src);
+      brw_set_src0(p, insn, offset);
       brw_set_sampler_message(p, insn,
                               surf_index.dw1.ud,
                               0, /* LD message ignores sampler unit */
                               GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
                               1, /* rlen */
-                              mlen,
-                              header_present,
+                              inst->mlen,
+                              inst->header_present,
                               BRW_SAMPLER_SIMD_MODE_SIMD4X2,
                               0);
 
@@ -1095,14 +1075,14 @@ vec4_generator::generate_pull_constant_load_gen7(vec4_instruction *inst,
 
       /* dst = send(offset, a0.0 | <descriptor>) */
       brw_inst *insn = brw_send_indirect_message(
-         p, BRW_SFID_SAMPLER, dst, src, addr);
+         p, BRW_SFID_SAMPLER, dst, offset, addr);
       brw_set_sampler_message(p, insn,
                               0 /* surface */,
                               0 /* sampler */,
                               GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
                               1 /* rlen */,
-                              mlen /* mlen */,
-                              header_present /* header */,
+                              inst->mlen,
+                              inst->header_present,
                               BRW_SAMPLER_SIMD_MODE_SIMD4X2,
                               0);
 
@@ -1112,6 +1092,22 @@ vec4_generator::generate_pull_constant_load_gen7(vec4_instruction *inst,
    }
 }
 
+void
+vec4_generator::generate_set_simd4x2_header_gen9(vec4_instruction *inst,
+                                                 struct brw_reg dst)
+{
+   brw_push_insn_state(p);
+   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+
+   brw_MOV(p, vec8(dst), retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
+
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+   brw_MOV(p, get_element_ud(dst, 2),
+           brw_imm_ud(GEN9_SAMPLER_SIMD_MODE_EXTENSION_SIMD4X2));
+
+   brw_pop_insn_state(p);
+}
+
 void
 vec4_generator::generate_untyped_atomic(vec4_instruction *inst,
                                         struct brw_reg dst,
@@ -1435,6 +1431,10 @@ vec4_generator::generate_code(const cfg_t *cfg)
          generate_pull_constant_load_gen7(inst, dst, src[0], src[1]);
          break;
 
+      case VS_OPCODE_SET_SIMD4X2_HEADER_GEN9:
+         generate_set_simd4x2_header_gen9(inst, dst);
+         break;
+
       case GS_OPCODE_URB_WRITE:
          generate_gs_urb_write(inst);
          break;
index f7d542b3266c25af8508db4f5141789b8f4fdac9..3d16caa74f0410c1165fe69a8b8ad5440d904d66 100644 (file)
@@ -1313,16 +1313,36 @@ vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
 
    vec4_instruction *pull;
 
-   if (brw->gen >= 7) {
-      dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
+   if (brw->gen >= 9) {
+      /* Gen9+ needs a message header in order to use SIMD4x2 mode */
+      src_reg header(this, glsl_type::uvec4_type, 2);
 
-      /* We have to use a message header on Skylake to get SIMD4x2 mode.
-       * Reserve space for the register.
-       */
-      if (brw->gen >= 9) {
-         grf_offset.reg_offset++;
-         alloc.sizes[grf_offset.reg] = 2;
-      }
+      pull = new(mem_ctx)
+         vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
+                          dst_reg(header));
+
+      if (before_inst)
+         emit_before(before_block, before_inst, pull);
+      else
+         emit(pull);
+
+      dst_reg index_reg = retype(offset(dst_reg(header), 1),
+                                 offset_reg.type);
+      pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
+
+      if (before_inst)
+         emit_before(before_block, before_inst, pull);
+      else
+         emit(pull);
+
+      pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
+                                           dst,
+                                           surf_index,
+                                           header);
+      pull->mlen = 2;
+      pull->header_present = true;
+   } else if (brw->gen >= 7) {
+      dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
 
       grf_offset.type = offset_reg.type;