i965: Handle mix-and-match TCS/TES with separate shader objects.
author Kenneth Graunke <kenneth@whitecape.org>
Tue, 8 Dec 2015 04:18:42 +0000 (20:18 -0800)
committer Kenneth Graunke <kenneth@whitecape.org>
Wed, 23 Dec 2015 01:22:11 +0000 (17:22 -0800)
GL_ARB_separate_shader_objects allows the application to mix-and-match
TCS and TES programs separately.  This means that the interface between
the two stages isn't known until the final SSO pipeline is in place.

This isn't a great match for our hardware: the TCS and TES have to agree
on the Patch URB entry layout.  Since we store data as per-patch slots
followed by per-vertex slots, changing the number of per-patch slots can
significantly alter the layout.  This can easily happen with SSO.

To handle this, we store the [Patch]OutputsWritten and [Patch]InputsRead
bitfields in the TCS/TES program keys, introducing program recompiles.
brw_upload_programs() (via its brw_upload_tess_programs() helper)
decides the layout for both TCS and TES, and passes it to
brw_upload_tcs_prog() and brw_upload_tes_prog(), which store it in
their program keys.

When creating the NIR for a shader specialization, we override
nir->info.inputs_read (and friends) to the program key's values.
Since everything uses those, no further compiler changes are needed.
This also replaces the hack in brw_create_nir().

To avoid recompiles, brw_tes_precompile() looks to see if there's a
TCS in the linked shader.  If so, it accounts for the TCS outputs,
just as brw_upload_programs() would.  This eliminates all recompiles
in the non-SSO case.  In the SSO case, there should only be recompiles
when using a TCS and TES that have different input/output interfaces.

Fixes Piglit's mix-and-match-tcs-tes test.

v2: Pull the brw_upload_programs code into a brw_upload_tess_programs()
    helper function (requested by Jordan Justen).

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
src/mesa/drivers/dri/i965/brw_compiler.h
src/mesa/drivers/dri/i965/brw_nir.c
src/mesa/drivers/dri/i965/brw_program.h
src/mesa/drivers/dri/i965/brw_shader.cpp
src/mesa/drivers/dri/i965/brw_state_upload.c
src/mesa/drivers/dri/i965/brw_tcs.c
src/mesa/drivers/dri/i965/brw_tes.c
src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp

index 0ffaac7f5ceb4cf30d81645705a8855da1620d7a..e66deb109e47f77fb5f892c0e5b482f1cc75487e 100644 (file)
@@ -200,6 +200,9 @@ struct brw_tcs_prog_key
 
    unsigned input_vertices;
 
+   /** A bitfield of per-patch outputs written. */
+   uint32_t patch_outputs_written;
+
    /** A bitfield of per-vertex outputs written. */
    uint64_t outputs_written;
 
@@ -211,6 +214,12 @@ struct brw_tes_prog_key
 {
    unsigned program_string_id;
 
+   /** A bitfield of per-patch inputs read. */
+   uint32_t patch_inputs_read;
+
+   /** A bitfield of per-vertex inputs read. */
+   uint64_t inputs_read;
+
    struct brw_sampler_prog_key_data tex;
 };
 
index 8b06dbe20908af8ae7b7155dfd523776aa06620c..eebd2a386b61cae60e14450a0d7c1256c3525d4d 100644 (file)
@@ -618,17 +618,6 @@ brw_create_nir(struct brw_context *brw,
    /* First, lower the GLSL IR or Mesa IR to NIR */
    if (shader_prog) {
       nir = glsl_to_nir(shader_prog, stage, options);
-
-      if (nir->stage == MESA_SHADER_TESS_EVAL &&
-          shader_prog->_LinkedShaders[MESA_SHADER_TESS_CTRL]) {
-         const struct gl_program *tcs =
-            shader_prog->_LinkedShaders[MESA_SHADER_TESS_CTRL]->Program;
-         /* Work around the TCS having bonus outputs used as shared memory
-          * segments, which makes OutputsWritten not match InputsRead
-          */
-         nir->info.inputs_read = tcs->OutputsWritten;
-         nir->info.patch_inputs_read = tcs->PatchOutputsWritten;
-      }
    } else {
       nir = prog_to_nir(prog, options);
       OPT_V(nir_convert_to_ssa); /* turn registers into SSA */
index 3d9e1b983c80defec4c7afa47f7cf0283ea045d3..059ccf8bd398d04f38cd1782d21630355c0dab9b 100644 (file)
@@ -56,8 +56,10 @@ void
 brw_dump_ir(const char *stage, struct gl_shader_program *shader_prog,
             struct gl_shader *shader, struct gl_program *prog);
 
-void brw_upload_tcs_prog(struct brw_context *brw);
-void brw_upload_tes_prog(struct brw_context *brw);
+void brw_upload_tcs_prog(struct brw_context *brw,
+                         uint64_t per_vertex_slots, uint32_t per_patch_slots);
+void brw_upload_tes_prog(struct brw_context *brw,
+                         uint64_t per_vertex_slots, uint32_t per_patch_slots);
 
 #ifdef __cplusplus
 } /* extern "C" */
index 57f9eb23e4e700293fe51390d7e937ae88c91b55..5140cfb7bc6323e5b60843fc73dd3aa8401e93d6 100644 (file)
@@ -1329,6 +1329,8 @@ brw_compile_tes(const struct brw_compiler *compiler,
 
    nir_shader *nir = nir_shader_clone(mem_ctx, src_shader);
    nir = brw_nir_apply_sampler_key(nir, devinfo, &key->tex, is_scalar);
+   nir->info.inputs_read = key->inputs_read;
+   nir->info.patch_inputs_read = key->patch_inputs_read;
    nir = brw_nir_lower_io(nir, compiler->devinfo, is_scalar);
    nir = brw_postprocess_nir(nir, compiler->devinfo, is_scalar);
 
index eb0357b755942bbba8f4213346dec99c55200591..81a67d284e0dd32ee9411d30b9fe54b5b2bd065e 100644 (file)
@@ -673,21 +673,41 @@ brw_print_dirty_count(struct dirty_bit_map *bit_map)
    }
 }
 
+static inline void
+brw_upload_tess_programs(struct brw_context *brw)
+{
+   if (brw->tess_eval_program) {
+      uint64_t per_vertex_slots = brw->tess_eval_program->Base.InputsRead;
+      uint32_t per_patch_slots =
+         brw->tess_eval_program->Base.PatchInputsRead;
+
+      /* The TCS may have additional outputs which aren't read by the
+       * TES (possibly for cross-thread communication).  These need to
+       * be stored in the Patch URB Entry as well.
+       */
+      if (brw->tess_ctrl_program) {
+         per_vertex_slots |= brw->tess_ctrl_program->Base.OutputsWritten;
+         per_patch_slots |=
+            brw->tess_ctrl_program->Base.PatchOutputsWritten;
+      }
+
+      brw_upload_tcs_prog(brw, per_vertex_slots, per_patch_slots);
+      brw_upload_tes_prog(brw, per_vertex_slots, per_patch_slots);
+   } else {
+      brw->tcs.prog_data = NULL;
+      brw->tcs.base.prog_data = NULL;
+      brw->tes.prog_data = NULL;
+      brw->tes.base.prog_data = NULL;
+   }
+}
+
 static inline void
 brw_upload_programs(struct brw_context *brw,
                     enum brw_pipeline pipeline)
 {
    if (pipeline == BRW_RENDER_PIPELINE) {
       brw_upload_vs_prog(brw);
-      if (brw->tess_eval_program) {
-         brw_upload_tcs_prog(brw);
-         brw_upload_tes_prog(brw);
-      } else {
-         brw->tcs.prog_data = NULL;
-         brw->tcs.base.prog_data = NULL;
-         brw->tes.prog_data = NULL;
-         brw->tes.base.prog_data = NULL;
-      }
+      brw_upload_tess_programs(brw);
 
       if (brw->gen < 6)
          brw_upload_ff_gs_prog(brw);
index ecb6fd0c8ba64bae125b3d090d635cbed0c7542a..2c925e7f572336a0644ad4eb5cd2c0b7fa5f6390 100644 (file)
@@ -67,6 +67,10 @@ brw_tcs_debug_recompile(struct brw_context *brw,
 
    found |= key_debug(brw, "input vertices", old_key->input_vertices,
                       key->input_vertices);
+   found |= key_debug(brw, "outputs written", old_key->outputs_written,
+                      key->outputs_written);
+   found |= key_debug(brw, "patch outputs written", old_key->patch_outputs_written,
+                      key->patch_outputs_written);
    found |= key_debug(brw, "TES primitive mode", old_key->tes_primitive_mode,
                       key->tes_primitive_mode);
    found |= brw_debug_recompile_sampler_key(brw, &old_key->tex, &key->tex);
@@ -224,7 +228,9 @@ brw_codegen_tcs_prog(struct brw_context *brw,
 
 
 void
-brw_upload_tcs_prog(struct brw_context *brw)
+brw_upload_tcs_prog(struct brw_context *brw,
+                    uint64_t per_vertex_slots,
+                    uint32_t per_patch_slots)
 {
    struct gl_context *ctx = &brw->ctx;
    struct gl_shader_program **current = ctx->_Shader->CurrentProgram;
@@ -248,6 +254,8 @@ brw_upload_tcs_prog(struct brw_context *brw)
    memset(&key, 0, sizeof(key));
 
    key.input_vertices = ctx->TessCtrlProgram.patch_vertices;
+   key.outputs_written = per_vertex_slots;
+   key.patch_outputs_written = per_patch_slots;
 
    /* We need to specialize our code generation for tessellation levels
     * based on the domain the DS is expecting to tessellate.
@@ -301,6 +309,9 @@ brw_tcs_precompile(struct gl_context *ctx,
 
    key.tes_primitive_mode = GL_TRIANGLES;
 
+   key.outputs_written = prog->OutputsWritten;
+   key.patch_outputs_written = prog->PatchOutputsWritten;
+
    success = brw_codegen_tcs_prog(brw, shader_prog, btcp, &key);
 
    brw->tcs.base.prog_offset = old_prog_offset;
index 844c5b28b33cfe685256ffd07d1634558ce771ae..27dc7e59f5d1fb43ffda1c72570e3afaf1a3070e 100644 (file)
@@ -66,6 +66,10 @@ brw_tes_debug_recompile(struct brw_context *brw,
    }
 
    found |= brw_debug_recompile_sampler_key(brw, &old_key->tex, &key->tex);
+   found |= key_debug(brw, "inputs read", old_key->inputs_read,
+                      key->inputs_read);
+   found |= key_debug(brw, "patch inputs read", old_key->patch_inputs_read,
+                      key->patch_inputs_read);
 
    if (!found) {
       perf_debug("  Something else\n");
@@ -226,7 +230,9 @@ brw_codegen_tes_prog(struct brw_context *brw,
 
 
 void
-brw_upload_tes_prog(struct brw_context *brw)
+brw_upload_tes_prog(struct brw_context *brw,
+                    uint64_t per_vertex_slots,
+                    uint32_t per_patch_slots)
 {
    struct gl_context *ctx = &brw->ctx;
    struct gl_shader_program **current = ctx->_Shader->CurrentProgram;
@@ -247,6 +253,14 @@ brw_upload_tes_prog(struct brw_context *brw)
 
    key.program_string_id = tep->id;
 
+   /* Ignore gl_TessLevelInner/Outer - we treat them as system values,
+    * not inputs, and they're always present in the URB entry regardless
+    * of whether or not we read them.
+    */
+   key.inputs_read = per_vertex_slots &
+      ~(VARYING_BIT_TESS_LEVEL_INNER | VARYING_BIT_TESS_LEVEL_OUTER);
+   key.patch_inputs_read = per_patch_slots;
+
    /* _NEW_TEXTURE */
    brw_populate_sampler_prog_key_data(ctx, prog, stage_state->sampler_count,
                                       &key.tex);
@@ -280,6 +294,20 @@ brw_tes_precompile(struct gl_context *ctx,
    memset(&key, 0, sizeof(key));
 
    key.program_string_id = btep->id;
+   key.inputs_read = prog->InputsRead;
+   key.patch_inputs_read = prog->PatchInputsRead;
+
+   if (shader_prog->_LinkedShaders[MESA_SHADER_TESS_CTRL]) {
+      struct gl_program *tcp =
+         shader_prog->_LinkedShaders[MESA_SHADER_TESS_CTRL]->Program;
+      key.inputs_read |= tcp->OutputsWritten;
+      key.patch_inputs_read |= tcp->PatchOutputsWritten;
+   }
+
+   /* Ignore gl_TessLevelInner/Outer - they're system values. */
+   key.inputs_read &= ~(VARYING_BIT_TESS_LEVEL_INNER |
+                        VARYING_BIT_TESS_LEVEL_OUTER);
+
    brw_setup_tex_for_precompile(brw, &key.tex, prog);
 
    success = brw_codegen_tes_prog(brw, shader_prog, btep, &key);
index fba55b5f5f214fcfe6e6d544883093e2d510bc14..507db749e63e4b4f7ad120ca1a4353cc297492c2 100644 (file)
@@ -476,6 +476,8 @@ brw_compile_tcs(const struct brw_compiler *compiler,
 
    nir_shader *nir = nir_shader_clone(mem_ctx, src_shader);
    nir = brw_nir_apply_sampler_key(nir, devinfo, &key->tex, is_scalar);
+   nir->info.outputs_written = key->outputs_written;
+   nir->info.patch_outputs_written = key->patch_outputs_written;
    nir = brw_nir_lower_io(nir, compiler->devinfo, is_scalar);
    nir = brw_postprocess_nir(nir, compiler->devinfo, is_scalar);