i965/fs: Rework the persample shading key/prog_data bits
author Jason Ekstrand <jason.ekstrand@intel.com>
Tue, 10 May 2016 00:48:24 +0000 (17:48 -0700)
committer Jason Ekstrand <jason.ekstrand@intel.com>
Sat, 14 May 2016 20:34:05 +0000 (13:34 -0700)
This commit reworks and simplifies the way we handle persample shading in
the shader key and prog_data.  The previous approach had three different
key bits that had slightly different and hard-to-discern meanings, while the
new bits are far clearer.  This commit replaces them with two easily
understood bits that communicate everything we need:

 1) key->persample_interp: means that the user has requested persample
    interpolation through the API.  This is equivalent to having
    SAMPLE_SHADING enabled and having MIN_SAMPLE_SHADING_VALUE set high
    enough that you actually get multiple per-sample invocations.

 2) key->multisample_fbo: means that the shader will be running on an
    actual multi-sampled framebuffer.

This commit also adds a new "persample_dispatch" bit to prog_data which
indicates that the shader should be run in persample mode.  This way the
state setup code doesn't have to look at the fragment program or GL state
and can just pull that data out of the prog_data.

In theory, this shuffle could mean more recompiles.  However, in practice,
we were shoving enough state into the key before that we were probably
hitting a recompile on every per-sample shader anyway.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
src/intel/vulkan/anv_pipeline.c
src/mesa/drivers/dri/i965/brw_compiler.h
src/mesa/drivers/dri/i965/brw_fs.cpp
src/mesa/drivers/dri/i965/brw_wm.c
src/mesa/drivers/dri/i965/gen6_wm_state.c
src/mesa/drivers/dri/i965/gen7_wm_state.c
src/mesa/drivers/dri/i965/gen8_ps_state.c

index 5800e683d39a80dd5ce884526cb6aa54aefe34af..f55069ee74718aee9224b9422de083f4a7a74c4c 100644 (file)
@@ -286,8 +286,9 @@ populate_wm_prog_key(const struct brw_device_info *devinfo,
       /* We should probably pull this out of the shader, but it's fairly
        * harmless to compute it and then let dead-code take care of it.
        */
-      key->persample_shading = info->pMultisampleState->sampleShadingEnable;
-      key->compute_pos_offset = info->pMultisampleState->sampleShadingEnable;
+      key->persample_interp =
+         (info->pMultisampleState->minSampleShading *
+          info->pMultisampleState->rasterizationSamples) > 1;
       key->multisample_fbo = true;
    }
 }
index 3d1dc88eebc789792deaa7e605c7952dff78092d..3fcd7e87c4e377e799681637ada25c1593acbcfb 100644 (file)
@@ -242,12 +242,11 @@ struct brw_wm_prog_key {
    uint8_t iz_lookup;
    bool stats_wm:1;
    bool flat_shade:1;
-   bool persample_shading:1;
    unsigned nr_color_regions:5;
    bool replicate_alpha:1;
    bool render_to_fbo:1;
    bool clamp_fragment_color:1;
-   bool compute_pos_offset:1;
+   bool persample_interp:1;
    bool multisample_fbo:1;
    unsigned line_aa:2;
    bool high_quality_derivatives:1;
@@ -386,6 +385,7 @@ struct brw_wm_prog_data {
    bool early_fragment_tests;
    bool no_8;
    bool dual_src_blend;
+   bool persample_dispatch;
    bool uses_pos_offset;
    bool uses_omask;
    bool uses_kill;
index 89b30c7bfb8f382b7ed9114b8f96af9a31020043..5e08d7120b5e9d3433acb8c4467530f6df658b8f 100644 (file)
@@ -1195,8 +1195,8 @@ fs_visitor::emit_general_interpolation(fs_reg *attr, const char *name,
                   inst->no_dd_clear = true;
 
                inst = emit_linterp(*attr, fs_reg(interp), interpolation_mode,
-                                   mod_centroid && !key->persample_shading,
-                                   mod_sample || key->persample_shading);
+                                   mod_centroid && !key->persample_interp,
+                                   mod_sample || key->persample_interp);
                inst->predicate = BRW_PREDICATE_NORMAL;
                inst->predicate_inverse = false;
                if (devinfo->has_pln)
@@ -1204,8 +1204,8 @@ fs_visitor::emit_general_interpolation(fs_reg *attr, const char *name,
 
             } else {
                emit_linterp(*attr, fs_reg(interp), interpolation_mode,
-                            mod_centroid && !key->persample_shading,
-                            mod_sample || key->persample_shading);
+                            mod_centroid && !key->persample_interp,
+                            mod_sample || key->persample_interp);
             }
             if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
                bld.MUL(*attr, *attr, this->pixel_w);
@@ -1262,10 +1262,10 @@ void
 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
 {
    assert(stage == MESA_SHADER_FRAGMENT);
-   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
+   brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
    assert(dst.type == BRW_REGISTER_TYPE_F);
 
-   if (key->compute_pos_offset) {
+   if (wm_prog_data->persample_dispatch) {
       /* Convert int_sample_pos to floating point */
       bld.MOV(dst, int_sample_pos);
       /* Scale to the range [0, 1] */
@@ -1430,7 +1430,7 @@ fs_reg *
 fs_visitor::emit_samplemaskin_setup()
 {
    assert(stage == MESA_SHADER_FRAGMENT);
-   brw_wm_prog_key *key = (brw_wm_prog_key *) this->key;
+   brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
    assert(devinfo->gen >= 6);
 
    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
@@ -1438,7 +1438,7 @@ fs_visitor::emit_samplemaskin_setup()
    fs_reg coverage_mask(retype(brw_vec8_grf(payload.sample_mask_in_reg, 0),
                                BRW_REGISTER_TYPE_D));
 
-   if (key->persample_shading) {
+   if (wm_prog_data->persample_dispatch) {
       /* gl_SampleMaskIn[] comes from two sources: the input coverage mask,
        * and a mask representing which sample is being processed by the
        * current shader invocation.
@@ -5098,7 +5098,6 @@ fs_visitor::setup_fs_payload_gen6()
 {
    assert(stage == MESA_SHADER_FRAGMENT);
    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
-   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
 
    unsigned barycentric_interp_modes =
       (stage == MESA_SHADER_FRAGMENT) ?
@@ -5151,9 +5150,19 @@ fs_visitor::setup_fs_payload_gen6()
       }
    }
 
-   prog_data->uses_pos_offset = key->compute_pos_offset;
    /* R31: MSAA position offsets. */
-   if (prog_data->uses_pos_offset) {
+   if (prog_data->persample_dispatch &&
+       (nir->info.system_values_read & SYSTEM_BIT_SAMPLE_POS)) {
+      /* From the Ivy Bridge PRM documentation for 3DSTATE_PS:
+       *
+       *    "MSDISPMODE_PERSAMPLE is required in order to select
+       *    POSOFFSET_SAMPLE"
+       *
+       * So we can only really get sample positions if we are doing real
+       * per-sample dispatch.  If we need gl_SamplePosition and we don't have
+       * persample dispatch, we hard-code it to 0.5.
+       */
+      prog_data->uses_pos_offset = true;
       payload.sample_pos_reg = payload.num_regs;
       payload.num_regs++;
    }
@@ -5993,12 +6002,19 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
    prog_data->computed_stencil =
       shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL);
 
+   prog_data->persample_dispatch =
+      key->multisample_fbo &&
+      (key->persample_interp ||
+       (shader->info.system_values_read & (SYSTEM_BIT_SAMPLE_ID |
+                                           SYSTEM_BIT_SAMPLE_POS)) ||
+       shader->info.fs.uses_sample_qualifier);
+
    prog_data->early_fragment_tests = shader->info.fs.early_fragment_tests;
 
    prog_data->barycentric_interp_modes =
       brw_compute_barycentric_interp_modes(compiler->devinfo,
                                            key->flat_shade,
-                                           key->persample_shading,
+                                           key->persample_interp,
                                            shader);
 
    fs_visitor v(compiler, log_data, mem_ctx, key,
index ced97089fac2f96ebe7a277d4f506c792d1fe307..395b0b8b1e84f5a6a3fda8bde99f6d2b810e78db 100644 (file)
@@ -250,8 +250,8 @@ brw_wm_debug_recompile(struct brw_context *brw,
                       old_key->stats_wm, key->stats_wm);
    found |= key_debug(brw, "flat shading",
                       old_key->flat_shade, key->flat_shade);
-   found |= key_debug(brw, "per-sample shading",
-                      old_key->persample_shading, key->persample_shading);
+   found |= key_debug(brw, "per-sample interpolation",
+                      old_key->persample_interp, key->persample_interp);
    found |= key_debug(brw, "number of color buffers",
                       old_key->nr_color_regions, key->nr_color_regions);
    found |= key_debug(brw, "MRT alpha test or alpha-to-coverage",
@@ -528,15 +528,14 @@ brw_wm_populate_key(struct brw_context *brw, struct brw_wm_prog_key *key)
 
    /* _NEW_BUFFERS _NEW_MULTISAMPLE */
    /* Ignore sample qualifier while computing this flag. */
-   key->persample_shading =
-      _mesa_get_min_invocations_per_fragment(ctx, &fp->program, true) > 1;
+   if (ctx->Multisample.Enabled) {
+      key->persample_interp =
+         ctx->Multisample.SampleShading &&
+         (ctx->Multisample.MinSampleShadingValue *
+          _mesa_geometric_samples(ctx->DrawBuffer) > 1);
 
-   key->compute_pos_offset =
-      _mesa_get_min_invocations_per_fragment(ctx, &fp->program, false) > 1 &&
-      fp->program.Base.SystemValuesRead & SYSTEM_BIT_SAMPLE_POS;
-
-   key->multisample_fbo = ctx->Multisample.Enabled &&
-                          _mesa_geometric_samples(ctx->DrawBuffer) > 1;
+      key->multisample_fbo = _mesa_geometric_samples(ctx->DrawBuffer) > 1;
+   }
 
    /* BRW_NEW_VUE_MAP_GEOM_OUT */
    if (brw->gen < 6 || _mesa_bitcount_64(fp->program.Base.InputsRead &
index 335920cb20fe8e06b5a55e0ca56e411f72d8cda4..dd33926dc82de08bc12e36c473fe5cc24221815c 100644 (file)
@@ -130,12 +130,10 @@ gen6_upload_wm_state(struct brw_context *brw,
 
    dw5 |= (brw->max_wm_threads - 1) << GEN6_WM_MAX_THREADS_SHIFT;
 
-   assert(min_inv_per_frag >= 1);
-
    if (prog_data->prog_offset_16 || prog_data->no_8) {
       dw5 |= GEN6_WM_16_DISPATCH_ENABLE;
 
-      if (!prog_data->no_8 && min_inv_per_frag == 1) {
+      if (!prog_data->no_8 && !prog_data->persample_dispatch) {
          dw5 |= GEN6_WM_8_DISPATCH_ENABLE;
          dw4 |= (prog_data->base.dispatch_grf_start_reg <<
                  GEN6_WM_DISPATCH_START_GRF_SHIFT_0);
@@ -198,7 +196,7 @@ gen6_upload_wm_state(struct brw_context *brw,
       else
          dw6 |= GEN6_WM_MSRAST_OFF_PIXEL;
 
-      if (min_inv_per_frag > 1)
+      if (prog_data->persample_dispatch)
          dw6 |= GEN6_WM_MSDISPMODE_PERSAMPLE;
       else {
          dw6 |= GEN6_WM_MSDISPMODE_PERPIXEL;
index 2c3930f404672fcbe5def03468a448798fcefdff..945fbbdaa2bb463df24e6ec6672c9c0d48a37037 100644 (file)
@@ -91,7 +91,7 @@ upload_wm_state(struct brw_context *brw)
       else
          dw1 |= GEN7_WM_MSRAST_OFF_PIXEL;
 
-      if (_mesa_get_min_invocations_per_fragment(ctx, brw->fragment_program, false) > 1)
+      if (prog_data->persample_dispatch)
          dw2 |= GEN7_WM_MSDISPMODE_PERSAMPLE;
       else
          dw2 |= GEN7_WM_MSDISPMODE_PERPIXEL;
@@ -152,7 +152,6 @@ gen7_upload_ps_state(struct brw_context *brw,
                      bool enable_dual_src_blend, unsigned sample_mask,
                      unsigned fast_clear_op)
 {
-   struct gl_context *ctx = &brw->ctx;
    uint32_t dw2, dw4, dw5, ksp0, ksp2;
    const int max_threads_shift = brw->is_haswell ?
       HSW_PS_MAX_THREADS_SHIFT : IVB_PS_MAX_THREADS_SHIFT;
@@ -216,18 +215,15 @@ gen7_upload_ps_state(struct brw_context *brw,
    if (prog_data->num_varying_inputs != 0)
       dw4 |= GEN7_PS_ATTRIBUTE_ENABLE;
 
-   /* In case of non 1x per sample shading, only one of SIMD8 and SIMD16
-    * should be enabled. We do 'SIMD16 only' dispatch if a SIMD16 shader
-    * is successfully compiled. In majority of the cases that bring us
-    * better performance than 'SIMD8 only' dispatch.
-    */
-   int min_inv_per_frag =
-      _mesa_get_min_invocations_per_fragment(ctx, fp, false);
-   assert(min_inv_per_frag >= 1);
-
    if (prog_data->prog_offset_16 || prog_data->no_8) {
       dw4 |= GEN7_PS_16_DISPATCH_ENABLE;
-      if (!prog_data->no_8 && min_inv_per_frag == 1) {
+
+      /* In case of non 1x per sample shading, only one of SIMD8 and SIMD16
+       * should be enabled. We do 'SIMD16 only' dispatch if a SIMD16 shader
+       * is successfully compiled. In majority of the cases that bring us
+       * better performance than 'SIMD8 only' dispatch.
+       */
+      if (!prog_data->no_8 && !prog_data->persample_dispatch) {
          dw4 |= GEN7_PS_8_DISPATCH_ENABLE;
          dw5 |= (prog_data->base.dispatch_grf_start_reg <<
                  GEN7_PS_DISPATCH_START_GRF_SHIFT_0);
index 9269a796604462bd72b6e20dc15264dd9ac9eb77..d3e1ca38c751206ad302479b52283bf4bbf42b2b 100644 (file)
@@ -52,8 +52,7 @@ gen8_upload_ps_extra(struct brw_context *brw,
    if (prog_data->uses_src_w)
       dw1 |= GEN8_PSX_USES_SOURCE_W;
 
-   if (multisampled_fbo &&
-       _mesa_get_min_invocations_per_fragment(ctx, fp, false) > 1)
+   if (prog_data->persample_dispatch)
       dw1 |= GEN8_PSX_SHADER_IS_PER_SAMPLE;
 
    if (prog_data->uses_sample_mask) {
@@ -192,7 +191,6 @@ gen8_upload_ps_state(struct brw_context *brw,
                      const struct brw_wm_prog_data *prog_data,
                      uint32_t fast_clear_op)
 {
-   struct gl_context *ctx = &brw->ctx;
    uint32_t dw3 = 0, dw6 = 0, dw7 = 0, ksp0, ksp2 = 0;
 
    /* Initialize the execution mask with VMask.  Otherwise, derivatives are
@@ -246,19 +244,15 @@ gen8_upload_ps_state(struct brw_context *brw,
 
    dw6 |= fast_clear_op;
 
-   /* _NEW_MULTISAMPLE
-    * In case of non 1x per sample shading, only one of SIMD8 and SIMD16
-    * should be enabled. We do 'SIMD16 only' dispatch if a SIMD16 shader
-    * is successfully compiled. In majority of the cases that bring us
-    * better performance than 'SIMD8 only' dispatch.
-    */
-   int min_invocations_per_fragment =
-      _mesa_get_min_invocations_per_fragment(ctx, fp, false);
-   assert(min_invocations_per_fragment >= 1);
-
    if (prog_data->prog_offset_16 || prog_data->no_8) {
       dw6 |= GEN7_PS_16_DISPATCH_ENABLE;
-      if (!prog_data->no_8 && min_invocations_per_fragment == 1) {
+
+      /* In case of non 1x per sample shading, only one of SIMD8 and SIMD16
+       * should be enabled. We do 'SIMD16 only' dispatch if a SIMD16 shader
+       * is successfully compiled. In majority of the cases that bring us
+       * better performance than 'SIMD8 only' dispatch.
+       */
+      if (!prog_data->no_8 && !prog_data->persample_dispatch) {
          dw6 |= GEN7_PS_8_DISPATCH_ENABLE;
          dw7 |= (prog_data->base.dispatch_grf_start_reg <<
                  GEN7_PS_DISPATCH_START_GRF_SHIFT_0);