From 712a980adde0b14eee8b4accd02af9b9740091a2 Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Mon, 9 May 2016 17:48:24 -0700 Subject: [PATCH] i965/fs: Rework the persample shading key/prog_data bits This commit reworks and simplifies the way we handle persample shading in the shader key and prog_data. The previous approach had three different key bits that had slightly different and hard-to-discern meanings, while the new bits are far clearer. This commit changes it to two easily understood bits that communicate everything we need: 1) key->persample_interp: means that the user has requested persample interpolation through the API. This is equivalent to having SAMPLE_SHADING enabled and having MIN_SAMPLE_SHADING_VALUE set high enough that you actually get multiple per-sample invocations. 2) key->multisample_fbo: means that the shader will be running on an actual multi-sampled framebuffer. This commit also adds a new "persample_dispatch" bit to prog_data which indicates that the shader should be run in persample mode. This way the state setup code doesn't have to look at the fragment program or GL state and can just pull that data out of the prog_data. In theory, this shuffle could mean more recompiles. However, in practice, we were shoving enough state into the key before that we were probably hitting a recompile on every per-sample shader anyway. 
Reviewed-by: Kenneth Graunke --- src/intel/vulkan/anv_pipeline.c | 5 +-- src/mesa/drivers/dri/i965/brw_compiler.h | 4 +-- src/mesa/drivers/dri/i965/brw_fs.cpp | 40 ++++++++++++++++------- src/mesa/drivers/dri/i965/brw_wm.c | 19 +++++------ src/mesa/drivers/dri/i965/gen6_wm_state.c | 6 ++-- src/mesa/drivers/dri/i965/gen7_wm_state.c | 20 +++++------- src/mesa/drivers/dri/i965/gen8_ps_state.c | 22 +++++-------- 7 files changed, 60 insertions(+), 56 deletions(-) diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c index 5800e683d39..f55069ee747 100644 --- a/src/intel/vulkan/anv_pipeline.c +++ b/src/intel/vulkan/anv_pipeline.c @@ -286,8 +286,9 @@ populate_wm_prog_key(const struct brw_device_info *devinfo, /* We should probably pull this out of the shader, but it's fairly * harmless to compute it and then let dead-code take care of it. */ - key->persample_shading = info->pMultisampleState->sampleShadingEnable; - key->compute_pos_offset = info->pMultisampleState->sampleShadingEnable; + key->persample_interp = + (info->pMultisampleState->minSampleShading * + info->pMultisampleState->rasterizationSamples) > 1; key->multisample_fbo = true; } } diff --git a/src/mesa/drivers/dri/i965/brw_compiler.h b/src/mesa/drivers/dri/i965/brw_compiler.h index 3d1dc88eebc..3fcd7e87c4e 100644 --- a/src/mesa/drivers/dri/i965/brw_compiler.h +++ b/src/mesa/drivers/dri/i965/brw_compiler.h @@ -242,12 +242,11 @@ struct brw_wm_prog_key { uint8_t iz_lookup; bool stats_wm:1; bool flat_shade:1; - bool persample_shading:1; unsigned nr_color_regions:5; bool replicate_alpha:1; bool render_to_fbo:1; bool clamp_fragment_color:1; - bool compute_pos_offset:1; + bool persample_interp:1; bool multisample_fbo:1; unsigned line_aa:2; bool high_quality_derivatives:1; @@ -386,6 +385,7 @@ struct brw_wm_prog_data { bool early_fragment_tests; bool no_8; bool dual_src_blend; + bool persample_dispatch; bool uses_pos_offset; bool uses_omask; bool uses_kill; diff --git 
a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index 89b30c7bfb8..5e08d7120b5 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -1195,8 +1195,8 @@ fs_visitor::emit_general_interpolation(fs_reg *attr, const char *name, inst->no_dd_clear = true; inst = emit_linterp(*attr, fs_reg(interp), interpolation_mode, - mod_centroid && !key->persample_shading, - mod_sample || key->persample_shading); + mod_centroid && !key->persample_interp, + mod_sample || key->persample_interp); inst->predicate = BRW_PREDICATE_NORMAL; inst->predicate_inverse = false; if (devinfo->has_pln) @@ -1204,8 +1204,8 @@ fs_visitor::emit_general_interpolation(fs_reg *attr, const char *name, } else { emit_linterp(*attr, fs_reg(interp), interpolation_mode, - mod_centroid && !key->persample_shading, - mod_sample || key->persample_shading); + mod_centroid && !key->persample_interp, + mod_sample || key->persample_interp); } if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) { bld.MUL(*attr, *attr, this->pixel_w); @@ -1262,10 +1262,10 @@ void fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos) { assert(stage == MESA_SHADER_FRAGMENT); - brw_wm_prog_key *key = (brw_wm_prog_key*) this->key; + brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data; assert(dst.type == BRW_REGISTER_TYPE_F); - if (key->compute_pos_offset) { + if (wm_prog_data->persample_dispatch) { /* Convert int_sample_pos to floating point */ bld.MOV(dst, int_sample_pos); /* Scale to the range [0, 1] */ @@ -1430,7 +1430,7 @@ fs_reg * fs_visitor::emit_samplemaskin_setup() { assert(stage == MESA_SHADER_FRAGMENT); - brw_wm_prog_key *key = (brw_wm_prog_key *) this->key; + brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data; assert(devinfo->gen >= 6); fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type)); @@ -1438,7 +1438,7 @@ fs_visitor::emit_samplemaskin_setup() fs_reg 
coverage_mask(retype(brw_vec8_grf(payload.sample_mask_in_reg, 0), BRW_REGISTER_TYPE_D)); - if (key->persample_shading) { + if (wm_prog_data->persample_dispatch) { /* gl_SampleMaskIn[] comes from two sources: the input coverage mask, * and a mask representing which sample is being processed by the * current shader invocation. @@ -5098,7 +5098,6 @@ fs_visitor::setup_fs_payload_gen6() { assert(stage == MESA_SHADER_FRAGMENT); brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data; - brw_wm_prog_key *key = (brw_wm_prog_key*) this->key; unsigned barycentric_interp_modes = (stage == MESA_SHADER_FRAGMENT) ? @@ -5151,9 +5150,19 @@ fs_visitor::setup_fs_payload_gen6() } } - prog_data->uses_pos_offset = key->compute_pos_offset; /* R31: MSAA position offsets. */ - if (prog_data->uses_pos_offset) { + if (prog_data->persample_dispatch && + (nir->info.system_values_read & SYSTEM_BIT_SAMPLE_POS)) { + /* From the Ivy Bridge PRM documentation for 3DSTATE_PS: + * + * "MSDISPMODE_PERSAMPLE is required in order to select + * POSOFFSET_SAMPLE" + * + * So we can only really get sample positions if we are doing real + * per-sample dispatch. If we need gl_SamplePosition and we don't have + * persample dispatch, we hard-code it to 0.5. 
+ */ + prog_data->uses_pos_offset = true; payload.sample_pos_reg = payload.num_regs; payload.num_regs++; } @@ -5993,12 +6002,19 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data, prog_data->computed_stencil = shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL); + prog_data->persample_dispatch = + key->multisample_fbo && + (key->persample_interp || + (shader->info.system_values_read & (SYSTEM_BIT_SAMPLE_ID | + SYSTEM_BIT_SAMPLE_POS)) || + shader->info.fs.uses_sample_qualifier); + prog_data->early_fragment_tests = shader->info.fs.early_fragment_tests; prog_data->barycentric_interp_modes = brw_compute_barycentric_interp_modes(compiler->devinfo, key->flat_shade, - key->persample_shading, + key->persample_interp, shader); fs_visitor v(compiler, log_data, mem_ctx, key, diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c index ced97089fac..395b0b8b1e8 100644 --- a/src/mesa/drivers/dri/i965/brw_wm.c +++ b/src/mesa/drivers/dri/i965/brw_wm.c @@ -250,8 +250,8 @@ brw_wm_debug_recompile(struct brw_context *brw, old_key->stats_wm, key->stats_wm); found |= key_debug(brw, "flat shading", old_key->flat_shade, key->flat_shade); - found |= key_debug(brw, "per-sample shading", - old_key->persample_shading, key->persample_shading); + found |= key_debug(brw, "per-sample interpolation", + old_key->persample_interp, key->persample_interp); found |= key_debug(brw, "number of color buffers", old_key->nr_color_regions, key->nr_color_regions); found |= key_debug(brw, "MRT alpha test or alpha-to-coverage", @@ -528,15 +528,14 @@ brw_wm_populate_key(struct brw_context *brw, struct brw_wm_prog_key *key) /* _NEW_BUFFERS _NEW_MULTISAMPLE */ /* Ignore sample qualifier while computing this flag. 
*/ - key->persample_shading = - _mesa_get_min_invocations_per_fragment(ctx, &fp->program, true) > 1; + if (ctx->Multisample.Enabled) { + key->persample_interp = + ctx->Multisample.SampleShading && + (ctx->Multisample.MinSampleShadingValue * + _mesa_geometric_samples(ctx->DrawBuffer) > 1); - key->compute_pos_offset = - _mesa_get_min_invocations_per_fragment(ctx, &fp->program, false) > 1 && - fp->program.Base.SystemValuesRead & SYSTEM_BIT_SAMPLE_POS; - - key->multisample_fbo = ctx->Multisample.Enabled && - _mesa_geometric_samples(ctx->DrawBuffer) > 1; + key->multisample_fbo = _mesa_geometric_samples(ctx->DrawBuffer) > 1; + } /* BRW_NEW_VUE_MAP_GEOM_OUT */ if (brw->gen < 6 || _mesa_bitcount_64(fp->program.Base.InputsRead & diff --git a/src/mesa/drivers/dri/i965/gen6_wm_state.c b/src/mesa/drivers/dri/i965/gen6_wm_state.c index 335920cb20f..dd33926dc82 100644 --- a/src/mesa/drivers/dri/i965/gen6_wm_state.c +++ b/src/mesa/drivers/dri/i965/gen6_wm_state.c @@ -130,12 +130,10 @@ gen6_upload_wm_state(struct brw_context *brw, dw5 |= (brw->max_wm_threads - 1) << GEN6_WM_MAX_THREADS_SHIFT; - assert(min_inv_per_frag >= 1); - if (prog_data->prog_offset_16 || prog_data->no_8) { dw5 |= GEN6_WM_16_DISPATCH_ENABLE; - if (!prog_data->no_8 && min_inv_per_frag == 1) { + if (!prog_data->no_8 && !prog_data->persample_dispatch) { dw5 |= GEN6_WM_8_DISPATCH_ENABLE; dw4 |= (prog_data->base.dispatch_grf_start_reg << GEN6_WM_DISPATCH_START_GRF_SHIFT_0); @@ -198,7 +196,7 @@ gen6_upload_wm_state(struct brw_context *brw, else dw6 |= GEN6_WM_MSRAST_OFF_PIXEL; - if (min_inv_per_frag > 1) + if (prog_data->persample_dispatch) dw6 |= GEN6_WM_MSDISPMODE_PERSAMPLE; else { dw6 |= GEN6_WM_MSDISPMODE_PERPIXEL; diff --git a/src/mesa/drivers/dri/i965/gen7_wm_state.c b/src/mesa/drivers/dri/i965/gen7_wm_state.c index 2c3930f4046..945fbbdaa2b 100644 --- a/src/mesa/drivers/dri/i965/gen7_wm_state.c +++ b/src/mesa/drivers/dri/i965/gen7_wm_state.c @@ -91,7 +91,7 @@ upload_wm_state(struct brw_context *brw) else dw1 
|= GEN7_WM_MSRAST_OFF_PIXEL; - if (_mesa_get_min_invocations_per_fragment(ctx, brw->fragment_program, false) > 1) + if (prog_data->persample_dispatch) dw2 |= GEN7_WM_MSDISPMODE_PERSAMPLE; else dw2 |= GEN7_WM_MSDISPMODE_PERPIXEL; @@ -152,7 +152,6 @@ gen7_upload_ps_state(struct brw_context *brw, bool enable_dual_src_blend, unsigned sample_mask, unsigned fast_clear_op) { - struct gl_context *ctx = &brw->ctx; uint32_t dw2, dw4, dw5, ksp0, ksp2; const int max_threads_shift = brw->is_haswell ? HSW_PS_MAX_THREADS_SHIFT : IVB_PS_MAX_THREADS_SHIFT; @@ -216,18 +215,15 @@ gen7_upload_ps_state(struct brw_context *brw, if (prog_data->num_varying_inputs != 0) dw4 |= GEN7_PS_ATTRIBUTE_ENABLE; - /* In case of non 1x per sample shading, only one of SIMD8 and SIMD16 - * should be enabled. We do 'SIMD16 only' dispatch if a SIMD16 shader - * is successfully compiled. In majority of the cases that bring us - * better performance than 'SIMD8 only' dispatch. - */ - int min_inv_per_frag = - _mesa_get_min_invocations_per_fragment(ctx, fp, false); - assert(min_inv_per_frag >= 1); - if (prog_data->prog_offset_16 || prog_data->no_8) { dw4 |= GEN7_PS_16_DISPATCH_ENABLE; - if (!prog_data->no_8 && min_inv_per_frag == 1) { + + /* In case of non 1x per sample shading, only one of SIMD8 and SIMD16 + * should be enabled. We do 'SIMD16 only' dispatch if a SIMD16 shader + * is successfully compiled. In majority of the cases that bring us + * better performance than 'SIMD8 only' dispatch. 
+ */ + if (!prog_data->no_8 && !prog_data->persample_dispatch) { dw4 |= GEN7_PS_8_DISPATCH_ENABLE; dw5 |= (prog_data->base.dispatch_grf_start_reg << GEN7_PS_DISPATCH_START_GRF_SHIFT_0); diff --git a/src/mesa/drivers/dri/i965/gen8_ps_state.c b/src/mesa/drivers/dri/i965/gen8_ps_state.c index 9269a796604..d3e1ca38c75 100644 --- a/src/mesa/drivers/dri/i965/gen8_ps_state.c +++ b/src/mesa/drivers/dri/i965/gen8_ps_state.c @@ -52,8 +52,7 @@ gen8_upload_ps_extra(struct brw_context *brw, if (prog_data->uses_src_w) dw1 |= GEN8_PSX_USES_SOURCE_W; - if (multisampled_fbo && - _mesa_get_min_invocations_per_fragment(ctx, fp, false) > 1) + if (prog_data->persample_dispatch) dw1 |= GEN8_PSX_SHADER_IS_PER_SAMPLE; if (prog_data->uses_sample_mask) { @@ -192,7 +191,6 @@ gen8_upload_ps_state(struct brw_context *brw, const struct brw_wm_prog_data *prog_data, uint32_t fast_clear_op) { - struct gl_context *ctx = &brw->ctx; uint32_t dw3 = 0, dw6 = 0, dw7 = 0, ksp0, ksp2 = 0; /* Initialize the execution mask with VMask. Otherwise, derivatives are @@ -246,19 +244,15 @@ gen8_upload_ps_state(struct brw_context *brw, dw6 |= fast_clear_op; - /* _NEW_MULTISAMPLE - * In case of non 1x per sample shading, only one of SIMD8 and SIMD16 - * should be enabled. We do 'SIMD16 only' dispatch if a SIMD16 shader - * is successfully compiled. In majority of the cases that bring us - * better performance than 'SIMD8 only' dispatch. - */ - int min_invocations_per_fragment = - _mesa_get_min_invocations_per_fragment(ctx, fp, false); - assert(min_invocations_per_fragment >= 1); - if (prog_data->prog_offset_16 || prog_data->no_8) { dw6 |= GEN7_PS_16_DISPATCH_ENABLE; - if (!prog_data->no_8 && min_invocations_per_fragment == 1) { + + /* In case of non 1x per sample shading, only one of SIMD8 and SIMD16 + * should be enabled. We do 'SIMD16 only' dispatch if a SIMD16 shader + * is successfully compiled. In majority of the cases that bring us + * better performance than 'SIMD8 only' dispatch. 
+ */ + if (!prog_data->no_8 && !prog_data->persample_dispatch) { dw6 |= GEN7_PS_8_DISPATCH_ENABLE; dw7 |= (prog_data->base.dispatch_grf_start_reg << GEN7_PS_DISPATCH_START_GRF_SHIFT_0); -- 2.30.2