From fcc70e4855c3bde3cadce9d0d1abf8da7106f643 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Sat, 6 Oct 2018 22:44:36 -0400 Subject: [PATCH] radeonsi: track context rolls better for the Vega scissor bug workaround We should get fewer context rolls with the SET_CONTEXT_REG optimization, but it would have been for nothing if the scissor state rolled the context anyway. Don't emit the scissor state if there is no context roll. --- src/gallium/drivers/radeonsi/si_pipe.h | 1 + src/gallium/drivers/radeonsi/si_state.c | 31 ++++++++++++++---- src/gallium/drivers/radeonsi/si_state.h | 17 ++-------- .../drivers/radeonsi/si_state_binning.c | 7 ++++ src/gallium/drivers/radeonsi/si_state_draw.c | 32 +++++++++++-------- .../drivers/radeonsi/si_state_shaders.c | 23 +++++++++++++ .../drivers/radeonsi/si_state_viewport.c | 3 ++ 7 files changed, 80 insertions(+), 34 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 7ae17435ab6..6edc06cece7 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -1023,6 +1023,7 @@ struct si_context { unsigned num_resident_handles; uint64_t num_alloc_tex_transfer_bytes; unsigned last_tex_ps_draw_ratio; /* for query */ + unsigned context_roll_counter; /* Queries. */ /* Maintain the list of active queries for pausing between IBs. */ diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index b63e70092db..8b2e6e57f45 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -88,9 +88,6 @@ static void si_emit_cb_render_state(struct si_context *sctx) (sctx->ps_shader.cso->info.colors_written & 0x3) != 0x3) cb_target_mask = 0; - radeon_opt_set_context_reg(sctx, R_028238_CB_TARGET_MASK, - SI_TRACKED_CB_TARGET_MASK, cb_target_mask); - /* GFX9: Flush DFSM when CB_TARGET_MASK changes. * I think we don't have to do anything between IBs. */ @@ -102,6 +99,10 @@ static void si_emit_cb_render_state(struct si_context *sctx) radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0)); } + unsigned initial_cdw = cs->current.cdw; + radeon_opt_set_context_reg(sctx, R_028238_CB_TARGET_MASK, + SI_TRACKED_CB_TARGET_MASK, cb_target_mask); + if (sctx->chip_class >= VI) { /* DCC MSAA workaround for blending. * Alternatively, we can set CB_COLORi_DCC_CONTROL.OVERWRITE_- @@ -252,6 +253,8 @@ static void si_emit_cb_render_state(struct si_context *sctx) sx_ps_downconvert, sx_blend_opt_epsilon, sx_blend_opt_control); } + if (initial_cdw != cs->current.cdw) + sctx->context_roll_counter++; } /* @@ -773,6 +776,7 @@ static void si_emit_clip_regs(struct si_context *sctx) clipdist_mask &= rs->clip_plane_enable; culldist_mask |= clipdist_mask; + unsigned initial_cdw = sctx->gfx_cs->current.cdw; radeon_opt_set_context_reg(sctx, R_02881C_PA_CL_VS_OUT_CNTL, SI_TRACKED_PA_CL_VS_OUT_CNTL, vs_sel->pa_cl_vs_out_cntl | @@ -784,6 +788,9 @@ static void si_emit_clip_regs(struct si_context *sctx) rs->pa_cl_clip_cntl | ucp_mask | S_028810_CLIP_DISABLE(window_space)); + + if (initial_cdw != sctx->gfx_cs->current.cdw) + sctx->context_roll_counter++; } /* @@ -1352,6 +1359,7 @@ static void si_emit_db_render_state(struct si_context *sctx) { struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; unsigned db_shader_control, db_render_control, db_count_control; + unsigned initial_cdw = sctx->gfx_cs->current.cdw; /* DB_RENDER_CONTROL */ if (sctx->dbcb_depth_copy_enabled || @@ -1434,6 +1442,9 @@ static void si_emit_db_render_state(struct si_context *sctx) radeon_opt_set_context_reg(sctx, R_02880C_DB_SHADER_CONTROL, SI_TRACKED_DB_SHADER_CONTROL, db_shader_control); + + if (initial_cdw != sctx->gfx_cs->current.cdw) + sctx->context_roll_counter++; } /* @@ -3489,6 +3500,8 @@ static void si_emit_msaa_config(struct si_context *sctx) } } + unsigned initial_cdw = cs->current.cdw; + /* R_028BDC_PA_SC_LINE_CNTL, R_028BE0_PA_SC_AA_CONFIG */ radeon_opt_set_context_reg2(sctx, R_028BDC_PA_SC_LINE_CNTL, SI_TRACKED_PA_SC_LINE_CNTL, sc_line_cntl, @@ -3500,10 +3513,14 @@ static void si_emit_msaa_config(struct si_context *sctx) radeon_opt_set_context_reg(sctx, R_028A4C_PA_SC_MODE_CNTL_1, SI_TRACKED_PA_SC_MODE_CNTL_1, sc_mode_cntl_1); - /* GFX9: Flush DFSM when the AA mode changes. */ - if (sctx->screen->dfsm_allowed) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0)); + if (initial_cdw != cs->current.cdw) { + sctx->context_roll_counter++; + + /* GFX9: Flush DFSM when the AA mode changes. */ + if (sctx->screen->dfsm_allowed) { + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0)); + } } } diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index f52296d1119..83589e6918c 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -171,17 +171,13 @@ union si_state { #define SI_STATE_BIT(name) (1 << SI_STATE_IDX(name)) #define SI_NUM_STATES (sizeof(union si_state) / sizeof(struct si_pm4_state *)) -static inline unsigned si_states_that_roll_context(void) +static inline unsigned si_states_that_always_roll_context(void) { return (SI_STATE_BIT(blend) | SI_STATE_BIT(rasterizer) | SI_STATE_BIT(dsa) | SI_STATE_BIT(poly_offset) | - SI_STATE_BIT(es) | - SI_STATE_BIT(gs) | - SI_STATE_BIT(vgt_shader_config) | - SI_STATE_BIT(vs) | - SI_STATE_BIT(ps)); + SI_STATE_BIT(vgt_shader_config)); } union si_state_atoms { @@ -216,25 +212,18 @@ union si_state_atoms { sizeof(struct si_atom))) #define SI_NUM_ATOMS (sizeof(union si_state_atoms)/sizeof(struct si_atom*)) -static inline unsigned si_atoms_that_roll_context(void) +static inline unsigned si_atoms_that_always_roll_context(void) { return (SI_ATOM_BIT(streamout_begin) | SI_ATOM_BIT(streamout_enable) | SI_ATOM_BIT(framebuffer) | SI_ATOM_BIT(msaa_sample_locs) | - SI_ATOM_BIT(db_render_state) | - SI_ATOM_BIT(dpbb_state) | - SI_ATOM_BIT(msaa_config) | SI_ATOM_BIT(sample_mask) | - SI_ATOM_BIT(cb_render_state) | SI_ATOM_BIT(blend_color) | - SI_ATOM_BIT(clip_regs) | SI_ATOM_BIT(clip_state) | - SI_ATOM_BIT(guardband) | SI_ATOM_BIT(scissors) | SI_ATOM_BIT(viewports) | SI_ATOM_BIT(stencil_ref) | - SI_ATOM_BIT(spi_map) | SI_ATOM_BIT(scratch_state)); } diff --git a/src/gallium/drivers/radeonsi/si_state_binning.c b/src/gallium/drivers/radeonsi/si_state_binning.c index 4aad94d95f9..70c129242d1 100644 --- a/src/gallium/drivers/radeonsi/si_state_binning.c +++ b/src/gallium/drivers/radeonsi/si_state_binning.c @@ -310,6 +310,8 @@ static struct uvec2 si_get_depth_bin_size(struct si_context *sctx) static void si_emit_dpbb_disable(struct si_context *sctx) { + unsigned initial_cdw = sctx->gfx_cs->current.cdw; + radeon_opt_set_context_reg(sctx, R_028C44_PA_SC_BINNER_CNTL_0, SI_TRACKED_PA_SC_BINNER_CNTL_0, S_028C44_BINNING_MODE(V_028C44_DISABLE_BINNING_USE_LEGACY_SC) | @@ -318,6 +320,8 @@ static void si_emit_dpbb_disable(struct si_context *sctx) SI_TRACKED_DB_DFSM_CONTROL, S_028060_PUNCHOUT_MODE(V_028060_FORCE_OFF) | S_028060_POPS_DRAIN_PS_ON_OVERLAP(1)); + if (initial_cdw != sctx->gfx_cs->current.cdw) + sctx->context_roll_counter++; } void si_emit_dpbb_state(struct si_context *sctx) @@ -419,6 +423,7 @@ void si_emit_dpbb_state(struct si_context *sctx) if (bin_size.y >= 32) bin_size_extend.y = util_logbase2(bin_size.y) - 5; + unsigned initial_cdw = sctx->gfx_cs->current.cdw; radeon_opt_set_context_reg( sctx, R_028C44_PA_SC_BINNER_CNTL_0, SI_TRACKED_PA_SC_BINNER_CNTL_0, @@ -436,4 +441,6 @@ void si_emit_dpbb_state(struct si_context *sctx) SI_TRACKED_DB_DFSM_CONTROL, S_028060_PUNCHOUT_MODE(punchout_mode) | S_028060_POPS_DRAIN_PS_ON_OVERLAP(1)); + if (initial_cdw != sctx->gfx_cs->current.cdw) + sctx->context_roll_counter++; } diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c index 69f723e4e4a..83eb646b791 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.c +++ b/src/gallium/drivers/radeonsi/si_state_draw.c @@ -1189,26 +1189,26 @@ static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_i unsigned skip_atom_mask) { unsigned num_patches = 0; + /* Vega10/Raven scissor bug workaround. When any context register is + * written (i.e. the GPU rolls the context), PA_SC_VPORT_SCISSOR + * registers must be written too. + */ + bool handle_scissor_bug = (sctx->family == CHIP_VEGA10 || sctx->family == CHIP_RAVEN) && + !si_is_atom_dirty(sctx, &sctx->atoms.s.scissors); bool context_roll = false; /* set correctly for GFX9 only */ context_roll |= si_emit_rasterizer_prim_state(sctx); if (sctx->tes_shader.cso) context_roll |= si_emit_derived_tess_state(sctx, info, &num_patches); - if (info->count_from_stream_output) + + if (handle_scissor_bug && + (info->count_from_stream_output || + sctx->dirty_atoms & si_atoms_that_always_roll_context() || + sctx->dirty_states & si_states_that_always_roll_context() || + si_prim_restart_index_changed(sctx, info))) context_roll = true; - /* Vega10/Raven scissor bug workaround. When any context register is - * written (i.e. the GPU rolls the context), PA_SC_VPORT_SCISSOR - * registers must be written too. - */ - if ((sctx->family == CHIP_VEGA10 || sctx->family == CHIP_RAVEN) && - (context_roll || - sctx->dirty_atoms & si_atoms_that_roll_context() || - sctx->dirty_states & si_states_that_roll_context() || - si_prim_restart_index_changed(sctx, info))) { - sctx->scissors.dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1; - si_mark_atom_dirty(sctx, &sctx->atoms.s.scissors); - } + sctx->context_roll_counter = 0; /* Emit state atoms. */ unsigned mask = sctx->dirty_atoms & ~skip_atom_mask; @@ -1231,6 +1231,12 @@ static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_i } sctx->dirty_states = 0; + if (handle_scissor_bug && + (context_roll || sctx->context_roll_counter)) { + sctx->scissors.dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1; + sctx->atoms.s.scissors.emit(sctx); + } + /* Emit draw states. */ si_emit_vs_state(sctx, info); si_emit_draw_registers(sctx, info, num_patches); diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 2bdac33586b..ad7d21e7816 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -561,6 +561,7 @@ static void si_shader_hs(struct si_screen *sscreen, struct si_shader *shader) static void si_emit_shader_es(struct si_context *sctx) { struct si_shader *shader = sctx->queued.named.es->shader; + unsigned initial_cdw = sctx->gfx_cs->current.cdw; if (!shader) return; @@ -578,6 +579,9 @@ static void si_emit_shader_es(struct si_context *sctx) radeon_opt_set_context_reg(sctx, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL, shader->vgt_vertex_reuse_block_cntl); + + if (initial_cdw != sctx->gfx_cs->current.cdw) + sctx->context_roll_counter++; } static void si_shader_es(struct si_screen *sscreen, struct si_shader *shader) @@ -762,6 +766,8 @@ static void gfx9_get_gs_info(struct si_shader_selector *es, static void si_emit_shader_gs(struct si_context *sctx) { struct si_shader *shader = sctx->queued.named.gs->shader; + unsigned initial_cdw = sctx->gfx_cs->current.cdw; + if (!shader) return; @@ -822,6 +828,9 @@ static void si_emit_shader_gs(struct si_context *sctx) SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL, shader->vgt_vertex_reuse_block_cntl); } + + if (initial_cdw != sctx->gfx_cs->current.cdw) + sctx->context_roll_counter++; } static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader) @@ -957,6 +966,8 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader) static void si_emit_shader_vs(struct si_context *sctx) { struct si_shader *shader = sctx->queued.named.vs->shader; + unsigned initial_cdw = sctx->gfx_cs->current.cdw; + if (!shader) return; @@ -994,6 +1005,9 @@ static void si_emit_shader_vs(struct si_context *sctx) radeon_opt_set_context_reg(sctx, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL, shader->vgt_vertex_reuse_block_cntl); + + if (initial_cdw != sctx->gfx_cs->current.cdw) + sctx->context_roll_counter++; } /** @@ -1156,6 +1170,8 @@ static unsigned si_get_spi_shader_col_format(struct si_shader *shader) static void si_emit_shader_ps(struct si_context *sctx) { struct si_shader *shader = sctx->queued.named.ps->shader; + unsigned initial_cdw = sctx->gfx_cs->current.cdw; + if (!shader) return; @@ -1181,6 +1197,9 @@ static void si_emit_shader_ps(struct si_context *sctx) radeon_opt_set_context_reg(sctx, R_02823C_CB_SHADER_MASK, SI_TRACKED_CB_SHADER_MASK, shader->ctx_reg.ps.cb_shader_mask); + + if (initial_cdw != sctx->gfx_cs->current.cdw) + sctx->context_roll_counter++; } static void si_shader_ps(struct si_shader *shader) @@ -2849,9 +2868,13 @@ static void si_emit_spi_map(struct si_context *sctx) /* R_028644_SPI_PS_INPUT_CNTL_0 */ /* Dota 2: Only ~16% of SPI map updates set different values. */ /* Talos: Only ~9% of SPI map updates set different values. */ + unsigned initial_cdw = sctx->gfx_cs->current.cdw; radeon_opt_set_context_regn(sctx, R_028644_SPI_PS_INPUT_CNTL_0, spi_ps_input_cntl, sctx->tracked_regs.spi_ps_input_cntl, num_interp); + + if (initial_cdw != sctx->gfx_cs->current.cdw) + sctx->context_roll_counter++; } /** diff --git a/src/gallium/drivers/radeonsi/si_state_viewport.c b/src/gallium/drivers/radeonsi/si_state_viewport.c index 819c773ba8e..587422e50ca 100644 --- a/src/gallium/drivers/radeonsi/si_state_viewport.c +++ b/src/gallium/drivers/radeonsi/si_state_viewport.c @@ -258,6 +258,7 @@ static void si_emit_guardband(struct si_context *ctx) * R_028BE8_PA_CL_GB_VERT_CLIP_ADJ, R_028BEC_PA_CL_GB_VERT_DISC_ADJ * R_028BF0_PA_CL_GB_HORZ_CLIP_ADJ, R_028BF4_PA_CL_GB_HORZ_DISC_ADJ */ + unsigned initial_cdw = ctx->gfx_cs->current.cdw; radeon_opt_set_context_reg4(ctx, R_028BE8_PA_CL_GB_VERT_CLIP_ADJ, SI_TRACKED_PA_CL_GB_VERT_CLIP_ADJ, fui(guardband_y), fui(discard_y), @@ -271,6 +272,8 @@ static void si_emit_guardband(struct si_context *ctx) S_028BE4_PIX_CENTER(rs->half_pixel_center) | S_028BE4_QUANT_MODE(V_028BE4_X_16_8_FIXED_POINT_1_256TH + vp_as_scissor.quant_mode)); + if (initial_cdw != ctx->gfx_cs->current.cdw) + ctx->context_roll_counter++; } static void si_emit_scissors(struct si_context *ctx) -- 2.30.2