From 440135e5a0d178c537db3f96e6823bc8220a0f3f Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Thu, 18 Apr 2019 15:19:19 -0400 Subject: [PATCH] radeonsi/gfx9: rework the gfx9 scissor bug workaround (v2) Needed to track context rolls caused by streamout and ACQUIRE_MEM. ACQUIRE_MEM can occur outside of draw calls. Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=110355 v2: squashed patches and done more rework Cc: 19.0 --- src/gallium/drivers/radeonsi/si_pipe.c | 2 + src/gallium/drivers/radeonsi/si_pipe.h | 3 +- src/gallium/drivers/radeonsi/si_state.c | 8 +- .../drivers/radeonsi/si_state_binning.c | 4 +- src/gallium/drivers/radeonsi/si_state_draw.c | 86 +++++++++++-------- .../drivers/radeonsi/si_state_shaders.c | 10 +-- .../drivers/radeonsi/si_state_streamout.c | 1 + .../drivers/radeonsi/si_state_viewport.c | 2 +- 8 files changed, 68 insertions(+), 48 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index 07642246ab6..aaf5138a3a2 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -1097,6 +1097,8 @@ struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws, #include "si_debug_options.h" } + sscreen->has_gfx9_scissor_bug = sscreen->info.family == CHIP_VEGA10 || + sscreen->info.family == CHIP_RAVEN; sscreen->has_msaa_sample_loc_bug = (sscreen->info.family >= CHIP_POLARIS10 && sscreen->info.family <= CHIP_POLARIS12) || sscreen->info.family == CHIP_VEGA10 || diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 7fc0319973b..1d26ca90219 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -468,6 +468,7 @@ struct si_screen { bool has_out_of_order_rast; bool assume_no_z_fights; bool commutative_blend_add; + bool has_gfx9_scissor_bug; bool has_msaa_sample_loc_bug; bool has_ls_vgpr_init_bug; bool has_dcc_constant_encode; @@ -1075,7 +1076,7 @@ struct si_context { unsigned num_resident_handles; uint64_t num_alloc_tex_transfer_bytes; unsigned last_tex_ps_draw_ratio; /* for query */ - unsigned context_roll_counter; + unsigned context_roll; /* Queries. */ /* Maintain the list of active queries for pausing between IBs. */ diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index 757c17f7df8..bc7e777ad73 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -256,7 +256,7 @@ static void si_emit_cb_render_state(struct si_context *sctx) sx_blend_opt_control); } if (initial_cdw != cs->current.cdw) - sctx->context_roll_counter++; + sctx->context_roll = true; } /* @@ -793,7 +793,7 @@ static void si_emit_clip_regs(struct si_context *sctx) S_028810_CLIP_DISABLE(window_space)); if (initial_cdw != sctx->gfx_cs->current.cdw) - sctx->context_roll_counter++; + sctx->context_roll = true; } /* @@ -1455,7 +1455,7 @@ static void si_emit_db_render_state(struct si_context *sctx) SI_TRACKED_DB_SHADER_CONTROL, db_shader_control); if (initial_cdw != sctx->gfx_cs->current.cdw) - sctx->context_roll_counter++; + sctx->context_roll = true; } /* @@ -3544,7 +3544,7 @@ static void si_emit_msaa_config(struct si_context *sctx) SI_TRACKED_PA_SC_MODE_CNTL_1, sc_mode_cntl_1); if (initial_cdw != cs->current.cdw) { - sctx->context_roll_counter++; + sctx->context_roll = true; /* GFX9: Flush DFSM when the AA mode changes. 
*/ if (sctx->screen->dfsm_allowed) { diff --git a/src/gallium/drivers/radeonsi/si_state_binning.c b/src/gallium/drivers/radeonsi/si_state_binning.c index 3516e561282..5c6c2e69b90 100644 --- a/src/gallium/drivers/radeonsi/si_state_binning.c +++ b/src/gallium/drivers/radeonsi/si_state_binning.c @@ -321,7 +321,7 @@ static void si_emit_dpbb_disable(struct si_context *sctx) S_028060_PUNCHOUT_MODE(V_028060_FORCE_OFF) | S_028060_POPS_DRAIN_PS_ON_OVERLAP(1)); if (initial_cdw != sctx->gfx_cs->current.cdw) - sctx->context_roll_counter++; + sctx->context_roll = true; } void si_emit_dpbb_state(struct si_context *sctx) @@ -443,5 +443,5 @@ void si_emit_dpbb_state(struct si_context *sctx) S_028060_PUNCHOUT_MODE(punchout_mode) | S_028060_POPS_DRAIN_PS_ON_OVERLAP(1)); if (initial_cdw != sctx->gfx_cs->current.cdw) - sctx->context_roll_counter++; + sctx->context_roll = true; } diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c index 80e1bc4b475..4b60679484f 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.c +++ b/src/gallium/drivers/radeonsi/si_state_draw.c @@ -66,7 +66,7 @@ static unsigned si_conv_pipe_prim(unsigned mode) * The information about LDS and other non-compile-time parameters is then * written to userdata SGPRs. */ -static bool si_emit_derived_tess_state(struct si_context *sctx, +static void si_emit_derived_tess_state(struct si_context *sctx, const struct pipe_draw_info *info, unsigned *num_patches) { @@ -110,7 +110,7 @@ static bool si_emit_derived_tess_state(struct si_context *sctx, (!has_primid_instancing_bug || (sctx->last_tess_uses_primid == tess_uses_primid))) { *num_patches = sctx->last_num_patches; - return false; + return; } sctx->last_ls = ls_current; @@ -305,9 +305,8 @@ static bool si_emit_derived_tess_state(struct si_context *sctx, ls_hs_config); } sctx->last_ls_hs_config = ls_hs_config; - return true; /* true if the context rolls */ + sctx->context_roll = true; } - return false; } static unsigned si_num_prims_for_vertices(const struct pipe_draw_info *info) @@ -541,7 +540,7 @@ static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx, } /* rast_prim is the primitive type after GS. */ -static bool si_emit_rasterizer_prim_state(struct si_context *sctx) +static void si_emit_rasterizer_prim_state(struct si_context *sctx) { struct radeon_cmdbuf *cs = sctx->gfx_cs; enum pipe_prim_type rast_prim = sctx->current_rast_prim; @@ -549,11 +548,11 @@ static bool si_emit_rasterizer_prim_state(struct si_context *sctx) /* Skip this if not rendering lines. */ if (!util_prim_is_lines(rast_prim)) - return false; + return; if (rast_prim == sctx->last_rast_prim && rs->pa_sc_line_stipple == sctx->last_sc_line_stipple) - return false; + return; /* For lines, reset the stipple pattern at each primitive. Otherwise, * reset the stipple pattern at each packet (line strips, line loops). 
@@ -564,7 +563,7 @@ static bool si_emit_rasterizer_prim_state(struct si_context *sctx) sctx->last_rast_prim = rast_prim; sctx->last_sc_line_stipple = rs->pa_sc_line_stipple; - return true; /* true if the context rolls */ + sctx->context_roll = true; } static void si_emit_vs_state(struct si_context *sctx, @@ -659,6 +658,7 @@ static void si_emit_draw_registers(struct si_context *sctx, radeon_set_context_reg(cs, R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX, info->restart_index); sctx->last_restart_index = info->restart_index; + sctx->context_roll = true; } } @@ -886,6 +886,11 @@ static void si_emit_surface_sync(struct si_context *sctx, radeon_emit(cs, 0); /* CP_COHER_BASE */ radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */ } + + /* ACQUIRE_MEM has an implicit context roll if the current context + * is busy. */ + if (sctx->has_graphics) + sctx->context_roll = true; } void si_emit_cache_flush(struct si_context *sctx) @@ -1213,26 +1218,10 @@ static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_i unsigned skip_atom_mask) { unsigned num_patches = 0; - /* Vega10/Raven scissor bug workaround. When any context register is - * written (i.e. the GPU rolls the context), PA_SC_VPORT_SCISSOR - * registers must be written too. - */ - bool handle_scissor_bug = (sctx->family == CHIP_VEGA10 || sctx->family == CHIP_RAVEN) && - !si_is_atom_dirty(sctx, &sctx->atoms.s.scissors); - bool context_roll = false; /* set correctly for GFX9 only */ - context_roll |= si_emit_rasterizer_prim_state(sctx); + si_emit_rasterizer_prim_state(sctx); if (sctx->tes_shader.cso) - context_roll |= si_emit_derived_tess_state(sctx, info, &num_patches); - - if (handle_scissor_bug && - (info->count_from_stream_output || - sctx->dirty_atoms & si_atoms_that_always_roll_context() || - sctx->dirty_states & si_states_that_always_roll_context() || - si_prim_restart_index_changed(sctx, info))) - context_roll = true; - - sctx->context_roll_counter = 0; + si_emit_derived_tess_state(sctx, info, &num_patches); /* Emit state atoms. */ unsigned mask = sctx->dirty_atoms & ~skip_atom_mask; @@ -1255,12 +1244,6 @@ static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_i } sctx->dirty_states = 0; - if (handle_scissor_bug && - (context_roll || sctx->context_roll_counter)) { - sctx->scissors.dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1; - sctx->atoms.s.scissors.emit(sctx); - } - /* Emit draw states. */ si_emit_vs_state(sctx, info); si_emit_draw_registers(sctx, info, num_patches); @@ -1462,6 +1445,22 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i if (!si_upload_vertex_buffer_descriptors(sctx)) goto return_cleanup; + /* Vega10/Raven scissor bug workaround. When any context register is + * written (i.e. the GPU rolls the context), PA_SC_VPORT_SCISSOR + * registers must be written too. + */ + bool has_gfx9_scissor_bug = sctx->screen->has_gfx9_scissor_bug; + unsigned masked_atoms = 0; + + if (has_gfx9_scissor_bug) { + masked_atoms |= si_get_atom_bit(sctx, &sctx->atoms.s.scissors); + + if (info->count_from_stream_output || + sctx->dirty_atoms & si_atoms_that_always_roll_context() || + sctx->dirty_states & si_states_that_always_roll_context()) + sctx->context_roll = true; + } + /* Use optimal packet order based on whether we need to sync the pipeline. */ if (unlikely(sctx->flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB | @@ -1472,8 +1471,6 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i * Then draw and prefetch at the end. 
This ensures that the time * the CUs are idle is very short. */ - unsigned masked_atoms = 0; - if (unlikely(sctx->flags & SI_CONTEXT_FLUSH_FOR_RENDER_COND)) masked_atoms |= si_get_atom_bit(sctx, &sctx->atoms.s.render_cond); @@ -1487,6 +1484,13 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i if (si_is_atom_dirty(sctx, &sctx->atoms.s.render_cond)) sctx->atoms.s.render_cond.emit(sctx); + + if (has_gfx9_scissor_bug && + (sctx->context_roll || + si_is_atom_dirty(sctx, &sctx->atoms.s.scissors))) { + sctx->scissors.dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1; + sctx->atoms.s.scissors.emit(sctx); + } sctx->dirty_atoms = 0; si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset); @@ -1511,7 +1515,16 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i if (!si_upload_graphics_shader_descriptors(sctx)) return; - si_emit_all_states(sctx, info, 0); + si_emit_all_states(sctx, info, masked_atoms); + + if (has_gfx9_scissor_bug && + (sctx->context_roll || + si_is_atom_dirty(sctx, &sctx->atoms.s.scissors))) { + sctx->scissors.dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1; + sctx->atoms.s.scissors.emit(sctx); + } + sctx->dirty_atoms = 0; + si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset); /* Prefetch the remaining shaders after the draw has been @@ -1520,6 +1533,9 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i cik_emit_prefetch_L2(sctx, false); } + /* Clear the context roll flag after the draw call. */ + sctx->context_roll = false; + if (unlikely(sctx->current_saved_cs)) { si_trace_emit(sctx); si_log_draw_state(sctx, sctx->log); diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index f57e7730905..55df95477d3 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -576,7 +576,7 @@ static void si_emit_shader_es(struct si_context *sctx) shader->vgt_vertex_reuse_block_cntl); if (initial_cdw != sctx->gfx_cs->current.cdw) - sctx->context_roll_counter++; + sctx->context_roll = true; } static void si_shader_es(struct si_screen *sscreen, struct si_shader *shader) @@ -825,7 +825,7 @@ static void si_emit_shader_gs(struct si_context *sctx) } if (initial_cdw != sctx->gfx_cs->current.cdw) - sctx->context_roll_counter++; + sctx->context_roll = true; } static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader) @@ -1002,7 +1002,7 @@ static void si_emit_shader_vs(struct si_context *sctx) shader->vgt_vertex_reuse_block_cntl); if (initial_cdw != sctx->gfx_cs->current.cdw) - sctx->context_roll_counter++; + sctx->context_roll = true; } /** @@ -1194,7 +1194,7 @@ static void si_emit_shader_ps(struct si_context *sctx) shader->ctx_reg.ps.cb_shader_mask); if (initial_cdw != sctx->gfx_cs->current.cdw) - sctx->context_roll_counter++; + sctx->context_roll = true; } static void si_shader_ps(struct si_shader *shader) @@ -2877,7 +2877,7 @@ static void si_emit_spi_map(struct si_context *sctx) sctx->tracked_regs.spi_ps_input_cntl, num_interp); if (initial_cdw != sctx->gfx_cs->current.cdw) - sctx->context_roll_counter++; + sctx->context_roll = true; } /** diff --git a/src/gallium/drivers/radeonsi/si_state_streamout.c b/src/gallium/drivers/radeonsi/si_state_streamout.c index 2bf6862c89b..2a0a4bef9a2 100644 --- a/src/gallium/drivers/radeonsi/si_state_streamout.c +++ b/src/gallium/drivers/radeonsi/si_state_streamout.c @@ -303,6 +303,7 @@ void si_emit_streamout_end(struct 
si_context *sctx) * buffer bound. This ensures that the primitives-emitted query * won't increment. */ radeon_set_context_reg(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 0); + sctx->context_roll = true; t[i]->buf_filled_size_valid = true; } diff --git a/src/gallium/drivers/radeonsi/si_state_viewport.c b/src/gallium/drivers/radeonsi/si_state_viewport.c index f988da4520b..6f348a9b58d 100644 --- a/src/gallium/drivers/radeonsi/si_state_viewport.c +++ b/src/gallium/drivers/radeonsi/si_state_viewport.c @@ -283,7 +283,7 @@ static void si_emit_guardband(struct si_context *ctx) S_028BE4_QUANT_MODE(V_028BE4_X_16_8_FIXED_POINT_1_256TH + vp_as_scissor.quant_mode)); if (initial_cdw != ctx->gfx_cs->current.cdw) - ctx->context_roll_counter++; + ctx->context_roll = true; } static void si_emit_scissors(struct si_context *ctx) -- 2.30.2
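
For reference, the pattern the patch converges on can be summarized with a minimal C sketch. This is not driver code: the struct and helpers below are simplified stand-ins for si_context, radeon_cmdbuf and the state-atom machinery, assuming register writes may be elided by register shadowing. Each emit path flags context_roll only when it actually wrote context registers, the draw path re-emits the scissor registers on Vega10/Raven whenever the flag (or a dirty scissors atom) is seen, and the flag is cleared after every draw.

/* Minimal sketch of the context-roll tracking introduced above.
 * Simplified stand-ins, not the real si_context / radeon_cmdbuf types. */

#include <stdbool.h>

#define SI_MAX_VIEWPORTS 16

struct roll_ctx {
	bool has_gfx9_scissor_bug;    /* set for CHIP_VEGA10 / CHIP_RAVEN */
	bool context_roll;            /* replaces the old context_roll_counter */
	unsigned cs_cdw;              /* current command-stream dword count */
	unsigned scissors_dirty_mask;
};

/* Stand-in for a context-register write that shadowing might skip. */
static void set_context_reg(struct roll_ctx *ctx, unsigned reg, unsigned val)
{
	(void)reg; (void)val;
	ctx->cs_cdw += 3;             /* pretend the packet was emitted */
}

/* Every state-emit helper follows the same pattern as si_emit_shader_*,
 * si_emit_dpbb_state, etc.: flag a roll only if dwords were written. */
static void emit_some_state(struct roll_ctx *ctx)
{
	unsigned initial_cdw = ctx->cs_cdw;

	set_context_reg(ctx, 0x28060, 0);

	if (initial_cdw != ctx->cs_cdw)
		ctx->context_roll = true;
}

/* Draw path: mirrors the si_draw_vbo handling of the workaround. */
static void draw(struct roll_ctx *ctx)
{
	emit_some_state(ctx);

	/* Streamout, ACQUIRE_MEM, a primitive-restart index change, etc.
	 * would also have set context_roll by this point. */
	if (ctx->has_gfx9_scissor_bug && ctx->context_roll) {
		ctx->scissors_dirty_mask = (1u << SI_MAX_VIEWPORTS) - 1;
		/* ...emit the scissors atom here... */
	}

	/* ...emit the draw packets... */

	ctx->context_roll = false;    /* cleared once per draw call */
}

int main(void)
{
	struct roll_ctx ctx = { .has_gfx9_scissor_bug = true };
	draw(&ctx);
	return 0;
}

The design change relative to the old workaround is visible in si_draw_vbo: instead of si_emit_all_states trying to predict rolls up front via context_roll_counter, the roll is now recorded at the point where it happens (including si_emit_streamout_end and the ACQUIRE_MEM path in si_emit_surface_sync), so the scissor re-emit can run once, after all state atoms, in both packet orders.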