From aab134cfa57cd2f72d4234fe3f41e392e6a4f48d Mon Sep 17 00:00:00 2001 From: =?utf8?q?Nicolai=20H=C3=A4hnle?= Date: Fri, 8 Sep 2017 12:05:24 +0200 Subject: [PATCH] radeonsi: enable out-of-order rasterization when possible on VI and GFX9 dGPUs MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit This does not take commutative blending into account yet. R600_DEBUG=nooutoforder disables it. Reviewed-by: Marek Olšák Tested-by: Dieter Nützel --- src/gallium/drivers/radeon/r600_pipe_common.c | 1 + src/gallium/drivers/radeon/r600_pipe_common.h | 2 +- src/gallium/drivers/radeonsi/si_pipe.c | 3 + src/gallium/drivers/radeonsi/si_pipe.h | 1 + src/gallium/drivers/radeonsi/si_state.c | 157 +++++++++++++++++- src/gallium/drivers/radeonsi/si_state.h | 28 +++- .../drivers/radeonsi/si_state_shaders.c | 7 + 7 files changed, 193 insertions(+), 6 deletions(-) diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c index 1183e181a82..59fcb63fb7a 100644 --- a/src/gallium/drivers/radeon/r600_pipe_common.c +++ b/src/gallium/drivers/radeon/r600_pipe_common.c @@ -891,6 +891,7 @@ static const struct debug_named_value common_debug_options[] = { { "nodccfb", DBG_NO_DCC_FB, "Disable separate DCC on the main framebuffer" }, { "nodpbb", DBG_NO_DPBB, "Disable DPBB." }, { "nodfsm", DBG_NO_DFSM, "Disable DFSM." }, + { "nooutoforder", DBG_NO_OUT_OF_ORDER, "Disable out-of-order rasterization" }, DEBUG_NAMED_VALUE_END /* must be last */ }; diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h index 46db2c968f3..bd0dc76ec2b 100644 --- a/src/gallium/drivers/radeon/r600_pipe_common.h +++ b/src/gallium/drivers/radeon/r600_pipe_common.h @@ -110,7 +110,7 @@ struct u_log_context; #define DBG_NO_RB_PLUS (1ull << 45) #define DBG_SI_SCHED (1ull << 46) #define DBG_MONOLITHIC_SHADERS (1ull << 47) -/* gap */ +#define DBG_NO_OUT_OF_ORDER (1ull << 48) #define DBG_UNSAFE_MATH (1ull << 49) #define DBG_NO_DCC_FB (1ull << 50) #define DBG_TEST_VMFAULT_CP (1ull << 51) diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index 1989574511e..68d63692e4f 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -1045,6 +1045,9 @@ struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws, sscreen->b.info.pfp_fw_version >= 79 && sscreen->b.info.me_fw_version >= 142); + sscreen->has_out_of_order_rast = sscreen->b.chip_class >= VI && + sscreen->b.info.max_se >= 2 && + !(sscreen->b.debug_flags & DBG_NO_OUT_OF_ORDER); sscreen->has_msaa_sample_loc_bug = (sscreen->b.family >= CHIP_POLARIS10 && sscreen->b.family <= CHIP_POLARIS12) || sscreen->b.family == CHIP_VEGA10 || diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 10215a35886..6d9d3def7b5 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -94,6 +94,7 @@ struct si_screen { bool has_clear_state; bool has_distributed_tess; bool has_draw_indirect_multi; + bool has_out_of_order_rast; bool has_msaa_sample_loc_bug; bool dpbb_allowed; bool dfsm_allowed; diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index f2a6a259919..9287086038d 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -423,6 +423,7 @@ static void *si_create_blend_state_mode(struct pipe_context *ctx, blend->alpha_to_coverage = state->alpha_to_coverage; blend->alpha_to_one = state->alpha_to_one; blend->dual_src_blend = util_blend_state_is_dual(state, 0); + blend->logicop_enable = state->logicop_enable; if (state->logicop_enable) { color_control |= S_028808_ROP3(state->logicop_func | (state->logicop_func << 4)); @@ -630,6 +631,13 @@ static void si_bind_blend_state(struct pipe_context *ctx, void *state) old_blend->blend_enable_4bit != blend->blend_enable_4bit || old_blend->cb_target_enabled_4bit != blend->cb_target_enabled_4bit)) si_mark_atom_dirty(sctx, &sctx->dpbb_state); + + if (sctx->screen->has_out_of_order_rast && + (!old_blend || + (old_blend->blend_enable_4bit != blend->blend_enable_4bit || + old_blend->cb_target_enabled_4bit != blend->cb_target_enabled_4bit || + old_blend->logicop_enable != blend->logicop_enable))) + si_mark_atom_dirty(sctx, &sctx->msaa_config); } static void si_delete_blend_state(struct pipe_context *ctx, void *state) @@ -1059,6 +1067,30 @@ static bool si_dsa_writes_stencil(const struct pipe_stencil_state *s) s->zpass_op != PIPE_STENCIL_OP_KEEP); } +static bool si_order_invariant_stencil_op(enum pipe_stencil_op op) +{ + /* REPLACE is normally order invariant, except when the stencil + * reference value is written by the fragment shader. Tracking this + * interaction does not seem worth the effort, so be conservative. */ + return op != PIPE_STENCIL_OP_INCR && + op != PIPE_STENCIL_OP_DECR && + op != PIPE_STENCIL_OP_REPLACE; +} + +/* Compute whether, assuming Z writes are disabled, this stencil state is order + * invariant in the sense that the set of passing fragments as well as the + * final stencil buffer result does not depend on the order of fragments. */ +static bool si_order_invariant_stencil_state(const struct pipe_stencil_state *state) +{ + return !state->enabled || !state->writemask || + /* The following assumes that Z writes are disabled. */ + (state->func == PIPE_FUNC_ALWAYS && + si_order_invariant_stencil_op(state->zpass_op) && + si_order_invariant_stencil_op(state->zfail_op)) || + (state->func == PIPE_FUNC_NEVER && + si_order_invariant_stencil_op(state->fail_op)); +} + static void *si_create_dsa_state(struct pipe_context *ctx, const struct pipe_depth_stencil_alpha_state *state) { @@ -1125,6 +1157,44 @@ static void *si_create_dsa_state(struct pipe_context *ctx, si_dsa_writes_stencil(&state->stencil[1])); dsa->db_can_write = dsa->depth_write_enabled || dsa->stencil_write_enabled; + + bool zfunc_is_ordered = + state->depth.func == PIPE_FUNC_NEVER || + state->depth.func == PIPE_FUNC_LESS || + state->depth.func == PIPE_FUNC_LEQUAL || + state->depth.func == PIPE_FUNC_GREATER || + state->depth.func == PIPE_FUNC_GEQUAL; + + bool nozwrite_and_order_invariant_stencil = + !dsa->db_can_write || + (!dsa->depth_write_enabled && + si_order_invariant_stencil_state(&state->stencil[0]) && + si_order_invariant_stencil_state(&state->stencil[1])); + + dsa->order_invariance[1].zs = + nozwrite_and_order_invariant_stencil || + (!dsa->stencil_write_enabled && zfunc_is_ordered); + dsa->order_invariance[0].zs = !dsa->depth_write_enabled || zfunc_is_ordered; + + dsa->order_invariance[1].pass_set = + nozwrite_and_order_invariant_stencil || + (!dsa->stencil_write_enabled && + (state->depth.func == PIPE_FUNC_ALWAYS || + state->depth.func == PIPE_FUNC_NEVER)); + dsa->order_invariance[0].pass_set = + !dsa->depth_write_enabled || + (state->depth.func == PIPE_FUNC_ALWAYS || + state->depth.func == PIPE_FUNC_NEVER); + + const bool assume_no_z_fights = false; + + dsa->order_invariance[1].pass_last = + assume_no_z_fights && !dsa->stencil_write_enabled && + dsa->depth_write_enabled && zfunc_is_ordered; + dsa->order_invariance[0].pass_last = + assume_no_z_fights && + dsa->depth_write_enabled && zfunc_is_ordered; + return dsa; } @@ -1154,6 +1224,12 @@ static void si_bind_dsa_state(struct pipe_context *ctx, void *state) old_dsa->stencil_enabled != dsa->stencil_enabled || old_dsa->db_can_write != dsa->db_can_write))) si_mark_atom_dirty(sctx, &sctx->dpbb_state); + + if (sctx->screen->has_out_of_order_rast && + (!old_dsa || + memcmp(old_dsa->order_invariance, dsa->order_invariance, + sizeof(old_dsa->order_invariance)))) + si_mark_atom_dirty(sctx, &sctx->msaa_config); } static void si_delete_dsa_state(struct pipe_context *ctx, void *state) @@ -1198,6 +1274,11 @@ static void si_set_occlusion_query_state(struct pipe_context *ctx, struct si_context *sctx = (struct si_context*)ctx; si_mark_atom_dirty(sctx, &sctx->db_render_state); + + bool perfect_enable = sctx->b.num_perfect_occlusion_queries != 0; + + if (perfect_enable != old_perfect_enable) + si_mark_atom_dirty(sctx, &sctx->msaa_config); } static void si_save_qbo_state(struct pipe_context *ctx, struct r600_qbo_state *st) @@ -2549,6 +2630,11 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, struct r600_texture *rtex; bool old_any_dst_linear = sctx->framebuffer.any_dst_linear; unsigned old_nr_samples = sctx->framebuffer.nr_samples; + unsigned old_colorbuf_enabled_4bit = sctx->framebuffer.colorbuf_enabled_4bit; + bool old_has_zsbuf = !!sctx->framebuffer.state.zsbuf; + bool old_has_stencil = + old_has_zsbuf && + ((struct r600_texture*)sctx->framebuffer.state.zsbuf->texture)->surface.has_stencil; bool unbound = false; int i; @@ -2706,15 +2792,17 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, } } + struct r600_texture *zstex = NULL; + if (state->zsbuf) { surf = (struct r600_surface*)state->zsbuf; - rtex = (struct r600_texture*)surf->base.texture; + zstex = (struct r600_texture*)surf->base.texture; if (!surf->depth_initialized) { si_init_depth_surface(sctx, surf); } - if (vi_tc_compat_htile_enabled(rtex, surf->base.u.tex.level)) + if (vi_tc_compat_htile_enabled(zstex, surf->base.u.tex.level)) sctx->framebuffer.DB_has_shader_readable_metadata = true; r600_context_add_resource_size(ctx, surf->base.texture); @@ -2730,6 +2818,12 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, if (sctx->framebuffer.any_dst_linear != old_any_dst_linear) si_mark_atom_dirty(sctx, &sctx->msaa_config); + if (sctx->screen->has_out_of_order_rast && + (sctx->framebuffer.colorbuf_enabled_4bit != old_colorbuf_enabled_4bit || + !!sctx->framebuffer.state.zsbuf != old_has_zsbuf || + (zstex && zstex->surface.has_stencil != old_has_stencil))) + si_mark_atom_dirty(sctx, &sctx->msaa_config); + if (sctx->framebuffer.nr_samples != old_nr_samples) { si_mark_atom_dirty(sctx, &sctx->msaa_config); si_mark_atom_dirty(sctx, &sctx->db_render_state); @@ -3066,16 +3160,75 @@ static void si_emit_msaa_sample_locs(struct si_context *sctx, } } +static bool si_out_of_order_rasterization(struct si_context *sctx) +{ + struct si_state_blend *blend = sctx->queued.named.blend; + struct si_state_dsa *dsa = sctx->queued.named.dsa; + + if (!sctx->screen->has_out_of_order_rast) + return false; + + unsigned colormask = sctx->framebuffer.colorbuf_enabled_4bit; + + if (blend) { + colormask &= blend->cb_target_enabled_4bit; + } else { + colormask = 0; + } + + /* Conservative: No logic op. */ + if (colormask && blend->logicop_enable) + return false; + + struct si_dsa_order_invariance dsa_order_invariant = { + .zs = true, .pass_set = true, .pass_last = false + }; + + if (sctx->framebuffer.state.zsbuf) { + struct r600_texture *zstex = + (struct r600_texture*)sctx->framebuffer.state.zsbuf->texture; + bool has_stencil = zstex->surface.has_stencil; + dsa_order_invariant = dsa->order_invariance[has_stencil]; + if (!dsa_order_invariant.zs) + return false; + + /* The set of PS invocations is always order invariant, + * except when early Z/S tests are requested. */ + if (sctx->ps_shader.cso && + sctx->ps_shader.cso->info.writes_memory && + sctx->ps_shader.cso->info.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL] && + !dsa_order_invariant.pass_set) + return false; + + if (sctx->b.num_perfect_occlusion_queries != 0 && + !dsa_order_invariant.pass_set) + return false; + } + + if (!colormask) + return true; + + bool blend_enabled = (colormask & blend->blend_enable_4bit) != 0; + + if (blend_enabled) + return false; /* TODO */ + + return dsa_order_invariant.pass_last; +} + static void si_emit_msaa_config(struct si_context *sctx, struct r600_atom *atom) { struct radeon_winsys_cs *cs = sctx->b.gfx.cs; unsigned num_tile_pipes = sctx->screen->b.info.num_tile_pipes; /* 33% faster rendering to linear color buffers */ bool dst_is_linear = sctx->framebuffer.any_dst_linear; + bool out_of_order_rast = si_out_of_order_rasterization(sctx); unsigned sc_mode_cntl_1 = S_028A4C_WALK_SIZE(dst_is_linear) | S_028A4C_WALK_FENCE_ENABLE(!dst_is_linear) | S_028A4C_WALK_FENCE_SIZE(num_tile_pipes == 2 ? 2 : 3) | + S_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(out_of_order_rast) | + S_028A4C_OUT_OF_ORDER_WATER_MARK(0x7) | /* always 1: */ S_028A4C_WALK_ALIGN8_PRIM_FITS_ST(1) | S_028A4C_SUPERTILE_WALK_ORDER_ENABLE(1) | diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index 672229c15d9..4f14f89166d 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -49,15 +49,16 @@ struct si_shader_selector; struct si_state_blend { struct si_pm4_state pm4; uint32_t cb_target_mask; - bool alpha_to_coverage; - bool alpha_to_one; - bool dual_src_blend; /* Set 0xf or 0x0 (4 bits) per render target if the following is * true. ANDed with spi_shader_col_format. */ unsigned cb_target_enabled_4bit; unsigned blend_enable_4bit; unsigned need_src_alpha_4bit; + bool alpha_to_coverage:1; + bool alpha_to_one:1; + bool dual_src_blend:1; + bool logicop_enable:1; }; struct si_state_rasterizer { @@ -89,15 +90,36 @@ struct si_dsa_stencil_ref_part { uint8_t writemask[2]; }; +struct si_dsa_order_invariance { + /** Whether the final result in Z/S buffers is guaranteed to be + * invariant under changes to the order in which fragments arrive. */ + bool zs:1; + + /** Whether the set of fragments that pass the combined Z/S test is + * guaranteed to be invariant under changes to the order in which + * fragments arrive. */ + bool pass_set:1; + + /** Whether the last fragment that passes the combined Z/S test at each + * sample is guaranteed to be invariant under changes to the order in + * which fragments arrive. */ + bool pass_last:1; +}; + struct si_state_dsa { struct si_pm4_state pm4; struct si_dsa_stencil_ref_part stencil_ref; + + /* 0 = without stencil buffer, 1 = when both Z and S buffers are present */ + struct si_dsa_order_invariance order_invariance[2]; + ubyte alpha_func:3; bool depth_enabled:1; bool depth_write_enabled:1; bool stencil_enabled:1; bool stencil_write_enabled:1; bool db_can_write:1; + }; struct si_stencil_ref { diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index bbc6b1d7080..53a60ba11ed 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -2421,6 +2421,13 @@ static void si_bind_ps_shader(struct pipe_context *ctx, void *state) if (!old_sel || old_sel->info.colors_written != sel->info.colors_written) si_mark_atom_dirty(sctx, &sctx->cb_render_state); + + if (sctx->screen->has_out_of_order_rast && + (!old_sel || + old_sel->info.writes_memory != sel->info.writes_memory || + old_sel->info.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL] != + sel->info.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL])) + si_mark_atom_dirty(sctx, &sctx->msaa_config); } si_set_active_descriptors_for_shader(sctx, sel); } -- 2.30.2