radeonsi: enable out-of-order rasterization when possible on VI and GFX9 dGPUs
author Nicolai Hähnle <nicolai.haehnle@amd.com>
Fri, 8 Sep 2017 10:05:24 +0000 (12:05 +0200)
committer Nicolai Hähnle <nicolai.haehnle@amd.com>
Mon, 18 Sep 2017 09:25:19 +0000 (11:25 +0200)
This does not take commutative blending into account yet.

R600_DEBUG=nooutoforder disables it.

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Tested-by: Dieter Nützel <Dieter@nuetzel-hh.de>
src/gallium/drivers/radeon/r600_pipe_common.c
src/gallium/drivers/radeon/r600_pipe_common.h
src/gallium/drivers/radeonsi/si_pipe.c
src/gallium/drivers/radeonsi/si_pipe.h
src/gallium/drivers/radeonsi/si_state.c
src/gallium/drivers/radeonsi/si_state.h
src/gallium/drivers/radeonsi/si_state_shaders.c

index 1183e181a828b5585745c10b79b54efbaf3b2d0f..59fcb63fb7aea7514ea6ee6a482ddd558e22ad8e 100644 (file)
@@ -891,6 +891,7 @@ static const struct debug_named_value common_debug_options[] = {
        { "nodccfb", DBG_NO_DCC_FB, "Disable separate DCC on the main framebuffer" },
        { "nodpbb", DBG_NO_DPBB, "Disable DPBB." },
        { "nodfsm", DBG_NO_DFSM, "Disable DFSM." },
+       { "nooutoforder", DBG_NO_OUT_OF_ORDER, "Disable out-of-order rasterization" },
 
        DEBUG_NAMED_VALUE_END /* must be last */
 };
index 46db2c968f31332488b6776c3532cd51eaed6581..bd0dc76ec2b960945d5173d5bfae1957e82750f9 100644 (file)
@@ -110,7 +110,7 @@ struct u_log_context;
 #define DBG_NO_RB_PLUS         (1ull << 45)
 #define DBG_SI_SCHED           (1ull << 46)
 #define DBG_MONOLITHIC_SHADERS (1ull << 47)
-/* gap */
+#define DBG_NO_OUT_OF_ORDER    (1ull << 48)
 #define DBG_UNSAFE_MATH                (1ull << 49)
 #define DBG_NO_DCC_FB          (1ull << 50)
 #define DBG_TEST_VMFAULT_CP    (1ull << 51)
index 1989574511ea2c501efa81f537c9d061339316cf..68d63692e4f5ff3f2855ae71b2392653484b1f0c 100644 (file)
@@ -1045,6 +1045,9 @@ struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws,
                 sscreen->b.info.pfp_fw_version >= 79 &&
                 sscreen->b.info.me_fw_version >= 142);
 
+       sscreen->has_out_of_order_rast = sscreen->b.chip_class >= VI &&
+                                        sscreen->b.info.max_se >= 2 &&
+                                        !(sscreen->b.debug_flags & DBG_NO_OUT_OF_ORDER);
        sscreen->has_msaa_sample_loc_bug = (sscreen->b.family >= CHIP_POLARIS10 &&
                                            sscreen->b.family <= CHIP_POLARIS12) ||
                                           sscreen->b.family == CHIP_VEGA10 ||
index 10215a35886f37f98de5834d7f363467050f3bd5..6d9d3def7b526a560c35a626860cb66649d80265 100644 (file)
@@ -94,6 +94,7 @@ struct si_screen {
        bool                            has_clear_state;
        bool                            has_distributed_tess;
        bool                            has_draw_indirect_multi;
+       bool                            has_out_of_order_rast;
        bool                            has_msaa_sample_loc_bug;
        bool                            dpbb_allowed;
        bool                            dfsm_allowed;
index f2a6a259919da3a664646dea789a15399ad42f51..9287086038d5552e9dfa52ec74d4f0243d8f4638 100644 (file)
@@ -423,6 +423,7 @@ static void *si_create_blend_state_mode(struct pipe_context *ctx,
        blend->alpha_to_coverage = state->alpha_to_coverage;
        blend->alpha_to_one = state->alpha_to_one;
        blend->dual_src_blend = util_blend_state_is_dual(state, 0);
+       blend->logicop_enable = state->logicop_enable;
 
        if (state->logicop_enable) {
                color_control |= S_028808_ROP3(state->logicop_func | (state->logicop_func << 4));
@@ -630,6 +631,13 @@ static void si_bind_blend_state(struct pipe_context *ctx, void *state)
             old_blend->blend_enable_4bit != blend->blend_enable_4bit ||
             old_blend->cb_target_enabled_4bit != blend->cb_target_enabled_4bit))
                si_mark_atom_dirty(sctx, &sctx->dpbb_state);
+
+       if (sctx->screen->has_out_of_order_rast &&
+           (!old_blend ||
+            (old_blend->blend_enable_4bit != blend->blend_enable_4bit ||
+             old_blend->cb_target_enabled_4bit != blend->cb_target_enabled_4bit ||
+             old_blend->logicop_enable != blend->logicop_enable)))
+               si_mark_atom_dirty(sctx, &sctx->msaa_config);
 }
 
 static void si_delete_blend_state(struct pipe_context *ctx, void *state)
@@ -1059,6 +1067,30 @@ static bool si_dsa_writes_stencil(const struct pipe_stencil_state *s)
                s->zpass_op != PIPE_STENCIL_OP_KEEP);
 }
 
+static bool si_order_invariant_stencil_op(enum pipe_stencil_op op)
+{
+       /* Returns true unless op is INCR, DECR or REPLACE. REPLACE is normally
+        * order invariant, except when the stencil reference value is written
+        * by the fragment shader; that is not tracked, so be conservative. */
+       return op != PIPE_STENCIL_OP_INCR &&
+              op != PIPE_STENCIL_OP_DECR &&
+              op != PIPE_STENCIL_OP_REPLACE;
+}
+
+/* Return whether this stencil state is order invariant, assuming Z writes are
+ * disabled: both the set of passing fragments and the final stencil buffer
+ * result must not depend on the order in which fragments arrive. */
+static bool si_order_invariant_stencil_state(const struct pipe_stencil_state *state)
+{
+       return !state->enabled || !state->writemask ||
+              /* Assumes Z writes are disabled: func ALWAYS checks zpass_op/zfail_op, func NEVER checks fail_op. */
+              (state->func == PIPE_FUNC_ALWAYS &&
+               si_order_invariant_stencil_op(state->zpass_op) &&
+               si_order_invariant_stencil_op(state->zfail_op)) ||
+              (state->func == PIPE_FUNC_NEVER &&
+               si_order_invariant_stencil_op(state->fail_op));
+}
+
 static void *si_create_dsa_state(struct pipe_context *ctx,
                                 const struct pipe_depth_stencil_alpha_state *state)
 {
@@ -1125,6 +1157,44 @@ static void *si_create_dsa_state(struct pipe_context *ctx,
                                      si_dsa_writes_stencil(&state->stencil[1]));
        dsa->db_can_write = dsa->depth_write_enabled ||
                            dsa->stencil_write_enabled;
+
+       bool zfunc_is_ordered =
+               state->depth.func == PIPE_FUNC_NEVER ||
+               state->depth.func == PIPE_FUNC_LESS ||
+               state->depth.func == PIPE_FUNC_LEQUAL ||
+               state->depth.func == PIPE_FUNC_GREATER ||
+               state->depth.func == PIPE_FUNC_GEQUAL;
+
+       bool nozwrite_and_order_invariant_stencil =
+               !dsa->db_can_write ||
+               (!dsa->depth_write_enabled &&
+                si_order_invariant_stencil_state(&state->stencil[0]) &&
+                si_order_invariant_stencil_state(&state->stencil[1]));
+
+       dsa->order_invariance[1].zs =
+               nozwrite_and_order_invariant_stencil ||
+               (!dsa->stencil_write_enabled && zfunc_is_ordered);
+       dsa->order_invariance[0].zs = !dsa->depth_write_enabled || zfunc_is_ordered;
+
+       dsa->order_invariance[1].pass_set =
+               nozwrite_and_order_invariant_stencil ||
+               (!dsa->stencil_write_enabled &&
+                (state->depth.func == PIPE_FUNC_ALWAYS ||
+                 state->depth.func == PIPE_FUNC_NEVER));
+       dsa->order_invariance[0].pass_set =
+               !dsa->depth_write_enabled ||
+               (state->depth.func == PIPE_FUNC_ALWAYS ||
+                state->depth.func == PIPE_FUNC_NEVER);
+
+       const bool assume_no_z_fights = false;
+
+       dsa->order_invariance[1].pass_last =
+               assume_no_z_fights && !dsa->stencil_write_enabled &&
+               dsa->depth_write_enabled && zfunc_is_ordered;
+       dsa->order_invariance[0].pass_last =
+               assume_no_z_fights &&
+               dsa->depth_write_enabled && zfunc_is_ordered;
+
        return dsa;
 }
 
@@ -1154,6 +1224,12 @@ static void si_bind_dsa_state(struct pipe_context *ctx, void *state)
              old_dsa->stencil_enabled != dsa->stencil_enabled ||
              old_dsa->db_can_write != dsa->db_can_write)))
                si_mark_atom_dirty(sctx, &sctx->dpbb_state);
+
+       if (sctx->screen->has_out_of_order_rast &&
+           (!old_dsa ||
+            memcmp(old_dsa->order_invariance, dsa->order_invariance,
+                   sizeof(old_dsa->order_invariance))))
+               si_mark_atom_dirty(sctx, &sctx->msaa_config);
 }
 
 static void si_delete_dsa_state(struct pipe_context *ctx, void *state)
@@ -1198,6 +1274,11 @@ static void si_set_occlusion_query_state(struct pipe_context *ctx,
        struct si_context *sctx = (struct si_context*)ctx;
 
        si_mark_atom_dirty(sctx, &sctx->db_render_state);
+
+       bool perfect_enable = sctx->b.num_perfect_occlusion_queries != 0;
+
+       if (perfect_enable != old_perfect_enable)
+               si_mark_atom_dirty(sctx, &sctx->msaa_config);
 }
 
 static void si_save_qbo_state(struct pipe_context *ctx, struct r600_qbo_state *st)
@@ -2549,6 +2630,11 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
        struct r600_texture *rtex;
        bool old_any_dst_linear = sctx->framebuffer.any_dst_linear;
        unsigned old_nr_samples = sctx->framebuffer.nr_samples;
+       unsigned old_colorbuf_enabled_4bit = sctx->framebuffer.colorbuf_enabled_4bit;
+       bool old_has_zsbuf = !!sctx->framebuffer.state.zsbuf;
+       bool old_has_stencil =
+               old_has_zsbuf &&
+               ((struct r600_texture*)sctx->framebuffer.state.zsbuf->texture)->surface.has_stencil;
        bool unbound = false;
        int i;
 
@@ -2706,15 +2792,17 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
                }
        }
 
+       struct r600_texture *zstex = NULL;
+
        if (state->zsbuf) {
                surf = (struct r600_surface*)state->zsbuf;
-               rtex = (struct r600_texture*)surf->base.texture;
+               zstex = (struct r600_texture*)surf->base.texture;
 
                if (!surf->depth_initialized) {
                        si_init_depth_surface(sctx, surf);
                }
 
-               if (vi_tc_compat_htile_enabled(rtex, surf->base.u.tex.level))
+               if (vi_tc_compat_htile_enabled(zstex, surf->base.u.tex.level))
                        sctx->framebuffer.DB_has_shader_readable_metadata = true;
 
                r600_context_add_resource_size(ctx, surf->base.texture);
@@ -2730,6 +2818,12 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
        if (sctx->framebuffer.any_dst_linear != old_any_dst_linear)
                si_mark_atom_dirty(sctx, &sctx->msaa_config);
 
+       if (sctx->screen->has_out_of_order_rast &&
+           (sctx->framebuffer.colorbuf_enabled_4bit != old_colorbuf_enabled_4bit ||
+            !!sctx->framebuffer.state.zsbuf != old_has_zsbuf ||
+            (zstex && zstex->surface.has_stencil != old_has_stencil)))
+               si_mark_atom_dirty(sctx, &sctx->msaa_config);
+
        if (sctx->framebuffer.nr_samples != old_nr_samples) {
                si_mark_atom_dirty(sctx, &sctx->msaa_config);
                si_mark_atom_dirty(sctx, &sctx->db_render_state);
@@ -3066,16 +3160,75 @@ static void si_emit_msaa_sample_locs(struct si_context *sctx,
        }
 }
 
+static bool si_out_of_order_rasterization(struct si_context *sctx)
+{
+       struct si_state_blend *blend = sctx->queued.named.blend;
+       struct si_state_dsa *dsa = sctx->queued.named.dsa;
+       /* Returns whether the queued blend/DSA state, the framebuffer and the PS allow out-of-order rasterization. */
+       if (!sctx->screen->has_out_of_order_rast)
+               return false;
+
+       unsigned colormask = sctx->framebuffer.colorbuf_enabled_4bit;
+       /* Keep only the bits enabled in both the framebuffer and the blend state; 0 without a blend state. */
+       if (blend) {
+               colormask &= blend->cb_target_enabled_4bit;
+       } else {
+               colormask = 0;
+       }
+
+       /* Conservative: No logic op. */
+       if (colormask && blend->logicop_enable)
+               return false;
+       /* Defaults that stay in effect when no Z/S buffer is bound. */
+       struct si_dsa_order_invariance dsa_order_invariant = {
+               .zs = true, .pass_set = true, .pass_last = false
+       };
+       /* NOTE(review): dsa is dereferenced below without a NULL check, unlike blend -- confirm a DSA state is always queued here. */
+       if (sctx->framebuffer.state.zsbuf) {
+               struct r600_texture *zstex =
+                       (struct r600_texture*)sctx->framebuffer.state.zsbuf->texture;
+               bool has_stencil = zstex->surface.has_stencil;
+               dsa_order_invariant = dsa->order_invariance[has_stencil];
+               if (!dsa_order_invariant.zs)
+                       return false;
+
+               /* The set of PS invocations is always order invariant,
+                * except when early Z/S tests are requested. */
+               if (sctx->ps_shader.cso &&
+                   sctx->ps_shader.cso->info.writes_memory &&
+                   sctx->ps_shader.cso->info.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL] &&
+                   !dsa_order_invariant.pass_set)
+                       return false;
+
+               if (sctx->b.num_perfect_occlusion_queries != 0 &&
+                   !dsa_order_invariant.pass_set)
+                       return false;
+       }
+       /* No color writes: the Z/S checks above are all that is needed. */
+       if (!colormask)
+               return true;
+
+       bool blend_enabled = (colormask & blend->blend_enable_4bit) != 0;
+
+       if (blend_enabled)
+               return false; /* TODO: commutative blending is not taken into account yet */
+       /* Color is written without blending: only allowed if the last passing fragment is order invariant. */
+       return dsa_order_invariant.pass_last;
+}
+
 static void si_emit_msaa_config(struct si_context *sctx, struct r600_atom *atom)
 {
        struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
        unsigned num_tile_pipes = sctx->screen->b.info.num_tile_pipes;
        /* 33% faster rendering to linear color buffers */
        bool dst_is_linear = sctx->framebuffer.any_dst_linear;
+       bool out_of_order_rast = si_out_of_order_rasterization(sctx);
        unsigned sc_mode_cntl_1 =
                S_028A4C_WALK_SIZE(dst_is_linear) |
                S_028A4C_WALK_FENCE_ENABLE(!dst_is_linear) |
                S_028A4C_WALK_FENCE_SIZE(num_tile_pipes == 2 ? 2 : 3) |
+               S_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(out_of_order_rast) |
+               S_028A4C_OUT_OF_ORDER_WATER_MARK(0x7) |
                /* always 1: */
                S_028A4C_WALK_ALIGN8_PRIM_FITS_ST(1) |
                S_028A4C_SUPERTILE_WALK_ORDER_ENABLE(1) |
index 672229c15d9f60071b3682c35db577c2b664a3f7..4f14f89166d24f19e673886afa52804463e2a01a 100644 (file)
@@ -49,15 +49,16 @@ struct si_shader_selector;
 struct si_state_blend {
        struct si_pm4_state     pm4;
        uint32_t                cb_target_mask;
-       bool                    alpha_to_coverage;
-       bool                    alpha_to_one;
-       bool                    dual_src_blend;
        /* Set 0xf or 0x0 (4 bits) per render target if the following is
         * true. ANDed with spi_shader_col_format.
         */
        unsigned                cb_target_enabled_4bit;
        unsigned                blend_enable_4bit;
        unsigned                need_src_alpha_4bit;
+       bool                    alpha_to_coverage:1;
+       bool                    alpha_to_one:1;
+       bool                    dual_src_blend:1;
+       bool                    logicop_enable:1;
 };
 
 struct si_state_rasterizer {
@@ -89,15 +90,36 @@ struct si_dsa_stencil_ref_part {
        uint8_t                 writemask[2];
 };
 
+struct si_dsa_order_invariance {
+       /** Whether the final result in Z/S buffers is guaranteed to be
+        * invariant under changes to the order in which fragments arrive. */
+       bool zs:1;
+
+       /** Whether the set of fragments that pass the combined Z/S test is
+        * guaranteed to be invariant under changes to the order in which
+        * fragments arrive. */
+       bool pass_set:1;
+
+       /** Whether the last fragment that passes the combined Z/S test at each
+        * sample is guaranteed to be invariant under changes to the order in
+        * which fragments arrive (never set while assume_no_z_fights is false). */
+       bool pass_last:1;
+};
+
 struct si_state_dsa {
        struct si_pm4_state             pm4;
        struct si_dsa_stencil_ref_part  stencil_ref;
+
+       /* 0 = without stencil buffer, 1 = when both Z and S buffers are present */
+       struct si_dsa_order_invariance  order_invariance[2];
+
        ubyte                           alpha_func:3;
        bool                            depth_enabled:1;
        bool                            depth_write_enabled:1;
        bool                            stencil_enabled:1;
        bool                            stencil_write_enabled:1;
        bool                            db_can_write:1;
+
 };
 
 struct si_stencil_ref {
index bbc6b1d7080a54dd41c5bebfcd3acd32a138b377..53a60ba11edb9e36827c1c7ece14cf9199aae018 100644 (file)
@@ -2421,6 +2421,13 @@ static void si_bind_ps_shader(struct pipe_context *ctx, void *state)
                if (!old_sel ||
                    old_sel->info.colors_written != sel->info.colors_written)
                        si_mark_atom_dirty(sctx, &sctx->cb_render_state);
+
+               if (sctx->screen->has_out_of_order_rast &&
+                   (!old_sel ||
+                    old_sel->info.writes_memory != sel->info.writes_memory ||
+                    old_sel->info.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL] !=
+                    sel->info.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL]))
+                       si_mark_atom_dirty(sctx, &sctx->msaa_config);
        }
        si_set_active_descriptors_for_shader(sctx, sel);
 }