radeonsi/gfx10: combine hw edgeflags with user edgeflags for correct behavior
author Marek Olšák <marek.olsak@amd.com>
Sat, 6 Jul 2019 04:12:26 +0000 (00:12 -0400)
committer Marek Olšák <marek.olsak@amd.com>
Sat, 20 Jul 2019 00:16:19 +0000 (20:16 -0400)
Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
src/gallium/drivers/radeonsi/si_shader.c
src/gallium/drivers/radeonsi/si_shader.h
src/gallium/drivers/radeonsi/si_state_shaders.c

index 5c410d79ec1c1d4a1e0b45883a8754eacc6dc357..6b3c1017fb2a549935cb9fd6df6c53662ad7161b 100644 (file)
@@ -500,8 +500,12 @@ static unsigned ngg_nogs_vertex_size(struct si_shader *shader)
 {
        unsigned lds_vertex_size = 0;
 
+       /* The edgeflag is always stored in the last element that's also
+        * used for padding to reduce LDS bank conflicts. */
        if (shader->selector->so.num_outputs)
                lds_vertex_size = 4 * shader->selector->info.num_outputs + 1;
+       if (shader->selector->ngg_writes_edgeflag)
+               lds_vertex_size = MAX2(lds_vertex_size, 1);
 
        return lds_vertex_size;
 }
@@ -541,7 +545,7 @@ void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi,
 
        LLVMValueRef vertex_ptr = NULL;
 
-       if (sel->so.num_outputs)
+       if (sel->so.num_outputs || sel->ngg_writes_edgeflag)
                vertex_ptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx));
 
        for (unsigned i = 0; i < info->num_outputs; i++) {
@@ -563,6 +567,19 @@ void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi,
                                LLVMBuildStore(builder, tmp2, tmp);
                        }
                }
+
+               /* Store the edgeflag at the end (if streamout is enabled) */
+               if (info->output_semantic_name[i] == TGSI_SEMANTIC_EDGEFLAG &&
+                   sel->ngg_writes_edgeflag) {
+                       LLVMValueRef edgeflag = LLVMBuildLoad(builder, addrs[4 * i], "");
+                       /* The output is a float, but the hw expects a 1-bit integer. */
+                       edgeflag = LLVMBuildFPToUI(ctx->ac.builder, edgeflag, ctx->i32, "");
+                       edgeflag = ac_build_umin(&ctx->ac, edgeflag, ctx->i32_1);
+
+                       tmp = LLVMConstInt(ctx->i32, ngg_nogs_vertex_size(ctx->shader) - 1, 0);
+                       tmp = ac_build_gep0(&ctx->ac, vertex_ptr, tmp);
+                       LLVMBuildStore(builder, edgeflag, tmp);
+               }
        }
 
        lp_build_endif(&ctx->merged_wrap_if_state);
@@ -623,13 +640,35 @@ void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi,
                emitted_prims = nggso.emit[0];
        }
 
+       LLVMValueRef user_edgeflags[3] = {};
+
+       if (sel->ngg_writes_edgeflag) {
+               /* Streamout already inserted the barrier, so don't insert it again. */
+               if (!sel->so.num_outputs)
+                       ac_build_s_barrier(&ctx->ac);
+
+               ac_build_ifcc(&ctx->ac, is_gs_thread, 5400);
+               /* Load edge flags from ES threads and store them into VGPRs in GS threads. */
+               for (unsigned i = 0; i < num_vertices; i++) {
+                       tmp = ngg_nogs_vertex_ptr(ctx, vtxindex[i]);
+                       tmp2 = LLVMConstInt(ctx->i32, ngg_nogs_vertex_size(ctx->shader) - 1, 0);
+                       tmp = ac_build_gep0(&ctx->ac, tmp, tmp2);
+                       tmp = LLVMBuildLoad(builder, tmp, "");
+                       tmp = LLVMBuildTrunc(builder, tmp, ctx->i1, "");
+
+                       user_edgeflags[i] = ac_build_alloca_undef(&ctx->ac, ctx->i1, "");
+                       LLVMBuildStore(builder, tmp, user_edgeflags[i]);
+               }
+               ac_build_endif(&ctx->ac, 5400);
+       }
+
        /* Copy Primitive IDs from GS threads to the LDS address corresponding
         * to the ES thread of the provoking vertex.
         */
        if (ctx->type == PIPE_SHADER_VERTEX &&
            ctx->shader->key.mono.u.vs_export_prim_id) {
-               /* Streamout uses LDS. We need to wait for it before we can reuse it. */
-               if (sel->so.num_outputs)
+               /* Streamout and edge flags use LDS. Make it idle, so that we can reuse it. */
+               if (sel->so.num_outputs || sel->ngg_writes_edgeflag)
                        ac_build_s_barrier(&ctx->ac);
 
                ac_build_ifcc(&ctx->ac, is_gs_thread, 5400);
@@ -647,8 +686,6 @@ void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi,
                ac_build_endif(&ctx->ac, 5400);
        }
 
-       /* TODO: primitive culling */
-
        build_sendmsg_gs_alloc_req(ctx, ngg_get_vtx_cnt(ctx), ngg_get_prim_cnt(ctx));
 
        /* Update query buffer */
@@ -711,9 +748,20 @@ void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi,
                memcpy(prim.index, vtxindex, sizeof(vtxindex[0]) * 3);
 
                for (unsigned i = 0; i < num_vertices; ++i) {
+                       if (ctx->type != PIPE_SHADER_VERTEX) {
+                               prim.edgeflag[i] = ctx->i1false;
+                               continue;
+                       }
+
                        tmp = LLVMBuildLShr(builder, ctx->abi.gs_invocation_id,
                                            LLVMConstInt(ctx->ac.i32, 8 + i, false), "");
                        prim.edgeflag[i] = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
+
+                       if (sel->ngg_writes_edgeflag) {
+                               tmp2 = LLVMBuildLoad(builder, user_edgeflags[i], "");
+                               prim.edgeflag[i] = LLVMBuildAnd(builder, prim.edgeflag[i],
+                                                               tmp2, "");
+                       }
                }
 
                build_export_prim(ctx, &prim);
index 75f2c241d3380695894484c4b87af3a18d3822d0..a3578ebf720efda6f44d437108b56e1a73d10421 100644 (file)
@@ -2964,11 +2964,11 @@ void si_llvm_export_vs(struct si_shader_context *ctx,
 
        /* Write the misc vector (point size, edgeflag, layer, viewport). */
        if (shader->selector->info.writes_psize ||
-           shader->selector->info.writes_edgeflag ||
+           shader->selector->pos_writes_edgeflag ||
            shader->selector->info.writes_viewport_index ||
            shader->selector->info.writes_layer) {
                pos_args[1].enabled_channels = shader->selector->info.writes_psize |
-                                              (shader->selector->info.writes_edgeflag << 1) |
+                                              (shader->selector->pos_writes_edgeflag << 1) |
                                               (shader->selector->info.writes_layer << 2);
 
                pos_args[1].valid_mask = 0; /* EXEC mask */
@@ -2983,7 +2983,7 @@ void si_llvm_export_vs(struct si_shader_context *ctx,
                if (shader->selector->info.writes_psize)
                        pos_args[1].out[0] = psize_value;
 
-               if (shader->selector->info.writes_edgeflag) {
+               if (shader->selector->pos_writes_edgeflag) {
                        /* The output is a float, but the hw expects an integer
                         * with the first bit containing the edge flag. */
                        edgeflag_value = LLVMBuildFPToUI(ctx->ac.builder,
index 070ccd2ad1775a1cc611ebd51aebd88999726e33..211e2db169782280794b84bc9c34dd0e7011cabb 100644 (file)
@@ -351,6 +351,8 @@ struct si_shader_selector {
        bool            vs_needs_prolog;
        bool            force_correct_derivs_after_kill;
        bool            prim_discard_cs_allowed;
+       bool            ngg_writes_edgeflag;
+       bool            pos_writes_edgeflag;
        unsigned        pa_cl_vs_out_cntl;
        ubyte           clipdist_mask;
        ubyte           culldist_mask;
index b381b3953bc54e5aa74fd79b0129966af9b334c8..52b3489509dd61a930cffa59f0bdf2975162ee4d 100644 (file)
@@ -1216,17 +1216,13 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
                S_028B90_EN_MAX_VERT_OUT_PER_GS_INSTANCE(
                        shader->ngg.max_vert_out_per_gs_instance);
 
-       /* User edge flags are set by the pos exports. If user edge flags are
-        * not used, we must use hw-generated edge flags and pass them via
-        * the prim export to prevent drawing lines on internal edges of
-        * decomposed primitives (such as quads) with polygon mode = lines.
-        *
-        * TODO: We should combine hw-generated edge flags with user edge
-        *       flags in the shader.
+       /* Always output hw-generated edge flags and pass them via the prim
+        * export to prevent drawing lines on internal edges of decomposed
+        * primitives (such as quads) with polygon mode = lines. Only VS needs
+        * this.
         */
        shader->ctx_reg.ngg.pa_cl_ngg_cntl =
-               S_028838_INDEX_BUF_EDGE_FLAG_ENA(gs_type == PIPE_SHADER_VERTEX &&
-                                                !gs_info->writes_edgeflag);
+               S_028838_INDEX_BUF_EDGE_FLAG_ENA(gs_type == PIPE_SHADER_VERTEX);
 
        shader->ge_cntl =
                S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) |
@@ -2671,6 +2667,14 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
                !sel->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION] &&
                !sel->so.num_outputs;
 
+       if (sel->type == PIPE_SHADER_VERTEX &&
+           sel->info.writes_edgeflag) {
+               if (sscreen->info.chip_class >= GFX10)
+                       sel->ngg_writes_edgeflag = true;
+               else
+                       sel->pos_writes_edgeflag = true;
+       }
+
        /* Set which opcode uses which (i,j) pair. */
        if (sel->info.uses_persp_opcode_interp_centroid)
                sel->info.uses_persp_centroid = true;
@@ -2817,11 +2821,11 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
 
        /* PA_CL_VS_OUT_CNTL */
        bool misc_vec_ena =
-               sel->info.writes_psize || sel->info.writes_edgeflag ||
+               sel->info.writes_psize || sel->pos_writes_edgeflag ||
                sel->info.writes_layer || sel->info.writes_viewport_index;
        sel->pa_cl_vs_out_cntl =
                S_02881C_USE_VTX_POINT_SIZE(sel->info.writes_psize) |
-               S_02881C_USE_VTX_EDGE_FLAG(sel->info.writes_edgeflag) |
+               S_02881C_USE_VTX_EDGE_FLAG(sel->pos_writes_edgeflag) |
                S_02881C_USE_VTX_RENDER_TARGET_INDX(sel->info.writes_layer) |
                S_02881C_USE_VTX_VIEWPORT_INDX(sel->info.writes_viewport_index) |
                S_02881C_VS_OUT_MISC_VEC_ENA(misc_vec_ena) |