radeonsi/gfx10: export correct PrimitiveID from NGG vertex shaders
author Marek Olšák <marek.olsak@amd.com>
Thu, 6 Jun 2019 00:20:47 +0000 (20:20 -0400)
committer Marek Olšák <marek.olsak@amd.com>
Wed, 3 Jul 2019 19:51:13 +0000 (15:51 -0400)
Acked-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
src/gallium/drivers/radeonsi/si_gfx_cs.c
src/gallium/drivers/radeonsi/si_pipe.h
src/gallium/drivers/radeonsi/si_shader.c
src/gallium/drivers/radeonsi/si_shader.h
src/gallium/drivers/radeonsi/si_state.c
src/gallium/drivers/radeonsi/si_state.h
src/gallium/drivers/radeonsi/si_state_draw.c

index 8461a39488e82b85e140199469d9bb35a3f5af3a..8fbce10012f5ddb76a0b9c79f8161f767767022e 100644 (file)
@@ -616,6 +616,30 @@ void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi,
                emitted_prims = nggso.emit[0];
        }
 
+       /* Copy Primitive IDs from GS threads to the LDS address corresponding
+        * to the ES thread of the provoking vertex.
+        */
+       if (ctx->type == PIPE_SHADER_VERTEX &&
+           ctx->shader->key.mono.u.vs_export_prim_id) {
+               /* Streamout uses LDS. We need to wait for it before we can reuse it. */
+               if (sel->so.num_outputs)
+                       ac_build_s_barrier(&ctx->ac);
+
+               ac_build_ifcc(&ctx->ac, is_gs_thread, 5400);
+               /* Extract the PROVOKING_VTX_INDEX field. */
+               LLVMValueRef provoking_vtx_in_prim =
+                       si_unpack_param(ctx, ctx->param_vs_state_bits, 4, 2);
+
+               /* provoking_vtx_index = vtxindex[provoking_vtx_in_prim]; */
+               LLVMValueRef indices = ac_build_gather_values(&ctx->ac, vtxindex, 3);
+               LLVMValueRef provoking_vtx_index =
+                       LLVMBuildExtractElement(builder, indices, provoking_vtx_in_prim, "");
+
+               LLVMBuildStore(builder, ctx->abi.gs_prim_id,
+                              ac_build_gep0(&ctx->ac, ctx->esgs_ring, provoking_vtx_index));
+               ac_build_endif(&ctx->ac, 5400);
+       }
+
        /* TODO: primitive culling */
 
        build_sendmsg_gs_alloc_req(ctx, ngg_get_vtx_cnt(ctx), ngg_get_prim_cnt(ctx));
@@ -700,12 +724,23 @@ void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi,
                        }
                }
 
-               /* TODO: Vertex shaders have to get PrimitiveID from GS VGPRs. */
-               if (ctx->type == PIPE_SHADER_TESS_EVAL &&
-                   ctx->shader->key.mono.u.vs_export_prim_id) {
+               if (ctx->shader->key.mono.u.vs_export_prim_id) {
                        outputs[i].semantic_name = TGSI_SEMANTIC_PRIMID;
                        outputs[i].semantic_index = 0;
-                       outputs[i].values[0] = ac_to_float(&ctx->ac, si_get_primitive_id(ctx, 0));
+
+                       if (ctx->type == PIPE_SHADER_VERTEX) {
+                               /* Wait for GS stores to finish. */
+                               ac_build_s_barrier(&ctx->ac);
+
+                               tmp = ac_build_gep0(&ctx->ac, ctx->esgs_ring,
+                                                   get_thread_id_in_tg(ctx));
+                               outputs[i].values[0] = LLVMBuildLoad(builder, tmp, "");
+                       } else {
+                               assert(ctx->type == PIPE_SHADER_TESS_EVAL);
+                               outputs[i].values[0] = si_get_primitive_id(ctx, 0);
+                       }
+
+                       outputs[i].values[0] = ac_to_float(&ctx->ac, outputs[i].values[0]);
                        for (unsigned j = 1; j < 4; j++)
                                outputs[i].values[j] = LLVMGetUndef(ctx->f32);
 
index f32e64ea5707fa150086971dc3cf40be97b04d3c..277a25a0b3ee47304d528333544c67556f0887b6 100644 (file)
@@ -441,6 +441,7 @@ void si_begin_new_gfx_cs(struct si_context *ctx)
        ctx->last_prim = -1;
        ctx->last_multi_vgt_param = -1;
        ctx->last_rast_prim = -1;
+       ctx->last_flatshade_first = -1;
        ctx->last_sc_line_stipple = ~0;
        ctx->last_vs_state = ~0;
        ctx->last_ls = NULL;
index a9080c93505f9f87a4d1b973a062583bcb0e94c0..298e63738c47eb44ad94ad4cb2f1341dc44be3c3 100644 (file)
@@ -1059,6 +1059,7 @@ struct si_context {
        int                     last_prim;
        int                     last_multi_vgt_param;
        int                     last_rast_prim;
+       int                     last_flatshade_first;
        unsigned                last_sc_line_stipple;
        unsigned                current_vs_state;
        unsigned                last_vs_state;
index 032b5a7bd8b29253205a75c491c2c1511aba25b1..be3b897c791915bbfd337bf01b3069d43ef76d95 100644 (file)
@@ -5228,10 +5228,20 @@ static bool si_shader_binary_open(struct si_screen *screen,
                esgs_ring_size = shader->gs_info.esgs_ring_size;;
        }
 
-       if (sel && shader->key.as_ngg && sel->so.num_outputs) {
-               unsigned esgs_vertex_bytes = 4 * (4 * sel->info.num_outputs + 1);
-               esgs_ring_size = MAX2(esgs_ring_size,
-                                     shader->ngg.max_out_verts * esgs_vertex_bytes);
+       if (sel && shader->key.as_ngg) {
+               if (sel->so.num_outputs) {
+                       unsigned esgs_vertex_bytes = 4 * (4 * sel->info.num_outputs + 1);
+                       esgs_ring_size = MAX2(esgs_ring_size,
+                                             shader->ngg.max_out_verts * esgs_vertex_bytes);
+               }
+
+               /* GS stores Primitive IDs into LDS at the address corresponding
+                * to the provoking vertex. All vertex threads load and export
+                * PrimitiveID for their thread.
+                */
+               if (sel->type == PIPE_SHADER_VERTEX &&
+                   shader->key.mono.u.vs_export_prim_id)
+                       esgs_ring_size = MAX2(esgs_ring_size, shader->ngg.max_out_verts * 4);
        }
 
        if (esgs_ring_size) {
index b545bf1bc235dc46fdb81076a609420960328710..801895b240cb0d0bd283ec4cf3c264d0f6f2387d 100644 (file)
@@ -241,13 +241,14 @@ enum {
 };
 
 /* Fields of driver-defined VS state SGPR. */
-/* Clamp vertex color output (only used in VS as VS). */
 #define S_VS_STATE_CLAMP_VERTEX_COLOR(x)       (((unsigned)(x) & 0x1) << 0)
 #define C_VS_STATE_CLAMP_VERTEX_COLOR          0xFFFFFFFE
 #define S_VS_STATE_INDEXED(x)                  (((unsigned)(x) & 0x1) << 1)
 #define C_VS_STATE_INDEXED                     0xFFFFFFFD
 #define S_VS_STATE_OUTPRIM(x)                  (((unsigned)(x) & 0x3) << 2)
 #define C_VS_STATE_OUTPRIM                     0xFFFFFFF3
+#define S_VS_STATE_PROVOKING_VTX_INDEX(x)      (((unsigned)(x) & 0x3) << 4)
+#define C_VS_STATE_PROVOKING_VTX_INDEX         0xFFFFFFCF
 #define S_VS_STATE_LS_OUT_PATCH_SIZE(x)                (((unsigned)(x) & 0x1FFF) << 8)
 #define C_VS_STATE_LS_OUT_PATCH_SIZE           0xFFE000FF
 #define S_VS_STATE_LS_OUT_VERTEX_SIZE(x)       (((unsigned)(x) & 0xFF) << 24)
index e9388e6252ce8dc7b06bb0902e690c449e2afc61..9f3e08675ac78ea6cb76b6f3707e15783168c40c 100644 (file)
@@ -892,6 +892,7 @@ static void *si_create_rs_state(struct pipe_context *ctx,
        rs->clamp_fragment_color = state->clamp_fragment_color;
        rs->clamp_vertex_color = state->clamp_vertex_color;
        rs->flatshade = state->flatshade;
+       rs->flatshade_first = state->flatshade_first;
        rs->sprite_coord_enable = state->sprite_coord_enable;
        rs->rasterizer_discard = state->rasterizer_discard;
        rs->pa_sc_line_stipple = state->line_stipple_enable ?
index 4493969037c9bfb7f6bd0a9427eadd1e35a562ce..91b4f1ea13eac110401f5fce2315039762200bee 100644 (file)
@@ -74,6 +74,7 @@ struct si_state_rasterizer {
        unsigned                clip_plane_enable:8;
        unsigned                half_pixel_center:1;
        unsigned                flatshade:1;
+       unsigned                flatshade_first:1;
        unsigned                two_side:1;
        unsigned                multisample_enable:1;
        unsigned                force_persample_interp:1;
index eddfdd65da259a2aad98989e50af8758b45d50c5..2f142bc67810c7cbb8307f2f4f7a47ef0f290141 100644 (file)
@@ -586,7 +586,9 @@ static void si_emit_rasterizer_prim_state(struct si_context *sctx)
        struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
 
        if (likely(rast_prim == sctx->last_rast_prim &&
-                  rs->pa_sc_line_stipple == sctx->last_sc_line_stipple))
+                  rs->pa_sc_line_stipple == sctx->last_sc_line_stipple &&
+                  (sctx->chip_class <= GFX9 ||
+                   rs->flatshade_first == sctx->last_flatshade_first)))
                return;
 
        if (util_prim_is_lines(rast_prim)) {
@@ -599,9 +601,10 @@ static void si_emit_rasterizer_prim_state(struct si_context *sctx)
                sctx->context_roll = true;
        }
 
+       unsigned gs_out = si_conv_prim_to_gs_out(sctx->current_rast_prim);
+
        if (rast_prim != sctx->last_rast_prim &&
            (sctx->ngg || sctx->gs_shader.cso)) {
-               unsigned gs_out = si_conv_prim_to_gs_out(sctx->current_rast_prim);
                radeon_set_context_reg(cs, R_028A6C_VGT_GS_OUT_PRIM_TYPE, gs_out);
                sctx->context_roll = true;
 
@@ -611,8 +614,15 @@ static void si_emit_rasterizer_prim_state(struct si_context *sctx)
                }
        }
 
+       if (sctx->chip_class >= GFX10) {
+               unsigned vtx_index = rs->flatshade_first ? 0 : gs_out;
+               sctx->current_vs_state &= C_VS_STATE_PROVOKING_VTX_INDEX;
+               sctx->current_vs_state |= S_VS_STATE_PROVOKING_VTX_INDEX(vtx_index);
+       }
+
        sctx->last_rast_prim = rast_prim;
        sctx->last_sc_line_stipple = rs->pa_sc_line_stipple;
+       sctx->last_flatshade_first = rs->flatshade_first;
 }
 
 static void si_emit_vs_state(struct si_context *sctx,