- struct si_shader_selector *sel = shader->selector;
- const ubyte *num_components = sel->info.num_stream_output_components;
- unsigned gs_num_invocations = sel->gs_num_invocations;
- struct si_pm4_state *pm4;
- uint64_t va;
- unsigned max_stream = sel->max_gs_stream;
- unsigned offset;
-
- pm4 = si_get_shader_pm4_state(shader);
- if (!pm4)
- return;
-
- offset = num_components[0] * sel->gs_max_out_vertices;
- si_pm4_set_reg(pm4, R_028A60_VGT_GSVS_RING_OFFSET_1, offset);
- if (max_stream >= 1)
- offset += num_components[1] * sel->gs_max_out_vertices;
- si_pm4_set_reg(pm4, R_028A64_VGT_GSVS_RING_OFFSET_2, offset);
- if (max_stream >= 2)
- offset += num_components[2] * sel->gs_max_out_vertices;
- si_pm4_set_reg(pm4, R_028A68_VGT_GSVS_RING_OFFSET_3, offset);
- si_pm4_set_reg(pm4, R_028A6C_VGT_GS_OUT_PRIM_TYPE,
- si_conv_prim_to_gs_out(sel->gs_output_prim));
- if (max_stream >= 3)
- offset += num_components[3] * sel->gs_max_out_vertices;
- si_pm4_set_reg(pm4, R_028AB0_VGT_GSVS_RING_ITEMSIZE, offset);
-
- /* The GSVS_RING_ITEMSIZE register takes 15 bits */
- assert(offset < (1 << 15));
-
- si_pm4_set_reg(pm4, R_028B38_VGT_GS_MAX_VERT_OUT, sel->gs_max_out_vertices);
-
- si_pm4_set_reg(pm4, R_028B5C_VGT_GS_VERT_ITEMSIZE, num_components[0]);
- si_pm4_set_reg(pm4, R_028B60_VGT_GS_VERT_ITEMSIZE_1, (max_stream >= 1) ? num_components[1] : 0);
- si_pm4_set_reg(pm4, R_028B64_VGT_GS_VERT_ITEMSIZE_2, (max_stream >= 2) ? num_components[2] : 0);
- si_pm4_set_reg(pm4, R_028B68_VGT_GS_VERT_ITEMSIZE_3, (max_stream >= 3) ? num_components[3] : 0);
-
- si_pm4_set_reg(pm4, R_028B90_VGT_GS_INSTANCE_CNT,
- S_028B90_CNT(MIN2(gs_num_invocations, 127)) |
- S_028B90_ENABLE(gs_num_invocations > 0));
-
- va = shader->bo->gpu_address;
- si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY);
-
- if (sscreen->info.chip_class >= GFX9) {
- unsigned input_prim = sel->info.properties[TGSI_PROPERTY_GS_INPUT_PRIM];
- unsigned es_type = shader->key.part.gs.es->type;
- unsigned es_vgpr_comp_cnt, gs_vgpr_comp_cnt;
- struct gfx9_gs_info gs_info;
-
- if (es_type == PIPE_SHADER_VERTEX)
- /* VGPR0-3: (VertexID, InstanceID / StepRate0, ...) */
- es_vgpr_comp_cnt = shader->info.uses_instanceid ? 1 : 0;
- else if (es_type == PIPE_SHADER_TESS_EVAL)
- es_vgpr_comp_cnt = shader->key.part.gs.es->info.uses_primid ? 3 : 2;
- else
- unreachable("invalid shader selector type");
-
- /* If offsets 4, 5 are used, GS_VGPR_COMP_CNT is ignored and
- * VGPR[0:4] are always loaded.
- */
- if (sel->info.uses_invocationid)
- gs_vgpr_comp_cnt = 3; /* VGPR3 contains InvocationID. */
- else if (sel->info.uses_primid)
- gs_vgpr_comp_cnt = 2; /* VGPR2 contains PrimitiveID. */
- else if (input_prim >= PIPE_PRIM_TRIANGLES)
- gs_vgpr_comp_cnt = 1; /* VGPR1 contains offsets 2, 3 */
- else
- gs_vgpr_comp_cnt = 0; /* VGPR0 contains offsets 0, 1 */
-
- unsigned num_user_sgprs;
- if (es_type == PIPE_SHADER_VERTEX)
- num_user_sgprs = si_get_num_vs_user_sgprs(GFX9_VSGS_NUM_USER_SGPR);
- else
- num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR;
-
- gfx9_get_gs_info(shader->key.part.gs.es, sel, &gs_info);
-
- si_pm4_set_reg(pm4, R_00B210_SPI_SHADER_PGM_LO_ES, va >> 8);
- si_pm4_set_reg(pm4, R_00B214_SPI_SHADER_PGM_HI_ES, S_00B214_MEM_BASE(va >> 40));
-
- si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS,
- S_00B228_VGPRS((shader->config.num_vgprs - 1) / 4) |
- S_00B228_SGPRS((shader->config.num_sgprs - 1) / 8) |
- S_00B228_DX10_CLAMP(1) |
- S_00B228_FLOAT_MODE(shader->config.float_mode) |
- S_00B228_GS_VGPR_COMP_CNT(gs_vgpr_comp_cnt));
- si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS,
- S_00B22C_USER_SGPR(num_user_sgprs) |
- S_00B22C_USER_SGPR_MSB(num_user_sgprs >> 5) |
- S_00B22C_ES_VGPR_COMP_CNT(es_vgpr_comp_cnt) |
- S_00B22C_OC_LDS_EN(es_type == PIPE_SHADER_TESS_EVAL) |
- S_00B22C_LDS_SIZE(gs_info.lds_size) |
- S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
-
- si_pm4_set_reg(pm4, R_028A44_VGT_GS_ONCHIP_CNTL,
- S_028A44_ES_VERTS_PER_SUBGRP(gs_info.es_verts_per_subgroup) |
- S_028A44_GS_PRIMS_PER_SUBGRP(gs_info.gs_prims_per_subgroup) |
- S_028A44_GS_INST_PRIMS_IN_SUBGRP(gs_info.gs_inst_prims_in_subgroup));
- si_pm4_set_reg(pm4, R_028A94_VGT_GS_MAX_PRIMS_PER_SUBGROUP,
- S_028A94_MAX_PRIMS_PER_SUBGROUP(gs_info.max_prims_per_subgroup));
- si_pm4_set_reg(pm4, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
- shader->key.part.gs.es->esgs_itemsize / 4);
-
- if (es_type == PIPE_SHADER_TESS_EVAL)
- si_set_tesseval_regs(sscreen, shader->key.part.gs.es, pm4);
-
- polaris_set_vgt_vertex_reuse(sscreen, shader->key.part.gs.es,
- NULL, pm4);
- } else {
- si_pm4_set_reg(pm4, R_00B220_SPI_SHADER_PGM_LO_GS, va >> 8);
- si_pm4_set_reg(pm4, R_00B224_SPI_SHADER_PGM_HI_GS, S_00B224_MEM_BASE(va >> 40));
-
- si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS,
- S_00B228_VGPRS((shader->config.num_vgprs - 1) / 4) |
- S_00B228_SGPRS((shader->config.num_sgprs - 1) / 8) |
- S_00B228_DX10_CLAMP(1) |
- S_00B228_FLOAT_MODE(shader->config.float_mode));
- si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS,
- S_00B22C_USER_SGPR(GFX6_GS_NUM_USER_SGPR) |
- S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
- }
+ struct si_shader_selector *sel = shader->selector;
+ const ubyte *num_components = sel->info.num_stream_output_components;
+ unsigned gs_num_invocations = sel->gs_num_invocations;
+ struct si_pm4_state *pm4;
+ uint64_t va;
+ unsigned max_stream = sel->max_gs_stream;
+ unsigned offset;
+
+ pm4 = si_get_shader_pm4_state(shader);
+ if (!pm4)
+ return;
+
+ pm4->atom.emit = si_emit_shader_gs;
+
+ offset = num_components[0] * sel->gs_max_out_vertices;
+ shader->ctx_reg.gs.vgt_gsvs_ring_offset_1 = offset;
+
+ if (max_stream >= 1)
+ offset += num_components[1] * sel->gs_max_out_vertices;
+ shader->ctx_reg.gs.vgt_gsvs_ring_offset_2 = offset;
+
+ if (max_stream >= 2)
+ offset += num_components[2] * sel->gs_max_out_vertices;
+ shader->ctx_reg.gs.vgt_gsvs_ring_offset_3 = offset;
+
+ if (max_stream >= 3)
+ offset += num_components[3] * sel->gs_max_out_vertices;
+ shader->ctx_reg.gs.vgt_gsvs_ring_itemsize = offset;
+
+ /* The GSVS_RING_ITEMSIZE register takes 15 bits */
+ assert(offset < (1 << 15));
+
+ shader->ctx_reg.gs.vgt_gs_max_vert_out = sel->gs_max_out_vertices;
+
+ shader->ctx_reg.gs.vgt_gs_vert_itemsize = num_components[0];
+ shader->ctx_reg.gs.vgt_gs_vert_itemsize_1 = (max_stream >= 1) ? num_components[1] : 0;
+ shader->ctx_reg.gs.vgt_gs_vert_itemsize_2 = (max_stream >= 2) ? num_components[2] : 0;
+ shader->ctx_reg.gs.vgt_gs_vert_itemsize_3 = (max_stream >= 3) ? num_components[3] : 0;
+
+ shader->ctx_reg.gs.vgt_gs_instance_cnt =
+ S_028B90_CNT(MIN2(gs_num_invocations, 127)) | S_028B90_ENABLE(gs_num_invocations > 0);
+
+ va = shader->bo->gpu_address;
+
+ if (sscreen->info.chip_class >= GFX9) {
+ unsigned input_prim = sel->info.base.gs.input_primitive;
+ gl_shader_stage es_stage = shader->key.part.gs.es->info.stage;
+ unsigned es_vgpr_comp_cnt, gs_vgpr_comp_cnt;
+
+ if (es_stage == MESA_SHADER_VERTEX) {
+ es_vgpr_comp_cnt = si_get_vs_vgpr_comp_cnt(sscreen, shader, false);
+ } else if (es_stage == MESA_SHADER_TESS_EVAL)
+ es_vgpr_comp_cnt = shader->key.part.gs.es->info.uses_primid ? 3 : 2;
+ else
+ unreachable("invalid shader selector type");
+
+ /* If offsets 4, 5 are used, GS_VGPR_COMP_CNT is ignored and
+ * VGPR[0:4] are always loaded.
+ */
+ if (sel->info.uses_invocationid)
+ gs_vgpr_comp_cnt = 3; /* VGPR3 contains InvocationID. */
+ else if (sel->info.uses_primid)
+ gs_vgpr_comp_cnt = 2; /* VGPR2 contains PrimitiveID. */
+ else if (input_prim >= PIPE_PRIM_TRIANGLES)
+ gs_vgpr_comp_cnt = 1; /* VGPR1 contains offsets 2, 3 */
+ else
+ gs_vgpr_comp_cnt = 0; /* VGPR0 contains offsets 0, 1 */
+
+ unsigned num_user_sgprs;
+ if (es_stage == MESA_SHADER_VERTEX)
+ num_user_sgprs = si_get_num_vs_user_sgprs(shader, GFX9_VSGS_NUM_USER_SGPR);
+ else
+ num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR;
+
+ if (sscreen->info.chip_class >= GFX10) {
+ si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
+ si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, S_00B324_MEM_BASE(va >> 40));
+ } else {
+ si_pm4_set_reg(pm4, R_00B210_SPI_SHADER_PGM_LO_ES, va >> 8);
+ si_pm4_set_reg(pm4, R_00B214_SPI_SHADER_PGM_HI_ES, S_00B214_MEM_BASE(va >> 40));
+ }
+
+ uint32_t rsrc1 = S_00B228_VGPRS((shader->config.num_vgprs - 1) / 4) | S_00B228_DX10_CLAMP(1) |
+ S_00B228_MEM_ORDERED(sscreen->info.chip_class >= GFX10) |
+ S_00B228_WGP_MODE(sscreen->info.chip_class >= GFX10) |
+ S_00B228_FLOAT_MODE(shader->config.float_mode) |
+ S_00B228_GS_VGPR_COMP_CNT(gs_vgpr_comp_cnt);
+ uint32_t rsrc2 = S_00B22C_USER_SGPR(num_user_sgprs) |
+ S_00B22C_ES_VGPR_COMP_CNT(es_vgpr_comp_cnt) |
+ S_00B22C_OC_LDS_EN(es_stage == MESA_SHADER_TESS_EVAL) |
+ S_00B22C_LDS_SIZE(shader->config.lds_size) |
+ S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0);
+
+ if (sscreen->info.chip_class >= GFX10) {
+ rsrc2 |= S_00B22C_USER_SGPR_MSB_GFX10(num_user_sgprs >> 5);
+ } else {
+ rsrc1 |= S_00B228_SGPRS((shader->config.num_sgprs - 1) / 8);
+ rsrc2 |= S_00B22C_USER_SGPR_MSB_GFX9(num_user_sgprs >> 5);
+ }
+
+ si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS, rsrc1);
+ si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, rsrc2);
+
+ if (sscreen->info.chip_class >= GFX10) {
+ si_pm4_set_reg(pm4, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
+ S_00B204_CU_EN(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(0));
+ }
+
+ shader->ctx_reg.gs.vgt_gs_onchip_cntl =
+ S_028A44_ES_VERTS_PER_SUBGRP(shader->gs_info.es_verts_per_subgroup) |
+ S_028A44_GS_PRIMS_PER_SUBGRP(shader->gs_info.gs_prims_per_subgroup) |
+ S_028A44_GS_INST_PRIMS_IN_SUBGRP(shader->gs_info.gs_inst_prims_in_subgroup);
+ shader->ctx_reg.gs.vgt_gs_max_prims_per_subgroup =
+ S_028A94_MAX_PRIMS_PER_SUBGROUP(shader->gs_info.max_prims_per_subgroup);
+ shader->ctx_reg.gs.vgt_esgs_ring_itemsize = shader->key.part.gs.es->esgs_itemsize / 4;
+
+ if (es_stage == MESA_SHADER_TESS_EVAL)
+ si_set_tesseval_regs(sscreen, shader->key.part.gs.es, pm4);
+
+ polaris_set_vgt_vertex_reuse(sscreen, shader->key.part.gs.es, NULL, pm4);
+ } else {
+ si_pm4_set_reg(pm4, R_00B220_SPI_SHADER_PGM_LO_GS, va >> 8);
+ si_pm4_set_reg(pm4, R_00B224_SPI_SHADER_PGM_HI_GS, S_00B224_MEM_BASE(va >> 40));
+
+ si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS,
+ S_00B228_VGPRS((shader->config.num_vgprs - 1) / 4) |
+ S_00B228_SGPRS((shader->config.num_sgprs - 1) / 8) |
+ S_00B228_DX10_CLAMP(1) | S_00B228_FLOAT_MODE(shader->config.float_mode));
+ si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS,
+ S_00B22C_USER_SGPR(GFX6_GS_NUM_USER_SGPR) |
+ S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
+ }
+}
+
+/**
+ * Write GE_PC_ALLOC unless the tracked-register state already records that
+ * the same value was written.
+ *
+ * \param sctx   context whose gfx command stream and tracked registers are used
+ * \param value  raw value to program into GE_PC_ALLOC
+ */
+static void gfx10_emit_ge_pc_alloc(struct si_context *sctx, unsigned value)
+{
+   const enum si_tracked_reg slot = SI_TRACKED_GE_PC_ALLOC;
+   const bool cached = ((sctx->tracked_regs.reg_saved >> slot) & 0x1) == 0x1;
+
+   /* Skip the write when the saved value is valid and unchanged. */
+   if (cached && sctx->tracked_regs.reg_value[slot] == value)
+      return;
+
+   struct radeon_cmdbuf *cs = sctx->gfx_cs;
+
+   /* SQ_NON_EVENT must be emitted before GE_PC_ALLOC is written. */
+   if (sctx->chip_class == GFX10) {
+      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+      radeon_emit(cs, EVENT_TYPE(V_028A90_SQ_NON_EVENT) | EVENT_INDEX(0));
+   }
+
+   radeon_set_uconfig_reg(cs, R_030980_GE_PC_ALLOC, value);
+
+   /* Record the write so that an identical request is skipped next time. */
+   sctx->tracked_regs.reg_value[slot] = value;
+   sctx->tracked_regs.reg_saved |= 0x1ull << slot;
+}
+
+/* Common tail code for NGG primitive shaders.
+ *
+ * Emits the context registers precomputed in shader->ctx_reg.ngg (plus
+ * shader->pa_cl_vs_out_cntl), sets sctx->context_roll if the command stream
+ * position differs from initial_cdw, and finally emits GE_PC_ALLOC.
+ *
+ * sctx:        context providing the gfx command stream and tracked registers
+ * shader:      NGG shader whose precomputed register values are emitted
+ * initial_cdw: sctx->gfx_cs->current.cdw as sampled by the caller before it
+ *              emitted any of its own registers for this shader
+ */
+static void gfx10_emit_shader_ngg_tail(struct si_context *sctx, struct si_shader *shader,
+ unsigned initial_cdw)
+{
+ radeon_opt_set_context_reg(sctx, R_0287FC_GE_MAX_OUTPUT_PER_SUBGROUP,
+ SI_TRACKED_GE_MAX_OUTPUT_PER_SUBGROUP,
+ shader->ctx_reg.ngg.ge_max_output_per_subgroup);
+ radeon_opt_set_context_reg(sctx, R_028B4C_GE_NGG_SUBGRP_CNTL, SI_TRACKED_GE_NGG_SUBGRP_CNTL,
+ shader->ctx_reg.ngg.ge_ngg_subgrp_cntl);
+ radeon_opt_set_context_reg(sctx, R_028A84_VGT_PRIMITIVEID_EN, SI_TRACKED_VGT_PRIMITIVEID_EN,
+ shader->ctx_reg.ngg.vgt_primitiveid_en);
+ radeon_opt_set_context_reg(sctx, R_028A44_VGT_GS_ONCHIP_CNTL, SI_TRACKED_VGT_GS_ONCHIP_CNTL,
+ shader->ctx_reg.ngg.vgt_gs_onchip_cntl);
+ radeon_opt_set_context_reg(sctx, R_028B90_VGT_GS_INSTANCE_CNT, SI_TRACKED_VGT_GS_INSTANCE_CNT,
+ shader->ctx_reg.ngg.vgt_gs_instance_cnt);
+ radeon_opt_set_context_reg(sctx, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
+ SI_TRACKED_VGT_ESGS_RING_ITEMSIZE,
+ shader->ctx_reg.ngg.vgt_esgs_ring_itemsize);
+ radeon_opt_set_context_reg(sctx, R_0286C4_SPI_VS_OUT_CONFIG, SI_TRACKED_SPI_VS_OUT_CONFIG,
+ shader->ctx_reg.ngg.spi_vs_out_config);
+ radeon_opt_set_context_reg2( /* two values in one call: IDX_FORMAT and POS_FORMAT */
+ sctx, R_028708_SPI_SHADER_IDX_FORMAT, SI_TRACKED_SPI_SHADER_IDX_FORMAT,
+ shader->ctx_reg.ngg.spi_shader_idx_format, shader->ctx_reg.ngg.spi_shader_pos_format);
+ radeon_opt_set_context_reg(sctx, R_028818_PA_CL_VTE_CNTL, SI_TRACKED_PA_CL_VTE_CNTL,
+ shader->ctx_reg.ngg.pa_cl_vte_cntl);
+ radeon_opt_set_context_reg(sctx, R_028838_PA_CL_NGG_CNTL, SI_TRACKED_PA_CL_NGG_CNTL,
+ shader->ctx_reg.ngg.pa_cl_ngg_cntl);
+
+ radeon_opt_set_context_reg_rmw(sctx, R_02881C_PA_CL_VS_OUT_CNTL, /* only the __VS_MASK bits; NOTE(review): presumably other bits belong to another state - confirm */
+ SI_TRACKED_PA_CL_VS_OUT_CNTL__VS, shader->pa_cl_vs_out_cntl,
+ SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK);
+
+ if (initial_cdw != sctx->gfx_cs->current.cdw) /* the command stream advanced since the caller sampled it */
+ sctx->context_roll = true;
+
+ /* GE_PC_ALLOC is not a context register, so it doesn't cause a context roll. */
+ gfx10_emit_ge_pc_alloc(sctx, shader->ctx_reg.ngg.ge_pc_alloc);
+}
+
+/**
+ * Emit callback for an NGG shader used without tessellation and without a
+ * geometry shader: only the common NGG tail is emitted.
+ *
+ * \param sctx  context; the shader is taken from sctx->queued.named.gs
+ */
+static void gfx10_emit_shader_ngg_notess_nogs(struct si_context *sctx)
+{
+   struct si_shader *ngg = sctx->queued.named.gs->shader;
+   const unsigned cdw_at_entry = sctx->gfx_cs->current.cdw;
+
+   /* No queued shader means there is nothing to emit. */
+   if (ngg)
+      gfx10_emit_shader_ngg_tail(sctx, ngg, cdw_at_entry);
+}
+
+/**
+ * Emit callback for an NGG shader used with tessellation but without a
+ * geometry shader: VGT_TF_PARAM followed by the common NGG tail.
+ *
+ * \param sctx  context; the shader is taken from sctx->queued.named.gs
+ */
+static void gfx10_emit_shader_ngg_tess_nogs(struct si_context *sctx)
+{
+   struct si_shader *ngg = sctx->queued.named.gs->shader;
+   const unsigned cdw_at_entry = sctx->gfx_cs->current.cdw;
+
+   /* No queued shader means there is nothing to emit. */
+   if (ngg) {
+      radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM, SI_TRACKED_VGT_TF_PARAM,
+                                 ngg->vgt_tf_param);
+
+      gfx10_emit_shader_ngg_tail(sctx, ngg, cdw_at_entry);
+   }
+}
+
+/**
+ * Emit callback for an NGG shader used with a geometry shader but without
+ * tessellation: VGT_GS_MAX_VERT_OUT followed by the common NGG tail.
+ *
+ * \param sctx  context; the shader is taken from sctx->queued.named.gs
+ */
+static void gfx10_emit_shader_ngg_notess_gs(struct si_context *sctx)
+{
+   struct si_shader *ngg = sctx->queued.named.gs->shader;
+   const unsigned cdw_at_entry = sctx->gfx_cs->current.cdw;
+
+   /* No queued shader means there is nothing to emit. */
+   if (ngg) {
+      radeon_opt_set_context_reg(sctx, R_028B38_VGT_GS_MAX_VERT_OUT,
+                                 SI_TRACKED_VGT_GS_MAX_VERT_OUT,
+                                 ngg->ctx_reg.ngg.vgt_gs_max_vert_out);
+
+      gfx10_emit_shader_ngg_tail(sctx, ngg, cdw_at_entry);
+   }
+}
+
+/**
+ * Emit callback for an NGG shader used with both tessellation and a geometry
+ * shader: VGT_GS_MAX_VERT_OUT, then VGT_TF_PARAM, then the common NGG tail.
+ *
+ * \param sctx  context; the shader is taken from sctx->queued.named.gs
+ */
+static void gfx10_emit_shader_ngg_tess_gs(struct si_context *sctx)
+{
+   struct si_shader *ngg = sctx->queued.named.gs->shader;
+   const unsigned cdw_at_entry = sctx->gfx_cs->current.cdw;
+
+   /* No queued shader means there is nothing to emit. */
+   if (ngg) {
+      radeon_opt_set_context_reg(sctx, R_028B38_VGT_GS_MAX_VERT_OUT,
+                                 SI_TRACKED_VGT_GS_MAX_VERT_OUT,
+                                 ngg->ctx_reg.ngg.vgt_gs_max_vert_out);
+      radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM, SI_TRACKED_VGT_TF_PARAM,
+                                 ngg->vgt_tf_param);
+
+      gfx10_emit_shader_ngg_tail(sctx, ngg, cdw_at_entry);
+   }
+}
+
+/**
+ * Return the input primitive type (a PIPE_PRIM_* value) seen by the given
+ * last-geometry-stage selector.
+ *
+ * \param gs  selector to query; its info.stage decides how the answer is derived
+ * \return    the geometry shader's declared input primitive, the primitive
+ *            produced by the tessellation mode, or PIPE_PRIM_TRIANGLES for
+ *            every other stage
+ */
+unsigned si_get_input_prim(const struct si_shader_selector *gs)
+{
+   switch (gs->info.stage) {
+   case MESA_SHADER_GEOMETRY:
+      return gs->info.base.gs.input_primitive;
+
+   case MESA_SHADER_TESS_EVAL:
+      /* Point mode takes precedence over the tessellation primitive mode. */
+      if (gs->info.base.tess.point_mode)
+         return PIPE_PRIM_POINTS;
+      return gs->info.base.tess.primitive_mode == GL_LINES ? PIPE_PRIM_LINES
+                                                           : PIPE_PRIM_TRIANGLES;
+
+   default:
+      /* TODO: Set this correctly if the primitive type is set in the shader key. */
+      return PIPE_PRIM_TRIANGLES; /* worst case for all callers */
+   }
+}
+
+/**
+ * Build the PA_CL_VS_OUT_CNTL bits that depend on which special outputs the
+ * shader writes.
+ *
+ * \param sel  selector whose info.writes_* flags are inspected
+ * \param ngg  true for NGG; the edge-flag output is then never enabled here
+ * \return     OR of the S_02881C_* fields for point size, edge flag, layer,
+ *             viewport index and the two "misc vec" enables
+ */
+static unsigned si_get_vs_out_cntl(const struct si_shader_selector *sel, bool ngg)
+{
+   /* The edge flag is only taken from the vertex output outside of NGG. */
+   const bool use_edgeflag = sel->info.writes_edgeflag && !ngg;
+   /* The misc vector is needed as soon as any of its components is written. */
+   const bool need_misc_vec = sel->info.writes_psize || use_edgeflag ||
+                              sel->info.writes_layer || sel->info.writes_viewport_index;
+   unsigned cntl = 0;
+
+   cntl |= S_02881C_USE_VTX_POINT_SIZE(sel->info.writes_psize);
+   cntl |= S_02881C_USE_VTX_EDGE_FLAG(use_edgeflag);
+   cntl |= S_02881C_USE_VTX_RENDER_TARGET_INDX(sel->info.writes_layer);
+   cntl |= S_02881C_USE_VTX_VIEWPORT_INDX(sel->info.writes_viewport_index);
+   cntl |= S_02881C_VS_OUT_MISC_VEC_ENA(need_misc_vec);
+   cntl |= S_02881C_VS_OUT_MISC_SIDE_BUS_ENA(need_misc_vec);
+
+   return cntl;
+}
+
+/**
+ * Prepare the PM4 image for \p shader, which will run as a merged ESGS shader
+ * in NGG mode.
+ *
+ * Values passed to si_pm4_set_reg() go into the shader's PM4 state. The
+ * values stored in shader->ctx_reg.ngg and shader->pa_cl_vs_out_cntl are
+ * only computed here; they are read by the gfx10_emit_shader_ngg_* callback
+ * that this function installs in pm4->atom.emit. shader->ge_cntl is also
+ * computed here (its consumer is not part of this function).
+ *
+ * \param sscreen  screen whose info (chip_class, family, min_good_cu_per_sa,
+ *                 use_late_alloc, pc_lines) selects hardware-specific values
+ * \param shader   shader variant to program; the function returns without
+ *                 doing anything if si_get_shader_pm4_state() yields NULL
+ */
+static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader)
+{
+ const struct si_shader_selector *gs_sel = shader->selector;
+ const struct si_shader_info *gs_info = &gs_sel->info;
+ const gl_shader_stage gs_stage = shader->selector->info.stage;
+ const struct si_shader_selector *es_sel =
+ shader->previous_stage_sel ? shader->previous_stage_sel : shader->selector; /* no previous stage: ES and GS are the same selector */
+ const struct si_shader_info *es_info = &es_sel->info;
+ const gl_shader_stage es_stage = es_sel->info.stage;
+ unsigned num_user_sgprs;
+ unsigned nparams, es_vgpr_comp_cnt, gs_vgpr_comp_cnt;
+ uint64_t va;
+ bool window_space = gs_info->stage == MESA_SHADER_VERTEX ?
+ gs_info->base.vs.window_space_position : 0;
+ bool es_enable_prim_id = shader->key.mono.u.vs_export_prim_id || es_info->uses_primid;
+ unsigned gs_num_invocations = MAX2(gs_sel->gs_num_invocations, 1); /* never less than 1 */
+ unsigned input_prim = si_get_input_prim(gs_sel);
+ bool break_wave_at_eoi = false;
+ struct si_pm4_state *pm4 = si_get_shader_pm4_state(shader);
+ if (!pm4)
+ return;
+
+ if (es_stage == MESA_SHADER_TESS_EVAL) { /* pick the emit callback matching the tess/GS combination */
+ pm4->atom.emit = gs_stage == MESA_SHADER_GEOMETRY ? gfx10_emit_shader_ngg_tess_gs
+ : gfx10_emit_shader_ngg_tess_nogs;
+ } else {
+ pm4->atom.emit = gs_stage == MESA_SHADER_GEOMETRY ? gfx10_emit_shader_ngg_notess_gs
+ : gfx10_emit_shader_ngg_notess_nogs;
+ }
+
+ va = shader->bo->gpu_address;
+
+ if (es_stage == MESA_SHADER_VERTEX) {
+ es_vgpr_comp_cnt = si_get_vs_vgpr_comp_cnt(sscreen, shader, false);
+
+ if (es_info->base.vs.blit_sgprs_amd) {
+ num_user_sgprs =
+ SI_SGPR_VS_BLIT_DATA + es_info->base.vs.blit_sgprs_amd;
+ } else {
+ num_user_sgprs = si_get_num_vs_user_sgprs(shader, GFX9_VSGS_NUM_USER_SGPR);
+ }
+ } else {
+ assert(es_stage == MESA_SHADER_TESS_EVAL);
+ es_vgpr_comp_cnt = es_enable_prim_id ? 3 : 2;
+ num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR;
+
+ if (es_enable_prim_id || gs_info->uses_primid)
+ break_wave_at_eoi = true; /* only consumed by the default (non fast-launch) GE_CNTL path below */
+ }
+
+ /* If offsets 4, 5 are used, GS_VGPR_COMP_CNT is ignored and
+ * VGPR[0:4] are always loaded.
+ *
+ * Vertex shaders always need to load VGPR3, because they need to
+ * pass edge flags for decomposed primitives (such as quads) to the PA
+ * for the GL_LINE polygon mode to skip rendering lines on inner edges.
+ */
+ if (gs_info->uses_invocationid ||
+ (gs_stage == MESA_SHADER_VERTEX && !gfx10_is_ngg_passthrough(shader)))
+ gs_vgpr_comp_cnt = 3; /* VGPR3 contains InvocationID, edge flags. */
+ else if ((gs_stage == MESA_SHADER_GEOMETRY && gs_info->uses_primid) ||
+ (gs_stage == MESA_SHADER_VERTEX && shader->key.mono.u.vs_export_prim_id))
+ gs_vgpr_comp_cnt = 2; /* VGPR2 contains PrimitiveID. */
+ else if (input_prim >= PIPE_PRIM_TRIANGLES && !gfx10_is_ngg_passthrough(shader))
+ gs_vgpr_comp_cnt = 1; /* VGPR1 contains offsets 2, 3 */
+ else
+ gs_vgpr_comp_cnt = 0; /* VGPR0 contains offsets 0, 1 */
+
+ unsigned wave_size = si_get_shader_wave_size(shader);
+
+ si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
+ si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, va >> 40);
+ si_pm4_set_reg(
+ pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS,
+ S_00B228_VGPRS((shader->config.num_vgprs - 1) / (wave_size == 32 ? 8 : 4)) | /* divisor is 8 for wave32, 4 otherwise */
+ S_00B228_FLOAT_MODE(shader->config.float_mode) | S_00B228_DX10_CLAMP(1) |
+ S_00B228_MEM_ORDERED(1) | S_00B228_WGP_MODE(1) |
+ S_00B228_GS_VGPR_COMP_CNT(gs_vgpr_comp_cnt));
+ si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS,
+ S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0) |
+ S_00B22C_USER_SGPR(num_user_sgprs) |
+ S_00B22C_ES_VGPR_COMP_CNT(es_vgpr_comp_cnt) |
+ S_00B22C_USER_SGPR_MSB_GFX10(num_user_sgprs >> 5) | /* bits of num_user_sgprs above the low 5 */
+ S_00B22C_OC_LDS_EN(es_stage == MESA_SHADER_TESS_EVAL) |
+ S_00B22C_LDS_SIZE(shader->config.lds_size));
+
+ /* Determine LATE_ALLOC_GS. */
+ unsigned num_cu_per_sh = sscreen->info.min_good_cu_per_sa;
+ unsigned late_alloc_wave64; /* The limit is per SA. */
+
+ /* For Wave32, the hw will launch twice the number of late
+ * alloc waves, so 1 == 2x wave32.
+ *
+ * Don't use late alloc for NGG on Navi14 due to a hw bug.
+ */
+ if (sscreen->info.family == CHIP_NAVI14 || !sscreen->info.use_late_alloc)
+ late_alloc_wave64 = 0;
+ else if (num_cu_per_sh <= 6)
+ late_alloc_wave64 = num_cu_per_sh - 2; /* All CUs enabled */
+ else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL)
+ late_alloc_wave64 = (num_cu_per_sh - 2) * 6;
+ else
+ late_alloc_wave64 = (num_cu_per_sh - 2) * 4;
+
+ /* Limit LATE_ALLOC_GS to prevent a hang (hw bug). */
+ if (sscreen->info.chip_class == GFX10)
+ late_alloc_wave64 = MIN2(late_alloc_wave64, 64);
+
+ si_pm4_set_reg(
+ pm4, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
+ S_00B204_CU_EN(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64));
+
+ nparams = MAX2(shader->info.nr_param_exports, 1); /* at least 1 because VS_EXPORT_COUNT is programmed as nparams - 1 */
+ shader->ctx_reg.ngg.spi_vs_out_config =
+ S_0286C4_VS_EXPORT_COUNT(nparams - 1) |
+ S_0286C4_NO_PC_EXPORT(shader->info.nr_param_exports == 0);
+
+ shader->ctx_reg.ngg.spi_shader_idx_format =
+ S_028708_IDX0_EXPORT_FORMAT(V_028708_SPI_SHADER_1COMP);
+ shader->ctx_reg.ngg.spi_shader_pos_format = /* POS0 is always 4COMP; POS1..3 only when that many position exports exist */
+ S_02870C_POS0_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP) |
+ S_02870C_POS1_EXPORT_FORMAT(shader->info.nr_pos_exports > 1 ? V_02870C_SPI_SHADER_4COMP
+ : V_02870C_SPI_SHADER_NONE) |
+ S_02870C_POS2_EXPORT_FORMAT(shader->info.nr_pos_exports > 2 ? V_02870C_SPI_SHADER_4COMP
+ : V_02870C_SPI_SHADER_NONE) |
+ S_02870C_POS3_EXPORT_FORMAT(shader->info.nr_pos_exports > 3 ? V_02870C_SPI_SHADER_4COMP
+ : V_02870C_SPI_SHADER_NONE);
+
+ shader->ctx_reg.ngg.vgt_primitiveid_en =
+ S_028A84_PRIMITIVEID_EN(es_enable_prim_id) |
+ S_028A84_NGG_DISABLE_PROVOK_REUSE(shader->key.mono.u.vs_export_prim_id ||
+ gs_sel->info.writes_primid);
+
+ if (gs_stage == MESA_SHADER_GEOMETRY) {
+ shader->ctx_reg.ngg.vgt_esgs_ring_itemsize = es_sel->esgs_itemsize / 4;
+ shader->ctx_reg.ngg.vgt_gs_max_vert_out = gs_sel->gs_max_out_vertices;
+ } else {
+ shader->ctx_reg.ngg.vgt_esgs_ring_itemsize = 1; /* vgt_gs_max_vert_out is left untouched on this path */
+ }
+
+ if (es_stage == MESA_SHADER_TESS_EVAL)
+ si_set_tesseval_regs(sscreen, es_sel, pm4);
+
+ shader->ctx_reg.ngg.vgt_gs_onchip_cntl =
+ S_028A44_ES_VERTS_PER_SUBGRP(shader->ngg.hw_max_esverts) |
+ S_028A44_GS_PRIMS_PER_SUBGRP(shader->ngg.max_gsprims) |
+ S_028A44_GS_INST_PRIMS_IN_SUBGRP(shader->ngg.max_gsprims * gs_num_invocations);
+ shader->ctx_reg.ngg.ge_max_output_per_subgroup =
+ S_0287FC_MAX_VERTS_PER_SUBGROUP(shader->ngg.max_out_verts);
+ shader->ctx_reg.ngg.ge_ngg_subgrp_cntl = S_028B4C_PRIM_AMP_FACTOR(shader->ngg.prim_amp_factor) |
+ S_028B4C_THDS_PER_SUBGRP(0); /* for fast launch */
+ shader->ctx_reg.ngg.vgt_gs_instance_cnt =
+ S_028B90_CNT(gs_num_invocations) | S_028B90_ENABLE(gs_num_invocations > 1) |
+ S_028B90_EN_MAX_VERT_OUT_PER_GS_INSTANCE(shader->ngg.max_vert_out_per_gs_instance);
+
+ /* Always output hw-generated edge flags and pass them via the prim
+ * export to prevent drawing lines on internal edges of decomposed
+ * primitives (such as quads) with polygon mode = lines. Only VS needs
+ * this.
+ */
+ shader->ctx_reg.ngg.pa_cl_ngg_cntl =
+ S_028838_INDEX_BUF_EDGE_FLAG_ENA(gs_stage == MESA_SHADER_VERTEX) |
+ /* Reuse for NGG. */
+ S_028838_VERTEX_REUSE_DEPTH(sscreen->info.chip_class >= GFX10_3 ? 30 : 0);
+ shader->pa_cl_vs_out_cntl = si_get_vs_out_cntl(gs_sel, true);
+
+ /* Oversubscribe PC. This improves performance when there are too many varyings. */
+ float oversub_pc_factor = 0.25;
+
+ if (shader->key.opt.ngg_culling) {
+ /* Be more aggressive with NGG culling. */
+ if (shader->info.nr_param_exports > 4)
+ oversub_pc_factor = 1;
+ else if (shader->info.nr_param_exports > 2)
+ oversub_pc_factor = 0.75;
+ else
+ oversub_pc_factor = 0.5;
+ }
+
+ unsigned oversub_pc_lines = sscreen->info.pc_lines * oversub_pc_factor; /* floating-point product converted to unsigned */
+ shader->ctx_reg.ngg.ge_pc_alloc = S_030980_OVERSUB_EN(sscreen->info.use_late_alloc) |
+ S_030980_NUM_PC_LINES(oversub_pc_lines - 1);
+
+ if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST) {
+ shader->ge_cntl = S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) |
+ S_03096C_VERT_GRP_SIZE(shader->ngg.max_gsprims * 3);
+ } else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP) {
+ shader->ge_cntl = S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) |
+ S_03096C_VERT_GRP_SIZE(shader->ngg.max_gsprims + 2);
+ } else {
+ shader->ge_cntl = S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) |
+ S_03096C_VERT_GRP_SIZE(256) | /* 256 = disable vertex grouping */
+ S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi);
+
+ /* Bug workaround for a possible hang with non-tessellation cases.
+ * Tessellation always sets GE_CNTL.VERT_GRP_SIZE = 0
+ *
+ * Requirement: GE_CNTL.VERT_GRP_SIZE = VGT_GS_ONCHIP_CNTL.ES_VERTS_PER_SUBGRP - 5
+ */
+ if ((sscreen->info.chip_class == GFX10) &&
+ (es_stage == MESA_SHADER_VERTEX || gs_stage == MESA_SHADER_VERTEX) && /* = no tess */
+ shader->ngg.hw_max_esverts != 256) {
+ shader->ge_cntl &= C_03096C_VERT_GRP_SIZE; /* NOTE(review): C_ macro is presumably the field's clear mask - confirm */
+
+ if (shader->ngg.hw_max_esverts > 5) { /* otherwise the field is left as masked above */
+ shader->ge_cntl |= S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts - 5);
+ }
+ }
+ }
+
+ if (window_space) {
+ shader->ctx_reg.ngg.pa_cl_vte_cntl = S_028818_VTX_XY_FMT(1) | S_028818_VTX_Z_FMT(1);
+ } else {
+ shader->ctx_reg.ngg.pa_cl_vte_cntl =
+ S_028818_VTX_W0_FMT(1) | S_028818_VPORT_X_SCALE_ENA(1) | S_028818_VPORT_X_OFFSET_ENA(1) |
+ S_028818_VPORT_Y_SCALE_ENA(1) | S_028818_VPORT_Y_OFFSET_ENA(1) |
+ S_028818_VPORT_Z_SCALE_ENA(1) | S_028818_VPORT_Z_OFFSET_ENA(1);
+ }
+}
+
+static void si_emit_shader_vs(struct si_context *sctx)
+{
+ struct si_shader *shader = sctx->queued.named.vs->shader;
+ unsigned initial_cdw = sctx->gfx_cs->current.cdw;
+
+ if (!shader)
+ return;
+
+ radeon_opt_set_context_reg(sctx, R_028A40_VGT_GS_MODE, SI_TRACKED_VGT_GS_MODE,
+ shader->ctx_reg.vs.vgt_gs_mode);
+ radeon_opt_set_context_reg(sctx, R_028A84_VGT_PRIMITIVEID_EN, SI_TRACKED_VGT_PRIMITIVEID_EN,
+ shader->ctx_reg.vs.vgt_primitiveid_en);
+
+ if (sctx->chip_class <= GFX8) {
+ radeon_opt_set_context_reg(sctx, R_028AB4_VGT_REUSE_OFF, SI_TRACKED_VGT_REUSE_OFF,
+ shader->ctx_reg.vs.vgt_reuse_off);
+ }
+
+ radeon_opt_set_context_reg(sctx, R_0286C4_SPI_VS_OUT_CONFIG, SI_TRACKED_SPI_VS_OUT_CONFIG,
+ shader->ctx_reg.vs.spi_vs_out_config);
+
+ radeon_opt_set_context_reg(sctx, R_02870C_SPI_SHADER_POS_FORMAT,
+ SI_TRACKED_SPI_SHADER_POS_FORMAT,
+ shader->ctx_reg.vs.spi_shader_pos_format);
+
+ radeon_opt_set_context_reg(sctx, R_028818_PA_CL_VTE_CNTL, SI_TRACKED_PA_CL_VTE_CNTL,
+ shader->ctx_reg.vs.pa_cl_vte_cntl);
+
+ if (shader->selector->info.stage == MESA_SHADER_TESS_EVAL)
+ radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM, SI_TRACKED_VGT_TF_PARAM,
+ shader->vgt_tf_param);
+
+ if (shader->vgt_vertex_reuse_block_cntl)
+ radeon_opt_set_context_reg(sctx, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL,
+ SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL,
+ shader->vgt_vertex_reuse_block_cntl);
+
+ /* Required programming for tessellation. (legacy pipeline only) */
+ if (sctx->chip_class >= GFX10 && shader->selector->info.stage == MESA_SHADER_TESS_EVAL) {
+ radeon_opt_set_context_reg(sctx, R_028A44_VGT_GS_ONCHIP_CNTL,
+ SI_TRACKED_VGT_GS_ONCHIP_CNTL,
+ S_028A44_ES_VERTS_PER_SUBGRP(250) |
+ S_028A44_GS_PRIMS_PER_SUBGRP(126) |
+ S_028A44_GS_INST_PRIMS_IN_SUBGRP(126));
+ }
+
+ if (sctx->chip_class >= GFX10) {
+ radeon_opt_set_context_reg_rmw(sctx, R_02881C_PA_CL_VS_OUT_CNTL,
+ SI_TRACKED_PA_CL_VS_OUT_CNTL__VS, shader->pa_cl_vs_out_cntl,
+ SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK);
+ }
+
+ if (initial_cdw != sctx->gfx_cs->current.cdw)
+ sctx->context_roll = true;
+
+ /* GE_PC_ALLOC is not a context register, so it doesn't cause a context roll. */
+ if (sctx->chip_class >= GFX10)
+ gfx10_emit_ge_pc_alloc(sctx, shader->ctx_reg.vs.ge_pc_alloc);