ws->cs_add_buffer(cmd_buffer->cs, gs->bo, 8);
radv_emit_prefetch(cmd_buffer, va, gs->code_size);
- radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B220_SPI_SHADER_PGM_LO_GS, 4);
- radeon_emit(cmd_buffer->cs, va >> 8);
- radeon_emit(cmd_buffer->cs, va >> 40);
- radeon_emit(cmd_buffer->cs, gs->rsrc1);
- radeon_emit(cmd_buffer->cs, gs->rsrc2);
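+ /* GFX9 merges the ES and GS stages into a single hardware stage, so the
+ * GS binary is launched through the ES program registers and RSRC2 also
+ * carries the on-chip ESGS LDS allocation. */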
+ if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
+ radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B210_SPI_SHADER_PGM_LO_ES, 2);
+ radeon_emit(cmd_buffer->cs, va >> 8);
+ radeon_emit(cmd_buffer->cs, va >> 40);
+
+ radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B228_SPI_SHADER_PGM_RSRC1_GS, 2);
+ radeon_emit(cmd_buffer->cs, gs->rsrc1);
+ radeon_emit(cmd_buffer->cs, gs->rsrc2 |
+ S_00B22C_LDS_SIZE(pipeline->graphics.gs.lds_size));
+
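+ /* On-chip GS subgroup parameters computed by calculate_gfx9_gs_info(). */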
+ radeon_set_context_reg(cmd_buffer->cs, R_028A44_VGT_GS_ONCHIP_CNTL, pipeline->graphics.gs.vgt_gs_onchip_cntl);
+ radeon_set_context_reg(cmd_buffer->cs, R_028A94_VGT_GS_MAX_PRIMS_PER_SUBGROUP, pipeline->graphics.gs.vgt_gs_max_prims_per_subgroup);
+ radeon_set_context_reg(cmd_buffer->cs, R_028AAC_VGT_ESGS_RING_ITEMSIZE, pipeline->graphics.gs.vgt_esgs_ring_itemsize);
+ } else {
+ radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B220_SPI_SHADER_PGM_LO_GS, 4);
+ radeon_emit(cmd_buffer->cs, va >> 8);
+ radeon_emit(cmd_buffer->cs, va >> 40);
+ radeon_emit(cmd_buffer->cs, gs->rsrc1);
+ radeon_emit(cmd_buffer->cs, gs->rsrc2);
+ }
radv_emit_hw_vs(cmd_buffer, pipeline, pipeline->gs_copy_shader, &pipeline->gs_copy_shader->info.vs.outinfo);
return key;
}
+
+static void
+calculate_gfx9_gs_info(const VkGraphicsPipelineCreateInfo *pCreateInfo,
+ struct radv_pipeline *pipeline)
+{
+ struct ac_shader_variant_info *gs_info = &pipeline->shaders[MESA_SHADER_GEOMETRY]->info;
+ struct ac_es_output_info *es_info = radv_pipeline_has_tess(pipeline) ?
+ &gs_info->tes.es_info : &gs_info->vs.es_info;
+ unsigned gs_num_invocations = MAX2(gs_info->gs.invocations, 1);
+ bool uses_adjacency;
+ switch (pCreateInfo->pInputAssemblyState->topology) {
+ case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
+ case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
+ case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
+ case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
+ uses_adjacency = true;
+ break;
+ default:
+ uses_adjacency = false;
+ break;
+ }
+
+ /* All these are in dwords: */
+ /* We can't allow using the whole LDS, because GS waves compete with
+ * other shader stages for LDS space. */
+ const unsigned max_lds_size = 8 * 1024;
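+ /* es_info->esgs_itemsize is in bytes; convert it to dwords. */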
+ const unsigned esgs_itemsize = es_info->esgs_itemsize / 4;
+ unsigned esgs_lds_size;
+
+ /* All these are per subgroup: */
+ const unsigned max_out_prims = 32 * 1024;
+ const unsigned max_es_verts = 255;
+ const unsigned ideal_gs_prims = 64;
+ unsigned max_gs_prims, gs_prims;
+ unsigned min_es_verts, es_verts, worst_case_es_verts;
+
+ assert(gs_num_invocations <= 32); /* GL maximum */
+
+ if (uses_adjacency || gs_num_invocations > 1)
+ max_gs_prims = 127 / gs_num_invocations;
+ else
+ max_gs_prims = 255;
+
+ /* MAX_PRIMS_PER_SUBGROUP = gs_prims * max_vert_out * gs_invocations.
+ * Make sure we don't go over the maximum value.
+ */
+ if (gs_info->gs.vertices_out > 0) {
+ max_gs_prims = MIN2(max_gs_prims,
+ max_out_prims /
+ (gs_info->gs.vertices_out * gs_num_invocations));
+ }
+ assert(max_gs_prims > 0);
+
+ /* If the primitive has adjacency, halve the number of vertices
+ * that will be reused in multiple primitives.
+ */
+ min_es_verts = gs_info->gs.vertices_in / (uses_adjacency ? 2 : 1);
+
+ gs_prims = MIN2(ideal_gs_prims, max_gs_prims);
+ worst_case_es_verts = MIN2(min_es_verts * gs_prims, max_es_verts);
+
+ /* Compute ESGS LDS size based on the worst case number of ES vertices
+ * needed to create the target number of GS prims per subgroup.
+ */
+ esgs_lds_size = esgs_itemsize * worst_case_es_verts;
+
+ /* If total LDS usage is too big, refactor partitions based on ratio
+ * of ESGS item sizes.
+ */
+ if (esgs_lds_size > max_lds_size) {
+ /* Our target GS Prims Per Subgroup was too large. Calculate
+ * the maximum number of GS Prims Per Subgroup that will fit
+ * into LDS, capped by the maximum that the hardware can support.
+ */
+ gs_prims = MIN2((max_lds_size / (esgs_itemsize * min_es_verts)),
+ max_gs_prims);
+ assert(gs_prims > 0);
+ worst_case_es_verts = MIN2(min_es_verts * gs_prims,
+ max_es_verts);
+
+ esgs_lds_size = esgs_itemsize * worst_case_es_verts;
+ assert(esgs_lds_size <= max_lds_size);
+ }
+
+ /* Now calculate remaining ESGS information. */
+ if (esgs_lds_size)
+ es_verts = MIN2(esgs_lds_size / esgs_itemsize, max_es_verts);
+ else
+ es_verts = max_es_verts;
+
+ /* Vertices for adjacency primitives are not always reused, so restore
+ * it for ES_VERTS_PER_SUBGRP.
+ */
+ min_es_verts = gs_info->gs.vertices_in;
+
+ /* For normal primitives, the VGT only checks if they are past the ES
+ * verts per subgroup after allocating a full GS primitive and if they
+ * are, kick off a new subgroup. But if those additional ES verts are
+ * unique (e.g. not reused) we need to make sure there is enough LDS
+ * space to account for those ES verts beyond ES_VERTS_PER_SUBGRP.
+ */
+ es_verts -= min_es_verts - 1;
+
+ uint32_t es_verts_per_subgroup = es_verts;
+ uint32_t gs_prims_per_subgroup = gs_prims;
+ uint32_t gs_inst_prims_in_subgroup = gs_prims * gs_num_invocations;
+ uint32_t max_prims_per_subgroup = gs_inst_prims_in_subgroup * gs_info->gs.vertices_out;
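+ /* RSRC2.LDS_SIZE is programmed in units of 128 dwords (512 bytes). */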
+ pipeline->graphics.gs.lds_size = align(esgs_lds_size, 128) / 128;
+ pipeline->graphics.gs.vgt_gs_onchip_cntl =
+ S_028A44_ES_VERTS_PER_SUBGRP(es_verts_per_subgroup) |
+ S_028A44_GS_PRIMS_PER_SUBGRP(gs_prims_per_subgroup) |
+ S_028A44_GS_INST_PRIMS_IN_SUBGRP(gs_inst_prims_in_subgroup);
+ pipeline->graphics.gs.vgt_gs_max_prims_per_subgroup =
+ S_028A94_MAX_PRIMS_PER_SUBGROUP(max_prims_per_subgroup);
+ pipeline->graphics.gs.vgt_esgs_ring_itemsize = esgs_itemsize;
+ assert(max_prims_per_subgroup <= max_out_prims);
+}
+
static void
calculate_gs_ring_sizes(struct radv_pipeline *pipeline)
{
esgs_ring_size = align(esgs_ring_size, alignment);
gsvs_ring_size = align(gsvs_ring_size, alignment);
- pipeline->graphics.esgs_ring_size = CLAMP(esgs_ring_size, min_esgs_ring_size, max_size);
+ if (pipeline->device->physical_device->rad_info.chip_class <= VI)
+ pipeline->graphics.esgs_ring_size = CLAMP(esgs_ring_size, min_esgs_ring_size, max_size);
+
pipeline->graphics.gsvs_ring_size = MIN2(gsvs_ring_size, max_size);
}
pipeline->graphics.vgt_shader_stages_en = stages;
- if (radv_pipeline_has_gs(pipeline))
+ if (radv_pipeline_has_gs(pipeline)) {
calculate_gs_ring_sizes(pipeline);
+ if (device->physical_device->rad_info.chip_class >= GFX9)
+ calculate_gfx9_gs_info(pCreateInfo, pipeline);
+ }
if (radv_pipeline_has_tess(pipeline)) {
if (pipeline->graphics.prim == V_008958_DI_PT_PATCH) {