From 73749caf0e9bae845d8dd5aed181e4e3cb65c918 Mon Sep 17 00:00:00 2001
From: Bas Nieuwenhuizen
Date: Fri, 20 Oct 2017 02:24:24 +0200
Subject: [PATCH] radv: calculate and emit GFX9 GS registers to pipeline state.

Reviewed-by: Dave Airlie
---
 src/amd/vulkan/radv_cmd_buffer.c |  25 ++--
 src/amd/vulkan/radv_pipeline.c   | 126 ++++++++++++++++++++++++++++++-
 src/amd/vulkan/radv_private.h    |   8 ++
 src/amd/vulkan/radv_shader.c     |   6 ++
 4 files changed, 158 insertions(+), 7 deletions(-)

diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index 3dc356ca8e0..f4aa9e9b16f 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -864,11 +864,26 @@ radv_emit_geometry_shader(struct radv_cmd_buffer *cmd_buffer,
 	ws->cs_add_buffer(cmd_buffer->cs, gs->bo, 8);
 	radv_emit_prefetch(cmd_buffer, va, gs->code_size);
 
-	radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B220_SPI_SHADER_PGM_LO_GS, 4);
-	radeon_emit(cmd_buffer->cs, va >> 8);
-	radeon_emit(cmd_buffer->cs, va >> 40);
-	radeon_emit(cmd_buffer->cs, gs->rsrc1);
-	radeon_emit(cmd_buffer->cs, gs->rsrc2);
+	if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
+		radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B210_SPI_SHADER_PGM_LO_ES, 2);
+		radeon_emit(cmd_buffer->cs, va >> 8);
+		radeon_emit(cmd_buffer->cs, va >> 40);
+
+		radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B228_SPI_SHADER_PGM_RSRC1_GS, 2);
+		radeon_emit(cmd_buffer->cs, gs->rsrc1);
+		radeon_emit(cmd_buffer->cs, gs->rsrc2 |
+			    S_00B22C_LDS_SIZE(pipeline->graphics.gs.lds_size));
+
+		radeon_set_context_reg(cmd_buffer->cs, R_028A44_VGT_GS_ONCHIP_CNTL, pipeline->graphics.gs.vgt_gs_onchip_cntl);
+		radeon_set_context_reg(cmd_buffer->cs, R_028A94_VGT_GS_MAX_PRIMS_PER_SUBGROUP, pipeline->graphics.gs.vgt_gs_max_prims_per_subgroup);
+		radeon_set_context_reg(cmd_buffer->cs, R_028AAC_VGT_ESGS_RING_ITEMSIZE, pipeline->graphics.gs.vgt_esgs_ring_itemsize);
+	} else {
+		radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B220_SPI_SHADER_PGM_LO_GS, 4);
+		radeon_emit(cmd_buffer->cs, va >> 8);
+		radeon_emit(cmd_buffer->cs, va >> 40);
+		radeon_emit(cmd_buffer->cs, gs->rsrc1);
+		radeon_emit(cmd_buffer->cs, gs->rsrc2);
+	}
 
 	radv_emit_hw_vs(cmd_buffer, pipeline, pipeline->gs_copy_shader,
 			&pipeline->gs_copy_shader->info.vs.outinfo);
diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
index 21265c45df3..eac4456a3fb 100644
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -1161,6 +1161,123 @@ radv_compute_vs_key(const VkGraphicsPipelineCreateInfo *pCreateInfo, bool as_es,
 	return key;
 }
 
+static void calculate_gfx9_gs_info(const VkGraphicsPipelineCreateInfo *pCreateInfo,
+                                   struct radv_pipeline *pipeline)
+{
+	struct ac_shader_variant_info *gs_info = &pipeline->shaders[MESA_SHADER_GEOMETRY]->info;
+	struct ac_es_output_info *es_info = radv_pipeline_has_tess(pipeline) ?
+		&gs_info->tes.es_info : &gs_info->vs.es_info;
+	unsigned gs_num_invocations = MAX2(gs_info->gs.invocations, 1);
+	bool uses_adjacency;
+	switch(pCreateInfo->pInputAssemblyState->topology) {
+	case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
+	case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
+	case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
+	case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
+		uses_adjacency = true;
+		break;
+	default:
+		uses_adjacency = false;
+		break;
+	}
+
+	/* All these are in dwords: */
+	/* We can't allow using the whole LDS, because GS waves compete with
+	 * other shader stages for LDS space.
+	 */
+	const unsigned max_lds_size = 8 * 1024;
+	const unsigned esgs_itemsize = es_info->esgs_itemsize / 4;
+	unsigned esgs_lds_size;
+
+	/* All these are per subgroup: */
+	const unsigned max_out_prims = 32 * 1024;
+	const unsigned max_es_verts = 255;
+	const unsigned ideal_gs_prims = 64;
+	unsigned max_gs_prims, gs_prims;
+	unsigned min_es_verts, es_verts, worst_case_es_verts;
+
+	assert(gs_num_invocations <= 32); /* GL maximum */
+
+	if (uses_adjacency || gs_num_invocations > 1)
+		max_gs_prims = 127 / gs_num_invocations;
+	else
+		max_gs_prims = 255;
+
+	/* MAX_PRIMS_PER_SUBGROUP = gs_prims * max_vert_out * gs_invocations.
+	 * Make sure we don't go over the maximum value.
+	 */
+	if (gs_info->gs.vertices_out > 0) {
+		max_gs_prims = MIN2(max_gs_prims,
+				    max_out_prims /
+				    (gs_info->gs.vertices_out * gs_num_invocations));
+	}
+	assert(max_gs_prims > 0);
+
+	/* If the primitive has adjacency, halve the number of vertices
+	 * that will be reused in multiple primitives.
+	 */
+	min_es_verts = gs_info->gs.vertices_in / (uses_adjacency ? 2 : 1);
+
+	gs_prims = MIN2(ideal_gs_prims, max_gs_prims);
+	worst_case_es_verts = MIN2(min_es_verts * gs_prims, max_es_verts);
+
+	/* Compute ESGS LDS size based on the worst case number of ES vertices
+	 * needed to create the target number of GS prims per subgroup.
+	 */
+	esgs_lds_size = esgs_itemsize * worst_case_es_verts;
+
+	/* If total LDS usage is too big, refactor partitions based on ratio
+	 * of ESGS item sizes.
+	 */
+	if (esgs_lds_size > max_lds_size) {
+		/* Our target GS Prims Per Subgroup was too large. Calculate
+		 * the maximum number of GS Prims Per Subgroup that will fit
+		 * into LDS, capped by the maximum that the hardware can support.
+		 */
+		gs_prims = MIN2((max_lds_size / (esgs_itemsize * min_es_verts)),
+				max_gs_prims);
+		assert(gs_prims > 0);
+		worst_case_es_verts = MIN2(min_es_verts * gs_prims,
+					   max_es_verts);
+
+		esgs_lds_size = esgs_itemsize * worst_case_es_verts;
+		assert(esgs_lds_size <= max_lds_size);
+	}
+
+	/* Now calculate remaining ESGS information. */
+	if (esgs_lds_size)
+		es_verts = MIN2(esgs_lds_size / esgs_itemsize, max_es_verts);
+	else
+		es_verts = max_es_verts;
+
+	/* Vertices for adjacency primitives are not always reused, so restore
+	 * it for ES_VERTS_PER_SUBGRP.
+	 */
+	min_es_verts = gs_info->gs.vertices_in;
+
+	/* For normal primitives, the VGT only checks if they are past the ES
+	 * verts per subgroup after allocating a full GS primitive and if they
+	 * are, kick off a new subgroup. But if those additional ES verts are
+	 * unique (e.g. not reused) we need to make sure there is enough LDS
+	 * space to account for those ES verts beyond ES_VERTS_PER_SUBGRP.
+	 */
+	es_verts -= min_es_verts - 1;
+
+	uint32_t es_verts_per_subgroup = es_verts;
+	uint32_t gs_prims_per_subgroup = gs_prims;
+	uint32_t gs_inst_prims_in_subgroup = gs_prims * gs_num_invocations;
+	uint32_t max_prims_per_subgroup = gs_inst_prims_in_subgroup * gs_info->gs.vertices_out;
+	pipeline->graphics.gs.lds_size = align(esgs_lds_size, 128) / 128;
+	pipeline->graphics.gs.vgt_gs_onchip_cntl =
+		S_028A44_ES_VERTS_PER_SUBGRP(es_verts_per_subgroup) |
+		S_028A44_GS_PRIMS_PER_SUBGRP(gs_prims_per_subgroup) |
+		S_028A44_GS_INST_PRIMS_IN_SUBGRP(gs_inst_prims_in_subgroup);
+	pipeline->graphics.gs.vgt_gs_max_prims_per_subgroup =
+		S_028A94_MAX_PRIMS_PER_SUBGROUP(max_prims_per_subgroup);
+	pipeline->graphics.gs.vgt_esgs_ring_itemsize = esgs_itemsize;
+	assert(max_prims_per_subgroup <= max_out_prims);
+}
+
 static void
 calculate_gs_ring_sizes(struct radv_pipeline *pipeline)
 {
@@ -1194,7 +1311,9 @@ calculate_gs_ring_sizes(struct radv_pipeline *pipeline)
 	esgs_ring_size = align(esgs_ring_size, alignment);
 	gsvs_ring_size = align(gsvs_ring_size, alignment);
 
-	pipeline->graphics.esgs_ring_size = CLAMP(esgs_ring_size, min_esgs_ring_size, max_size);
+	if (pipeline->device->physical_device->rad_info.chip_class <= VI)
+		pipeline->graphics.esgs_ring_size = CLAMP(esgs_ring_size, min_esgs_ring_size, max_size);
+
 	pipeline->graphics.gsvs_ring_size = MIN2(gsvs_ring_size, max_size);
 }
 
@@ -1916,8 +2035,11 @@ radv_pipeline_init(struct radv_pipeline *pipeline,
 
 	pipeline->graphics.vgt_shader_stages_en = stages;
 
-	if (radv_pipeline_has_gs(pipeline))
+	if (radv_pipeline_has_gs(pipeline)) {
 		calculate_gs_ring_sizes(pipeline);
+		if (device->physical_device->rad_info.chip_class >= GFX9)
+			calculate_gfx9_gs_info(pCreateInfo, pipeline);
+	}
 
 	if (radv_pipeline_has_tess(pipeline)) {
 		if (pipeline->graphics.prim == V_008958_DI_PT_PATCH) {
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index 10b7983760d..6c3a1bbbb39 100644
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -1050,6 +1050,13 @@ struct radv_tessellation_state {
 	uint32_t tf_param;
 };
 
+struct radv_gs_state {
+	uint32_t vgt_gs_onchip_cntl;
+	uint32_t vgt_gs_max_prims_per_subgroup;
+	uint32_t vgt_esgs_ring_itemsize;
+	uint32_t lds_size;
+};
+
 struct radv_vertex_elements_info {
 	uint32_t rsrc_word3[MAX_VERTEX_ATTRIBS];
 	uint32_t format_size[MAX_VERTEX_ATTRIBS];
@@ -1084,6 +1091,7 @@ struct radv_pipeline {
 		struct radv_raster_state raster;
 		struct radv_multisample_state ms;
 		struct radv_tessellation_state tess;
+		struct radv_gs_state gs;
 		uint32_t db_shader_control;
 		uint32_t shader_z_format;
 		unsigned prim;
diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c
index 3688680107f..a5e2826401b 100644
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@@ -400,6 +400,12 @@ radv_fill_shader_variant(struct radv_device *device,
 	}
 
 	if (device->physical_device->rad_info.chip_class >= GFX9 &&
+	    stage == MESA_SHADER_GEOMETRY) {
+		/* TODO: Figure out how many we actually need. */
+		variant->rsrc1 |= S_00B228_GS_VGPR_COMP_CNT(3);
+		variant->rsrc2 |= S_00B22C_ES_VGPR_COMP_CNT(3) |
+			S_00B22C_OC_LDS_EN(1);
+	} else if (device->physical_device->rad_info.chip_class >= GFX9 &&
 	    stage == MESA_SHADER_TESS_CTRL)
 		variant->rsrc1 |= S_00B428_LS_VGPR_COMP_CNT(vgpr_comp_cnt);
 	else
-- 
2.30.2
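For anyone who wants to sanity-check the on-chip GS sizing math outside the
driver, the standalone sketch below (not part of the patch) mirrors the
arithmetic of calculate_gfx9_gs_info for one assumed example shader: a
triangle-list GS with 3 vertices in, 4 vertices out, a single invocation and a
16-byte ESGS item. The limits are copied from the function above; the example
inputs are made up purely for illustration.

/* gs_sizing_sketch.c - illustrative only, not part of the patch.
 * Build: cc -std=c99 gs_sizing_sketch.c -o gs_sizing_sketch
 */
#include <assert.h>
#include <stdio.h>

#define MIN2(a, b) ((a) < (b) ? (a) : (b))

static unsigned align_u32(unsigned v, unsigned a)
{
	return (v + a - 1) & ~(a - 1);
}

int main(void)
{
	/* Assumed example shader (hypothetical values). */
	const unsigned gs_vertices_in = 3;     /* triangle list, no adjacency */
	const unsigned gs_vertices_out = 4;
	const unsigned gs_num_invocations = 1;
	const unsigned esgs_itemsize = 16 / 4; /* bytes -> dwords, as in the patch */
	const int uses_adjacency = 0;

	/* Limits copied from calculate_gfx9_gs_info (dwords / per subgroup). */
	const unsigned max_lds_size = 8 * 1024;
	const unsigned max_out_prims = 32 * 1024;
	const unsigned max_es_verts = 255;
	const unsigned ideal_gs_prims = 64;

	unsigned max_gs_prims = (uses_adjacency || gs_num_invocations > 1) ?
		127 / gs_num_invocations : 255;
	if (gs_vertices_out > 0)
		max_gs_prims = MIN2(max_gs_prims,
				    max_out_prims / (gs_vertices_out * gs_num_invocations));

	unsigned min_es_verts = gs_vertices_in / (uses_adjacency ? 2 : 1);
	unsigned gs_prims = MIN2(ideal_gs_prims, max_gs_prims);
	unsigned worst_case_es_verts = MIN2(min_es_verts * gs_prims, max_es_verts);
	unsigned esgs_lds_size = esgs_itemsize * worst_case_es_verts;

	/* Shrink the subgroup when the worst case does not fit in LDS. */
	if (esgs_lds_size > max_lds_size) {
		gs_prims = MIN2(max_lds_size / (esgs_itemsize * min_es_verts), max_gs_prims);
		worst_case_es_verts = MIN2(min_es_verts * gs_prims, max_es_verts);
		esgs_lds_size = esgs_itemsize * worst_case_es_verts;
	}

	unsigned es_verts = esgs_lds_size ?
		MIN2(esgs_lds_size / esgs_itemsize, max_es_verts) : max_es_verts;

	/* Leave room for the unique ES vertices of the last GS primitive. */
	min_es_verts = gs_vertices_in;
	es_verts -= min_es_verts - 1;

	unsigned gs_inst_prims = gs_prims * gs_num_invocations;
	unsigned max_prims_per_subgroup = gs_inst_prims * gs_vertices_out;
	assert(max_prims_per_subgroup <= max_out_prims);

	printf("ES_VERTS_PER_SUBGRP        = %u\n", es_verts);
	printf("GS_PRIMS_PER_SUBGRP        = %u\n", gs_prims);
	printf("GS_INST_PRIMS_IN_SUBGRP    = %u\n", gs_inst_prims);
	printf("MAX_PRIMS_PER_SUBGROUP     = %u\n", max_prims_per_subgroup);
	printf("LDS_SIZE (128-dword units) = %u\n", align_u32(esgs_lds_size, 128) / 128);
	return 0;
}

With these example inputs the sketch prints ES_VERTS_PER_SUBGRP = 190,
GS_PRIMS_PER_SUBGRP = 64, GS_INST_PRIMS_IN_SUBGRP = 64,
MAX_PRIMS_PER_SUBGROUP = 256 and LDS_SIZE = 6, i.e. the ideal 64-primitive
subgroup fits comfortably within the 8K-dword LDS budget.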