From e4752dafede30fbfc93208d9d4091873a8bd5d31 Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Mon, 13 Jan 2020 18:30:50 +0100 Subject: [PATCH] radv/gfx10: implement NGG GS queries The number of generated primitives is only counted by the hardware if GS uses the legacy path. For NGG GS, we need to accumulate that value in the NGG GS itself. To achieve that, we use a plain GDS atomic operation. Signed-off-by: Samuel Pitoiset Reviewed-by: Bas Nieuwenhuizen Part-of: --- src/amd/vulkan/radv_cmd_buffer.c | 35 ++++++++++++++ src/amd/vulkan/radv_nir_to_llvm.c | 27 +++++++++++ src/amd/vulkan/radv_private.h | 6 ++- src/amd/vulkan/radv_query.c | 79 +++++++++++++++++++++++++++++-- src/amd/vulkan/radv_shader.h | 3 +- src/amd/vulkan/radv_shader_args.c | 8 ++++ src/amd/vulkan/radv_shader_args.h | 3 ++ 7 files changed, 155 insertions(+), 6 deletions(-) diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c index f81e5fa91dc..21ef5caa8e5 100644 --- a/src/amd/vulkan/radv_cmd_buffer.c +++ b/src/amd/vulkan/radv_cmd_buffer.c @@ -2574,6 +2574,35 @@ radv_flush_streamout_descriptors(struct radv_cmd_buffer *cmd_buffer) cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_STREAMOUT_BUFFER; } +static void +radv_flush_ngg_gs_state(struct radv_cmd_buffer *cmd_buffer) +{ + struct radv_pipeline *pipeline = cmd_buffer->state.pipeline; + struct radv_userdata_info *loc; + uint32_t ngg_gs_state = 0; + uint32_t base_reg; + + if (!radv_pipeline_has_gs(pipeline) || + !radv_pipeline_has_ngg(pipeline)) + return; + + /* By default NGG GS queries are disabled but they are enabled if the + * command buffer has active GDS queries or if it's a secondary command + * buffer that inherits the number of generated primitives. 
+ */ + if (cmd_buffer->state.active_pipeline_gds_queries || + (cmd_buffer->state.inherited_pipeline_statistics & VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT)) + ngg_gs_state = 1; + + loc = radv_lookup_user_sgpr(pipeline, MESA_SHADER_GEOMETRY, + AC_UD_NGG_GS_STATE); + base_reg = pipeline->user_data_0[MESA_SHADER_GEOMETRY]; + assert(loc->sgpr_idx != -1); + + radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, + ngg_gs_state); +} + static void radv_upload_graphics_shader_descriptors(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty) { @@ -2581,6 +2610,7 @@ radv_upload_graphics_shader_descriptors(struct radv_cmd_buffer *cmd_buffer, bool radv_flush_streamout_descriptors(cmd_buffer); radv_flush_descriptors(cmd_buffer, VK_SHADER_STAGE_ALL_GRAPHICS); radv_flush_constants(cmd_buffer, VK_SHADER_STAGE_ALL_GRAPHICS); + radv_flush_ngg_gs_state(cmd_buffer); } struct radv_draw_info { @@ -3349,6 +3379,9 @@ VkResult radv_BeginCommandBuffer( return result; } + cmd_buffer->state.inherited_pipeline_statistics = + pBeginInfo->pInheritanceInfo->pipelineStatistics; + radv_cmd_buffer_set_subpass(cmd_buffer, subpass); } @@ -4089,6 +4122,8 @@ void radv_CmdExecuteCommands( primary->tess_rings_needed = true; if (secondary->sample_positions_needed) primary->sample_positions_needed = true; + if (secondary->gds_needed) + primary->gds_needed = true; if (!secondary->state.framebuffer && (primary->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER)) { diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c index 58b679a35ae..422ffa17699 100644 --- a/src/amd/vulkan/radv_nir_to_llvm.c +++ b/src/amd/vulkan/radv_nir_to_llvm.c @@ -3170,6 +3170,33 @@ static void gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx) build_streamout(ctx, &nggso); } + /* Write shader query data. 
*/ + tmp = ac_get_arg(&ctx->ac, ctx->args->ngg_gs_state); + tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); + ac_build_ifcc(&ctx->ac, tmp, 5109); + tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, + LLVMConstInt(ctx->ac.i32, 4, false), ""); + ac_build_ifcc(&ctx->ac, tmp, 5110); + { + tmp = LLVMBuildLoad(builder, ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tid), ""); + + ac_llvm_add_target_dep_function_attr(ctx->main_function, + "amdgpu-gds-size", 256); + + LLVMTypeRef gdsptr = LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS); + LLVMValueRef gdsbase = LLVMBuildIntToPtr(builder, ctx->ac.i32_0, gdsptr, ""); + + const char *sync_scope = LLVM_VERSION_MAJOR >= 9 ? "workgroup-one-as" : "workgroup"; + + /* Use a plain GDS atomic to accumulate the number of generated + * primitives. + */ + ac_build_atomic_rmw(&ctx->ac, LLVMAtomicRMWBinOpAdd, gdsbase, + tmp, sync_scope); + } + ac_build_endif(&ctx->ac, 5110); + ac_build_endif(&ctx->ac, 5109); + /* TODO: culling */ /* Determine vertex liveness. */ diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h index 4494d595074..ca7d9a084f5 100644 --- a/src/amd/vulkan/radv_private.h +++ b/src/amd/vulkan/radv_private.h @@ -1260,6 +1260,7 @@ struct radv_cmd_state { unsigned active_occlusion_queries; bool perfect_occlusion_queries_enabled; unsigned active_pipeline_queries; + unsigned active_pipeline_gds_queries; float offset_scale; uint32_t trace_id; uint32_t last_ia_multi_vgt_param; @@ -1275,6 +1276,9 @@ struct radv_cmd_state { int predication_type; /* -1: disabled, 0: normal, 1: inverted */ uint64_t predication_va; + /* Inheritance info. 
+ VkQueryPipelineStatisticFlags inherited_pipeline_statistics; + + bool context_roll_without_scissor_emitted; }; @@ -1333,7 +1337,7 @@ struct radv_cmd_buffer { uint32_t esgs_ring_size_needed; uint32_t gsvs_ring_size_needed; bool tess_rings_needed; - bool gds_needed; /* for GFX10 streamout */ + bool gds_needed; /* for GFX10 streamout and NGG GS queries */ bool gds_oa_needed; /* for GFX10 streamout */ bool sample_positions_needed; diff --git a/src/amd/vulkan/radv_query.c b/src/amd/vulkan/radv_query.c index f59e435e018..6f660c109e6 100644 --- a/src/amd/vulkan/radv_query.c +++ b/src/amd/vulkan/radv_query.c @@ -40,6 +40,14 @@ static const int pipelinestat_block_size = 11 * 8; static const unsigned pipeline_statistics_indices[] = {7, 6, 3, 4, 5, 2, 1, 0, 8, 9, 10}; +static unsigned +radv_get_pipeline_statistics_index(const VkQueryPipelineStatisticFlagBits flag) +{ + int offset = ffs(flag) - 1; + assert(offset < ARRAY_SIZE(pipeline_statistics_indices)); + return pipeline_statistics_indices[offset]; +} + static nir_ssa_def *nir_test_flag(nir_builder *b, nir_ssa_def *flags, uint32_t flag) { return nir_i2b(b, nir_iand(b, flags, nir_imm_int(b, flag))); @@ -1261,6 +1269,22 @@ static void radv_query_shader(struct radv_cmd_buffer *cmd_buffer, radv_meta_restore(&saved_state, cmd_buffer); } +static bool +radv_query_pool_needs_gds(struct radv_device *device, + struct radv_query_pool *pool) +{ + /* The number of primitives generated by geometry shader invocations is + * only counted by the hardware if GS uses the legacy path. When NGG GS + * is used, the hardware can't know the number of generated primitives + * and we have to do it manually inside the shader. To achieve that, the + * driver does a plain GDS atomic to accumulate that value. + * TODO: fix use of NGG GS and non-NGG GS inside the same begin/end + * query. 
+ */ + return device->physical_device->use_ngg && + (pool->pipeline_stats_mask & VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT); +} + VkResult radv_CreateQueryPool( VkDevice _device, const VkQueryPoolCreateInfo* pCreateInfo, @@ -1725,6 +1749,7 @@ static unsigned event_type_for_stream(unsigned stream) } static void emit_begin_query(struct radv_cmd_buffer *cmd_buffer, + struct radv_query_pool *pool, uint64_t va, VkQueryType query_type, VkQueryControlFlags flags, @@ -1776,6 +1801,30 @@ static void emit_begin_query(struct radv_cmd_buffer *cmd_buffer, radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2)); radeon_emit(cs, va); radeon_emit(cs, va >> 32); + + if (radv_query_pool_needs_gds(cmd_buffer->device, pool)) { + int idx = radv_get_pipeline_statistics_index(VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT); + + /* Make sure GDS is idle before copying the value. */ + cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH | + RADV_CMD_FLAG_INV_L2; + si_emit_cache_flush(cmd_buffer); + + va += 8 * idx; + + si_cs_emit_write_event_eop(cs, + cmd_buffer->device->physical_device->rad_info.chip_class, + radv_cmd_buffer_uses_mec(cmd_buffer), + V_028A90_PS_DONE, 0, + EOP_DST_SEL_TC_L2, + EOP_DATA_SEL_GDS, + va, EOP_DATA_GDS(0, 1), 0); + + /* Record that the command buffer needs GDS. 
*/ + cmd_buffer->gds_needed = true; + + cmd_buffer->state.active_pipeline_gds_queries++; + } break; case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: radeon_check_space(cmd_buffer->device->ws, cs, 4); @@ -1794,6 +1843,7 @@ static void emit_begin_query(struct radv_cmd_buffer *cmd_buffer, } static void emit_end_query(struct radv_cmd_buffer *cmd_buffer, + struct radv_query_pool *pool, uint64_t va, uint64_t avail_va, VkQueryType query_type, uint32_t index) { @@ -1841,6 +1891,27 @@ static void emit_end_query(struct radv_cmd_buffer *cmd_buffer, EOP_DATA_SEL_VALUE_32BIT, avail_va, 1, cmd_buffer->gfx9_eop_bug_va); + + if (radv_query_pool_needs_gds(cmd_buffer->device, pool)) { + int idx = radv_get_pipeline_statistics_index(VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT); + + /* Make sure GDS is idle before copying the value. */ + cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH | + RADV_CMD_FLAG_INV_L2; + si_emit_cache_flush(cmd_buffer); + + va += 8 * idx; + + si_cs_emit_write_event_eop(cs, + cmd_buffer->device->physical_device->rad_info.chip_class, + radv_cmd_buffer_uses_mec(cmd_buffer), + V_028A90_PS_DONE, 0, + EOP_DST_SEL_TC_L2, + EOP_DATA_SEL_GDS, + va, EOP_DATA_GDS(0, 1), 0); + + cmd_buffer->state.active_pipeline_gds_queries--; + } break; case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: radeon_check_space(cmd_buffer->device->ws, cs, 4); @@ -1884,7 +1955,7 @@ void radv_CmdBeginQueryIndexedEXT( va += pool->stride * query; - emit_begin_query(cmd_buffer, va, pool->type, flags, index); + emit_begin_query(cmd_buffer, pool, va, pool->type, flags, index); } void radv_CmdBeginQuery( @@ -1911,7 +1982,7 @@ void radv_CmdEndQueryIndexedEXT( /* Do not need to add the pool BO to the list because the query must * currently be active, which means the BO is already in the list. 
*/ - emit_end_query(cmd_buffer, va, avail_va, pool->type, index); + emit_end_query(cmd_buffer, pool, va, avail_va, pool->type, index); /* * For multiview we have to emit a query for each bit in the mask, @@ -1928,8 +1999,8 @@ void radv_CmdEndQueryIndexedEXT( for (unsigned i = 1; i < util_bitcount(cmd_buffer->state.subpass->view_mask); i++) { va += pool->stride; avail_va += 4; - emit_begin_query(cmd_buffer, va, pool->type, 0, 0); - emit_end_query(cmd_buffer, va, avail_va, pool->type, 0); + emit_begin_query(cmd_buffer, pool, va, pool->type, 0, 0); + emit_end_query(cmd_buffer, pool, va, avail_va, pool->type, 0); } } } diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h index 131774bd886..b38710e6fcf 100644 --- a/src/amd/vulkan/radv_shader.h +++ b/src/amd/vulkan/radv_shader.h @@ -148,7 +148,8 @@ enum radv_ud_index { AC_UD_INDIRECT_DESCRIPTOR_SETS = 3, AC_UD_VIEW_INDEX = 4, AC_UD_STREAMOUT_BUFFERS = 5, - AC_UD_SHADER_START = 6, + AC_UD_NGG_GS_STATE = 6, + AC_UD_SHADER_START = 7, AC_UD_VS_VERTEX_BUFFERS = AC_UD_SHADER_START, AC_UD_VS_BASE_VERTEX_START_INSTANCE, AC_UD_VS_MAX_UD, diff --git a/src/amd/vulkan/radv_shader_args.c b/src/amd/vulkan/radv_shader_args.c index 6f40808d825..1b57d402d5c 100644 --- a/src/amd/vulkan/radv_shader_args.c +++ b/src/amd/vulkan/radv_shader_args.c @@ -615,6 +615,11 @@ radv_declare_shader_args(struct radv_shader_args *args, &args->ac.view_index); } + if (args->options->key.vs_common_out.as_ngg) { + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, + &args->ngg_gs_state); + } + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->gs_vtx_offset[0]); ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, @@ -742,6 +747,9 @@ radv_declare_shader_args(struct radv_shader_args *args, } if (args->ac.view_index.used) set_loc_shader(args, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1); + + if (args->ngg_gs_state.used) + set_loc_shader(args, AC_UD_NGG_GS_STATE, &user_sgpr_idx, 1); break; case MESA_SHADER_FRAGMENT: break; diff --git 
a/src/amd/vulkan/radv_shader_args.h b/src/amd/vulkan/radv_shader_args.h index 3c7aceb6385..451077a9ede 100644 --- a/src/amd/vulkan/radv_shader_args.h +++ b/src/amd/vulkan/radv_shader_args.h @@ -65,6 +65,9 @@ struct radv_shader_args { struct ac_arg streamout_config; struct ac_arg streamout_offset[4]; + /* NGG GS */ + struct ac_arg ngg_gs_state; + bool is_gs_copy_shader; }; -- 2.30.2