radv/gfx10: implement NGG GS queries
authorSamuel Pitoiset <samuel.pitoiset@gmail.com>
Mon, 13 Jan 2020 17:30:50 +0000 (18:30 +0100)
committerSamuel Pitoiset <samuel.pitoiset@gmail.com>
Wed, 29 Jan 2020 16:40:48 +0000 (17:40 +0100)
The number of generated primitives is only counted by the hardware
if GS uses the legacy path. For NGG GS, we need to accumulate that
value in the NGG GS itself. To achieve that, we use a plain GDS
atomic operation.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/merge_requests/3380>

src/amd/vulkan/radv_cmd_buffer.c
src/amd/vulkan/radv_nir_to_llvm.c
src/amd/vulkan/radv_private.h
src/amd/vulkan/radv_query.c
src/amd/vulkan/radv_shader.h
src/amd/vulkan/radv_shader_args.c
src/amd/vulkan/radv_shader_args.h

index f81e5fa91dc15e985b1c616c79262bc2d8e93bb4..21ef5caa8e565d3b5dcc74387ef7f972f2d92cce 100644 (file)
@@ -2574,6 +2574,35 @@ radv_flush_streamout_descriptors(struct radv_cmd_buffer *cmd_buffer)
        cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_STREAMOUT_BUFFER;
 }
 
+static void
+radv_flush_ngg_gs_state(struct radv_cmd_buffer *cmd_buffer)
+{
+       struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
+       struct radv_userdata_info *loc;
+       uint32_t ngg_gs_state = 0;
+       uint32_t base_reg;
+
+       if (!radv_pipeline_has_gs(pipeline) ||
+           !radv_pipeline_has_ngg(pipeline))
+               return;
+
+       /* By default NGG GS queries are disabled but they are enabled if the
+        * command buffer has active GDS queries or if it's a secondary command
+        * buffer that inherits the number of generated primitives.
+        */
+       if (cmd_buffer->state.active_pipeline_gds_queries ||
+           (cmd_buffer->state.inherited_pipeline_statistics & VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT))
+               ngg_gs_state = 1;
+
+       loc = radv_lookup_user_sgpr(pipeline, MESA_SHADER_GEOMETRY,
+                                   AC_UD_NGG_GS_STATE);
+       base_reg = pipeline->user_data_0[MESA_SHADER_GEOMETRY];
+       assert(loc->sgpr_idx != -1);
+
+       radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4,
+                         ngg_gs_state);
+}
+
 static void
 radv_upload_graphics_shader_descriptors(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty)
 {
@@ -2581,6 +2610,7 @@ radv_upload_graphics_shader_descriptors(struct radv_cmd_buffer *cmd_buffer, bool
        radv_flush_streamout_descriptors(cmd_buffer);
        radv_flush_descriptors(cmd_buffer, VK_SHADER_STAGE_ALL_GRAPHICS);
        radv_flush_constants(cmd_buffer, VK_SHADER_STAGE_ALL_GRAPHICS);
+       radv_flush_ngg_gs_state(cmd_buffer);
 }
 
 struct radv_draw_info {
@@ -3349,6 +3379,9 @@ VkResult radv_BeginCommandBuffer(
                                return result;
                }
 
+               cmd_buffer->state.inherited_pipeline_statistics =
+                       pBeginInfo->pInheritanceInfo->pipelineStatistics;
+
                radv_cmd_buffer_set_subpass(cmd_buffer, subpass);
        }
 
@@ -4089,6 +4122,8 @@ void radv_CmdExecuteCommands(
                        primary->tess_rings_needed = true;
                if (secondary->sample_positions_needed)
                        primary->sample_positions_needed = true;
+               if (secondary->gds_needed)
+                       primary->gds_needed = true;
 
                if (!secondary->state.framebuffer &&
                    (primary->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER)) {
index 58b679a35ae22e4c989549bb20a6720012935df3..422ffa1769977daa912c19e7bd49f1d07b1f42a7 100644 (file)
@@ -3170,6 +3170,33 @@ static void gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx)
                build_streamout(ctx, &nggso);
        }
 
+       /* Write shader query data. */
+       tmp = ac_get_arg(&ctx->ac, ctx->args->ngg_gs_state);
+       tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
+       ac_build_ifcc(&ctx->ac, tmp, 5109);
+       tmp = LLVMBuildICmp(builder, LLVMIntULT, tid,
+                           LLVMConstInt(ctx->ac.i32, 4, false), "");
+       ac_build_ifcc(&ctx->ac, tmp, 5110);
+       {
+               tmp = LLVMBuildLoad(builder, ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tid), "");
+
+               ac_llvm_add_target_dep_function_attr(ctx->main_function,
+                                                    "amdgpu-gds-size", 256);
+
+               LLVMTypeRef gdsptr = LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS);
+               LLVMValueRef gdsbase = LLVMBuildIntToPtr(builder, ctx->ac.i32_0, gdsptr, "");
+
+               const char *sync_scope = LLVM_VERSION_MAJOR >= 9 ? "workgroup-one-as" : "workgroup";
+
+               /* Use a plain GDS atomic to accumulate the number of generated
+                * primitives.
+                */
+               ac_build_atomic_rmw(&ctx->ac, LLVMAtomicRMWBinOpAdd, gdsbase,
+                                   tmp, sync_scope);
+       }
+       ac_build_endif(&ctx->ac, 5110);
+       ac_build_endif(&ctx->ac, 5109);
+
        /* TODO: culling */
 
        /* Determine vertex liveness. */
index 4494d59507493d341083900a9d6a034e5da7ab8c..ca7d9a084f51511c2dc644bc71f831059d581ed0 100644 (file)
@@ -1260,6 +1260,7 @@ struct radv_cmd_state {
        unsigned                                     active_occlusion_queries;
        bool                                         perfect_occlusion_queries_enabled;
        unsigned                                     active_pipeline_queries;
+       unsigned                                     active_pipeline_gds_queries;
        float                                        offset_scale;
        uint32_t                                      trace_id;
        uint32_t                                      last_ia_multi_vgt_param;
@@ -1275,6 +1276,9 @@ struct radv_cmd_state {
        int predication_type; /* -1: disabled, 0: normal, 1: inverted */
        uint64_t predication_va;
 
+       /* Inheritance info. */
+       VkQueryPipelineStatisticFlags inherited_pipeline_statistics;
+
        bool context_roll_without_scissor_emitted;
 };
 
@@ -1333,7 +1337,7 @@ struct radv_cmd_buffer {
        uint32_t esgs_ring_size_needed;
        uint32_t gsvs_ring_size_needed;
        bool tess_rings_needed;
-       bool gds_needed; /* for GFX10 streamout */
+       bool gds_needed; /* for GFX10 streamout and NGG GS queries */
        bool gds_oa_needed; /* for GFX10 streamout */
        bool sample_positions_needed;
 
index f59e435e018dfbebe161d643ff32b88b525d3bed..6f660c109e6ca97500e3904e97e1164a63bb6988 100644 (file)
 static const int pipelinestat_block_size = 11 * 8;
 static const unsigned pipeline_statistics_indices[] = {7, 6, 3, 4, 5, 2, 1, 0, 8, 9, 10};
 
+static unsigned
+radv_get_pipeline_statistics_index(const VkQueryPipelineStatisticFlagBits flag)
+{
+       int offset = ffs(flag) - 1;
+       assert(offset < ARRAY_SIZE(pipeline_statistics_indices));
+       return pipeline_statistics_indices[offset];
+}
+
 static nir_ssa_def *nir_test_flag(nir_builder *b, nir_ssa_def *flags, uint32_t flag)
 {
        return nir_i2b(b, nir_iand(b, flags, nir_imm_int(b, flag)));
@@ -1261,6 +1269,22 @@ static void radv_query_shader(struct radv_cmd_buffer *cmd_buffer,
        radv_meta_restore(&saved_state, cmd_buffer);
 }
 
+static bool
+radv_query_pool_needs_gds(struct radv_device *device,
+                         struct radv_query_pool *pool)
+{
+       /* The number of primitives generated by geometry shader invocations is
+        * only counted by the hardware if GS uses the legacy path. When NGG GS
+        * is used, the hardware can't know the number of generated primitives
+        * and we have to do it manually inside the shader. To achieve that, the
+        * driver does a plain GDS atomic to accumulate that value.
+        * TODO: fix use of NGG GS and non-NGG GS inside the same begin/end
+        * query.
+        */
+       return device->physical_device->use_ngg &&
+              (pool->pipeline_stats_mask & VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT);
+}
+
 VkResult radv_CreateQueryPool(
        VkDevice                                    _device,
        const VkQueryPoolCreateInfo*                pCreateInfo,
@@ -1725,6 +1749,7 @@ static unsigned event_type_for_stream(unsigned stream)
 }
 
 static void emit_begin_query(struct radv_cmd_buffer *cmd_buffer,
+                            struct radv_query_pool *pool,
                             uint64_t va,
                             VkQueryType query_type,
                             VkQueryControlFlags flags,
@@ -1776,6 +1801,30 @@ static void emit_begin_query(struct radv_cmd_buffer *cmd_buffer,
                radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
                radeon_emit(cs, va);
                radeon_emit(cs, va >> 32);
+
+               if (radv_query_pool_needs_gds(cmd_buffer->device, pool)) {
+                       int idx = radv_get_pipeline_statistics_index(VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT);
+
+                       /* Make sure GDS is idle before copying the value. */
+                       cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
+                                                       RADV_CMD_FLAG_INV_L2;
+                       si_emit_cache_flush(cmd_buffer);
+
+                       va += 8 * idx;
+
+                       si_cs_emit_write_event_eop(cs,
+                                                  cmd_buffer->device->physical_device->rad_info.chip_class,
+                                                  radv_cmd_buffer_uses_mec(cmd_buffer),
+                                                  V_028A90_PS_DONE, 0,
+                                                  EOP_DST_SEL_TC_L2,
+                                                  EOP_DATA_SEL_GDS,
+                                                  va, EOP_DATA_GDS(0, 1), 0);
+
+                       /* Record that the command buffer needs GDS. */
+                       cmd_buffer->gds_needed = true;
+
+                       cmd_buffer->state.active_pipeline_gds_queries++;
+               }
                break;
        case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
                radeon_check_space(cmd_buffer->device->ws, cs, 4);
@@ -1794,6 +1843,7 @@ static void emit_begin_query(struct radv_cmd_buffer *cmd_buffer,
 }
 
 static void emit_end_query(struct radv_cmd_buffer *cmd_buffer,
+                          struct radv_query_pool *pool,
                           uint64_t va, uint64_t avail_va,
                           VkQueryType query_type, uint32_t index)
 {
@@ -1841,6 +1891,27 @@ static void emit_end_query(struct radv_cmd_buffer *cmd_buffer,
                                           EOP_DATA_SEL_VALUE_32BIT,
                                           avail_va, 1,
                                           cmd_buffer->gfx9_eop_bug_va);
+
+               if (radv_query_pool_needs_gds(cmd_buffer->device, pool)) {
+                       int idx = radv_get_pipeline_statistics_index(VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT);
+
+                       /* Make sure GDS is idle before copying the value. */
+                       cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
+                                                       RADV_CMD_FLAG_INV_L2;
+                       si_emit_cache_flush(cmd_buffer);
+
+                       va += 8 * idx;
+
+                       si_cs_emit_write_event_eop(cs,
+                                                  cmd_buffer->device->physical_device->rad_info.chip_class,
+                                                  radv_cmd_buffer_uses_mec(cmd_buffer),
+                                                  V_028A90_PS_DONE, 0,
+                                                  EOP_DST_SEL_TC_L2,
+                                                  EOP_DATA_SEL_GDS,
+                                                  va, EOP_DATA_GDS(0, 1), 0);
+
+                       cmd_buffer->state.active_pipeline_gds_queries--;
+               }
                break;
        case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
                radeon_check_space(cmd_buffer->device->ws, cs, 4);
@@ -1884,7 +1955,7 @@ void radv_CmdBeginQueryIndexedEXT(
 
        va += pool->stride * query;
 
-       emit_begin_query(cmd_buffer, va, pool->type, flags, index);
+       emit_begin_query(cmd_buffer, pool, va, pool->type, flags, index);
 }
 
 void radv_CmdBeginQuery(
@@ -1911,7 +1982,7 @@ void radv_CmdEndQueryIndexedEXT(
        /* Do not need to add the pool BO to the list because the query must
         * currently be active, which means the BO is already in the list.
         */
-       emit_end_query(cmd_buffer, va, avail_va, pool->type, index);
+       emit_end_query(cmd_buffer, pool, va, avail_va, pool->type, index);
 
        /*
         * For multiview we have to emit a query for each bit in the mask,
@@ -1928,8 +1999,8 @@ void radv_CmdEndQueryIndexedEXT(
                for (unsigned i = 1; i < util_bitcount(cmd_buffer->state.subpass->view_mask); i++) {
                        va += pool->stride;
                        avail_va += 4;
-                       emit_begin_query(cmd_buffer, va, pool->type, 0, 0);
-                       emit_end_query(cmd_buffer, va, avail_va, pool->type, 0);
+                       emit_begin_query(cmd_buffer, pool, va, pool->type, 0, 0);
+                       emit_end_query(cmd_buffer, pool, va, avail_va, pool->type, 0);
                }
        }
 }
index 131774bd8861944e7fa2747a43edac5d28c6e981..b38710e6fcfafee4dcc0548708c2592a4cb44e0c 100644 (file)
@@ -148,7 +148,8 @@ enum radv_ud_index {
        AC_UD_INDIRECT_DESCRIPTOR_SETS = 3,
        AC_UD_VIEW_INDEX = 4,
        AC_UD_STREAMOUT_BUFFERS = 5,
-       AC_UD_SHADER_START = 6,
+       AC_UD_NGG_GS_STATE = 6,
+       AC_UD_SHADER_START = 7,
        AC_UD_VS_VERTEX_BUFFERS = AC_UD_SHADER_START,
        AC_UD_VS_BASE_VERTEX_START_INSTANCE,
        AC_UD_VS_MAX_UD,
index 6f40808d8255ffa94c57b429dbc392045a48356b..1b57d402d5cbe4c8799a559e71a48ecbbea87977 100644 (file)
@@ -615,6 +615,11 @@ radv_declare_shader_args(struct radv_shader_args *args,
                                           &args->ac.view_index);
                        }
 
+                       if (args->options->key.vs_common_out.as_ngg) {
+                               ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT,
+                                          &args->ngg_gs_state);
+                       }
+
                        ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT,
                                   &args->gs_vtx_offset[0]);
                        ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT,
@@ -742,6 +747,9 @@ radv_declare_shader_args(struct radv_shader_args *args,
                }
                if (args->ac.view_index.used)
                        set_loc_shader(args, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1);
+
+               if (args->ngg_gs_state.used)
+                       set_loc_shader(args, AC_UD_NGG_GS_STATE, &user_sgpr_idx, 1);
                break;
        case MESA_SHADER_FRAGMENT:
                break;
index 3c7aceb638588b3a1dc72029acbeda8ce530b619..451077a9ede049a444e7a5d07be6bc4b334390a9 100644 (file)
@@ -65,6 +65,9 @@ struct radv_shader_args {
        struct ac_arg streamout_config;
        struct ac_arg streamout_offset[4];
 
+       /* NGG GS */
+       struct ac_arg ngg_gs_state;
+
        bool is_gs_copy_shader;
 };