cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_STREAMOUT_BUFFER;
}
+static void
+radv_flush_ngg_gs_state(struct radv_cmd_buffer *cmd_buffer)
+{
+ struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
+ struct radv_userdata_info *loc;
+ uint32_t ngg_gs_state = 0;
+ uint32_t base_reg;
+
+ if (!radv_pipeline_has_gs(pipeline) ||
+ !radv_pipeline_has_ngg(pipeline))
+ return;
+
+ /* By default, NGG GS queries are disabled, but they are enabled if the
+ * command buffer has active GDS queries or if it is a secondary command
+ * buffer that inherits the number of generated primitives.
+ */
+ if (cmd_buffer->state.active_pipeline_gds_queries ||
+ (cmd_buffer->state.inherited_pipeline_statistics & VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT))
+ ngg_gs_state = 1;
+
+ loc = radv_lookup_user_sgpr(pipeline, MESA_SHADER_GEOMETRY,
+ AC_UD_NGG_GS_STATE);
+ base_reg = pipeline->user_data_0[MESA_SHADER_GEOMETRY];
+ assert(loc->sgpr_idx != -1);
+
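+ /* Pass the enable flag to the NGG GS shader through its AC_UD_NGG_GS_STATE
+ * user SGPR; the shader only performs the GDS update when it is non-zero.
+ */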
+ radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4,
+ ngg_gs_state);
+}
+
static void
radv_upload_graphics_shader_descriptors(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty)
{
radv_flush_streamout_descriptors(cmd_buffer);
radv_flush_descriptors(cmd_buffer, VK_SHADER_STAGE_ALL_GRAPHICS);
radv_flush_constants(cmd_buffer, VK_SHADER_STAGE_ALL_GRAPHICS);
+ radv_flush_ngg_gs_state(cmd_buffer);
}
struct radv_draw_info {
return result;
}
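+ /* Remember the inherited pipeline statistics: a secondary command buffer
+ * that inherits GS primitive counts needs NGG GS queries enabled.
+ */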
+ cmd_buffer->state.inherited_pipeline_statistics =
+ pBeginInfo->pInheritanceInfo->pipelineStatistics;
+
radv_cmd_buffer_set_subpass(cmd_buffer, subpass);
}
primary->tess_rings_needed = true;
if (secondary->sample_positions_needed)
primary->sample_positions_needed = true;
+ if (secondary->gds_needed)
+ primary->gds_needed = true;
if (!secondary->state.framebuffer &&
(primary->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER)) {
build_streamout(ctx, &nggso);
}
+ /* Write shader query data. */
+ tmp = ac_get_arg(&ctx->ac, ctx->args->ngg_gs_state);
+ tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
+ ac_build_ifcc(&ctx->ac, tmp, 5109);
+ tmp = LLVMBuildICmp(builder, LLVMIntULT, tid,
+ LLVMConstInt(ctx->ac.i32, 4, false), "");
+ ac_build_ifcc(&ctx->ac, tmp, 5110);
+ {
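+ /* The first four lanes each load one dword of primitive-count data
+ * from the NGG scratch buffer and accumulate it into GDS below.
+ */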
+ tmp = LLVMBuildLoad(builder, ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tid), "");
+
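+ /* Declare how much GDS the shader uses so that GDS is allocated for it. */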
+ ac_llvm_add_target_dep_function_attr(ctx->main_function,
+ "amdgpu-gds-size", 256);
+
+ LLVMTypeRef gdsptr = LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS);
+ LLVMValueRef gdsbase = LLVMBuildIntToPtr(builder, ctx->ac.i32_0, gdsptr, "");
+
+ const char *sync_scope = LLVM_VERSION_MAJOR >= 9 ? "workgroup-one-as" : "workgroup";
+
+ /* Use a plain GDS atomic to accumulate the number of generated
+ * primitives.
+ */
+ ac_build_atomic_rmw(&ctx->ac, LLVMAtomicRMWBinOpAdd, gdsbase,
+ tmp, sync_scope);
+ }
+ ac_build_endif(&ctx->ac, 5110);
+ ac_build_endif(&ctx->ac, 5109);
+
/* TODO: culling */
/* Determine vertex liveness. */
static const int pipelinestat_block_size = 11 * 8;
static const unsigned pipeline_statistics_indices[] = {7, 6, 3, 4, 5, 2, 1, 0, 8, 9, 10};
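+/* Map a single VkQueryPipelineStatisticFlagBits bit to the index of its
+ * 64-bit counter within a pipeline statistics block (see
+ * pipeline_statistics_indices above).
+ */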
+static unsigned
+radv_get_pipeline_statistics_index(const VkQueryPipelineStatisticFlagBits flag)
+{
+ int offset = ffs(flag) - 1;
+ assert(offset < ARRAY_SIZE(pipeline_statistics_indices));
+ return pipeline_statistics_indices[offset];
+}
+
static nir_ssa_def *nir_test_flag(nir_builder *b, nir_ssa_def *flags, uint32_t flag)
{
return nir_i2b(b, nir_iand(b, flags, nir_imm_int(b, flag)));
radv_meta_restore(&saved_state, cmd_buffer);
}
+static bool
+radv_query_pool_needs_gds(struct radv_device *device,
+ struct radv_query_pool *pool)
+{
+ /* The number of primitives generated by geometry shader invocations is
+ * only counted by the hardware if GS uses the legacy path. When NGG GS
+ * is used, the hardware can't know the number of generated primitives
+ * and we have to count it manually inside the shader. To achieve that,
+ * the driver does a plain GDS atomic to accumulate that value.
+ * TODO: fix use of NGG GS and non-NGG GS inside the same begin/end
+ * query.
+ */
+ return device->physical_device->use_ngg &&
+ (pool->pipeline_stats_mask & VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT);
+}
+
VkResult radv_CreateQueryPool(
VkDevice _device,
const VkQueryPoolCreateInfo* pCreateInfo,
}
static void emit_begin_query(struct radv_cmd_buffer *cmd_buffer,
+ struct radv_query_pool *pool,
uint64_t va,
VkQueryType query_type,
VkQueryControlFlags flags,
radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
radeon_emit(cs, va);
radeon_emit(cs, va >> 32);
+
+ if (radv_query_pool_needs_gds(cmd_buffer->device, pool)) {
+ int idx = radv_get_pipeline_statistics_index(VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT);
+
+ /* Make sure GDS is idle before copying the value. */
+ cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
+ RADV_CMD_FLAG_INV_L2;
+ si_emit_cache_flush(cmd_buffer);
+
+ va += 8 * idx;
+
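+ /* Copy the current GDS counter value into the "begin" slot of the GS
+ * primitives statistic; the query result is later computed as end - begin,
+ * like the other pipeline statistics.
+ */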
+ si_cs_emit_write_event_eop(cs,
+ cmd_buffer->device->physical_device->rad_info.chip_class,
+ radv_cmd_buffer_uses_mec(cmd_buffer),
+ V_028A90_PS_DONE, 0,
+ EOP_DST_SEL_TC_L2,
+ EOP_DATA_SEL_GDS,
+ va, EOP_DATA_GDS(0, 1), 0);
+
+ /* Record that the command buffer needs GDS. */
+ cmd_buffer->gds_needed = true;
+
+ cmd_buffer->state.active_pipeline_gds_queries++;
+ }
break;
case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
radeon_check_space(cmd_buffer->device->ws, cs, 4);
}
static void emit_end_query(struct radv_cmd_buffer *cmd_buffer,
+ struct radv_query_pool *pool,
uint64_t va, uint64_t avail_va,
VkQueryType query_type, uint32_t index)
{
EOP_DATA_SEL_VALUE_32BIT,
avail_va, 1,
cmd_buffer->gfx9_eop_bug_va);
+
+ if (radv_query_pool_needs_gds(cmd_buffer->device, pool)) {
+ int idx = radv_get_pipeline_statistics_index(VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT);
+
+ /* Make sure GDS is idle before copying the value. */
+ cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
+ RADV_CMD_FLAG_INV_L2;
+ si_emit_cache_flush(cmd_buffer);
+
+ va += 8 * idx;
+
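+ /* Copy the GDS counter again as the "end" snapshot of the GS primitives
+ * statistic.
+ */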
+ si_cs_emit_write_event_eop(cs,
+ cmd_buffer->device->physical_device->rad_info.chip_class,
+ radv_cmd_buffer_uses_mec(cmd_buffer),
+ V_028A90_PS_DONE, 0,
+ EOP_DST_SEL_TC_L2,
+ EOP_DATA_SEL_GDS,
+ va, EOP_DATA_GDS(0, 1), 0);
+
+ cmd_buffer->state.active_pipeline_gds_queries--;
+ }
break;
case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
radeon_check_space(cmd_buffer->device->ws, cs, 4);
va += pool->stride * query;
- emit_begin_query(cmd_buffer, va, pool->type, flags, index);
+ emit_begin_query(cmd_buffer, pool, va, pool->type, flags, index);
}
void radv_CmdBeginQuery(
/* Do not need to add the pool BO to the list because the query must
* currently be active, which means the BO is already in the list.
*/
- emit_end_query(cmd_buffer, va, avail_va, pool->type, index);
+ emit_end_query(cmd_buffer, pool, va, avail_va, pool->type, index);
/*
* For multiview we have to emit a query for each bit in the mask,
for (unsigned i = 1; i < util_bitcount(cmd_buffer->state.subpass->view_mask); i++) {
va += pool->stride;
avail_va += 4;
- emit_begin_query(cmd_buffer, va, pool->type, 0, 0);
- emit_end_query(cmd_buffer, va, avail_va, pool->type, 0);
+ emit_begin_query(cmd_buffer, pool, va, pool->type, 0, 0);
+ emit_end_query(cmd_buffer, pool, va, avail_va, pool->type, 0);
}
}
}