X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Famd%2Fvulkan%2Fradv_query.c;h=2b2e80f4e5b0c4217795928cd7b3a1996c31d3dc;hb=6f81e07ecb8c0793dc482307d5d96fd3df95b7d2;hp=3c40774042dd67b43ae4f07a0fdeabc9bb7893fd;hpb=a6c2001ace8ddf56b3781bc0bbf181173b8f393b;p=mesa.git

diff --git a/src/amd/vulkan/radv_query.c b/src/amd/vulkan/radv_query.c
index 3c40774042d..2b2e80f4e5b 100644
--- a/src/amd/vulkan/radv_query.c
+++ b/src/amd/vulkan/radv_query.c
@@ -519,8 +519,6 @@ VkResult radv_device_init_meta_query_state(struct radv_device *device)
 	struct radv_shader_module occlusion_cs = { .nir = NULL };
 	struct radv_shader_module pipeline_statistics_cs = { .nir = NULL };
 
-	zero(device->meta_state.query);
-
 	occlusion_cs.nir = build_occlusion_query_shader(device);
 	pipeline_statistics_cs.nir = build_pipeline_statistics_query_shader(device);
 
@@ -651,9 +649,12 @@ static void radv_query_shader(struct radv_cmd_buffer *cmd_buffer,
 			      uint32_t pipeline_stats_mask, uint32_t avail_offset)
 {
 	struct radv_device *device = cmd_buffer->device;
-	struct radv_meta_saved_compute_state saved_state;
+	struct radv_meta_saved_state saved_state;
 
-	radv_meta_save_compute(&saved_state, cmd_buffer, 4);
+	radv_meta_save(&saved_state, cmd_buffer,
+		       RADV_META_SAVE_COMPUTE_PIPELINE |
+		       RADV_META_SAVE_CONSTANTS |
+		       RADV_META_SAVE_DESCRIPTORS);
 
 	struct radv_buffer dst_buffer = {
 		.bo = dst_bo,
@@ -737,7 +738,7 @@ static void radv_query_shader(struct radv_cmd_buffer *cmd_buffer,
 	                               RADV_CMD_FLAG_INV_VMEM_L1 |
 	                               RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
 
-	radv_meta_restore_compute(&saved_state, cmd_buffer, 4);
+	radv_meta_restore(&saved_state, cmd_buffer);
 }
 
 VkResult radv_CreateQueryPool(
@@ -747,13 +748,12 @@ VkResult radv_CreateQueryPool(
 	VkQueryPool*                                pQueryPool)
 {
 	RADV_FROM_HANDLE(radv_device, device, _device);
-	uint64_t size;
 	struct radv_query_pool *pool = vk_alloc2(&device->alloc, pAllocator,
 					       sizeof(*pool), 8,
 					       VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
 
 	if (!pool)
-		return VK_ERROR_OUT_OF_HOST_MEMORY;
+		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
 
 	switch(pCreateInfo->queryType) {
 	case VK_QUERY_TYPE_OCCLUSION:
@@ -773,17 +773,17 @@ VkResult radv_CreateQueryPool(
 	pool->type = pCreateInfo->queryType;
 	pool->pipeline_stats_mask = pCreateInfo->pipelineStatistics;
 	pool->availability_offset = pool->stride * pCreateInfo->queryCount;
-	size = pool->availability_offset;
+	pool->size = pool->availability_offset;
 	if (pCreateInfo->queryType == VK_QUERY_TYPE_TIMESTAMP ||
 	    pCreateInfo->queryType == VK_QUERY_TYPE_PIPELINE_STATISTICS)
-		size += 4 * pCreateInfo->queryCount;
+		pool->size += 4 * pCreateInfo->queryCount;
 
-	pool->bo = device->ws->buffer_create(device->ws, size,
-					     64, RADEON_DOMAIN_GTT, 0);
+	pool->bo = device->ws->buffer_create(device->ws, pool->size,
+					     64, RADEON_DOMAIN_GTT, RADEON_FLAG_NO_INTERPROCESS_SHARING);
 
 	if (!pool->bo) {
 		vk_free2(&device->alloc, pAllocator, pool);
-		return VK_ERROR_OUT_OF_DEVICE_MEMORY;
+		return vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY);
 	}
 
 	pool->ptr = device->ws->buffer_map(pool->bo);
@@ -791,9 +791,9 @@ VkResult radv_CreateQueryPool(
 	if (!pool->ptr) {
 		device->ws->buffer_destroy(pool->bo);
 		vk_free2(&device->alloc, pAllocator, pool);
-		return VK_ERROR_OUT_OF_DEVICE_MEMORY;
+		return vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY);
 	}
-	memset(pool->ptr, 0, size);
+	memset(pool->ptr, 0, pool->size);
 
 	*pQueryPool = radv_query_pool_to_handle(pool);
 	return VK_SUCCESS;
@@ -952,12 +952,12 @@ void radv_CmdCopyQueryPoolResults(
 	RADV_FROM_HANDLE(radv_buffer, dst_buffer, dstBuffer);
 	struct radeon_winsys_cs *cs = cmd_buffer->cs;
 	unsigned elem_size = (flags & VK_QUERY_RESULT_64_BIT) ? 8 : 4;
-	uint64_t va = cmd_buffer->device->ws->buffer_get_va(pool->bo);
-	uint64_t dest_va = cmd_buffer->device->ws->buffer_get_va(dst_buffer->bo);
+	uint64_t va = radv_buffer_get_va(pool->bo);
+	uint64_t dest_va = radv_buffer_get_va(dst_buffer->bo);
 	dest_va += dst_buffer->offset + dstOffset;
 
-	cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs, pool->bo, 8);
-	cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs, dst_buffer->bo, 8);
+	radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, pool->bo, 8);
+	radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, dst_buffer->bo, 8);
 
 	switch (pool->type) {
 	case VK_QUERY_TYPE_OCCLUSION:
@@ -1057,39 +1057,58 @@ void radv_CmdResetQueryPool(
 {
 	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
 	RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
-	uint64_t va = cmd_buffer->device->ws->buffer_get_va(pool->bo);
+	uint32_t flush_bits = 0;
 
-	cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs, pool->bo, 8);
+	flush_bits |= radv_fill_buffer(cmd_buffer, pool->bo,
+				       firstQuery * pool->stride,
+				       queryCount * pool->stride, 0);
 
-	si_cp_dma_clear_buffer(cmd_buffer, va + firstQuery * pool->stride,
-			       queryCount * pool->stride, 0);
 	if (pool->type == VK_QUERY_TYPE_TIMESTAMP ||
-	    pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS)
-		si_cp_dma_clear_buffer(cmd_buffer, va + pool->availability_offset + firstQuery * 4,
-				       queryCount * 4, 0);
+	    pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
+		flush_bits |= radv_fill_buffer(cmd_buffer, pool->bo,
+					       pool->availability_offset + firstQuery * 4,
+					       queryCount * 4, 0);
+	}
+
+	if (flush_bits) {
+		/* Only need to flush caches for the compute shader path. */
+		cmd_buffer->pending_reset_query = true;
+		cmd_buffer->state.flush_bits |= flush_bits;
+	}
 }
 
-void radv_CmdBeginQuery(
-	VkCommandBuffer                             commandBuffer,
-	VkQueryPool                                 queryPool,
-	uint32_t                                    query,
-	VkQueryControlFlags                         flags)
+static void emit_begin_query(struct radv_cmd_buffer *cmd_buffer,
+			     uint64_t va,
+			     VkQueryType query_type,
+			     VkQueryControlFlags flags)
 {
-	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
-	RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
 	struct radeon_winsys_cs *cs = cmd_buffer->cs;
-	uint64_t va = cmd_buffer->device->ws->buffer_get_va(pool->bo);
-	va += pool->stride * query;
-
-	cmd_buffer->device->ws->cs_add_buffer(cs, pool->bo, 8);
-
-	switch (pool->type) {
+	switch (query_type) {
 	case VK_QUERY_TYPE_OCCLUSION:
 		radeon_check_space(cmd_buffer->device->ws, cs, 7);
 
 		++cmd_buffer->state.active_occlusion_queries;
-		if (cmd_buffer->state.active_occlusion_queries == 1)
+		if (cmd_buffer->state.active_occlusion_queries == 1) {
+			if (flags & VK_QUERY_CONTROL_PRECISE_BIT) {
+				/* This is the first occlusion query, enable
+				 * the hint if the precision bit is set.
+				 */
+				cmd_buffer->state.perfect_occlusion_queries_enabled = true;
+			}
+
 			radv_set_db_count_control(cmd_buffer);
+		} else {
+			if ((flags & VK_QUERY_CONTROL_PRECISE_BIT) &&
+			    !cmd_buffer->state.perfect_occlusion_queries_enabled) {
+				/* This is not the first query, but this one
+				 * needs to enable precision, DB_COUNT_CONTROL
+				 * has to be updated accordingly.
+				 */
+				cmd_buffer->state.perfect_occlusion_queries_enabled = true;
+
+				radv_set_db_count_control(cmd_buffer);
+			}
+		}
 
 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
 		radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
@@ -1107,31 +1126,28 @@ void radv_CmdBeginQuery(
 	default:
 		unreachable("beginning unhandled query type");
 	}
-}
+}
 
 
-void radv_CmdEndQuery(
-	VkCommandBuffer                             commandBuffer,
-	VkQueryPool                                 queryPool,
-	uint32_t                                    query)
+static void emit_end_query(struct radv_cmd_buffer *cmd_buffer,
+			   uint64_t va, uint64_t avail_va,
+			   VkQueryType query_type)
 {
-	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
-	RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
 	struct radeon_winsys_cs *cs = cmd_buffer->cs;
-	uint64_t va = cmd_buffer->device->ws->buffer_get_va(pool->bo);
-	uint64_t avail_va = va + pool->availability_offset + 4 * query;
-	va += pool->stride * query;
-
-	cmd_buffer->device->ws->cs_add_buffer(cs, pool->bo, 8);
-
-	switch (pool->type) {
+	switch (query_type) {
 	case VK_QUERY_TYPE_OCCLUSION:
 		radeon_check_space(cmd_buffer->device->ws, cs, 14);
 
 		cmd_buffer->state.active_occlusion_queries--;
-		if (cmd_buffer->state.active_occlusion_queries == 0)
+		if (cmd_buffer->state.active_occlusion_queries == 0) {
 			radv_set_db_count_control(cmd_buffer);
 
+			/* Reset the perfect occlusion queries hint now that no
+			 * queries are active.
+			 */
+			cmd_buffer->state.perfect_occlusion_queries_enabled = false;
+		}
+
 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
 		radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
 		radeon_emit(cs, va + 8);
@@ -1151,8 +1167,8 @@ void radv_CmdEndQuery(
 		si_cs_emit_write_event_eop(cs,
 					   false,
 					   cmd_buffer->device->physical_device->rad_info.chip_class,
-					   false,
-					   EVENT_TYPE_BOTTOM_OF_PIPE_TS, 0,
+					   radv_cmd_buffer_uses_mec(cmd_buffer),
+					   V_028A90_BOTTOM_OF_PIPE_TS, 0,
 					   1, avail_va, 0, 1);
 		break;
 	default:
@@ -1160,6 +1176,73 @@
 	}
 }
 
+void radv_CmdBeginQuery(
+	VkCommandBuffer                             commandBuffer,
+	VkQueryPool                                 queryPool,
+	uint32_t                                    query,
+	VkQueryControlFlags                         flags)
+{
+	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
+	RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
+	struct radeon_winsys_cs *cs = cmd_buffer->cs;
+	uint64_t va = radv_buffer_get_va(pool->bo);
+
+	radv_cs_add_buffer(cmd_buffer->device->ws, cs, pool->bo, 8);
+
+	if (cmd_buffer->pending_reset_query) {
+		if (pool->size >= RADV_BUFFER_OPS_CS_THRESHOLD) {
+			/* Only need to flush caches if the query pool size is
+			 * large enough to be resetted using the compute shader
+			 * path. Small pools don't need any cache flushes
+			 * because we use a CP dma clear.
+			 */
+			si_emit_cache_flush(cmd_buffer);
+			cmd_buffer->pending_reset_query = false;
+		}
+	}
+
+	va += pool->stride * query;
+
+	emit_begin_query(cmd_buffer, va, pool->type, flags);
+
+	/*
+	 * For multiview we have to emit a query for each bit in the mask,
+	 * however the first query we emit will get the totals for all the
+	 * operations, so we don't want to get a real value in the other
+	 * queries. This emits a fake begin/end sequence so the waiting
+	 * code gets a completed query value and doesn't hang, but the
+	 * query returns 0.
+	 */
+	if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
+		uint64_t avail_va = va + pool->availability_offset + 4 * query;
+
+		for (unsigned i = 0; i < util_bitcount(cmd_buffer->state.subpass->view_mask); i++) {
+			va += pool->stride;
+			avail_va += 4;
+			emit_begin_query(cmd_buffer, va, pool->type, flags);
+			emit_end_query(cmd_buffer, va, avail_va, pool->type);
+		}
+	}
+}
+
+
+void radv_CmdEndQuery(
+	VkCommandBuffer                             commandBuffer,
+	VkQueryPool                                 queryPool,
+	uint32_t                                    query)
+{
+	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
+	RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
+	uint64_t va = radv_buffer_get_va(pool->bo);
+	uint64_t avail_va = va + pool->availability_offset + 4 * query;
+	va += pool->stride * query;
+
+	/* Do not need to add the pool BO to the list because the query must
+	 * currently be active, which means the BO is already in the list.
+	 */
+	emit_end_query(cmd_buffer, va, avail_va, pool->type);
+}
+
 void radv_CmdWriteTimestamp(
 	VkCommandBuffer                             commandBuffer,
 	VkPipelineStageFlagBits                     pipelineStage,
@@ -1170,48 +1253,55 @@
 	RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
 	bool mec = radv_cmd_buffer_uses_mec(cmd_buffer);
 	struct radeon_winsys_cs *cs = cmd_buffer->cs;
-	uint64_t va = cmd_buffer->device->ws->buffer_get_va(pool->bo);
+	uint64_t va = radv_buffer_get_va(pool->bo);
 	uint64_t avail_va = va + pool->availability_offset + 4 * query;
 	uint64_t query_va = va + pool->stride * query;
 
-	cmd_buffer->device->ws->cs_add_buffer(cs, pool->bo, 5);
-
-	MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 28);
-
-	switch(pipelineStage) {
-	case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT:
-		radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
-		radeon_emit(cs, COPY_DATA_COUNT_SEL | COPY_DATA_WR_CONFIRM |
-			    COPY_DATA_SRC_SEL(COPY_DATA_TIMESTAMP) |
-			    COPY_DATA_DST_SEL(V_370_MEM_ASYNC));
-		radeon_emit(cs, 0);
-		radeon_emit(cs, 0);
-		radeon_emit(cs, query_va);
-		radeon_emit(cs, query_va >> 32);
-
-		radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
-		radeon_emit(cs, S_370_DST_SEL(V_370_MEM_ASYNC) |
-			    S_370_WR_CONFIRM(1) |
-			    S_370_ENGINE_SEL(V_370_ME));
-		radeon_emit(cs, avail_va);
-		radeon_emit(cs, avail_va >> 32);
-		radeon_emit(cs, 1);
-		break;
-	default:
-		si_cs_emit_write_event_eop(cs,
-					   false,
-					   cmd_buffer->device->physical_device->rad_info.chip_class,
-					   mec,
-					   V_028A90_BOTTOM_OF_PIPE_TS, 0,
-					   3, query_va, 0, 0);
-		si_cs_emit_write_event_eop(cs,
-					   false,
-					   cmd_buffer->device->physical_device->rad_info.chip_class,
-					   mec,
-					   V_028A90_BOTTOM_OF_PIPE_TS, 0,
-					   1, avail_va, 0, 1);
-		break;
-	}
+	radv_cs_add_buffer(cmd_buffer->device->ws, cs, pool->bo, 5);
+	int num_queries = 1;
+	if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask)
+		num_queries = util_bitcount(cmd_buffer->state.subpass->view_mask);
+
+	MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 28 * num_queries);
+
+	for (unsigned i = 0; i < num_queries; i++) {
+		switch(pipelineStage) {
+		case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT:
+			radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
+			radeon_emit(cs, COPY_DATA_COUNT_SEL | COPY_DATA_WR_CONFIRM |
+				    COPY_DATA_SRC_SEL(COPY_DATA_TIMESTAMP) |
+				    COPY_DATA_DST_SEL(V_370_MEM_ASYNC));
+			radeon_emit(cs, 0);
+			radeon_emit(cs, 0);
+			radeon_emit(cs, query_va);
+			radeon_emit(cs, query_va >> 32);
+
+			radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
+			radeon_emit(cs, S_370_DST_SEL(V_370_MEM_ASYNC) |
+				    S_370_WR_CONFIRM(1) |
+				    S_370_ENGINE_SEL(V_370_ME));
+			radeon_emit(cs, avail_va);
+			radeon_emit(cs, avail_va >> 32);
+			radeon_emit(cs, 1);
+			break;
+		default:
+			si_cs_emit_write_event_eop(cs,
+						   false,
+						   cmd_buffer->device->physical_device->rad_info.chip_class,
+						   mec,
+						   V_028A90_BOTTOM_OF_PIPE_TS, 0,
+						   3, query_va, 0, 0);
+			si_cs_emit_write_event_eop(cs,
+						   false,
+						   cmd_buffer->device->physical_device->rad_info.chip_class,
+						   mec,
+						   V_028A90_BOTTOM_OF_PIPE_TS, 0,
+						   1, avail_va, 0, 1);
+			break;
+		}
+		query_va += pool->stride;
+		avail_va += 4;
+	}
 
 	assert(cmd_buffer->cs->cdw <= cdw_max);
 }