From e1dc3ab753480db414a68ef7944f00cfc75d5882 Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Mon, 9 Sep 2019 10:26:54 +0200 Subject: [PATCH] radv/gfx10: allocate GDS/OA buffer objects for NGG streamout This allocates two BOs for GFX10 NGG streamout. Signed-off-by: Samuel Pitoiset Reviewed-by: Bas Nieuwenhuizen --- src/amd/vulkan/radv_cmd_buffer.c | 4 ++ src/amd/vulkan/radv_device.c | 66 ++++++++++++++++++++++++++++++-- src/amd/vulkan/radv_private.h | 4 ++ 3 files changed, 70 insertions(+), 4 deletions(-) diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c index f55d78b9250..22b89760bbb 100644 --- a/src/amd/vulkan/radv_cmd_buffer.c +++ b/src/amd/vulkan/radv_cmd_buffer.c @@ -337,6 +337,7 @@ radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer) cmd_buffer->esgs_ring_size_needed = 0; cmd_buffer->gsvs_ring_size_needed = 0; cmd_buffer->tess_rings_needed = false; + cmd_buffer->gds_needed = false; cmd_buffer->sample_positions_needed = false; if (cmd_buffer->upload.upload_bo) @@ -5815,6 +5816,9 @@ radv_set_streamout_enable(struct radv_cmd_buffer *cmd_buffer, bool enable) ((old_streamout_enabled != so->streamout_enabled) || (old_hw_enabled_mask != so->hw_enabled_mask))) radv_emit_streamout_enable(cmd_buffer); + + if (cmd_buffer->device->physical_device->use_ngg_streamout) + cmd_buffer->gds_needed = true; } static void radv_flush_vgt_streamout(struct radv_cmd_buffer *cmd_buffer) diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c index 53a08bcdc5a..7786a71afc8 100644 --- a/src/amd/vulkan/radv_device.c +++ b/src/amd/vulkan/radv_device.c @@ -1746,6 +1746,10 @@ radv_queue_finish(struct radv_queue *queue) queue->device->ws->buffer_destroy(queue->gsvs_ring_bo); if (queue->tess_rings_bo) queue->device->ws->buffer_destroy(queue->tess_rings_bo); + if (queue->gds_bo) + queue->device->ws->buffer_destroy(queue->gds_bo); + if (queue->gds_oa_bo) + queue->device->ws->buffer_destroy(queue->gds_oa_bo); if (queue->compute_scratch_bo) queue->device->ws->buffer_destroy(queue->compute_scratch_bo); } @@ -2598,6 +2602,7 @@ radv_get_preamble_cs(struct radv_queue *queue, uint32_t esgs_ring_size, uint32_t gsvs_ring_size, bool needs_tess_rings, + bool needs_gds, bool needs_sample_positions, struct radeon_cmdbuf **initial_full_flush_preamble_cs, struct radeon_cmdbuf **initial_preamble_cs, @@ -2609,8 +2614,10 @@ radv_get_preamble_cs(struct radv_queue *queue, struct radeon_winsys_bo *esgs_ring_bo = NULL; struct radeon_winsys_bo *gsvs_ring_bo = NULL; struct radeon_winsys_bo *tess_rings_bo = NULL; + struct radeon_winsys_bo *gds_bo = NULL; + struct radeon_winsys_bo *gds_oa_bo = NULL; struct radeon_cmdbuf *dest_cs[3] = {0}; - bool add_tess_rings = false, add_sample_positions = false; + bool add_tess_rings = false, add_gds = false, add_sample_positions = false; unsigned tess_factor_ring_size = 0, tess_offchip_ring_size = 0; unsigned max_offchip_buffers; unsigned hs_offchip_param = 0; @@ -2620,6 +2627,10 @@ radv_get_preamble_cs(struct radv_queue *queue, if (needs_tess_rings) add_tess_rings = true; } + if (!queue->has_gds) { + if (needs_gds) + add_gds = true; + } if (!queue->has_sample_positions) { if (needs_sample_positions) add_sample_positions = true; @@ -2635,7 +2646,7 @@ radv_get_preamble_cs(struct radv_queue *queue, compute_scratch_size <= queue->compute_scratch_size && esgs_ring_size <= queue->esgs_ring_size && gsvs_ring_size <= queue->gsvs_ring_size && - !add_tess_rings && !add_sample_positions && + !add_tess_rings && !add_gds && !add_sample_positions && queue->initial_preamble_cs) { *initial_full_flush_preamble_cs = queue->initial_full_flush_preamble_cs; *initial_preamble_cs = queue->initial_preamble_cs; @@ -2711,6 +2722,32 @@ radv_get_preamble_cs(struct radv_queue *queue, tess_rings_bo = queue->tess_rings_bo; } + if (add_gds) { + assert(queue->device->physical_device->rad_info.chip_class >= GFX10); + + /* 4 streamout GDS counters. + * We need 256B (64 dw) of GDS, otherwise streamout hangs. + */ + gds_bo = queue->device->ws->buffer_create(queue->device->ws, + 256, 4, + RADEON_DOMAIN_GDS, + ring_bo_flags, + RADV_BO_PRIORITY_SCRATCH); + if (!gds_bo) + goto fail; + + gds_oa_bo = queue->device->ws->buffer_create(queue->device->ws, + 4, 1, + RADEON_DOMAIN_OA, + ring_bo_flags, + RADV_BO_PRIORITY_SCRATCH); + if (!gds_oa_bo) + goto fail; + } else { + gds_bo = queue->gds_bo; + gds_oa_bo = queue->gds_oa_bo; + } + if (scratch_bo != queue->scratch_bo || esgs_ring_bo != queue->esgs_ring_bo || gsvs_ring_bo != queue->gsvs_ring_bo || @@ -2801,6 +2838,11 @@ radv_get_preamble_cs(struct radv_queue *queue, radv_emit_global_shader_pointers(queue, cs, descriptor_bo); radv_emit_compute_scratch(queue, cs, compute_scratch_bo); + if (gds_bo) + radv_cs_add_buffer(queue->device->ws, cs, gds_bo); + if (gds_oa_bo) + radv_cs_add_buffer(queue->device->ws, cs, gds_oa_bo); + if (i == 0) { si_cs_emit_cache_flush(cs, queue->device->physical_device->rad_info.chip_class, @@ -2876,6 +2918,14 @@ radv_get_preamble_cs(struct radv_queue *queue, queue->has_tess_rings = true; } + if (gds_bo != queue->gds_bo) { + queue->gds_bo = gds_bo; + queue->has_gds = true; + } + + if (gds_oa_bo != queue->gds_oa_bo) + queue->gds_oa_bo = gds_oa_bo; + if (descriptor_bo != queue->descriptor_bo) { if (queue->descriptor_bo) queue->device->ws->buffer_destroy(queue->descriptor_bo); @@ -2908,6 +2958,11 @@ fail: queue->device->ws->buffer_destroy(gsvs_ring_bo); if (tess_rings_bo && tess_rings_bo != queue->tess_rings_bo) queue->device->ws->buffer_destroy(tess_rings_bo); + if (gds_bo && gds_bo != queue->gds_bo) + queue->device->ws->buffer_destroy(gds_bo); + if (gds_oa_bo && gds_oa_bo != queue->gds_oa_bo) + queue->device->ws->buffer_destroy(gds_oa_bo); + return vk_error(queue->device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY); } @@ -3070,6 +3125,7 @@ VkResult radv_QueueSubmit( VkResult result; bool fence_emitted = false; bool tess_rings_needed = false; + bool gds_needed = false; bool sample_positions_needed = false; /* Do this first so failing to allocate scratch buffers can't result in @@ -3085,14 +3141,16 @@ VkResult radv_QueueSubmit( esgs_ring_size = MAX2(esgs_ring_size, cmd_buffer->esgs_ring_size_needed); gsvs_ring_size = MAX2(gsvs_ring_size, cmd_buffer->gsvs_ring_size_needed); tess_rings_needed |= cmd_buffer->tess_rings_needed; + gds_needed |= cmd_buffer->gds_needed; sample_positions_needed |= cmd_buffer->sample_positions_needed; } } result = radv_get_preamble_cs(queue, scratch_size, compute_scratch_size, esgs_ring_size, gsvs_ring_size, tess_rings_needed, - sample_positions_needed, &initial_flush_preamble_cs, - &initial_preamble_cs, &continue_preamble_cs); + gds_needed, sample_positions_needed, + &initial_flush_preamble_cs, + &initial_preamble_cs, &continue_preamble_cs); if (result != VK_SUCCESS) return result; diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h index 8b612155621..03dc9e02145 100644 --- a/src/amd/vulkan/radv_private.h +++ b/src/amd/vulkan/radv_private.h @@ -663,6 +663,7 @@ struct radv_queue { uint32_t esgs_ring_size; uint32_t gsvs_ring_size; bool has_tess_rings; + bool has_gds; bool has_sample_positions; struct radeon_winsys_bo *scratch_bo; @@ -671,6 +672,8 @@ struct radv_queue { struct radeon_winsys_bo *esgs_ring_bo; struct radeon_winsys_bo *gsvs_ring_bo; struct radeon_winsys_bo *tess_rings_bo; + struct radeon_winsys_bo *gds_bo; + struct radeon_winsys_bo *gds_oa_bo; struct radeon_cmdbuf *initial_preamble_cs; struct radeon_cmdbuf *initial_full_flush_preamble_cs; struct radeon_cmdbuf *continue_preamble_cs; @@ -1223,6 +1226,7 @@ struct radv_cmd_buffer { uint32_t esgs_ring_size_needed; uint32_t gsvs_ring_size_needed; bool tess_rings_needed; + bool gds_needed; /* for GFX10 streamout */ bool sample_positions_needed; VkResult record_result; -- 2.30.2