From 1fa5b755c23f97d6a92465b08d460bb4406e44dd Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Fri, 20 Jan 2017 11:06:52 +1000 Subject: [PATCH] radv: emit geometry ring size and pointers via preamble (v2) This uses the scratch infrastructure to handle the esgs and gsvs rings. (this replaces the old code that did this with patching). v2: fix correct ring sizes, reset sizes (Bas) Reviewed-by: Bas Nieuwenhuizen Signed-off-by: Dave Airlie --- src/amd/vulkan/radv_cmd_buffer.c | 31 +++++ src/amd/vulkan/radv_device.c | 202 +++++++++++++++++++++++++++++-- src/amd/vulkan/radv_private.h | 8 ++ 3 files changed, 230 insertions(+), 11 deletions(-) diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c index 3d1542d4663..9bc50ad0929 100644 --- a/src/amd/vulkan/radv_cmd_buffer.c +++ b/src/amd/vulkan/radv_cmd_buffer.c @@ -1457,12 +1457,17 @@ static void radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer) cmd_buffer->scratch_size_needed = 0; cmd_buffer->compute_scratch_size_needed = 0; + cmd_buffer->esgs_ring_size_needed = 0; + cmd_buffer->gsvs_ring_size_needed = 0; + if (cmd_buffer->upload.upload_bo) cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs, cmd_buffer->upload.upload_bo, 8); cmd_buffer->upload.offset = 0; cmd_buffer->record_fail = false; + + cmd_buffer->ring_offsets_idx = -1; } VkResult radv_ResetCommandBuffer( @@ -1649,6 +1654,7 @@ VkResult radv_EndCommandBuffer( if (cmd_buffer->queue_family_index != RADV_QUEUE_TRANSFER) si_emit_cache_flush(cmd_buffer); + if (!cmd_buffer->device->ws->cs_finalize(cmd_buffer->cs) || cmd_buffer->record_fail) return VK_ERROR_OUT_OF_DEVICE_MEMORY; @@ -1735,6 +1741,20 @@ void radv_CmdBindPipeline( radv_dynamic_state_copy(&cmd_buffer->state.dynamic, &pipeline->dynamic_state, pipeline->dynamic_state_mask); + + if (pipeline->graphics.esgs_ring_size > cmd_buffer->esgs_ring_size_needed) + cmd_buffer->esgs_ring_size_needed = pipeline->graphics.esgs_ring_size; + if (pipeline->graphics.gsvs_ring_size > cmd_buffer->gsvs_ring_size_needed) + cmd_buffer->gsvs_ring_size_needed = pipeline->graphics.gsvs_ring_size; + + if (radv_pipeline_has_gs(pipeline)) { + struct ac_userdata_info *loc = radv_lookup_user_sgpr(cmd_buffer->state.pipeline, MESA_SHADER_GEOMETRY, + AC_UD_SCRATCH_RING_OFFSETS); + if (cmd_buffer->ring_offsets_idx == -1) + cmd_buffer->ring_offsets_idx = loc->sgpr_idx; + else if (loc->sgpr_idx != -1) + assert(loc->sgpr_idx != cmd_buffer->ring_offsets_idx); + } break; default: assert(!"invalid bind point"); @@ -1887,6 +1907,17 @@ void radv_CmdExecuteCommands( primary->compute_scratch_size_needed = MAX2(primary->compute_scratch_size_needed, secondary->compute_scratch_size_needed); + if (secondary->esgs_ring_size_needed > primary->esgs_ring_size_needed) + primary->esgs_ring_size_needed = secondary->esgs_ring_size_needed; + if (secondary->gsvs_ring_size_needed > primary->gsvs_ring_size_needed) + primary->gsvs_ring_size_needed = secondary->gsvs_ring_size_needed; + + if (secondary->ring_offsets_idx != -1) { + if (primary->ring_offsets_idx == -1) + primary->ring_offsets_idx = secondary->ring_offsets_idx; + else + assert(secondary->ring_offsets_idx == primary->ring_offsets_idx); + } primary->device->ws->cs_execute_secondary(primary->cs, secondary->cs); } diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c index b7978bb16a2..984bd75bb80 100644 --- a/src/amd/vulkan/radv_device.c +++ b/src/amd/vulkan/radv_device.c @@ -764,6 +764,10 @@ radv_queue_finish(struct radv_queue *queue) queue->device->ws->buffer_destroy(queue->descriptor_bo); if (queue->scratch_bo) queue->device->ws->buffer_destroy(queue->scratch_bo); + if (queue->esgs_ring_bo) + queue->device->ws->buffer_destroy(queue->esgs_ring_bo); + if (queue->gsvs_ring_bo) + queue->device->ws->buffer_destroy(queue->gsvs_ring_bo); if (queue->compute_scratch_bo) queue->device->ws->buffer_destroy(queue->compute_scratch_bo); } @@ -1046,24 +1050,118 @@ static void radv_dump_trace(struct radv_device *device, fclose(f); } +static void +fill_geom_rings(struct radv_queue *queue, + uint32_t *map, + uint32_t esgs_ring_size, + struct radeon_winsys_bo *esgs_ring_bo, + uint32_t gsvs_ring_size, + struct radeon_winsys_bo *gsvs_ring_bo) +{ + uint64_t esgs_va, gsvs_va; + esgs_va = queue->device->ws->buffer_get_va(esgs_ring_bo); + gsvs_va = queue->device->ws->buffer_get_va(gsvs_ring_bo); + uint32_t *desc = &map[4]; + + /* stride 0, num records - size, add tid, swizzle, elsize4, + index stride 64 */ + desc[0] = esgs_va; + desc[1] = S_008F04_BASE_ADDRESS_HI(esgs_va >> 32) | + S_008F04_STRIDE(0) | + S_008F04_SWIZZLE_ENABLE(true); + desc[2] = esgs_ring_size; + desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | + S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | + S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | + S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) | + S_008F0C_ELEMENT_SIZE(1) | + S_008F0C_INDEX_STRIDE(3) | + S_008F0C_ADD_TID_ENABLE(true); + + desc += 4; + /* GS entry for ES->GS ring */ + /* stride 0, num records - size, elsize0, + index stride 0 */ + desc[0] = esgs_va; + desc[1] = S_008F04_BASE_ADDRESS_HI(esgs_va >> 32)| + S_008F04_STRIDE(0) | + S_008F04_SWIZZLE_ENABLE(false); + desc[2] = esgs_ring_size; + desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | + S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | + S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | + S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) | + S_008F0C_ELEMENT_SIZE(0) | + S_008F0C_INDEX_STRIDE(0) | + S_008F0C_ADD_TID_ENABLE(false); + + desc += 4; + /* VS entry for GS->VS ring */ + /* stride 0, num records - size, elsize0, + index stride 0 */ + desc[0] = gsvs_va; + desc[1] = S_008F04_BASE_ADDRESS_HI(gsvs_va >> 32)| + S_008F04_STRIDE(0) | + S_008F04_SWIZZLE_ENABLE(false); + desc[2] = gsvs_ring_size; + desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | + S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | + S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | + S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) | + S_008F0C_ELEMENT_SIZE(0) | + S_008F0C_INDEX_STRIDE(0) | + S_008F0C_ADD_TID_ENABLE(false); + desc += 4; + + /* stride gsvs_itemsize, num records 64 + elsize 4, index stride 16 */ + /* shader will patch stride and desc[2] */ + desc[0] = gsvs_va; + desc[1] = S_008F04_BASE_ADDRESS_HI(gsvs_va >> 32)| + S_008F04_STRIDE(0) | + S_008F04_SWIZZLE_ENABLE(true); + desc[2] = 0; + desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | + S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | + S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | + S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) | + S_008F0C_ELEMENT_SIZE(1) | + S_008F0C_INDEX_STRIDE(1) | + S_008F0C_ADD_TID_ENABLE(true); +} + static VkResult radv_get_preamble_cs(struct radv_queue *queue, uint32_t scratch_size, uint32_t compute_scratch_size, + uint32_t esgs_ring_size, + uint32_t gsvs_ring_size, struct radeon_winsys_cs **preamble_cs) { struct radeon_winsys_bo *scratch_bo = NULL; struct radeon_winsys_bo *descriptor_bo = NULL; struct radeon_winsys_bo *compute_scratch_bo = NULL; + struct radeon_winsys_bo *esgs_ring_bo = NULL; + struct radeon_winsys_bo *gsvs_ring_bo = NULL; struct radeon_winsys_cs *cs = NULL; - if (!scratch_size && !compute_scratch_size) { + if (!scratch_size && !compute_scratch_size && !esgs_ring_size && !gsvs_ring_size) { *preamble_cs = NULL; return VK_SUCCESS; } if (scratch_size <= queue->scratch_size && - compute_scratch_size <= queue->compute_scratch_size) { + compute_scratch_size <= queue->compute_scratch_size && + esgs_ring_size <= queue->esgs_ring_size && + gsvs_ring_size <= queue->gsvs_ring_size) { *preamble_cs = queue->preamble_cs; return VK_SUCCESS; } @@ -1091,9 +1189,43 @@ radv_get_preamble_cs(struct radv_queue *queue, } else compute_scratch_bo = queue->compute_scratch_bo; - if (scratch_bo != queue->scratch_bo) { + if (esgs_ring_size > queue->esgs_ring_size) { + esgs_ring_bo = queue->device->ws->buffer_create(queue->device->ws, + esgs_ring_size, + 4096, + RADEON_DOMAIN_VRAM, + RADEON_FLAG_NO_CPU_ACCESS); + if (!esgs_ring_bo) + goto fail; + } else { + esgs_ring_bo = queue->esgs_ring_bo; + esgs_ring_size = queue->esgs_ring_size; + } + + if (gsvs_ring_size > queue->gsvs_ring_size) { + gsvs_ring_bo = queue->device->ws->buffer_create(queue->device->ws, + gsvs_ring_size, + 4096, + RADEON_DOMAIN_VRAM, + RADEON_FLAG_NO_CPU_ACCESS); + if (!gsvs_ring_bo) + goto fail; + } else { + gsvs_ring_bo = queue->gsvs_ring_bo; + gsvs_ring_size = queue->gsvs_ring_size; + } + + if (scratch_bo != queue->scratch_bo || + esgs_ring_bo != queue->esgs_ring_bo || + gsvs_ring_bo != queue->gsvs_ring_bo) { + uint32_t size = 0; + if (gsvs_ring_bo || esgs_ring_bo) + size = 80; /* 2 dword + 2 padding + 4 dword * 4 */ + else if (scratch_bo) + size = 8; /* 2 dword */ + descriptor_bo = queue->device->ws->buffer_create(queue->device->ws, - 8, + size, 4096, RADEON_DOMAIN_VRAM, RADEON_FLAG_CPU_ACCESS); @@ -1111,22 +1243,49 @@ radv_get_preamble_cs(struct radv_queue *queue, if (scratch_bo) queue->device->ws->cs_add_buffer(cs, scratch_bo, 8); + if (esgs_ring_bo) + queue->device->ws->cs_add_buffer(cs, esgs_ring_bo, 8); + + if (gsvs_ring_bo) + queue->device->ws->cs_add_buffer(cs, gsvs_ring_bo, 8); + if (descriptor_bo) queue->device->ws->cs_add_buffer(cs, descriptor_bo, 8); if (descriptor_bo != queue->descriptor_bo) { - uint64_t scratch_va = queue->device->ws->buffer_get_va(scratch_bo); - uint32_t rsrc1 = S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) | - S_008F04_SWIZZLE_ENABLE(1); - uint32_t *map = (uint32_t*)queue->device->ws->buffer_map(descriptor_bo); - map[0] = scratch_va; - map[1] = rsrc1; + if (scratch_bo) { + uint64_t scratch_va = queue->device->ws->buffer_get_va(scratch_bo); + uint32_t rsrc1 = S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) | + S_008F04_SWIZZLE_ENABLE(1); + map[0] = scratch_va; + map[1] = rsrc1; + } + + if (esgs_ring_bo || gsvs_ring_bo) + fill_geom_rings(queue, map, esgs_ring_size, esgs_ring_bo, gsvs_ring_size, gsvs_ring_bo); queue->device->ws->buffer_unmap(descriptor_bo); } + if (esgs_ring_bo || gsvs_ring_bo) { + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4)); + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0)); + + if (queue->device->physical_device->rad_info.chip_class >= CIK) { + radeon_set_uconfig_reg_seq(cs, R_030900_VGT_ESGS_RING_SIZE, 2); + radeon_emit(cs, esgs_ring_size >> 8); + radeon_emit(cs, gsvs_ring_size >> 8); + } else { + radeon_set_config_reg_seq(cs, R_0088C8_VGT_ESGS_RING_SIZE, 2); + radeon_emit(cs, esgs_ring_size >> 8); + radeon_emit(cs, gsvs_ring_size >> 8); + } + } + if (descriptor_bo) { uint32_t regs[] = {R_00B030_SPI_SHADER_USER_DATA_PS_0, R_00B130_SPI_SHADER_USER_DATA_VS_0, @@ -1178,6 +1337,20 @@ radv_get_preamble_cs(struct radv_queue *queue, queue->compute_scratch_size = compute_scratch_size; } + if (esgs_ring_bo != queue->esgs_ring_bo) { + if (queue->esgs_ring_bo) + queue->device->ws->buffer_destroy(queue->esgs_ring_bo); + queue->esgs_ring_bo = esgs_ring_bo; + queue->esgs_ring_size = esgs_ring_size; + } + + if (gsvs_ring_bo != queue->gsvs_ring_bo) { + if (queue->gsvs_ring_bo) + queue->device->ws->buffer_destroy(queue->gsvs_ring_bo); + queue->gsvs_ring_bo = gsvs_ring_bo; + queue->gsvs_ring_size = gsvs_ring_size; + } + if (descriptor_bo != queue->descriptor_bo) { if (queue->descriptor_bo) queue->device->ws->buffer_destroy(queue->descriptor_bo); @@ -1196,6 +1369,10 @@ fail: queue->device->ws->buffer_destroy(scratch_bo); if (compute_scratch_bo && compute_scratch_bo != queue->compute_scratch_bo) queue->device->ws->buffer_destroy(compute_scratch_bo); + if (esgs_ring_bo && esgs_ring_bo != queue->esgs_ring_bo) + queue->device->ws->buffer_destroy(esgs_ring_bo); + if (gsvs_ring_bo && gsvs_ring_bo != queue->gsvs_ring_bo) + queue->device->ws->buffer_destroy(gsvs_ring_bo); return VK_ERROR_OUT_OF_DEVICE_MEMORY; } @@ -1213,6 +1390,7 @@ VkResult radv_QueueSubmit( uint32_t max_cs_submission = queue->device->trace_bo ? 1 : UINT32_MAX; uint32_t scratch_size = 0; uint32_t compute_scratch_size = 0; + uint32_t esgs_ring_size = 0, gsvs_ring_size = 0; struct radeon_winsys_cs *preamble_cs = NULL; VkResult result; @@ -1226,10 +1404,12 @@ VkResult radv_QueueSubmit( scratch_size = MAX2(scratch_size, cmd_buffer->scratch_size_needed); compute_scratch_size = MAX2(compute_scratch_size, cmd_buffer->compute_scratch_size_needed); + esgs_ring_size = MAX2(esgs_ring_size, cmd_buffer->esgs_ring_size_needed); + gsvs_ring_size = MAX2(gsvs_ring_size, cmd_buffer->gsvs_ring_size_needed); } } - result = radv_get_preamble_cs(queue, scratch_size, compute_scratch_size, &preamble_cs); + result = radv_get_preamble_cs(queue, scratch_size, compute_scratch_size, esgs_ring_size, gsvs_ring_size, &preamble_cs); if (result != VK_SUCCESS) return result; diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h index 3a0318b9fc2..57aa9ead9b7 100644 --- a/src/amd/vulkan/radv_private.h +++ b/src/amd/vulkan/radv_private.h @@ -470,10 +470,14 @@ struct radv_queue { uint32_t scratch_size; uint32_t compute_scratch_size; + uint32_t esgs_ring_size; + uint32_t gsvs_ring_size; struct radeon_winsys_bo *scratch_bo; struct radeon_winsys_bo *descriptor_bo; struct radeon_winsys_bo *compute_scratch_bo; + struct radeon_winsys_bo *esgs_ring_bo; + struct radeon_winsys_bo *gsvs_ring_bo; struct radeon_winsys_cs *preamble_cs; }; @@ -742,6 +746,10 @@ struct radv_cmd_buffer { uint32_t scratch_size_needed; uint32_t compute_scratch_size_needed; + uint32_t esgs_ring_size_needed; + uint32_t gsvs_ring_size_needed; + + int ring_offsets_idx; /* just used for verification */ }; struct radv_image; -- 2.30.2