From 46e52df34d3074f1fc649195dded461bcb64a231 Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Thu, 30 Mar 2017 08:02:14 +0100 Subject: [PATCH] radv: add tessellation ring allocation support. (v2) This patch adds support for the offchip rings for storing tessellation factors and attribute data. It includes the register setup for the TF ring. v2: always do tess ring size calcs (Bas) Reviewed-by: Bas Nieuwenhuizen Signed-off-by: Dave Airlie --- src/amd/vulkan/radv_cmd_buffer.c | 6 + src/amd/vulkan/radv_device.c | 206 +++++++++++++++++++++++++++++-- src/amd/vulkan/radv_private.h | 4 + 3 files changed, 203 insertions(+), 13 deletions(-) diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c index e0667047a9d..7d568e8c352 100644 --- a/src/amd/vulkan/radv_cmd_buffer.c +++ b/src/amd/vulkan/radv_cmd_buffer.c @@ -221,6 +221,7 @@ static void radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer) cmd_buffer->compute_scratch_size_needed = 0; cmd_buffer->esgs_ring_size_needed = 0; cmd_buffer->gsvs_ring_size_needed = 0; + cmd_buffer->tess_rings_needed = false; if (cmd_buffer->upload.upload_bo) cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs, @@ -1903,6 +1904,9 @@ void radv_CmdBindPipeline( if (pipeline->graphics.gsvs_ring_size > cmd_buffer->gsvs_ring_size_needed) cmd_buffer->gsvs_ring_size_needed = pipeline->graphics.gsvs_ring_size; + if (radv_pipeline_has_tess(pipeline)) + cmd_buffer->tess_rings_needed = true; + if (radv_pipeline_has_gs(pipeline)) { struct ac_userdata_info *loc = radv_lookup_user_sgpr(cmd_buffer->state.pipeline, MESA_SHADER_GEOMETRY, AC_UD_SCRATCH_RING_OFFSETS); @@ -2070,6 +2074,8 @@ void radv_CmdExecuteCommands( primary->esgs_ring_size_needed = secondary->esgs_ring_size_needed; if (secondary->gsvs_ring_size_needed > primary->gsvs_ring_size_needed) primary->gsvs_ring_size_needed = secondary->gsvs_ring_size_needed; + if (secondary->tess_rings_needed) + primary->tess_rings_needed = true; if (secondary->ring_offsets_idx != -1) { if 
(primary->ring_offsets_idx == -1) diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c index fe531e1072f..4d685646a62 100644 --- a/src/amd/vulkan/radv_device.c +++ b/src/amd/vulkan/radv_device.c @@ -845,6 +845,10 @@ radv_queue_finish(struct radv_queue *queue) queue->device->ws->buffer_destroy(queue->esgs_ring_bo); if (queue->gsvs_ring_bo) queue->device->ws->buffer_destroy(queue->gsvs_ring_bo); + if (queue->tess_factor_ring_bo) + queue->device->ws->buffer_destroy(queue->tess_factor_ring_bo); + if (queue->tess_offchip_ring_bo) + queue->device->ws->buffer_destroy(queue->tess_offchip_ring_bo); if (queue->compute_scratch_bo) queue->device->ws->buffer_destroy(queue->compute_scratch_bo); } @@ -1182,20 +1186,29 @@ static void radv_dump_trace(struct radv_device *device, } static void -fill_geom_rings(struct radv_queue *queue, - uint32_t *map, - uint32_t esgs_ring_size, - struct radeon_winsys_bo *esgs_ring_bo, - uint32_t gsvs_ring_size, - struct radeon_winsys_bo *gsvs_ring_bo) +fill_geom_tess_rings(struct radv_queue *queue, + uint32_t *map, + uint32_t esgs_ring_size, + struct radeon_winsys_bo *esgs_ring_bo, + uint32_t gsvs_ring_size, + struct radeon_winsys_bo *gsvs_ring_bo, + uint32_t tess_factor_ring_size, + struct radeon_winsys_bo *tess_factor_ring_bo, + uint32_t tess_offchip_ring_size, + struct radeon_winsys_bo *tess_offchip_ring_bo) { uint64_t esgs_va = 0, gsvs_va = 0; + uint64_t tess_factor_va = 0, tess_offchip_va = 0; uint32_t *desc = &map[4]; if (esgs_ring_bo) esgs_va = queue->device->ws->buffer_get_va(esgs_ring_bo); if (gsvs_ring_bo) gsvs_va = queue->device->ws->buffer_get_va(gsvs_ring_bo); + if (tess_factor_ring_bo) + tess_factor_va = queue->device->ws->buffer_get_va(tess_factor_ring_bo); + if (tess_offchip_ring_bo) + tess_offchip_va = queue->device->ws->buffer_get_va(tess_offchip_ring_bo); /* stride 0, num records - size, add tid, swizzle, elsize4, index stride 64 */ @@ -1270,6 +1283,88 @@ fill_geom_rings(struct radv_queue *queue, 
S_008F0C_ELEMENT_SIZE(1) | S_008F0C_INDEX_STRIDE(1) | S_008F0C_ADD_TID_ENABLE(true); + desc += 4; + + desc[0] = tess_factor_va; + desc[1] = S_008F04_BASE_ADDRESS_HI(tess_factor_va >> 32) | + S_008F04_STRIDE(0) | + S_008F04_SWIZZLE_ENABLE(false); + desc[2] = tess_factor_ring_size; + desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | + S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | + S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | + S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) | + S_008F0C_ELEMENT_SIZE(0) | + S_008F0C_INDEX_STRIDE(0) | + S_008F0C_ADD_TID_ENABLE(false); + desc += 4; + + desc[0] = tess_offchip_va; + desc[1] = S_008F04_BASE_ADDRESS_HI(tess_offchip_va >> 32) | + S_008F04_STRIDE(0) | + S_008F04_SWIZZLE_ENABLE(false); + desc[2] = tess_offchip_ring_size; + desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | + S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | + S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | + S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) | + S_008F0C_ELEMENT_SIZE(0) | + S_008F0C_INDEX_STRIDE(0) | + S_008F0C_ADD_TID_ENABLE(false); +} + +static unsigned +radv_get_hs_offchip_param(struct radv_device *device, uint32_t *max_offchip_buffers_p) +{ + bool double_offchip_buffers = device->physical_device->rad_info.chip_class >= CIK && + device->physical_device->rad_info.family != CHIP_CARRIZO && + device->physical_device->rad_info.family != CHIP_STONEY; + unsigned max_offchip_buffers_per_se = double_offchip_buffers ? 
128 : 64; + unsigned max_offchip_buffers = max_offchip_buffers_per_se * + device->physical_device->rad_info.max_se; + unsigned offchip_granularity; + unsigned hs_offchip_param; + switch (device->tess_offchip_block_dw_size) { + default: + assert(0); + /* fall through */ + case 8192: + offchip_granularity = V_03093C_X_8K_DWORDS; + break; + case 4096: + offchip_granularity = V_03093C_X_4K_DWORDS; + break; + } + + switch (device->physical_device->rad_info.chip_class) { + case SI: + max_offchip_buffers = MIN2(max_offchip_buffers, 126); + break; + case CIK: + max_offchip_buffers = MIN2(max_offchip_buffers, 508); + break; + case VI: + default: + max_offchip_buffers = MIN2(max_offchip_buffers, 512); + break; + } + + *max_offchip_buffers_p = max_offchip_buffers; + if (device->physical_device->rad_info.chip_class >= CIK) { + if (device->physical_device->rad_info.chip_class >= VI) + --max_offchip_buffers; + hs_offchip_param = + S_03093C_OFFCHIP_BUFFERING(max_offchip_buffers) | + S_03093C_OFFCHIP_GRANULARITY(offchip_granularity); + } else { + hs_offchip_param = + S_0089B0_OFFCHIP_BUFFERING(max_offchip_buffers); + } + return hs_offchip_param; } static VkResult @@ -1278,6 +1373,7 @@ radv_get_preamble_cs(struct radv_queue *queue, uint32_t compute_scratch_size, uint32_t esgs_ring_size, uint32_t gsvs_ring_size, + bool needs_tess_rings, struct radeon_winsys_cs **initial_preamble_cs, struct radeon_winsys_cs **continue_preamble_cs) { @@ -1286,12 +1382,28 @@ radv_get_preamble_cs(struct radv_queue *queue, struct radeon_winsys_bo *compute_scratch_bo = NULL; struct radeon_winsys_bo *esgs_ring_bo = NULL; struct radeon_winsys_bo *gsvs_ring_bo = NULL; + struct radeon_winsys_bo *tess_factor_ring_bo = NULL; + struct radeon_winsys_bo *tess_offchip_ring_bo = NULL; struct radeon_winsys_cs *dest_cs[2] = {0}; + bool add_tess_rings = false; + unsigned tess_factor_ring_size = 0, tess_offchip_ring_size = 0; + unsigned max_offchip_buffers; + unsigned hs_offchip_param = 0; + if (!queue->has_tess_rings) 
{ + if (needs_tess_rings) + add_tess_rings = true; + } + tess_factor_ring_size = 32768 * queue->device->physical_device->rad_info.max_se; + hs_offchip_param = radv_get_hs_offchip_param(queue->device, + &max_offchip_buffers); + tess_offchip_ring_size = max_offchip_buffers * + queue->device->tess_offchip_block_dw_size * 4; if (scratch_size <= queue->scratch_size && compute_scratch_size <= queue->compute_scratch_size && esgs_ring_size <= queue->esgs_ring_size && gsvs_ring_size <= queue->gsvs_ring_size && + !add_tess_rings && queue->initial_preamble_cs) { *initial_preamble_cs = queue->initial_preamble_cs; *continue_preamble_cs = queue->continue_preamble_cs; @@ -1349,12 +1461,35 @@ radv_get_preamble_cs(struct radv_queue *queue, gsvs_ring_size = queue->gsvs_ring_size; } + if (add_tess_rings) { + tess_factor_ring_bo = queue->device->ws->buffer_create(queue->device->ws, + tess_factor_ring_size, + 256, + RADEON_DOMAIN_VRAM, + RADEON_FLAG_NO_CPU_ACCESS); + if (!tess_factor_ring_bo) + goto fail; + tess_offchip_ring_bo = queue->device->ws->buffer_create(queue->device->ws, + tess_offchip_ring_size, + 256, + RADEON_DOMAIN_VRAM, + RADEON_FLAG_NO_CPU_ACCESS); + if (!tess_offchip_ring_bo) + goto fail; + } else { + tess_factor_ring_bo = queue->tess_factor_ring_bo; + tess_offchip_ring_bo = queue->tess_offchip_ring_bo; + } + if (scratch_bo != queue->scratch_bo || esgs_ring_bo != queue->esgs_ring_bo || - gsvs_ring_bo != queue->gsvs_ring_bo) { + gsvs_ring_bo != queue->gsvs_ring_bo || + tess_factor_ring_bo != queue->tess_factor_ring_bo || + tess_offchip_ring_bo != queue->tess_offchip_ring_bo) { uint32_t size = 0; - if (gsvs_ring_bo || esgs_ring_bo) - size = 80; /* 2 dword + 2 padding + 4 dword * 4 */ + if (gsvs_ring_bo || esgs_ring_bo || + tess_factor_ring_bo || tess_offchip_ring_bo) + size = 112; /* 2 dword + 2 padding + 4 dword * 6 */ else if (scratch_bo) size = 8; /* 2 dword */ @@ -1386,6 +1521,12 @@ radv_get_preamble_cs(struct radv_queue *queue, if (gsvs_ring_bo) 
queue->device->ws->cs_add_buffer(cs, gsvs_ring_bo, 8); + if (tess_factor_ring_bo) + queue->device->ws->cs_add_buffer(cs, tess_factor_ring_bo, 8); + + if (tess_offchip_ring_bo) + queue->device->ws->cs_add_buffer(cs, tess_offchip_ring_bo, 8); + if (descriptor_bo) queue->device->ws->cs_add_buffer(cs, descriptor_bo, 8); @@ -1400,18 +1541,24 @@ radv_get_preamble_cs(struct radv_queue *queue, map[1] = rsrc1; } - if (esgs_ring_bo || gsvs_ring_bo) - fill_geom_rings(queue, map, esgs_ring_size, esgs_ring_bo, gsvs_ring_size, gsvs_ring_bo); + if (esgs_ring_bo || gsvs_ring_bo || tess_factor_ring_bo || tess_offchip_ring_bo) + fill_geom_tess_rings(queue, map, + esgs_ring_size, esgs_ring_bo, + gsvs_ring_size, gsvs_ring_bo, + tess_factor_ring_size, tess_factor_ring_bo, + tess_offchip_ring_size, tess_offchip_ring_bo); queue->device->ws->buffer_unmap(descriptor_bo); } - if (esgs_ring_bo || gsvs_ring_bo) { + if (esgs_ring_bo || gsvs_ring_bo || tess_factor_ring_bo || tess_offchip_ring_bo) { radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4)); radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0)); + } + if (esgs_ring_bo || gsvs_ring_bo) { if (queue->device->physical_device->rad_info.chip_class >= CIK) { radeon_set_uconfig_reg_seq(cs, R_030900_VGT_ESGS_RING_SIZE, 2); radeon_emit(cs, esgs_ring_size >> 8); @@ -1423,6 +1570,24 @@ radv_get_preamble_cs(struct radv_queue *queue, } } + if (tess_factor_ring_bo) { + uint64_t tf_va = queue->device->ws->buffer_get_va(tess_factor_ring_bo); + if (queue->device->physical_device->rad_info.chip_class >= CIK) { + radeon_set_uconfig_reg(cs, R_030938_VGT_TF_RING_SIZE, + S_030938_SIZE(tess_factor_ring_size / 4)); + radeon_set_uconfig_reg(cs, R_030940_VGT_TF_MEMORY_BASE, + tf_va >> 8); + radeon_set_uconfig_reg(cs, R_03093C_VGT_HS_OFFCHIP_PARAM, hs_offchip_param); + } else { + radeon_set_config_reg(cs, R_008988_VGT_TF_RING_SIZE, + 
S_008988_SIZE(tess_factor_ring_size / 4)); + radeon_set_config_reg(cs, R_0089B8_VGT_TF_MEMORY_BASE, + tf_va >> 8); + radeon_set_config_reg(cs, R_0089B0_VGT_HS_OFFCHIP_PARAM, + hs_offchip_param); + } + } + if (descriptor_bo) { uint32_t regs[] = {R_00B030_SPI_SHADER_USER_DATA_PS_0, R_00B130_SPI_SHADER_USER_DATA_VS_0, @@ -1504,6 +1669,15 @@ radv_get_preamble_cs(struct radv_queue *queue, queue->gsvs_ring_size = gsvs_ring_size; } + if (tess_factor_ring_bo != queue->tess_factor_ring_bo) { + queue->tess_factor_ring_bo = tess_factor_ring_bo; + } + + if (tess_offchip_ring_bo != queue->tess_offchip_ring_bo) { + queue->tess_offchip_ring_bo = tess_offchip_ring_bo; + queue->has_tess_rings = true; + } + if (descriptor_bo != queue->descriptor_bo) { if (queue->descriptor_bo) queue->device->ws->buffer_destroy(queue->descriptor_bo); @@ -1530,6 +1704,10 @@ fail: queue->device->ws->buffer_destroy(esgs_ring_bo); if (gsvs_ring_bo && gsvs_ring_bo != queue->gsvs_ring_bo) queue->device->ws->buffer_destroy(gsvs_ring_bo); + if (tess_factor_ring_bo && tess_factor_ring_bo != queue->tess_factor_ring_bo) + queue->device->ws->buffer_destroy(tess_factor_ring_bo); + if (tess_offchip_ring_bo && tess_offchip_ring_bo != queue->tess_offchip_ring_bo) + queue->device->ws->buffer_destroy(tess_offchip_ring_bo); return VK_ERROR_OUT_OF_DEVICE_MEMORY; } @@ -1551,6 +1729,7 @@ VkResult radv_QueueSubmit( struct radeon_winsys_cs *initial_preamble_cs = NULL, *continue_preamble_cs = NULL; VkResult result; bool fence_emitted = false; + bool tess_rings_needed = false; /* Do this first so failing to allocate scratch buffers can't result in * partially executed submissions. 
*/ @@ -1564,11 +1743,12 @@ VkResult radv_QueueSubmit( cmd_buffer->compute_scratch_size_needed); esgs_ring_size = MAX2(esgs_ring_size, cmd_buffer->esgs_ring_size_needed); gsvs_ring_size = MAX2(gsvs_ring_size, cmd_buffer->gsvs_ring_size_needed); + tess_rings_needed |= cmd_buffer->tess_rings_needed; } } result = radv_get_preamble_cs(queue, scratch_size, compute_scratch_size, - esgs_ring_size, gsvs_ring_size, + esgs_ring_size, gsvs_ring_size, tess_rings_needed, &initial_preamble_cs, &continue_preamble_cs); if (result != VK_SUCCESS) return result; diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h index 3c246641af7..d6982d826aa 100644 --- a/src/amd/vulkan/radv_private.h +++ b/src/amd/vulkan/radv_private.h @@ -459,12 +459,15 @@ struct radv_queue { uint32_t compute_scratch_size; uint32_t esgs_ring_size; uint32_t gsvs_ring_size; + bool has_tess_rings; struct radeon_winsys_bo *scratch_bo; struct radeon_winsys_bo *descriptor_bo; struct radeon_winsys_bo *compute_scratch_bo; struct radeon_winsys_bo *esgs_ring_bo; struct radeon_winsys_bo *gsvs_ring_bo; + struct radeon_winsys_bo *tess_factor_ring_bo; + struct radeon_winsys_bo *tess_offchip_ring_bo; struct radeon_winsys_cs *initial_preamble_cs; struct radeon_winsys_cs *continue_preamble_cs; }; @@ -744,6 +747,7 @@ struct radv_cmd_buffer { uint32_t compute_scratch_size_needed; uint32_t esgs_ring_size_needed; uint32_t gsvs_ring_size_needed; + bool tess_rings_needed; int ring_offsets_idx; /* just used for verification */ }; -- 2.30.2