From f08a80dcd493c64922c04a2563025bddabcac230 Mon Sep 17 00:00:00 2001 From: Brian Ho Date: Fri, 15 May 2020 10:52:43 -0700 Subject: [PATCH] turnip: Allocate tess BOs as a function of draw size To store tess outputs, the HS stg's into two buffers, one for per-vertex/per-patch output variables (tess_param) and one for TessLevelInner/Outer (tess_factor). The addresses of these buffers are uploaded as consts to the HS/DS and the tess_factor iova is written to REG_A6XX_PC_TESSFACTOR_ADDR. While the sizes of these buffers are a function of vertex count and patch count, allocation is relatively straightforward on freedreno: just keep track of the max required buffer size for the entire batch and allocate before batch submit. In Vulkan, however, a given pipeline can be bound multiple times across any number of command buffers, each drawing with a different number of vertices. One solution is to track the max buffer size for the entire command buffer (similar to fd_batch) and on vkEndCommandBuffer, allocate appropriately sized tess BOs. Since the tess BO addresses are emitted as part of the pipeline state setup (e.g. PKT4 to REG_A6XX_PC_TESSFACTOR_ADDR), we would need to create a new state group independent of a specific pipeline and parameterize its IB with the command-buffer-specific tess BO iovas. Without a larger refactor, the simplest way to do this is just to emit per-draw-call consts and leverage scratch_bo to re-use buffers. This way we won't have to store and rewrite earlier packets in the command stream on vkEndCommandBuffer. 
Part-of: --- src/freedreno/vulkan/tu_cmd_buffer.c | 130 +++++++++++++++++++++++++++ src/freedreno/vulkan/tu_pipeline.c | 113 +++++++++++++++++++---- src/freedreno/vulkan/tu_private.h | 10 +++ 3 files changed, 235 insertions(+), 18 deletions(-) diff --git a/src/freedreno/vulkan/tu_cmd_buffer.c b/src/freedreno/vulkan/tu_cmd_buffer.c index 731d361712e..a18f19ea0a0 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.c +++ b/src/freedreno/vulkan/tu_cmd_buffer.c @@ -3027,6 +3027,121 @@ tu6_emit_streamout(struct tu_cmd_buffer *cmd, struct tu_cs *cs) } } +static uint64_t +get_tess_param_bo_size(const struct tu_pipeline *pipeline, + const struct tu_draw_info *draw_info) +{ + /* TODO: For indirect draws, we can't compute the BO size ahead of time. + * Still not sure what to do here, so just allocate a reasonably large + * BO and hope for the best for now. + * (maxTessellationControlPerVertexOutputComponents * 2048 vertices + + * maxTessellationControlPerPatchOutputComponents * 512 patches) */ + if (draw_info->indirect) { + return ((128 * 2048) + (128 * 512)) * 4; + } + + /* For each patch, adreno lays out the tess param BO in memory as: + * (v_input[0][0])...(v_input[i][j])(p_input[0])...(p_input[k]). + * where i = # vertices per patch, j = # per-vertex outputs, and + * k = # per-patch outputs.*/ + uint32_t verts_per_patch = pipeline->ia.primtype - DI_PT_PATCHES0; + uint32_t num_patches = draw_info->count / verts_per_patch; + return draw_info->count * pipeline->tess.per_vertex_output_size + + pipeline->tess.per_patch_output_size * num_patches; +} + +static uint64_t +get_tess_factor_bo_size(const struct tu_pipeline *pipeline, + const struct tu_draw_info *draw_info) +{ + /* TODO: For indirect draws, we can't compute the BO size ahead of time. + * Still not sure what to do here, so just allocate a reasonably large + * BO and hope for the best for now. 
+ * (quad factor stride * 512 patches) */ + if (draw_info->indirect) { + return (28 * 512) * 4; + } + + /* Each distinct patch gets its own tess factor output. */ + uint32_t verts_per_patch = pipeline->ia.primtype - DI_PT_PATCHES0; + uint32_t num_patches = draw_info->count / verts_per_patch; + uint32_t factor_stride; + switch (pipeline->tess.patch_type) { + case IR3_TESS_ISOLINES: + factor_stride = 12; + break; + case IR3_TESS_TRIANGLES: + factor_stride = 20; + break; + case IR3_TESS_QUADS: + factor_stride = 28; + break; + default: + unreachable("bad tessmode"); + } + return factor_stride * num_patches; +} + +static VkResult +tu6_emit_tess_consts(struct tu_cmd_buffer *cmd, + const struct tu_draw_info *draw, + const struct tu_pipeline *pipeline, + struct tu_cs_entry *entry) +{ + struct tu_cs cs; + VkResult result = tu_cs_begin_sub_stream(&cmd->sub_cs, 20, &cs); + if (result != VK_SUCCESS) + return result; + + uint64_t tess_factor_size = get_tess_factor_bo_size(pipeline, draw); + uint64_t tess_param_size = get_tess_param_bo_size(pipeline, draw); + uint64_t tess_bo_size = tess_factor_size + tess_param_size; + if (tess_bo_size > 0) { + struct tu_bo *tess_bo; + result = tu_get_scratch_bo(cmd->device, tess_bo_size, &tess_bo); + if (result != VK_SUCCESS) + return result; + + tu_bo_list_add(&cmd->bo_list, tess_bo, + MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE); + uint64_t tess_factor_iova = tess_bo->iova; + uint64_t tess_param_iova = tess_factor_iova + tess_factor_size; + + tu_cs_emit_pkt7(&cs, CP_LOAD_STATE6_GEOM, 3 + 4); + tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(pipeline->tess.hs_bo_regid) | + CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | + CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) | + CP_LOAD_STATE6_0_STATE_BLOCK(SB6_HS_SHADER) | + CP_LOAD_STATE6_0_NUM_UNIT(1)); + tu_cs_emit(&cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0)); + tu_cs_emit(&cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0)); + tu_cs_emit_qw(&cs, tess_param_iova); + tu_cs_emit_qw(&cs, tess_factor_iova); + + tu_cs_emit_pkt7(&cs, 
CP_LOAD_STATE6_GEOM, 3 + 4); + tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(pipeline->tess.ds_bo_regid) | + CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | + CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) | + CP_LOAD_STATE6_0_STATE_BLOCK(SB6_DS_SHADER) | + CP_LOAD_STATE6_0_NUM_UNIT(1)); + tu_cs_emit(&cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0)); + tu_cs_emit(&cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0)); + tu_cs_emit_qw(&cs, tess_param_iova); + tu_cs_emit_qw(&cs, tess_factor_iova); + + tu_cs_emit_pkt4(&cs, REG_A6XX_PC_TESSFACTOR_ADDR_LO, 2); + tu_cs_emit_qw(&cs, tess_factor_iova); + + /* TODO: Without this WFI here, the hardware seems unable to read these + * addresses we just emitted. Freedreno emits these consts as part of + * IB1 instead of in a draw state which might make this WFI unnecessary, + * but it requires a bit more indirection (SS6_INDIRECT for consts). */ + tu_cs_emit_wfi(&cs); + } + *entry = tu_cs_end_sub_stream(&cmd->sub_cs, &cs); + return VK_SUCCESS; +} + static VkResult tu6_bind_draw_states(struct tu_cmd_buffer *cmd, struct tu_cs *cs, @@ -3092,6 +3207,15 @@ tu6_bind_draw_states(struct tu_cmd_buffer *cmd, if (result != VK_SUCCESS) return result; + bool has_tess = + pipeline->active_stages & VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT; + struct tu_cs_entry tess_consts = {}; + if (has_tess) { + result = tu6_emit_tess_consts(cmd, draw, pipeline, &tess_consts); + if (result != VK_SUCCESS) + return result; + } + /* for the first draw in a renderpass, re-emit all the draw states * * and if a draw-state disabling path (CmdClearAttachments 3D fallback) was @@ -3107,6 +3231,7 @@ tu6_bind_draw_states(struct tu_cmd_buffer *cmd, tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_PROGRAM, pipeline->program.state_ib); tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_PROGRAM_BINNING, pipeline->program.binning_state_ib); + tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_TESS, tess_consts); tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_VI, pipeline->vi.state_ib); tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_VI_BINNING, pipeline->vi.binning_state_ib); 
tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_RAST, pipeline->rast.state_ib); @@ -3132,6 +3257,7 @@ tu6_bind_draw_states(struct tu_cmd_buffer *cmd, * note we eventually don't want to have to emit anything here */ uint32_t draw_state_count = + has_tess + ((cmd->state.dirty & TU_CMD_DIRTY_SHADER_CONSTS) ? 3 : 0) + ((cmd->state.dirty & TU_CMD_DIRTY_DESCRIPTOR_SETS) ? 1 : 0) + ((cmd->state.dirty & TU_CMD_DIRTY_VERTEX_BUFFERS) ? 1 : 0) + @@ -3139,6 +3265,10 @@ tu6_bind_draw_states(struct tu_cmd_buffer *cmd, tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * draw_state_count); + /* We may need to re-emit tess consts if the current draw call is + * sufficiently larger than the last draw call. */ + if (has_tess) + tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_TESS, tess_consts); if (cmd->state.dirty & TU_CMD_DIRTY_SHADER_CONSTS) { tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_VS_CONST, cmd->state.shader_const_ib[MESA_SHADER_VERTEX]); tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_GS_CONST, cmd->state.shader_const_ib[MESA_SHADER_GEOMETRY]); diff --git a/src/freedreno/vulkan/tu_pipeline.c b/src/freedreno/vulkan/tu_pipeline.c index e6442575af5..5e21b6031fa 100644 --- a/src/freedreno/vulkan/tu_pipeline.c +++ b/src/freedreno/vulkan/tu_pipeline.c @@ -663,8 +663,8 @@ tu6_emit_link_map(struct tu_cs *cs, if (size <= 0) return; - tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, base, SB6_GS_SHADER, 0, size, - patch_locs); + tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, base, SB6_GS_SHADER, 0, + size, patch_locs); } static uint16_t @@ -1129,24 +1129,65 @@ tu6_emit_fs_outputs(struct tu_cs *cs, } static void -tu6_emit_geometry_consts(struct tu_cs *cs, - const struct ir3_shader_variant *vs, - const struct ir3_shader_variant *gs) { - unsigned num_vertices = gs->shader->nir->info.gs.vertices_in; - - uint32_t params[4] = { - vs->output_size * num_vertices * 4, /* primitive stride */ - vs->output_size * 4, /* vertex stride */ +tu6_emit_geom_tess_consts(struct tu_cs *cs, + const struct ir3_shader_variant *vs, + const struct ir3_shader_variant *hs, + 
const struct ir3_shader_variant *ds, + const struct ir3_shader_variant *gs, + uint32_t cps_per_patch) +{ + uint32_t num_vertices = + hs ? cps_per_patch : gs->shader->nir->info.gs.vertices_in; + + uint32_t vs_params[4] = { + vs->output_size * num_vertices * 4, /* vs primitive stride */ + vs->output_size * 4, /* vs vertex stride */ 0, 0, }; uint32_t vs_base = ir3_const_state(vs)->offsets.primitive_param; tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, vs_base, SB6_VS_SHADER, 0, - ARRAY_SIZE(params), params); + ARRAY_SIZE(vs_params), vs_params); + + if (hs) { + assert(ds->type != MESA_SHADER_NONE); + uint32_t hs_params[4] = { + vs->output_size * num_vertices * 4, /* hs primitive stride */ + vs->output_size * 4, /* hs vertex stride */ + hs->output_size, + cps_per_patch, + }; + + uint32_t hs_base = hs->const_state->offsets.primitive_param; + tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, hs_base, SB6_HS_SHADER, 0, + ARRAY_SIZE(hs_params), hs_params); + if (gs) + num_vertices = gs->shader->nir->info.gs.vertices_in; + + uint32_t ds_params[4] = { + ds->output_size * num_vertices * 4, /* ds primitive stride */ + ds->output_size * 4, /* ds vertex stride */ + hs->output_size, /* hs vertex stride (dwords) */ + hs->shader->nir->info.tess.tcs_vertices_out + }; + + uint32_t ds_base = ds->const_state->offsets.primitive_param; + tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, ds_base, SB6_DS_SHADER, 0, + ARRAY_SIZE(ds_params), ds_params); + } - uint32_t gs_base = ir3_const_state(gs)->offsets.primitive_param; - tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, gs_base, SB6_GS_SHADER, 0, - ARRAY_SIZE(params), params); + if (gs) { + const struct ir3_shader_variant *prev = ds ? 
ds : vs; + uint32_t gs_params[4] = { + prev->output_size * num_vertices * 4, /* gs primitive stride */ + prev->output_size * 4, /* gs vertex stride */ + 0, + 0, + }; + uint32_t gs_base = gs->const_state->offsets.primitive_param; + tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, gs_base, SB6_GS_SHADER, 0, + ARRAY_SIZE(gs_params), gs_params); + } } static void @@ -1158,6 +1199,8 @@ tu6_emit_program(struct tu_cs *cs, { const struct ir3_shader_variant *vs = builder->variants[MESA_SHADER_VERTEX]; const struct ir3_shader_variant *bs = builder->binning_variant; + const struct ir3_shader_variant *hs = builder->variants[MESA_SHADER_TESS_CTRL]; + const struct ir3_shader_variant *ds = builder->variants[MESA_SHADER_TESS_EVAL]; const struct ir3_shader_variant *gs = builder->variants[MESA_SHADER_GEOMETRY]; const struct ir3_shader_variant *fs = builder->variants[MESA_SHADER_FRAGMENT]; gl_shader_stage stage = MESA_SHADER_VERTEX; @@ -1207,8 +1250,11 @@ tu6_emit_program(struct tu_cs *cs, builder->render_components); } - if (gs) - tu6_emit_geometry_consts(cs, vs, gs); + if (gs || hs) { + uint32_t cps_per_patch = builder->create_info->pTessellationState ? 
+ builder->create_info->pTessellationState->patchControlPoints : 0; + tu6_emit_geom_tess_consts(cs, vs, hs, ds, gs, cps_per_patch); + } } static void @@ -1695,7 +1741,8 @@ tu6_get_tessmode(struct tu_shader* shader) } static VkResult -tu_pipeline_builder_compile_shaders(struct tu_pipeline_builder *builder) +tu_pipeline_builder_compile_shaders(struct tu_pipeline_builder *builder, + struct tu_pipeline *pipeline) { const VkPipelineShaderStageCreateInfo *stage_infos[MESA_SHADER_STAGES] = { NULL @@ -1732,6 +1779,8 @@ tu_pipeline_builder_compile_shaders(struct tu_pipeline_builder *builder) builder->shaders[stage] = shader; } + pipeline->tess.patch_type = key.tessellation; + for (gl_shader_stage stage = MESA_SHADER_STAGES - 1; stage > MESA_SHADER_NONE; stage--) { if (!builder->shaders[stage]) @@ -1767,6 +1816,30 @@ tu_pipeline_builder_compile_shaders(struct tu_pipeline_builder *builder) sizeof(uint32_t) * variant->info.sizedwords; builder->binning_variant = variant; + if (builder->shaders[MESA_SHADER_TESS_CTRL]) { + struct ir3_shader *hs = + builder->shaders[MESA_SHADER_TESS_CTRL]->ir3_shader; + assert(hs->type != MESA_SHADER_NONE); + + /* Calculate and store the per-vertex and per-patch HS-output sizes. 
*/ + uint32_t per_vertex_output_size = 0; + uint32_t per_patch_output_size = 0; + nir_foreach_variable (output, &hs->nir->outputs) { + switch (output->data.location) { + case VARYING_SLOT_TESS_LEVEL_OUTER: + case VARYING_SLOT_TESS_LEVEL_INNER: + continue; + } + uint32_t size = glsl_count_attribute_slots(output->type, false) * 4; + if (output->data.patch) + per_patch_output_size += size; + else + per_vertex_output_size += size; + } + pipeline->tess.per_vertex_output_size = per_vertex_output_size; + pipeline->tess.per_patch_output_size = per_patch_output_size; + } + return VK_SUCCESS; } @@ -1942,6 +2015,10 @@ tu_pipeline_builder_parse_tessellation(struct tu_pipeline_builder *builder, assert(pipeline->ia.primtype == DI_PT_PATCHES0); assert(tess_info->patchControlPoints <= 32); pipeline->ia.primtype += tess_info->patchControlPoints; + const struct ir3_shader_variant *hs = builder->variants[MESA_SHADER_TESS_CTRL]; + const struct ir3_shader_variant *ds = builder->variants[MESA_SHADER_TESS_EVAL]; + pipeline->tess.hs_bo_regid = hs->const_state->offsets.primitive_param + 1; + pipeline->tess.ds_bo_regid = ds->const_state->offsets.primitive_param + 1; } static void @@ -2151,7 +2228,7 @@ tu_pipeline_builder_build(struct tu_pipeline_builder *builder, (*pipeline)->layout = builder->layout; /* compile and upload shaders */ - result = tu_pipeline_builder_compile_shaders(builder); + result = tu_pipeline_builder_compile_shaders(builder, *pipeline); if (result == VK_SUCCESS) result = tu_pipeline_builder_upload_shaders(builder, *pipeline); if (result != VK_SUCCESS) { diff --git a/src/freedreno/vulkan/tu_private.h b/src/freedreno/vulkan/tu_private.h index 39f303ee7d7..153184f5999 100644 --- a/src/freedreno/vulkan/tu_private.h +++ b/src/freedreno/vulkan/tu_private.h @@ -427,6 +427,7 @@ enum tu_draw_state_group_id { TU_DRAW_STATE_PROGRAM, TU_DRAW_STATE_PROGRAM_BINNING, + TU_DRAW_STATE_TESS, TU_DRAW_STATE_VB, TU_DRAW_STATE_VI, TU_DRAW_STATE_VI_BINNING, @@ -1100,6 +1101,15 @@ struct 
tu_pipeline bool primitive_restart; } ia; + struct + { + uint32_t patch_type; + uint32_t per_vertex_output_size; + uint32_t per_patch_output_size; + uint32_t hs_bo_regid; + uint32_t ds_bo_regid; + } tess; + struct { struct tu_cs_entry state_ib; -- 2.30.2