From f08a80dcd493c64922c04a2563025bddabcac230 Mon Sep 17 00:00:00 2001
From: Brian Ho <brian@brkho.com>
Date: Fri, 15 May 2020 10:52:43 -0700
Subject: [PATCH] turnip: Allocate tess BOs as a function of draw size

To store tess outputs, the HS stg's into two buffers, one for
per-vertex/per-patch output variables (tess_param) and one for
TessLevelInner/Outer (tess_factor). The addresses of these buffers
are uploaded as consts to the HS/DS and the tess_factor iova is
written to REG_A6XX_PC_TESSFACTOR_ADDR. While the sizes of these
buffers are a function of vetex count and patch count, allocation is
relatively straightforward on freedreno- just keep track of the max
required buffer size for the entire batch and allocate before batch
submit.

In Vulkan, however, a given pipeline can be bound multiple times
across any number of command buffers, each drawing with a different
number of vertices. One solution is to track the max buffer size for
the entire command buffer (similar to fd_batch) and on
vkEndCommandBuffer, allocate appropriately sized tess BOs. Since the
tess BOs addresses are emitted as part of the pipeline state setup
(e.g. PKT4 to REG_A6XX_PC_TESSFACTOR_ADDR), we need to create a new
state group independent of a specific pipeline and parameterize its
IB with the command buffer specific tess BO iovas.

Without a larger refactor, the simplest way to do this is just to
emit per-draw call consts and leverage scratch_bo to re-use buffers.
This way we won't have to store and rewrite earlier packets in the
command stream on vkEndCommandBuffer.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/5059>
---
 src/freedreno/vulkan/tu_cmd_buffer.c | 130 +++++++++++++++++++++++++++
 src/freedreno/vulkan/tu_pipeline.c   | 113 +++++++++++++++++++----
 src/freedreno/vulkan/tu_private.h    |  10 +++
 3 files changed, 235 insertions(+), 18 deletions(-)

diff --git a/src/freedreno/vulkan/tu_cmd_buffer.c b/src/freedreno/vulkan/tu_cmd_buffer.c
index 731d361712e..a18f19ea0a0 100644
--- a/src/freedreno/vulkan/tu_cmd_buffer.c
+++ b/src/freedreno/vulkan/tu_cmd_buffer.c
@@ -3027,6 +3027,121 @@ tu6_emit_streamout(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
    }
 }
 
+static uint64_t
+get_tess_param_bo_size(const struct tu_pipeline *pipeline,
+                       const struct tu_draw_info *draw_info)
+{
+   /* TODO: For indirect draws, we can't compute the BO size ahead of time.
+    * Still not sure what to do here, so just allocate a reasonably large
+    * BO and hope for the best for now.
+    * (maxTessellationControlPerVertexOutputComponents * 2048 vertices +
+    *  maxTessellationControlPerPatchOutputComponents * 512 patches) */
+   if (draw_info->indirect) {
+      return ((128 * 2048) + (128 * 512)) * 4;
+   }
+
+   /* For each patch, adreno lays out the tess param BO in memory as:
+    * (v_input[0][0])...(v_input[i][j])(p_input[0])...(p_input[k]).
+    * where i = # vertices per patch, j = # per-vertex outputs, and
+    * k = # per-patch outputs.*/
+   uint32_t verts_per_patch = pipeline->ia.primtype - DI_PT_PATCHES0;
+   uint32_t num_patches = draw_info->count / verts_per_patch;
+   return draw_info->count * pipeline->tess.per_vertex_output_size +
+          pipeline->tess.per_patch_output_size * num_patches;
+}
+
+static uint64_t
+get_tess_factor_bo_size(const struct tu_pipeline *pipeline,
+                        const struct tu_draw_info *draw_info)
+{
+   /* TODO: For indirect draws, we can't compute the BO size ahead of time.
+    * Still not sure what to do here, so just allocate a reasonably large
+    * BO and hope for the best for now.
+    * (quad factor stride * 512 patches) */
+   if (draw_info->indirect) {
+      return (28 * 512) * 4;
+   }
+
+   /* Each distinct patch gets its own tess factor output. */
+   uint32_t verts_per_patch = pipeline->ia.primtype - DI_PT_PATCHES0;
+   uint32_t num_patches = draw_info->count / verts_per_patch;
+   uint32_t factor_stride;
+   switch (pipeline->tess.patch_type) {
+   case IR3_TESS_ISOLINES:
+      factor_stride = 12;
+      break;
+   case IR3_TESS_TRIANGLES:
+      factor_stride = 20;
+      break;
+   case IR3_TESS_QUADS:
+      factor_stride = 28;
+      break;
+   default:
+      unreachable("bad tessmode");
+   }
+   return factor_stride * num_patches;
+}
+
+static VkResult
+tu6_emit_tess_consts(struct tu_cmd_buffer *cmd,
+                     const struct tu_draw_info *draw,
+                     const struct tu_pipeline *pipeline,
+                     struct tu_cs_entry *entry)
+{
+   struct tu_cs cs;
+   VkResult result = tu_cs_begin_sub_stream(&cmd->sub_cs, 20, &cs);
+   if (result != VK_SUCCESS)
+      return result;
+
+   uint64_t tess_factor_size = get_tess_factor_bo_size(pipeline, draw);
+   uint64_t tess_param_size = get_tess_param_bo_size(pipeline, draw);
+   uint64_t tess_bo_size =  tess_factor_size + tess_param_size;
+   if (tess_bo_size > 0) {
+      struct tu_bo *tess_bo;
+      result = tu_get_scratch_bo(cmd->device, tess_bo_size, &tess_bo);
+      if (result != VK_SUCCESS)
+         return result;
+
+      tu_bo_list_add(&cmd->bo_list, tess_bo,
+            MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
+      uint64_t tess_factor_iova = tess_bo->iova;
+      uint64_t tess_param_iova = tess_factor_iova + tess_factor_size;
+
+      tu_cs_emit_pkt7(&cs, CP_LOAD_STATE6_GEOM, 3 + 4);
+      tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(pipeline->tess.hs_bo_regid) |
+            CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
+            CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
+            CP_LOAD_STATE6_0_STATE_BLOCK(SB6_HS_SHADER) |
+            CP_LOAD_STATE6_0_NUM_UNIT(1));
+      tu_cs_emit(&cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
+      tu_cs_emit(&cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
+      tu_cs_emit_qw(&cs, tess_param_iova);
+      tu_cs_emit_qw(&cs, tess_factor_iova);
+
+      tu_cs_emit_pkt7(&cs, CP_LOAD_STATE6_GEOM, 3 + 4);
+      tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(pipeline->tess.ds_bo_regid) |
+            CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
+            CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
+            CP_LOAD_STATE6_0_STATE_BLOCK(SB6_DS_SHADER) |
+            CP_LOAD_STATE6_0_NUM_UNIT(1));
+      tu_cs_emit(&cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
+      tu_cs_emit(&cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
+      tu_cs_emit_qw(&cs, tess_param_iova);
+      tu_cs_emit_qw(&cs, tess_factor_iova);
+
+      tu_cs_emit_pkt4(&cs, REG_A6XX_PC_TESSFACTOR_ADDR_LO, 2);
+      tu_cs_emit_qw(&cs, tess_factor_iova);
+
+      /* TODO: Without this WFI here, the hardware seems unable to read these
+       * addresses we just emitted. Freedreno emits these consts as part of
+       * IB1 instead of in a draw state which might make this WFI unnecessary,
+       * but it requires a bit more indirection (SS6_INDIRECT for consts). */
+      tu_cs_emit_wfi(&cs);
+   }
+   *entry = tu_cs_end_sub_stream(&cmd->sub_cs, &cs);
+   return VK_SUCCESS;
+}
+
 static VkResult
 tu6_bind_draw_states(struct tu_cmd_buffer *cmd,
                      struct tu_cs *cs,
@@ -3092,6 +3207,15 @@ tu6_bind_draw_states(struct tu_cmd_buffer *cmd,
    if (result != VK_SUCCESS)
       return result;
 
+   bool has_tess =
+         pipeline->active_stages & VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT;
+   struct tu_cs_entry tess_consts = {};
+   if (has_tess) {
+      result = tu6_emit_tess_consts(cmd, draw, pipeline, &tess_consts);
+      if (result != VK_SUCCESS)
+         return result;
+   }
+
    /* for the first draw in a renderpass, re-emit all the draw states
     *
     * and if a draw-state disabling path (CmdClearAttachments 3D fallback) was
@@ -3107,6 +3231,7 @@ tu6_bind_draw_states(struct tu_cmd_buffer *cmd,
 
       tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_PROGRAM, pipeline->program.state_ib);
       tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_PROGRAM_BINNING, pipeline->program.binning_state_ib);
+      tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_TESS, tess_consts);
       tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_VI, pipeline->vi.state_ib);
       tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_VI_BINNING, pipeline->vi.binning_state_ib);
       tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_RAST, pipeline->rast.state_ib);
@@ -3132,6 +3257,7 @@ tu6_bind_draw_states(struct tu_cmd_buffer *cmd,
        * note we eventually don't want to have to emit anything here
        */
       uint32_t draw_state_count =
+         has_tess +
          ((cmd->state.dirty & TU_CMD_DIRTY_SHADER_CONSTS) ? 3 : 0) +
          ((cmd->state.dirty & TU_CMD_DIRTY_DESCRIPTOR_SETS) ? 1 : 0) +
          ((cmd->state.dirty & TU_CMD_DIRTY_VERTEX_BUFFERS) ? 1 : 0) +
@@ -3139,6 +3265,10 @@ tu6_bind_draw_states(struct tu_cmd_buffer *cmd,
 
          tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * draw_state_count);
 
+         /* We may need to re-emit tess consts if the current draw call is
+          * sufficiently larger than the last draw call. */
+         if (has_tess)
+            tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_TESS, tess_consts);
          if (cmd->state.dirty & TU_CMD_DIRTY_SHADER_CONSTS) {
             tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_VS_CONST, cmd->state.shader_const_ib[MESA_SHADER_VERTEX]);
             tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_GS_CONST, cmd->state.shader_const_ib[MESA_SHADER_GEOMETRY]);
diff --git a/src/freedreno/vulkan/tu_pipeline.c b/src/freedreno/vulkan/tu_pipeline.c
index e6442575af5..5e21b6031fa 100644
--- a/src/freedreno/vulkan/tu_pipeline.c
+++ b/src/freedreno/vulkan/tu_pipeline.c
@@ -663,8 +663,8 @@ tu6_emit_link_map(struct tu_cs *cs,
    if (size <= 0)
       return;
 
-   tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, base, SB6_GS_SHADER, 0, size,
-                  patch_locs);
+   tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, base, SB6_GS_SHADER, 0,
+                         size, patch_locs);
 }
 
 static uint16_t
@@ -1129,24 +1129,65 @@ tu6_emit_fs_outputs(struct tu_cs *cs,
 }
 
 static void
-tu6_emit_geometry_consts(struct tu_cs *cs,
-                         const struct ir3_shader_variant *vs,
-                         const struct ir3_shader_variant *gs) {
-   unsigned num_vertices = gs->shader->nir->info.gs.vertices_in;
-
-   uint32_t params[4] = {
-      vs->output_size * num_vertices * 4,  /* primitive stride */
-      vs->output_size * 4,                 /* vertex stride */
+tu6_emit_geom_tess_consts(struct tu_cs *cs,
+                          const struct ir3_shader_variant *vs,
+                          const struct ir3_shader_variant *hs,
+                          const struct ir3_shader_variant *ds,
+                          const struct ir3_shader_variant *gs,
+                          uint32_t cps_per_patch)
+{
+   uint32_t num_vertices =
+         hs ? cps_per_patch : gs->shader->nir->info.gs.vertices_in;
+
+   uint32_t vs_params[4] = {
+      vs->output_size * num_vertices * 4,  /* vs primitive stride */
+      vs->output_size * 4,                 /* vs vertex stride */
       0,
       0,
    };
    uint32_t vs_base = ir3_const_state(vs)->offsets.primitive_param;
    tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, vs_base, SB6_VS_SHADER, 0,
-                  ARRAY_SIZE(params), params);
+                  ARRAY_SIZE(vs_params), vs_params);
+
+   if (hs) {
+      assert(ds->type != MESA_SHADER_NONE);
+      uint32_t hs_params[4] = {
+         vs->output_size * num_vertices * 4,  /* hs primitive stride */
+         vs->output_size * 4,                 /* hs vertex stride */
+         hs->output_size,
+         cps_per_patch,
+      };
+
+      uint32_t hs_base = hs->const_state->offsets.primitive_param;
+      tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, hs_base, SB6_HS_SHADER, 0,
+                     ARRAY_SIZE(hs_params), hs_params);
+      if (gs)
+         num_vertices = gs->shader->nir->info.gs.vertices_in;
+
+      uint32_t ds_params[4] = {
+         ds->output_size * num_vertices * 4,  /* ds primitive stride */
+         ds->output_size * 4,                 /* ds vertex stride */
+         hs->output_size,                     /* hs vertex stride (dwords) */
+         hs->shader->nir->info.tess.tcs_vertices_out
+      };
+
+      uint32_t ds_base = ds->const_state->offsets.primitive_param;
+      tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, ds_base, SB6_DS_SHADER, 0,
+                     ARRAY_SIZE(ds_params), ds_params);
+   }
 
-   uint32_t gs_base = ir3_const_state(gs)->offsets.primitive_param;
-   tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, gs_base, SB6_GS_SHADER, 0,
-                  ARRAY_SIZE(params), params);
+   if (gs) {
+      const struct ir3_shader_variant *prev = ds ? ds : vs;
+      uint32_t gs_params[4] = {
+         prev->output_size * num_vertices * 4,  /* gs primitive stride */
+         prev->output_size * 4,                 /* gs vertex stride */
+         0,
+         0,
+      };
+      uint32_t gs_base = gs->const_state->offsets.primitive_param;
+      tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, gs_base, SB6_GS_SHADER, 0,
+                     ARRAY_SIZE(gs_params), gs_params);
+   }
 }
 
 static void
@@ -1158,6 +1199,8 @@ tu6_emit_program(struct tu_cs *cs,
 {
    const struct ir3_shader_variant *vs = builder->variants[MESA_SHADER_VERTEX];
    const struct ir3_shader_variant *bs = builder->binning_variant;
+   const struct ir3_shader_variant *hs = builder->variants[MESA_SHADER_TESS_CTRL];
+   const struct ir3_shader_variant *ds = builder->variants[MESA_SHADER_TESS_EVAL];
    const struct ir3_shader_variant *gs = builder->variants[MESA_SHADER_GEOMETRY];
    const struct ir3_shader_variant *fs = builder->variants[MESA_SHADER_FRAGMENT];
    gl_shader_stage stage = MESA_SHADER_VERTEX;
@@ -1207,8 +1250,11 @@ tu6_emit_program(struct tu_cs *cs,
                           builder->render_components);
    }
 
-   if (gs)
-      tu6_emit_geometry_consts(cs, vs, gs);
+   if (gs || hs) {
+      uint32_t cps_per_patch = builder->create_info->pTessellationState ?
+            builder->create_info->pTessellationState->patchControlPoints : 0;
+      tu6_emit_geom_tess_consts(cs, vs, hs, ds, gs, cps_per_patch);
+   }
 }
 
 static void
@@ -1695,7 +1741,8 @@ tu6_get_tessmode(struct tu_shader* shader)
 }
 
 static VkResult
-tu_pipeline_builder_compile_shaders(struct tu_pipeline_builder *builder)
+tu_pipeline_builder_compile_shaders(struct tu_pipeline_builder *builder,
+                                    struct tu_pipeline *pipeline)
 {
    const VkPipelineShaderStageCreateInfo *stage_infos[MESA_SHADER_STAGES] = {
       NULL
@@ -1732,6 +1779,8 @@ tu_pipeline_builder_compile_shaders(struct tu_pipeline_builder *builder)
       builder->shaders[stage] = shader;
    }
 
+   pipeline->tess.patch_type = key.tessellation;
+
    for (gl_shader_stage stage = MESA_SHADER_STAGES - 1;
         stage > MESA_SHADER_NONE; stage--) {
       if (!builder->shaders[stage])
@@ -1767,6 +1816,30 @@ tu_pipeline_builder_compile_shaders(struct tu_pipeline_builder *builder)
       sizeof(uint32_t) * variant->info.sizedwords;
    builder->binning_variant = variant;
 
+   if (builder->shaders[MESA_SHADER_TESS_CTRL]) {
+      struct ir3_shader *hs =
+            builder->shaders[MESA_SHADER_TESS_CTRL]->ir3_shader;
+      assert(hs->type != MESA_SHADER_NONE);
+
+      /* Calculate and store the per-vertex and per-patch HS-output sizes. */
+      uint32_t per_vertex_output_size = 0;
+      uint32_t per_patch_output_size = 0;
+      nir_foreach_variable (output, &hs->nir->outputs) {
+         switch (output->data.location) {
+         case VARYING_SLOT_TESS_LEVEL_OUTER:
+         case VARYING_SLOT_TESS_LEVEL_INNER:
+            continue;
+         }
+         uint32_t size = glsl_count_attribute_slots(output->type, false) * 4;
+         if (output->data.patch)
+            per_patch_output_size += size;
+         else
+            per_vertex_output_size += size;
+      }
+      pipeline->tess.per_vertex_output_size = per_vertex_output_size;
+      pipeline->tess.per_patch_output_size = per_patch_output_size;
+   }
+
    return VK_SUCCESS;
 }
 
@@ -1942,6 +2015,10 @@ tu_pipeline_builder_parse_tessellation(struct tu_pipeline_builder *builder,
    assert(pipeline->ia.primtype == DI_PT_PATCHES0);
    assert(tess_info->patchControlPoints <= 32);
    pipeline->ia.primtype += tess_info->patchControlPoints;
+   const struct ir3_shader_variant *hs = builder->variants[MESA_SHADER_TESS_CTRL];
+   const struct ir3_shader_variant *ds = builder->variants[MESA_SHADER_TESS_EVAL];
+   pipeline->tess.hs_bo_regid = hs->const_state->offsets.primitive_param + 1;
+   pipeline->tess.ds_bo_regid = ds->const_state->offsets.primitive_param + 1;
 }
 
 static void
@@ -2151,7 +2228,7 @@ tu_pipeline_builder_build(struct tu_pipeline_builder *builder,
    (*pipeline)->layout = builder->layout;
 
    /* compile and upload shaders */
-   result = tu_pipeline_builder_compile_shaders(builder);
+   result = tu_pipeline_builder_compile_shaders(builder, *pipeline);
    if (result == VK_SUCCESS)
       result = tu_pipeline_builder_upload_shaders(builder, *pipeline);
    if (result != VK_SUCCESS) {
diff --git a/src/freedreno/vulkan/tu_private.h b/src/freedreno/vulkan/tu_private.h
index 39f303ee7d7..153184f5999 100644
--- a/src/freedreno/vulkan/tu_private.h
+++ b/src/freedreno/vulkan/tu_private.h
@@ -427,6 +427,7 @@ enum tu_draw_state_group_id
 {
    TU_DRAW_STATE_PROGRAM,
    TU_DRAW_STATE_PROGRAM_BINNING,
+   TU_DRAW_STATE_TESS,
    TU_DRAW_STATE_VB,
    TU_DRAW_STATE_VI,
    TU_DRAW_STATE_VI_BINNING,
@@ -1100,6 +1101,15 @@ struct tu_pipeline
       bool primitive_restart;
    } ia;
 
+   struct
+   {
+      uint32_t patch_type;
+      uint32_t per_vertex_output_size;
+      uint32_t per_patch_output_size;
+      uint32_t hs_bo_regid;
+      uint32_t ds_bo_regid;
+   } tess;
+
    struct
    {
       struct tu_cs_entry state_ib;
-- 
2.30.2