From: Iago Toral Quiroga <itoral@igalia.com>
Date: Thu, 7 Nov 2019 14:40:45 +0000 (+0100)
Subject: v3d: compute appropriate VPM memory configuration for geometry shader workloads
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=76fc8c8bb1979122af40ed143fed726050b293b9;p=mesa.git

v3d: compute appropriate VPM memory configuration for geometry shader workloads

Geometry shaders can output many vertices and thus have higher VPM memory
pressure as a result. It is possible that too wide geometry shader dispatches
exceed the maximum available VPM output allocated, in which case we need
to reduce the dispatch width until we can fit the VPM memory requirements.
Supported dispatch widths for geometry shaders are 16, 8, 4, 1.

There is a limit in the number of VPM output sectors that can be used by a
geometry shader that we can meet by lowering the dispatch width at compile
time, however, at draw time we need to revisit this number and, together with
other elements that can contribute to total VPM memory requirements, decide
on a configuration that can fit the program into the available VPM memory.
Ideally, we also want to aim for not using more than half of the available
memory so that we can run a pair of bin and render programs in parallel.

v2: fixed language in comment and typo in commit log. (Alejandro)

Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com>
---

diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h
index a5b4748aaf0..4249c181bf1 100644
--- a/src/broadcom/compiler/v3d_compiler.h
+++ b/src/broadcom/compiler/v3d_compiler.h
@@ -716,6 +716,13 @@ struct v3d_gs_prog_data {
         /* Total number of components written, for the shader state record. */
         uint32_t vpm_output_size;
 
+        /* Maximum SIMD dispatch width to not exceed VPM output size limits
+         * in the geometry shader. Notice that the final dispatch width has to
+         * be decided at draw time and could be lower based on the VPM pressure
+         * added by other shader stages.
+         */
+        uint8_t simd_width;
+
         /* Output primitive type */
         uint8_t out_prim_type;
 
diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c
index 4f88b86a5d2..34f7773f066 100644
--- a/src/broadcom/compiler/vir.c
+++ b/src/broadcom/compiler/vir.c
@@ -680,6 +680,24 @@ v3d_gs_set_prog_data(struct v3d_compile *c,
         /* Output segment size is in sectors (8 rows of 32 bits per channel) */
         prog_data->vpm_output_size = align(c->vpm_output_size, 8) / 8;
 
+        /* Compute SIMD dispatch width and update VPM output size accordingly
+         * to ensure we can fit our program in memory. Available widths are
+         * 16, 8, 4, 1.
+         *
+         * Notice that at draw time we will have to consider VPM memory
+         * requirements from other stages and choose a smaller dispatch
+         * width if needed to fit the program in VPM memory.
+         */
+        prog_data->simd_width = 16;
+        while ((prog_data->simd_width > 1 && prog_data->vpm_output_size > 16) ||
+               prog_data->simd_width == 2) {
+                prog_data->simd_width >>= 1;
+                prog_data->vpm_output_size =
+                        align(prog_data->vpm_output_size, 2) / 2;
+        }
+        assert(prog_data->vpm_output_size <= 16);
+        assert(prog_data->simd_width != 2);
+
         prog_data->out_prim_type = c->s->info.gs.output_primitive;
         prog_data->num_invocations = c->s->info.gs.invocations;
 }
diff --git a/src/gallium/drivers/v3d/v3dx_draw.c b/src/gallium/drivers/v3d/v3dx_draw.c
index 07d2749a87a..a614a6c8158 100644
--- a/src/gallium/drivers/v3d/v3dx_draw.c
+++ b/src/gallium/drivers/v3d/v3dx_draw.c
@@ -328,6 +328,16 @@ v3d_emit_wait_for_tf_if_needed(struct v3d_context *v3d, struct v3d_job *job)
         }
 }
 
+struct vpm_config { /* VPM memory configuration parameters for a draw */
+        uint32_t As; /* Min shader input segments required in play */
+        uint32_t Vc; /* VCM cache size, in 16-vertex batches */
+        uint32_t Gs; /* GS output segments counted in the VPM budget */
+        uint32_t Gd; /* GS output segment size, in sectors */
+        uint32_t Gv; /* Max vertex segments per GS batch */
+        uint32_t Ve; /* Min output segments in play in addition to Vc */
+        uint32_t gs_width; /* GS SIMD dispatch width: 16, 8, 4 or 1 */
+};
+
 #if V3D_VERSION >= 41
 static void
 v3d_emit_gs_state_record(struct v3d_job *job,
@@ -398,9 +408,28 @@ v3d_emit_tes_gs_common_params(struct v3d_job *job,
         }
 }
 
+/* Translates a geometry shader SIMD dispatch width into the matching
+ * V3D_PACK_MODE_* value for the GS output segment pack mode. Only the
+ * widths 16, 8, 4 and 1 are valid; anything else is unreachable.
+ */
+static uint8_t
+simd_width_to_gs_pack_mode(uint32_t width)
+{
+        switch (width) {
+        case 1:  return V3D_PACK_MODE_1_WAY;
+        case 4:  return V3D_PACK_MODE_4_WAY;
+        case 8:  return V3D_PACK_MODE_8_WAY;
+        case 16: return V3D_PACK_MODE_16_WAY;
+        default:
+                unreachable("Invalid SIMD width");
+        }
+}
+
 static void
 v3d_emit_tes_gs_shader_params(struct v3d_job *job,
-                              struct v3d_gs_prog_data *gs)
+                              uint32_t gs_simd,
+                              uint32_t gs_vpm_output_size,
+                              uint32_t gs_max_vpm_input_size_per_batch)
 {
         cl_emit(&job->indirect, TESSELLATION_GEOMETRY_SHADER_PARAMS, shader) {
                 shader.tcs_batch_flush_mode = V3D_TCS_FLUSH_MODE_FULLY_PACKED;
@@ -409,9 +438,9 @@ v3d_emit_tes_gs_shader_params(struct v3d_job *job,
                 shader.tcs_output_segment_pack_mode = V3D_PACK_MODE_16_WAY;
                 shader.tes_output_segment_size_in_sectors = 1;
                 shader.tes_output_segment_pack_mode = V3D_PACK_MODE_16_WAY;
-                shader.gs_output_segment_size_in_sectors =
-                        gs->vpm_output_size;
-                shader.gs_output_segment_pack_mode = V3D_PACK_MODE_16_WAY; /* FIXME*/
+                shader.gs_output_segment_size_in_sectors = gs_vpm_output_size;
+                shader.gs_output_segment_pack_mode =
+                        simd_width_to_gs_pack_mode(gs_simd);
                 shader.tbg_max_patches_per_tcs_batch = 1;
                 shader.tbg_max_extra_vertex_segs_for_patches_after_first = 0;
                 shader.tbg_min_tcs_output_segments_required_in_play = 1;
@@ -420,11 +449,156 @@ v3d_emit_tes_gs_shader_params(struct v3d_job *job,
                 shader.tpg_max_vertex_segments_per_tes_batch = 0;
                 shader.tpg_max_tcs_output_segments_per_tes_batch = 1;
                 shader.tpg_min_tes_output_segments_required_in_play = 1;
-                shader.gbg_max_tes_output_vertex_segments_per_gs_batch = 0;
+                shader.gbg_max_tes_output_vertex_segments_per_gs_batch =
+                        gs_max_vpm_input_size_per_batch;
                 shader.gbg_min_gs_output_segments_required_in_play = 1;
         }
 }
 
+/* Returns VPM size in sectors (8 rows of 32 bits per channel each). */
+static inline uint32_t
+compute_vpm_size_in_sectors(const struct v3d_device_info *devinfo)
+{
+        const uint32_t bytes_per_sector = 8 * V3D_CHANNELS * sizeof(uint32_t);
+        assert(devinfo->vpm_size > 0);
+        return devinfo->vpm_size / bytes_per_sector;
+}
+
+/* Computes various parameters affecting VPM memory configuration for programs
+ * involving geometry shaders to ensure the program fits in memory and honors
+ * requirements described in section "VPM usage" of the programming manual.
+ */
+static void
+compute_vpm_config_gs(struct v3d_device_info *devinfo,
+                      struct v3d_vs_prog_data *vs,
+                      struct v3d_gs_prog_data *gs,
+                      struct vpm_config *vpm_cfg_out)
+{
+        const uint32_t A = vs->separate_segments ? 1 : 0;
+        const uint32_t Ad = vs->vpm_input_size;
+        const uint32_t Vd = vs->vpm_output_size;
+
+        const uint32_t vpm_size = compute_vpm_size_in_sectors(devinfo);
+
+        /* Try to fit program into our VPM memory budget by adjusting
+         * configurable parameters iteratively. We do this in two phases:
+         * the first phase tries to fit the program into the total available
+         * VPM memory. If we succeed at that, then the second phase attempts
+         * to fit the program into half of that budget so we can run bin and
+         * render programs in parallel.
+         */
+        struct vpm_config vpm_cfg[2];
+        struct vpm_config *final_vpm_cfg = NULL;
+        uint32_t phase = 0;
+
+        vpm_cfg[phase].As = 1;
+        vpm_cfg[phase].Gs = 1;
+        vpm_cfg[phase].Gd = gs->vpm_output_size;
+        vpm_cfg[phase].gs_width = gs->simd_width;
+
+        /* While there is a requirement that Vc >= [Vn / 16], this is
+         * always the case when tessellation is not present because in that
+         * case Vn can only be 6 at most (when input primitive is triangles
+         * with adjacency).
+         *
+         * We always choose Vc=2. We can't go lower than this due to GFXH-1744,
+         * and Broadcom has not found it worth it to increase it beyond this
+         * in general. Increasing Vc also increases VPM memory pressure which
+         * can turn up being detrimental for performance in some scenarios.
+         */
+        vpm_cfg[phase].Vc = 2;
+
+        /* Gv is a constraint on the hardware to not exceed the
+         * specified number of vertex segments per GS batch. If adding a
+         * new primitive to a GS batch would result in a range of more
+         * than Gv vertex segments being referenced by the batch, then
+         * the hardware will flush the batch and start a new one. This
+         * means that we can choose any value we want, we just need to
+         * be aware that larger values improve GS batch utilization
+         * at the expense of more VPM memory pressure (which can affect
+         * other performance aspects, such as GS dispatch width).
+         * We start with the largest value, and will reduce it if we
+         * find that total memory pressure is too high.
+         */
+        vpm_cfg[phase].Gv = 3;
+        do {
+                /* When GS is present in absence of TES, then we need to satisfy
+                 * that Ve >= Gv. We go with the smallest value of Ve to avoid
+                 * increasing memory pressure.
+                 */
+                vpm_cfg[phase].Ve = vpm_cfg[phase].Gv;
+
+                uint32_t vpm_sectors =
+                        A * vpm_cfg[phase].As * Ad +
+                        (vpm_cfg[phase].Vc + vpm_cfg[phase].Ve) * Vd +
+                        vpm_cfg[phase].Gs * vpm_cfg[phase].Gd;
+
+                /* Ideally we want to use no more than half of the available
+                 * memory so we can execute a bin and render program in parallel
+                 * without stalls. If we achieved that then we are done.
+                 */
+                if (vpm_sectors <= vpm_size / 2) {
+                        final_vpm_cfg = &vpm_cfg[phase];
+                        break;
+                }
+
+                /* At the very least, we should not allocate more than the
+                 * total available VPM memory. If we have a configuration that
+                 * succeeds at this we save it and continue to see if we can
+                 * meet the half-memory-use criteria too.
+                 */
+                if (phase == 0 && vpm_sectors <= vpm_size) {
+                        vpm_cfg[1] = vpm_cfg[0];
+                        phase = 1;
+                }
+
+                /* Try lowering Gv */
+                if (vpm_cfg[phase].Gv > 0) {
+                        vpm_cfg[phase].Gv--;
+                        continue;
+                }
+
+                /* Try lowering GS dispatch width */
+                if (vpm_cfg[phase].gs_width > 1) {
+                        do {
+                                vpm_cfg[phase].gs_width >>= 1;
+                                vpm_cfg[phase].Gd =
+                                        align(vpm_cfg[phase].Gd, 2) / 2;
+                        } while (vpm_cfg[phase].gs_width == 2);
+
+                        /* Reset Gv to max after dropping dispatch width */
+                        vpm_cfg[phase].Gv = 3;
+                        continue;
+                }
+
+                /* We ran out of options to reduce memory pressure. If we
+                 * are at phase 1 then vpm_cfg[0] holds a configuration that
+                 * fits in the total VPM memory, so we use that.
+                 */
+                if (phase == 1)
+                       final_vpm_cfg = &vpm_cfg[0];
+                break;
+        } while (true);
+
+        if (!final_vpm_cfg) {
+                /* FIXME: maybe return a boolean to indicate failure and use
+                 * that to stop the submission for this draw call.
+                 */
+                fprintf(stderr, "Failed to allocate VPM memory.\n");
+                abort();
+        }
+
+        assert(final_vpm_cfg);
+        assert(final_vpm_cfg->Gd <= 16);
+        assert(final_vpm_cfg->Gv < 4);
+        assert(final_vpm_cfg->Ve < 4);
+        assert(final_vpm_cfg->Vc >= 2 && final_vpm_cfg->Vc <= 4);
+        assert(final_vpm_cfg->gs_width == 1 ||
+               final_vpm_cfg->gs_width == 4 ||
+               final_vpm_cfg->gs_width == 8 ||
+               final_vpm_cfg->gs_width == 16);
+
+        *vpm_cfg_out = *final_vpm_cfg;
+}
 #endif
 
 static void
@@ -498,20 +672,51 @@ v3d_emit_gl_shader_state(struct v3d_context *v3d,
          * compile time, so that we mostly just have to OR the VS and FS
          * records together at draw time.
          */
+
+        struct vpm_config vpm_cfg_bin, vpm_cfg;
+
+        assert(v3d->screen->devinfo.ver >= 41 || !v3d->prog.gs);
+        if (!v3d->prog.gs) {
+                vpm_cfg_bin.As = 1;
+                vpm_cfg_bin.Ve = 0;
+                vpm_cfg_bin.Vc = v3d->prog.cs->prog_data.vs->vcm_cache_size;
+
+                vpm_cfg.As = 1;
+                vpm_cfg.Ve = 0;
+                vpm_cfg.Vc = v3d->prog.vs->prog_data.vs->vcm_cache_size;
+        }
 #if V3D_VERSION >= 41
-        if (v3d->prog.gs) {
-            v3d_emit_gs_state_record(v3d->job,
-                                     v3d->prog.gs_bin, gs_bin_uniforms,
-                                     v3d->prog.gs, gs_uniforms);
-
-            struct v3d_gs_prog_data *gs = v3d->prog.gs->prog_data.gs;
-            struct v3d_gs_prog_data *gs_bin = v3d->prog.gs_bin->prog_data.gs;
-
-            v3d_emit_tes_gs_common_params(v3d->job,
-                                          gs->out_prim_type,
-                                          gs->num_invocations);
-            v3d_emit_tes_gs_shader_params(v3d->job, gs_bin);
-            v3d_emit_tes_gs_shader_params(v3d->job, gs);
+        else {
+                v3d_emit_gs_state_record(v3d->job,
+                                         v3d->prog.gs_bin, gs_bin_uniforms,
+                                         v3d->prog.gs, gs_uniforms);
+
+                struct v3d_gs_prog_data *gs = v3d->prog.gs->prog_data.gs;
+                struct v3d_gs_prog_data *gs_bin = v3d->prog.gs_bin->prog_data.gs;
+
+                v3d_emit_tes_gs_common_params(v3d->job,
+                                              gs->out_prim_type,
+                                              gs->num_invocations);
+
+                /* Bin Tes/Gs params */
+                struct v3d_vs_prog_data *vs_bin = v3d->prog.cs->prog_data.vs;
+                compute_vpm_config_gs(&v3d->screen->devinfo,
+                                      vs_bin, gs_bin, &vpm_cfg_bin);
+
+                v3d_emit_tes_gs_shader_params(v3d->job,
+                                              vpm_cfg_bin.gs_width,
+                                              vpm_cfg_bin.Gd,
+                                              vpm_cfg_bin.Gv);
+
+                /* Render Tes/Gs params */
+                struct v3d_vs_prog_data *vs = v3d->prog.vs->prog_data.vs;
+                compute_vpm_config_gs(&v3d->screen->devinfo,
+                                      vs, gs, &vpm_cfg);
+
+                v3d_emit_tes_gs_shader_params(v3d->job,
+                                              vpm_cfg.gs_width,
+                                              vpm_cfg.Gd,
+                                              vpm_cfg.Gv);
         }
 #endif
 
@@ -593,8 +798,15 @@ v3d_emit_gl_shader_state(struct v3d_context *v3d,
                 shader.fragment_shader_uniforms_address = fs_uniforms;
 
 #if V3D_VERSION >= 41
-                shader.min_coord_shader_input_segments_required_in_play = 1;
-                shader.min_vertex_shader_input_segments_required_in_play = 1;
+                shader.min_coord_shader_input_segments_required_in_play =
+                        vpm_cfg_bin.As;
+                shader.min_vertex_shader_input_segments_required_in_play =
+                        vpm_cfg.As;
+
+                shader.min_coord_shader_output_segments_required_in_play_in_addition_to_vcm_cache_size =
+                        vpm_cfg_bin.Ve;
+                shader.min_vertex_shader_output_segments_required_in_play_in_addition_to_vcm_cache_size =
+                        vpm_cfg.Ve;
 
                 shader.coordinate_shader_4_way_threadable =
                         v3d->prog.cs->prog_data.vs->base.threads == 4;
@@ -698,10 +910,8 @@ v3d_emit_gl_shader_state(struct v3d_context *v3d,
         }
 
         cl_emit(&job->bcl, VCM_CACHE_SIZE, vcm) {
-                vcm.number_of_16_vertex_batches_for_binning =
-                        v3d->prog.cs->prog_data.vs->vcm_cache_size;
-                vcm.number_of_16_vertex_batches_for_rendering =
-                        v3d->prog.vs->prog_data.vs->vcm_cache_size;
+                vcm.number_of_16_vertex_batches_for_binning = vpm_cfg_bin.Vc;
+                vcm.number_of_16_vertex_batches_for_rendering = vpm_cfg.Vc;
         }
 
 #if V3D_VERSION >= 41