X-Git-Url: https://git.libre-soc.org/?p=mesa.git;a=blobdiff_plain;f=src%2Fgallium%2Fdrivers%2Fv3d%2Fv3dx_draw.c;h=a614a6c815866e9af0e905e22203b8702b79ff1d;hp=07d2749a87a0ffeef63dd496e9c054b08e4981c4;hb=76fc8c8bb1979122af40ed143fed726050b293b9;hpb=76f4c83815a005f37b58c54d51ca6c4982546e54 diff --git a/src/gallium/drivers/v3d/v3dx_draw.c b/src/gallium/drivers/v3d/v3dx_draw.c index 07d2749a87a..a614a6c8158 100644 --- a/src/gallium/drivers/v3d/v3dx_draw.c +++ b/src/gallium/drivers/v3d/v3dx_draw.c @@ -328,6 +328,16 @@ v3d_emit_wait_for_tf_if_needed(struct v3d_context *v3d, struct v3d_job *job) } } +struct vpm_config { + uint32_t As; /* min shader input segments required in play */ + uint32_t Vc; /* VCM cache size, in 16-vertex batches */ + uint32_t Gs; /* GS output segment count used in the VPM budget */ + uint32_t Gd; /* GS output segment size, in sectors */ + uint32_t Gv; /* max vertex segments per GS batch */ + uint32_t Ve; /* min output segments in play on top of Vc */ + uint32_t gs_width; /* GS SIMD dispatch width */ +}; + #if V3D_VERSION >= 41 static void v3d_emit_gs_state_record(struct v3d_job *job, @@ -398,9 +408,28 @@ v3d_emit_tes_gs_common_params(struct v3d_job *job, } } +static uint8_t +simd_width_to_gs_pack_mode(uint32_t width) +{ + switch (width) { + case 16: + return V3D_PACK_MODE_16_WAY; + case 8: + return V3D_PACK_MODE_8_WAY; + case 4: + return V3D_PACK_MODE_4_WAY; + case 1: + return V3D_PACK_MODE_1_WAY; + default: + unreachable("Invalid SIMD width"); /* only 16/8/4/1 valid */ + }; +} + static void v3d_emit_tes_gs_shader_params(struct v3d_job *job, - struct v3d_gs_prog_data *gs) + uint32_t gs_simd, + uint32_t gs_vpm_output_size, + uint32_t gs_max_vpm_input_size_per_batch) { cl_emit(&job->indirect, TESSELLATION_GEOMETRY_SHADER_PARAMS, shader) { shader.tcs_batch_flush_mode = V3D_TCS_FLUSH_MODE_FULLY_PACKED; @@ -409,9 +438,9 @@ v3d_emit_tes_gs_shader_params(struct v3d_job *job, shader.tcs_output_segment_pack_mode = V3D_PACK_MODE_16_WAY; shader.tes_output_segment_size_in_sectors = 1; shader.tes_output_segment_pack_mode = V3D_PACK_MODE_16_WAY; - shader.gs_output_segment_size_in_sectors = - gs->vpm_output_size; - shader.gs_output_segment_pack_mode = V3D_PACK_MODE_16_WAY; /* FIXME*/ + shader.gs_output_segment_size_in_sectors = gs_vpm_output_size; + shader.gs_output_segment_pack_mode 
= + simd_width_to_gs_pack_mode(gs_simd); shader.tbg_max_patches_per_tcs_batch = 1; shader.tbg_max_extra_vertex_segs_for_patches_after_first = 0; shader.tbg_min_tcs_output_segments_required_in_play = 1; @@ -420,11 +449,156 @@ v3d_emit_tes_gs_shader_params(struct v3d_job *job, shader.tpg_max_vertex_segments_per_tes_batch = 0; shader.tpg_max_tcs_output_segments_per_tes_batch = 1; shader.tpg_min_tes_output_segments_required_in_play = 1; - shader.gbg_max_tes_output_vertex_segments_per_gs_batch = 0; + shader.gbg_max_tes_output_vertex_segments_per_gs_batch = + gs_max_vpm_input_size_per_batch; shader.gbg_min_gs_output_segments_required_in_play = 1; } } +static inline uint32_t +compute_vpm_size_in_sectors(const struct v3d_device_info *devinfo) +{ + assert(devinfo->vpm_size > 0); + const uint32_t sector_size = V3D_CHANNELS * sizeof(uint32_t) * 8; + return devinfo->vpm_size / sector_size; +} + +/* Computes various parameters affecting VPM memory configuration for programs + * involving geometry shaders to ensure the program fits in memory and honors + * requirements described in section "VPM usage" of the programming manual. + */ +static void +compute_vpm_config_gs(struct v3d_device_info *devinfo, + struct v3d_vs_prog_data *vs, + struct v3d_gs_prog_data *gs, + struct vpm_config *vpm_cfg_out) +{ + const uint32_t A = vs->separate_segments ? 1 : 0; + const uint32_t Ad = vs->vpm_input_size; + const uint32_t Vd = vs->vpm_output_size; + + const uint32_t vpm_size = compute_vpm_size_in_sectors(devinfo); + + /* Try to fit program into our VPM memory budget by adjusting + * configurable parameters iteratively. We do this in two phases: + * the first phase tries to fit the program into the total available + * VPM memory. If we succeed at that, then the second phase attempts + * to fit the program into half of that budget so we can run bin and + * render programs in parallel. 
+ */ + struct vpm_config vpm_cfg[2]; + struct vpm_config *final_vpm_cfg = NULL; + uint32_t phase = 0; + + vpm_cfg[phase].As = 1; + vpm_cfg[phase].Gs = 1; + vpm_cfg[phase].Gd = gs->vpm_output_size; + vpm_cfg[phase].gs_width = gs->simd_width; + + /* While there is a requirement that Vc >= [Vn / 16], this is + * always the case when tessellation is not present because in that + * case Vn can only be 6 at most (when input primitive is triangles + * with adjacency). + * + * We always choose Vc=2. We can't go lower than this due to GFXH-1744, + * and Broadcom has not found it worth it to increase it beyond this + * in general. Increasing Vc also increases VPM memory pressure which + * can end up being detrimental for performance in some scenarios. + */ + vpm_cfg[phase].Vc = 2; + + /* Gv is a constraint on the hardware to not exceed the + * specified number of vertex segments per GS batch. If adding a + * new primitive to a GS batch would result in a range of more + * than Gv vertex segments being referenced by the batch, then + * the hardware will flush the batch and start a new one. This + * means that we can choose any value we want, we just need to + * be aware that larger values improve GS batch utilization + * at the expense of more VPM memory pressure (which can affect + * other performance aspects, such as GS dispatch width). + * We start with the largest value, and will reduce it if we + * find that total memory pressure is too high. + */ + vpm_cfg[phase].Gv = 3; + do { + /* When GS is present in absence of TES, then we need to satisfy + * that Ve >= Gv. We go with the smallest value of Ve to avoid + * increasing memory pressure. 
+ */ + vpm_cfg[phase].Ve = vpm_cfg[phase].Gv; + + uint32_t vpm_sectors = + A * vpm_cfg[phase].As * Ad + + (vpm_cfg[phase].Vc + vpm_cfg[phase].Ve) * Vd + + vpm_cfg[phase].Gs * vpm_cfg[phase].Gd; + + /* Ideally we want to use no more than half of the available + * memory so we can execute a bin and render program in parallel + * without stalls. If we achieved that then we are done. + */ + if (vpm_sectors <= vpm_size / 2) { + final_vpm_cfg = &vpm_cfg[phase]; + break; + } + + /* At the very least, we should not allocate more than the + * total available VPM memory. If we have a configuration that + * succeeds at this we save it and continue to see if we can + * meet the half-memory-use criterion too. + */ + if (phase == 0 && vpm_sectors <= vpm_size) { + vpm_cfg[1] = vpm_cfg[0]; + phase = 1; + } + + /* Try lowering Gv */ + if (vpm_cfg[phase].Gv > 0) { + vpm_cfg[phase].Gv--; + continue; + } + + /* Try lowering GS dispatch width (width 2 is skipped) */ + if (vpm_cfg[phase].gs_width > 1) { + do { + vpm_cfg[phase].gs_width >>= 1; + vpm_cfg[phase].Gd = + align(vpm_cfg[phase].Gd, 2) / 2; + } while (vpm_cfg[phase].gs_width == 2); + + /* Reset Gv to max after dropping dispatch width */ + vpm_cfg[phase].Gv = 3; + continue; + } + + /* We ran out of options to reduce memory pressure. If we + * are at phase 1 we have at least a valid configuration, so + * we use that. + */ + if (phase == 1) + final_vpm_cfg = &vpm_cfg[0]; /* saved full-VPM fit */ + break; + } while (true); + + if (!final_vpm_cfg) { + /* FIXME: maybe return a boolean to indicate failure and use + * that to stop the submission for this draw call. 
+ */ + fprintf(stderr, "Failed to allocate VPM memory.\n"); + abort(); + } + + assert(final_vpm_cfg); + assert(final_vpm_cfg->Gd <= 16); + assert(final_vpm_cfg->Gv < 4); + assert(final_vpm_cfg->Ve < 4); + assert(final_vpm_cfg->Vc >= 2 && final_vpm_cfg->Vc <= 4); + assert(final_vpm_cfg->gs_width == 1 || + final_vpm_cfg->gs_width == 4 || + final_vpm_cfg->gs_width == 8 || + final_vpm_cfg->gs_width == 16); + + *vpm_cfg_out = *final_vpm_cfg; +} #endif static void @@ -498,20 +672,51 @@ v3d_emit_gl_shader_state(struct v3d_context *v3d, * compile time, so that we mostly just have to OR the VS and FS * records together at draw time. */ + + struct vpm_config vpm_cfg_bin, vpm_cfg; + + assert(v3d->screen->devinfo.ver >= 41 || !v3d->prog.gs); + if (!v3d->prog.gs) { + vpm_cfg_bin.As = 1; + vpm_cfg_bin.Ve = 0; + vpm_cfg_bin.Vc = v3d->prog.cs->prog_data.vs->vcm_cache_size; + + vpm_cfg.As = 1; + vpm_cfg.Ve = 0; + vpm_cfg.Vc = v3d->prog.vs->prog_data.vs->vcm_cache_size; + } #if V3D_VERSION >= 41 - if (v3d->prog.gs) { - v3d_emit_gs_state_record(v3d->job, - v3d->prog.gs_bin, gs_bin_uniforms, - v3d->prog.gs, gs_uniforms); - - struct v3d_gs_prog_data *gs = v3d->prog.gs->prog_data.gs; - struct v3d_gs_prog_data *gs_bin = v3d->prog.gs_bin->prog_data.gs; - - v3d_emit_tes_gs_common_params(v3d->job, - gs->out_prim_type, - gs->num_invocations); - v3d_emit_tes_gs_shader_params(v3d->job, gs_bin); - v3d_emit_tes_gs_shader_params(v3d->job, gs); + else { + v3d_emit_gs_state_record(v3d->job, + v3d->prog.gs_bin, gs_bin_uniforms, + v3d->prog.gs, gs_uniforms); + + struct v3d_gs_prog_data *gs = v3d->prog.gs->prog_data.gs; + struct v3d_gs_prog_data *gs_bin = v3d->prog.gs_bin->prog_data.gs; + + v3d_emit_tes_gs_common_params(v3d->job, + gs->out_prim_type, + gs->num_invocations); + + /* Bin Tes/Gs params */ + struct v3d_vs_prog_data *vs_bin = v3d->prog.cs->prog_data.vs; + compute_vpm_config_gs(&v3d->screen->devinfo, + vs_bin, gs_bin, &vpm_cfg_bin); + + v3d_emit_tes_gs_shader_params(v3d->job, 
vpm_cfg_bin.gs_width, + vpm_cfg_bin.Gd, + vpm_cfg_bin.Gv); + + /* Render Tes/Gs params */ + struct v3d_vs_prog_data *vs = v3d->prog.vs->prog_data.vs; + compute_vpm_config_gs(&v3d->screen->devinfo, + vs, gs, &vpm_cfg); + + v3d_emit_tes_gs_shader_params(v3d->job, + vpm_cfg.gs_width, + vpm_cfg.Gd, + vpm_cfg.Gv); } #endif @@ -593,8 +798,15 @@ v3d_emit_gl_shader_state(struct v3d_context *v3d, shader.fragment_shader_uniforms_address = fs_uniforms; #if V3D_VERSION >= 41 - shader.min_coord_shader_input_segments_required_in_play = 1; - shader.min_vertex_shader_input_segments_required_in_play = 1; + shader.min_coord_shader_input_segments_required_in_play = + vpm_cfg_bin.As; + shader.min_vertex_shader_input_segments_required_in_play = + vpm_cfg.As; + + shader.min_coord_shader_output_segments_required_in_play_in_addition_to_vcm_cache_size = + vpm_cfg_bin.Ve; + shader.min_vertex_shader_output_segments_required_in_play_in_addition_to_vcm_cache_size = + vpm_cfg.Ve; shader.coordinate_shader_4_way_threadable = v3d->prog.cs->prog_data.vs->base.threads == 4; @@ -698,10 +910,8 @@ v3d_emit_gl_shader_state(struct v3d_context *v3d, } cl_emit(&job->bcl, VCM_CACHE_SIZE, vcm) { - vcm.number_of_16_vertex_batches_for_binning = - v3d->prog.cs->prog_data.vs->vcm_cache_size; - vcm.number_of_16_vertex_batches_for_rendering = - v3d->prog.vs->prog_data.vs->vcm_cache_size; + vcm.number_of_16_vertex_batches_for_binning = vpm_cfg_bin.Vc; + vcm.number_of_16_vertex_batches_for_rendering = vpm_cfg.Vc; } #if V3D_VERSION >= 41