X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fgallium%2Fdrivers%2Fv3d%2Fv3dx_draw.c;h=a614a6c815866e9af0e905e22203b8702b79ff1d;hb=76fc8c8bb1979122af40ed143fed726050b293b9;hp=0c8eb66b939bef5789123183500dce4ec41a3755;hpb=42210a4351fbb53a44eb49f31a12e86d7a84ffa4;p=mesa.git

diff --git a/src/gallium/drivers/v3d/v3dx_draw.c b/src/gallium/drivers/v3d/v3dx_draw.c
index 0c8eb66b939..a614a6c8158 100644
--- a/src/gallium/drivers/v3d/v3dx_draw.c
+++ b/src/gallium/drivers/v3d/v3dx_draw.c
@@ -23,7 +23,7 @@
 #include "util/u_blitter.h"
 #include "util/u_prim.h"
-#include "util/u_format.h"
+#include "util/format/u_format.h"
 #include "util/u_pack_color.h"
 #include "util/u_prim_restart.h"
 #include "util/u_upload_mgr.h"
@@ -145,11 +145,6 @@ v3d_predraw_check_stage_inputs(struct pipe_context *pctx,
 {
        struct v3d_context *v3d = v3d_context(pctx);

-       /* XXX perf: If we're reading from the output of TF in this job, we
-        * should instead be using the wait for transform feedback
-        * functionality.
-        */
-
        /* Flush writes to textures we're sampling. */
        for (int i = 0; i < v3d->tex[s].num_textures; i++) {
                struct pipe_sampler_view *pview = v3d->tex[s].textures[i];
@@ -157,26 +152,454 @@ v3d_predraw_check_stage_inputs(struct pipe_context *pctx,
                        continue;

                struct v3d_sampler_view *view = v3d_sampler_view(pview);

-               if (view->texture != view->base.texture)
+               if (view->texture != view->base.texture &&
+                   view->base.format != PIPE_FORMAT_X32_S8X24_UINT)
                        v3d_update_shadow_texture(pctx, &view->base);

-               v3d_flush_jobs_writing_resource(v3d, view->texture);
+               v3d_flush_jobs_writing_resource(v3d, view->texture,
+                                               V3D_FLUSH_DEFAULT);
        }

        /* Flush writes to UBOs. */
        foreach_bit(i, v3d->constbuf[s].enabled_mask) {
                struct pipe_constant_buffer *cb = &v3d->constbuf[s].cb[i];
-               if (cb->buffer)
-                       v3d_flush_jobs_writing_resource(v3d, cb->buffer);
+               if (cb->buffer) {
+                       v3d_flush_jobs_writing_resource(v3d, cb->buffer,
+                                                       V3D_FLUSH_DEFAULT);
+               }
        }

-       /* Flush writes to our image views */
+       /* Flush reads/writes to our SSBOs */
+       foreach_bit(i, v3d->ssbo[s].enabled_mask) {
+               struct pipe_shader_buffer *sb = &v3d->ssbo[s].sb[i];
+               if (sb->buffer) {
+                       v3d_flush_jobs_reading_resource(v3d, sb->buffer,
+                                                       V3D_FLUSH_NOT_CURRENT_JOB);
+               }
+       }
+
+       /* Flush reads/writes to our image views */
        foreach_bit(i, v3d->shaderimg[s].enabled_mask) {
                struct v3d_image_view *view = &v3d->shaderimg[s].si[i];

-               v3d_flush_jobs_writing_resource(v3d, view->base.resource);
+               v3d_flush_jobs_reading_resource(v3d, view->base.resource,
+                                               V3D_FLUSH_NOT_CURRENT_JOB);
+       }
+
+       /* Flush writes to our vertex buffers (i.e. from transform feedback) */
+       if (s == PIPE_SHADER_VERTEX) {
+               foreach_bit(i, v3d->vertexbuf.enabled_mask) {
+                       struct pipe_vertex_buffer *vb = &v3d->vertexbuf.vb[i];
+
+                       v3d_flush_jobs_writing_resource(v3d, vb->buffer.resource,
+                                                       V3D_FLUSH_DEFAULT);
+               }
+       }
+}
+
+static void
+v3d_predraw_check_outputs(struct pipe_context *pctx)
+{
+       struct v3d_context *v3d = v3d_context(pctx);
+
+       /* Flush jobs reading from TF buffers that we are about to write. */
+       if (v3d_transform_feedback_enabled(v3d)) {
+               struct v3d_streamout_stateobj *so = &v3d->streamout;
+
+               for (int i = 0; i < so->num_targets; i++) {
+                       if (!so->targets[i])
+                               continue;
+
+                       const struct pipe_stream_output_target *target =
+                               so->targets[i];
+                       v3d_flush_jobs_reading_resource(v3d, target->buffer,
+                                                       V3D_FLUSH_DEFAULT);
+               }
+       }
+}
+
+/**
+ * Checks if the state for the current draw reads a particular resource in
+ * the given shader stage.
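+ *
+ * This is what v3d_emit_wait_for_tf_if_needed() below uses to decide whether
+ * a draw has to wait on previous transform feedback writes in this job: it
+ * walks the vertex buffer, constant buffer, SSBO and sampler view bindings
+ * for the stage and compares their BOs against the resource.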
+ */
+static bool
+v3d_state_reads_resource(struct v3d_context *v3d,
+                         struct pipe_resource *prsc,
+                         enum pipe_shader_type s)
+{
+       struct v3d_resource *rsc = v3d_resource(prsc);
+
+       /* Vertex buffers */
+       if (s == PIPE_SHADER_VERTEX) {
+               foreach_bit(i, v3d->vertexbuf.enabled_mask) {
+                       struct pipe_vertex_buffer *vb = &v3d->vertexbuf.vb[i];
+                       if (!vb->buffer.resource)
+                               continue;
+
+                       struct v3d_resource *vb_rsc =
+                               v3d_resource(vb->buffer.resource);
+                       if (rsc->bo == vb_rsc->bo)
+                               return true;
+               }
+       }
+
+       /* Constant buffers */
+       foreach_bit(i, v3d->constbuf[s].enabled_mask) {
+               struct pipe_constant_buffer *cb = &v3d->constbuf[s].cb[i];
+               if (!cb->buffer)
+                       continue;
+
+               struct v3d_resource *cb_rsc = v3d_resource(cb->buffer);
+               if (rsc->bo == cb_rsc->bo)
+                       return true;
+       }
+
+       /* Shader storage buffers */
+       foreach_bit(i, v3d->ssbo[s].enabled_mask) {
+               struct pipe_shader_buffer *sb = &v3d->ssbo[s].sb[i];
+               if (!sb->buffer)
+                       continue;
+
+               struct v3d_resource *sb_rsc = v3d_resource(sb->buffer);
+               if (rsc->bo == sb_rsc->bo)
+                       return true;
+       }
+
+       /* Textures */
+       for (int i = 0; i < v3d->tex[s].num_textures; i++) {
+               struct pipe_sampler_view *pview = v3d->tex[s].textures[i];
+               if (!pview)
+                       continue;
+
+               struct v3d_sampler_view *view = v3d_sampler_view(pview);
+               struct v3d_resource *v_rsc = v3d_resource(view->texture);
+               if (rsc->bo == v_rsc->bo)
+                       return true;
+       }
+
+       return false;
+}
+
+static void
+v3d_emit_wait_for_tf(struct v3d_job *job)
+{
+       /* XXX: we might be able to skip this in some cases, for now we
+        * always emit it.
+        */
+       cl_emit(&job->bcl, FLUSH_TRANSFORM_FEEDBACK_DATA, flush);
+
+       cl_emit(&job->bcl, WAIT_FOR_TRANSFORM_FEEDBACK, wait) {
+               /* XXX: Wait for all outstanding writes... maybe we can do
+                * better in some cases.
+                */
+               wait.block_count = 255;
+       }
+
+       /* We have just flushed all our outstanding TF work in this job, so
+        * make sure we don't emit TF flushes for any of it again.
+        */
+       _mesa_set_clear(job->tf_write_prscs, NULL);
+}
+
+static void
+v3d_emit_wait_for_tf_if_needed(struct v3d_context *v3d, struct v3d_job *job)
+{
+       if (!job->tf_enabled)
+               return;
+
+       set_foreach(job->tf_write_prscs, entry) {
+               struct pipe_resource *prsc = (struct pipe_resource *)entry->key;
+               for (int s = 0; s < PIPE_SHADER_COMPUTE; s++) {
+                       /* Fragment shaders can only start executing after all
+                        * binning (and thus TF) is complete.
+                        *
+                        * XXX: For VS/GS/TES, if the binning shader does not
+                        * read the resource then we could also avoid emitting
+                        * the wait.
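+                        *
+                        * For now any match in a non-fragment stage forces
+                        * the wait, even if only the render-mode shader
+                        * (and not the binning shader) reads the resource.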
+ */ + if (s == PIPE_SHADER_FRAGMENT) + continue; + + if (v3d_state_reads_resource(v3d, prsc, s)) { + v3d_emit_wait_for_tf(job); + return; + } + } + } +} + +struct vpm_config { + uint32_t As; + uint32_t Vc; + uint32_t Gs; + uint32_t Gd; + uint32_t Gv; + uint32_t Ve; + uint32_t gs_width; +}; + +#if V3D_VERSION >= 41 +static void +v3d_emit_gs_state_record(struct v3d_job *job, + struct v3d_compiled_shader *gs_bin, + struct v3d_cl_reloc gs_bin_uniforms, + struct v3d_compiled_shader *gs, + struct v3d_cl_reloc gs_render_uniforms) +{ + cl_emit(&job->indirect, GEOMETRY_SHADER_STATE_RECORD, shader) { + shader.geometry_bin_mode_shader_code_address = + cl_address(v3d_resource(gs_bin->resource)->bo, + gs_bin->offset); + shader.geometry_bin_mode_shader_4_way_threadable = + gs_bin->prog_data.gs->base.threads == 4; + shader.geometry_bin_mode_shader_start_in_final_thread_section = + gs_bin->prog_data.gs->base.single_seg; + shader.geometry_bin_mode_shader_propagate_nans = true; + shader.geometry_bin_mode_shader_uniforms_address = + gs_bin_uniforms; + + shader.geometry_render_mode_shader_code_address = + cl_address(v3d_resource(gs->resource)->bo, gs->offset); + shader.geometry_render_mode_shader_4_way_threadable = + gs->prog_data.gs->base.threads == 4; + shader.geometry_render_mode_shader_start_in_final_thread_section = + gs->prog_data.gs->base.single_seg; + shader.geometry_render_mode_shader_propagate_nans = true; + shader.geometry_render_mode_shader_uniforms_address = + gs_render_uniforms; + } +} + +static uint8_t +v3d_gs_output_primitive(uint32_t prim_type) +{ + switch (prim_type) { + case GL_POINTS: + return GEOMETRY_SHADER_POINTS; + case GL_LINE_STRIP: + return GEOMETRY_SHADER_LINE_STRIP; + case GL_TRIANGLE_STRIP: + return GEOMETRY_SHADER_TRI_STRIP; + default: + unreachable("Unsupported primitive type"); + } +} + +static void +v3d_emit_tes_gs_common_params(struct v3d_job *job, + uint8_t gs_out_prim_type, + uint8_t gs_num_invocations) +{ + /* This, and v3d_emit_tes_gs_shader_params below, fill in default + * values for tessellation fields even though we don't support + * tessellation yet because our packing functions (and the simulator) + * complain if we don't. 
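+        *
+        * The defaults below describe a minimal single-invocation triangle
+        * setup; with tessellation disabled they are placeholders to keep
+        * the packing functions happy rather than values the hardware is
+        * expected to act on.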
+        */
+       cl_emit(&job->indirect, TESSELLATION_GEOMETRY_COMMON_PARAMS, shader) {
+               shader.tessellation_type = TESSELLATION_TYPE_TRIANGLE;
+               shader.tessellation_point_mode = false;
+               shader.tessellation_edge_spacing = TESSELLATION_EDGE_SPACING_EVEN;
+               shader.tessellation_clockwise = true;
+               shader.tessellation_invocations = 1;
+
+               shader.geometry_shader_output_format =
+                       v3d_gs_output_primitive(gs_out_prim_type);
+               shader.geometry_shader_instances = gs_num_invocations & 0x1F;
+       }
+}
+
+static uint8_t
+simd_width_to_gs_pack_mode(uint32_t width)
+{
+       switch (width) {
+       case 16:
+               return V3D_PACK_MODE_16_WAY;
+       case 8:
+               return V3D_PACK_MODE_8_WAY;
+       case 4:
+               return V3D_PACK_MODE_4_WAY;
+       case 1:
+               return V3D_PACK_MODE_1_WAY;
+       default:
+               unreachable("Invalid SIMD width");
+       };
+}
+
+static void
+v3d_emit_tes_gs_shader_params(struct v3d_job *job,
+                              uint32_t gs_simd,
+                              uint32_t gs_vpm_output_size,
+                              uint32_t gs_max_vpm_input_size_per_batch)
+{
+       cl_emit(&job->indirect, TESSELLATION_GEOMETRY_SHADER_PARAMS, shader) {
+               shader.tcs_batch_flush_mode = V3D_TCS_FLUSH_MODE_FULLY_PACKED;
+               shader.per_patch_data_column_depth = 1;
+               shader.tcs_output_segment_size_in_sectors = 1;
+               shader.tcs_output_segment_pack_mode = V3D_PACK_MODE_16_WAY;
+               shader.tes_output_segment_size_in_sectors = 1;
+               shader.tes_output_segment_pack_mode = V3D_PACK_MODE_16_WAY;
+               shader.gs_output_segment_size_in_sectors = gs_vpm_output_size;
+               shader.gs_output_segment_pack_mode =
+                       simd_width_to_gs_pack_mode(gs_simd);
+               shader.tbg_max_patches_per_tcs_batch = 1;
+               shader.tbg_max_extra_vertex_segs_for_patches_after_first = 0;
+               shader.tbg_min_tcs_output_segments_required_in_play = 1;
+               shader.tbg_min_per_patch_data_segments_required_in_play = 1;
+               shader.tpg_max_patches_per_tes_batch = 1;
+               shader.tpg_max_vertex_segments_per_tes_batch = 0;
+               shader.tpg_max_tcs_output_segments_per_tes_batch = 1;
+               shader.tpg_min_tes_output_segments_required_in_play = 1;
+               shader.gbg_max_tes_output_vertex_segments_per_gs_batch =
+                       gs_max_vpm_input_size_per_batch;
+               shader.gbg_min_gs_output_segments_required_in_play = 1;
+       }
+}
+
+static inline uint32_t
+compute_vpm_size_in_sectors(const struct v3d_device_info *devinfo)
+{
+       assert(devinfo->vpm_size > 0);
+       const uint32_t sector_size = V3D_CHANNELS * sizeof(uint32_t) * 8;
+       return devinfo->vpm_size / sector_size;
+}
+
+/* Computes various parameters affecting VPM memory configuration for programs
+ * involving geometry shaders to ensure the program fits in memory and honors
+ * requirements described in section "VPM usage" of the programming manual.
+ */
+static void
+compute_vpm_config_gs(struct v3d_device_info *devinfo,
+                      struct v3d_vs_prog_data *vs,
+                      struct v3d_gs_prog_data *gs,
+                      struct vpm_config *vpm_cfg_out)
+{
+       const uint32_t A = vs->separate_segments ? 1 : 0;
+       const uint32_t Ad = vs->vpm_input_size;
+       const uint32_t Vd = vs->vpm_output_size;
+
+       const uint32_t vpm_size = compute_vpm_size_in_sectors(devinfo);
+
+       /* Try to fit the program into our VPM memory budget by adjusting
+        * configurable parameters iteratively. We do this in two phases:
+        * the first phase tries to fit the program into the total available
+        * VPM memory. If we succeed at that, then the second phase attempts
+        * to fit the program into half of that budget so we can run bin and
+        * render programs in parallel.
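+        *
+        * In both phases the quantity we are trying to fit is the sector
+        * count computed in the loop below:
+        *
+        *    vpm_sectors = A * As * Ad + (Vc + Ve) * Vd + Gs * Gd
+        *
+        * and the knobs we turn are Gv (which also drives Ve) and, failing
+        * that, the GS dispatch width, which lets us shrink Gd.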
+        */
+       struct vpm_config vpm_cfg[2];
+       struct vpm_config *final_vpm_cfg = NULL;
+       uint32_t phase = 0;
+
+       vpm_cfg[phase].As = 1;
+       vpm_cfg[phase].Gs = 1;
+       vpm_cfg[phase].Gd = gs->vpm_output_size;
+       vpm_cfg[phase].gs_width = gs->simd_width;
+
+       /* While there is a requirement that Vc >= [Vn / 16], this is
+        * always the case when tessellation is not present because in that
+        * case Vn can only be 6 at most (when the input primitive is
+        * triangles with adjacency).
+        *
+        * We always choose Vc=2. We can't go lower than this due to GFXH-1744,
+        * and Broadcom has not found it worth it to increase it beyond this
+        * in general. Increasing Vc also increases VPM memory pressure, which
+        * can turn out to be detrimental for performance in some scenarios.
+        */
+       vpm_cfg[phase].Vc = 2;
+
+       /* Gv is a constraint on the hardware to not exceed the
+        * specified number of vertex segments per GS batch. If adding a
+        * new primitive to a GS batch would result in a range of more
+        * than Gv vertex segments being referenced by the batch, then
+        * the hardware will flush the batch and start a new one. This
+        * means that we can choose any value we want, we just need to
+        * be aware that larger values improve GS batch utilization
+        * at the expense of more VPM memory pressure (which can affect
+        * other performance aspects, such as GS dispatch width).
+        * We start with the largest value, and will reduce it if we
+        * find that total memory pressure is too high.
+        */
+       vpm_cfg[phase].Gv = 3;
+       do {
+               /* When a GS is present in the absence of TES, we need to
+                * satisfy Ve >= Gv. We go with the smallest value of Ve to
+                * avoid increasing memory pressure.
+                */
+               vpm_cfg[phase].Ve = vpm_cfg[phase].Gv;
+
+               uint32_t vpm_sectors =
+                       A * vpm_cfg[phase].As * Ad +
+                       (vpm_cfg[phase].Vc + vpm_cfg[phase].Ve) * Vd +
+                       vpm_cfg[phase].Gs * vpm_cfg[phase].Gd;
+
+               /* Ideally we want to use no more than half of the available
+                * memory so we can execute a bin and render program in
+                * parallel without stalls. If we achieved that then we are
+                * done.
+                */
+               if (vpm_sectors <= vpm_size / 2) {
+                       final_vpm_cfg = &vpm_cfg[phase];
+                       break;
+               }
+
+               /* At the very least, we should not allocate more than the
+                * total available VPM memory. If we have a configuration that
+                * succeeds at this we save it and continue to see if we can
+                * meet the half-memory-use criteria too.
+                */
+               if (phase == 0 && vpm_sectors <= vpm_size) {
+                       vpm_cfg[1] = vpm_cfg[0];
+                       phase = 1;
+               }
+
+               /* Try lowering Gv */
+               if (vpm_cfg[phase].Gv > 0) {
+                       vpm_cfg[phase].Gv--;
+                       continue;
+               }
+
+               /* Try lowering GS dispatch width */
+               if (vpm_cfg[phase].gs_width > 1) {
+                       do {
+                               vpm_cfg[phase].gs_width >>= 1;
+                               vpm_cfg[phase].Gd =
+                                       align(vpm_cfg[phase].Gd, 2) / 2;
+                       } while (vpm_cfg[phase].gs_width == 2);
+
+                       /* Reset Gv to max after dropping dispatch width */
+                       vpm_cfg[phase].Gv = 3;
+                       continue;
+               }
+
+               /* We ran out of options to reduce memory pressure. If we
+                * are at phase 1 we have at least a valid configuration, so
+                * we use that.
+                */
+               if (phase == 1)
+                       final_vpm_cfg = &vpm_cfg[0];
+               break;
+       } while (true);
+
+       if (!final_vpm_cfg) {
+               /* FIXME: maybe return a boolean to indicate failure and use
+                * that to stop the submission for this draw call.
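+                *
+                * One possible shape for that (not wired up here): make this
+                * function return a bool and have the caller in
+                * v3d_emit_gl_shader_state() do
+                *
+                *    if (!compute_vpm_config_gs(devinfo, vs, gs, &vpm_cfg))
+                *            return;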
+ */ + fprintf(stderr, "Failed to allocate VPM memory.\n"); + abort(); } + + assert(final_vpm_cfg); + assert(final_vpm_cfg->Gd <= 16); + assert(final_vpm_cfg->Gv < 4); + assert(final_vpm_cfg->Ve < 4); + assert(final_vpm_cfg->Vc >= 2 && final_vpm_cfg->Vc <= 4); + assert(final_vpm_cfg->gs_width == 1 || + final_vpm_cfg->gs_width == 4 || + final_vpm_cfg->gs_width == 8 || + final_vpm_cfg->gs_width == 16); + + *vpm_cfg_out = *final_vpm_cfg; } +#endif static void v3d_emit_gl_shader_state(struct v3d_context *v3d, @@ -190,20 +613,57 @@ v3d_emit_gl_shader_state(struct v3d_context *v3d, /* Upload the uniforms to the indirect CL first */ struct v3d_cl_reloc fs_uniforms = - v3d_write_uniforms(v3d, v3d->prog.fs, + v3d_write_uniforms(v3d, job, v3d->prog.fs, PIPE_SHADER_FRAGMENT); + + struct v3d_cl_reloc gs_uniforms = { NULL, 0 }; + struct v3d_cl_reloc gs_bin_uniforms = { NULL, 0 }; + if (v3d->prog.gs) { + gs_uniforms = v3d_write_uniforms(v3d, job, v3d->prog.gs, + PIPE_SHADER_GEOMETRY); + } + if (v3d->prog.gs_bin) { + gs_bin_uniforms = v3d_write_uniforms(v3d, job, v3d->prog.gs_bin, + PIPE_SHADER_GEOMETRY); + } + struct v3d_cl_reloc vs_uniforms = - v3d_write_uniforms(v3d, v3d->prog.vs, + v3d_write_uniforms(v3d, job, v3d->prog.vs, PIPE_SHADER_VERTEX); struct v3d_cl_reloc cs_uniforms = - v3d_write_uniforms(v3d, v3d->prog.cs, + v3d_write_uniforms(v3d, job, v3d->prog.cs, PIPE_SHADER_VERTEX); + /* Update the cache dirty flag based on the shader progs data */ + job->tmu_dirty_rcl |= v3d->prog.cs->prog_data.vs->base.tmu_dirty_rcl; + job->tmu_dirty_rcl |= v3d->prog.vs->prog_data.vs->base.tmu_dirty_rcl; + if (v3d->prog.gs_bin) { + job->tmu_dirty_rcl |= + v3d->prog.gs_bin->prog_data.gs->base.tmu_dirty_rcl; + } + if (v3d->prog.gs) { + job->tmu_dirty_rcl |= + v3d->prog.gs->prog_data.gs->base.tmu_dirty_rcl; + } + job->tmu_dirty_rcl |= v3d->prog.fs->prog_data.fs->base.tmu_dirty_rcl; + /* See GFXH-930 workaround below */ uint32_t num_elements_to_emit = MAX2(vtx->num_elements, 1); + + uint32_t shader_state_record_length = + cl_packet_length(GL_SHADER_STATE_RECORD); +#if V3D_VERSION >= 41 + if (v3d->prog.gs) { + shader_state_record_length += + cl_packet_length(GEOMETRY_SHADER_STATE_RECORD) + + cl_packet_length(TESSELLATION_GEOMETRY_COMMON_PARAMS) + + 2 * cl_packet_length(TESSELLATION_GEOMETRY_SHADER_PARAMS); + } +#endif + uint32_t shader_rec_offset = - v3d_cl_ensure_space(&job->indirect, - cl_packet_length(GL_SHADER_STATE_RECORD) + + v3d_cl_ensure_space(&job->indirect, + shader_state_record_length + num_elements_to_emit * cl_packet_length(GL_SHADER_STATE_ATTRIBUTE_RECORD), 32); @@ -212,6 +672,54 @@ v3d_emit_gl_shader_state(struct v3d_context *v3d, * compile time, so that we mostly just have to OR the VS and FS * records together at draw time. 
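+        *
+        * The geometry shader path added below is the exception: with a GS
+        * bound we also emit the GS state record and the Tes/Gs parameter
+        * packets here, and the VPM configuration has to be computed at
+        * draw time via compute_vpm_config_gs().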
*/ + + struct vpm_config vpm_cfg_bin, vpm_cfg; + + assert(v3d->screen->devinfo.ver >= 41 || !v3d->prog.gs); + if (!v3d->prog.gs) { + vpm_cfg_bin.As = 1; + vpm_cfg_bin.Ve = 0; + vpm_cfg_bin.Vc = v3d->prog.cs->prog_data.vs->vcm_cache_size; + + vpm_cfg.As = 1; + vpm_cfg.Ve = 0; + vpm_cfg.Vc = v3d->prog.vs->prog_data.vs->vcm_cache_size; + } +#if V3D_VERSION >= 41 + else { + v3d_emit_gs_state_record(v3d->job, + v3d->prog.gs_bin, gs_bin_uniforms, + v3d->prog.gs, gs_uniforms); + + struct v3d_gs_prog_data *gs = v3d->prog.gs->prog_data.gs; + struct v3d_gs_prog_data *gs_bin = v3d->prog.gs_bin->prog_data.gs; + + v3d_emit_tes_gs_common_params(v3d->job, + gs->out_prim_type, + gs->num_invocations); + + /* Bin Tes/Gs params */ + struct v3d_vs_prog_data *vs_bin = v3d->prog.cs->prog_data.vs; + compute_vpm_config_gs(&v3d->screen->devinfo, + vs_bin, gs_bin, &vpm_cfg_bin); + + v3d_emit_tes_gs_shader_params(v3d->job, + vpm_cfg_bin.gs_width, + vpm_cfg_bin.Gd, + vpm_cfg_bin.Gv); + + /* Render Tes/Gs params */ + struct v3d_vs_prog_data *vs = v3d->prog.vs->prog_data.vs; + compute_vpm_config_gs(&v3d->screen->devinfo, + vs, gs, &vpm_cfg); + + v3d_emit_tes_gs_shader_params(v3d->job, + vpm_cfg.gs_width, + vpm_cfg.Gd, + vpm_cfg.Gv); + } +#endif + cl_emit(&job->indirect, GL_SHADER_STATE_RECORD, shader) { shader.enable_clipping = true; /* VC5_DIRTY_PRIM_MODE | VC5_DIRTY_RASTERIZER */ @@ -235,6 +743,19 @@ v3d_emit_gl_shader_state(struct v3d_context *v3d, shader.fragment_shader_uses_real_pixel_centre_w_in_addition_to_centroid_w2 = v3d->prog.fs->prog_data.fs->uses_center_w; +#if V3D_VERSION >= 41 + shader.any_shader_reads_hardware_written_primitive_id = + v3d->prog.gs ? v3d->prog.gs->prog_data.gs->uses_pid : + false; +#endif + +#if V3D_VERSION >= 40 + shader.do_scoreboard_wait_on_first_thread_switch = + v3d->prog.fs->prog_data.fs->lock_scoreboard_on_first_thrsw; + shader.disable_implicit_point_line_varyings = + !v3d->prog.fs->prog_data.fs->uses_implicit_point_line_varyings; +#endif + shader.number_of_varyings_in_fragment_shader = v3d->prog.fs->prog_data.fs->num_inputs; @@ -277,8 +798,15 @@ v3d_emit_gl_shader_state(struct v3d_context *v3d, shader.fragment_shader_uniforms_address = fs_uniforms; #if V3D_VERSION >= 41 - shader.min_coord_shader_input_segments_required_in_play = 1; - shader.min_vertex_shader_input_segments_required_in_play = 1; + shader.min_coord_shader_input_segments_required_in_play = + vpm_cfg_bin.As; + shader.min_vertex_shader_input_segments_required_in_play = + vpm_cfg.As; + + shader.min_coord_shader_output_segments_required_in_play_in_addition_to_vcm_cache_size = + vpm_cfg_bin.Ve; + shader.min_vertex_shader_output_segments_required_in_play_in_addition_to_vcm_cache_size = + vpm_cfg.Ve; shader.coordinate_shader_4_way_threadable = v3d->prog.cs->prog_data.vs->base.threads == 4; @@ -382,43 +910,56 @@ v3d_emit_gl_shader_state(struct v3d_context *v3d, } cl_emit(&job->bcl, VCM_CACHE_SIZE, vcm) { - vcm.number_of_16_vertex_batches_for_binning = - v3d->prog.cs->prog_data.vs->vcm_cache_size; - vcm.number_of_16_vertex_batches_for_rendering = - v3d->prog.vs->prog_data.vs->vcm_cache_size; + vcm.number_of_16_vertex_batches_for_binning = vpm_cfg_bin.Vc; + vcm.number_of_16_vertex_batches_for_rendering = vpm_cfg.Vc; } +#if V3D_VERSION >= 41 + if (v3d->prog.gs) { + cl_emit(&job->bcl, GL_SHADER_STATE_INCLUDING_GS, state) { + state.address = cl_address(job->indirect.bo, + shader_rec_offset); + state.number_of_attribute_arrays = num_elements_to_emit; + } + } else { + cl_emit(&job->bcl, GL_SHADER_STATE, state) { + state.address = 
                                        cl_address(job->indirect.bo,
+                                                  shader_rec_offset);
+                       state.number_of_attribute_arrays = num_elements_to_emit;
+               }
+       }
+#else
+       assert(!v3d->prog.gs);
        cl_emit(&job->bcl, GL_SHADER_STATE, state) {
                state.address = cl_address(job->indirect.bo,
                                           shader_rec_offset);
                state.number_of_attribute_arrays = num_elements_to_emit;
        }
+#endif

        v3d_bo_unreference(&cs_uniforms.bo);
        v3d_bo_unreference(&vs_uniforms.bo);
+       if (gs_uniforms.bo)
+               v3d_bo_unreference(&gs_uniforms.bo);
+       if (gs_bin_uniforms.bo)
+               v3d_bo_unreference(&gs_bin_uniforms.bo);
        v3d_bo_unreference(&fs_uniforms.bo);
-
-       job->shader_rec_count++;
 }

 /**
- * Computes the various transform feedback statistics, since they can't be
- * recorded by CL packets.
+ * Updates the number of primitives generated from the number of vertices
+ * to draw. We do this here instead of using PRIMITIVE_COUNTS_FEEDBACK because
+ * using the GPU packet for this might require sync waits and this is trivial
+ * to handle on the CPU instead.
  */
 static void
-v3d_tf_statistics_record(struct v3d_context *v3d,
-                         const struct pipe_draw_info *info,
-                         bool prim_tf)
+v3d_update_primitives_generated_counter(struct v3d_context *v3d,
+                                        const struct pipe_draw_info *info)
 {
        if (!v3d->active_queries)
                return;

        uint32_t prims = u_prims_for_vertices(info->mode, info->count);
        v3d->prims_generated += prims;
-
-       if (prim_tf) {
-               /* XXX: Only count if we didn't overflow. */
-               v3d->tf_prims_generated += prims;
-       }
 }

 static void
@@ -503,14 +1044,29 @@ v3d_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
                return;
        }

-       /* Before setting up the draw, flush anything writing to the textures
-        * that we read from.
+       /* Before setting up the draw, flush anything writing to the resources
+        * that we read from or reading from resources we write to.
         */
        for (int s = 0; s < PIPE_SHADER_COMPUTE; s++)
                v3d_predraw_check_stage_inputs(pctx, s);

-       if (info->indirect)
-               v3d_flush_jobs_writing_resource(v3d, info->indirect->buffer);
+       if (info->indirect) {
+               v3d_flush_jobs_writing_resource(v3d, info->indirect->buffer,
+                                               V3D_FLUSH_DEFAULT);
+       }
+
+       v3d_predraw_check_outputs(pctx);
+
+       /* If transform feedback is active and we are switching primitive type
+        * we need to submit the job before drawing and update the vertex count
+        * written to TF based on the primitive type since we will need to
+        * know the exact vertex count if the application decides to call
+        * glDrawTransformFeedback() later.
+        */
+       if (v3d->streamout.num_targets > 0 &&
+           u_base_prim_type(info->mode) != u_base_prim_type(v3d->prim_mode)) {
+               v3d_tf_update_counters(v3d);
+       }

        struct v3d_job *job = v3d_get_job_for_fbo(v3d);

@@ -528,8 +1084,8 @@ v3d_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
                job->submit.in_sync_bcl = v3d->out_sync;
        }

-       /* Mark SSBOs as being written. We don't actually know which ones are
-        * read vs written, so just assume the worst
+       /* Mark SSBOs and images as being written. We don't actually know
+        * which ones are read vs written, so just assume the worst.
         */
        for (int s = 0; s < PIPE_SHADER_COMPUTE; s++) {
                foreach_bit(i, v3d->ssbo[s].enabled_mask) {
@@ -559,6 +1115,16 @@ v3d_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
        v3d_update_compiled_shaders(v3d, info->mode);
        v3d_update_job_ez(v3d, job);

+       /* If this job was writing to transform feedback buffers before this
+        * draw and we are reading from them here, then we need to wait for TF
+        * to complete before we emit this draw.
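+        * (The TF writes are tracked in job->tf_write_prscs, and
+        * v3d_state_reads_resource() above is what matches them against the
+        * current bindings.)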
+ * + * Notice this check needs to happen before we emit state for the + * current draw call, where we update job->tf_enabled, so we can ensure + * that we only check TF writes for prior draws. + */ + v3d_emit_wait_for_tf_if_needed(v3d, job); + #if V3D_VERSION >= 41 v3d41_emit_state(pctx); #else @@ -571,9 +1137,15 @@ v3d_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) VC5_DIRTY_RASTERIZER | VC5_DIRTY_COMPILED_CS | VC5_DIRTY_COMPILED_VS | + VC5_DIRTY_COMPILED_GS_BIN | + VC5_DIRTY_COMPILED_GS | VC5_DIRTY_COMPILED_FS | v3d->prog.cs->uniform_dirty_bits | v3d->prog.vs->uniform_dirty_bits | + (v3d->prog.gs_bin ? + v3d->prog.gs_bin->uniform_dirty_bits : 0) | + (v3d->prog.gs ? + v3d->prog.gs->uniform_dirty_bits : 0) | v3d->prog.fs->uniform_dirty_bits)) { v3d_emit_gl_shader_state(v3d, info); } @@ -599,7 +1171,7 @@ v3d_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) prim_tf_enable = (V3D_PRIM_POINTS_TF - V3D_PRIM_POINTS); #endif - v3d_tf_statistics_record(v3d, info, v3d->streamout.num_targets); + v3d_update_primitives_generated_counter(v3d, info); /* Note that the primitive type fields match with OpenGL/gallium * definitions, up to but not including QUADS. @@ -674,8 +1246,6 @@ v3d_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) } } - job->draw_calls_queued++; - if (info->has_user_indices) pipe_resource_reference(&prsc, NULL); } else { @@ -689,16 +1259,26 @@ v3d_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) info->indirect->offset); } } else if (info->instance_count > 1) { + struct pipe_stream_output_target *so = + info->count_from_stream_output; + uint32_t vert_count = so ? + v3d_stream_output_target_get_vertex_count(so) : + info->count; cl_emit(&job->bcl, VERTEX_ARRAY_INSTANCED_PRIMS, prim) { prim.mode = info->mode | prim_tf_enable; prim.index_of_first_vertex = info->start; prim.number_of_instances = info->instance_count; - prim.instance_length = info->count; + prim.instance_length = vert_count; } } else { + struct pipe_stream_output_target *so = + info->count_from_stream_output; + uint32_t vert_count = so ? + v3d_stream_output_target_get_vertex_count(so) : + info->count; cl_emit(&job->bcl, VERTEX_ARRAY_PRIMS, prim) { prim.mode = info->mode | prim_tf_enable; - prim.length = info->count; + prim.length = vert_count; prim.index_of_first_vertex = info->start; } } @@ -711,6 +1291,8 @@ v3d_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) cl_emit(&job->bcl, TRANSFORM_FEEDBACK_FLUSH_AND_COUNT, flush); job->draw_calls_queued++; + if (v3d->streamout.num_targets) + job->tf_draw_calls_queued++; /* Increment the TF offsets by how many verts we wrote. XXX: This * needs some clamping to the buffer size. @@ -767,6 +1349,176 @@ v3d_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) v3d_flush(pctx); } +#if V3D_VERSION >= 41 +#define V3D_CSD_CFG012_WG_COUNT_SHIFT 16 +#define V3D_CSD_CFG012_WG_OFFSET_SHIFT 0 +/* Allow this dispatch to start while the last one is still running. */ +#define V3D_CSD_CFG3_OVERLAP_WITH_PREV (1 << 26) +/* Maximum supergroup ID. 6 bits. */ +#define V3D_CSD_CFG3_MAX_SG_ID_SHIFT 20 +/* Batches per supergroup minus 1. 8 bits. 
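+ * The -1 bias mirrors how the batch count is written to cfg[4] below.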
 */
+#define V3D_CSD_CFG3_BATCHES_PER_SG_M1_SHIFT 12
+/* Workgroups per supergroup, 0 means 16 */
+#define V3D_CSD_CFG3_WGS_PER_SG_SHIFT 8
+#define V3D_CSD_CFG3_WG_SIZE_SHIFT 0
+
+#define V3D_CSD_CFG5_PROPAGATE_NANS (1 << 2)
+#define V3D_CSD_CFG5_SINGLE_SEG (1 << 1)
+#define V3D_CSD_CFG5_THREADING (1 << 0)
+
+static void
+v3d_launch_grid(struct pipe_context *pctx, const struct pipe_grid_info *info)
+{
+       struct v3d_context *v3d = v3d_context(pctx);
+       struct v3d_screen *screen = v3d->screen;
+
+       v3d_predraw_check_stage_inputs(pctx, PIPE_SHADER_COMPUTE);
+
+       v3d_update_compiled_cs(v3d);
+
+       if (!v3d->prog.compute->resource) {
+               static bool warned = false;
+               if (!warned) {
+                       fprintf(stderr,
+                               "Compute shader failed to compile. "
+                               "Expect corruption.\n");
+                       warned = true;
+               }
+               return;
+       }
+
+       /* Some of the units of scale:
+        *
+        * - Batches of 16 work items (shader invocations) that will be
+        *   queued to run on a QPU at once.
+        *
+        * - Workgroups composed of work items based on the shader's layout
+        *   declaration.
+        *
+        * - Supergroups of 1-16 workgroups. There can only be 16 supergroups
+        *   running at a time on the core, so we want to keep them large to
+        *   keep the QPUs busy, but a whole supergroup will sync at a barrier
+        *   so we want to keep them small if one is present.
+        */
+       struct drm_v3d_submit_csd submit = { 0 };
+       struct v3d_job *job = v3d_job_create(v3d);
+
+       /* Set up the actual number of workgroups, synchronously mapping the
+        * indirect buffer if necessary to get the dimensions.
+        */
+       if (info->indirect) {
+               struct pipe_transfer *transfer;
+               uint32_t *map = pipe_buffer_map_range(pctx, info->indirect,
+                                                     info->indirect_offset,
+                                                     3 * sizeof(uint32_t),
+                                                     PIPE_TRANSFER_READ,
+                                                     &transfer);
+               memcpy(v3d->compute_num_workgroups, map, 3 * sizeof(uint32_t));
+               pipe_buffer_unmap(pctx, transfer);
+
+               if (v3d->compute_num_workgroups[0] == 0 ||
+                   v3d->compute_num_workgroups[1] == 0 ||
+                   v3d->compute_num_workgroups[2] == 0) {
+                       /* Nothing to dispatch, so skip the draw (CSD can't
+                        * handle 0 workgroups).
+                        */
+                       return;
+               }
+       } else {
+               v3d->compute_num_workgroups[0] = info->grid[0];
+               v3d->compute_num_workgroups[1] = info->grid[1];
+               v3d->compute_num_workgroups[2] = info->grid[2];
+       }
+
+       for (int i = 0; i < 3; i++) {
+               submit.cfg[i] |= (v3d->compute_num_workgroups[i] <<
+                                 V3D_CSD_CFG012_WG_COUNT_SHIFT);
+       }
+
+       perf_debug("CSD only using single WG per SG currently, "
+                  "should increase that when possible.");
+       int wgs_per_sg = 1;
+       int wg_size = info->block[0] * info->block[1] * info->block[2];
+       submit.cfg[3] |= wgs_per_sg << V3D_CSD_CFG3_WGS_PER_SG_SHIFT;
+       submit.cfg[3] |= ((DIV_ROUND_UP(wgs_per_sg * wg_size, 16) - 1) <<
+                         V3D_CSD_CFG3_BATCHES_PER_SG_M1_SHIFT);
+       submit.cfg[3] |= (wg_size & 0xff) << V3D_CSD_CFG3_WG_SIZE_SHIFT;
+
+       int batches_per_wg = DIV_ROUND_UP(wg_size, 16);
+       /* Number of batches the dispatch will invoke (minus 1). */
+       submit.cfg[4] = batches_per_wg * (v3d->compute_num_workgroups[0] *
+                                         v3d->compute_num_workgroups[1] *
+                                         v3d->compute_num_workgroups[2]) - 1;
+
+       /* Make sure we didn't accidentally underflow.
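+        * cfg[4] holds the batch count biased by -1, so a dispatch that
+        * somehow ended up with zero batches would wrap around to ~0; the
+        * indirect zero-workgroup case was already rejected above.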
+        */
+       assert(submit.cfg[4] != ~0);
+
+       v3d_job_add_bo(job, v3d_resource(v3d->prog.compute->resource)->bo);
+       submit.cfg[5] = (v3d_resource(v3d->prog.compute->resource)->bo->offset +
+                        v3d->prog.compute->offset);
+       submit.cfg[5] |= V3D_CSD_CFG5_PROPAGATE_NANS;
+       if (v3d->prog.compute->prog_data.base->single_seg)
+               submit.cfg[5] |= V3D_CSD_CFG5_SINGLE_SEG;
+       if (v3d->prog.compute->prog_data.base->threads == 4)
+               submit.cfg[5] |= V3D_CSD_CFG5_THREADING;
+
+       if (v3d->prog.compute->prog_data.compute->shared_size) {
+               v3d->compute_shared_memory =
+                       v3d_bo_alloc(v3d->screen,
+                                    v3d->prog.compute->prog_data.compute->shared_size *
+                                    wgs_per_sg,
+                                    "shared_vars");
+       }
+
+       struct v3d_cl_reloc uniforms = v3d_write_uniforms(v3d, job,
+                                                         v3d->prog.compute,
+                                                         PIPE_SHADER_COMPUTE);
+       v3d_job_add_bo(job, uniforms.bo);
+       submit.cfg[6] = uniforms.bo->offset + uniforms.offset;
+
+       /* Pull some job state that was stored in a SUBMIT_CL struct out to
+        * our SUBMIT_CSD struct
+        */
+       submit.bo_handles = job->submit.bo_handles;
+       submit.bo_handle_count = job->submit.bo_handle_count;
+
+       /* Serialize this in the rest of our command stream. */
+       submit.in_sync = v3d->out_sync;
+       submit.out_sync = v3d->out_sync;
+
+       if (!(V3D_DEBUG & V3D_DEBUG_NORAST)) {
+               int ret = v3d_ioctl(screen->fd, DRM_IOCTL_V3D_SUBMIT_CSD,
+                                   &submit);
+               static bool warned = false;
+               if (ret && !warned) {
+                       fprintf(stderr, "CSD submit call returned %s. "
+                               "Expect corruption.\n", strerror(errno));
+                       warned = true;
+               }
+       }
+
+       v3d_job_free(v3d, job);
+
+       /* Mark SSBOs as being written. We don't actually know which ones
+        * are read vs written, so just assume the worst.
+        */
+       foreach_bit(i, v3d->ssbo[PIPE_SHADER_COMPUTE].enabled_mask) {
+               struct v3d_resource *rsc = v3d_resource(
+                       v3d->ssbo[PIPE_SHADER_COMPUTE].sb[i].buffer);
+               rsc->writes++; /* XXX */
+       }
+
+       foreach_bit(i, v3d->shaderimg[PIPE_SHADER_COMPUTE].enabled_mask) {
+               struct v3d_resource *rsc = v3d_resource(
+                       v3d->shaderimg[PIPE_SHADER_COMPUTE].si[i].base.resource);
+               rsc->writes++;
+       }
+
+       v3d_bo_unreference(&uniforms.bo);
+       v3d_bo_unreference(&v3d->compute_shared_memory);
+}
+#endif
+
 /**
  * Implements gallium's clear() hook (glClear()) by drawing a pair of triangles.
  */
@@ -789,7 +1541,8 @@ v3d_draw_clear(struct v3d_context *v3d,
                           v3d->framebuffer.width, v3d->framebuffer.height,
                           util_framebuffer_get_num_layers(&v3d->framebuffer),
-                          buffers, color, depth, stencil);
+                          buffers, color, depth, stencil,
+                          util_framebuffer_get_num_samples(&v3d->framebuffer) > 1);
 }

 /**
@@ -942,4 +1695,8 @@ v3dX(draw_init)(struct pipe_context *pctx)
        pctx->clear = v3d_clear;
        pctx->clear_render_target = v3d_clear_render_target;
        pctx->clear_depth_stencil = v3d_clear_depth_stencil;
+#if V3D_VERSION >= 41
+       if (v3d_context(pctx)->screen->has_csd)
+               pctx->launch_grid = v3d_launch_grid;
+#endif
 }