From 2e04492a142102823dfb8fc8599cfd417b84c97a Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Tue, 28 Jul 2015 11:00:58 -0700 Subject: [PATCH] vc4: Skip re-emitting the shader_rec if it's unchanged. It's a bunch of work for us to emit it (and its uniforms), more work for the kernel to validate it, and additional work for the CLE to read it. Improves es2gears framerate by about 50%. Signed-off-by: Eric Anholt --- src/gallium/drivers/vc4/vc4_context.h | 16 +++- src/gallium/drivers/vc4/vc4_draw.c | 112 ++++++++++++++++--------- src/gallium/drivers/vc4/vc4_program.c | 17 +++- src/gallium/drivers/vc4/vc4_resource.c | 8 ++ src/gallium/drivers/vc4/vc4_uniforms.c | 48 +++++++++++ 5 files changed, 158 insertions(+), 43 deletions(-) diff --git a/src/gallium/drivers/vc4/vc4_context.h b/src/gallium/drivers/vc4/vc4_context.h index 7faf5223630..30fb285eefe 100644 --- a/src/gallium/drivers/vc4/vc4_context.h +++ b/src/gallium/drivers/vc4/vc4_context.h @@ -67,7 +67,9 @@ #define VC4_DIRTY_CLIP (1 << 20) #define VC4_DIRTY_UNCOMPILED_VS (1 << 21) #define VC4_DIRTY_UNCOMPILED_FS (1 << 22) -#define VC4_DIRTY_COMPILED_FS (1 << 24) +#define VC4_DIRTY_COMPILED_CS (1 << 23) +#define VC4_DIRTY_COMPILED_VS (1 << 24) +#define VC4_DIRTY_COMPILED_FS (1 << 25) struct vc4_sampler_view { struct pipe_sampler_view base; @@ -132,6 +134,12 @@ struct vc4_compiled_shader { struct vc4_ubo_range *ubo_ranges; uint32_t num_ubo_ranges; uint32_t ubo_size; + /** + * VC4_DIRTY_* flags that, when set in vc4->dirty, mean that the + * uniforms have to be rewritten (and therefore the shader state + * reemitted). + */ + uint32_t uniform_dirty_bits; /** bitmask of which inputs are color inputs, for flat shade handling. */ uint32_t color_inputs; @@ -249,6 +257,11 @@ struct vc4_context { */ bool draw_call_queued; + /** Maximum index value valid for the current shader_rec. */ + uint32_t max_index; + /** Last index bias baked into the current shader_rec. 
*/ + uint32_t last_index_bias; + struct primconvert_context *primconvert; struct hash_table *fs_cache, *vs_cache; @@ -360,6 +373,7 @@ void vc4_simulator_init(struct vc4_screen *screen); int vc4_simulator_flush(struct vc4_context *vc4, struct drm_vc4_submit_cl *args); +void vc4_set_shader_uniform_dirty_flags(struct vc4_compiled_shader *shader); void vc4_write_uniforms(struct vc4_context *vc4, struct vc4_compiled_shader *shader, struct vc4_constbuf_stateobj *cb, diff --git a/src/gallium/drivers/vc4/vc4_draw.c b/src/gallium/drivers/vc4/vc4_draw.c index ff749fdd0d1..22ae8f27e4a 100644 --- a/src/gallium/drivers/vc4/vc4_draw.c +++ b/src/gallium/drivers/vc4/vc4_draw.c @@ -122,49 +122,13 @@ vc4_update_shadow_textures(struct pipe_context *pctx, } static void -vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) +vc4_emit_gl_shader_state(struct vc4_context *vc4, const struct pipe_draw_info *info) { - struct vc4_context *vc4 = vc4_context(pctx); - - if (info->mode >= PIPE_PRIM_QUADS) { - util_primconvert_save_index_buffer(vc4->primconvert, &vc4->indexbuf); - util_primconvert_save_rasterizer_state(vc4->primconvert, &vc4->rasterizer->base); - util_primconvert_draw_vbo(vc4->primconvert, info); - perf_debug("Fallback conversion for %d %s vertices\n", - info->count, u_prim_name(info->mode)); - return; - } - - /* Before setting up the draw, do any fixup blits necessary. 
*/ - vc4_update_shadow_textures(pctx, &vc4->verttex); - vc4_update_shadow_textures(pctx, &vc4->fragtex); - - vc4_get_draw_cl_space(vc4); - + /* VC4_DIRTY_VTXSTATE */ struct vc4_vertex_stateobj *vtx = vc4->vtx; + /* VC4_DIRTY_VTXBUF */ struct vc4_vertexbuf_stateobj *vertexbuf = &vc4->vertexbuf; - if (vc4->prim_mode != info->mode) { - vc4->prim_mode = info->mode; - vc4->dirty |= VC4_DIRTY_PRIM_MODE; - } - - vc4_start_draw(vc4); - vc4_update_compiled_shaders(vc4, info->mode); - - vc4_emit_state(pctx); - vc4->dirty = 0; - - vc4_write_uniforms(vc4, vc4->prog.fs, - &vc4->constbuf[PIPE_SHADER_FRAGMENT], - &vc4->fragtex); - vc4_write_uniforms(vc4, vc4->prog.vs, - &vc4->constbuf[PIPE_SHADER_VERTEX], - &vc4->verttex); - vc4_write_uniforms(vc4, vc4->prog.cs, - &vc4->constbuf[PIPE_SHADER_VERTEX], - &vc4->verttex); - /* The simulator throws a fit if VS or CS don't read an attribute, so * we emit a dummy read. */ @@ -172,22 +136,27 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) /* Emit the shader record. */ struct vc4_cl_out *shader_rec = cl_start_shader_reloc(&vc4->shader_rec, 3 + num_elements_emit); + /* VC4_DIRTY_PRIM_MODE | VC4_DIRTY_RASTERIZER */ cl_u16(&shader_rec, VC4_SHADER_FLAG_ENABLE_CLIPPING | ((info->mode == PIPE_PRIM_POINTS && vc4->rasterizer->base.point_size_per_vertex) ? 
VC4_SHADER_FLAG_VS_POINT_SIZE : 0)); + + /* VC4_DIRTY_COMPILED_FS */ cl_u8(&shader_rec, 0); /* fs num uniforms (unused) */ cl_u8(&shader_rec, vc4->prog.fs->num_inputs); cl_reloc(vc4, &vc4->shader_rec, &shader_rec, vc4->prog.fs->bo, 0); cl_u32(&shader_rec, 0); /* UBO offset written by kernel */ + /* VC4_DIRTY_COMPILED_VS */ cl_u16(&shader_rec, 0); /* vs num uniforms */ cl_u8(&shader_rec, vc4->prog.vs->vattrs_live); cl_u8(&shader_rec, vc4->prog.vs->vattr_offsets[8]); cl_reloc(vc4, &vc4->shader_rec, &shader_rec, vc4->prog.vs->bo, 0); cl_u32(&shader_rec, 0); /* UBO offset written by kernel */ + /* VC4_DIRTY_COMPILED_CS */ cl_u16(&shader_rec, 0); /* cs num uniforms */ cl_u8(&shader_rec, vc4->prog.cs->vattrs_live); cl_u8(&shader_rec, vc4->prog.cs->vattr_offsets[8]); @@ -200,6 +169,7 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) struct pipe_vertex_buffer *vb = &vertexbuf->vb[elem->vertex_buffer_index]; struct vc4_resource *rsc = vc4_resource(vb->buffer); + /* not vc4->dirty tracked: vc4->last_index_bias */ uint32_t offset = (vb->buffer_offset + elem->src_offset + vb->stride * info->index_bias); @@ -239,10 +209,72 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) * attributes. This field also contains the offset into shader_rec. 
*/ cl_u32(&bcl, num_elements_emit & 0x7); + cl_end(&vc4->bcl, bcl); + + vc4_write_uniforms(vc4, vc4->prog.fs, + &vc4->constbuf[PIPE_SHADER_FRAGMENT], + &vc4->fragtex); + vc4_write_uniforms(vc4, vc4->prog.vs, + &vc4->constbuf[PIPE_SHADER_VERTEX], + &vc4->verttex); + vc4_write_uniforms(vc4, vc4->prog.cs, + &vc4->constbuf[PIPE_SHADER_VERTEX], + &vc4->verttex); + + vc4->last_index_bias = info->index_bias; + vc4->max_index = max_index; +} + +static void +vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) +{ + struct vc4_context *vc4 = vc4_context(pctx); + + if (info->mode >= PIPE_PRIM_QUADS) { + util_primconvert_save_index_buffer(vc4->primconvert, &vc4->indexbuf); + util_primconvert_save_rasterizer_state(vc4->primconvert, &vc4->rasterizer->base); + util_primconvert_draw_vbo(vc4->primconvert, info); + perf_debug("Fallback conversion for %d %s vertices\n", + info->count, u_prim_name(info->mode)); + return; + } + + /* Before setting up the draw, do any fixup blits necessary. */ + vc4_update_shadow_textures(pctx, &vc4->verttex); + vc4_update_shadow_textures(pctx, &vc4->fragtex); + + vc4_get_draw_cl_space(vc4); + + if (vc4->prim_mode != info->mode) { + vc4->prim_mode = info->mode; + vc4->dirty |= VC4_DIRTY_PRIM_MODE; + } + + vc4_start_draw(vc4); + vc4_update_compiled_shaders(vc4, info->mode); + + vc4_emit_state(pctx); + + if ((vc4->dirty & (VC4_DIRTY_VTXBUF | + VC4_DIRTY_VTXSTATE | + VC4_DIRTY_PRIM_MODE | + VC4_DIRTY_RASTERIZER | + VC4_DIRTY_COMPILED_CS | + VC4_DIRTY_COMPILED_VS | + VC4_DIRTY_COMPILED_FS | + vc4->prog.cs->uniform_dirty_bits | + vc4->prog.vs->uniform_dirty_bits | + vc4->prog.fs->uniform_dirty_bits)) || + vc4->last_index_bias != info->index_bias) { + vc4_emit_gl_shader_state(vc4, info); + } + + vc4->dirty = 0; /* Note that the primitive type fields match with OpenGL/gallium * definitions, up to but not including QUADS. 
*/ + struct vc4_cl_out *bcl = cl_start(&vc4->bcl); if (info->indexed) { uint32_t offset = vc4->indexbuf.offset; uint32_t index_size = vc4->indexbuf.index_size; @@ -265,7 +297,7 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) VC4_INDEX_BUFFER_U8)); cl_u32(&bcl, info->count); cl_reloc(vc4, &vc4->bcl, &bcl, rsc->bo, offset); - cl_u32(&bcl, max_index); + cl_u32(&bcl, vc4->max_index); if (vc4->indexbuf.index_size == 4) pipe_resource_reference(&prsc, NULL); diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c index 561da1074ce..a35b50cd39b 100644 --- a/src/gallium/drivers/vc4/vc4_program.c +++ b/src/gallium/drivers/vc4/vc4_program.c @@ -2205,6 +2205,8 @@ copy_uniform_state_to_shader(struct vc4_compiled_shader *shader, memcpy(uinfo->contents, c->uniform_contents, count * sizeof(*uinfo->contents)); uinfo->num_texture_samples = c->num_texture_samples; + + vc4_set_shader_uniform_dirty_flags(shader); } static struct vc4_compiled_shader * @@ -2440,9 +2442,20 @@ vc4_update_compiled_vs(struct vc4_context *vc4, uint8_t prim_mode) (prim_mode == PIPE_PRIM_POINTS && vc4->rasterizer->base.point_size_per_vertex); - vc4->prog.vs = vc4_get_compiled_shader(vc4, QSTAGE_VERT, &key->base); + struct vc4_compiled_shader *vs = + vc4_get_compiled_shader(vc4, QSTAGE_VERT, &key->base); + if (vs != vc4->prog.vs) { + vc4->prog.vs = vs; + vc4->dirty |= VC4_DIRTY_COMPILED_VS; + } + key->is_coord = true; - vc4->prog.cs = vc4_get_compiled_shader(vc4, QSTAGE_COORD, &key->base); + struct vc4_compiled_shader *cs = + vc4_get_compiled_shader(vc4, QSTAGE_COORD, &key->base); + if (cs != vc4->prog.cs) { + vc4->prog.cs = cs; + vc4->dirty |= VC4_DIRTY_COMPILED_CS; + } } void diff --git a/src/gallium/drivers/vc4/vc4_resource.c b/src/gallium/drivers/vc4/vc4_resource.c index cab76406055..5d5166fd818 100644 --- a/src/gallium/drivers/vc4/vc4_resource.c +++ b/src/gallium/drivers/vc4/vc4_resource.c @@ -102,6 +102,12 @@ vc4_resource_transfer_map(struct 
pipe_context *pctx, if (usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE) { vc4_resource_bo_alloc(rsc); + + /* If it might be bound as one of our vertex buffers, make + * sure we re-emit vertex buffer state. + */ + if (prsc->bind & PIPE_BIND_VERTEX_BUFFER) + vc4->dirty |= VC4_DIRTY_VTXBUF; } else if (!(usage & PIPE_TRANSFER_UNSYNCHRONIZED)) { if (vc4_cl_references_bo(pctx, rsc->bo)) { if ((usage & PIPE_TRANSFER_DISCARD_RANGE) && @@ -110,6 +116,8 @@ vc4_resource_transfer_map(struct pipe_context *pctx, prsc->height0 == box->height && prsc->depth0 == box->depth) { vc4_resource_bo_alloc(rsc); + if (prsc->bind & PIPE_BIND_VERTEX_BUFFER) + vc4->dirty |= VC4_DIRTY_VTXBUF; } else { vc4_flush(pctx); } diff --git a/src/gallium/drivers/vc4/vc4_uniforms.c b/src/gallium/drivers/vc4/vc4_uniforms.c index 5613d6b28c0..d4c71376a55 100644 --- a/src/gallium/drivers/vc4/vc4_uniforms.c +++ b/src/gallium/drivers/vc4/vc4_uniforms.c @@ -284,3 +284,51 @@ vc4_write_uniforms(struct vc4_context *vc4, struct vc4_compiled_shader *shader, cl_end(&vc4->uniforms, uniforms); } + +void +vc4_set_shader_uniform_dirty_flags(struct vc4_compiled_shader *shader) +{ + uint32_t dirty = 0; + + for (int i = 0; i < shader->uniforms.count; i++) { + switch (shader->uniforms.contents[i]) { + case QUNIFORM_CONSTANT: + break; + case QUNIFORM_UNIFORM: + case QUNIFORM_UBO_ADDR: + dirty |= VC4_DIRTY_CONSTBUF; + break; + + case QUNIFORM_VIEWPORT_X_SCALE: + case QUNIFORM_VIEWPORT_Y_SCALE: + case QUNIFORM_VIEWPORT_Z_OFFSET: + case QUNIFORM_VIEWPORT_Z_SCALE: + dirty |= VC4_DIRTY_VIEWPORT; + break; + + case QUNIFORM_USER_CLIP_PLANE: + dirty |= VC4_DIRTY_CLIP; + break; + + case QUNIFORM_TEXTURE_CONFIG_P0: + case QUNIFORM_TEXTURE_CONFIG_P1: + case QUNIFORM_TEXTURE_CONFIG_P2: + case QUNIFORM_TEXTURE_BORDER_COLOR: + case QUNIFORM_TEXRECT_SCALE_X: + case QUNIFORM_TEXRECT_SCALE_Y: + dirty |= VC4_DIRTY_TEXSTATE; + break; + + case QUNIFORM_BLEND_CONST_COLOR: + dirty |= VC4_DIRTY_BLEND_COLOR; + break; + + case QUNIFORM_STENCIL: + case 
QUNIFORM_ALPHA_REF: + dirty |= VC4_DIRTY_ZSA; + break; + } + } + + shader->uniform_dirty_bits = dirty; +} -- 2.30.2