From 229bf4475ff0a5dbeb9bc95250f7a40a983c2e28 Mon Sep 17 00:00:00 2001
From: Eric Anholt
Date: Mon, 22 Dec 2014 10:09:10 -0800
Subject: [PATCH] vc4: Optimize CL emits by doing size checks up front.

The optimizer obviously doesn't have the ability to rewrite these to skip
the size checks per call, so we have to do it manually.

Improves a norast benchmark on simulation by 0.779706% +/- 0.405838%
(n=6087).
---
 src/gallium/drivers/vc4/vc4_cl.c      | 12 +++++++----
 src/gallium/drivers/vc4/vc4_cl.h      | 17 +++++++--------
 src/gallium/drivers/vc4/vc4_context.c | 20 ++++++++++++++++--
 src/gallium/drivers/vc4/vc4_draw.c    | 30 +++++++++++++++++++++++++++
 src/gallium/drivers/vc4/vc4_program.c |  3 +++
 5 files changed, 66 insertions(+), 16 deletions(-)

diff --git a/src/gallium/drivers/vc4/vc4_cl.c b/src/gallium/drivers/vc4/vc4_cl.c
index 36dd28c48c0..0700e885cbf 100644
--- a/src/gallium/drivers/vc4/vc4_cl.c
+++ b/src/gallium/drivers/vc4/vc4_cl.c
@@ -29,17 +29,21 @@ void
 vc4_init_cl(struct vc4_context *vc4, struct vc4_cl *cl)
 {
         cl->base = ralloc_size(vc4, 1);
-        cl->end = cl->next = cl->base;
+        cl->next = cl->base;
+        cl->size = 0;
 }
 
 void
-vc4_grow_cl(struct vc4_cl *cl)
+cl_ensure_space(struct vc4_cl *cl, uint32_t space)
 {
-        uint32_t size = MAX2((cl->end - cl->base) * 2, 4096);
+        if ((cl->next - cl->base) + space <= cl->size)
+                return;
+
+        uint32_t size = MAX2(cl->size + space, cl->size * 2);
         uint32_t offset = cl->next -cl->base;
 
         cl->base = reralloc(ralloc_parent(cl->base), cl->base, uint8_t, size);
-        cl->end = cl->base + size;
+        cl->size = size;
         cl->next = cl->base + offset;
 }
 
diff --git a/src/gallium/drivers/vc4/vc4_cl.h b/src/gallium/drivers/vc4/vc4_cl.h
index 86cd0c797a6..33b37298406 100644
--- a/src/gallium/drivers/vc4/vc4_cl.h
+++ b/src/gallium/drivers/vc4/vc4_cl.h
@@ -35,13 +35,12 @@ struct vc4_bo;
 struct vc4_cl {
         void *base;
         void *next;
-        void *end;
+        uint32_t size;
         uint32_t reloc_next;
         uint32_t reloc_count;
 };
 
 void vc4_init_cl(struct vc4_context *vc4, struct vc4_cl *cl);
-void vc4_grow_cl(struct vc4_cl *cl);
 void vc4_reset_cl(struct vc4_cl *cl);
 void vc4_dump_cl(void *cl, uint32_t size, bool is_render);
 uint32_t vc4_gem_hindex(struct vc4_context *vc4, struct vc4_bo *bo);
@@ -49,8 +48,7 @@ uint32_t vc4_gem_hindex(struct vc4_context *vc4, struct vc4_bo *bo);
 static inline void
 cl_u8(struct vc4_cl *cl, uint8_t n)
 {
-        if (cl->next + 1 > cl->end)
-                vc4_grow_cl(cl);
+        assert((cl->next - cl->base) + 1 <= cl->size);
 
         *(uint8_t *)cl->next = n;
         cl->next++;
@@ -59,8 +57,7 @@ cl_u8(struct vc4_cl *cl, uint8_t n)
 static inline void
 cl_u16(struct vc4_cl *cl, uint32_t n)
 {
-        if (cl->next + 2 > cl->end)
-                vc4_grow_cl(cl);
+        assert((cl->next - cl->base) + 2 <= cl->size);
 
         *(uint16_t *)cl->next = n;
         cl->next += 2;
@@ -69,8 +66,7 @@ cl_u16(struct vc4_cl *cl, uint32_t n)
 static inline void
 cl_u32(struct vc4_cl *cl, uint32_t n)
 {
-        if (cl->next + 4 > cl->end)
-                vc4_grow_cl(cl);
+        assert((cl->next - cl->base) + 4 <= cl->size);
 
         *(uint32_t *)cl->next = n;
         cl->next += 4;
@@ -79,8 +75,7 @@ cl_u32(struct vc4_cl *cl, uint32_t n)
 static inline void
 cl_ptr(struct vc4_cl *cl, void *ptr)
 {
-        if (cl->next + sizeof(void *) > cl->end)
-                vc4_grow_cl(cl);
+        assert((cl->next - cl->base) + sizeof(void *) <= cl->size);
 
         *(void **)cl->next = ptr;
         cl->next += sizeof(void *);
@@ -134,4 +129,6 @@ cl_reloc(struct vc4_context *vc4, struct vc4_cl *cl,
         cl_reloc_hindex(cl, vc4_gem_hindex(vc4, bo), offset);
 }
 
+void cl_ensure_space(struct vc4_cl *cl, uint32_t size);
+
 #endif /* VC4_CL_H */
diff --git a/src/gallium/drivers/vc4/vc4_context.c b/src/gallium/drivers/vc4/vc4_context.c
index 906af05b44b..d4a9eec7b08 100644
--- a/src/gallium/drivers/vc4/vc4_context.c
+++ b/src/gallium/drivers/vc4/vc4_context.c
@@ -104,6 +104,22 @@ vc4_setup_rcl(struct vc4_context *vc4)
                 resolve_uncleared);
 #endif
 
+        uint32_t reloc_size = 9;
+        uint32_t clear_size = 14;
+        uint32_t config_size = 11 + reloc_size;
+        uint32_t loadstore_size = 7 + reloc_size;
+        uint32_t tilecoords_size = 3;
+        uint32_t branch_size = 5 + reloc_size;
+        uint32_t color_store_size = 1;
+        cl_ensure_space(&vc4->rcl,
+                        clear_size +
+                        config_size +
+                        loadstore_size +
+                        xtiles * ytiles * (loadstore_size * 4 +
+                                           tilecoords_size * 3 +
+                                           branch_size +
+                                           color_store_size));
+
         cl_u8(&vc4->rcl, VC4_PACKET_CLEAR_COLORS);
         cl_u32(&vc4->rcl, vc4->clear_color[0]);
         cl_u32(&vc4->rcl, vc4->clear_color[1]);
@@ -290,9 +306,9 @@ vc4_flush(struct pipe_context *pctx)
 
         if (vc4_debug & VC4_DEBUG_CL) {
                 fprintf(stderr, "BCL:\n");
-                vc4_dump_cl(vc4->bcl.base, vc4->bcl.end - vc4->bcl.base, false);
+                vc4_dump_cl(vc4->bcl.base, vc4->bcl.size, false);
                 fprintf(stderr, "RCL:\n");
-                vc4_dump_cl(vc4->rcl.base, vc4->rcl.end - vc4->rcl.base, true);
+                vc4_dump_cl(vc4->rcl.base, vc4->rcl.size, true);
         }
 
         struct drm_vc4_submit_cl submit;
diff --git a/src/gallium/drivers/vc4/vc4_draw.c b/src/gallium/drivers/vc4/vc4_draw.c
index 79d7d73c660..d99faa41c54 100644
--- a/src/gallium/drivers/vc4/vc4_draw.c
+++ b/src/gallium/drivers/vc4/vc4_draw.c
@@ -29,6 +29,32 @@
 #include "vc4_context.h"
 #include "vc4_resource.h"
 
+static void
+vc4_get_draw_cl_space(struct vc4_context *vc4)
+{
+        /* Binner gets our packet state -- vc4_emit.c contents,
+         * and the primitive itself.
+         */
+        cl_ensure_space(&vc4->bcl, 256);
+
+        /* Nothing for rcl -- that's covered by vc4_context.c */
+
+        /* shader_rec gets up to 12 dwords of reloc handles plus a maximally
+         * sized shader_rec (104 bytes base for 8 vattrs plus 32 bytes of
+         * vattr stride).
+         */
+        cl_ensure_space(&vc4->shader_rec, 12 * sizeof(uint32_t) + 104 + 8 * 32);
+
+        /* Uniforms are covered by vc4_write_uniforms(). */
+
+        /* There could be up to 16 textures per stage, plus misc other
+         * pointers.
+         */
+        cl_ensure_space(&vc4->bo_handles, (2 * 16 + 20) * sizeof(uint32_t));
+        cl_ensure_space(&vc4->bo_pointers,
+                        (2 * 16 + 20) * sizeof(struct vc4_bo *));
+}
+
 /**
  * Does the initial bining command list setup for drawing to a given FBO.
  */
@@ -38,6 +64,8 @@ vc4_start_draw(struct vc4_context *vc4)
         if (vc4->needs_flush)
                 return;
 
+        vc4_get_draw_cl_space(vc4);
+
         uint32_t width = vc4->framebuffer.width;
         uint32_t height = vc4->framebuffer.height;
         uint32_t tilew = align(width, 64) / 64;
@@ -114,6 +142,8 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
                 return;
         }
 
+        vc4_get_draw_cl_space(vc4);
+
         struct vc4_vertex_stateobj *vtx = vc4->vtx;
         struct vc4_vertexbuf_stateobj *vertexbuf = &vc4->vertexbuf;
 
diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index 4b547c506f4..570c76a459b 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -2729,6 +2729,9 @@ vc4_write_uniforms(struct vc4_context *vc4, struct vc4_compiled_shader *shader,
         const uint32_t *gallium_uniforms = cb->cb[0].user_buffer;
         struct vc4_bo *ubo = vc4_upload_ubo(vc4, shader, gallium_uniforms);
 
+        cl_ensure_space(&vc4->uniforms, (uinfo->count +
+                                         uinfo->num_texture_samples) * 4);
+
         cl_start_shader_reloc(&vc4->uniforms, uinfo->num_texture_samples);
 
         for (int i = 0; i < uinfo->count; i++) {
-- 
2.30.2