From 7432017f65174e82a3de7afef3e4e6f60932356c Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Thu, 9 Jul 2015 22:51:06 -0700 Subject: [PATCH] vc4: Rework cl handling to be friendlier to the compiler. Drops 680 bytes of code, from avoiding a bunch of extra updates to the next pointer in the struct. --- src/gallium/drivers/vc4/vc4_cl.c | 11 ++- src/gallium/drivers/vc4/vc4_cl.h | 113 +++++++++++++++----------- src/gallium/drivers/vc4/vc4_context.c | 6 +- src/gallium/drivers/vc4/vc4_draw.c | 109 +++++++++++++------------ src/gallium/drivers/vc4/vc4_emit.c | 57 +++++++------ src/gallium/drivers/vc4/vc4_program.c | 59 ++++++++------ 6 files changed, 203 insertions(+), 152 deletions(-) diff --git a/src/gallium/drivers/vc4/vc4_cl.c b/src/gallium/drivers/vc4/vc4_cl.c index 97f6b89024c..ced4f2dfa86 100644 --- a/src/gallium/drivers/vc4/vc4_cl.c +++ b/src/gallium/drivers/vc4/vc4_cl.c @@ -66,8 +66,15 @@ vc4_gem_hindex(struct vc4_context *vc4, struct vc4_bo *bo) return hindex; } - cl_u32(&vc4->bo_handles, bo->handle); - cl_ptr(&vc4->bo_pointers, vc4_bo_reference(bo)); + struct vc4_cl_out *out; + + out = cl_start(&vc4->bo_handles); + cl_u32(&out, bo->handle); + cl_end(&vc4->bo_handles, out); + + out = cl_start(&vc4->bo_pointers); + cl_ptr(&out, vc4_bo_reference(bo)); + cl_end(&vc4->bo_pointers, out); return hindex; } diff --git a/src/gallium/drivers/vc4/vc4_cl.h b/src/gallium/drivers/vc4/vc4_cl.h index b914745ed4f..95f1a531d34 100644 --- a/src/gallium/drivers/vc4/vc4_cl.h +++ b/src/gallium/drivers/vc4/vc4_cl.h @@ -33,10 +33,16 @@ struct vc4_bo; +/** + * Undefined structure, used for typechecking that you're passing the pointers + * to these functions correctly. + */ +struct vc4_cl_out; + struct vc4_cl { void *base; - void *next; - void *reloc_next; + struct vc4_cl_out *next; + struct vc4_cl_out *reloc_next; uint32_t size; uint32_t reloc_count; }; @@ -55,122 +61,135 @@ static inline uint32_t cl_offset(struct vc4_cl *cl) } static inline void -put_unaligned_32(void *ptr, uint32_t val) +cl_advance(struct vc4_cl_out **cl, uint32_t n) { - struct unaligned_32 *p = ptr; - p->x = val; + (*cl) = (struct vc4_cl_out *)((char *)(*cl) + n); } -static inline void -put_unaligned_16(void *ptr, uint16_t val) +static inline struct vc4_cl_out * +cl_start(struct vc4_cl *cl) { - struct unaligned_16 *p = ptr; - p->x = val; + return cl->next; } static inline void -cl_u8(struct vc4_cl *cl, uint8_t n) +cl_end(struct vc4_cl *cl, struct vc4_cl_out *next) { - assert(cl_offset(cl) + 1 <= cl->size); - - *(uint8_t *)cl->next = n; - cl->next++; + cl->next = next; + assert(cl_offset(cl) <= cl->size); } + static inline void -cl_u16(struct vc4_cl *cl, uint16_t n) +put_unaligned_32(struct vc4_cl_out *ptr, uint32_t val) { - assert(cl_offset(cl) + 2 <= cl->size); - - put_unaligned_16(cl->next, n); - cl->next += 2; + struct unaligned_32 *p = (void *)ptr; + p->x = val; } static inline void -cl_u32(struct vc4_cl *cl, uint32_t n) +put_unaligned_16(struct vc4_cl_out *ptr, uint16_t val) { - assert(cl_offset(cl) + 4 <= cl->size); + struct unaligned_16 *p = (void *)ptr; + p->x = val; +} - put_unaligned_32(cl->next, n); - cl->next += 4; +static inline void +cl_u8(struct vc4_cl_out **cl, uint8_t n) +{ + *(uint8_t *)(*cl) = n; + cl_advance(cl, 1); } static inline void -cl_aligned_u32(struct vc4_cl *cl, uint32_t n) +cl_u16(struct vc4_cl_out **cl, uint16_t n) { - assert(cl_offset(cl) + 4 <= cl->size); + put_unaligned_16(*cl, n); + cl_advance(cl, 2); +} - *(uint32_t *)cl->next = n; - cl->next += 4; +static inline void +cl_u32(struct vc4_cl_out **cl, uint32_t n) +{ + put_unaligned_32(*cl, n); + cl_advance(cl, 4); } static inline void -cl_ptr(struct vc4_cl *cl, void *ptr) +cl_aligned_u32(struct vc4_cl_out **cl, uint32_t n) { - assert(cl_offset(cl) + sizeof(void *) <= cl->size); + *(uint32_t *)(*cl) = n; + cl_advance(cl, 4); +} - *(void **)cl->next = ptr; - cl->next += sizeof(void *); +static inline void +cl_ptr(struct vc4_cl_out **cl, void *ptr) +{ + *(struct vc4_cl_out **)(*cl) = ptr; + cl_advance(cl, sizeof(void *)); } static inline void -cl_f(struct vc4_cl *cl, float f) +cl_f(struct vc4_cl_out **cl, float f) { cl_u32(cl, fui(f)); } static inline void -cl_aligned_f(struct vc4_cl *cl, float f) +cl_aligned_f(struct vc4_cl_out **cl, float f) { cl_aligned_u32(cl, fui(f)); } static inline void -cl_start_reloc(struct vc4_cl *cl, uint32_t n) +cl_start_reloc(struct vc4_cl *cl, struct vc4_cl_out **out, uint32_t n) { assert(n == 1 || n == 2); assert(cl->reloc_count == 0); cl->reloc_count = n; - cl_u8(cl, VC4_PACKET_GEM_HANDLES); - cl->reloc_next = cl->next; - cl_u32(cl, 0); /* Space where hindex will be written. */ - cl_u32(cl, 0); /* Space where hindex will be written. */ + cl_u8(out, VC4_PACKET_GEM_HANDLES); + cl->reloc_next = *out; + cl_u32(out, 0); /* Space where hindex will be written. */ + cl_u32(out, 0); /* Space where hindex will be written. */ } -static inline void +static inline struct vc4_cl_out * cl_start_shader_reloc(struct vc4_cl *cl, uint32_t n) { assert(cl->reloc_count == 0); cl->reloc_count = n; cl->reloc_next = cl->next; - /* Space where hindex will be written. */ - cl->next += n * 4; + /* Reserve the space where hindex will be written. */ + cl_advance(&cl->next, n * 4); + + return cl->next; } static inline void -cl_reloc(struct vc4_context *vc4, struct vc4_cl *cl, +cl_reloc(struct vc4_context *vc4, struct vc4_cl *cl, struct vc4_cl_out **cl_out, struct vc4_bo *bo, uint32_t offset) { *(uint32_t *)cl->reloc_next = vc4_gem_hindex(vc4, bo); - cl->reloc_next += 4; + cl_advance(&cl->reloc_next, 4); cl->reloc_count--; - cl_u32(cl, offset); + cl_u32(cl_out, offset); } static inline void cl_aligned_reloc(struct vc4_context *vc4, struct vc4_cl *cl, - struct vc4_bo *bo, uint32_t offset) + struct vc4_cl_out **cl_out, + struct vc4_bo *bo, uint32_t offset) { *(uint32_t *)cl->reloc_next = vc4_gem_hindex(vc4, bo); - cl->reloc_next += 4; + cl_advance(&cl->reloc_next, 4); cl->reloc_count--; - cl_aligned_u32(cl, offset); + cl_aligned_u32(cl_out, offset); } void cl_ensure_space(struct vc4_cl *cl, uint32_t size); diff --git a/src/gallium/drivers/vc4/vc4_context.c b/src/gallium/drivers/vc4/vc4_context.c index 60da218e59e..fff63158c9d 100644 --- a/src/gallium/drivers/vc4/vc4_context.c +++ b/src/gallium/drivers/vc4/vc4_context.c @@ -61,9 +61,11 @@ vc4_flush(struct pipe_context *pctx) * FLUSH completes. */ cl_ensure_space(&vc4->bcl, 8); - cl_u8(&vc4->bcl, VC4_PACKET_INCREMENT_SEMAPHORE); + struct vc4_cl_out *bcl = cl_start(&vc4->bcl); + cl_u8(&bcl, VC4_PACKET_INCREMENT_SEMAPHORE); /* The FLUSH caps all of our bin lists with a VC4_PACKET_RETURN. */ - cl_u8(&vc4->bcl, VC4_PACKET_FLUSH); + cl_u8(&bcl, VC4_PACKET_FLUSH); + cl_end(&vc4->bcl, bcl); if (cbuf && (vc4->resolve & PIPE_CLEAR_COLOR0)) { pipe_surface_reference(&vc4->color_write, cbuf); diff --git a/src/gallium/drivers/vc4/vc4_draw.c b/src/gallium/drivers/vc4/vc4_draw.c index 5e6d70d6f33..fc3c2321abb 100644 --- a/src/gallium/drivers/vc4/vc4_draw.c +++ b/src/gallium/drivers/vc4/vc4_draw.c @@ -71,37 +71,40 @@ vc4_start_draw(struct vc4_context *vc4) uint32_t height = vc4->framebuffer.height; uint32_t tilew = align(width, 64) / 64; uint32_t tileh = align(height, 64) / 64; + struct vc4_cl_out *bcl = cl_start(&vc4->bcl); // Tile state data is 48 bytes per tile, I think it can be thrown away // as soon as binning is finished. - cl_u8(&vc4->bcl, VC4_PACKET_TILE_BINNING_MODE_CONFIG); - cl_u32(&vc4->bcl, 0); /* tile alloc addr, filled by kernel */ - cl_u32(&vc4->bcl, 0); /* tile alloc size, filled by kernel */ - cl_u32(&vc4->bcl, 0); /* tile state addr, filled by kernel */ - cl_u8(&vc4->bcl, tilew); - cl_u8(&vc4->bcl, tileh); - cl_u8(&vc4->bcl, 0); /* flags, filled by kernel. */ + cl_u8(&bcl, VC4_PACKET_TILE_BINNING_MODE_CONFIG); + cl_u32(&bcl, 0); /* tile alloc addr, filled by kernel */ + cl_u32(&bcl, 0); /* tile alloc size, filled by kernel */ + cl_u32(&bcl, 0); /* tile state addr, filled by kernel */ + cl_u8(&bcl, tilew); + cl_u8(&bcl, tileh); + cl_u8(&bcl, 0); /* flags, filled by kernel. */ /* START_TILE_BINNING resets the statechange counters in the hardware, * which are what is used when a primitive is binned to a tile to * figure out what new state packets need to be written to that tile's * command list. */ - cl_u8(&vc4->bcl, VC4_PACKET_START_TILE_BINNING); + cl_u8(&bcl, VC4_PACKET_START_TILE_BINNING); /* Reset the current compressed primitives format. This gets modified * by VC4_PACKET_GL_INDEXED_PRIMITIVE and * VC4_PACKET_GL_ARRAY_PRIMITIVE, so it needs to be reset at the start * of every tile. */ - cl_u8(&vc4->bcl, VC4_PACKET_PRIMITIVE_LIST_FORMAT); - cl_u8(&vc4->bcl, (VC4_PRIMITIVE_LIST_FORMAT_16_INDEX | - VC4_PRIMITIVE_LIST_FORMAT_TYPE_TRIANGLES)); + cl_u8(&bcl, VC4_PACKET_PRIMITIVE_LIST_FORMAT); + cl_u8(&bcl, (VC4_PRIMITIVE_LIST_FORMAT_16_INDEX | + VC4_PRIMITIVE_LIST_FORMAT_TYPE_TRIANGLES)); vc4->needs_flush = true; vc4->draw_call_queued = true; vc4->draw_width = width; vc4->draw_height = height; + + cl_end(&vc4->bcl, bcl); } static void @@ -167,28 +170,29 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) */ uint32_t num_elements_emit = MAX2(vtx->num_elements, 1); /* Emit the shader record. */ - cl_start_shader_reloc(&vc4->shader_rec, 3 + num_elements_emit); - cl_u16(&vc4->shader_rec, + struct vc4_cl_out *shader_rec = + cl_start_shader_reloc(&vc4->shader_rec, 3 + num_elements_emit); + cl_u16(&shader_rec, VC4_SHADER_FLAG_ENABLE_CLIPPING | ((info->mode == PIPE_PRIM_POINTS && vc4->rasterizer->base.point_size_per_vertex) ? VC4_SHADER_FLAG_VS_POINT_SIZE : 0)); - cl_u8(&vc4->shader_rec, 0); /* fs num uniforms (unused) */ - cl_u8(&vc4->shader_rec, vc4->prog.fs->num_inputs); - cl_reloc(vc4, &vc4->shader_rec, vc4->prog.fs->bo, 0); - cl_u32(&vc4->shader_rec, 0); /* UBO offset written by kernel */ - - cl_u16(&vc4->shader_rec, 0); /* vs num uniforms */ - cl_u8(&vc4->shader_rec, vc4->prog.vs->vattrs_live); - cl_u8(&vc4->shader_rec, vc4->prog.vs->vattr_offsets[8]); - cl_reloc(vc4, &vc4->shader_rec, vc4->prog.vs->bo, 0); - cl_u32(&vc4->shader_rec, 0); /* UBO offset written by kernel */ - - cl_u16(&vc4->shader_rec, 0); /* cs num uniforms */ - cl_u8(&vc4->shader_rec, vc4->prog.cs->vattrs_live); - cl_u8(&vc4->shader_rec, vc4->prog.cs->vattr_offsets[8]); - cl_reloc(vc4, &vc4->shader_rec, vc4->prog.cs->bo, 0); - cl_u32(&vc4->shader_rec, 0); /* UBO offset written by kernel */ + cl_u8(&shader_rec, 0); /* fs num uniforms (unused) */ + cl_u8(&shader_rec, vc4->prog.fs->num_inputs); + cl_reloc(vc4, &vc4->shader_rec, &shader_rec, vc4->prog.fs->bo, 0); + cl_u32(&shader_rec, 0); /* UBO offset written by kernel */ + + cl_u16(&shader_rec, 0); /* vs num uniforms */ + cl_u8(&shader_rec, vc4->prog.vs->vattrs_live); + cl_u8(&shader_rec, vc4->prog.vs->vattr_offsets[8]); + cl_reloc(vc4, &vc4->shader_rec, &shader_rec, vc4->prog.vs->bo, 0); + cl_u32(&shader_rec, 0); /* UBO offset written by kernel */ + + cl_u16(&shader_rec, 0); /* cs num uniforms */ + cl_u8(&shader_rec, vc4->prog.cs->vattrs_live); + cl_u8(&shader_rec, vc4->prog.cs->vattr_offsets[8]); + cl_reloc(vc4, &vc4->shader_rec, &shader_rec, vc4->prog.cs->bo, 0); + cl_u32(&shader_rec, 0); /* UBO offset written by kernel */ uint32_t max_index = 0xffff; uint32_t vpm_offset = 0; @@ -202,11 +206,11 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) uint32_t elem_size = util_format_get_blocksize(elem->src_format); - cl_reloc(vc4, &vc4->shader_rec, rsc->bo, offset); - cl_u8(&vc4->shader_rec, elem_size - 1); - cl_u8(&vc4->shader_rec, vb->stride); - cl_u8(&vc4->shader_rec, vc4->prog.vs->vattr_offsets[i]); - cl_u8(&vc4->shader_rec, vc4->prog.cs->vattr_offsets[i]); + cl_reloc(vc4, &vc4->shader_rec, &shader_rec, rsc->bo, offset); + cl_u8(&shader_rec, elem_size - 1); + cl_u8(&shader_rec, vb->stride); + cl_u8(&shader_rec, vc4->prog.vs->vattr_offsets[i]); + cl_u8(&shader_rec, vc4->prog.cs->vattr_offsets[i]); vpm_offset += align(elem_size, 4); @@ -219,21 +223,23 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) if (vtx->num_elements == 0) { assert(num_elements_emit == 1); struct vc4_bo *bo = vc4_bo_alloc(vc4->screen, 4096, "scratch VBO"); - cl_reloc(vc4, &vc4->shader_rec, bo, 0); - cl_u8(&vc4->shader_rec, 16 - 1); /* element size */ - cl_u8(&vc4->shader_rec, 0); /* stride */ - cl_u8(&vc4->shader_rec, 0); /* VS VPM offset */ - cl_u8(&vc4->shader_rec, 0); /* CS VPM offset */ + cl_reloc(vc4, &vc4->shader_rec, &shader_rec, bo, 0); + cl_u8(&shader_rec, 16 - 1); /* element size */ + cl_u8(&shader_rec, 0); /* stride */ + cl_u8(&shader_rec, 0); /* VS VPM offset */ + cl_u8(&shader_rec, 0); /* CS VPM offset */ vc4_bo_unreference(&bo); } + cl_end(&vc4->shader_rec, shader_rec); + struct vc4_cl_out *bcl = cl_start(&vc4->bcl); /* the actual draw call. */ - cl_u8(&vc4->bcl, VC4_PACKET_GL_SHADER_STATE); + cl_u8(&bcl, VC4_PACKET_GL_SHADER_STATE); assert(vtx->num_elements <= 8); /* Note that number of attributes == 0 in the packet means 8 * attributes. This field also contains the offset into shader_rec. */ - cl_u32(&vc4->bcl, num_elements_emit & 0x7); + cl_u32(&bcl, num_elements_emit & 0x7); /* Note that the primitive type fields match with OpenGL/gallium * definitions, up to but not including QUADS. @@ -251,25 +257,26 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) } struct vc4_resource *rsc = vc4_resource(prsc); - cl_start_reloc(&vc4->bcl, 1); - cl_u8(&vc4->bcl, VC4_PACKET_GL_INDEXED_PRIMITIVE); - cl_u8(&vc4->bcl, + cl_start_reloc(&vc4->bcl, &bcl, 1); + cl_u8(&bcl, VC4_PACKET_GL_INDEXED_PRIMITIVE); + cl_u8(&bcl, info->mode | (index_size == 2 ? VC4_INDEX_BUFFER_U16: VC4_INDEX_BUFFER_U8)); - cl_u32(&vc4->bcl, info->count); - cl_reloc(vc4, &vc4->bcl, rsc->bo, offset); - cl_u32(&vc4->bcl, max_index); + cl_u32(&bcl, info->count); + cl_reloc(vc4, &vc4->bcl, &bcl, rsc->bo, offset); + cl_u32(&bcl, max_index); if (vc4->indexbuf.index_size == 4) pipe_resource_reference(&prsc, NULL); } else { - cl_u8(&vc4->bcl, VC4_PACKET_GL_ARRAY_PRIMITIVE); - cl_u8(&vc4->bcl, info->mode); - cl_u32(&vc4->bcl, info->count); - cl_u32(&vc4->bcl, info->start); + cl_u8(&bcl, VC4_PACKET_GL_ARRAY_PRIMITIVE); + cl_u8(&bcl, info->mode); + cl_u32(&bcl, info->count); + cl_u32(&bcl, info->start); } + cl_end(&vc4->bcl, bcl); if (vc4->zsa && vc4->zsa->base.depth.enabled) { vc4->resolve |= PIPE_CLEAR_DEPTH; diff --git a/src/gallium/drivers/vc4/vc4_emit.c b/src/gallium/drivers/vc4/vc4_emit.c index d2b54fccf91..f5925734415 100644 --- a/src/gallium/drivers/vc4/vc4_emit.c +++ b/src/gallium/drivers/vc4/vc4_emit.c @@ -28,6 +28,7 @@ vc4_emit_state(struct pipe_context *pctx) { struct vc4_context *vc4 = vc4_context(pctx); + struct vc4_cl_out *bcl = cl_start(&vc4->bcl); if (vc4->dirty & (VC4_DIRTY_SCISSOR | VC4_DIRTY_VIEWPORT)) { float *vpscale = vc4->viewport.scale; float *vptranslate = vc4->viewport.translate; @@ -40,11 +41,11 @@ vc4_emit_state(struct pipe_context *pctx) uint32_t maxx = MIN2(vc4->scissor.maxx, vp_maxx); uint32_t maxy = MIN2(vc4->scissor.maxy, vp_maxy); - cl_u8(&vc4->bcl, VC4_PACKET_CLIP_WINDOW); - cl_u16(&vc4->bcl, minx); - cl_u16(&vc4->bcl, miny); - cl_u16(&vc4->bcl, maxx - minx); - cl_u16(&vc4->bcl, maxy - miny); + cl_u8(&bcl, VC4_PACKET_CLIP_WINDOW); + cl_u16(&bcl, minx); + cl_u16(&bcl, miny); + cl_u16(&bcl, maxx - minx); + cl_u16(&bcl, maxy - miny); vc4->draw_min_x = MIN2(vc4->draw_min_x, minx); vc4->draw_min_y = MIN2(vc4->draw_min_y, miny); @@ -53,47 +54,49 @@ vc4_emit_state(struct pipe_context *pctx) } if (vc4->dirty & (VC4_DIRTY_RASTERIZER | VC4_DIRTY_ZSA)) { - cl_u8(&vc4->bcl, VC4_PACKET_CONFIGURATION_BITS); - cl_u8(&vc4->bcl, + cl_u8(&bcl, VC4_PACKET_CONFIGURATION_BITS); + cl_u8(&bcl, vc4->rasterizer->config_bits[0] | vc4->zsa->config_bits[0]); - cl_u8(&vc4->bcl, + cl_u8(&bcl, vc4->rasterizer->config_bits[1] | vc4->zsa->config_bits[1]); - cl_u8(&vc4->bcl, + cl_u8(&bcl, vc4->rasterizer->config_bits[2] | vc4->zsa->config_bits[2]); } if (vc4->dirty & VC4_DIRTY_RASTERIZER) { - cl_u8(&vc4->bcl, VC4_PACKET_DEPTH_OFFSET); - cl_u16(&vc4->bcl, vc4->rasterizer->offset_factor); - cl_u16(&vc4->bcl, vc4->rasterizer->offset_units); + cl_u8(&bcl, VC4_PACKET_DEPTH_OFFSET); + cl_u16(&bcl, vc4->rasterizer->offset_factor); + cl_u16(&bcl, vc4->rasterizer->offset_units); - cl_u8(&vc4->bcl, VC4_PACKET_POINT_SIZE); - cl_f(&vc4->bcl, vc4->rasterizer->point_size); + cl_u8(&bcl, VC4_PACKET_POINT_SIZE); + cl_f(&bcl, vc4->rasterizer->point_size); - cl_u8(&vc4->bcl, VC4_PACKET_LINE_WIDTH); - cl_f(&vc4->bcl, vc4->rasterizer->base.line_width); + cl_u8(&bcl, VC4_PACKET_LINE_WIDTH); + cl_f(&bcl, vc4->rasterizer->base.line_width); } if (vc4->dirty & VC4_DIRTY_VIEWPORT) { - cl_u8(&vc4->bcl, VC4_PACKET_CLIPPER_XY_SCALING); - cl_f(&vc4->bcl, vc4->viewport.scale[0] * 16.0f); - cl_f(&vc4->bcl, vc4->viewport.scale[1] * 16.0f); + cl_u8(&bcl, VC4_PACKET_CLIPPER_XY_SCALING); + cl_f(&bcl, vc4->viewport.scale[0] * 16.0f); + cl_f(&bcl, vc4->viewport.scale[1] * 16.0f); - cl_u8(&vc4->bcl, VC4_PACKET_CLIPPER_Z_SCALING); - cl_f(&vc4->bcl, vc4->viewport.translate[2]); - cl_f(&vc4->bcl, vc4->viewport.scale[2]); + cl_u8(&bcl, VC4_PACKET_CLIPPER_Z_SCALING); + cl_f(&bcl, vc4->viewport.translate[2]); + cl_f(&bcl, vc4->viewport.scale[2]); - cl_u8(&vc4->bcl, VC4_PACKET_VIEWPORT_OFFSET); - cl_u16(&vc4->bcl, 16 * vc4->viewport.translate[0]); - cl_u16(&vc4->bcl, 16 * vc4->viewport.translate[1]); + cl_u8(&bcl, VC4_PACKET_VIEWPORT_OFFSET); + cl_u16(&bcl, 16 * vc4->viewport.translate[0]); + cl_u16(&bcl, 16 * vc4->viewport.translate[1]); } if (vc4->dirty & VC4_DIRTY_FLAT_SHADE_FLAGS) { - cl_u8(&vc4->bcl, VC4_PACKET_FLAT_SHADE_FLAGS); - cl_u32(&vc4->bcl, vc4->rasterizer->base.flatshade ? + cl_u8(&bcl, VC4_PACKET_FLAT_SHADE_FLAGS); + cl_u32(&bcl, vc4->rasterizer->base.flatshade ? vc4->prog.fs->color_inputs : 0); } + + cl_end(&vc4->bcl, bcl); } diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c index a7aa3172a75..e61ea2170ff 100644 --- a/src/gallium/drivers/vc4/vc4_program.c +++ b/src/gallium/drivers/vc4/vc4_program.c @@ -2530,13 +2530,14 @@ static uint32_t translate_wrap(uint32_t p_wrap, bool using_nearest) static void write_texture_p0(struct vc4_context *vc4, + struct vc4_cl_out **uniforms, struct vc4_texture_stateobj *texstate, uint32_t unit) { struct pipe_sampler_view *texture = texstate->textures[unit]; struct vc4_resource *rsc = vc4_resource(texture->texture); - cl_reloc(vc4, &vc4->uniforms, rsc->bo, + cl_reloc(vc4, &vc4->uniforms, uniforms, rsc->bo, VC4_SET_FIELD(rsc->slices[0].offset >> 12, VC4_TEX_P0_OFFSET) | VC4_SET_FIELD(texture->u.tex.last_level - texture->u.tex.first_level, VC4_TEX_P0_MIPLVLS) | @@ -2547,6 +2548,7 @@ write_texture_p0(struct vc4_context *vc4, static void write_texture_p1(struct vc4_context *vc4, + struct vc4_cl_out **uniforms, struct vc4_texture_stateobj *texstate, uint32_t unit) { @@ -2570,7 +2572,7 @@ write_texture_p1(struct vc4_context *vc4, (sampler->mag_img_filter == PIPE_TEX_MIPFILTER_NEAREST || sampler->min_img_filter == PIPE_TEX_MIPFILTER_NEAREST); - cl_aligned_u32(&vc4->uniforms, + cl_aligned_u32(uniforms, VC4_SET_FIELD(rsc->vc4_format >> 4, VC4_TEX_P1_TYPE4) | VC4_SET_FIELD(texture->texture->height0 & 2047, VC4_TEX_P1_HEIGHT) | @@ -2589,6 +2591,7 @@ write_texture_p1(struct vc4_context *vc4, static void write_texture_p2(struct vc4_context *vc4, + struct vc4_cl_out **uniforms, struct vc4_texture_stateobj *texstate, uint32_t data) { @@ -2596,7 +2599,7 @@ write_texture_p2(struct vc4_context *vc4, struct pipe_sampler_view *texture = texstate->textures[unit]; struct vc4_resource *rsc = vc4_resource(texture->texture); - cl_aligned_u32(&vc4->uniforms, + cl_aligned_u32(uniforms, VC4_SET_FIELD(VC4_TEX_P2_PTYPE_CUBE_MAP_STRIDE, VC4_TEX_P2_PTYPE) | VC4_SET_FIELD(rsc->cube_map_stride >> 12, VC4_TEX_P2_CMST) | @@ -2613,6 +2616,7 @@ write_texture_p2(struct vc4_context *vc4, static void write_texture_border_color(struct vc4_context *vc4, + struct vc4_cl_out **uniforms, struct vc4_texture_stateobj *texstate, uint32_t unit) { @@ -2673,7 +2677,7 @@ write_texture_border_color(struct vc4_context *vc4, } } - cl_aligned_u32(&vc4->uniforms, uc.ui[0]); + cl_aligned_u32(uniforms, uc.ui[0]); } static uint32_t @@ -2693,7 +2697,8 @@ get_texrect_scale(struct vc4_texture_stateobj *texstate, } static struct vc4_bo * -vc4_upload_ubo(struct vc4_context *vc4, struct vc4_compiled_shader *shader, +vc4_upload_ubo(struct vc4_context *vc4, + struct vc4_compiled_shader *shader, const uint32_t *gallium_uniforms) { if (!shader->ubo_size) @@ -2722,72 +2727,78 @@ vc4_write_uniforms(struct vc4_context *vc4, struct vc4_compiled_shader *shader, cl_ensure_space(&vc4->uniforms, (uinfo->count + uinfo->num_texture_samples) * 4); - cl_start_shader_reloc(&vc4->uniforms, uinfo->num_texture_samples); + struct vc4_cl_out *uniforms = + cl_start_shader_reloc(&vc4->uniforms, + uinfo->num_texture_samples); for (int i = 0; i < uinfo->count; i++) { switch (uinfo->contents[i]) { case QUNIFORM_CONSTANT: - cl_aligned_u32(&vc4->uniforms, uinfo->data[i]); + cl_aligned_u32(&uniforms, uinfo->data[i]); break; case QUNIFORM_UNIFORM: - cl_aligned_u32(&vc4->uniforms, + cl_aligned_u32(&uniforms, gallium_uniforms[uinfo->data[i]]); break; case QUNIFORM_VIEWPORT_X_SCALE: - cl_aligned_f(&vc4->uniforms, vc4->viewport.scale[0] * 16.0f); + cl_aligned_f(&uniforms, vc4->viewport.scale[0] * 16.0f); break; case QUNIFORM_VIEWPORT_Y_SCALE: - cl_aligned_f(&vc4->uniforms, vc4->viewport.scale[1] * 16.0f); + cl_aligned_f(&uniforms, vc4->viewport.scale[1] * 16.0f); break; case QUNIFORM_VIEWPORT_Z_OFFSET: - cl_aligned_f(&vc4->uniforms, vc4->viewport.translate[2]); + cl_aligned_f(&uniforms, vc4->viewport.translate[2]); break; case QUNIFORM_VIEWPORT_Z_SCALE: - cl_aligned_f(&vc4->uniforms, vc4->viewport.scale[2]); + cl_aligned_f(&uniforms, vc4->viewport.scale[2]); break; case QUNIFORM_USER_CLIP_PLANE: - cl_aligned_f(&vc4->uniforms, + cl_aligned_f(&uniforms, vc4->clip.ucp[uinfo->data[i] / 4][uinfo->data[i] % 4]); break; case QUNIFORM_TEXTURE_CONFIG_P0: - write_texture_p0(vc4, texstate, uinfo->data[i]); + write_texture_p0(vc4, &uniforms, texstate, + uinfo->data[i]); break; case QUNIFORM_TEXTURE_CONFIG_P1: - write_texture_p1(vc4, texstate, uinfo->data[i]); + write_texture_p1(vc4, &uniforms, texstate, + uinfo->data[i]); break; case QUNIFORM_TEXTURE_CONFIG_P2: - write_texture_p2(vc4, texstate, uinfo->data[i]); + write_texture_p2(vc4, &uniforms, texstate, + uinfo->data[i]); break; case QUNIFORM_UBO_ADDR: - cl_aligned_reloc(vc4, &vc4->uniforms, ubo, 0); + cl_aligned_reloc(vc4, &vc4->uniforms, &uniforms, ubo, 0); break; case QUNIFORM_TEXTURE_BORDER_COLOR: - write_texture_border_color(vc4, texstate, uinfo->data[i]); + write_texture_border_color(vc4, &uniforms, + texstate, uinfo->data[i]); break; case QUNIFORM_TEXRECT_SCALE_X: case QUNIFORM_TEXRECT_SCALE_Y: - cl_aligned_u32(&vc4->uniforms, + cl_aligned_u32(&uniforms, get_texrect_scale(texstate, uinfo->contents[i], uinfo->data[i])); break; case QUNIFORM_BLEND_CONST_COLOR: - cl_aligned_f(&vc4->uniforms, + cl_aligned_f(&uniforms, CLAMP(vc4->blend_color.color[uinfo->data[i]], 0, 1)); break; case QUNIFORM_STENCIL: - cl_aligned_u32(&vc4->uniforms, + cl_aligned_u32(&uniforms, vc4->zsa->stencil_uniforms[uinfo->data[i]] | (uinfo->data[i] <= 1 ? (vc4->stencil_ref.ref_value[uinfo->data[i]] << 8) : @@ -2795,16 +2806,18 @@ vc4_write_uniforms(struct vc4_context *vc4, struct vc4_compiled_shader *shader, break; case QUNIFORM_ALPHA_REF: - cl_aligned_f(&vc4->uniforms, + cl_aligned_f(&uniforms, vc4->zsa->base.alpha.ref_value); break; } #if 0 - uint32_t written_val = *(uint32_t *)(vc4->uniforms.next - 4); + uint32_t written_val = *((uint32_t *)uniforms - 1); fprintf(stderr, "%p: %d / 0x%08x (%f)\n", shader, i, written_val, uif(written_val)); #endif } + + cl_end(&vc4->uniforms, uniforms); } static void -- 2.30.2