From 1c21577246691589f0295081d208894082444a02 Mon Sep 17 00:00:00 2001 From: "Kristian H. Kristensen" Date: Fri, 15 May 2020 15:11:55 -0700 Subject: [PATCH] freedreno/a6xx: Emit VFD setup as array writes We can use only one PKT4 for each of VFD_FETCH, VFD_DECODE and VFD_DEST_CNTL and write all the elements if we split the loop into three loops. Part-of: --- src/gallium/drivers/freedreno/a6xx/fd6_emit.c | 32 +++++++++++++------ 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_emit.c b/src/gallium/drivers/freedreno/a6xx/fd6_emit.c index 6105fcd8cfd..52a44c67535 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_emit.c +++ b/src/gallium/drivers/freedreno/a6xx/fd6_emit.c @@ -568,38 +568,48 @@ build_vbo_state(struct fd6_emit *emit, const struct ir3_shader_variant *vp) } struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(emit->ctx->batch->submit, - 4 * (2 + cnt * 10), FD_RINGBUFFER_STREAMING); + 4 * (5 + cnt * 7), FD_RINGBUFFER_STREAMING); OUT_PKT4(ring, REG_A6XX_VFD_CONTROL_0, 1); OUT_RING(ring, A6XX_VFD_CONTROL_0_FETCH_CNT(cnt) | A6XX_VFD_CONTROL_0_DECODE_CNT(cnt)); + OUT_PKT4(ring, REG_A6XX_VFD_FETCH(0), 4 * cnt); for (int32_t j = 0; j < cnt; j++) { int32_t i = map[j]; struct pipe_vertex_element *elem = &vtx->vtx->pipe[i]; const struct pipe_vertex_buffer *vb = &vtx->vertexbuf.vb[elem->vertex_buffer_index]; struct fd_resource *rsc = fd_resource(vb->buffer.resource); - enum pipe_format pfmt = elem->src_format; - enum a6xx_format fmt = fd6_pipe2vtx(pfmt); - bool isint = util_format_is_pure_integer(pfmt); uint32_t off = vb->buffer_offset + elem->src_offset; uint32_t size = fd_bo_size(rsc->bo) - off; - debug_assert(fmt != ~0); #ifdef DEBUG /* see dEQP-GLES31.stress.vertex_attribute_binding.buffer_bounds.bind_vertex_buffer_offset_near_wrap_10 */ - if (off > fd_bo_size(rsc->bo)) + if (off > fd_bo_size(rsc->bo)) { + OUT_RING(ring, 0); + OUT_RING(ring, 0); + OUT_RING(ring, 0); + OUT_RING(ring, 0); continue; + } #endif - OUT_PKT4(ring, REG_A6XX_VFD_FETCH(j), 4); OUT_RELOC(ring, rsc->bo, off, 0, 0); OUT_RING(ring, size); /* VFD_FETCH[j].SIZE */ OUT_RING(ring, vb->stride); /* VFD_FETCH[j].STRIDE */ + } + + OUT_PKT4(ring, REG_A6XX_VFD_DECODE(0), 2 * cnt); + for (int32_t j = 0; j < cnt; j++) { + int32_t i = map[j]; + struct pipe_vertex_element *elem = &vtx->vtx->pipe[i]; + enum pipe_format pfmt = elem->src_format; + enum a6xx_format fmt = fd6_pipe2vtx(pfmt); + bool isint = util_format_is_pure_integer(pfmt); + debug_assert(fmt != ~0); - OUT_PKT4(ring, REG_A6XX_VFD_DECODE(j), 2); OUT_RING(ring, A6XX_VFD_DECODE_INSTR_IDX(j) | A6XX_VFD_DECODE_INSTR_FORMAT(fmt) | COND(elem->instance_divisor, A6XX_VFD_DECODE_INSTR_INSTANCED) | @@ -607,8 +617,12 @@ build_vbo_state(struct fd6_emit *emit, const struct ir3_shader_variant *vp) A6XX_VFD_DECODE_INSTR_UNK30 | COND(!isint, A6XX_VFD_DECODE_INSTR_FLOAT)); OUT_RING(ring, MAX2(1, elem->instance_divisor)); /* VFD_DECODE[j].STEP_RATE */ + } + + OUT_PKT4(ring, REG_A6XX_VFD_DEST_CNTL(0), cnt); + for (int32_t j = 0; j < cnt; j++) { + int32_t i = map[j]; - OUT_PKT4(ring, REG_A6XX_VFD_DEST_CNTL(j), 1); OUT_RING(ring, A6XX_VFD_DEST_CNTL_INSTR_WRITEMASK(vp->inputs[i].compmask) | A6XX_VFD_DEST_CNTL_INSTR_REGID(vp->inputs[i].regid)); } -- 2.30.2