From 78fede86d998b2c26aff237224ec6214bbddd4d3 Mon Sep 17 00:00:00 2001 From: Jonathan Marek Date: Tue, 13 Nov 2018 11:04:32 -0500 Subject: [PATCH] freedreno: a2xx: fd2_draw update Signed-off-by: Jonathan Marek Signed-off-by: Rob Clark --- src/gallium/drivers/freedreno/a2xx/fd2_draw.c | 98 ++++++++++++++++--- .../drivers/freedreno/freedreno_batch.c | 1 + .../drivers/freedreno/freedreno_batch.h | 1 + .../drivers/freedreno/freedreno_draw.c | 2 + .../drivers/freedreno/freedreno_draw.h | 24 ++++- .../drivers/freedreno/freedreno_util.h | 8 +- 6 files changed, 114 insertions(+), 20 deletions(-) diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_draw.c b/src/gallium/drivers/freedreno/a2xx/fd2_draw.c index f00bec6efcd..6dac8ca6a9d 100644 --- a/src/gallium/drivers/freedreno/a2xx/fd2_draw.c +++ b/src/gallium/drivers/freedreno/a2xx/fd2_draw.c @@ -77,29 +77,44 @@ emit_vertexbufs(struct fd_context *ctx) fd2_emit_vertex_bufs(ctx->batch->draw, 0x78, bufs, vtx->num_elements); } -static bool -fd2_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *info, - unsigned index_offset) +static void +draw_impl(struct fd_context *ctx, const struct pipe_draw_info *info, + struct fd_ringbuffer *ring, unsigned index_offset) { - struct fd_ringbuffer *ring = ctx->batch->draw; - - if (ctx->dirty & FD_DIRTY_VTXBUF) - emit_vertexbufs(ctx); - - fd2_emit_state(ctx, ctx->dirty); - OUT_PKT3(ring, CP_SET_CONSTANT, 2); OUT_RING(ring, CP_REG(REG_A2XX_VGT_INDX_OFFSET)); - OUT_RING(ring, info->start); + OUT_RING(ring, info->index_size ? 0 : info->start); OUT_PKT3(ring, CP_SET_CONSTANT, 2); OUT_RING(ring, CP_REG(REG_A2XX_VGT_VERTEX_REUSE_BLOCK_CNTL)); - OUT_RING(ring, 0x0000003b); + OUT_RING(ring, is_a20x(ctx->screen) ? 0x00000002 : 0x0000003b); OUT_PKT0(ring, REG_A2XX_TC_CNTL_STATUS, 1); OUT_RING(ring, A2XX_TC_CNTL_STATUS_L2_INVALIDATE); - if (!is_a20x(ctx->screen)) { + if (is_a20x(ctx->screen)) { + /* wait for DMA to finish and + * dummy draw one triangle with indexes 0,0,0. 
+ * with PRE_FETCH_CULL_ENABLE | GRP_CULL_ENABLE. + * + * this workaround is for a HW bug related to DMA alignment: + * it is necessary for indexed draws and possibly also + * draws that read binning data + */ + OUT_PKT3(ring, CP_WAIT_REG_EQ, 4); + OUT_RING(ring, 0x000005d0); /* RBBM_STATUS */ + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00001000); /* bit: 12: VGT_BUSY_NO_DMA */ + OUT_RING(ring, 0x00000001); + + OUT_PKT3(ring, CP_DRAW_INDX_BIN, 6); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x0003c004); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000003); + OUT_RELOC(ring, fd_resource(fd2_context(ctx)->solid_vertexbuf)->bo, 0x80, 0, 0); + OUT_RING(ring, 0x00000006); + } else { OUT_WFI (ring); OUT_PKT3(ring, CP_SET_CONSTANT, 3); @@ -111,11 +126,62 @@ fd2_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *info, fd_draw_emit(ctx->batch, ring, ctx->primtypes[info->mode], IGNORE_VISIBILITY, info, index_offset); - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_UNKNOWN_2010)); - OUT_RING(ring, 0x00000000); + if (is_a20x(ctx->screen)) { + /* not sure why this is required, but it fixes some hangs */ + OUT_WFI(ring); + } else { + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_UNKNOWN_2010)); + OUT_RING(ring, 0x00000000); + } emit_cacheflush(ring); +} + + +static bool +fd2_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *pinfo, + unsigned index_offset) +{ + if (!ctx->prog.fp || !ctx->prog.vp) + return false; + + if (ctx->dirty & FD_DIRTY_VTXBUF) + emit_vertexbufs(ctx); + + fd2_emit_state(ctx, ctx->dirty); + + /* a2xx can draw only 65535 vertices at once + * on a22x the field in the draw command is 32bits but seems limited too + * using a limit of 32k because it fixes an unexplained hang + * 32766 works for all primitives (multiple of 2 and 3) + */ + if (pinfo->count > 32766) { + static const uint16_t step_tbl[PIPE_PRIM_MAX] = { + [0 ... 
PIPE_PRIM_MAX - 1] = 32766, + [PIPE_PRIM_LINE_STRIP] = 32765, + [PIPE_PRIM_TRIANGLE_STRIP] = 32764, + + /* needs more work */ + [PIPE_PRIM_TRIANGLE_FAN] = 0, + [PIPE_PRIM_LINE_LOOP] = 0, + }; + + struct pipe_draw_info info = *pinfo; + unsigned count = info.count; + unsigned step = step_tbl[info.mode]; + + if (!step) + return false; + + for (; count + step > 32766; count -= step) { + info.count = MIN2(count, 32766); + draw_impl(ctx, &info, ctx->batch->draw, index_offset); + info.start += step; + } + } else { + draw_impl(ctx, pinfo, ctx->batch->draw, index_offset); + } fd_context_all_clean(ctx); diff --git a/src/gallium/drivers/freedreno/freedreno_batch.c b/src/gallium/drivers/freedreno/freedreno_batch.c index 28b4942f9f8..eae2f68ce11 100644 --- a/src/gallium/drivers/freedreno/freedreno_batch.c +++ b/src/gallium/drivers/freedreno/freedreno_batch.c @@ -83,6 +83,7 @@ batch_init(struct fd_batch *batch) batch->flushed = false; batch->gmem_reason = 0; batch->num_draws = 0; + batch->num_vertices = 0; batch->stage = FD_STAGE_NULL; fd_reset_wfi(batch); diff --git a/src/gallium/drivers/freedreno/freedreno_batch.h b/src/gallium/drivers/freedreno/freedreno_batch.h index d4feadd5590..a40d36094cd 100644 --- a/src/gallium/drivers/freedreno/freedreno_batch.h +++ b/src/gallium/drivers/freedreno/freedreno_batch.h @@ -124,6 +124,7 @@ struct fd_batch { FD_GMEM_LOGICOP_ENABLED = 0x20, } gmem_reason; unsigned num_draws; /* number of draws in current batch */ + unsigned num_vertices; /* number of vertices in current batch */ /* Track the maximal bounds of the scissor of all the draws within a * batch. 
Used at the tile rendering step (fd_gmem_render_tiles(), diff --git a/src/gallium/drivers/freedreno/freedreno_draw.c b/src/gallium/drivers/freedreno/freedreno_draw.c index 90d0e440361..f17cb563063 100644 --- a/src/gallium/drivers/freedreno/freedreno_draw.c +++ b/src/gallium/drivers/freedreno/freedreno_draw.c @@ -291,6 +291,8 @@ fd_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) if (ctx->draw_vbo(ctx, info, index_offset)) batch->needs_flush = true; + batch->num_vertices += info->count * info->instance_count; + for (i = 0; i < ctx->streamout.num_targets; i++) ctx->streamout.offsets[i] += info->count; diff --git a/src/gallium/drivers/freedreno/freedreno_draw.h b/src/gallium/drivers/freedreno/freedreno_draw.h index 8ae13411cef..c2197f21a9f 100644 --- a/src/gallium/drivers/freedreno/freedreno_draw.h +++ b/src/gallium/drivers/freedreno/freedreno_draw.h @@ -73,9 +73,29 @@ fd_draw(struct fd_batch *batch, struct fd_ringbuffer *ring, } if (is_a20x(batch->ctx->screen)) { - OUT_PKT3(ring, CP_DRAW_INDX, idx_buffer ? 4 : 2); + /* a20x has a different draw command for drawing with binning data + * note: if we do patching we will have to insert a NOP + * + * binning data is 1 byte/vertex (8x8x4 bin position of vertex) + * base ptr set by the CP_SET_DRAW_INIT_FLAGS command + * + * TODO: investigate the faceness_cull_select parameter to see how + * it is used with hw binning to use "faceness" bits + */ + uint32_t size = 2; + if (vismode) + size += 2; + if (idx_buffer) + size += 2; + + OUT_PKT3(ring, vismode ? CP_DRAW_INDX_BIN : CP_DRAW_INDX, size); OUT_RING(ring, 0x00000000); - OUT_RING(ring, DRAW_A20X(primtype, src_sel, idx_type, vismode, count)); + OUT_RING(ring, DRAW_A20X(primtype, DI_FACE_CULL_NONE, src_sel, + idx_type, vismode, vismode, count)); + if (vismode == USE_VISIBILITY) { + OUT_RING(ring, batch->num_vertices); + OUT_RING(ring, count); + } } else { OUT_PKT3(ring, CP_DRAW_INDX, idx_buffer ? 5 : 3); OUT_RING(ring, 0x00000000); /* viz query info. 
*/ diff --git a/src/gallium/drivers/freedreno/freedreno_util.h b/src/gallium/drivers/freedreno/freedreno_util.h index 125ad83523c..b0ed3fa8e4f 100644 --- a/src/gallium/drivers/freedreno/freedreno_util.h +++ b/src/gallium/drivers/freedreno/freedreno_util.h @@ -114,15 +114,19 @@ static inline uint32_t DRAW(enum pc_di_primtype prim_type, } static inline uint32_t DRAW_A20X(enum pc_di_primtype prim_type, + enum pc_di_face_cull_sel faceness_cull_select, enum pc_di_src_sel source_select, enum pc_di_index_size index_size, - enum pc_di_vis_cull_mode vis_cull_mode, + bool pre_fetch_cull_enable, + bool grp_cull_enable, uint16_t count) { return (prim_type << 0) | (source_select << 6) | + (faceness_cull_select << 8) | ((index_size & 1) << 11) | ((index_size >> 1) << 13) | - (vis_cull_mode << 9) | + (pre_fetch_cull_enable << 14) | + (grp_cull_enable << 15) | (count << 16); } -- 2.30.2