The binning pass sorts vertices into the bins/tiles they apply to.
The visibility information generated during the binning pass can be
used to speed up the rendering pass by filtering out vertices that
do not apply to the current tile. See:
https://github.com/freedreno/freedreno/wiki/Adreno-tiling#optimized-approach
This brings a significant fps boost. A rough assortment of tests
(supertuxkart, etracer, tremulous, glmark2 'build' test, etc.) seems
to yield a ~35-45% fps improvement.
For now, to be conservative, the binning pass is not yet enabled by
default. To enable it, use:
FD_MESA_DEBUG=binning
So far I haven't found anything that breaks with binning enabled,
but I'd like a bit more testing before enabling it by default.
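
To illustrate the idea, here is a standalone toy sketch of binned
rendering (not code from this patch; the bin sizes, names, and
structures are all made up):

    #include <stdbool.h>
    #include <stdio.h>

    #define NBINS_X 4
    #define NBINS_Y 4
    #define BIN_W   32
    #define BIN_H   32

    struct prim { int x0, y0, x1, y1; };  /* screen-space bounding box */

    /* visibility info: which prims touch which bin */
    static bool visible[NBINS_X * NBINS_Y][16];

    /* binning pass: sort prims into the bins they cover */
    static void binning_pass(const struct prim *p, int n)
    {
        for (int i = 0; i < n; i++)
            for (int by = p[i].y0 / BIN_H; by <= p[i].y1 / BIN_H; by++)
                for (int bx = p[i].x0 / BIN_W; bx <= p[i].x1 / BIN_W; bx++)
                    visible[by * NBINS_X + bx][i] = true;
    }

    /* rendering pass: per tile, skip prims binned to other tiles */
    static void render_pass(const struct prim *p, int n)
    {
        for (int b = 0; b < NBINS_X * NBINS_Y; b++)
            for (int i = 0; i < n; i++)
                if (visible[b][i])
                    printf("bin %d: draw prim %d\n", b, i);
    }

    int main(void)
    {
        struct prim prims[] = { { 0, 0, 40, 40 }, { 96, 96, 120, 120 } };
        binning_pass(prims, 2);
        render_pass(prims, 2);
        return 0;
    }

The hw version of this keeps the per-bin visibility in the VSC pipe
buffers, which the rendering pass consumes via CP_SET_BIN_DATA.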
Signed-off-by: Rob Clark <robclark@freedesktop.org>
LIBDRM_INTEL_REQUIRED=2.4.49
LIBDRM_NVVIEUX_REQUIRED=2.4.33
LIBDRM_NOUVEAU_REQUIRED="2.4.33 libdrm >= 2.4.41"
-LIBDRM_FREEDRENO_REQUIRED=2.4.39
+LIBDRM_FREEDRENO_REQUIRED=2.4.51
DRI2PROTO_REQUIRED=2.6
DRI3PROTO_REQUIRED=1.0
PRESENTPROTO_REQUIRED=1.0
OUT_RING(ring, info->max_index); /* VGT_MAX_VTX_INDX */
OUT_RING(ring, info->min_index); /* VGT_MIN_VTX_INDX */
- fd_draw_emit(ctx, info);
+ fd_draw_emit(ctx, ring, IGNORE_VISIBILITY, info);
OUT_PKT3(ring, CP_SET_CONSTANT, 2);
OUT_RING(ring, CP_REG(REG_A2XX_UNKNOWN_2010));
OUT_RING(ring, 3); /* VGT_MAX_VTX_INDX */
OUT_RING(ring, 0); /* VGT_MIN_VTX_INDX */
- fd_draw(ctx, DI_PT_RECTLIST, DI_SRC_SEL_AUTO_INDEX, 3,
- INDEX_SIZE_IGN, 0, 0, NULL);
+ fd_draw(ctx, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY,
+ DI_SRC_SEL_AUTO_INDEX, 3, INDEX_SIZE_IGN, 0, 0, NULL);
OUT_PKT3(ring, CP_SET_CONSTANT, 2);
OUT_RING(ring, CP_REG(REG_A2XX_A220_RB_LRZ_VSC_CONTROL));
OUT_RING(ring, 3); /* VGT_MAX_VTX_INDX */
OUT_RING(ring, 0); /* VGT_MIN_VTX_INDX */
- fd_draw(ctx, DI_PT_RECTLIST, DI_SRC_SEL_AUTO_INDEX, 3,
- INDEX_SIZE_IGN, 0, 0, NULL);
+ fd_draw(ctx, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY,
+ DI_SRC_SEL_AUTO_INDEX, 3, INDEX_SIZE_IGN, 0, 0, NULL);
}
static void
OUT_RING(ring, 3); /* VGT_MAX_VTX_INDX */
OUT_RING(ring, 0); /* VGT_MIN_VTX_INDX */
- fd_draw(ctx, DI_PT_RECTLIST, DI_SRC_SEL_AUTO_INDEX, 3,
- INDEX_SIZE_IGN, 0, 0, NULL);
+ fd_draw(ctx, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY,
+ DI_SRC_SEL_AUTO_INDEX, 3, INDEX_SIZE_IGN, 0, 0, NULL);
}
static void
static void
-emit_vertexbufs(struct fd_context *ctx)
+emit_vertexbufs(struct fd_context *ctx, struct fd_ringbuffer *ring)
{
struct fd_vertex_stateobj *vtx = ctx->vtx;
struct fd_vertexbuf_stateobj *vertexbuf = &ctx->vertexbuf;
bufs[i].format = elem->src_format;
}
- fd3_emit_vertex_bufs(ctx->ring, &ctx->prog, bufs, vtx->num_elements);
+ fd3_emit_vertex_bufs(ring, &ctx->prog, bufs, vtx->num_elements);
}
static void
-fd3_draw(struct fd_context *ctx, const struct pipe_draw_info *info)
+draw_impl(struct fd_context *ctx, const struct pipe_draw_info *info,
+ struct fd_ringbuffer *ring, unsigned dirty, bool binning)
{
- struct fd_ringbuffer *ring = ctx->ring;
- unsigned dirty = ctx->dirty;
-
- fd3_emit_state(ctx, dirty);
+ fd3_emit_state(ctx, ring, dirty, binning);
if (dirty & FD_DIRTY_VTXBUF)
- emit_vertexbufs(ctx);
+ emit_vertexbufs(ctx, ring);
OUT_PKT0(ring, REG_A3XX_PC_VERTEX_REUSE_BLOCK_CNTL, 1);
OUT_RING(ring, 0x0000000b); /* PC_VERTEX_REUSE_BLOCK_CNTL */
OUT_RING(ring, info->primitive_restart ? /* PC_RESTART_INDEX */
info->restart_index : 0xffffffff);
- fd_draw_emit(ctx, info);
+ fd_draw_emit(ctx, ring, binning ? IGNORE_VISIBILITY : USE_VISIBILITY, info);
+}
+
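+/* Each draw is recorded twice: once into the binning cmdstream (with
+ * blend state masked out and the binning variant of the shader state),
+ * and once into the normal draw cmdstream:
+ */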
+static void
+fd3_draw(struct fd_context *ctx, const struct pipe_draw_info *info)
+{
+ unsigned dirty = ctx->dirty;
+ draw_impl(ctx, info, ctx->binning_ring,
+ dirty & ~(FD_DIRTY_BLEND), true);
+ draw_impl(ctx, info, ctx->ring, dirty, false);
+}
+
+/* binning pass cmds for a clear:
+ * NOTE: newer blob drivers don't use binning for clear, which is probably
+ * preferable since a clear is low vtx count.  However that doesn't seem
+ * to actually work for me.  Not sure if it depends on support for a
+ * clear pass (rather than using the solid-fill shader), or something
+ * else that the newer blob is doing differently.  Once that is figured
+ * out, we can remove fd3_clear_binning().
+ */
+static void
+fd3_clear_binning(struct fd_context *ctx, unsigned dirty)
+{
+ struct fd3_context *fd3_ctx = fd3_context(ctx);
+ struct fd_ringbuffer *ring = ctx->binning_ring;
+
+ fd3_emit_state(ctx, ring, dirty & (FD_DIRTY_VIEWPORT |
+ FD_DIRTY_FRAMEBUFFER | FD_DIRTY_SCISSOR), true);
+
+ fd3_program_emit(ring, &ctx->solid_prog, true);
+
+ fd3_emit_vertex_bufs(ring, &ctx->solid_prog, (struct fd3_vertex_buf[]) {
+ { .prsc = fd3_ctx->solid_vbuf, .stride = 12, .format = PIPE_FORMAT_R32G32B32_FLOAT },
+ }, 1);
+
+ OUT_PKT0(ring, REG_A3XX_PC_PRIM_VTX_CNTL, 1);
+ OUT_RING(ring, A3XX_PC_PRIM_VTX_CNTL_STRIDE_IN_VPC(0) |
+ A3XX_PC_PRIM_VTX_CNTL_POLYMODE_FRONT_PTYPE(PC_DRAW_TRIANGLES) |
+ A3XX_PC_PRIM_VTX_CNTL_POLYMODE_BACK_PTYPE(PC_DRAW_TRIANGLES) |
+ A3XX_PC_PRIM_VTX_CNTL_PROVOKING_VTX_LAST);
+ OUT_PKT0(ring, REG_A3XX_VFD_INDEX_MIN, 4);
+ OUT_RING(ring, 0); /* VFD_INDEX_MIN */
+ OUT_RING(ring, 2); /* VFD_INDEX_MAX */
+ OUT_RING(ring, 0); /* VFD_INSTANCEID_OFFSET */
+ OUT_RING(ring, 0); /* VFD_INDEX_OFFSET */
+ OUT_PKT0(ring, REG_A3XX_PC_RESTART_INDEX, 1);
+ OUT_RING(ring, 0xffffffff); /* PC_RESTART_INDEX */
+
+ OUT_PKT3(ring, CP_EVENT_WRITE, 1);
+ OUT_RING(ring, PERFCOUNTER_STOP);
+
+ fd_draw(ctx, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY,
+ DI_SRC_SEL_AUTO_INDEX, 2, INDEX_SIZE_IGN, 0, 0, NULL);
}
static void
{
struct fd3_context *fd3_ctx = fd3_context(ctx);
struct fd_ringbuffer *ring = ctx->ring;
+ unsigned dirty = ctx->dirty;
unsigned ce, i;
+ fd3_clear_binning(ctx, dirty);
+
/* emit generic state now: */
- fd3_emit_state(ctx, ctx->dirty & (FD_DIRTY_VIEWPORT |
- FD_DIRTY_FRAMEBUFFER | FD_DIRTY_SCISSOR));
+ fd3_emit_state(ctx, ring, dirty & (FD_DIRTY_VIEWPORT |
+ FD_DIRTY_FRAMEBUFFER | FD_DIRTY_SCISSOR), false);
OUT_PKT0(ring, REG_A3XX_RB_BLEND_ALPHA, 1);
OUT_RING(ring, A3XX_RB_BLEND_ALPHA_UINT(0xff) |
OUT_PKT0(ring, REG_A3XX_GRAS_SU_MODE_CONTROL, 1);
OUT_RING(ring, A3XX_GRAS_SU_MODE_CONTROL_LINEHALFWIDTH(0));
- fd3_program_emit(ring, &ctx->solid_prog);
+ fd3_program_emit(ring, &ctx->solid_prog, false);
fd3_emit_vertex_bufs(ring, &ctx->solid_prog, (struct fd3_vertex_buf[]) {
{ .prsc = fd3_ctx->solid_vbuf, .stride = 12, .format = PIPE_FORMAT_R32G32B32_FLOAT },
OUT_PKT3(ring, CP_EVENT_WRITE, 1);
OUT_RING(ring, PERFCOUNTER_STOP);
- fd_draw(ctx, DI_PT_RECTLIST, DI_SRC_SEL_AUTO_INDEX, 2,
- INDEX_SIZE_IGN, 0, 0, NULL);
+ fd_draw(ctx, ring, DI_PT_RECTLIST, USE_VISIBILITY,
+ DI_SRC_SEL_AUTO_INDEX, 2, INDEX_SIZE_IGN, 0, 0, NULL);
}
void
}
void
-fd3_emit_state(struct fd_context *ctx, uint32_t dirty)
+fd3_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
+ uint32_t dirty, bool binning)
{
- struct fd_ringbuffer *ring = ctx->ring;
-
emit_marker(ring, 5);
if (dirty & FD_DIRTY_SAMPLE_MASK) {
struct fd3_zsa_stateobj *zsa = fd3_zsa_stateobj(ctx->zsa);
struct pipe_stencil_ref *sr = &ctx->stencil_ref;
- fd3_emit_rbrc_draw_state(ctx, ring, zsa->rb_render_control);
+ if (!binning)
+ fd3_emit_rbrc_draw_state(ctx, ring, zsa->rb_render_control);
OUT_PKT0(ring, REG_A3XX_RB_ALPHA_REF, 1);
OUT_RING(ring, zsa->rb_alpha_ref);
}
if (dirty & FD_DIRTY_PROG)
- fd3_program_emit(ring, &ctx->prog);
+ fd3_program_emit(ring, &ctx->prog, binning);
+
+ OUT_PKT3(ring, CP_EVENT_WRITE, 1);
+ OUT_RING(ring, HLSQ_FLUSH);
if (dirty & (FD_DIRTY_PROG | FD_DIRTY_CONSTBUF)) {
struct fd_program_stateobj *prog = &ctx->prog;
OUT_RING(ring, A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_STARTENTRY(0) |
A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_ENDENTRY(0));
- OUT_PKT0(ring, REG_A3XX_UCHE_CACHE_MODE_CONTROL_REG, 1);
- OUT_RING(ring, 0x00000001); /* UCHE_CACHE_MODE_CONTROL_REG */
-
- OUT_PKT0(ring, REG_A3XX_VSC_SIZE_ADDRESS, 1);
- OUT_RELOC(ring, fd3_ctx->vsc_size_mem, 0, 0, 0); /* VSC_SIZE_ADDRESS */
+ OUT_PKT0(ring, REG_A3XX_UCHE_CACHE_INVALIDATE0_REG, 2);
+ OUT_RING(ring, A3XX_UCHE_CACHE_INVALIDATE0_REG_ADDR(0));
+ OUT_RING(ring, A3XX_UCHE_CACHE_INVALIDATE1_REG_ADDR(0) |
+ A3XX_UCHE_CACHE_INVALIDATE1_REG_OPCODE(INVALIDATE) |
+ A3XX_UCHE_CACHE_INVALIDATE1_REG_ENTIRE_CACHE);
OUT_PKT0(ring, REG_A3XX_GRAS_CL_CLIP_CNTL, 1);
OUT_RING(ring, 0x00000000); /* GRAS_CL_CLIP_CNTL */
OUT_RING(ring, 0x00000000); /* GRAS_CL_USER_PLANE[i].W */
}
+ OUT_PKT0(ring, REG_A3XX_PC_VSTREAM_CONTROL, 1);
+ OUT_RING(ring, 0x00000000);
+
emit_cache_flush(ring);
fd_rmw_wfi(ctx, ring);
}
void fd3_emit_vertex_bufs(struct fd_ringbuffer *ring,
struct fd_program_stateobj *prog,
struct fd3_vertex_buf *vbufs, uint32_t n);
-void fd3_emit_state(struct fd_context *ctx, uint32_t dirty);
+void fd3_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
+ uint32_t dirty, bool binning);
void fd3_emit_restore(struct fd_context *ctx);
return align(gmem->bin_w * gmem->bin_h, 0x4000);
}
+static bool
+use_hw_binning(struct fd_context *ctx)
+{
+ struct fd_gmem_stateobj *gmem = &ctx->gmem;
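+ /* presumably with only a couple bins the extra binning pass costs
+  * more than the visibility info saves:
+  */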
+ return fd_binning_enabled && ((gmem->nbins_x * gmem->nbins_y) > 2);
+}
+
+/* workaround for (hlsq?) lockup with hw binning on a3xx patchlevel 0 */
+static void update_vsc_pipe(struct fd_context *ctx);
+static void
+emit_binning_workaround(struct fd_context *ctx)
+{
+ struct fd3_context *fd3_ctx = fd3_context(ctx);
+ struct fd_gmem_stateobj *gmem = &ctx->gmem;
+ struct fd_ringbuffer *ring = ctx->ring;
+
+ OUT_PKT0(ring, REG_A3XX_RB_MODE_CONTROL, 2);
+ OUT_RING(ring, A3XX_RB_MODE_CONTROL_RENDER_MODE(RB_RESOLVE_PASS) |
+ A3XX_RB_MODE_CONTROL_MARB_CACHE_SPLIT_MODE);
+ OUT_RING(ring, A3XX_RB_RENDER_CONTROL_BIN_WIDTH(32) |
+ A3XX_RB_RENDER_CONTROL_DISABLE_COLOR_PIPE |
+ A3XX_RB_RENDER_CONTROL_ALPHA_TEST_FUNC(FUNC_NEVER));
+
+ OUT_PKT0(ring, REG_A3XX_RB_COPY_CONTROL, 4);
+ OUT_RING(ring, A3XX_RB_COPY_CONTROL_MSAA_RESOLVE(MSAA_ONE) |
+ A3XX_RB_COPY_CONTROL_MODE(0) |
+ A3XX_RB_COPY_CONTROL_GMEM_BASE(0));
+ OUT_RELOC(ring, fd_resource(fd3_ctx->solid_vbuf)->bo, 0x20, 0, -1); /* RB_COPY_DEST_BASE */
+ OUT_RING(ring, A3XX_RB_COPY_DEST_PITCH_PITCH(128));
+ OUT_RING(ring, A3XX_RB_COPY_DEST_INFO_TILE(LINEAR) |
+ A3XX_RB_COPY_DEST_INFO_FORMAT(RB_R8G8B8A8_UNORM) |
+ A3XX_RB_COPY_DEST_INFO_SWAP(WZYX) |
+ A3XX_RB_COPY_DEST_INFO_COMPONENT_ENABLE(0xf) |
+ A3XX_RB_COPY_DEST_INFO_ENDIAN(ENDIAN_NONE));
+
+ OUT_PKT0(ring, REG_A3XX_GRAS_SC_CONTROL, 1);
+ OUT_RING(ring, A3XX_GRAS_SC_CONTROL_RENDER_MODE(RB_RESOLVE_PASS) |
+ A3XX_GRAS_SC_CONTROL_MSAA_SAMPLES(MSAA_ONE) |
+ A3XX_GRAS_SC_CONTROL_RASTER_MODE(1));
+
+ fd3_program_emit(ring, &ctx->solid_prog, false);
+
+ fd3_emit_vertex_bufs(ring, &ctx->solid_prog, (struct fd3_vertex_buf[]) {
+ { .prsc = fd3_ctx->solid_vbuf, .stride = 12, .format = PIPE_FORMAT_R32G32B32_FLOAT },
+ }, 1);
+
+ OUT_PKT0(ring, REG_A3XX_HLSQ_CONTROL_0_REG, 4);
+ OUT_RING(ring, A3XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE(FOUR_QUADS) |
+ A3XX_HLSQ_CONTROL_0_REG_FSSUPERTHREADENABLE |
+ A3XX_HLSQ_CONTROL_0_REG_RESERVED2 |
+ A3XX_HLSQ_CONTROL_0_REG_SPCONSTFULLUPDATE);
+ OUT_RING(ring, A3XX_HLSQ_CONTROL_1_REG_VSTHREADSIZE(TWO_QUADS) |
+ A3XX_HLSQ_CONTROL_1_REG_VSSUPERTHREADENABLE);
+ OUT_RING(ring, A3XX_HLSQ_CONTROL_2_REG_PRIMALLOCTHRESHOLD(31));
+ OUT_RING(ring, 0); /* HLSQ_CONTROL_3_REG */
+
+ OUT_PKT0(ring, REG_A3XX_HLSQ_CONST_FSPRESV_RANGE_REG, 1);
+ OUT_RING(ring, A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_STARTENTRY(0x20) |
+ A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_ENDENTRY(0x20));
+
+ OUT_PKT0(ring, REG_A3XX_RB_MSAA_CONTROL, 1);
+ OUT_RING(ring, A3XX_RB_MSAA_CONTROL_DISABLE |
+ A3XX_RB_MSAA_CONTROL_SAMPLES(MSAA_ONE) |
+ A3XX_RB_MSAA_CONTROL_SAMPLE_MASK(0xffff));
+
+ OUT_PKT0(ring, REG_A3XX_RB_DEPTH_CONTROL, 1);
+ OUT_RING(ring, A3XX_RB_DEPTH_CONTROL_ZFUNC(FUNC_NEVER));
+
+ OUT_PKT0(ring, REG_A3XX_RB_STENCIL_CONTROL, 1);
+ OUT_RING(ring, A3XX_RB_STENCIL_CONTROL_FUNC(FUNC_NEVER) |
+ A3XX_RB_STENCIL_CONTROL_FAIL(STENCIL_KEEP) |
+ A3XX_RB_STENCIL_CONTROL_ZPASS(STENCIL_KEEP) |
+ A3XX_RB_STENCIL_CONTROL_ZFAIL(STENCIL_KEEP) |
+ A3XX_RB_STENCIL_CONTROL_FUNC_BF(FUNC_NEVER) |
+ A3XX_RB_STENCIL_CONTROL_FAIL_BF(STENCIL_KEEP) |
+ A3XX_RB_STENCIL_CONTROL_ZPASS_BF(STENCIL_KEEP) |
+ A3XX_RB_STENCIL_CONTROL_ZFAIL_BF(STENCIL_KEEP));
+
+ OUT_PKT0(ring, REG_A3XX_GRAS_SU_MODE_CONTROL, 1);
+ OUT_RING(ring, A3XX_GRAS_SU_MODE_CONTROL_LINEHALFWIDTH(0.0));
+
+ OUT_PKT0(ring, REG_A3XX_VFD_INDEX_MIN, 4);
+ OUT_RING(ring, 0); /* VFD_INDEX_MIN */
+ OUT_RING(ring, 2); /* VFD_INDEX_MAX */
+ OUT_RING(ring, 0); /* VFD_INSTANCEID_OFFSET */
+ OUT_RING(ring, 0); /* VFD_INDEX_OFFSET */
+
+ OUT_PKT0(ring, REG_A3XX_PC_PRIM_VTX_CNTL, 1);
+ OUT_RING(ring, A3XX_PC_PRIM_VTX_CNTL_STRIDE_IN_VPC(0) |
+ A3XX_PC_PRIM_VTX_CNTL_POLYMODE_FRONT_PTYPE(PC_DRAW_TRIANGLES) |
+ A3XX_PC_PRIM_VTX_CNTL_POLYMODE_BACK_PTYPE(PC_DRAW_TRIANGLES) |
+ A3XX_PC_PRIM_VTX_CNTL_PROVOKING_VTX_LAST);
+
+ OUT_PKT0(ring, REG_A3XX_GRAS_SC_WINDOW_SCISSOR_TL, 2);
+ OUT_RING(ring, A3XX_GRAS_SC_WINDOW_SCISSOR_TL_X(0) |
+ A3XX_GRAS_SC_WINDOW_SCISSOR_TL_Y(1));
+ OUT_RING(ring, A3XX_GRAS_SC_WINDOW_SCISSOR_BR_X(0) |
+ A3XX_GRAS_SC_WINDOW_SCISSOR_BR_Y(1));
+
+ OUT_PKT0(ring, REG_A3XX_GRAS_SC_SCREEN_SCISSOR_TL, 2);
+ OUT_RING(ring, A3XX_GRAS_SC_SCREEN_SCISSOR_TL_X(0) |
+ A3XX_GRAS_SC_SCREEN_SCISSOR_TL_Y(0));
+ OUT_RING(ring, A3XX_GRAS_SC_SCREEN_SCISSOR_BR_X(31) |
+ A3XX_GRAS_SC_SCREEN_SCISSOR_BR_Y(0));
+
+ OUT_PKT0(ring, REG_A3XX_GRAS_CL_VPORT_XOFFSET, 6);
+ OUT_RING(ring, A3XX_GRAS_CL_VPORT_XOFFSET(0.0));
+ OUT_RING(ring, A3XX_GRAS_CL_VPORT_XSCALE(1.0));
+ OUT_RING(ring, A3XX_GRAS_CL_VPORT_YOFFSET(0.0));
+ OUT_RING(ring, A3XX_GRAS_CL_VPORT_YSCALE(1.0));
+ OUT_RING(ring, A3XX_GRAS_CL_VPORT_ZOFFSET(0.0));
+ OUT_RING(ring, A3XX_GRAS_CL_VPORT_ZSCALE(1.0));
+
+ OUT_PKT0(ring, REG_A3XX_GRAS_CL_CLIP_CNTL, 1);
+ OUT_RING(ring, A3XX_GRAS_CL_CLIP_CNTL_CLIP_DISABLE |
+ A3XX_GRAS_CL_CLIP_CNTL_ZFAR_CLIP_DISABLE |
+ A3XX_GRAS_CL_CLIP_CNTL_VP_CLIP_CODE_IGNORE |
+ A3XX_GRAS_CL_CLIP_CNTL_VP_XFORM_DISABLE |
+ A3XX_GRAS_CL_CLIP_CNTL_PERSP_DIVISION_DISABLE);
+
+ OUT_PKT0(ring, REG_A3XX_GRAS_CL_GB_CLIP_ADJ, 1);
+ OUT_RING(ring, A3XX_GRAS_CL_GB_CLIP_ADJ_HORZ(0) |
+ A3XX_GRAS_CL_GB_CLIP_ADJ_VERT(0));
+
+ OUT_PKT3(ring, CP_DRAW_INDX_2, 5);
+ OUT_RING(ring, 0x00000000); /* viz query info. */
+ OUT_RING(ring, DRAW(DI_PT_RECTLIST, DI_SRC_SEL_IMMEDIATE,
+ INDEX_SIZE_32_BIT, IGNORE_VISIBILITY));
+ OUT_RING(ring, 2); /* NumIndices */
+ OUT_RING(ring, 2);
+ OUT_RING(ring, 1);
+
+ OUT_PKT0(ring, REG_A3XX_HLSQ_CONTROL_0_REG, 1);
+ OUT_RING(ring, A3XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE(TWO_QUADS));
+
+ OUT_PKT0(ring, REG_A3XX_VFD_PERFCOUNTER0_SELECT, 1);
+ OUT_RING(ring, 0x00000000);
+
+ OUT_WFI(ring);
+
+ OUT_PKT0(ring, REG_A3XX_VSC_BIN_SIZE, 1);
+ OUT_RING(ring, A3XX_VSC_BIN_SIZE_WIDTH(gmem->bin_w) |
+ A3XX_VSC_BIN_SIZE_HEIGHT(gmem->bin_h));
+
+ OUT_PKT0(ring, REG_A3XX_GRAS_SC_CONTROL, 1);
+ OUT_RING(ring, A3XX_GRAS_SC_CONTROL_RENDER_MODE(RB_RENDERING_PASS) |
+ A3XX_GRAS_SC_CONTROL_MSAA_SAMPLES(MSAA_ONE) |
+ A3XX_GRAS_SC_CONTROL_RASTER_MODE(0));
+
+ OUT_PKT0(ring, REG_A3XX_GRAS_CL_CLIP_CNTL, 1);
+ OUT_RING(ring, 0x00000000);
+}
+
/* transfer from gmem to system memory (ie. normal RAM) */
static void
A3XX_RB_COPY_DEST_INFO_ENDIAN(ENDIAN_NONE) |
A3XX_RB_COPY_DEST_INFO_SWAP(fd3_pipe2swap(psurf->format)));
- fd_draw(ctx, DI_PT_RECTLIST, DI_SRC_SEL_AUTO_INDEX, 2,
- INDEX_SIZE_IGN, 0, 0, NULL);
+ fd_draw(ctx, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY,
+ DI_SRC_SEL_AUTO_INDEX, 2, INDEX_SIZE_IGN, 0, 0, NULL);
}
static void
OUT_RING(ring, 0); /* VFD_INSTANCEID_OFFSET */
OUT_RING(ring, 0); /* VFD_INDEX_OFFSET */
- fd3_program_emit(ring, &ctx->solid_prog);
+ fd3_program_emit(ring, &ctx->solid_prog, false);
fd3_emit_vertex_bufs(ring, &ctx->solid_prog, (struct fd3_vertex_buf[]) {
{ .prsc = fd3_ctx->solid_vbuf, .stride = 12, .format = PIPE_FORMAT_R32G32B32_FLOAT },
fd3_emit_gmem_restore_tex(ring, psurf);
- fd_draw(ctx, DI_PT_RECTLIST, DI_SRC_SEL_AUTO_INDEX, 2,
- INDEX_SIZE_IGN, 0, 0, NULL);
+ fd_draw(ctx, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY,
+ DI_SRC_SEL_AUTO_INDEX, 2, INDEX_SIZE_IGN, 0, 0, NULL);
}
static void
OUT_RING(ring, 0); /* VFD_INSTANCEID_OFFSET */
OUT_RING(ring, 0); /* VFD_INDEX_OFFSET */
- fd3_program_emit(ring, &ctx->blit_prog);
+ fd3_program_emit(ring, &ctx->blit_prog, false);
fd3_emit_vertex_bufs(ring, &ctx->blit_prog, (struct fd3_vertex_buf[]) {
{ .prsc = fd3_ctx->blit_texcoord_vbuf, .stride = 8, .format = PIPE_FORMAT_R32G32_FLOAT },
A3XX_GRAS_SC_CONTROL_RASTER_MODE(0));
}
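+/* Patch up draw initiators emitted via OUT_RINGP(), OR'ing in the
+ * final vis-cull-mode once we know whether the hw binning pass is
+ * used for this batch:
+ */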
+static void
+patch_draws(struct fd_context *ctx, enum pc_di_vis_cull_mode vismode)
+{
+ unsigned i;
+ for (i = 0; i < fd_patch_num_elements(&ctx->draw_patches); i++) {
+ struct fd_cs_patch *patch = fd_patch_element(&ctx->draw_patches, i);
+ *patch->cs = patch->val | DRAW(0, 0, 0, vismode);
+ }
+ util_dynarray_resize(&ctx->draw_patches, 0);
+}
+
+/* for rendering directly to system memory: */
+static void
+fd3_emit_sysmem_prep(struct fd_context *ctx)
+{
+ struct pipe_framebuffer_state *pfb = &ctx->framebuffer;
+ struct fd_ringbuffer *ring = ctx->ring;
+ uint32_t pitch = 0;
+
+ if (pfb->cbufs[0])
+ pitch = fd_resource(pfb->cbufs[0]->texture)->slices[0].pitch;
+
+ fd3_emit_restore(ctx);
+
+ OUT_PKT0(ring, REG_A3XX_RB_FRAME_BUFFER_DIMENSION, 1);
+ OUT_RING(ring, A3XX_RB_FRAME_BUFFER_DIMENSION_WIDTH(pfb->width) |
+ A3XX_RB_FRAME_BUFFER_DIMENSION_HEIGHT(pfb->height));
+
+ emit_mrt(ring, pfb->nr_cbufs, pfb->cbufs, NULL, 0);
+
+ OUT_PKT0(ring, REG_A3XX_RB_RENDER_CONTROL, 1);
+ OUT_RING(ring, A3XX_RB_RENDER_CONTROL_ALPHA_TEST_FUNC(FUNC_NEVER) |
+ A3XX_RB_RENDER_CONTROL_BIN_WIDTH(pitch));
+
+ /* setup scissor/offset for current tile: */
+ OUT_PKT0(ring, REG_A3XX_RB_WINDOW_OFFSET, 1);
+ OUT_RING(ring, A3XX_RB_WINDOW_OFFSET_X(0) |
+ A3XX_RB_WINDOW_OFFSET_Y(0));
+
+ OUT_PKT0(ring, REG_A3XX_GRAS_SC_SCREEN_SCISSOR_TL, 2);
+ OUT_RING(ring, A3XX_GRAS_SC_SCREEN_SCISSOR_TL_X(0) |
+ A3XX_GRAS_SC_SCREEN_SCISSOR_TL_Y(0));
+ OUT_RING(ring, A3XX_GRAS_SC_SCREEN_SCISSOR_BR_X(pfb->width - 1) |
+ A3XX_GRAS_SC_SCREEN_SCISSOR_BR_Y(pfb->height - 1));
+
+ OUT_PKT0(ring, REG_A3XX_RB_MODE_CONTROL, 1);
+ OUT_RING(ring, A3XX_RB_MODE_CONTROL_RENDER_MODE(RB_RENDERING_PASS) |
+ A3XX_RB_MODE_CONTROL_GMEM_BYPASS |
+ A3XX_RB_MODE_CONTROL_MARB_CACHE_SPLIT_MODE);
+
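+ /* rendering directly to sysmem has no binning pass, so no draw
+  * should reference a visibility stream:
+  */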
+ patch_draws(ctx, IGNORE_VISIBILITY);
+}
+
static void
update_vsc_pipe(struct fd_context *ctx)
{
+ struct fd3_context *fd3_ctx = fd3_context(ctx);
struct fd_ringbuffer *ring = ctx->ring;
int i;
+ OUT_PKT0(ring, REG_A3XX_VSC_SIZE_ADDRESS, 1);
+ OUT_RELOC(ring, fd3_ctx->vsc_size_mem, 0, 0, 0); /* VSC_SIZE_ADDRESS */
+
for (i = 0; i < 8; i++) {
struct fd_vsc_pipe *pipe = &ctx->pipe[i];
DRM_FREEDRENO_GEM_TYPE_KMEM);
}
- OUT_PKT0(ring, REG_A3XX_VSC_PIPE(0), 3);
+ OUT_PKT0(ring, REG_A3XX_VSC_PIPE(i), 3);
OUT_RING(ring, A3XX_VSC_PIPE_CONFIG_X(pipe->x) |
A3XX_VSC_PIPE_CONFIG_Y(pipe->y) |
A3XX_VSC_PIPE_CONFIG_W(pipe->w) |
}
}
-/* for rendering directly to system memory: */
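+/* Emit the hw binning pass: execute the recorded binning cmds (via IB)
+ * in tiling-pass mode with color writes disabled, so the VSC fills the
+ * per-pipe visibility streams, then put state back for rendering:
+ */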
static void
-fd3_emit_sysmem_prep(struct fd_context *ctx)
+emit_binning_pass(struct fd_context *ctx)
{
struct pipe_framebuffer_state *pfb = &ctx->framebuffer;
struct fd_ringbuffer *ring = ctx->ring;
- uint32_t pitch = 0;
+ int i;
- if (pfb->cbufs[0])
- pitch = fd_resource(pfb->cbufs[0]->texture)->slices[0].pitch;
+ if (ctx->screen->gpu_id == 320) {
+ emit_binning_workaround(ctx);
- fd3_emit_restore(ctx);
+ OUT_PKT3(ring, CP_INVALIDATE_STATE, 1);
+ OUT_RING(ring, 0x00007fff);
+ }
+
+ OUT_PKT0(ring, REG_A3XX_VSC_BIN_CONTROL, 1);
+ OUT_RING(ring, A3XX_VSC_BIN_CONTROL_BINNING_ENABLE);
+
+ OUT_PKT0(ring, REG_A3XX_GRAS_SC_CONTROL, 1);
+ OUT_RING(ring, A3XX_GRAS_SC_CONTROL_RENDER_MODE(RB_TILING_PASS) |
+ A3XX_GRAS_SC_CONTROL_MSAA_SAMPLES(MSAA_ONE) |
+ A3XX_GRAS_SC_CONTROL_RASTER_MODE(0));
OUT_PKT0(ring, REG_A3XX_RB_FRAME_BUFFER_DIMENSION, 1);
OUT_RING(ring, A3XX_RB_FRAME_BUFFER_DIMENSION_WIDTH(pfb->width) |
A3XX_RB_FRAME_BUFFER_DIMENSION_HEIGHT(pfb->height));
- emit_mrt(ring, pfb->nr_cbufs, pfb->cbufs, NULL, 0);
-
OUT_PKT0(ring, REG_A3XX_RB_RENDER_CONTROL, 1);
OUT_RING(ring, A3XX_RB_RENDER_CONTROL_ALPHA_TEST_FUNC(FUNC_NEVER) |
- A3XX_RB_RENDER_CONTROL_BIN_WIDTH(pitch));
+ A3XX_RB_RENDER_CONTROL_DISABLE_COLOR_PIPE |
+ A3XX_RB_RENDER_CONTROL_BIN_WIDTH(ctx->gmem.bin_w));
- /* setup scissor/offset for current tile: */
+ /* setup scissor/offset for whole screen: */
OUT_PKT0(ring, REG_A3XX_RB_WINDOW_OFFSET, 1);
OUT_RING(ring, A3XX_RB_WINDOW_OFFSET_X(0) |
A3XX_RB_WINDOW_OFFSET_Y(0));
+ OUT_PKT0(ring, REG_A3XX_RB_LRZ_VSC_CONTROL, 1);
+ OUT_RING(ring, A3XX_RB_LRZ_VSC_CONTROL_BINNING_ENABLE);
+
OUT_PKT0(ring, REG_A3XX_GRAS_SC_SCREEN_SCISSOR_TL, 2);
OUT_RING(ring, A3XX_GRAS_SC_SCREEN_SCISSOR_TL_X(0) |
A3XX_GRAS_SC_SCREEN_SCISSOR_TL_Y(0));
A3XX_GRAS_SC_SCREEN_SCISSOR_BR_Y(pfb->height - 1));
OUT_PKT0(ring, REG_A3XX_RB_MODE_CONTROL, 1);
+ OUT_RING(ring, A3XX_RB_MODE_CONTROL_RENDER_MODE(RB_TILING_PASS) |
+ A3XX_RB_MODE_CONTROL_MARB_CACHE_SPLIT_MODE);
+
+ for (i = 0; i < 4; i++) {
+ OUT_PKT0(ring, REG_A3XX_RB_MRT_CONTROL(i), 1);
+ OUT_RING(ring, A3XX_RB_MRT_CONTROL_ROP_CODE(0) |
+ A3XX_RB_MRT_CONTROL_DITHER_MODE(DITHER_DISABLE) |
+ A3XX_RB_MRT_CONTROL_COMPONENT_ENABLE(0));
+ }
+
+ OUT_PKT0(ring, REG_A3XX_PC_VSTREAM_CONTROL, 1);
+ OUT_RING(ring, A3XX_PC_VSTREAM_CONTROL_SIZE(1) |
+ A3XX_PC_VSTREAM_CONTROL_N(0));
+
+ /* emit IB to binning drawcmds: */
+ OUT_IB(ring, ctx->binning_start, ctx->binning_end);
+
+ /* and then put stuff back the way it was: */
+
+ OUT_PKT0(ring, REG_A3XX_VSC_BIN_CONTROL, 1);
+ OUT_RING(ring, 0x00000000);
+
+ OUT_PKT0(ring, REG_A3XX_SP_SP_CTRL_REG, 1);
+ OUT_RING(ring, A3XX_SP_SP_CTRL_REG_RESOLVE |
+ A3XX_SP_SP_CTRL_REG_CONSTMODE(1) |
+ A3XX_SP_SP_CTRL_REG_SLEEPMODE(1) |
+ A3XX_SP_SP_CTRL_REG_L0MODE(0));
+
+ OUT_PKT0(ring, REG_A3XX_RB_LRZ_VSC_CONTROL, 1);
+ OUT_RING(ring, 0x00000000);
+
+ OUT_PKT0(ring, REG_A3XX_GRAS_SC_CONTROL, 1);
+ OUT_RING(ring, A3XX_GRAS_SC_CONTROL_RENDER_MODE(RB_RENDERING_PASS) |
+ A3XX_GRAS_SC_CONTROL_MSAA_SAMPLES(MSAA_ONE) |
+ A3XX_GRAS_SC_CONTROL_RASTER_MODE(0));
+
+ OUT_PKT0(ring, REG_A3XX_RB_MODE_CONTROL, 2);
OUT_RING(ring, A3XX_RB_MODE_CONTROL_RENDER_MODE(RB_RENDERING_PASS) |
- A3XX_RB_MODE_CONTROL_GMEM_BYPASS |
A3XX_RB_MODE_CONTROL_MARB_CACHE_SPLIT_MODE);
+ OUT_RING(ring, A3XX_RB_RENDER_CONTROL_ENABLE_GMEM |
+ A3XX_RB_RENDER_CONTROL_ALPHA_TEST_FUNC(FUNC_NEVER) |
+ A3XX_RB_RENDER_CONTROL_BIN_WIDTH(ctx->gmem.bin_w));
+
+ OUT_PKT3(ring, CP_EVENT_WRITE, 1);
+ OUT_RING(ring, CACHE_FLUSH);
+
+ if (ctx->screen->gpu_id == 320) {
+ /* dummy-draw workaround: */
+ OUT_PKT3(ring, CP_DRAW_INDX, 3);
+ OUT_RING(ring, 0x00000000);
+ OUT_RING(ring, DRAW(1, DI_SRC_SEL_AUTO_INDEX,
+ INDEX_SIZE_IGN, IGNORE_VISIBILITY));
+ OUT_RING(ring, 0); /* NumIndices */
+ }
+
+ OUT_PKT3(ring, CP_NOP, 4);
+ OUT_RING(ring, 0x00000000);
+ OUT_RING(ring, 0x00000000);
+ OUT_RING(ring, 0x00000000);
+ OUT_RING(ring, 0x00000000);
+
+ OUT_WFI(ring);
+
+ if (ctx->screen->gpu_id == 320) {
+ emit_binning_workaround(ctx);
+ }
}
/* before first tile */
A3XX_VSC_BIN_SIZE_HEIGHT(gmem->bin_h));
update_vsc_pipe(ctx);
+
+ if (use_hw_binning(ctx)) {
+ /* mark the end of the binning cmds: */
+ fd_ringmarker_mark(ctx->binning_end);
+
+ /* emit hw binning pass: */
+ emit_binning_pass(ctx);
+
+ patch_draws(ctx, USE_VISIBILITY);
+ } else {
+ patch_draws(ctx, IGNORE_VISIBILITY);
+ }
}
/* before mem2gmem */
struct fd_gmem_stateobj *gmem = &ctx->gmem;
uint32_t reg;
-
OUT_PKT0(ring, REG_A3XX_RB_DEPTH_INFO, 2);
reg = A3XX_RB_DEPTH_INFO_DEPTH_BASE(depth_base(gmem));
if (pfb->zsbuf) {
static void
fd3_emit_tile_renderprep(struct fd_context *ctx, struct fd_tile *tile)
{
+ struct fd3_context *fd3_ctx = fd3_context(ctx);
struct fd_ringbuffer *ring = ctx->ring;
struct fd_gmem_stateobj *gmem = &ctx->gmem;
struct pipe_framebuffer_state *pfb = &ctx->framebuffer;
uint32_t x2 = tile->xoff + tile->bin_w - 1;
uint32_t y2 = tile->yoff + tile->bin_h - 1;
+ if (use_hw_binning(ctx)) {
+ struct fd_vsc_pipe *pipe = &ctx->pipe[tile->p];
+
+ assert(pipe->w * pipe->h);
+
+ OUT_PKT3(ring, CP_EVENT_WRITE, 1);
+ OUT_RING(ring, HLSQ_FLUSH);
+
+ OUT_WFI(ring);
+
+ OUT_PKT0(ring, REG_A3XX_PC_VSTREAM_CONTROL, 1);
+ OUT_RING(ring, A3XX_PC_VSTREAM_CONTROL_SIZE(pipe->w * pipe->h) |
+ A3XX_PC_VSTREAM_CONTROL_N(tile->n));
+
+ OUT_PKT3(ring, CP_EVENT_WRITE, 1);
+ OUT_RING(ring, CACHE_FLUSH);
+
+ OUT_PKT3(ring, CP_SET_BIN_DATA, 2);
+ OUT_RELOC(ring, pipe->bo, 0, 0, 0); /* BIN_DATA_ADDR <- VSC_PIPE[p].DATA_ADDRESS */
+ OUT_RELOC(ring, fd3_ctx->vsc_size_mem, /* BIN_SIZE_ADDR <- VSC_SIZE_ADDRESS + (p * 4) */
+ (tile->p * 4), 0, 0);
+ } else {
+ OUT_PKT0(ring, REG_A3XX_PC_VSTREAM_CONTROL, 1);
+ OUT_RING(ring, 0x00000000);
+ }
+
OUT_PKT3(ring, CP_SET_BIN, 3);
OUT_RING(ring, 0x00000000);
OUT_RING(ring, CP_SET_BIN_1_X1(x1) | CP_SET_BIN_1_Y1(y1));
#include "fd3_program.h"
#include "fd3_compiler.h"
+#include "fd3_emit.h"
#include "fd3_texture.h"
#include "fd3_util.h"
}
static void
-emit_shader(struct fd_ringbuffer *ring, struct fd3_shader_stateobj *so)
+emit_shader(struct fd_ringbuffer *ring, const struct fd3_shader_stateobj *so)
{
- struct ir3_shader_info *si = &so->info;
+ const struct ir3_shader_info *si = &so->info;
enum adreno_state_block sb;
enum adreno_state_src src;
uint32_t i, sz, *bin;
}
static int
-find_output(struct fd3_shader_stateobj *so, fd3_semantic semantic)
+find_output(const struct fd3_shader_stateobj *so, fd3_semantic semantic)
{
int j;
for (j = 0; j < so->outputs_count; j++)
void
fd3_program_emit(struct fd_ringbuffer *ring,
- struct fd_program_stateobj *prog)
+ struct fd_program_stateobj *prog, bool binning)
{
- struct fd3_shader_stateobj *vp = prog->vp;
- struct fd3_shader_stateobj *fp = prog->fp;
- struct ir3_shader_info *vsi = &vp->info;
- struct ir3_shader_info *fsi = &fp->info;
+ const struct fd3_shader_stateobj *vp = prog->vp;
+ const struct fd3_shader_stateobj *fp = prog->fp;
+ const struct ir3_shader_info *vsi = &vp->info;
+ const struct ir3_shader_info *fsi = &fp->info;
int i;
+ if (binning) {
+ /* use dummy stateobj to simplify binning vs non-binning: */
+ static const struct fd3_shader_stateobj binning_fp = {};
+ fp = &binning_fp;
+ fsi = &fp->info;
+ }
+
/* we could probably divide this up into things that need to be
* emitted if frag-prog is dirty vs if vert-prog is dirty..
*/
OUT_PKT0(ring, REG_A3XX_SP_SP_CTRL_REG, 1);
OUT_RING(ring, A3XX_SP_SP_CTRL_REG_CONSTMODE(0) |
+ COND(binning, A3XX_SP_SP_CTRL_REG_BINNING) |
A3XX_SP_SP_CTRL_REG_SLEEPMODE(1) |
- // XXX "resolve" (?) bit set on gmem->mem pass..
-// COND(!uniforms, A3XX_SP_SP_CTRL_REG_RESOLVE) |
- // XXX sometimes 0, sometimes 1:
- A3XX_SP_SP_CTRL_REG_LOMODE(1));
+ A3XX_SP_SP_CTRL_REG_L0MODE(0));
OUT_PKT0(ring, REG_A3XX_SP_VS_LENGTH_REG, 1);
OUT_RING(ring, A3XX_SP_VS_LENGTH_REG_SHADERLENGTH(vp->instrlen));
OUT_PKT0(ring, REG_A3XX_SP_VS_CTRL_REG0, 3);
OUT_RING(ring, A3XX_SP_VS_CTRL_REG0_THREADMODE(MULTI) |
A3XX_SP_VS_CTRL_REG0_INSTRBUFFERMODE(BUFFER) |
+ A3XX_SP_VS_CTRL_REG0_CACHEINVALID |
A3XX_SP_VS_CTRL_REG0_HALFREGFOOTPRINT(vsi->max_half_reg + 1) |
A3XX_SP_VS_CTRL_REG0_FULLREGFOOTPRINT(vsi->max_reg + 1) |
A3XX_SP_VS_CTRL_REG0_INOUTREGOVERLAP(0) |
A3XX_SP_VS_OBJ_OFFSET_REG_SHADEROBJOFFSET(0));
OUT_RELOC(ring, vp->bo, 0, 0, 0); /* SP_VS_OBJ_START_REG */
- OUT_PKT0(ring, REG_A3XX_SP_FS_LENGTH_REG, 1);
- OUT_RING(ring, A3XX_SP_FS_LENGTH_REG_SHADERLENGTH(fp->instrlen));
-
- OUT_PKT0(ring, REG_A3XX_SP_FS_CTRL_REG0, 2);
- OUT_RING(ring, A3XX_SP_FS_CTRL_REG0_THREADMODE(MULTI) |
- A3XX_SP_FS_CTRL_REG0_INSTRBUFFERMODE(BUFFER) |
- A3XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT(fsi->max_half_reg + 1) |
- A3XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(fsi->max_reg + 1) |
- A3XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP(1) |
- A3XX_SP_FS_CTRL_REG0_THREADSIZE(FOUR_QUADS) |
- A3XX_SP_FS_CTRL_REG0_SUPERTHREADMODE |
- COND(fp->samplers_count > 0, A3XX_SP_FS_CTRL_REG0_PIXLODENABLE) |
- A3XX_SP_FS_CTRL_REG0_LENGTH(fp->instrlen));
- OUT_RING(ring, A3XX_SP_FS_CTRL_REG1_CONSTLENGTH(fp->constlen) |
- A3XX_SP_FS_CTRL_REG1_INITIALOUTSTANDING(fp->total_in) |
- A3XX_SP_FS_CTRL_REG1_CONSTFOOTPRINT(MAX2(fsi->max_const, 0)) |
- A3XX_SP_FS_CTRL_REG1_HALFPRECVAROFFSET(63));
-
- OUT_PKT0(ring, REG_A3XX_SP_FS_OBJ_OFFSET_REG, 2);
- OUT_RING(ring, A3XX_SP_FS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(128) |
- A3XX_SP_FS_OBJ_OFFSET_REG_SHADEROBJOFFSET(0));
- OUT_RELOC(ring, fp->bo, 0, 0, 0); /* SP_FS_OBJ_START_REG */
+ if (binning) {
+ OUT_PKT0(ring, REG_A3XX_SP_FS_LENGTH_REG, 1);
+ OUT_RING(ring, 0x00000000);
+
+ OUT_PKT0(ring, REG_A3XX_SP_FS_CTRL_REG0, 2);
+ OUT_RING(ring, A3XX_SP_FS_CTRL_REG0_THREADMODE(MULTI) |
+ A3XX_SP_FS_CTRL_REG0_INSTRBUFFERMODE(BUFFER));
+ OUT_RING(ring, 0x00000000);
+ } else {
+ OUT_PKT0(ring, REG_A3XX_SP_FS_LENGTH_REG, 1);
+ OUT_RING(ring, A3XX_SP_FS_LENGTH_REG_SHADERLENGTH(fp->instrlen));
+
+ OUT_PKT0(ring, REG_A3XX_SP_FS_CTRL_REG0, 2);
+ OUT_RING(ring, A3XX_SP_FS_CTRL_REG0_THREADMODE(MULTI) |
+ A3XX_SP_FS_CTRL_REG0_INSTRBUFFERMODE(BUFFER) |
+ A3XX_SP_FS_CTRL_REG0_CACHEINVALID |
+ A3XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT(fsi->max_half_reg + 1) |
+ A3XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(fsi->max_reg + 1) |
+ A3XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP(1) |
+ A3XX_SP_FS_CTRL_REG0_THREADSIZE(FOUR_QUADS) |
+ A3XX_SP_FS_CTRL_REG0_SUPERTHREADMODE |
+ COND(fp->samplers_count > 0, A3XX_SP_FS_CTRL_REG0_PIXLODENABLE) |
+ A3XX_SP_FS_CTRL_REG0_LENGTH(fp->instrlen));
+ OUT_RING(ring, A3XX_SP_FS_CTRL_REG1_CONSTLENGTH(fp->constlen) |
+ A3XX_SP_FS_CTRL_REG1_INITIALOUTSTANDING(fp->total_in) |
+ A3XX_SP_FS_CTRL_REG1_CONSTFOOTPRINT(MAX2(fsi->max_const, 0)) |
+ A3XX_SP_FS_CTRL_REG1_HALFPRECVAROFFSET(63));
+ OUT_PKT0(ring, REG_A3XX_SP_FS_OBJ_OFFSET_REG, 2);
+ OUT_RING(ring, A3XX_SP_FS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(128) |
+ A3XX_SP_FS_OBJ_OFFSET_REG_SHADEROBJOFFSET(0));
+ OUT_RELOC(ring, fp->bo, 0, 0, 0); /* SP_FS_OBJ_START_REG */
+ }
OUT_PKT0(ring, REG_A3XX_SP_FS_FLAT_SHAD_MODE_REG_0, 2);
OUT_RING(ring, 0x00000000); /* SP_FS_FLAT_SHAD_MODE_REG_0 */
OUT_RING(ring, A3XX_SP_FS_MRT_REG_REGID(0));
OUT_RING(ring, A3XX_SP_FS_MRT_REG_REGID(0));
- OUT_PKT0(ring, REG_A3XX_VPC_ATTR, 2);
- OUT_RING(ring, A3XX_VPC_ATTR_TOTALATTR(fp->total_in) |
- A3XX_VPC_ATTR_THRDASSIGN(1) |
- A3XX_VPC_ATTR_LMSIZE(1));
- OUT_RING(ring, A3XX_VPC_PACK_NUMFPNONPOSVAR(fp->total_in) |
- A3XX_VPC_PACK_NUMNONPOSVSVAR(fp->total_in));
-
- OUT_PKT0(ring, REG_A3XX_VPC_VARYING_INTERP_MODE(0), 4);
- OUT_RING(ring, fp->vinterp[0]); /* VPC_VARYING_INTERP[0].MODE */
- OUT_RING(ring, fp->vinterp[1]); /* VPC_VARYING_INTERP[1].MODE */
- OUT_RING(ring, fp->vinterp[2]); /* VPC_VARYING_INTERP[2].MODE */
- OUT_RING(ring, fp->vinterp[3]); /* VPC_VARYING_INTERP[3].MODE */
-
- OUT_PKT0(ring, REG_A3XX_VPC_VARYING_PS_REPL_MODE(0), 4);
- OUT_RING(ring, fp->vpsrepl[0]); /* VPC_VARYING_PS_REPL[0].MODE */
- OUT_RING(ring, fp->vpsrepl[1]); /* VPC_VARYING_PS_REPL[1].MODE */
- OUT_RING(ring, fp->vpsrepl[2]); /* VPC_VARYING_PS_REPL[2].MODE */
- OUT_RING(ring, fp->vpsrepl[3]); /* VPC_VARYING_PS_REPL[3].MODE */
+ if (binning) {
+ OUT_PKT0(ring, REG_A3XX_VPC_ATTR, 2);
+ OUT_RING(ring, A3XX_VPC_ATTR_THRDASSIGN(1) |
+ A3XX_VPC_ATTR_LMSIZE(1));
+ OUT_RING(ring, 0x00000000);
+ } else {
+ OUT_PKT0(ring, REG_A3XX_VPC_ATTR, 2);
+ OUT_RING(ring, A3XX_VPC_ATTR_TOTALATTR(fp->total_in) |
+ A3XX_VPC_ATTR_THRDASSIGN(1) |
+ A3XX_VPC_ATTR_LMSIZE(1));
+ OUT_RING(ring, A3XX_VPC_PACK_NUMFPNONPOSVAR(fp->total_in) |
+ A3XX_VPC_PACK_NUMNONPOSVSVAR(fp->total_in));
+
+ OUT_PKT0(ring, REG_A3XX_VPC_VARYING_INTERP_MODE(0), 4);
+ OUT_RING(ring, fp->vinterp[0]); /* VPC_VARYING_INTERP[0].MODE */
+ OUT_RING(ring, fp->vinterp[1]); /* VPC_VARYING_INTERP[1].MODE */
+ OUT_RING(ring, fp->vinterp[2]); /* VPC_VARYING_INTERP[2].MODE */
+ OUT_RING(ring, fp->vinterp[3]); /* VPC_VARYING_INTERP[3].MODE */
+
+ OUT_PKT0(ring, REG_A3XX_VPC_VARYING_PS_REPL_MODE(0), 4);
+ OUT_RING(ring, fp->vpsrepl[0]); /* VPC_VARYING_PS_REPL[0].MODE */
+ OUT_RING(ring, fp->vpsrepl[1]); /* VPC_VARYING_PS_REPL[1].MODE */
+ OUT_RING(ring, fp->vpsrepl[2]); /* VPC_VARYING_PS_REPL[2].MODE */
+ OUT_RING(ring, fp->vpsrepl[3]); /* VPC_VARYING_PS_REPL[3].MODE */
+ }
OUT_PKT0(ring, REG_A3XX_VFD_VS_THREADING_THRESHOLD, 1);
OUT_RING(ring, A3XX_VFD_VS_THREADING_THRESHOLD_REGID_THRESHOLD(15) |
OUT_PKT0(ring, REG_A3XX_VFD_PERFCOUNTER0_SELECT, 1);
OUT_RING(ring, 0x00000000); /* VFD_PERFCOUNTER0_SELECT */
- emit_shader(ring, fp);
+ if (!binning) {
+ emit_shader(ring, fp);
- OUT_PKT0(ring, REG_A3XX_VFD_PERFCOUNTER0_SELECT, 1);
- OUT_RING(ring, 0x00000000); /* VFD_PERFCOUNTER0_SELECT */
+ OUT_PKT0(ring, REG_A3XX_VFD_PERFCOUNTER0_SELECT, 1);
+ OUT_RING(ring, 0x00000000); /* VFD_PERFCOUNTER0_SELECT */
+ }
OUT_PKT0(ring, REG_A3XX_VFD_CONTROL_0, 2);
OUT_RING(ring, A3XX_VFD_CONTROL_0_TOTALATTRTOVS(vp->total_in) |
};
void fd3_program_emit(struct fd_ringbuffer *ring,
- struct fd_program_stateobj *prog);
+ struct fd_program_stateobj *prog, bool binning);
void fd3_prog_init(struct pipe_context *pctx);
void fd3_prog_fini(struct pipe_context *pctx);
#include "freedreno_gmem.h"
#include "freedreno_util.h"
-static void
-fd_context_next_rb(struct pipe_context *pctx)
+static struct fd_ringbuffer *next_rb(struct fd_context *ctx)
{
- struct fd_context *ctx = fd_context(pctx);
struct fd_ringbuffer *ring;
uint32_t ts;
- fd_ringmarker_del(ctx->draw_start);
- fd_ringmarker_del(ctx->draw_end);
-
/* grab next ringbuffer: */
ring = ctx->rings[(ctx->rings_idx++) % ARRAY_SIZE(ctx->rings)];
fd_ringbuffer_reset(ring);
+ return ring;
+}
+
+static void
+fd_context_next_rb(struct pipe_context *pctx)
+{
+ struct fd_context *ctx = fd_context(pctx);
+ struct fd_ringbuffer *ring;
+
+ fd_ringmarker_del(ctx->draw_start);
+ fd_ringmarker_del(ctx->draw_end);
+
+ ring = next_rb(ctx);
+
ctx->draw_start = fd_ringmarker_new(ring);
ctx->draw_end = fd_ringmarker_new(ring);
+ fd_ringbuffer_set_parent(ring, NULL);
ctx->ring = ring;
+
+ fd_ringmarker_del(ctx->binning_start);
+ fd_ringmarker_del(ctx->binning_end);
+
+ ring = next_rb(ctx);
+
+ ctx->binning_start = fd_ringmarker_new(ring);
+ ctx->binning_end = fd_ringmarker_new(ring);
+
+ fd_ringbuffer_set_parent(ring, ctx->ring);
+ ctx->binning_ring = ring;
}
/* emit accumulated render cmds, needed for example if render target has
DBG("");
+ util_slab_destroy(&ctx->transfer_pool);
+
+ util_dynarray_fini(&ctx->draw_patches);
+
if (ctx->blitter)
util_blitter_destroy(ctx->blitter);
fd_ringmarker_del(ctx->draw_start);
fd_ringmarker_del(ctx->draw_end);
- fd_ringbuffer_del(ctx->ring);
+ fd_ringmarker_del(ctx->binning_start);
+ fd_ringmarker_del(ctx->binning_end);
+
+ for (i = 0; i < ARRAY_SIZE(ctx->rings); i++)
+ fd_ringbuffer_del(ctx->rings[i]);
for (i = 0; i < ARRAY_SIZE(ctx->pipe); i++) {
struct fd_vsc_pipe *pipe = &ctx->pipe[i];
fd_context_next_rb(pctx);
fd_reset_rmw_state(ctx);
+ util_dynarray_init(&ctx->draw_patches);
+
util_slab_create(&ctx->transfer_pool, sizeof(struct pipe_transfer),
16, UTIL_SLAB_SINGLETHREADED);
*/
enum {
/* align bitmask values w/ PIPE_CLEAR_*.. since that is convenient.. */
- FD_BUFFER_COLOR = PIPE_CLEAR_COLOR,
+ FD_BUFFER_COLOR = PIPE_CLEAR_COLOR0,
FD_BUFFER_DEPTH = PIPE_CLEAR_DEPTH,
FD_BUFFER_STENCIL = PIPE_CLEAR_STENCIL,
FD_BUFFER_ALL = FD_BUFFER_COLOR | FD_BUFFER_DEPTH | FD_BUFFER_STENCIL,
struct fd_ringbuffer *rings[4];
unsigned rings_idx;
+ /* normal draw/clear cmds: */
struct fd_ringbuffer *ring;
struct fd_ringmarker *draw_start, *draw_end;
+ /* binning pass draw/clear cmds: */
+ struct fd_ringbuffer *binning_ring;
+ struct fd_ringmarker *binning_start, *binning_end;
+
/* Keep track if WAIT_FOR_IDLE is needed for registers we need
* to update via RMW:
*/
uint32_t rbrc_draw;
} rmw;
+ /* Keep track of DRAW initiators that need to be patched up depending
+  * on whether or not we're using binning:
+  */
+ struct util_dynarray draw_patches;
+
struct pipe_scissor_state scissor;
/* we don't have a disable/enable bit for scissor, so instead we keep
/* this is same for a2xx/a3xx, so split into helper: */
void
-fd_draw_emit(struct fd_context *ctx, const struct pipe_draw_info *info)
+fd_draw_emit(struct fd_context *ctx, struct fd_ringbuffer *ring,
+ enum pc_di_vis_cull_mode vismode,
+ const struct pipe_draw_info *info)
{
struct pipe_index_buffer *idx = &ctx->indexbuf;
struct fd_bo *idx_bo = NULL;
src_sel = DI_SRC_SEL_AUTO_INDEX;
}
- fd_draw(ctx, ctx->primtypes[info->mode], src_sel, info->count,
- idx_type, idx_size, idx_offset, idx_bo);
+ fd_draw(ctx, ring, ctx->primtypes[info->mode], vismode, src_sel,
+ info->count, idx_type, idx_size, idx_offset, idx_bo);
}
static void
ctx->clear(ctx, buffers, color, depth, stencil);
ctx->dirty |= FD_DIRTY_ZSA |
+ FD_DIRTY_VIEWPORT |
FD_DIRTY_RASTERIZER |
FD_DIRTY_SAMPLE_MASK |
FD_DIRTY_PROG |
struct fd_ringbuffer;
-void fd_draw_emit(struct fd_context *ctx, const struct pipe_draw_info *info);
+void fd_draw_emit(struct fd_context *ctx, struct fd_ringbuffer *ring,
+ enum pc_di_vis_cull_mode vismode,
+ const struct pipe_draw_info *info);
void fd_draw_init(struct pipe_context *pctx);
static inline void
-fd_draw(struct fd_context *ctx, enum pc_di_primtype primtype,
+fd_draw(struct fd_context *ctx, struct fd_ringbuffer *ring,
+ enum pc_di_primtype primtype,
+ enum pc_di_vis_cull_mode vismode,
enum pc_di_src_sel src_sel, uint32_t count,
enum pc_di_index_size idx_type,
uint32_t idx_size, uint32_t idx_offset,
struct fd_bo *idx_bo)
{
- struct fd_ringbuffer *ring = ctx->ring;
-
/* for debug after a lock up, write a unique counter value
* to scratch7 for each draw, to make it easier to match up
* register dumps to cmdstream. The combination of IB
OUT_PKT3(ring, CP_DRAW_INDX, 3);
OUT_RING(ring, 0x00000000);
OUT_RING(ring, DRAW(1, DI_SRC_SEL_AUTO_INDEX,
- INDEX_SIZE_IGN, IGNORE_VISIBILITY));
+ INDEX_SIZE_IGN, USE_VISIBILITY));
OUT_RING(ring, 0); /* NumIndices */
/* ugg, hard-code register offset to avoid pulling in the
OUT_PKT3(ring, CP_DRAW_INDX, idx_bo ? 5 : 3);
OUT_RING(ring, 0x00000000); /* viz query info. */
- OUT_RING(ring, DRAW(primtype, src_sel,
- idx_type, IGNORE_VISIBILITY));
+ if (vismode == USE_VISIBILITY) {
+ /* leave vis mode blank for now; it will be patched up later once
+  * we know whether or not we are binning:
+  */
+ OUT_RINGP(ring, DRAW(primtype, src_sel, idx_type, 0),
+ &ctx->draw_patches);
+ } else {
+ OUT_RING(ring, DRAW(primtype, src_sel, idx_type, vismode));
+ }
OUT_RING(ring, count); /* NumIndices */
if (idx_bo) {
OUT_RELOC(ring, idx_bo, idx_offset, 0, 0);
uint32_t bin_w, bin_h;
uint32_t max_width = bin_width(ctx);
uint32_t cpp = 4;
- uint32_t i, j, t, p, n, xoff, yoff;
+ uint32_t i, j, t, xoff, yoff;
+ uint32_t tpp_x, tpp_y;
bool has_zs = !!(ctx->resolve & (FD_BUFFER_DEPTH | FD_BUFFER_STENCIL));
if (pfb->cbufs[0])
gmem->width = width;
gmem->height = height;
- /* Assign tiles and pipes:
- * NOTE we currently take a rather simplistic approach of
- * mapping rows of tiles to a pipe. At some point it might
- * be worth playing with different strategies and seeing if
- * that makes much impact on performance.
+ /*
+ * Assign tiles and pipes:
+ *
+ * At some point it might be worth playing with different
+ * strategies and seeing if that makes much impact on
+ * performance.
*/
- t = p = n = 0;
+
+#define div_round_up(v, a) (((v) + (a) - 1) / (a))
+ /* figure out number of tiles per pipe: */
+ tpp_x = tpp_y = 1;
+ while (div_round_up(nbins_y, tpp_y) > 8)
+ tpp_y += 2;
+ while ((div_round_up(nbins_y, tpp_y) *
+ div_round_up(nbins_x, tpp_x)) > 8)
+ tpp_x += 1;
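+ /* e.g. a (hypothetical) 5x7 grid of bins would end up with tpp=5x1:
+  * seven pipes, each covering one 5x1 row of bins:
+  */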
+
+ /* configure pipes: */
+ xoff = yoff = 0;
+ for (i = 0; i < ARRAY_SIZE(ctx->pipe); i++) {
+ struct fd_vsc_pipe *pipe = &ctx->pipe[i];
+
+ if (xoff >= nbins_x) {
+ xoff = 0;
+ yoff += tpp_y;
+ }
+
+ if (yoff >= nbins_y) {
+ break;
+ }
+
+ pipe->x = xoff;
+ pipe->y = yoff;
+ pipe->w = MIN2(tpp_x, nbins_x - xoff);
+ pipe->h = MIN2(tpp_y, nbins_y - yoff);
+
+ xoff += tpp_x;
+ }
+
+ for (; i < ARRAY_SIZE(ctx->pipe); i++) {
+ struct fd_vsc_pipe *pipe = &ctx->pipe[i];
+ pipe->x = pipe->y = pipe->w = pipe->h = 0;
+ }
+
+#if 0 /* debug */
+ printf("%dx%d ... tpp=%dx%d\n", nbins_x, nbins_y, tpp_x, tpp_y);
+ for (i = 0; i < 8; i++) {
+ struct fd_vsc_pipe *pipe = &ctx->pipe[i];
+ printf("pipe[%d]: %ux%u @ %u,%u\n", i,
+ pipe->w, pipe->h, pipe->x, pipe->y);
+ }
+#endif
+
+ /* configure tiles: */
+ t = 0;
yoff = miny;
for (i = 0; i < nbins_y; i++) {
- struct fd_vsc_pipe *pipe = &ctx->pipe[p];
uint32_t bw, bh;
- assert(p < ARRAY_SIZE(ctx->pipe));
-
xoff = minx;
/* clip bin height: */
for (j = 0; j < nbins_x; j++) {
struct fd_tile *tile = &ctx->tile[t];
+ uint32_t n, p;
assert(t < ARRAY_SIZE(ctx->tile));
+ /* pipe number: */
+ p = ((i / tpp_y) * div_round_up(nbins_x, tpp_x)) + (j / tpp_x);
+
+ /* slot number: */
+ n = ((i % tpp_y) * tpp_x) + (j % tpp_x);
+
/* clip bin width: */
bw = MIN2(bin_w, minx + width - xoff);
- tile->n = n++;
+ tile->n = n;
tile->p = p;
tile->bin_w = bw;
tile->bin_h = bh;
xoff += bw;
}
- /* one pipe per row: */
- pipe->x = 0;
- pipe->y = i;
- pipe->w = nbins_x;
- pipe->h = 1;
-
- p++;
- n = 0;
-
yoff += bh;
}
- for (; p < ARRAY_SIZE(ctx->pipe); p++) {
- struct fd_vsc_pipe *pipe = &ctx->pipe[p];
- pipe->x = pipe->y = pipe->w = pipe->h = 0;
+#if 0 /* debug */
+ t = 0;
+ for (i = 0; i < nbins_y; i++) {
+ for (j = 0; j < nbins_x; j++) {
+ struct fd_tile *tile = &ctx->tile[t++];
+ printf("|p:%u n:%u|", tile->p, tile->n);
+ }
+ printf("\n");
}
+#endif
}
static void
/* mark the end of the clear/draw cmds before emitting per-tile cmds: */
fd_ringmarker_mark(ctx->draw_end);
+ fd_ringmarker_mark(ctx->binning_end);
if (sysmem) {
DBG("rendering sysmem (%s/%s)",
/* GPU executes starting from tile cmds, which IB back to draw cmds: */
fd_ringmarker_flush(ctx->draw_end);
- /* mark start for next draw cmds: */
+ /* mark start for next draw/binning cmds: */
fd_ringmarker_mark(ctx->draw_start);
+ fd_ringmarker_mark(ctx->binning_start);
fd_reset_rmw_state(ctx);
{"direct", FD_DBG_DIRECT, "Force inline (SS_DIRECT) state loads"},
{"dbypass", FD_DBG_DBYPASS,"Disable GMEM bypass"},
{"fraghalf", FD_DBG_FRAGHALF, "Use half-precision in fragment shader"},
+ {"binning", FD_DBG_BINNING, "Enable hw binning"},
+ {"dbinning", FD_DBG_DBINNING, "Disable hw binning"},
DEBUG_NAMED_VALUE_END
};
DEBUG_GET_ONCE_FLAGS_OPTION(fd_mesa_debug, "FD_MESA_DEBUG", debug_options, 0)
int fd_mesa_debug = 0;
+bool fd_binning_enabled = false; /* default to off for now */
static const char *
fd_screen_get_name(struct pipe_screen *pscreen)
fd_mesa_debug = debug_get_option_fd_mesa_debug();
+ if (fd_mesa_debug & FD_DBG_BINNING)
+ fd_binning_enabled = true;
+
+ if (fd_mesa_debug & FD_DBG_DBINNING)
+ fd_binning_enabled = false;
+
if (!screen)
return NULL;
#include "util/u_debug.h"
#include "util/u_math.h"
#include "util/u_half.h"
+#include "util/u_dynarray.h"
#include "adreno_common.xml.h"
#include "adreno_pm4.xml.h"
/* TBD if it is same on a2xx, but for now: */
#define MAX_MIP_LEVELS A3XX_MAX_MIP_LEVELS
-#define FD_DBG_MSGS 0x01
-#define FD_DBG_DISASM 0x02
-#define FD_DBG_DCLEAR 0x04
-#define FD_DBG_DGMEM 0x08
-#define FD_DBG_DSCIS 0x10
-#define FD_DBG_DIRECT 0x20
-#define FD_DBG_DBYPASS 0x40
-#define FD_DBG_FRAGHALF 0x80
+#define FD_DBG_MSGS 0x0001
+#define FD_DBG_DISASM 0x0002
+#define FD_DBG_DCLEAR 0x0004
+#define FD_DBG_DGMEM 0x0008
+#define FD_DBG_DSCIS 0x0010
+#define FD_DBG_DIRECT 0x0020
+#define FD_DBG_DBYPASS 0x0040
+#define FD_DBG_FRAGHALF 0x0080
+#define FD_DBG_BINNING 0x0100
+#define FD_DBG_DBINNING 0x0200
extern int fd_mesa_debug;
+extern bool fd_binning_enabled;
#define DBG(fmt, ...) \
do { if (fd_mesa_debug & FD_DBG_MSGS) \
(1 << 14);
}
+/* for tracking cmdstream positions that need to be patched: */
+struct fd_cs_patch {
+ uint32_t *cs;
+ uint32_t val;
+};
+#define fd_patch_num_elements(buf) ((buf)->size / sizeof(struct fd_cs_patch))
+#define fd_patch_element(buf, i) util_dynarray_element(buf, struct fd_cs_patch, i)
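+/* (patch points are recorded via OUT_RINGP() below, and consumed by
+ * per-generation code, e.g. patch_draws(), at flush time) */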
static inline enum pipe_format
pipe_surface_format(struct pipe_surface *psurf)
*(ring->cur++) = data;
}
+/* like OUT_RING() but appends a cmdstream patch point to 'buf' */
+static inline void
+OUT_RINGP(struct fd_ringbuffer *ring, uint32_t data,
+ struct util_dynarray *buf)
+{
+ if (LOG_DWORDS) {
+ DBG("ring[%p]: OUT_RINGP %04x: %08x", ring,
+ (uint32_t)(ring->cur - ring->last_start), data);
+ }
+ util_dynarray_append(buf, struct fd_cs_patch, ((struct fd_cs_patch){
+ .cs = ring->cur++,
+ .val = data,
+ }));
+}
+
static inline void
OUT_RELOC(struct fd_ringbuffer *ring, struct fd_bo *bo,
uint32_t offset, uint32_t or, int32_t shift)
uint32_t offset, uint32_t or, int32_t shift)
{
if (LOG_DWORDS) {
- DBG("ring[%p]: OUT_RELOC %04x: %p+%u << %d", ring,
+ DBG("ring[%p]: OUT_RELOCW %04x: %p+%u << %d", ring,
(uint32_t)(ring->cur - ring->last_start), bo, offset, shift);
}
fd_ringbuffer_reloc(ring, &(struct fd_reloc){