X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;ds=sidebyside;f=src%2Fgallium%2Fdrivers%2Ffreedreno%2Ffreedreno_gmem.c;h=4040d1f76152f6862188cdb9d639020394ee561c;hb=f24e910da40c7c813b7da009269cd994cf6ff375;hp=52b637629f2bd5c8c4db8bf103fc2e09e800d828;hpb=afc1b7c21f795c1bf18f1edc376624011e2dfd7c;p=mesa.git diff --git a/src/gallium/drivers/freedreno/freedreno_gmem.c b/src/gallium/drivers/freedreno/freedreno_gmem.c index 52b637629f2..4040d1f7615 100644 --- a/src/gallium/drivers/freedreno/freedreno_gmem.c +++ b/src/gallium/drivers/freedreno/freedreno_gmem.c @@ -30,14 +30,12 @@ #include "util/u_string.h" #include "util/u_memory.h" #include "util/u_inlines.h" -#include "util/u_pack_color.h" +#include "util/u_format.h" #include "freedreno_gmem.h" #include "freedreno_context.h" -#include "freedreno_state.h" -#include "freedreno_program.h" #include "freedreno_resource.h" -#include "freedreno_zsa.h" +#include "freedreno_query_hw.h" #include "freedreno_util.h" /* @@ -69,423 +67,365 @@ * resolve. */ -/* transfer from gmem to system memory (ie. 
normal RAM) */ - -static void -emit_gmem2mem_surf(struct fd_ringbuffer *ring, uint32_t swap, uint32_t base, - struct pipe_surface *psurf) +static uint32_t bin_width(struct fd_context *ctx) { - struct fd_resource *rsc = fd_resource(psurf->texture); - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_RB_COLOR_INFO)); - OUT_RING(ring, RB_COLOR_INFO_COLOR_SWAP(swap) | - RB_COLOR_INFO_COLOR_BASE(base / 1024) | - RB_COLOR_INFO_COLOR_FORMAT(fd_pipe2color(psurf->format))); - - OUT_PKT3(ring, CP_SET_CONSTANT, 5); - OUT_RING(ring, CP_REG(REG_RB_COPY_CONTROL)); - OUT_RING(ring, 0x00000000); /* RB_COPY_CONTROL */ - OUT_RELOC(ring, rsc->bo, 0, 0); /* RB_COPY_DEST_BASE */ - OUT_RING(ring, rsc->pitch >> 5); /* RB_COPY_DEST_PITCH */ - OUT_RING(ring, RB_COPY_DEST_INFO_FORMAT(fd_pipe2color(psurf->format)) | - RB_COPY_DEST_INFO_LINEAR | /* RB_COPY_DEST_INFO */ - RB_COPY_DEST_INFO_SWAP(swap) | - RB_COPY_DEST_INFO_WRITE_RED | - RB_COPY_DEST_INFO_WRITE_GREEN | - RB_COPY_DEST_INFO_WRITE_BLUE | - RB_COPY_DEST_INFO_WRITE_ALPHA); - - OUT_PKT3(ring, CP_WAIT_FOR_IDLE, 1); - OUT_RING(ring, 0x0000000); - - OUT_PKT3(ring, CP_DRAW_INDX, 3); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, DRAW(DI_PT_RECTLIST, DI_SRC_SEL_AUTO_INDEX, - INDEX_SIZE_IGN, IGNORE_VISIBILITY)); - OUT_RING(ring, 3); /* NumIndices */ + if (is_a4xx(ctx->screen)) + return 1024; + if (is_a3xx(ctx->screen)) + return 992; + return 512; } static void -emit_gmem2mem(struct fd_context *ctx, struct fd_ringbuffer *ring, - uint32_t xoff, uint32_t yoff, uint32_t bin_w, uint32_t bin_h) +calculate_tiles(struct fd_context *ctx) { - struct fd_framebuffer_stateobj *fb = &ctx->framebuffer; - struct pipe_framebuffer_state *pfb = &fb->base; - - fd_emit_vertex_bufs(ring, 0x9c, (struct fd_vertex_buf[]) { - { .prsc = ctx->solid_vertexbuf, .size = 48 }, - }, 1); - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_VGT_INDX_OFFSET)); - OUT_RING(ring, 0); - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, 
CP_REG(REG_VGT_VERTEX_REUSE_BLOCK_CNTL)); - OUT_RING(ring, 0x0000028f); - - fd_program_emit(ring, &ctx->solid_prog); - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_PA_SC_AA_MASK)); - OUT_RING(ring, 0x0000ffff); - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_RB_DEPTHCONTROL)); - OUT_RING(ring, RB_DEPTHCONTROL_EARLY_Z_ENABLE); - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_PA_SU_SC_MODE_CNTL)); - OUT_RING(ring, PA_SU_SC_MODE_CNTL_PROVOKING_VTX_LAST | /* PA_SU_SC_MODE_CNTL */ - PA_SU_SC_MODE_CNTL_POLYMODE_FRONT_PTYPE(DRAW_TRIANGLES) | - PA_SU_SC_MODE_CNTL_POLYMODE_BACK_PTYPE(DRAW_TRIANGLES)); - - OUT_PKT3(ring, CP_SET_CONSTANT, 3); - OUT_RING(ring, CP_REG(REG_PA_SC_WINDOW_SCISSOR_TL)); - OUT_RING(ring, xy2d(0, 0)); /* PA_SC_WINDOW_SCISSOR_TL */ - OUT_RING(ring, xy2d(pfb->width, pfb->height)); /* PA_SC_WINDOW_SCISSOR_BR */ - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_PA_CL_VTE_CNTL)); - OUT_RING(ring, PA_CL_VTE_CNTL_VTX_W0_FMT | - PA_CL_VTE_CNTL_VPORT_X_SCALE_ENA | - PA_CL_VTE_CNTL_VPORT_X_OFFSET_ENA | - PA_CL_VTE_CNTL_VPORT_Y_SCALE_ENA | - PA_CL_VTE_CNTL_VPORT_Y_OFFSET_ENA); - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_PA_CL_CLIP_CNTL)); - OUT_RING(ring, 0x00000000); - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_RB_MODECONTROL)); - OUT_RING(ring, RB_MODECONTROL_EDRAM_MODE(EDRAM_COPY)); - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_RB_COPY_DEST_OFFSET)); - OUT_RING(ring, RB_COPY_DEST_OFFSET_X(xoff) | RB_COPY_DEST_OFFSET_Y(yoff)); - - if (ctx->resolve & (FD_BUFFER_DEPTH | FD_BUFFER_STENCIL)) - emit_gmem2mem_surf(ring, 0, bin_w * bin_h, pfb->zsbuf); - - if (ctx->resolve & FD_BUFFER_COLOR) - emit_gmem2mem_surf(ring, 1, 0, pfb->cbufs[0]); - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_RB_MODECONTROL)); - OUT_RING(ring, RB_MODECONTROL_EDRAM_MODE(COLOR_DEPTH)); -} + struct fd_gmem_stateobj *gmem = 
&ctx->gmem; + struct pipe_scissor_state *scissor = &ctx->max_scissor; + struct pipe_framebuffer_state *pfb = &ctx->framebuffer; + uint32_t gmem_size = ctx->screen->gmemsize_bytes; + uint32_t minx, miny, width, height; + uint32_t nbins_x = 1, nbins_y = 1; + uint32_t bin_w, bin_h; + uint32_t max_width = bin_width(ctx); + uint32_t cpp = 4; + uint32_t i, j, t, xoff, yoff; + uint32_t tpp_x, tpp_y; + bool has_zs = !!(ctx->resolve & (FD_BUFFER_DEPTH | FD_BUFFER_STENCIL)); + + if (pfb->cbufs[0]) + cpp = util_format_get_blocksize(pfb->cbufs[0]->format); + + if ((gmem->cpp == cpp) && (gmem->has_zs == has_zs) && + !memcmp(&gmem->scissor, scissor, sizeof(gmem->scissor))) { + /* everything is up-to-date */ + return; + } -/* transfer from system memory to gmem */ + /* if have depth/stencil, we need to leave room: */ + if (has_zs) { + gmem_size /= 2; + max_width /= 2; + } -static void -emit_mem2gmem_surf(struct fd_ringbuffer *ring, uint32_t swap, uint32_t base, - struct pipe_surface *psurf) -{ - struct fd_resource *rsc = fd_resource(psurf->texture); - uint32_t swiz; - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_RB_COLOR_INFO)); - OUT_RING(ring, RB_COLOR_INFO_COLOR_SWAP(swap) | - RB_COLOR_INFO_COLOR_BASE(base / 1024) | - RB_COLOR_INFO_COLOR_FORMAT(fd_pipe2color(psurf->format))); - - swiz = fd_tex_swiz(psurf->format, PIPE_SWIZZLE_RED, PIPE_SWIZZLE_GREEN, - PIPE_SWIZZLE_BLUE, PIPE_SWIZZLE_ALPHA); - - /* emit fb as a texture: */ - OUT_PKT3(ring, CP_SET_CONSTANT, 7); - OUT_RING(ring, 0x00010000); - OUT_RING(ring, SQ_TEX0_CLAMP_X(SQ_TEX_WRAP) | - SQ_TEX0_CLAMP_Y(SQ_TEX_WRAP) | - SQ_TEX0_CLAMP_Z(SQ_TEX_WRAP) | - SQ_TEX0_PITCH(rsc->pitch)); - OUT_RELOC(ring, rsc->bo, 0, - fd_pipe2surface(psurf->format) | 0x800); - OUT_RING(ring, SQ_TEX2_WIDTH(psurf->width) | - SQ_TEX2_HEIGHT(psurf->height)); - OUT_RING(ring, 0x01000000 | // XXX - swiz | - SQ_TEX3_XY_MAG_FILTER(SQ_TEX_FILTER_POINT) | - SQ_TEX3_XY_MIN_FILTER(SQ_TEX_FILTER_POINT)); - OUT_RING(ring, 0x00000000); - 
OUT_RING(ring, 0x00000200); - - OUT_PKT3(ring, CP_DRAW_INDX, 3); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, DRAW(DI_PT_RECTLIST, DI_SRC_SEL_AUTO_INDEX, - INDEX_SIZE_IGN, IGNORE_VISIBILITY)); - OUT_RING(ring, 3); /* NumIndices */ -} + if (fd_mesa_debug & FD_DBG_NOSCIS) { + minx = 0; + miny = 0; + width = pfb->width; + height = pfb->height; + } else { + minx = scissor->minx & ~31; /* round down to multiple of 32 */ + miny = scissor->miny & ~31; + width = scissor->maxx - minx; + height = scissor->maxy - miny; + } -static void -emit_mem2gmem(struct fd_context *ctx, struct fd_ringbuffer *ring, - uint32_t xoff, uint32_t yoff, uint32_t bin_w, uint32_t bin_h) -{ - struct fd_framebuffer_stateobj *fb = &ctx->framebuffer; - struct pipe_framebuffer_state *pfb = &fb->base; - float x0, y0, x1, y1; - - fd_emit_vertex_bufs(ring, 0x9c, (struct fd_vertex_buf[]) { - { .prsc = ctx->solid_vertexbuf, .size = 48, .offset = 0x30 }, - { .prsc = ctx->solid_vertexbuf, .size = 32, .offset = 0x60 }, - }, 2); - - /* write texture coordinates to vertexbuf: */ - x0 = ((float)xoff) / ((float)pfb->width); - x1 = ((float)xoff + bin_w) / ((float)pfb->width); - y0 = ((float)yoff) / ((float)pfb->height); - y1 = ((float)yoff + bin_h) / ((float)pfb->height); - OUT_PKT3(ring, CP_MEM_WRITE, 9); - OUT_RELOC(ring, fd_resource(ctx->solid_vertexbuf)->bo, 0x60, 0); - OUT_RING(ring, f2d(x0)); - OUT_RING(ring, f2d(y0)); - OUT_RING(ring, f2d(x1)); - OUT_RING(ring, f2d(y0)); - OUT_RING(ring, f2d(x0)); - OUT_RING(ring, f2d(y1)); - OUT_RING(ring, f2d(x1)); - OUT_RING(ring, f2d(y1)); - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_VGT_INDX_OFFSET)); - OUT_RING(ring, 0); - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_VGT_VERTEX_REUSE_BLOCK_CNTL)); - OUT_RING(ring, 0x0000003b); - - fd_program_emit(ring, &ctx->blit_prog); - - OUT_PKT0(ring, REG_TC_CNTL_STATUS, 1); - OUT_RING(ring, TC_CNTL_STATUS_L2_INVALIDATE); - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, 
CP_REG(REG_RB_DEPTHCONTROL)); - OUT_RING(ring, RB_DEPTHCONTROL_EARLY_Z_ENABLE); - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_PA_SU_SC_MODE_CNTL)); - OUT_RING(ring, PA_SU_SC_MODE_CNTL_PROVOKING_VTX_LAST | - PA_SU_SC_MODE_CNTL_POLYMODE_FRONT_PTYPE(DRAW_TRIANGLES) | - PA_SU_SC_MODE_CNTL_POLYMODE_BACK_PTYPE(DRAW_TRIANGLES)); - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_PA_SC_AA_MASK)); - OUT_RING(ring, 0x0000ffff); - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_RB_COLORCONTROL)); - OUT_RING(ring, RB_COLORCONTROL_ALPHA_FUNC(PIPE_FUNC_ALWAYS) | - RB_COLORCONTROL_BLEND_DISABLE | - RB_COLORCONTROL_ROP_CODE(12) | - RB_COLORCONTROL_DITHER_MODE(DITHER_DISABLE) | - RB_COLORCONTROL_DITHER_TYPE(DITHER_PIXEL)); - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_RB_BLEND_CONTROL)); - OUT_RING(ring, RB_BLENDCONTROL_COLOR_SRCBLEND(RB_BLEND_ONE) | - RB_BLENDCONTROL_COLOR_COMB_FCN(COMB_DST_PLUS_SRC) | - RB_BLENDCONTROL_COLOR_DESTBLEND(RB_BLEND_ZERO) | - RB_BLENDCONTROL_ALPHA_SRCBLEND(RB_BLEND_ONE) | - RB_BLENDCONTROL_ALPHA_COMB_FCN(COMB_DST_PLUS_SRC) | - RB_BLENDCONTROL_ALPHA_DESTBLEND(RB_BLEND_ZERO)); - - OUT_PKT3(ring, CP_SET_CONSTANT, 3); - OUT_RING(ring, CP_REG(REG_PA_SC_WINDOW_SCISSOR_TL)); - OUT_RING(ring, PA_SC_WINDOW_OFFSET_DISABLE | - xy2d(0,0)); /* PA_SC_WINDOW_SCISSOR_TL */ - OUT_RING(ring, xy2d(bin_w, bin_h)); /* PA_SC_WINDOW_SCISSOR_BR */ - - OUT_PKT3(ring, CP_SET_CONSTANT, 5); - OUT_RING(ring, CP_REG(REG_PA_CL_VPORT_XSCALE)); - OUT_RING(ring, f2d((float)bin_w/2.0)); /* PA_CL_VPORT_XSCALE */ - OUT_RING(ring, f2d((float)bin_w/2.0)); /* PA_CL_VPORT_XOFFSET */ - OUT_RING(ring, f2d(-(float)bin_h/2.0)); /* PA_CL_VPORT_YSCALE */ - OUT_RING(ring, f2d((float)bin_h/2.0)); /* PA_CL_VPORT_YOFFSET */ - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_PA_CL_VTE_CNTL)); - OUT_RING(ring, PA_CL_VTE_CNTL_VTX_XY_FMT | - PA_CL_VTE_CNTL_VTX_Z_FMT | // XXX check this??? 
- PA_CL_VTE_CNTL_VPORT_X_SCALE_ENA | - PA_CL_VTE_CNTL_VPORT_X_OFFSET_ENA | - PA_CL_VTE_CNTL_VPORT_Y_SCALE_ENA | - PA_CL_VTE_CNTL_VPORT_Y_OFFSET_ENA); - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_PA_CL_CLIP_CNTL)); - OUT_RING(ring, 0x00000000); - - if (ctx->restore & (FD_BUFFER_DEPTH | FD_BUFFER_STENCIL)) - emit_mem2gmem_surf(ring, 0, bin_w * bin_h, pfb->zsbuf); - - if (ctx->restore & FD_BUFFER_COLOR) - emit_mem2gmem_surf(ring, 1, 0, pfb->cbufs[0]); - - /* TODO blob driver seems to toss in a CACHE_FLUSH after each DRAW_INDX.. */ -} + bin_w = align(width, 32); + bin_h = align(height, 32); -void -fd_gmem_render_tiles(struct pipe_context *pctx) -{ - struct fd_context *ctx = fd_context(pctx); - struct fd_framebuffer_stateobj *fb = &ctx->framebuffer; - struct pipe_framebuffer_state *pfb = &fb->base; - struct fd_ringbuffer *ring; - uint32_t i, yoff = 0; - uint32_t timestamp; - ring = ctx->ring; - - DBG("rendering %dx%d tiles (%s/%s)", fb->nbins_x, fb->nbins_y, - util_format_name(pfb->cbufs[0]->format), - pfb->zsbuf ? 
util_format_name(pfb->zsbuf->format) : "none"); + /* first, find a bin width that satisfies the maximum width + * restrictions: + */ + while (bin_w > max_width) { + nbins_x++; + bin_w = align(width / nbins_x, 32); + } - /* mark the end of the clear/draw cmds before emitting per-tile cmds: */ - fd_ringmarker_mark(ctx->draw_end); + /* then find a bin width/height that satisfies the memory + * constraints: + */ + while ((bin_w * bin_h * cpp) > gmem_size) { + if (bin_w > bin_h) { + nbins_x++; + bin_w = align(width / nbins_x, 32); + } else { + nbins_y++; + bin_h = align(height / nbins_y, 32); + } + } - for (i = 0; i < fb->nbins_y; i++) { - uint32_t j, xoff = 0; - uint32_t bin_h = fb->bin_h; + DBG("using %d bins of size %dx%d", nbins_x*nbins_y, bin_w, bin_h); - /* clip bin height: */ - bin_h = min(bin_h, pfb->height - yoff); + gmem->scissor = *scissor; + gmem->cpp = cpp; + gmem->has_zs = has_zs; + gmem->bin_h = bin_h; + gmem->bin_w = bin_w; + gmem->nbins_x = nbins_x; + gmem->nbins_y = nbins_y; + gmem->minx = minx; + gmem->miny = miny; + gmem->width = width; + gmem->height = height; + + /* + * Assign tiles and pipes: + * + * At some point it might be worth playing with different + * strategies and seeing if that makes much impact on + * performance. 
+ */ + +#define div_round_up(v, a) (((v) + (a) - 1) / (a)) + /* figure out number of tiles per pipe: */ + tpp_x = tpp_y = 1; + while (div_round_up(nbins_y, tpp_y) > 8) + tpp_y += 2; + while ((div_round_up(nbins_y, tpp_y) * + div_round_up(nbins_x, tpp_x)) > 8) + tpp_x += 1; + + /* configure pipes: */ + xoff = yoff = 0; + for (i = 0; i < ARRAY_SIZE(ctx->pipe); i++) { + struct fd_vsc_pipe *pipe = &ctx->pipe[i]; + + if (xoff >= nbins_x) { + xoff = 0; + yoff += tpp_y; + } + + if (yoff >= nbins_y) { + break; + } - for (j = 0; j < fb->nbins_x; j++) { - uint32_t bin_w = fb->bin_w; + pipe->x = xoff; + pipe->y = yoff; + pipe->w = MIN2(tpp_x, nbins_x - xoff); + pipe->h = MIN2(tpp_y, nbins_y - yoff); - /* clip bin width: */ - bin_w = min(bin_w, pfb->width - xoff); + xoff += tpp_x; + } + + for (; i < ARRAY_SIZE(ctx->pipe); i++) { + struct fd_vsc_pipe *pipe = &ctx->pipe[i]; + pipe->x = pipe->y = pipe->w = pipe->h = 0; + } - DBG("bin_h=%d, yoff=%d, bin_w=%d, xoff=%d", - bin_h, yoff, bin_w, xoff); +#if 0 /* debug */ + printf("%dx%d ... 
tpp=%dx%d\n", nbins_x, nbins_y, tpp_x, tpp_y); + for (i = 0; i < 8; i++) { + struct fd_vsc_pipe *pipe = &ctx->pipe[i]; + printf("pipe[%d]: %ux%u @ %u,%u\n", i, + pipe->w, pipe->h, pipe->x, pipe->y); + } +#endif + + /* configure tiles: */ + t = 0; + yoff = miny; + for (i = 0; i < nbins_y; i++) { + uint32_t bw, bh; - fd_emit_framebuffer_state(ring, &ctx->framebuffer); + xoff = minx; - /* setup screen scissor for current tile (same for mem2gmem): */ - OUT_PKT3(ring, CP_SET_CONSTANT, 3); - OUT_RING(ring, CP_REG(REG_PA_SC_SCREEN_SCISSOR_TL)); - OUT_RING(ring, xy2d(0,0)); /* PA_SC_SCREEN_SCISSOR_TL */ - OUT_RING(ring, xy2d(bin_w, bin_h)); /* PA_SC_SCREEN_SCISSOR_BR */ + /* clip bin height: */ + bh = MIN2(bin_h, miny + height - yoff); - if (ctx->restore) - emit_mem2gmem(ctx, ring, xoff, yoff, bin_w, bin_h); + for (j = 0; j < nbins_x; j++) { + struct fd_tile *tile = &ctx->tile[t]; + uint32_t n, p; - /* setup window scissor and offset for current tile (different - * from mem2gmem): - */ - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_PA_SC_WINDOW_OFFSET)); - OUT_RING(ring, PA_SC_WINDOW_OFFSET_X(-xoff) | - PA_SC_WINDOW_OFFSET_Y(-yoff));/* PA_SC_WINDOW_OFFSET */ + assert(t < ARRAY_SIZE(ctx->tile)); - /* emit IB to drawcmds: */ - OUT_IB (ring, ctx->draw_start, ctx->draw_end); + /* pipe number: */ + p = ((i / tpp_y) * div_round_up(nbins_x, tpp_x)) + (j / tpp_x); - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_PA_SC_WINDOW_OFFSET)); - OUT_RING(ring, 0x00000000); /* PA_SC_WINDOW_OFFSET */ + /* slot number: */ + n = ((i % tpp_y) * tpp_x) + (j % tpp_x); - /* emit gmem2mem to transfer tile back to system memory: */ - emit_gmem2mem(ctx, ring, xoff, yoff, bin_w, bin_h); + /* clip bin width: */ + bw = MIN2(bin_w, minx + width - xoff); - xoff += bin_w; + tile->n = n; + tile->p = p; + tile->bin_w = bw; + tile->bin_h = bh; + tile->xoff = xoff; + tile->yoff = yoff; + + t++; + + xoff += bw; } - yoff += bin_h; + yoff += bh; } - /* GPU executes starting 
from tile cmds, which IB back to draw cmds: */ - fd_ringmarker_flush(ctx->draw_end); +#if 0 /* debug */ + t = 0; + for (i = 0; i < nbins_y; i++) { + for (j = 0; j < nbins_x; j++) { + struct fd_tile *tile = &ctx->tile[t++]; + printf("|p:%u n:%u|", tile->p, tile->n); + } + printf("\n"); + } +#endif +} - /* mark start for next draw cmds: */ - fd_ringmarker_mark(ctx->draw_start); +static void +render_tiles(struct fd_context *ctx) +{ + struct fd_gmem_stateobj *gmem = &ctx->gmem; + int i; - /* update timestamps on render targets: */ - fd_pipe_timestamp(ctx->screen->pipe, &timestamp); - fd_resource(pfb->cbufs[0]->texture)->timestamp = timestamp; - if (pfb->zsbuf) - fd_resource(pfb->zsbuf->texture)->timestamp = timestamp; + ctx->emit_tile_init(ctx); - /* Note that because the per-tile setup and mem2gmem/gmem2mem are emitted - * after the draw/clear calls, but executed before, we need to preemptively - * flag some state as dirty before the first draw/clear call. - * - * TODO maybe we need to mark all state as dirty to not worry about state - * being clobbered by other contexts? - */ - ctx->dirty |= FD_DIRTY_ZSA | - FD_DIRTY_RASTERIZER | - FD_DIRTY_FRAMEBUFFER | - FD_DIRTY_SAMPLE_MASK | - FD_DIRTY_VIEWPORT | - FD_DIRTY_CONSTBUF | - FD_DIRTY_PROG | - FD_DIRTY_SCISSOR | - /* probably only needed if we need to mem2gmem on the next - * draw.. but not sure if there is a good way to know? 
- */ - FD_DIRTY_VERTTEX | - FD_DIRTY_FRAGTEX | - FD_DIRTY_BLEND; + if (ctx->restore) + ctx->stats.batch_restore++; + + for (i = 0; i < (gmem->nbins_x * gmem->nbins_y); i++) { + struct fd_tile *tile = &ctx->tile[i]; + + DBG("bin_h=%d, yoff=%d, bin_w=%d, xoff=%d", + tile->bin_h, tile->yoff, tile->bin_w, tile->xoff); + + ctx->emit_tile_prep(ctx, tile); + + if (ctx->restore) { + fd_hw_query_set_stage(ctx, ctx->ring, FD_STAGE_MEM2GMEM); + ctx->emit_tile_mem2gmem(ctx, tile); + fd_hw_query_set_stage(ctx, ctx->ring, FD_STAGE_NULL); + } + + ctx->emit_tile_renderprep(ctx, tile); + + fd_hw_query_prepare_tile(ctx, i, ctx->ring); + + /* emit IB to drawcmds: */ + OUT_IB(ctx->ring, ctx->draw_start, ctx->draw_end); + fd_reset_wfi(ctx); + + /* emit gmem2mem to transfer tile back to system memory: */ + fd_hw_query_set_stage(ctx, ctx->ring, FD_STAGE_GMEM2MEM); + ctx->emit_tile_gmem2mem(ctx, tile); + fd_hw_query_set_stage(ctx, ctx->ring, FD_STAGE_NULL); + } } -void -fd_gmem_calculate_tiles(struct pipe_context *pctx) +static void +render_sysmem(struct fd_context *ctx) { - struct fd_context *ctx = fd_context(pctx); - struct fd_framebuffer_stateobj *fb = &ctx->framebuffer; - struct pipe_framebuffer_state *pfb = &fb->base; - uint32_t nbins_x = 1, nbins_y = 1; - uint32_t bin_w, bin_h; - uint32_t cpp = util_format_get_blocksize(pfb->cbufs[0]->format); - uint32_t gmem_size = ctx->screen->gmemsize_bytes; - uint32_t max_width = 992; + ctx->emit_sysmem_prep(ctx); -// TODO we probably could optimize this a bit if we know that -// Z or stencil is not enabled for any of the draw calls.. 
-// if (fd_stencil_enabled(ctx->zsa) || fd_depth_enabled(ctx->zsa)) { - gmem_size /= 2; - max_width = 256; -// } + fd_hw_query_prepare_tile(ctx, 0, ctx->ring); - bin_w = ALIGN(pfb->width, 32); - bin_h = ALIGN(pfb->height, 32); + /* emit IB to drawcmds: */ + OUT_IB(ctx->ring, ctx->draw_start, ctx->draw_end); + fd_reset_wfi(ctx); +} - /* first, find a bin width that satisfies the maximum width - * restrictions: - */ - while (bin_w > max_width) { - nbins_x++; - bin_w = ALIGN(pfb->width / nbins_x, 32); +void +fd_gmem_render_tiles(struct fd_context *ctx) +{ + struct pipe_framebuffer_state *pfb = &ctx->framebuffer; + uint32_t timestamp = 0; + bool sysmem = false; + + if (ctx->emit_sysmem_prep) { + if (ctx->cleared || ctx->gmem_reason || (ctx->num_draws > 5)) { + DBG("GMEM: cleared=%x, gmem_reason=%x, num_draws=%u", + ctx->cleared, ctx->gmem_reason, ctx->num_draws); + } else if (!(fd_mesa_debug & FD_DBG_NOBYPASS)) { + sysmem = true; + } } - /* then find a bin height that satisfies the memory constraints: + /* close out the draw cmds by making sure any active queries are + * paused: */ - while ((bin_w * bin_h * cpp) > gmem_size) { - nbins_y++; - bin_h = ALIGN(pfb->height / nbins_y, 32); - } + fd_hw_query_set_stage(ctx, ctx->ring, FD_STAGE_NULL); + + /* mark the end of the clear/draw cmds before emitting per-tile cmds: */ + fd_ringmarker_mark(ctx->draw_end); + fd_ringmarker_mark(ctx->binning_end); - if ((nbins_x > 1) || (nbins_y > 1)) { - fb->pa_su_sc_mode_cntl |= PA_SU_SC_MODE_CNTL_VTX_WINDOW_OFFSET_ENABLE; + fd_reset_wfi(ctx); + + ctx->stats.batch_total++; + + if (sysmem) { + DBG("rendering sysmem (%s/%s)", + util_format_short_name(pipe_surface_format(pfb->cbufs[0])), + util_format_short_name(pipe_surface_format(pfb->zsbuf))); + fd_hw_query_prepare(ctx, 1); + render_sysmem(ctx); + ctx->stats.batch_sysmem++; } else { - fb->pa_su_sc_mode_cntl &= ~PA_SU_SC_MODE_CNTL_VTX_WINDOW_OFFSET_ENABLE; + struct fd_gmem_stateobj *gmem = &ctx->gmem; + calculate_tiles(ctx); + 
DBG("rendering %dx%d tiles (%s/%s)", gmem->nbins_x, gmem->nbins_y, + util_format_short_name(pipe_surface_format(pfb->cbufs[0])), + util_format_short_name(pipe_surface_format(pfb->zsbuf))); + fd_hw_query_prepare(ctx, gmem->nbins_x * gmem->nbins_y); + render_tiles(ctx); + ctx->stats.batch_gmem++; } - DBG("using %d bins of size %dx%d", nbins_x*nbins_y, bin_w, bin_h); + /* GPU executes starting from tile cmds, which IB back to draw cmds: */ + fd_ringmarker_flush(ctx->draw_end); + + /* mark start for next draw/binning cmds: */ + fd_ringmarker_mark(ctx->draw_start); + fd_ringmarker_mark(ctx->binning_start); -//if we use hw binning, tile sizes (in multiple of 32) need to -//fit in 5 bits.. for now don't care because we aren't using -//that: -// assert(!(bin_h/32 & ~0x1f)); -// assert(!(bin_w/32 & ~0x1f)); + fd_reset_wfi(ctx); - fb->nbins_x = nbins_x; - fb->nbins_y = nbins_y; - fb->bin_w = bin_w; - fb->bin_h = bin_h; + /* update timestamps on render targets: */ + timestamp = fd_ringbuffer_timestamp(ctx->ring); + if (pfb->cbufs[0]) + fd_resource(pfb->cbufs[0]->texture)->timestamp = timestamp; + if (pfb->zsbuf) + fd_resource(pfb->zsbuf->texture)->timestamp = timestamp; + + /* reset maximal bounds: */ + ctx->max_scissor.minx = ctx->max_scissor.miny = ~0; + ctx->max_scissor.maxx = ctx->max_scissor.maxy = 0; + + ctx->dirty = ~0; +} + +/* tile needs restore if it isn't completely contained within the + * cleared scissor: + */ +static bool +skip_restore(struct pipe_scissor_state *scissor, struct fd_tile *tile) +{ + unsigned minx = tile->xoff; + unsigned maxx = tile->xoff + tile->bin_w; + unsigned miny = tile->yoff; + unsigned maxy = tile->yoff + tile->bin_h; + return (minx >= scissor->minx) && (maxx <= scissor->maxx) && + (miny >= scissor->miny) && (maxy <= scissor->maxy); +} +/* When deciding whether a tile needs mem2gmem, we need to take into + * account the scissor rect(s) that were cleared. 
To simplify we only + * consider the last scissor rect for each buffer, since the common + * case would be a single clear. + */ +bool +fd_gmem_needs_restore(struct fd_context *ctx, struct fd_tile *tile, + uint32_t buffers) +{ + if (!(ctx->restore & buffers)) + return false; + + /* if buffers partially cleared, then slow-path to figure out + * if this particular tile needs restoring: + */ + if ((buffers & FD_BUFFER_COLOR) && + (ctx->partial_cleared & FD_BUFFER_COLOR) && + skip_restore(&ctx->cleared_scissor.color, tile)) + return false; + if ((buffers & FD_BUFFER_DEPTH) && + (ctx->partial_cleared & FD_BUFFER_DEPTH) && + skip_restore(&ctx->cleared_scissor.depth, tile)) + return false; + if ((buffers & FD_BUFFER_STENCIL) && + (ctx->partial_cleared & FD_BUFFER_STENCIL) && + skip_restore(&ctx->cleared_scissor.stencil, tile)) + return false; + + return true; }