freedreno/a3xx/compiler: refactor trans_samp()
[mesa.git] / src / gallium / drivers / freedreno / freedreno_gmem.c
index 12633bd5f389c2b40bde9af474c823eece7309b4..861ebf5675e96ff2a1a0a949abc514c150efd1cb 100644 (file)
@@ -35,6 +35,7 @@
 #include "freedreno_gmem.h"
 #include "freedreno_context.h"
 #include "freedreno_resource.h"
+#include "freedreno_query_hw.h"
 #include "freedreno_util.h"
 
 /*
  * resolve.
  */
 
+/*
+ * Upper limit on the width of a single bin (used by calculate_tiles() as
+ * its initial max_width).  The limit depends on the GPU id reported by the
+ * screen: 992 for gpu_id 300 and above, 512 for anything older.
+ */
+static uint32_t bin_width(struct fd_context *ctx)
+{
+       const bool newer_gpu = (ctx->screen->gpu_id >= 300);
+
+       return newer_gpu ? 992 : 512;
+}
+
 static void
 calculate_tiles(struct fd_context *ctx)
 {
        struct fd_gmem_stateobj *gmem = &ctx->gmem;
        struct pipe_scissor_state *scissor = &ctx->max_scissor;
-       uint32_t cpp = util_format_get_blocksize(ctx->framebuffer.cbufs[0]->format);
+       struct pipe_framebuffer_state *pfb = &ctx->framebuffer;
        uint32_t gmem_size = ctx->screen->gmemsize_bytes;
        uint32_t minx, miny, width, height;
        uint32_t nbins_x = 1, nbins_y = 1;
        uint32_t bin_w, bin_h;
-       uint32_t max_width = 992;
+       uint32_t max_width = bin_width(ctx);
+       uint32_t cpp = 4;
+       uint32_t i, j, t, xoff, yoff;
+       uint32_t tpp_x, tpp_y;
+       bool has_zs = !!(ctx->resolve & (FD_BUFFER_DEPTH | FD_BUFFER_STENCIL));
+
+       if (pfb->cbufs[0])
+               cpp = util_format_get_blocksize(pfb->cbufs[0]->format);
 
-       if ((gmem->cpp == cpp) &&
+       if ((gmem->cpp == cpp) && (gmem->has_zs == has_zs) &&
                        !memcmp(&gmem->scissor, scissor, sizeof(gmem->scissor))) {
                /* everything is up-to-date */
                return;
        }
 
-       minx = scissor->minx & ~31; /* round down to multiple of 32 */
-       miny = scissor->miny & ~31;
-       width = scissor->maxx - minx;
-       height = scissor->maxy - miny;
-
-// TODO we probably could optimize this a bit if we know that
-// Z or stencil is not enabled for any of the draw calls..
-//     if (fd_stencil_enabled(ctx->zsa) || fd_depth_enabled(ctx->zsa)) {
+       /* if we have depth/stencil, we need to leave room for it: */
+       if (has_zs) {
                gmem_size /= 2;
-               max_width = 256;
-//     }
+               max_width /= 2;
+       }
+
+       if (fd_mesa_debug & FD_DBG_DSCIS) {
+               minx = 0;
+               miny = 0;
+               width = pfb->width;
+               height = pfb->height;
+       } else {
+               minx = scissor->minx & ~31; /* round down to multiple of 32 */
+               miny = scissor->miny & ~31;
+               width = scissor->maxx - minx;
+               height = scissor->maxy - miny;
+       }
 
        bin_w = align(width, 32);
        bin_h = align(height, 32);
@@ -118,60 +139,160 @@ calculate_tiles(struct fd_context *ctx)
 
        gmem->scissor = *scissor;
        gmem->cpp = cpp;
-       gmem->minx = minx;
-       gmem->miny = miny;
+       gmem->has_zs = has_zs;
        gmem->bin_h = bin_h;
        gmem->bin_w = bin_w;
        gmem->nbins_x = nbins_x;
        gmem->nbins_y = nbins_y;
+       gmem->minx = minx;
+       gmem->miny = miny;
        gmem->width = width;
        gmem->height = height;
-}
 
-static void
-render_tiles(struct fd_context *ctx)
-{
-       struct fd_gmem_stateobj *gmem = &ctx->gmem;
-       uint32_t i, yoff = 0;
+       /*
+        * Assign tiles and pipes:
+        *
+        * At some point it might be worth playing with different
+        * strategies and seeing if that makes much impact on
+        * performance.
+        */
 
-       yoff= gmem->miny;
+#define div_round_up(v, a)  (((v) + (a) - 1) / (a))
+       /* figure out number of tiles per pipe: */
+       tpp_x = tpp_y = 1;
+       while (div_round_up(nbins_y, tpp_y) > 8)
+               tpp_y += 2;
+       while ((div_round_up(nbins_y, tpp_y) *
+                       div_round_up(nbins_x, tpp_x)) > 8)
+               tpp_x += 1;
+
+       /* configure pipes: */
+       xoff = yoff = 0;
+       for (i = 0; i < ARRAY_SIZE(ctx->pipe); i++) {
+               struct fd_vsc_pipe *pipe = &ctx->pipe[i];
+
+               if (xoff >= nbins_x) {
+                       xoff = 0;
+                       yoff += tpp_y;
+               }
 
-       ctx->emit_tile_init(ctx);
+               if (yoff >= nbins_y) {
+                       break;
+               }
 
-       for (i = 0; i < gmem->nbins_y; i++) {
-               uint32_t j, xoff = gmem->minx;
-               uint32_t bh = gmem->bin_h;
+               pipe->x = xoff;
+               pipe->y = yoff;
+               pipe->w = MIN2(tpp_x, nbins_x - xoff);
+               pipe->h = MIN2(tpp_y, nbins_y - yoff);
 
-               /* clip bin height: */
-               bh = MIN2(bh, gmem->height - yoff);
+               xoff += tpp_x;
+       }
 
-               for (j = 0; j < gmem->nbins_x; j++) {
-                       uint32_t bw = gmem->bin_w;
+       for (; i < ARRAY_SIZE(ctx->pipe); i++) {
+               struct fd_vsc_pipe *pipe = &ctx->pipe[i];
+               pipe->x = pipe->y = pipe->w = pipe->h = 0;
+       }
 
-                       /* clip bin width: */
-                       bw = MIN2(bw, gmem->width - xoff);
+#if 0 /* debug */
+       printf("%dx%d ... tpp=%dx%d\n", nbins_x, nbins_y, tpp_x, tpp_y);
+       for (i = 0; i < 8; i++) {
+               struct fd_vsc_pipe *pipe = &ctx->pipe[i];
+               printf("pipe[%d]: %ux%u @ %u,%u\n", i,
+                               pipe->w, pipe->h, pipe->x, pipe->y);
+       }
+#endif
 
-                       DBG("bin_h=%d, yoff=%d, bin_w=%d, xoff=%d",
-                                       bh, yoff, bw, xoff);
+       /* configure tiles: */
+       t = 0;
+       yoff = miny;
+       for (i = 0; i < nbins_y; i++) {
+               uint32_t bw, bh;
 
-                       ctx->emit_tile_prep(ctx, xoff, yoff, bw, bh);
+               xoff = minx;
 
-                       if (ctx->restore)
-                               ctx->emit_tile_mem2gmem(ctx, xoff, yoff, bw, bh);
+               /* clip bin height: */
+               bh = MIN2(bin_h, miny + height - yoff);
+
+               for (j = 0; j < nbins_x; j++) {
+                       struct fd_tile *tile = &ctx->tile[t];
+                       uint32_t n, p;
 
-                       ctx->emit_tile_renderprep(ctx, xoff, yoff, bw, bh);
+                       assert(t < ARRAY_SIZE(ctx->tile));
 
-                       /* emit IB to drawcmds: */
-                       OUT_IB(ctx->ring, ctx->draw_start, ctx->draw_end);
+                       /* pipe number: */
+                       p = ((i / tpp_y) * div_round_up(nbins_x, tpp_x)) + (j / tpp_x);
 
-                       /* emit gmem2mem to transfer tile back to system memory: */
-                       ctx->emit_tile_gmem2mem(ctx, xoff, yoff, bw, bh);
+                       /* slot number: */
+                       n = ((i % tpp_y) * tpp_x) + (j % tpp_x);
+
+                       /* clip bin width: */
+                       bw = MIN2(bin_w, minx + width - xoff);
+
+                       tile->n = n;
+                       tile->p = p;
+                       tile->bin_w = bw;
+                       tile->bin_h = bh;
+                       tile->xoff = xoff;
+                       tile->yoff = yoff;
+
+                       t++;
 
                        xoff += bw;
                }
 
                yoff += bh;
        }
+
+#if 0 /* debug */
+       t = 0;
+       for (i = 0; i < nbins_y; i++) {
+               for (j = 0; j < nbins_x; j++) {
+                       struct fd_tile *tile = &ctx->tile[t++];
+                       printf("|p:%u n:%u|", tile->p, tile->n);
+               }
+               printf("\n");
+       }
+#endif
+}
+
+/*
+ * Emit the per-tile command sequence for the current batch.
+ *
+ * After the one-time tile init, every entry of ctx->tile[] (there are
+ * nbins_x * nbins_y of them) gets: tile prep, an optional mem2gmem
+ * restore, render prep, an IB to the recorded draw cmds, and finally the
+ * gmem2mem step.  The restore and resolve steps are each bracketed by a
+ * hw-query stage change that is set back to FD_STAGE_NULL afterwards.
+ */
+static void
+render_tiles(struct fd_context *ctx)
+{
+       struct fd_gmem_stateobj *gmem = &ctx->gmem;
+       int n = 0;
+
+       ctx->emit_tile_init(ctx);
+
+       /* a restoring batch is counted once, not once per tile: */
+       if (ctx->restore)
+               ctx->stats.batch_restore++;
+
+       while (n < (gmem->nbins_x * gmem->nbins_y)) {
+               struct fd_tile *cur = &ctx->tile[n];
+
+               DBG("bin_h=%d, yoff=%d, bin_w=%d, xoff=%d",
+                       cur->bin_h, cur->yoff, cur->bin_w, cur->xoff);
+
+               ctx->emit_tile_prep(ctx, cur);
+
+               /* restore previous contents into the tile, if requested: */
+               if (ctx->restore) {
+                       fd_hw_query_set_stage(ctx, ctx->ring, FD_STAGE_MEM2GMEM);
+                       ctx->emit_tile_mem2gmem(ctx, cur);
+                       fd_hw_query_set_stage(ctx, ctx->ring, FD_STAGE_NULL);
+               }
+
+               ctx->emit_tile_renderprep(ctx, cur);
+
+               /* per-tile query setup, keyed by the tile index: */
+               fd_hw_query_prepare_tile(ctx, n, ctx->ring);
+
+               /* emit IB to drawcmds: */
+               OUT_IB(ctx->ring, ctx->draw_start, ctx->draw_end);
+               fd_reset_wfi(ctx);
+
+               /* emit gmem2mem to transfer tile back to system memory: */
+               fd_hw_query_set_stage(ctx, ctx->ring, FD_STAGE_GMEM2MEM);
+               ctx->emit_tile_gmem2mem(ctx, cur);
+               fd_hw_query_set_stage(ctx, ctx->ring, FD_STAGE_NULL);
+
+               n++;
+       }
 }
 
 static void
@@ -179,8 +300,11 @@ render_sysmem(struct fd_context *ctx)
 {
        ctx->emit_sysmem_prep(ctx);
 
+       fd_hw_query_prepare_tile(ctx, 0, ctx->ring);
+
        /* emit IB to drawcmds: */
        OUT_IB(ctx->ring, ctx->draw_start, ctx->draw_end);
+       fd_reset_wfi(ctx);
 }
 
 void
@@ -195,37 +319,55 @@ fd_gmem_render_tiles(struct pipe_context *pctx)
                if (ctx->cleared || ctx->gmem_reason || (ctx->num_draws > 5)) {
                        DBG("GMEM: cleared=%x, gmem_reason=%x, num_draws=%u",
                                ctx->cleared, ctx->gmem_reason, ctx->num_draws);
-               } else {
+               } else if (!(fd_mesa_debug & FD_DBG_DBYPASS)) {
                        sysmem = true;
                }
        }
 
+       /* close out the draw cmds by making sure any active queries are
+        * paused:
+        */
+       fd_hw_query_set_stage(ctx, ctx->ring, FD_STAGE_NULL);
+
        /* mark the end of the clear/draw cmds before emitting per-tile cmds: */
        fd_ringmarker_mark(ctx->draw_end);
+       fd_ringmarker_mark(ctx->binning_end);
+
+       fd_reset_wfi(ctx);
+
+       ctx->stats.batch_total++;
 
        if (sysmem) {
                DBG("rendering sysmem (%s/%s)",
-                       util_format_name(pfb->cbufs[0]->format),
-                       pfb->zsbuf ? util_format_name(pfb->zsbuf->format) : "none");
+                       util_format_short_name(pipe_surface_format(pfb->cbufs[0])),
+                       util_format_short_name(pipe_surface_format(pfb->zsbuf)));
+               fd_hw_query_prepare(ctx, 1);
                render_sysmem(ctx);
+               ctx->stats.batch_sysmem++;
        } else {
                struct fd_gmem_stateobj *gmem = &ctx->gmem;
-               DBG("rendering %dx%d tiles (%s/%s)", gmem->nbins_x, gmem->nbins_y,
-                       util_format_name(pfb->cbufs[0]->format),
-                       pfb->zsbuf ? util_format_name(pfb->zsbuf->format) : "none");
                calculate_tiles(ctx);
+               DBG("rendering %dx%d tiles (%s/%s)", gmem->nbins_x, gmem->nbins_y,
+                       util_format_short_name(pipe_surface_format(pfb->cbufs[0])),
+                       util_format_short_name(pipe_surface_format(pfb->zsbuf)));
+               fd_hw_query_prepare(ctx, gmem->nbins_x * gmem->nbins_y);
                render_tiles(ctx);
+               ctx->stats.batch_gmem++;
        }
 
        /* GPU executes starting from tile cmds, which IB back to draw cmds: */
        fd_ringmarker_flush(ctx->draw_end);
 
-       /* mark start for next draw cmds: */
+       /* mark start for next draw/binning cmds: */
        fd_ringmarker_mark(ctx->draw_start);
+       fd_ringmarker_mark(ctx->binning_start);
+
+       fd_reset_wfi(ctx);
 
        /* update timestamps on render targets: */
        timestamp = fd_ringbuffer_timestamp(ctx->ring);
-       fd_resource(pfb->cbufs[0]->texture)->timestamp = timestamp;
+       if (pfb->cbufs[0])
+               fd_resource(pfb->cbufs[0]->texture)->timestamp = timestamp;
        if (pfb->zsbuf)
                fd_resource(pfb->zsbuf->texture)->timestamp = timestamp;