freedreno/a3xx/compiler: refactor trans_samp()
[mesa.git] / src / gallium / drivers / freedreno / freedreno_gmem.c
index 47f7a310e8c747d6a7b07de5bb18726ac5159733..861ebf5675e96ff2a1a0a949abc514c150efd1cb 100644 (file)
@@ -35,6 +35,7 @@
 #include "freedreno_gmem.h"
 #include "freedreno_context.h"
 #include "freedreno_resource.h"
+#include "freedreno_query_hw.h"
 #include "freedreno_util.h"
 
 /*
@@ -85,7 +86,8 @@ calculate_tiles(struct fd_context *ctx)
        uint32_t bin_w, bin_h;
        uint32_t max_width = bin_width(ctx);
        uint32_t cpp = 4;
-       uint32_t i, j, t, p, n, xoff, yoff;
+       uint32_t i, j, t, xoff, yoff;
+       uint32_t tpp_x, tpp_y;
        bool has_zs = !!(ctx->resolve & (FD_BUFFER_DEPTH | FD_BUFFER_STENCIL));
 
        if (pfb->cbufs[0])
@@ -142,23 +144,70 @@ calculate_tiles(struct fd_context *ctx)
        gmem->bin_w = bin_w;
        gmem->nbins_x = nbins_x;
        gmem->nbins_y = nbins_y;
+       gmem->minx = minx;
+       gmem->miny = miny;
        gmem->width = width;
        gmem->height = height;
 
-       /* Assign tiles and pipes:
-        * NOTE we currently take a rather simplistic approach of
-        * mapping rows of tiles to a pipe.  At some point it might
-        * be worth playing with different strategies and seeing if
-        * that makes much impact on performance.
+       /*
+        * Assign tiles and pipes:
+        *
+        * At some point it might be worth playing with different
+        * strategies and seeing if that makes much impact on
+        * performance.
         */
-       t = p = n = 0;
+
+#define div_round_up(v, a)  (((v) + (a) - 1) / (a))
+       /* figure out number of tiles per pipe: */
+       tpp_x = tpp_y = 1;
+       while (div_round_up(nbins_y, tpp_y) > 8)
+               tpp_y += 2;
+       while ((div_round_up(nbins_y, tpp_y) *
+                       div_round_up(nbins_x, tpp_x)) > 8)
+               tpp_x += 1;
+
+       /* configure pipes: */
+       xoff = yoff = 0;
+       for (i = 0; i < ARRAY_SIZE(ctx->pipe); i++) {
+               struct fd_vsc_pipe *pipe = &ctx->pipe[i];
+
+               if (xoff >= nbins_x) {
+                       xoff = 0;
+                       yoff += tpp_y;
+               }
+
+               if (yoff >= nbins_y) {
+                       break;
+               }
+
+               pipe->x = xoff;
+               pipe->y = yoff;
+               pipe->w = MIN2(tpp_x, nbins_x - xoff);
+               pipe->h = MIN2(tpp_y, nbins_y - yoff);
+
+               xoff += tpp_x;
+       }
+
+       for (; i < ARRAY_SIZE(ctx->pipe); i++) {
+               struct fd_vsc_pipe *pipe = &ctx->pipe[i];
+               pipe->x = pipe->y = pipe->w = pipe->h = 0;
+       }
+
+#if 0 /* debug */
+       printf("%dx%d ... tpp=%dx%d\n", nbins_x, nbins_y, tpp_x, tpp_y);
+       for (i = 0; i < 8; i++) {
+               struct fd_vsc_pipe *pipe = &ctx->pipe[i];
+               printf("pipe[%d]: %ux%u @ %u,%u\n", i,
+                               pipe->w, pipe->h, pipe->x, pipe->y);
+       }
+#endif
+
+       /* configure tiles: */
+       t = 0;
        yoff = miny;
        for (i = 0; i < nbins_y; i++) {
-               struct fd_vsc_pipe *pipe = &ctx->pipe[p];
                uint32_t bw, bh;
 
-               assert(p < ARRAY_SIZE(ctx->pipe));
-
                xoff = minx;
 
                /* clip bin height: */
@@ -166,13 +215,20 @@ calculate_tiles(struct fd_context *ctx)
 
                for (j = 0; j < nbins_x; j++) {
                        struct fd_tile *tile = &ctx->tile[t];
+                       uint32_t n, p;
 
                        assert(t < ARRAY_SIZE(ctx->tile));
 
+                       /* pipe number: */
+                       p = ((i / tpp_y) * div_round_up(nbins_x, tpp_x)) + (j / tpp_x);
+
+                       /* slot number: */
+                       n = ((i % tpp_y) * tpp_x) + (j % tpp_x);
+
                        /* clip bin width: */
                        bw = MIN2(bin_w, minx + width - xoff);
 
-                       tile->n = n++;
+                       tile->n = n;
                        tile->p = p;
                        tile->bin_w = bw;
                        tile->bin_h = bh;
@@ -184,22 +240,19 @@ calculate_tiles(struct fd_context *ctx)
                        xoff += bw;
                }
 
-               /* one pipe per row: */
-               pipe->x = 0;
-               pipe->y = i;
-               pipe->w = nbins_x;
-               pipe->h = 1;
-
-               p++;
-               n = 0;
-
                yoff += bh;
        }
 
-       for (; p < ARRAY_SIZE(ctx->pipe); p++) {
-               struct fd_vsc_pipe *pipe = &ctx->pipe[p];
-               pipe->x = pipe->y = pipe->w = pipe->h = 0;
+#if 0 /* debug */
+       t = 0;
+       for (i = 0; i < nbins_y; i++) {
+               for (j = 0; j < nbins_x; j++) {
+                       struct fd_tile *tile = &ctx->tile[t++];
+                       printf("|p:%u n:%u|", tile->p, tile->n);
+               }
+               printf("\n");
        }
+#endif
 }
 
 static void
@@ -210,6 +263,9 @@ render_tiles(struct fd_context *ctx)
 
        ctx->emit_tile_init(ctx);
 
+       if (ctx->restore)
+               ctx->stats.batch_restore++;
+
        for (i = 0; i < (gmem->nbins_x * gmem->nbins_y); i++) {
                struct fd_tile *tile = &ctx->tile[i];
 
@@ -218,16 +274,24 @@ render_tiles(struct fd_context *ctx)
 
                ctx->emit_tile_prep(ctx, tile);
 
-               if (ctx->restore)
+               if (ctx->restore) {
+                       fd_hw_query_set_stage(ctx, ctx->ring, FD_STAGE_MEM2GMEM);
                        ctx->emit_tile_mem2gmem(ctx, tile);
+                       fd_hw_query_set_stage(ctx, ctx->ring, FD_STAGE_NULL);
+               }
 
                ctx->emit_tile_renderprep(ctx, tile);
 
+               fd_hw_query_prepare_tile(ctx, i, ctx->ring);
+
                /* emit IB to drawcmds: */
                OUT_IB(ctx->ring, ctx->draw_start, ctx->draw_end);
+               fd_reset_wfi(ctx);
 
                /* emit gmem2mem to transfer tile back to system memory: */
+               fd_hw_query_set_stage(ctx, ctx->ring, FD_STAGE_GMEM2MEM);
                ctx->emit_tile_gmem2mem(ctx, tile);
+               fd_hw_query_set_stage(ctx, ctx->ring, FD_STAGE_NULL);
        }
 }
 
@@ -236,8 +300,11 @@ render_sysmem(struct fd_context *ctx)
 {
        ctx->emit_sysmem_prep(ctx);
 
+       fd_hw_query_prepare_tile(ctx, 0, ctx->ring);
+
        /* emit IB to drawcmds: */
        OUT_IB(ctx->ring, ctx->draw_start, ctx->draw_end);
+       fd_reset_wfi(ctx);
 }
 
 void
@@ -257,30 +324,45 @@ fd_gmem_render_tiles(struct pipe_context *pctx)
                }
        }
 
+       /* close out the draw cmds by making sure any active queries are
+        * paused:
+        */
+       fd_hw_query_set_stage(ctx, ctx->ring, FD_STAGE_NULL);
+
        /* mark the end of the clear/draw cmds before emitting per-tile cmds: */
        fd_ringmarker_mark(ctx->draw_end);
+       fd_ringmarker_mark(ctx->binning_end);
+
+       fd_reset_wfi(ctx);
+
+       ctx->stats.batch_total++;
 
        if (sysmem) {
                DBG("rendering sysmem (%s/%s)",
                        util_format_short_name(pipe_surface_format(pfb->cbufs[0])),
                        util_format_short_name(pipe_surface_format(pfb->zsbuf)));
+               fd_hw_query_prepare(ctx, 1);
                render_sysmem(ctx);
+               ctx->stats.batch_sysmem++;
        } else {
                struct fd_gmem_stateobj *gmem = &ctx->gmem;
                calculate_tiles(ctx);
                DBG("rendering %dx%d tiles (%s/%s)", gmem->nbins_x, gmem->nbins_y,
                        util_format_short_name(pipe_surface_format(pfb->cbufs[0])),
                        util_format_short_name(pipe_surface_format(pfb->zsbuf)));
+               fd_hw_query_prepare(ctx, gmem->nbins_x * gmem->nbins_y);
                render_tiles(ctx);
+               ctx->stats.batch_gmem++;
        }
 
        /* GPU executes starting from tile cmds, which IB back to draw cmds: */
        fd_ringmarker_flush(ctx->draw_end);
 
-       /* mark start for next draw cmds: */
+       /* mark start for next draw/binning cmds: */
        fd_ringmarker_mark(ctx->draw_start);
+       fd_ringmarker_mark(ctx->binning_start);
 
-       fd_reset_rmw_state(ctx);
+       fd_reset_wfi(ctx);
 
        /* update timestamps on render targets: */
        timestamp = fd_ringbuffer_timestamp(ctx->ring);