freedreno/a6xx: add early-lrz-late-z mode
[mesa.git] / src / gallium / drivers / freedreno / a6xx / fd6_gmem.c
index 0ce85ea380cd5939a23fd83a8088cd21fb29a32f..829a4d57c5d901c4145ce970885f4e6c461203d2 100644 (file)
@@ -58,7 +58,7 @@ fd6_emit_flag_reference(struct fd_ringbuffer *ring, struct fd_resource *rsc,
                int level, int layer)
 {
        if (fd_resource_ubwc_enabled(rsc, level)) {
-               OUT_RELOCW(ring, rsc->bo, fd_resource_ubwc_offset(rsc, level, layer), 0, 0);
+               OUT_RELOC(ring, rsc->bo, fd_resource_ubwc_offset(rsc, level, layer), 0, 0);
                OUT_RING(ring,
                                A6XX_RB_MRT_FLAG_BUFFER_PITCH_PITCH(rsc->layout.ubwc_slices[level].pitch) |
                                A6XX_RB_MRT_FLAG_BUFFER_PITCH_ARRAY_PITCH(rsc->layout.ubwc_layer_size >> 2));
@@ -313,8 +313,9 @@ update_render_cntl(struct fd_batch *batch, struct pipe_framebuffer_state *pfb, b
                A6XX_RB_RENDER_CNTL_FLAG_MRTS(mrts_ubwc_enable));
 }
 
-#define VSC_DATA_SIZE(pitch)  ((pitch) * 32 + 0x100)  /* extra size to store VSC_SIZE */
-#define VSC_DATA2_SIZE(pitch) ((pitch) * 32)
+/* extra size to store VSC_DRAW_STRM_SIZE: */
+#define VSC_DRAW_STRM_SIZE(pitch)  ((pitch) * 32 + 0x100)
+#define VSC_PRIM_STRM_SIZE(pitch) ((pitch) * 32)
 
 static void
 update_vsc_pipe(struct fd_batch *batch)
@@ -325,22 +326,45 @@ update_vsc_pipe(struct fd_batch *batch)
        struct fd_ringbuffer *ring = batch->gmem;
        int i;
 
+       if (batch->draw_strm_bits/8 > fd6_ctx->vsc_draw_strm_pitch) {
+               if (fd6_ctx->vsc_draw_strm)
+                       fd_bo_del(fd6_ctx->vsc_draw_strm);
+               fd6_ctx->vsc_draw_strm = NULL;
+               /* Note: probably only need to align to 0x40, but aligning stronger
+                * reduces the odds that we will have to realloc again on the next
+                * frame:
+                */
+               fd6_ctx->vsc_draw_strm_pitch = align(batch->draw_strm_bits/8, 0x4000);
+               debug_printf("pre-resize VSC_DRAW_STRM_PITCH to: 0x%x\n",
+                               fd6_ctx->vsc_draw_strm_pitch);
+       }
 
-       if (!fd6_ctx->vsc_data) {
-               fd6_ctx->vsc_data = fd_bo_new(ctx->screen->dev,
-                       VSC_DATA_SIZE(fd6_ctx->vsc_data_pitch),
-                       DRM_FREEDRENO_GEM_TYPE_KMEM, "vsc_data");
+       if (batch->prim_strm_bits/8 > fd6_ctx->vsc_prim_strm_pitch) {
+               if (fd6_ctx->vsc_prim_strm)
+                       fd_bo_del(fd6_ctx->vsc_prim_strm);
+               fd6_ctx->vsc_prim_strm = NULL;
+               fd6_ctx->vsc_prim_strm_pitch = align(batch->prim_strm_bits/8, 0x4000);
+               debug_printf("pre-resize VSC_PRIM_STRM_PITCH to: 0x%x\n",
+                               fd6_ctx->vsc_prim_strm_pitch);
        }
 
-       if (!fd6_ctx->vsc_data2) {
-               fd6_ctx->vsc_data2 = fd_bo_new(ctx->screen->dev,
-                       VSC_DATA2_SIZE(fd6_ctx->vsc_data2_pitch),
-                       DRM_FREEDRENO_GEM_TYPE_KMEM, "vsc_data2");
+       if (!fd6_ctx->vsc_draw_strm) {
+               fd6_ctx->vsc_draw_strm = fd_bo_new(ctx->screen->dev,
+                       VSC_DRAW_STRM_SIZE(fd6_ctx->vsc_draw_strm_pitch),
+                       DRM_FREEDRENO_GEM_TYPE_KMEM, "vsc_draw_strm");
+       }
+
+       if (!fd6_ctx->vsc_prim_strm) {
+               fd6_ctx->vsc_prim_strm = fd_bo_new(ctx->screen->dev,
+                       VSC_PRIM_STRM_SIZE(fd6_ctx->vsc_prim_strm_pitch),
+                       DRM_FREEDRENO_GEM_TYPE_KMEM, "vsc_prim_strm");
        }
 
        OUT_REG(ring,
                A6XX_VSC_BIN_SIZE(.width = gmem->bin_w, .height = gmem->bin_h),
-               A6XX_VSC_SIZE_ADDRESS(.bo = fd6_ctx->vsc_data, .bo_offset = 32 * fd6_ctx->vsc_data_pitch));
+               A6XX_VSC_DRAW_STRM_SIZE_ADDRESS(
+                       .bo = fd6_ctx->vsc_draw_strm,
+                       .bo_offset = 32 * fd6_ctx->vsc_draw_strm_pitch));
 
        OUT_REG(ring, A6XX_VSC_BIN_COUNT(.nx = gmem->nbins_x,
                                        .ny = gmem->nbins_y));
@@ -355,14 +379,14 @@ update_vsc_pipe(struct fd_batch *batch)
        }
 
        OUT_REG(ring,
-               A6XX_VSC_PIPE_DATA2_ADDRESS(.bo = fd6_ctx->vsc_data2),
-               A6XX_VSC_PIPE_DATA2_PITCH(.dword = fd6_ctx->vsc_data2_pitch),
-               A6XX_VSC_PIPE_DATA2_ARRAY_PITCH(.dword = fd_bo_size(fd6_ctx->vsc_data2)));
+               A6XX_VSC_PRIM_STRM_ADDRESS(.bo = fd6_ctx->vsc_prim_strm),
+               A6XX_VSC_PRIM_STRM_PITCH(.dword = fd6_ctx->vsc_prim_strm_pitch),
+               A6XX_VSC_PRIM_STRM_ARRAY_PITCH(.dword = fd_bo_size(fd6_ctx->vsc_prim_strm)));
 
        OUT_REG(ring,
-               A6XX_VSC_PIPE_DATA_ADDRESS(.bo = fd6_ctx->vsc_data),
-               A6XX_VSC_PIPE_DATA_PITCH(.dword = fd6_ctx->vsc_data_pitch),
-               A6XX_VSC_PIPE_DATA_ARRAY_PITCH(.dword = fd_bo_size(fd6_ctx->vsc_data)));
+               A6XX_VSC_DRAW_STRM_ADDRESS(.bo = fd6_ctx->vsc_draw_strm),
+               A6XX_VSC_DRAW_STRM_PITCH(.dword = fd6_ctx->vsc_draw_strm_pitch),
+               A6XX_VSC_DRAW_STRM_ARRAY_PITCH(.dword = fd_bo_size(fd6_ctx->vsc_draw_strm)));
 }
 
 /* TODO we probably have more than 8 scratch regs.. although the first
@@ -372,8 +396,8 @@ update_vsc_pipe(struct fd_batch *batch)
 #define OVERFLOW_FLAG_REG REG_A6XX_CP_SCRATCH_REG(0)
 
 /*
- * If overflow is detected, either 0x1 (VSC_DATA overflow) or 0x3
- * (VSC_DATA2 overflow) plus the size of the overflowed buffer is
+ * If overflow is detected, either 0x1 (VSC_DRAW_STRM overflow) or 0x3
+ * (VSC_PRIM_STRM overflow) plus the size of the overflowed buffer is
  * written to control->vsc_overflow.  This allows the CPU to
  * detect which buffer overflowed (and, since the current size is
  * encoded as well, this protects against already-submitted but
@@ -392,12 +416,12 @@ emit_vsc_overflow_test(struct fd_batch *batch)
        const struct fd_gmem_stateobj *gmem = batch->gmem_state;
        struct fd6_context *fd6_ctx = fd6_context(batch->ctx);
 
-       debug_assert((fd6_ctx->vsc_data_pitch & 0x3) == 0);
-       debug_assert((fd6_ctx->vsc_data2_pitch & 0x3) == 0);
+       debug_assert((fd6_ctx->vsc_draw_strm_pitch & 0x3) == 0);
+       debug_assert((fd6_ctx->vsc_prim_strm_pitch & 0x3) == 0);
 
        /* Clear vsc_scratch: */
        OUT_PKT7(ring, CP_MEM_WRITE, 3);
-       OUT_RELOCW(ring, control_ptr(fd6_ctx, vsc_scratch));
+       OUT_RELOC(ring, control_ptr(fd6_ctx, vsc_scratch));
        OUT_RING(ring, 0x0);
 
        /* Check for overflow, write vsc_scratch if detected: */
@@ -405,22 +429,22 @@ emit_vsc_overflow_test(struct fd_batch *batch)
                OUT_PKT7(ring, CP_COND_WRITE5, 8);
                OUT_RING(ring, CP_COND_WRITE5_0_FUNCTION(WRITE_GE) |
                                CP_COND_WRITE5_0_WRITE_MEMORY);
-               OUT_RING(ring, CP_COND_WRITE5_1_POLL_ADDR_LO(REG_A6XX_VSC_SIZE_REG(i)));
+               OUT_RING(ring, CP_COND_WRITE5_1_POLL_ADDR_LO(REG_A6XX_VSC_DRAW_STRM_SIZE_REG(i)));
                OUT_RING(ring, CP_COND_WRITE5_2_POLL_ADDR_HI(0));
-               OUT_RING(ring, CP_COND_WRITE5_3_REF(fd6_ctx->vsc_data_pitch));
+               OUT_RING(ring, CP_COND_WRITE5_3_REF(fd6_ctx->vsc_draw_strm_pitch));
                OUT_RING(ring, CP_COND_WRITE5_4_MASK(~0));
-               OUT_RELOCW(ring, control_ptr(fd6_ctx, vsc_scratch));  /* WRITE_ADDR_LO/HI */
-               OUT_RING(ring, CP_COND_WRITE5_7_WRITE_DATA(1 + fd6_ctx->vsc_data_pitch));
+               OUT_RELOC(ring, control_ptr(fd6_ctx, vsc_scratch));  /* WRITE_ADDR_LO/HI */
+               OUT_RING(ring, CP_COND_WRITE5_7_WRITE_DATA(1 + fd6_ctx->vsc_draw_strm_pitch));
 
                OUT_PKT7(ring, CP_COND_WRITE5, 8);
                OUT_RING(ring, CP_COND_WRITE5_0_FUNCTION(WRITE_GE) |
                                CP_COND_WRITE5_0_WRITE_MEMORY);
-               OUT_RING(ring, CP_COND_WRITE5_1_POLL_ADDR_LO(REG_A6XX_VSC_SIZE2_REG(i)));
+               OUT_RING(ring, CP_COND_WRITE5_1_POLL_ADDR_LO(REG_A6XX_VSC_PRIM_STRM_SIZE_REG(i)));
                OUT_RING(ring, CP_COND_WRITE5_2_POLL_ADDR_HI(0));
-               OUT_RING(ring, CP_COND_WRITE5_3_REF(fd6_ctx->vsc_data2_pitch));
+               OUT_RING(ring, CP_COND_WRITE5_3_REF(fd6_ctx->vsc_prim_strm_pitch));
                OUT_RING(ring, CP_COND_WRITE5_4_MASK(~0));
-               OUT_RELOCW(ring, control_ptr(fd6_ctx, vsc_scratch));  /* WRITE_ADDR_LO/HI */
-               OUT_RING(ring, CP_COND_WRITE5_7_WRITE_DATA(3 + fd6_ctx->vsc_data2_pitch));
+               OUT_RELOC(ring, control_ptr(fd6_ctx, vsc_scratch));  /* WRITE_ADDR_LO/HI */
+               OUT_RING(ring, CP_COND_WRITE5_7_WRITE_DATA(3 + fd6_ctx->vsc_prim_strm_pitch));
        }
 
        OUT_PKT7(ring, CP_WAIT_MEM_WRITES, 0);
@@ -444,7 +468,7 @@ emit_vsc_overflow_test(struct fd_batch *batch)
 
        BEGIN_RING(ring, 10);  /* ensure if/else doesn't get split */
 
-       /* b0 will be set if VSC_DATA or VSC_DATA2 overflow: */
+       /* b0 will be set if VSC_DRAW_STRM or VSC_PRIM_STRM overflow: */
        OUT_PKT7(ring, CP_REG_TEST, 1);
        OUT_RING(ring, A6XX_CP_REG_TEST_0_REG(OVERFLOW_FLAG_REG) |
                        A6XX_CP_REG_TEST_0_BIT(0) |
@@ -463,7 +487,7 @@ emit_vsc_overflow_test(struct fd_batch *batch)
                OUT_PKT7(ring, CP_REG_TO_MEM, 3);
                OUT_RING(ring, CP_REG_TO_MEM_0_REG(OVERFLOW_FLAG_REG) |
                                CP_REG_TO_MEM_0_CNT(1 - 1));
-               OUT_RELOCW(ring, control_ptr(fd6_ctx, vsc_overflow));
+               OUT_RELOC(ring, control_ptr(fd6_ctx, vsc_overflow));
 
                OUT_PKT4(ring, OVERFLOW_FLAG_REG, 1);
                OUT_RING(ring, 0x0);
@@ -492,9 +516,9 @@ check_vsc_overflow(struct fd_context *ctx)
        unsigned size = vsc_overflow & ~0x3;
 
        if (buffer == 0x1) {
-               /* VSC_PIPE_DATA overflow: */
+               /* VSC_DRAW_STRM overflow: */
 
-               if (size < fd6_ctx->vsc_data_pitch) {
+               if (size < fd6_ctx->vsc_draw_strm_pitch) {
                        /* we've already increased the size, this overflow is
                         * from a batch submitted before resize, but executed
                         * after
@@ -502,25 +526,27 @@ check_vsc_overflow(struct fd_context *ctx)
                        return;
                }
 
-               fd_bo_del(fd6_ctx->vsc_data);
-               fd6_ctx->vsc_data = NULL;
-               fd6_ctx->vsc_data_pitch *= 2;
+               fd_bo_del(fd6_ctx->vsc_draw_strm);
+               fd6_ctx->vsc_draw_strm = NULL;
+               fd6_ctx->vsc_draw_strm_pitch *= 2;
 
-               debug_printf("resized VSC_DATA_PITCH to: 0x%x\n", fd6_ctx->vsc_data_pitch);
+               debug_printf("resized VSC_DRAW_STRM_PITCH to: 0x%x\n",
+                               fd6_ctx->vsc_draw_strm_pitch);
 
        } else if (buffer == 0x3) {
-               /* VSC_PIPE_DATA2 overflow: */
+               /* VSC_PRIM_STRM overflow: */
 
-               if (size < fd6_ctx->vsc_data2_pitch) {
+               if (size < fd6_ctx->vsc_prim_strm_pitch) {
                        /* we've already increased the size */
                        return;
                }
 
-               fd_bo_del(fd6_ctx->vsc_data2);
-               fd6_ctx->vsc_data2 = NULL;
-               fd6_ctx->vsc_data2_pitch *= 2;
+               fd_bo_del(fd6_ctx->vsc_prim_strm);
+               fd6_ctx->vsc_prim_strm = NULL;
+               fd6_ctx->vsc_prim_strm_pitch *= 2;
 
-               debug_printf("resized VSC_DATA2_PITCH to: 0x%x\n", fd6_ctx->vsc_data2_pitch);
+               debug_printf("resized VSC_PRIM_STRM_PITCH to: 0x%x\n",
+                               fd6_ctx->vsc_prim_strm_pitch);
 
        } else {
                /* NOTE: it's possible, for example, for overflow to corrupt the
@@ -600,14 +626,9 @@ emit_binning_pass(struct fd_batch *batch)
        const struct fd_gmem_stateobj *gmem = batch->gmem_state;
        struct fd6_context *fd6_ctx = fd6_context(batch->ctx);
 
-       uint32_t x1 = gmem->minx;
-       uint32_t y1 = gmem->miny;
-       uint32_t x2 = gmem->minx + gmem->width - 1;
-       uint32_t y2 = gmem->miny + gmem->height - 1;
-
        debug_assert(!batch->tessellation);
 
-       set_scissor(ring, x1, y1, x2, y2);
+       set_scissor(ring, 0, 0, gmem->width - 1, gmem->height - 1);
 
        emit_marker6(ring, 7);
        OUT_PKT7(ring, CP_SET_MARKER, 1);
@@ -865,12 +886,12 @@ fd6_emit_tile_prep(struct fd_batch *batch, const struct fd_tile *tile)
                        OUT_PKT7(ring, CP_SET_BIN_DATA5, 7);
                        OUT_RING(ring, CP_SET_BIN_DATA5_0_VSC_SIZE(pipe->w * pipe->h) |
                                        CP_SET_BIN_DATA5_0_VSC_N(tile->n));
-                       OUT_RELOC(ring, fd6_ctx->vsc_data,       /* VSC_PIPE[p].DATA_ADDRESS */
-                                       (tile->p * fd6_ctx->vsc_data_pitch), 0, 0);
-                       OUT_RELOC(ring, fd6_ctx->vsc_data,       /* VSC_SIZE_ADDRESS + (p * 4) */
-                                       (tile->p * 4) + (32 * fd6_ctx->vsc_data_pitch), 0, 0);
-                       OUT_RELOC(ring, fd6_ctx->vsc_data2,
-                                       (tile->p * fd6_ctx->vsc_data2_pitch), 0, 0);
+                       OUT_RELOC(ring, fd6_ctx->vsc_draw_strm,       /* per-pipe draw-stream address */
+                                       (tile->p * fd6_ctx->vsc_draw_strm_pitch), 0, 0);
+                       OUT_RELOC(ring, fd6_ctx->vsc_draw_strm,       /* VSC_DRAW_STRM_ADDRESS + (p * 4) */
+                                       (tile->p * 4) + (32 * fd6_ctx->vsc_draw_strm_pitch), 0, 0);
+                       OUT_RELOC(ring, fd6_ctx->vsc_prim_strm,
+                                       (tile->p * fd6_ctx->vsc_prim_strm_pitch), 0, 0);
 
                        OUT_PKT7(ring, CP_SET_VISIBILITY_OVERRIDE, 1);
                        OUT_RING(ring, 0x0);
@@ -903,13 +924,12 @@ fd6_emit_tile_prep(struct fd_batch *batch, const struct fd_tile *tile)
 static void
 set_blit_scissor(struct fd_batch *batch, struct fd_ringbuffer *ring)
 {
-       struct pipe_scissor_state blit_scissor;
-       struct pipe_framebuffer_state *pfb = &batch->framebuffer;
+       struct pipe_scissor_state blit_scissor = batch->max_scissor;
 
-       blit_scissor.minx = 0;
-       blit_scissor.miny = 0;
-       blit_scissor.maxx = align(pfb->width, batch->ctx->screen->gmem_alignw);
-       blit_scissor.maxy = align(pfb->height, batch->ctx->screen->gmem_alignh);
+       blit_scissor.minx = ROUND_DOWN_TO(blit_scissor.minx, 16);
+       blit_scissor.miny = ROUND_DOWN_TO(blit_scissor.miny, 4);
+       blit_scissor.maxx = ALIGN(blit_scissor.maxx, 16);
+       blit_scissor.maxy = ALIGN(blit_scissor.maxy, 4);
 
        OUT_PKT4(ring, REG_A6XX_RB_BLIT_SCISSOR_TL, 2);
        OUT_RING(ring,
@@ -1357,12 +1377,15 @@ fd6_emit_tile_fini(struct fd_batch *batch)
 {
        struct fd_ringbuffer *ring = batch->gmem;
 
+       if (batch->epilogue)
+               fd6_emit_ib(batch->gmem, batch->epilogue);
+
        OUT_PKT4(ring, REG_A6XX_GRAS_LRZ_CNTL, 1);
-       OUT_RING(ring, A6XX_GRAS_LRZ_CNTL_ENABLE | A6XX_GRAS_LRZ_CNTL_UNK3);
+       OUT_RING(ring, A6XX_GRAS_LRZ_CNTL_ENABLE);
 
        fd6_emit_lrz_flush(ring);
 
-       fd6_event_write(batch, ring, CACHE_FLUSH_TS, true);
+       fd6_event_write(batch, ring, PC_CCU_RESOLVE_TS, true);
 
        if (use_hw_binning(batch)) {
                check_vsc_overflow(batch->ctx);
@@ -1435,11 +1458,11 @@ setup_tess_buffers(struct fd_batch *batch, struct fd_ringbuffer *ring)
                        DRM_FREEDRENO_GEM_TYPE_KMEM, "tessparam");
 
        OUT_PKT4(ring, REG_A6XX_PC_TESSFACTOR_ADDR_LO, 2);
-       OUT_RELOCW(ring, batch->tessfactor_bo, 0, 0, 0);
+       OUT_RELOC(ring, batch->tessfactor_bo, 0, 0, 0);
 
        batch->tess_addrs_constobj->cur = batch->tess_addrs_constobj->start;
-       OUT_RELOCW(batch->tess_addrs_constobj, batch->tessparam_bo, 0, 0, 0);
-       OUT_RELOCW(batch->tess_addrs_constobj, batch->tessfactor_bo, 0, 0, 0);
+       OUT_RELOC(batch->tess_addrs_constobj, batch->tessparam_bo, 0, 0, 0);
+       OUT_RELOC(batch->tess_addrs_constobj, batch->tessfactor_bo, 0, 0, 0);
 }
 
 static void
@@ -1507,6 +1530,9 @@ fd6_emit_sysmem_fini(struct fd_batch *batch)
 {
        struct fd_ringbuffer *ring = batch->gmem;
 
+       if (batch->epilogue)
+               fd6_emit_ib(batch->gmem, batch->epilogue);
+
        OUT_PKT7(ring, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
        OUT_RING(ring, 0x0);