vc4: Optimize CL emits by doing size checks up front.
[mesa.git] / src / gallium / drivers / vc4 / vc4_context.c
index 08e85ed6312e538b8f240501d12d94f44563c07a..d4a9eec7b081b249f628150ba183ec755ffd1d0d 100644 (file)
@@ -23,9 +23,9 @@
 
 #include <xf86drm.h>
 #include <err.h>
-#include <stdio.h>
 
 #include "pipe/p_defines.h"
+#include "util/ralloc.h"
 #include "util/u_inlines.h"
 #include "util/u_memory.h"
 #include "util/u_blitter.h"
 #include "vc4_context.h"
 #include "vc4_resource.h"
 
+/**
+ * Emits a no-op STORE_TILE_BUFFER_GENERAL.
+ *
+ * If we emit a PACKET_TILE_COORDINATES, it must be followed by a store of
+ * some sort before another load is triggered.
+ */
 static void
-dump_fbo(struct vc4_context *vc4, struct vc4_bo *fbo)
+vc4_store_before_load(struct vc4_context *vc4, bool *coords_emitted)
 {
-#ifndef USE_VC4_SIMULATOR
-        uint32_t *map = vc4_bo_map(fbo);
+        if (!*coords_emitted)
+                return;
+
+        cl_u8(&vc4->rcl, VC4_PACKET_STORE_TILE_BUFFER_GENERAL);
+        cl_u8(&vc4->rcl, VC4_LOADSTORE_TILE_BUFFER_NONE);
+        cl_u8(&vc4->rcl, (VC4_STORE_TILE_BUFFER_DISABLE_COLOR_CLEAR |
+                          VC4_STORE_TILE_BUFFER_DISABLE_ZS_CLEAR |
+                          VC4_STORE_TILE_BUFFER_DISABLE_VG_MASK_CLEAR));
+        cl_u32(&vc4->rcl, 0); /* no address, since we're in None mode */
+
+        *coords_emitted = false;
+}
+
+/**
+ * Emits a PACKET_TILE_COORDINATES if one isn't already pending.
+ *
+ * The tile coordinates packet triggers a pending load if there is one.  The
+ * coordinates are used for clipping during rendering, and determine where
+ * loads/stores happen relative to their base address.
+ */
+static void
+vc4_tile_coordinates(struct vc4_context *vc4, uint32_t x, uint32_t y,
+                       bool *coords_emitted)
+{
+        if (*coords_emitted)
+                return;
+
+        cl_u8(&vc4->rcl, VC4_PACKET_TILE_COORDINATES);
+        cl_u8(&vc4->rcl, x);
+        cl_u8(&vc4->rcl, y);
+
+        *coords_emitted = true;
+}
+
+static void
+vc4_setup_rcl(struct vc4_context *vc4)
+{
+        struct vc4_surface *csurf = vc4_surface(vc4->framebuffer.cbufs[0]);
+        struct vc4_resource *ctex = csurf ? vc4_resource(csurf->base.texture) : NULL;
+        struct vc4_surface *zsurf = vc4_surface(vc4->framebuffer.zsbuf);
+        struct vc4_resource *ztex = zsurf ? vc4_resource(zsurf->base.texture) : NULL;
+
+        if (!csurf)
+                vc4->resolve &= ~PIPE_CLEAR_COLOR0;
+        if (!zsurf)
+                vc4->resolve &= ~(PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL);
+        uint32_t resolve_uncleared = vc4->resolve & ~vc4->cleared;
         uint32_t width = vc4->framebuffer.width;
         uint32_t height = vc4->framebuffer.height;
-        uint32_t chunk_w = width / 79;
-        uint32_t chunk_h = height / 40;
-        uint32_t found_colors[10];
-        uint32_t num_found_colors = 0;
-
-        for (int by = 0; by < height; by += chunk_h) {
-                for (int bx = 0; bx < width; bx += chunk_w) {
-                        bool on = false, black = false;
-
-                        for (int y = by; y < MIN2(height, by + chunk_h); y++) {
-                                for (int x = bx; x < MIN2(width, bx + chunk_w); x++) {
-                                        uint32_t pix = map[y * width + x];
-                                        on |= pix != 0;
-                                        black |= pix == 0xff000000;
-
-                                        int i;
-                                        for (i = 0; i < num_found_colors; i++) {
-                                                if (pix == found_colors[i])
-                                                        break;
-                                        }
-                                        if (i == num_found_colors &&
-                                            num_found_colors < Elements(found_colors))
-                                                found_colors[num_found_colors++] = pix;
+        uint32_t xtiles = align(width, 64) / 64;
+        uint32_t ytiles = align(height, 64) / 64;
+
+#if 0
+        fprintf(stderr, "RCL: resolve 0x%x clear 0x%x resolve uncleared 0x%x\n",
+                vc4->resolve,
+                vc4->cleared,
+                resolve_uncleared);
+#endif
+
+        uint32_t reloc_size = 9;
+        uint32_t clear_size = 14;
+        uint32_t config_size = 11 + reloc_size;
+        uint32_t loadstore_size = 7 + reloc_size;
+        uint32_t tilecoords_size = 3;
+        uint32_t branch_size = 5 + reloc_size;
+        uint32_t color_store_size = 1;
+        cl_ensure_space(&vc4->rcl,
+                        clear_size +
+                        config_size +
+                        loadstore_size +
+                        xtiles * ytiles * (loadstore_size * 4 +
+                                           tilecoords_size * 3 +
+                                           branch_size +
+                                           color_store_size));
+
+        cl_u8(&vc4->rcl, VC4_PACKET_CLEAR_COLORS);
+        cl_u32(&vc4->rcl, vc4->clear_color[0]);
+        cl_u32(&vc4->rcl, vc4->clear_color[1]);
+        cl_u32(&vc4->rcl, vc4->clear_depth);
+        cl_u8(&vc4->rcl, vc4->clear_stencil);
+
+        /* The rendering mode config determines the pointer that's used for
+         * VC4_PACKET_STORE_MS_TILE_BUFFER address computations.  The kernel
+         * could handle a no-relocation rendering mode config and deny those
+         * packets, but instead we just tell the kernel we're doing our color
+         * rendering to the Z buffer, and just don't emit any of those
+         * packets.
+         */
+        struct vc4_surface *render_surf = csurf ? csurf : zsurf;
+        struct vc4_resource *render_tex = vc4_resource(render_surf->base.texture);
+        cl_start_reloc(&vc4->rcl, 1);
+        cl_u8(&vc4->rcl, VC4_PACKET_TILE_RENDERING_MODE_CONFIG);
+        cl_reloc(vc4, &vc4->rcl, render_tex->bo, render_surf->offset);
+        cl_u16(&vc4->rcl, width);
+        cl_u16(&vc4->rcl, height);
+        cl_u16(&vc4->rcl, ((render_surf->tiling <<
+                            VC4_RENDER_CONFIG_MEMORY_FORMAT_SHIFT) |
+                           (vc4_rt_format_is_565(render_surf->base.format) ?
+                            VC4_RENDER_CONFIG_FORMAT_BGR565 :
+                            VC4_RENDER_CONFIG_FORMAT_RGBA8888) |
+                           VC4_RENDER_CONFIG_EARLY_Z_COVERAGE_DISABLE));
+
+        /* The tile buffer normally gets cleared when the previous tile is
+         * stored.  If the clear values changed between frames, then the tile
+         * buffer has stale clear values in it, so we have to do a store in
+         * None mode (no writes) so that we trigger the tile buffer clear.
+         *
+         * Excess clearing is only a performance cost, since per-tile contents
+         * will be loaded/stored in the loop below.
+         */
+        if (vc4->cleared & (PIPE_CLEAR_COLOR0 |
+                            PIPE_CLEAR_DEPTH |
+                            PIPE_CLEAR_STENCIL)) {
+                cl_u8(&vc4->rcl, VC4_PACKET_TILE_COORDINATES);
+                cl_u8(&vc4->rcl, 0);
+                cl_u8(&vc4->rcl, 0);
+
+                cl_u8(&vc4->rcl, VC4_PACKET_STORE_TILE_BUFFER_GENERAL);
+                cl_u16(&vc4->rcl, VC4_LOADSTORE_TILE_BUFFER_NONE);
+                cl_u32(&vc4->rcl, 0); /* no address, since we're in None mode */
+        }
+
+        uint32_t color_hindex = ctex ? vc4_gem_hindex(vc4, ctex->bo) : 0;
+        uint32_t depth_hindex = ztex ? vc4_gem_hindex(vc4, ztex->bo) : 0;
+        uint32_t tile_alloc_hindex = vc4_gem_hindex(vc4, vc4->tile_alloc);
+
+        for (int y = 0; y < ytiles; y++) {
+                for (int x = 0; x < xtiles; x++) {
+                        bool end_of_frame = (x == xtiles - 1 &&
+                                             y == ytiles - 1);
+                        bool coords_emitted = false;
+
+                        /* Note that the load doesn't actually occur until the
+                         * tile coords packet is processed, and only one load
+                         * may be outstanding at a time.
+                         */
+                        if (resolve_uncleared & PIPE_CLEAR_COLOR) {
+                                vc4_store_before_load(vc4, &coords_emitted);
+
+                                cl_start_reloc(&vc4->rcl, 1);
+                                cl_u8(&vc4->rcl, VC4_PACKET_LOAD_TILE_BUFFER_GENERAL);
+                                cl_u8(&vc4->rcl,
+                                      VC4_LOADSTORE_TILE_BUFFER_COLOR |
+                                      (csurf->tiling <<
+                                       VC4_LOADSTORE_TILE_BUFFER_FORMAT_SHIFT));
+                                cl_u8(&vc4->rcl,
+                                      vc4_rt_format_is_565(csurf->base.format) ?
+                                      VC4_LOADSTORE_TILE_BUFFER_BGR565 :
+                                      VC4_LOADSTORE_TILE_BUFFER_RGBA8888);
+                                cl_reloc_hindex(&vc4->rcl, color_hindex,
+                                                csurf->offset);
+
+                                vc4_tile_coordinates(vc4, x, y, &coords_emitted);
+                        }
+
+                        if (resolve_uncleared & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL)) {
+                                vc4_store_before_load(vc4, &coords_emitted);
+
+                                cl_start_reloc(&vc4->rcl, 1);
+                                cl_u8(&vc4->rcl, VC4_PACKET_LOAD_TILE_BUFFER_GENERAL);
+                                cl_u8(&vc4->rcl,
+                                      VC4_LOADSTORE_TILE_BUFFER_ZS |
+                                      (zsurf->tiling <<
+                                       VC4_LOADSTORE_TILE_BUFFER_FORMAT_SHIFT));
+                                cl_u8(&vc4->rcl, 0);
+                                cl_reloc_hindex(&vc4->rcl, depth_hindex,
+                                                zsurf->offset);
+
+                                vc4_tile_coordinates(vc4, x, y, &coords_emitted);
+                        }
+
+                        /* Clipping depends on tile coordinates having been
+                         * emitted, so make sure it's happened even if
+                         * everything was cleared to start.
+                         */
+                        vc4_tile_coordinates(vc4, x, y, &coords_emitted);
+
+                        /* Wait for the binner before jumping to the first
+                         * tile's lists.
+                         */
+                        if (x == 0 && y == 0)
+                                cl_u8(&vc4->rcl, VC4_PACKET_WAIT_ON_SEMAPHORE);
+
+                        cl_start_reloc(&vc4->rcl, 1);
+                        cl_u8(&vc4->rcl, VC4_PACKET_BRANCH_TO_SUB_LIST);
+                        cl_reloc_hindex(&vc4->rcl, tile_alloc_hindex,
+                                        (y * xtiles + x) * 32);
+
+                        if (vc4->resolve & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL)) {
+                                vc4_tile_coordinates(vc4, x, y, &coords_emitted);
+
+                                cl_start_reloc(&vc4->rcl, 1);
+                                cl_u8(&vc4->rcl, VC4_PACKET_STORE_TILE_BUFFER_GENERAL);
+                                cl_u8(&vc4->rcl,
+                                      VC4_LOADSTORE_TILE_BUFFER_ZS |
+                                      (zsurf->tiling <<
+                                       VC4_LOADSTORE_TILE_BUFFER_FORMAT_SHIFT));
+                                cl_u8(&vc4->rcl,
+                                      VC4_STORE_TILE_BUFFER_DISABLE_COLOR_CLEAR);
+                                cl_reloc_hindex(&vc4->rcl, depth_hindex,
+                                                zsurf->offset |
+                                                ((end_of_frame &&
+                                                  !(vc4->resolve & PIPE_CLEAR_COLOR0)) ?
+                                                 VC4_LOADSTORE_TILE_BUFFER_EOF : 0));
+
+                                coords_emitted = false;
+                        }
+
+                        if (vc4->resolve & PIPE_CLEAR_COLOR0) {
+                                vc4_tile_coordinates(vc4, x, y, &coords_emitted);
+                                if (end_of_frame) {
+                                        cl_u8(&vc4->rcl,
+                                              VC4_PACKET_STORE_MS_TILE_BUFFER_AND_EOF);
+                                } else {
+                                        cl_u8(&vc4->rcl,
+                                              VC4_PACKET_STORE_MS_TILE_BUFFER);
                                 }
+
+                                coords_emitted = false;
                         }
-                        if (black)
-                                fprintf(stderr, "O");
-                        else if (on)
-                                fprintf(stderr, "X");
-                        else
-                                fprintf(stderr, ".");
+
+                        /* One of the bits needs to have been set that would
+                         * have triggered an EOF.
+                         */
+                        assert(vc4->resolve & (PIPE_CLEAR_COLOR0 |
+                                               PIPE_CLEAR_DEPTH |
+                                               PIPE_CLEAR_STENCIL));
+                        /* Any coords emitted must also have been consumed by
+                         * a store.
+                         */
+                        assert(!coords_emitted);
                 }
-                fprintf(stderr, "\n");
         }
 
-        for (int i = 0; i < num_found_colors; i++) {
-                fprintf(stderr, "color %d: 0x%08x\n", i, found_colors[i]);
-        }
-#endif
+        if (vc4->resolve & PIPE_CLEAR_COLOR0)
+                ctex->writes++;
+
+        if (vc4->resolve & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL))
+                ztex->writes++;
 }
 
 void
@@ -92,8 +294,23 @@ vc4_flush(struct pipe_context *pctx)
         if (!vc4->needs_flush)
                 return;
 
-        struct vc4_surface *csurf = vc4_surface(vc4->framebuffer.cbufs[0]);
-        struct vc4_resource *ctex = vc4_resource(csurf->base.texture);
+        /* Increment the semaphore to indicate that binning is done and
+         * unblock the render thread.  Note that this doesn't take effect
+         * until the FLUSH completes.
+         */
+        cl_u8(&vc4->bcl, VC4_PACKET_INCREMENT_SEMAPHORE);
+        /* The FLUSH caps all of our bin lists with a VC4_PACKET_RETURN. */
+        cl_u8(&vc4->bcl, VC4_PACKET_FLUSH);
+
+        vc4_setup_rcl(vc4);
+
+        if (vc4_debug & VC4_DEBUG_CL) {
+                fprintf(stderr, "BCL:\n");
+                vc4_dump_cl(vc4->bcl.base, vc4->bcl.size, false);
+                fprintf(stderr, "RCL:\n");
+                vc4_dump_cl(vc4->rcl.base, vc4->rcl.size, true);
+        }
+
         struct drm_vc4_submit_cl submit;
         memset(&submit, 0, sizeof(submit));
 
@@ -101,14 +318,14 @@ vc4_flush(struct pipe_context *pctx)
         submit.bo_handle_count = (vc4->bo_handles.next -
                                   vc4->bo_handles.base) / 4;
         submit.bin_cl = vc4->bcl.base;
-        submit.bin_cl_len = vc4->bcl.next - vc4->bcl.base;
+        submit.bin_cl_size = vc4->bcl.next - vc4->bcl.base;
         submit.render_cl = vc4->rcl.base;
-        submit.render_cl_len = vc4->rcl.next - vc4->rcl.base;
-        submit.shader_records = vc4->shader_rec.base;
-        submit.shader_record_len = vc4->shader_rec.next - vc4->shader_rec.base;
-        submit.shader_record_count = vc4->shader_rec_count;
+        submit.render_cl_size = vc4->rcl.next - vc4->rcl.base;
+        submit.shader_rec = vc4->shader_rec.base;
+        submit.shader_rec_size = vc4->shader_rec.next - vc4->shader_rec.base;
+        submit.shader_rec_count = vc4->shader_rec_count;
         submit.uniforms = vc4->uniforms.base;
-        submit.uniforms_len = vc4->uniforms.next - vc4->uniforms.base;
+        submit.uniforms_size = vc4->uniforms.next - vc4->uniforms.base;
 
         if (!(vc4_debug & VC4_DEBUG_NORAST)) {
                 int ret;
@@ -116,10 +333,22 @@ vc4_flush(struct pipe_context *pctx)
 #ifndef USE_VC4_SIMULATOR
                 ret = drmIoctl(vc4->fd, DRM_IOCTL_VC4_SUBMIT_CL, &submit);
 #else
-                ret = vc4_simulator_flush(vc4, &submit, csurf);
+                ret = vc4_simulator_flush(vc4, &submit);
 #endif
-                if (ret)
-                        errx(1, "VC4 submit failed\n");
+                if (ret) {
+                        fprintf(stderr, "VC4 submit failed\n");
+                        abort();
+                }
+        }
+
+        vc4->last_emit_seqno = submit.seqno;
+
+        if (vc4_debug & VC4_DEBUG_ALWAYS_SYNC) {
+                if (!vc4_wait_seqno(vc4->screen, vc4->last_emit_seqno,
+                                    PIPE_TIMEOUT_INFINITE)) {
+                        fprintf(stderr, "Wait failed.\n");
+                        abort();
+                }
         }
 
         vc4_reset_cl(&vc4->bcl);
@@ -127,22 +356,85 @@ vc4_flush(struct pipe_context *pctx)
         vc4_reset_cl(&vc4->shader_rec);
         vc4_reset_cl(&vc4->uniforms);
         vc4_reset_cl(&vc4->bo_handles);
-#ifdef USE_VC4_SIMULATOR
+        struct vc4_bo **referenced_bos = vc4->bo_pointers.base;
+        for (int i = 0; i < submit.bo_handle_count; i++)
+                vc4_bo_unreference(&referenced_bos[i]);
         vc4_reset_cl(&vc4->bo_pointers);
-#endif
         vc4->shader_rec_count = 0;
 
         vc4->needs_flush = false;
-        vc4->dirty = ~0;
+        vc4->draw_call_queued = false;
 
-        dump_fbo(vc4, ctex->bo);
+        /* We have no hardware context saved between our draw calls, so we
+         * need to flag the next draw as needing all state emitted.  Emitting
+         * all state at the start of our draws is also what ensures that we
+         * return to the state we need after a previous tile has finished.
+         */
+        vc4->dirty = ~0;
+        vc4->resolve = 0;
+        vc4->cleared = 0;
 }
 
 static void
 vc4_pipe_flush(struct pipe_context *pctx, struct pipe_fence_handle **fence,
                unsigned flags)
 {
+        struct vc4_context *vc4 = vc4_context(pctx);
+
         vc4_flush(pctx);
+
+        if (fence) {
+                struct vc4_fence *f = vc4_fence_create(vc4->screen,
+                                                       vc4->last_emit_seqno);
+                *fence = (struct pipe_fence_handle *)f;
+        }
+}
+
+/**
+ * Returns whether the current command lists reference the given BO.
+ *
+ * This helps avoid flushing the command buffers when unnecessary.
+ */
+bool
+vc4_cl_references_bo(struct pipe_context *pctx, struct vc4_bo *bo)
+{
+        struct vc4_context *vc4 = vc4_context(pctx);
+
+        if (!vc4->needs_flush)
+                return false;
+
+        /* Walk all the referenced BOs in the drawing command list to see if
+         * they match.
+         */
+        struct vc4_bo **referenced_bos = vc4->bo_pointers.base;
+        for (int i = 0; i < (vc4->bo_handles.next -
+                             vc4->bo_handles.base) / 4; i++) {
+                if (referenced_bos[i] == bo) {
+                        return true;
+                }
+        }
+
+        /* Also check for the Z/color buffers, since the references to those
+         * are only added immediately before submit.
+         */
+        struct vc4_surface *csurf = vc4_surface(vc4->framebuffer.cbufs[0]);
+        if (csurf) {
+                struct vc4_resource *ctex = vc4_resource(csurf->base.texture);
+                if (ctex->bo == bo) {
+                        return true;
+                }
+        }
+
+        struct vc4_surface *zsurf = vc4_surface(vc4->framebuffer.zsbuf);
+        if (zsurf) {
+                struct vc4_resource *ztex =
+                        vc4_resource(zsurf->base.texture);
+                if (ztex->bo == bo) {
+                        return true;
+                }
+        }
+
+        return false;
 }
 
 static void
@@ -158,7 +450,14 @@ vc4_context_destroy(struct pipe_context *pctx)
 
         util_slab_destroy(&vc4->transfer_pool);
 
-        free(vc4);
+        pipe_surface_reference(&vc4->framebuffer.cbufs[0], NULL);
+        pipe_surface_reference(&vc4->framebuffer.zsbuf, NULL);
+        vc4_bo_unreference(&vc4->tile_alloc);
+        vc4_bo_unreference(&vc4->tile_state);
+
+        vc4_program_fini(pctx);
+
+        ralloc_free(vc4);
 }
 
 struct pipe_context *
@@ -171,7 +470,7 @@ vc4_context_create(struct pipe_screen *pscreen, void *priv)
         uint32_t saved_shaderdb_flag = vc4_debug & VC4_DEBUG_SHADERDB;
         vc4_debug &= ~VC4_DEBUG_SHADERDB;
 
-        vc4 = CALLOC_STRUCT(vc4_context);
+        vc4 = rzalloc(NULL, struct vc4_context);
         if (vc4 == NULL)
                 return NULL;
         struct pipe_context *pctx = &vc4->base;
@@ -186,24 +485,27 @@ vc4_context_create(struct pipe_screen *pscreen, void *priv)
         vc4_draw_init(pctx);
         vc4_state_init(pctx);
         vc4_program_init(pctx);
+        vc4_query_init(pctx);
         vc4_resource_context_init(pctx);
 
         vc4_init_cl(vc4, &vc4->bcl);
         vc4_init_cl(vc4, &vc4->rcl);
         vc4_init_cl(vc4, &vc4->shader_rec);
+        vc4_init_cl(vc4, &vc4->uniforms);
         vc4_init_cl(vc4, &vc4->bo_handles);
+        vc4_init_cl(vc4, &vc4->bo_pointers);
 
         vc4->dirty = ~0;
         vc4->fd = screen->fd;
 
-        util_slab_create(&vc4->transfer_pool, sizeof(struct pipe_transfer),
+        util_slab_create(&vc4->transfer_pool, sizeof(struct vc4_transfer),
                          16, UTIL_SLAB_SINGLETHREADED);
         vc4->blitter = util_blitter_create(pctx);
         if (!vc4->blitter)
                 goto fail;
 
         vc4->primconvert = util_primconvert_create(pctx,
-                                                   !((1 << PIPE_PRIM_QUADS) - 1));
+                                                   (1 << PIPE_PRIM_QUADS) - 1);
         if (!vc4->primconvert)
                 goto fail;