From: Iago Toral Quiroga <itoral@igalia.com>
Date: Tue, 29 Oct 2019 09:27:23 +0000 (+0100)
Subject: v3d: support rendering to multi-layered framebuffers
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=74a59fdc6e8c4f9c51454a6d1a5b8998083014b5;p=mesa.git

v3d: support rendering to multi-layered framebuffers

When doing layered rendering, the binning stage will prepare per-tile
lists for each layer in the framebuffer, so we need to make sure
we allocate enough space for them.

We also need to emit the NUMBER_OF_LAYERS packet. This is required
even when the number of layers is only 1, otherwise the simulator
detects buffer overflows in the tile_state BO during some CTS test
cases involving layered FBOs.

When rendering, we need to emit commands for each layer of the
framebuffer separately and make sure we address the correct layers for
each one.

v2: fixed typo in comment (Alejandro)

Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com>
---

diff --git a/src/gallium/drivers/v3d/v3d_context.h b/src/gallium/drivers/v3d/v3d_context.h
index 738c1f82319..f8d146847de 100644
--- a/src/gallium/drivers/v3d/v3d_context.h
+++ b/src/gallium/drivers/v3d/v3d_context.h
@@ -354,6 +354,8 @@ struct v3d_job {
         */
         uint32_t draw_width;
         uint32_t draw_height;
+        uint32_t num_layers;
+
         /** @} */
         /** @{ Tile information, depending on MSAA and float color buffer. */
         uint32_t draw_tiles_x; /** @< Number of tiles wide for framebuffer. */
diff --git a/src/gallium/drivers/v3d/v3dx_draw.c b/src/gallium/drivers/v3d/v3dx_draw.c
index d0f6f623e6c..812ca3b94a7 100644
--- a/src/gallium/drivers/v3d/v3dx_draw.c
+++ b/src/gallium/drivers/v3d/v3dx_draw.c
@@ -55,11 +55,14 @@ v3d_start_draw(struct v3d_context *v3d)
         job->submit.bcl_start = job->bcl.bo->offset;
         v3d_job_add_bo(job, job->bcl.bo);
 
+        uint32_t fb_layers = util_framebuffer_get_num_layers(&v3d->framebuffer);
+
         /* The PTB will request the tile alloc initial size per tile at start
          * of tile binning.
          */
-        uint32_t tile_alloc_size = (job->draw_tiles_x *
-                                    job->draw_tiles_y) * 64;
+        uint32_t tile_alloc_size =
+                MAX2(fb_layers, 1) * job->draw_tiles_x * job->draw_tiles_y * 64;
+
         /* The PTB allocates in aligned 4k chunks after the initial setup. */
         tile_alloc_size = align(tile_alloc_size, 4096);
 
@@ -79,10 +82,21 @@ v3d_start_draw(struct v3d_context *v3d)
                                        "tile_alloc");
         uint32_t tsda_per_tile_size = v3d->screen->devinfo.ver >= 40 ? 256 : 64;
         job->tile_state = v3d_bo_alloc(v3d->screen,
+                                       MAX2(fb_layers, 1) *
                                        job->draw_tiles_y *
                                        job->draw_tiles_x *
                                        tsda_per_tile_size,
                                        "TSDA");
+#if V3D_VERSION >= 41
+        /* This must go before the binning mode configuration. It is
+         * required for layered framebuffers to work.
+         */
+        if (fb_layers > 0) {
+                cl_emit(&job->bcl, NUMBER_OF_LAYERS, config) {
+                        config.number_of_layers = fb_layers;
+                }
+        }
+#endif
 
 #if V3D_VERSION >= 40
         cl_emit(&job->bcl, TILE_BINNING_MODE_CFG, config) {
@@ -137,6 +151,7 @@ v3d_start_draw(struct v3d_context *v3d)
         job->needs_flush = true;
         job->draw_width = v3d->framebuffer.width;
         job->draw_height = v3d->framebuffer.height;
+        job->num_layers = fb_layers;
 }
 
 static void
diff --git a/src/gallium/drivers/v3d/v3dx_rcl.c b/src/gallium/drivers/v3d/v3dx_rcl.c
index 20fd6211bde..792b7582dfc 100644
--- a/src/gallium/drivers/v3d/v3dx_rcl.c
+++ b/src/gallium/drivers/v3d/v3dx_rcl.c
@@ -53,7 +53,7 @@ flush_last_load(struct v3d_cl *cl)
 
 static void
 load_general(struct v3d_cl *cl, struct pipe_surface *psurf, int buffer,
-             uint32_t pipe_bit, uint32_t *loads_pending)
+             int layer, uint32_t pipe_bit, uint32_t *loads_pending)
 {
         struct v3d_surface *surf = v3d_surface(psurf);
         bool separate_stencil = surf->separate_stencil && buffer == STENCIL;
@@ -64,9 +64,12 @@ load_general(struct v3d_cl *cl, struct pipe_surface *psurf, int buffer,
 
         struct v3d_resource *rsc = v3d_resource(psurf->texture);
 
+        uint32_t layer_offset =
+                v3d_layer_offset(&rsc->base, psurf->u.tex.level,
+                                 psurf->u.tex.first_layer + layer);
         cl_emit(cl, LOAD_TILE_BUFFER_GENERAL, load) {
                 load.buffer_to_load = buffer;
-                load.address = cl_address(rsc->bo, surf->offset);
+                load.address = cl_address(rsc->bo, layer_offset);
 
 #if V3D_VERSION >= 40
                 load.memory_format = surf->tiling;
@@ -109,8 +112,9 @@ load_general(struct v3d_cl *cl, struct pipe_surface *psurf, int buffer,
 
 static void
 store_general(struct v3d_job *job,
-              struct v3d_cl *cl, struct pipe_surface *psurf, int buffer,
-              int pipe_bit, uint32_t *stores_pending, bool general_color_clear)
+              struct v3d_cl *cl, struct pipe_surface *psurf,
+              int layer, int buffer, int pipe_bit,
+              uint32_t *stores_pending, bool general_color_clear)
 {
         struct v3d_surface *surf = v3d_surface(psurf);
         bool separate_stencil = surf->separate_stencil && buffer == STENCIL;
@@ -126,9 +130,12 @@ store_general(struct v3d_job *job,
 
         rsc->writes++;
 
+        uint32_t layer_offset =
+                v3d_layer_offset(&rsc->base, psurf->u.tex.level,
+                                 psurf->u.tex.first_layer + layer);
         cl_emit(cl, STORE_TILE_BUFFER_GENERAL, store) {
                 store.buffer_to_store = buffer;
-                store.address = cl_address(rsc->bo, surf->offset);
+                store.address = cl_address(rsc->bo, layer_offset);
 
 #if V3D_VERSION >= 40
                 store.clear_buffer_being_stored = false;
@@ -203,7 +210,7 @@ zs_buffer_from_pipe_bits(int pipe_clear_bits)
 }
 
 static void
-v3d_rcl_emit_loads(struct v3d_job *job, struct v3d_cl *cl)
+v3d_rcl_emit_loads(struct v3d_job *job, struct v3d_cl *cl, int layer)
 {
         uint32_t loads_pending = job->load;
 
@@ -218,7 +225,7 @@ v3d_rcl_emit_loads(struct v3d_job *job, struct v3d_cl *cl)
                         continue;
                 }
 
-                load_general(cl, psurf, RENDER_TARGET_0 + i,
+                load_general(cl, psurf, RENDER_TARGET_0 + i, layer,
                              bit, &loads_pending);
         }
 
@@ -230,7 +237,7 @@ v3d_rcl_emit_loads(struct v3d_job *job, struct v3d_cl *cl)
                 if (rsc->separate_stencil &&
                     (loads_pending & PIPE_CLEAR_STENCIL)) {
                         load_general(cl, job->zsbuf,
-                                     STENCIL,
+                                     STENCIL, layer,
                                      PIPE_CLEAR_STENCIL,
                                      &loads_pending);
                 }
@@ -238,6 +245,7 @@ v3d_rcl_emit_loads(struct v3d_job *job, struct v3d_cl *cl)
                 if (loads_pending & PIPE_CLEAR_DEPTHSTENCIL) {
                         load_general(cl, job->zsbuf,
                                      zs_buffer_from_pipe_bits(loads_pending),
+                                     layer,
                                      loads_pending & PIPE_CLEAR_DEPTHSTENCIL,
                                      &loads_pending);
                 }
@@ -266,7 +274,7 @@ v3d_rcl_emit_loads(struct v3d_job *job, struct v3d_cl *cl)
 }
 
 static void
-v3d_rcl_emit_stores(struct v3d_job *job, struct v3d_cl *cl)
+v3d_rcl_emit_stores(struct v3d_job *job, struct v3d_cl *cl, int layer)
 {
 #if V3D_VERSION < 40
         UNUSED bool needs_color_clear = job->clear & PIPE_CLEAR_COLOR_BUFFERS;
@@ -316,7 +324,7 @@ v3d_rcl_emit_stores(struct v3d_job *job, struct v3d_cl *cl)
                         continue;
                 }
 
-                store_general(job, cl, psurf, RENDER_TARGET_0 + i, bit,
+                store_general(job, cl, psurf, layer, RENDER_TARGET_0 + i, bit,
                               &stores_pending, general_color_clear);
         }
 
@@ -325,20 +333,20 @@ v3d_rcl_emit_stores(struct v3d_job *job, struct v3d_cl *cl)
                 struct v3d_resource *rsc = v3d_resource(job->zsbuf->texture);
                 if (rsc->separate_stencil) {
                         if (job->store & PIPE_CLEAR_DEPTH) {
-                                store_general(job, cl, job->zsbuf, Z,
-                                              PIPE_CLEAR_DEPTH,
+                                store_general(job, cl, job->zsbuf, layer,
+                                              Z, PIPE_CLEAR_DEPTH,
                                               &stores_pending,
                                               general_color_clear);
                         }
 
                         if (job->store & PIPE_CLEAR_STENCIL) {
-                                store_general(job, cl, job->zsbuf, STENCIL,
-                                              PIPE_CLEAR_STENCIL,
+                                store_general(job, cl, job->zsbuf, layer,
+                                              STENCIL, PIPE_CLEAR_STENCIL,
                                               &stores_pending,
                                               general_color_clear);
                         }
                 } else {
-                        store_general(job, cl, job->zsbuf,
+                        store_general(job, cl, job->zsbuf, layer,
                                       zs_buffer_from_pipe_bits(job->store),
                                       job->store & PIPE_CLEAR_DEPTHSTENCIL,
                                       &stores_pending, general_color_clear);
@@ -400,7 +408,7 @@ v3d_rcl_emit_stores(struct v3d_job *job, struct v3d_cl *cl)
 }
 
 static void
-v3d_rcl_emit_generic_per_tile_list(struct v3d_job *job, int last_cbuf)
+v3d_rcl_emit_generic_per_tile_list(struct v3d_job *job, int layer)
 {
         /* Emit the generic list in our indirect state -- the rcl will just
          * have pointers into it.
@@ -416,7 +424,7 @@ v3d_rcl_emit_generic_per_tile_list(struct v3d_job *job, int last_cbuf)
                 cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords);
         }
 
-        v3d_rcl_emit_loads(job, cl);
+        v3d_rcl_emit_loads(job, cl, layer);
 
         if (V3D_VERSION < 40) {
                 /* Tile Coordinates triggers the last reload and sets where
@@ -434,7 +442,7 @@ v3d_rcl_emit_generic_per_tile_list(struct v3d_job *job, int last_cbuf)
 
         cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch);
 
-        v3d_rcl_emit_stores(job, cl);
+        v3d_rcl_emit_stores(job, cl, layer);
 
 #if V3D_VERSION >= 40
         cl_emit(cl, END_OF_TILE_MARKER, end);
@@ -501,7 +509,8 @@ v3dX(emit_rcl)(struct v3d_job *job)
         /* The RCL list should be empty. */
         assert(!job->rcl.bo);
 
-        v3d_cl_ensure_space_with_branch(&job->rcl, 200 + 256 *
+        v3d_cl_ensure_space_with_branch(&job->rcl, 200 +
+                                        MAX2(job->num_layers, 1) * 256 *
                                         cl_packet_length(SUPERTILE_COORDINATES));
         job->submit.rcl_start = job->rcl.bo->offset;
         v3d_job_add_bo(job, job->rcl.bo);
@@ -684,113 +693,125 @@ v3dX(emit_rcl)(struct v3d_job *job)
                         TILE_ALLOCATION_BLOCK_SIZE_64B;
         }
 
-        uint32_t supertile_w = 1, supertile_h = 1;
-
-        /* If doing multicore binning, we would need to initialize each core's
-         * tile list here.
+        /* ARB_framebuffer_no_attachments allows rendering to happen even when
+         * the framebuffer has no attachments, the idea being that fragment
+         * shaders can still do image load/store, ssbo, etc without having to
+         * write to actual attachments, so always run at least one iteration
+         * of the loop.
          */
-        cl_emit(&job->rcl, MULTICORE_RENDERING_TILE_LIST_SET_BASE, list) {
-                list.address = cl_address(job->tile_alloc, 0);
-        }
-
-        cl_emit(&job->rcl, MULTICORE_RENDERING_SUPERTILE_CFG, config) {
-                uint32_t frame_w_in_supertiles, frame_h_in_supertiles;
-                const uint32_t max_supertiles = 256;
-
-                /* Size up our supertiles until we get under the limit. */
-                for (;;) {
-                        frame_w_in_supertiles = div_round_up(job->draw_tiles_x,
-                                                             supertile_w);
-                        frame_h_in_supertiles = div_round_up(job->draw_tiles_y,
-                                                             supertile_h);
-                        if (frame_w_in_supertiles * frame_h_in_supertiles <
-                            max_supertiles) {
-                                break;
-                        }
+        assert(job->num_layers > 0 || (job->load == 0 && job->store == 0));
+        for (int layer = 0; layer < MAX2(1, job->num_layers); layer++) {
+                uint32_t supertile_w = 1, supertile_h = 1;
 
-                        if (supertile_w < supertile_h)
-                                supertile_w++;
-                        else
-                                supertile_h++;
+                /* If doing multicore binning, we would need to initialize each core's
+                 * tile list here.
+                 */
+                uint32_t tile_alloc_offset =
+                        layer * job->draw_tiles_x * job->draw_tiles_y * 64;
+                cl_emit(&job->rcl, MULTICORE_RENDERING_TILE_LIST_SET_BASE, list) {
+                        list.address =
+                                cl_address(job->tile_alloc, tile_alloc_offset);
                 }
 
-                config.number_of_bin_tile_lists = 1;
-                config.total_frame_width_in_tiles = job->draw_tiles_x;
-                config.total_frame_height_in_tiles = job->draw_tiles_y;
+                cl_emit(&job->rcl, MULTICORE_RENDERING_SUPERTILE_CFG, config) {
+                        uint32_t frame_w_in_supertiles, frame_h_in_supertiles;
+                        const uint32_t max_supertiles = 256;
+
+                        /* Size up our supertiles until we get under the limit. */
+                        for (;;) {
+                                frame_w_in_supertiles = div_round_up(job->draw_tiles_x,
+                                                                     supertile_w);
+                                frame_h_in_supertiles = div_round_up(job->draw_tiles_y,
+                                                                     supertile_h);
+                                if (frame_w_in_supertiles * frame_h_in_supertiles <
+                                    max_supertiles) {
+                                        break;
+                                }
+
+                                if (supertile_w < supertile_h)
+                                        supertile_w++;
+                                else
+                                        supertile_h++;
+                        }
 
-                config.supertile_width_in_tiles = supertile_w;
-                config.supertile_height_in_tiles = supertile_h;
+                        config.number_of_bin_tile_lists = 1;
+                        config.total_frame_width_in_tiles = job->draw_tiles_x;
+                        config.total_frame_height_in_tiles = job->draw_tiles_y;
 
-                config.total_frame_width_in_supertiles = frame_w_in_supertiles;
-                config.total_frame_height_in_supertiles = frame_h_in_supertiles;
-        }
+                        config.supertile_width_in_tiles = supertile_w;
+                        config.supertile_height_in_tiles = supertile_h;
 
-        /* Start by clearing the tile buffer. */
-        cl_emit(&job->rcl, TILE_COORDINATES, coords) {
-                coords.tile_column_number = 0;
-                coords.tile_row_number = 0;
-        }
+                        config.total_frame_width_in_supertiles = frame_w_in_supertiles;
+                        config.total_frame_height_in_supertiles = frame_h_in_supertiles;
+                }
 
-        /* Emit an initial clear of the tile buffers.  This is necessary for
-         * any buffers that should be cleared (since clearing normally happens
-         * at the *end* of the generic tile list), but it's also nice to clear
-         * everything so the first tile doesn't inherit any contents from some
-         * previous frame.
-         *
-         * Also, implement the GFXH-1742 workaround.  There's a race in the HW
-         * between the RCL updating the TLB's internal type/size and the
-         * spawning of the QPU instances using the TLB's current internal
-         * type/size.  To make sure the QPUs get the right state,, we need 1
-         * dummy store in between internal type/size changes on V3D 3.x, and 2
-         * dummy stores on 4.x.
-         */
+                /* Start by clearing the tile buffer. */
+                cl_emit(&job->rcl, TILE_COORDINATES, coords) {
+                        coords.tile_column_number = 0;
+                        coords.tile_row_number = 0;
+                }
+
+                /* Emit an initial clear of the tile buffers.  This is necessary for
+                 * any buffers that should be cleared (since clearing normally happens
+                 * at the *end* of the generic tile list), but it's also nice to clear
+                 * everything so the first tile doesn't inherit any contents from some
+                 * previous frame.
+                 *
+                 * Also, implement the GFXH-1742 workaround.  There's a race in the HW
+                 * between the RCL updating the TLB's internal type/size and the
+                 * spawning of the QPU instances using the TLB's current internal
+                 * type/size.  To make sure the QPUs get the right state, we need 1
+                 * dummy store in between internal type/size changes on V3D 3.x, and 2
+                 * dummy stores on 4.x.
+                 */
 #if V3D_VERSION < 40
-        cl_emit(&job->rcl, STORE_TILE_BUFFER_GENERAL, store) {
-                store.buffer_to_store = NONE;
-        }
-#else
-        for (int i = 0; i < 2; i++) {
-                if (i > 0)
-                        cl_emit(&job->rcl, TILE_COORDINATES, coords);
-                cl_emit(&job->rcl, END_OF_LOADS, end);
                 cl_emit(&job->rcl, STORE_TILE_BUFFER_GENERAL, store) {
                         store.buffer_to_store = NONE;
                 }
-                if (i == 0) {
-                        cl_emit(&job->rcl, CLEAR_TILE_BUFFERS, clear) {
-                                clear.clear_z_stencil_buffer = true;
-                                clear.clear_all_render_targets = true;
+#else
+                for (int i = 0; i < 2; i++) {
+                        if (i > 0)
+                                cl_emit(&job->rcl, TILE_COORDINATES, coords);
+                        cl_emit(&job->rcl, END_OF_LOADS, end);
+                        cl_emit(&job->rcl, STORE_TILE_BUFFER_GENERAL, store) {
+                                store.buffer_to_store = NONE;
                         }
+                        if (i == 0) {
+                                cl_emit(&job->rcl, CLEAR_TILE_BUFFERS, clear) {
+                                        clear.clear_z_stencil_buffer = true;
+                                        clear.clear_all_render_targets = true;
+                                }
+                        }
+                        cl_emit(&job->rcl, END_OF_TILE_MARKER, end);
                 }
-                cl_emit(&job->rcl, END_OF_TILE_MARKER, end);
-        }
 #endif
 
-        cl_emit(&job->rcl, FLUSH_VCD_CACHE, flush);
+                cl_emit(&job->rcl, FLUSH_VCD_CACHE, flush);
 
-        v3d_rcl_emit_generic_per_tile_list(job, nr_cbufs - 1);
+                v3d_rcl_emit_generic_per_tile_list(job, layer);
 
-        /* XXX perf: We should expose GL_MESA_tile_raster_order to improve X11
-         * performance, but we should use Morton order otherwise to improve
-         * cache locality.
-         */
-        uint32_t supertile_w_in_pixels = job->tile_width * supertile_w;
-        uint32_t supertile_h_in_pixels = job->tile_height * supertile_h;
-        uint32_t min_x_supertile = job->draw_min_x / supertile_w_in_pixels;
-        uint32_t min_y_supertile = job->draw_min_y / supertile_h_in_pixels;
-
-        uint32_t max_x_supertile = 0;
-        uint32_t max_y_supertile = 0;
-        if (job->draw_max_x != 0 && job->draw_max_y != 0) {
-                max_x_supertile = (job->draw_max_x - 1) / supertile_w_in_pixels;
-                max_y_supertile = (job->draw_max_y - 1) / supertile_h_in_pixels;
-        }
+                /* XXX perf: We should expose GL_MESA_tile_raster_order to improve X11
+                 * performance, but we should use Morton order otherwise to improve
+                 * cache locality.
+                 */
+                uint32_t supertile_w_in_pixels = job->tile_width * supertile_w;
+                uint32_t supertile_h_in_pixels = job->tile_height * supertile_h;
+                uint32_t min_x_supertile = job->draw_min_x / supertile_w_in_pixels;
+                uint32_t min_y_supertile = job->draw_min_y / supertile_h_in_pixels;
+
+                uint32_t max_x_supertile = 0;
+                uint32_t max_y_supertile = 0;
+                if (job->draw_max_x != 0 && job->draw_max_y != 0) {
+                        max_x_supertile = (job->draw_max_x - 1) / supertile_w_in_pixels;
+                        max_y_supertile = (job->draw_max_y - 1) / supertile_h_in_pixels;
+                }
 
-        for (int y = min_y_supertile; y <= max_y_supertile; y++) {
-                for (int x = min_x_supertile; x <= max_x_supertile; x++) {
-                        cl_emit(&job->rcl, SUPERTILE_COORDINATES, coords) {
-                                coords.column_number_in_supertiles = x;
-                                coords.row_number_in_supertiles = y;
+                for (int y = min_y_supertile; y <= max_y_supertile; y++) {
+                        for (int x = min_x_supertile; x <= max_x_supertile; x++) {
+                                cl_emit(&job->rcl, SUPERTILE_COORDINATES, coords) {
+                                        coords.column_number_in_supertiles = x;
+                                        coords.row_number_in_supertiles = y;
+                                }
                         }
                 }
         }