v3d: support rendering to multi-layered framebuffers

author Iago Toral Quiroga <itoral@igalia.com>

Tue, 29 Oct 2019 09:27:23 +0000 (10:27 +0100)

committer Iago Toral Quiroga <itoral@igalia.com>

Mon, 16 Dec 2019 07:42:37 +0000 (08:42 +0100)
author Iago Toral Quiroga <itoral@igalia.com>
Tue, 29 Oct 2019 09:27:23 +0000 (10:27 +0100)
committer Iago Toral Quiroga <itoral@igalia.com>
Mon, 16 Dec 2019 07:42:37 +0000 (08:42 +0100)
diff --git a/src/gallium/drivers/v3d/v3d_context.h b/src/gallium/drivers/v3d/v3d_context.h

index 738c1f82319ab3a194f74ff53173adce5b41ff35..f8d146847de714515b17778d2db7068170e96ee2 100644 (file)
--- a/src/gallium/drivers/v3d/v3d_context.h
+++ b/src/gallium/drivers/v3d/v3d_context.h
@@ -354,6 +354,8 @@ struct v3d_job {
          */
          uint32_t draw_width;
          uint32_t draw_height;
+        uint32_t num_layers;
+
          /** @} */
          /** @{ Tile information, depending on MSAA and float color buffer. */
          uint32_t draw_tiles_x; /** @< Number of tiles wide for framebuffer. */
diff --git a/src/gallium/drivers/v3d/v3dx_draw.c b/src/gallium/drivers/v3d/v3dx_draw.c

index d0f6f623e6cfcb2b5f5c3e544267bca41c036b6a..812ca3b94a763d77a04a7fe67fcf233be96af6bf 100644 (file)
--- a/src/gallium/drivers/v3d/v3dx_draw.c
+++ b/src/gallium/drivers/v3d/v3dx_draw.c
@@ -55,11 +55,14 @@ v3d_start_draw(struct v3d_context *v3d)
          job->submit.bcl_start = job->bcl.bo->offset;
          v3d_job_add_bo(job, job->bcl.bo);
  
+        uint32_t fb_layers = util_framebuffer_get_num_layers(&v3d->framebuffer);
+
          /* The PTB will request the tile alloc initial size per tile at start
           * of tile binning.
           */
-        uint32_t tile_alloc_size = (job->draw_tiles_x *
-                                    job->draw_tiles_y) * 64;
+        uint32_t tile_alloc_size =
+                MAX2(fb_layers, 1) * job->draw_tiles_x * job->draw_tiles_y * 64;
+
          /* The PTB allocates in aligned 4k chunks after the initial setup. */
          tile_alloc_size = align(tile_alloc_size, 4096);
  
@@ -79,10 +82,21 @@ v3d_start_draw(struct v3d_context *v3d)
                                         "tile_alloc");
          uint32_t tsda_per_tile_size = v3d->screen->devinfo.ver >= 40 ? 256 : 64;
          job->tile_state = v3d_bo_alloc(v3d->screen,
+                                       MAX2(fb_layers, 1) *
                                         job->draw_tiles_y *
                                         job->draw_tiles_x *
                                         tsda_per_tile_size,
                                         "TSDA");
+#if V3D_VERSION >= 41
+        /* This must go before the binning mode configuration. It is
+         * required for layered framebuffers to work.
+         */
+        if (fb_layers > 0) {
+                cl_emit(&job->bcl, NUMBER_OF_LAYERS, config) {
+                        config.number_of_layers = fb_layers;
+                }
+        }
+#endif
  
  #if V3D_VERSION >= 40
          cl_emit(&job->bcl, TILE_BINNING_MODE_CFG, config) {
@@ -137,6 +151,7 @@ v3d_start_draw(struct v3d_context *v3d)
          job->needs_flush = true;
          job->draw_width = v3d->framebuffer.width;
          job->draw_height = v3d->framebuffer.height;
+        job->num_layers = fb_layers;
  }
  
  static void
diff --git a/src/gallium/drivers/v3d/v3dx_rcl.c b/src/gallium/drivers/v3d/v3dx_rcl.c

index 20fd6211bde901e7880d5d3051a09a9c680f22c7..792b7582dfc6bc6944e9562fef1b648e1fa05586 100644 (file)
--- a/src/gallium/drivers/v3d/v3dx_rcl.c
+++ b/src/gallium/drivers/v3d/v3dx_rcl.c
@@ -53,7 +53,7 @@ flush_last_load(struct v3d_cl *cl)
  
  static void
  load_general(struct v3d_cl *cl, struct pipe_surface *psurf, int buffer,
-             uint32_t pipe_bit, uint32_t *loads_pending)
+             int layer, uint32_t pipe_bit, uint32_t *loads_pending)
  {
          struct v3d_surface *surf = v3d_surface(psurf);
          bool separate_stencil = surf->separate_stencil && buffer == STENCIL;
@@ -64,9 +64,12 @@ load_general(struct v3d_cl *cl, struct pipe_surface *psurf, int buffer,
  
          struct v3d_resource *rsc = v3d_resource(psurf->texture);
  
+        uint32_t layer_offset =
+                v3d_layer_offset(&rsc->base, psurf->u.tex.level,
+                                 psurf->u.tex.first_layer + layer);
          cl_emit(cl, LOAD_TILE_BUFFER_GENERAL, load) {
                  load.buffer_to_load = buffer;
-                load.address = cl_address(rsc->bo, surf->offset);
+                load.address = cl_address(rsc->bo, layer_offset);
  
  #if V3D_VERSION >= 40
                  load.memory_format = surf->tiling;
@@ -109,8 +112,9 @@ load_general(struct v3d_cl *cl, struct pipe_surface *psurf, int buffer,
  
  static void
  store_general(struct v3d_job *job,
-              struct v3d_cl *cl, struct pipe_surface *psurf, int buffer,
-              int pipe_bit, uint32_t *stores_pending, bool general_color_clear)
+              struct v3d_cl *cl, struct pipe_surface *psurf,
+              int layer, int buffer, int pipe_bit,
+              uint32_t *stores_pending, bool general_color_clear)
  {
          struct v3d_surface *surf = v3d_surface(psurf);
          bool separate_stencil = surf->separate_stencil && buffer == STENCIL;
@@ -126,9 +130,12 @@ store_general(struct v3d_job *job,
  
          rsc->writes++;
  
+        uint32_t layer_offset =
+                v3d_layer_offset(&rsc->base, psurf->u.tex.level,
+                                 psurf->u.tex.first_layer + layer);
          cl_emit(cl, STORE_TILE_BUFFER_GENERAL, store) {
                  store.buffer_to_store = buffer;
-                store.address = cl_address(rsc->bo, surf->offset);
+                store.address = cl_address(rsc->bo, layer_offset);
  
  #if V3D_VERSION >= 40
                  store.clear_buffer_being_stored = false;
@@ -203,7 +210,7 @@ zs_buffer_from_pipe_bits(int pipe_clear_bits)
  }
  
  static void
-v3d_rcl_emit_loads(struct v3d_job *job, struct v3d_cl *cl)
+v3d_rcl_emit_loads(struct v3d_job *job, struct v3d_cl *cl, int layer)
  {
          uint32_t loads_pending = job->load;
  
@@ -218,7 +225,7 @@ v3d_rcl_emit_loads(struct v3d_job *job, struct v3d_cl *cl)
                          continue;
                  }
  
-                load_general(cl, psurf, RENDER_TARGET_0 + i,
+                load_general(cl, psurf, RENDER_TARGET_0 + i, layer,
                               bit, &loads_pending);
          }
  
@@ -230,7 +237,7 @@ v3d_rcl_emit_loads(struct v3d_job *job, struct v3d_cl *cl)
                  if (rsc->separate_stencil &&
                      (loads_pending & PIPE_CLEAR_STENCIL)) {
                          load_general(cl, job->zsbuf,
-                                     STENCIL,
+                                     STENCIL, layer,
                                       PIPE_CLEAR_STENCIL,
                                       &loads_pending);
                  }
@@ -238,6 +245,7 @@ v3d_rcl_emit_loads(struct v3d_job *job, struct v3d_cl *cl)
                  if (loads_pending & PIPE_CLEAR_DEPTHSTENCIL) {
                          load_general(cl, job->zsbuf,
                                       zs_buffer_from_pipe_bits(loads_pending),
+                                     layer,
                                       loads_pending & PIPE_CLEAR_DEPTHSTENCIL,
                                       &loads_pending);
                  }
@@ -266,7 +274,7 @@ v3d_rcl_emit_loads(struct v3d_job *job, struct v3d_cl *cl)
  }
  
  static void
-v3d_rcl_emit_stores(struct v3d_job *job, struct v3d_cl *cl)
+v3d_rcl_emit_stores(struct v3d_job *job, struct v3d_cl *cl, int layer)
  {
  #if V3D_VERSION < 40
          UNUSED bool needs_color_clear = job->clear & PIPE_CLEAR_COLOR_BUFFERS;
@@ -316,7 +324,7 @@ v3d_rcl_emit_stores(struct v3d_job *job, struct v3d_cl *cl)
                          continue;
                  }
  
-                store_general(job, cl, psurf, RENDER_TARGET_0 + i, bit,
+                store_general(job, cl, psurf, layer, RENDER_TARGET_0 + i, bit,
                                &stores_pending, general_color_clear);
          }
  
@@ -325,20 +333,20 @@ v3d_rcl_emit_stores(struct v3d_job *job, struct v3d_cl *cl)
                  struct v3d_resource *rsc = v3d_resource(job->zsbuf->texture);
                  if (rsc->separate_stencil) {
                          if (job->store & PIPE_CLEAR_DEPTH) {
-                                store_general(job, cl, job->zsbuf, Z,
-                                              PIPE_CLEAR_DEPTH,
+                                store_general(job, cl, job->zsbuf, layer,
+                                              Z, PIPE_CLEAR_DEPTH,
                                                &stores_pending,
                                                general_color_clear);
                          }
  
                          if (job->store & PIPE_CLEAR_STENCIL) {
-                                store_general(job, cl, job->zsbuf, STENCIL,
-                                              PIPE_CLEAR_STENCIL,
+                                store_general(job, cl, job->zsbuf, layer,
+                                              STENCIL, PIPE_CLEAR_STENCIL,
                                                &stores_pending,
                                                general_color_clear);
                          }
                  } else {
-                        store_general(job, cl, job->zsbuf,
+                        store_general(job, cl, job->zsbuf, layer,
                                        zs_buffer_from_pipe_bits(job->store),
                                        job->store & PIPE_CLEAR_DEPTHSTENCIL,
                                        &stores_pending, general_color_clear);
@@ -400,7 +408,7 @@ v3d_rcl_emit_stores(struct v3d_job *job, struct v3d_cl *cl)
  }
  
  static void
-v3d_rcl_emit_generic_per_tile_list(struct v3d_job *job, int last_cbuf)
+v3d_rcl_emit_generic_per_tile_list(struct v3d_job *job, int layer)
  {
          /* Emit the generic list in our indirect state -- the rcl will just
           * have pointers into it.
@@ -416,7 +424,7 @@ v3d_rcl_emit_generic_per_tile_list(struct v3d_job *job, int last_cbuf)
                  cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords);
          }
  
-        v3d_rcl_emit_loads(job, cl);
+        v3d_rcl_emit_loads(job, cl, layer);
  
          if (V3D_VERSION < 40) {
                  /* Tile Coordinates triggers the last reload and sets where
@@ -434,7 +442,7 @@ v3d_rcl_emit_generic_per_tile_list(struct v3d_job *job, int last_cbuf)
  
          cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch);
  
-        v3d_rcl_emit_stores(job, cl);
+        v3d_rcl_emit_stores(job, cl, layer);
  
  #if V3D_VERSION >= 40
          cl_emit(cl, END_OF_TILE_MARKER, end);
@@ -501,7 +509,8 @@ v3dX(emit_rcl)(struct v3d_job *job)
          /* The RCL list should be empty. */
          assert(!job->rcl.bo);
  
-        v3d_cl_ensure_space_with_branch(&job->rcl, 200 + 256 *
+        v3d_cl_ensure_space_with_branch(&job->rcl, 200 +
+                                        MAX2(job->num_layers, 1) * 256 *
                                          cl_packet_length(SUPERTILE_COORDINATES));
          job->submit.rcl_start = job->rcl.bo->offset;
          v3d_job_add_bo(job, job->rcl.bo);
@@ -684,113 +693,125 @@ v3dX(emit_rcl)(struct v3d_job *job)
                          TILE_ALLOCATION_BLOCK_SIZE_64B;
          }
  
-        uint32_t supertile_w = 1, supertile_h = 1;
-
-        /* If doing multicore binning, we would need to initialize each core's
-         * tile list here.
+        /* ARB_framebuffer_no_attachments allows rendering to happen even when
+         * the framebuffer has no attachments, the idea being that fragment
+         * shaders can still do image load/store, ssbo, etc without having to
+         * write to actual attachments, so always run at least one iteration
+         * of the loop.
           */
-        cl_emit(&job->rcl, MULTICORE_RENDERING_TILE_LIST_SET_BASE, list) {
-                list.address = cl_address(job->tile_alloc, 0);
-        }
-
-        cl_emit(&job->rcl, MULTICORE_RENDERING_SUPERTILE_CFG, config) {
-                uint32_t frame_w_in_supertiles, frame_h_in_supertiles;
-                const uint32_t max_supertiles = 256;
-
-                /* Size up our supertiles until we get under the limit. */
-                for (;;) {
-                        frame_w_in_supertiles = div_round_up(job->draw_tiles_x,
-                                                             supertile_w);
-                        frame_h_in_supertiles = div_round_up(job->draw_tiles_y,
-                                                             supertile_h);
-                        if (frame_w_in_supertiles * frame_h_in_supertiles <
-                            max_supertiles) {
-                                break;
-                        }
+        assert(job->num_layers > 0 || (job->load == 0 && job->store == 0));
+        for (int layer = 0; layer < MAX2(1, job->num_layers); layer++) {
+                uint32_t supertile_w = 1, supertile_h = 1;
  
-                        if (supertile_w < supertile_h)
-                                supertile_w++;
-                        else
-                                supertile_h++;
+                /* If doing multicore binning, we would need to initialize each core's
+                 * tile list here.
+                 */
+                uint32_t tile_alloc_offset =
+                        layer * job->draw_tiles_x * job->draw_tiles_y * 64;
+                cl_emit(&job->rcl, MULTICORE_RENDERING_TILE_LIST_SET_BASE, list) {
+                        list.address =
+                                cl_address(job->tile_alloc, tile_alloc_offset);
                  }
  
-                config.number_of_bin_tile_lists = 1;
-                config.total_frame_width_in_tiles = job->draw_tiles_x;
-                config.total_frame_height_in_tiles = job->draw_tiles_y;
+                cl_emit(&job->rcl, MULTICORE_RENDERING_SUPERTILE_CFG, config) {
+                        uint32_t frame_w_in_supertiles, frame_h_in_supertiles;
+                        const uint32_t max_supertiles = 256;
+
+                        /* Size up our supertiles until we get under the limit. */
+                        for (;;) {
+                                frame_w_in_supertiles = div_round_up(job->draw_tiles_x,
+                                                                     supertile_w);
+                                frame_h_in_supertiles = div_round_up(job->draw_tiles_y,
+                                                                     supertile_h);
+                                if (frame_w_in_supertiles * frame_h_in_supertiles <
+                                    max_supertiles) {
+                                        break;
+                                }
+
+                                if (supertile_w < supertile_h)
+                                        supertile_w++;
+                                else
+                                        supertile_h++;
+                        }
  
-                config.supertile_width_in_tiles = supertile_w;
-                config.supertile_height_in_tiles = supertile_h;
+                        config.number_of_bin_tile_lists = 1;
+                        config.total_frame_width_in_tiles = job->draw_tiles_x;
+                        config.total_frame_height_in_tiles = job->draw_tiles_y;
  
-                config.total_frame_width_in_supertiles = frame_w_in_supertiles;
-                config.total_frame_height_in_supertiles = frame_h_in_supertiles;
-        }
+                        config.supertile_width_in_tiles = supertile_w;
+                        config.supertile_height_in_tiles = supertile_h;
  
-        /* Start by clearing the tile buffer. */
-        cl_emit(&job->rcl, TILE_COORDINATES, coords) {
-                coords.tile_column_number = 0;
-                coords.tile_row_number = 0;
-        }
+                        config.total_frame_width_in_supertiles = frame_w_in_supertiles;
+                        config.total_frame_height_in_supertiles = frame_h_in_supertiles;
+                }
  
-        /* Emit an initial clear of the tile buffers.  This is necessary for
-         * any buffers that should be cleared (since clearing normally happens
-         * at the *end* of the generic tile list), but it's also nice to clear
-         * everything so the first tile doesn't inherit any contents from some
-         * previous frame.
-         *
-         * Also, implement the GFXH-1742 workaround.  There's a race in the HW
-         * between the RCL updating the TLB's internal type/size and the
-         * spawning of the QPU instances using the TLB's current internal
-         * type/size.  To make sure the QPUs get the right state,, we need 1
-         * dummy store in between internal type/size changes on V3D 3.x, and 2
-         * dummy stores on 4.x.
-         */
+                /* Start by clearing the tile buffer. */
+                cl_emit(&job->rcl, TILE_COORDINATES, coords) {
+                        coords.tile_column_number = 0;
+                        coords.tile_row_number = 0;
+                }
+
+                /* Emit an initial clear of the tile buffers.  This is necessary for
+                 * any buffers that should be cleared (since clearing normally happens
+                 * at the *end* of the generic tile list), but it's also nice to clear
+                 * everything so the first tile doesn't inherit any contents from some
+                 * previous frame.
+                 *
+                 * Also, implement the GFXH-1742 workaround.  There's a race in the HW
+                 * between the RCL updating the TLB's internal type/size and the
+                 * spawning of the QPU instances using the TLB's current internal
+                 * type/size.  To make sure the QPUs get the right state, we need 1
+                 * dummy store in between internal type/size changes on V3D 3.x, and 2
+                 * dummy stores on 4.x.
+                 */
  #if V3D_VERSION < 40
-        cl_emit(&job->rcl, STORE_TILE_BUFFER_GENERAL, store) {
-                store.buffer_to_store = NONE;
-        }
-#else
-        for (int i = 0; i < 2; i++) {
-                if (i > 0)
-                        cl_emit(&job->rcl, TILE_COORDINATES, coords);
-                cl_emit(&job->rcl, END_OF_LOADS, end);
                  cl_emit(&job->rcl, STORE_TILE_BUFFER_GENERAL, store) {
                          store.buffer_to_store = NONE;
                  }
-                if (i == 0) {
-                        cl_emit(&job->rcl, CLEAR_TILE_BUFFERS, clear) {
-                                clear.clear_z_stencil_buffer = true;
-                                clear.clear_all_render_targets = true;
+#else
+                for (int i = 0; i < 2; i++) {
+                        if (i > 0)
+                                cl_emit(&job->rcl, TILE_COORDINATES, coords);
+                        cl_emit(&job->rcl, END_OF_LOADS, end);
+                        cl_emit(&job->rcl, STORE_TILE_BUFFER_GENERAL, store) {
+                                store.buffer_to_store = NONE;
                          }
+                        if (i == 0) {
+                                cl_emit(&job->rcl, CLEAR_TILE_BUFFERS, clear) {
+                                        clear.clear_z_stencil_buffer = true;
+                                        clear.clear_all_render_targets = true;
+                                }
+                        }
+                        cl_emit(&job->rcl, END_OF_TILE_MARKER, end);
                  }
-                cl_emit(&job->rcl, END_OF_TILE_MARKER, end);
-        }
  #endif
  
-        cl_emit(&job->rcl, FLUSH_VCD_CACHE, flush);
+                cl_emit(&job->rcl, FLUSH_VCD_CACHE, flush);
  
-        v3d_rcl_emit_generic_per_tile_list(job, nr_cbufs - 1);
+                v3d_rcl_emit_generic_per_tile_list(job, layer);
  
-        /* XXX perf: We should expose GL_MESA_tile_raster_order to improve X11
-         * performance, but we should use Morton order otherwise to improve
-         * cache locality.
-         */
-        uint32_t supertile_w_in_pixels = job->tile_width * supertile_w;
-        uint32_t supertile_h_in_pixels = job->tile_height * supertile_h;
-        uint32_t min_x_supertile = job->draw_min_x / supertile_w_in_pixels;
-        uint32_t min_y_supertile = job->draw_min_y / supertile_h_in_pixels;
-
-        uint32_t max_x_supertile = 0;
-        uint32_t max_y_supertile = 0;
-        if (job->draw_max_x != 0 && job->draw_max_y != 0) {
-                max_x_supertile = (job->draw_max_x - 1) / supertile_w_in_pixels;
-                max_y_supertile = (job->draw_max_y - 1) / supertile_h_in_pixels;
-        }
+                /* XXX perf: We should expose GL_MESA_tile_raster_order to improve X11
+                 * performance, but we should use Morton order otherwise to improve
+                 * cache locality.
+                 */
+                uint32_t supertile_w_in_pixels = job->tile_width * supertile_w;
+                uint32_t supertile_h_in_pixels = job->tile_height * supertile_h;
+                uint32_t min_x_supertile = job->draw_min_x / supertile_w_in_pixels;
+                uint32_t min_y_supertile = job->draw_min_y / supertile_h_in_pixels;
+
+                uint32_t max_x_supertile = 0;
+                uint32_t max_y_supertile = 0;
+                if (job->draw_max_x != 0 && job->draw_max_y != 0) {
+                        max_x_supertile = (job->draw_max_x - 1) / supertile_w_in_pixels;
+                        max_y_supertile = (job->draw_max_y - 1) / supertile_h_in_pixels;
+                }
  
-        for (int y = min_y_supertile; y <= max_y_supertile; y++) {
-                for (int x = min_x_supertile; x <= max_x_supertile; x++) {
-                        cl_emit(&job->rcl, SUPERTILE_COORDINATES, coords) {
-                                coords.column_number_in_supertiles = x;
-                                coords.row_number_in_supertiles = y;
+                for (int y = min_y_supertile; y <= max_y_supertile; y++) {
+                        for (int x = min_x_supertile; x <= max_x_supertile; x++) {
+                                cl_emit(&job->rcl, SUPERTILE_COORDINATES, coords) {
+                                        coords.column_number_in_supertiles = x;
+                                        coords.row_number_in_supertiles = y;
+                                }
                          }
                  }
          }
author	Iago Toral Quiroga <itoral@igalia.com>
	Tue, 29 Oct 2019 09:27:23 +0000 (10:27 +0100)
committer	Iago Toral Quiroga <itoral@igalia.com>
	Mon, 16 Dec 2019 07:42:37 +0000 (08:42 +0100)
src/gallium/drivers/v3d/v3d_context.h		patch | blob | history
src/gallium/drivers/v3d/v3dx_draw.c		patch | blob | history
src/gallium/drivers/v3d/v3dx_rcl.c		patch | blob | history