static void
load_general(struct v3d_cl *cl, struct pipe_surface *psurf, int buffer,
- uint32_t pipe_bit, uint32_t *loads_pending)
+ int layer, uint32_t pipe_bit, uint32_t *loads_pending)
{
struct v3d_surface *surf = v3d_surface(psurf);
bool separate_stencil = surf->separate_stencil && buffer == STENCIL;
struct v3d_resource *rsc = v3d_resource(psurf->texture);
+ uint32_t layer_offset =
+ v3d_layer_offset(&rsc->base, psurf->u.tex.level,
+ psurf->u.tex.first_layer + layer);
cl_emit(cl, LOAD_TILE_BUFFER_GENERAL, load) {
load.buffer_to_load = buffer;
- load.address = cl_address(rsc->bo, surf->offset);
+ load.address = cl_address(rsc->bo, layer_offset);
#if V3D_VERSION >= 40
load.memory_format = surf->tiling;
static void
store_general(struct v3d_job *job,
- struct v3d_cl *cl, struct pipe_surface *psurf, int buffer,
- int pipe_bit, uint32_t *stores_pending, bool general_color_clear)
+ struct v3d_cl *cl, struct pipe_surface *psurf,
+ int layer, int buffer, int pipe_bit,
+ uint32_t *stores_pending, bool general_color_clear)
{
struct v3d_surface *surf = v3d_surface(psurf);
bool separate_stencil = surf->separate_stencil && buffer == STENCIL;
rsc->writes++;
+ uint32_t layer_offset =
+ v3d_layer_offset(&rsc->base, psurf->u.tex.level,
+ psurf->u.tex.first_layer + layer);
cl_emit(cl, STORE_TILE_BUFFER_GENERAL, store) {
store.buffer_to_store = buffer;
- store.address = cl_address(rsc->bo, surf->offset);
+ store.address = cl_address(rsc->bo, layer_offset);
#if V3D_VERSION >= 40
store.clear_buffer_being_stored = false;
}
static void
-v3d_rcl_emit_loads(struct v3d_job *job, struct v3d_cl *cl)
+v3d_rcl_emit_loads(struct v3d_job *job, struct v3d_cl *cl, int layer)
{
uint32_t loads_pending = job->load;
continue;
}
- load_general(cl, psurf, RENDER_TARGET_0 + i,
+ load_general(cl, psurf, RENDER_TARGET_0 + i, layer,
bit, &loads_pending);
}
if (rsc->separate_stencil &&
(loads_pending & PIPE_CLEAR_STENCIL)) {
load_general(cl, job->zsbuf,
- STENCIL,
+ STENCIL, layer,
PIPE_CLEAR_STENCIL,
&loads_pending);
}
if (loads_pending & PIPE_CLEAR_DEPTHSTENCIL) {
load_general(cl, job->zsbuf,
zs_buffer_from_pipe_bits(loads_pending),
+ layer,
loads_pending & PIPE_CLEAR_DEPTHSTENCIL,
&loads_pending);
}
}
static void
-v3d_rcl_emit_stores(struct v3d_job *job, struct v3d_cl *cl)
+v3d_rcl_emit_stores(struct v3d_job *job, struct v3d_cl *cl, int layer)
{
#if V3D_VERSION < 40
UNUSED bool needs_color_clear = job->clear & PIPE_CLEAR_COLOR_BUFFERS;
continue;
}
- store_general(job, cl, psurf, RENDER_TARGET_0 + i, bit,
+ store_general(job, cl, psurf, layer, RENDER_TARGET_0 + i, bit,
&stores_pending, general_color_clear);
}
struct v3d_resource *rsc = v3d_resource(job->zsbuf->texture);
if (rsc->separate_stencil) {
if (job->store & PIPE_CLEAR_DEPTH) {
- store_general(job, cl, job->zsbuf, Z,
- PIPE_CLEAR_DEPTH,
+ store_general(job, cl, job->zsbuf, layer,
+ Z, PIPE_CLEAR_DEPTH,
&stores_pending,
general_color_clear);
}
if (job->store & PIPE_CLEAR_STENCIL) {
- store_general(job, cl, job->zsbuf, STENCIL,
- PIPE_CLEAR_STENCIL,
+ store_general(job, cl, job->zsbuf, layer,
+ STENCIL, PIPE_CLEAR_STENCIL,
&stores_pending,
general_color_clear);
}
} else {
- store_general(job, cl, job->zsbuf,
+ store_general(job, cl, job->zsbuf, layer,
zs_buffer_from_pipe_bits(job->store),
job->store & PIPE_CLEAR_DEPTHSTENCIL,
&stores_pending, general_color_clear);
}
static void
-v3d_rcl_emit_generic_per_tile_list(struct v3d_job *job, int last_cbuf)
+v3d_rcl_emit_generic_per_tile_list(struct v3d_job *job, int layer)
{
/* Emit the generic list in our indirect state -- the rcl will just
* have pointers into it.
cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords);
}
- v3d_rcl_emit_loads(job, cl);
+ v3d_rcl_emit_loads(job, cl, layer);
if (V3D_VERSION < 40) {
/* Tile Coordinates triggers the last reload and sets where
cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch);
- v3d_rcl_emit_stores(job, cl);
+ v3d_rcl_emit_stores(job, cl, layer);
#if V3D_VERSION >= 40
cl_emit(cl, END_OF_TILE_MARKER, end);
/* The RCL list should be empty. */
assert(!job->rcl.bo);
- v3d_cl_ensure_space_with_branch(&job->rcl, 200 + 256 *
+ v3d_cl_ensure_space_with_branch(&job->rcl, 200 +
+ MAX2(job->num_layers, 1) * 256 *
cl_packet_length(SUPERTILE_COORDINATES));
job->submit.rcl_start = job->rcl.bo->offset;
v3d_job_add_bo(job, job->rcl.bo);
TILE_ALLOCATION_BLOCK_SIZE_64B;
}
- uint32_t supertile_w = 1, supertile_h = 1;
-
- /* If doing multicore binning, we would need to initialize each core's
- * tile list here.
+ /* ARB_framebuffer_no_attachments allows rendering to happen even when
+ * the framebuffer has no attachments, the idea being that fragment
+ * shaders can still do image load/store, ssbo, etc without having to
+ * write to actual attachments, so always run at least one iteration
+ * of the loop.
*/
- cl_emit(&job->rcl, MULTICORE_RENDERING_TILE_LIST_SET_BASE, list) {
- list.address = cl_address(job->tile_alloc, 0);
- }
-
- cl_emit(&job->rcl, MULTICORE_RENDERING_SUPERTILE_CFG, config) {
- uint32_t frame_w_in_supertiles, frame_h_in_supertiles;
- const uint32_t max_supertiles = 256;
-
- /* Size up our supertiles until we get under the limit. */
- for (;;) {
- frame_w_in_supertiles = div_round_up(job->draw_tiles_x,
- supertile_w);
- frame_h_in_supertiles = div_round_up(job->draw_tiles_y,
- supertile_h);
- if (frame_w_in_supertiles * frame_h_in_supertiles <
- max_supertiles) {
- break;
- }
+ assert(job->num_layers > 0 || (job->load == 0 && job->store == 0));
+ for (int layer = 0; layer < MAX2(1, job->num_layers); layer++) {
+ uint32_t supertile_w = 1, supertile_h = 1;
- if (supertile_w < supertile_h)
- supertile_w++;
- else
- supertile_h++;
+ /* If doing multicore binning, we would need to initialize each core's
+ * tile list here.
+ */
+ uint32_t tile_alloc_offset =
+ layer * job->draw_tiles_x * job->draw_tiles_y * 64;
+ cl_emit(&job->rcl, MULTICORE_RENDERING_TILE_LIST_SET_BASE, list) {
+ list.address =
+ cl_address(job->tile_alloc, tile_alloc_offset);
}
- config.number_of_bin_tile_lists = 1;
- config.total_frame_width_in_tiles = job->draw_tiles_x;
- config.total_frame_height_in_tiles = job->draw_tiles_y;
+ cl_emit(&job->rcl, MULTICORE_RENDERING_SUPERTILE_CFG, config) {
+ uint32_t frame_w_in_supertiles, frame_h_in_supertiles;
+ const uint32_t max_supertiles = 256;
+
+ /* Size up our supertiles until we get under the limit. */
+ for (;;) {
+ frame_w_in_supertiles = div_round_up(job->draw_tiles_x,
+ supertile_w);
+ frame_h_in_supertiles = div_round_up(job->draw_tiles_y,
+ supertile_h);
+ if (frame_w_in_supertiles * frame_h_in_supertiles <
+ max_supertiles) {
+ break;
+ }
+
+ if (supertile_w < supertile_h)
+ supertile_w++;
+ else
+ supertile_h++;
+ }
- config.supertile_width_in_tiles = supertile_w;
- config.supertile_height_in_tiles = supertile_h;
+ config.number_of_bin_tile_lists = 1;
+ config.total_frame_width_in_tiles = job->draw_tiles_x;
+ config.total_frame_height_in_tiles = job->draw_tiles_y;
- config.total_frame_width_in_supertiles = frame_w_in_supertiles;
- config.total_frame_height_in_supertiles = frame_h_in_supertiles;
- }
+ config.supertile_width_in_tiles = supertile_w;
+ config.supertile_height_in_tiles = supertile_h;
- /* Start by clearing the tile buffer. */
- cl_emit(&job->rcl, TILE_COORDINATES, coords) {
- coords.tile_column_number = 0;
- coords.tile_row_number = 0;
- }
+ config.total_frame_width_in_supertiles = frame_w_in_supertiles;
+ config.total_frame_height_in_supertiles = frame_h_in_supertiles;
+ }
- /* Emit an initial clear of the tile buffers. This is necessary for
- * any buffers that should be cleared (since clearing normally happens
- * at the *end* of the generic tile list), but it's also nice to clear
- * everything so the first tile doesn't inherit any contents from some
- * previous frame.
- *
- * Also, implement the GFXH-1742 workaround. There's a race in the HW
- * between the RCL updating the TLB's internal type/size and the
- * spawning of the QPU instances using the TLB's current internal
- * type/size. To make sure the QPUs get the right state,, we need 1
- * dummy store in between internal type/size changes on V3D 3.x, and 2
- * dummy stores on 4.x.
- */
+ /* Start by clearing the tile buffer. */
+ cl_emit(&job->rcl, TILE_COORDINATES, coords) {
+ coords.tile_column_number = 0;
+ coords.tile_row_number = 0;
+ }
+
+ /* Emit an initial clear of the tile buffers. This is necessary for
+ * any buffers that should be cleared (since clearing normally happens
+ * at the *end* of the generic tile list), but it's also nice to clear
+ * everything so the first tile doesn't inherit any contents from some
+ * previous frame.
+ *
+ * Also, implement the GFXH-1742 workaround. There's a race in the HW
+ * between the RCL updating the TLB's internal type/size and the
+ * spawning of the QPU instances using the TLB's current internal
+ * type/size. To make sure the QPUs get the right state,, we need 1
+ * dummy store in between internal type/size changes on V3D 3.x, and 2
+ * dummy stores on 4.x.
+ */
#if V3D_VERSION < 40
- cl_emit(&job->rcl, STORE_TILE_BUFFER_GENERAL, store) {
- store.buffer_to_store = NONE;
- }
-#else
- for (int i = 0; i < 2; i++) {
- if (i > 0)
- cl_emit(&job->rcl, TILE_COORDINATES, coords);
- cl_emit(&job->rcl, END_OF_LOADS, end);
cl_emit(&job->rcl, STORE_TILE_BUFFER_GENERAL, store) {
store.buffer_to_store = NONE;
}
- if (i == 0) {
- cl_emit(&job->rcl, CLEAR_TILE_BUFFERS, clear) {
- clear.clear_z_stencil_buffer = true;
- clear.clear_all_render_targets = true;
+#else
+ for (int i = 0; i < 2; i++) {
+ if (i > 0)
+ cl_emit(&job->rcl, TILE_COORDINATES, coords);
+ cl_emit(&job->rcl, END_OF_LOADS, end);
+ cl_emit(&job->rcl, STORE_TILE_BUFFER_GENERAL, store) {
+ store.buffer_to_store = NONE;
}
+ if (i == 0) {
+ cl_emit(&job->rcl, CLEAR_TILE_BUFFERS, clear) {
+ clear.clear_z_stencil_buffer = true;
+ clear.clear_all_render_targets = true;
+ }
+ }
+ cl_emit(&job->rcl, END_OF_TILE_MARKER, end);
}
- cl_emit(&job->rcl, END_OF_TILE_MARKER, end);
- }
#endif
- cl_emit(&job->rcl, FLUSH_VCD_CACHE, flush);
+ cl_emit(&job->rcl, FLUSH_VCD_CACHE, flush);
- v3d_rcl_emit_generic_per_tile_list(job, nr_cbufs - 1);
+ v3d_rcl_emit_generic_per_tile_list(job, layer);
- /* XXX perf: We should expose GL_MESA_tile_raster_order to improve X11
- * performance, but we should use Morton order otherwise to improve
- * cache locality.
- */
- uint32_t supertile_w_in_pixels = job->tile_width * supertile_w;
- uint32_t supertile_h_in_pixels = job->tile_height * supertile_h;
- uint32_t min_x_supertile = job->draw_min_x / supertile_w_in_pixels;
- uint32_t min_y_supertile = job->draw_min_y / supertile_h_in_pixels;
-
- uint32_t max_x_supertile = 0;
- uint32_t max_y_supertile = 0;
- if (job->draw_max_x != 0 && job->draw_max_y != 0) {
- max_x_supertile = (job->draw_max_x - 1) / supertile_w_in_pixels;
- max_y_supertile = (job->draw_max_y - 1) / supertile_h_in_pixels;
- }
+ /* XXX perf: We should expose GL_MESA_tile_raster_order to improve X11
+ * performance, but we should use Morton order otherwise to improve
+ * cache locality.
+ */
+ uint32_t supertile_w_in_pixels = job->tile_width * supertile_w;
+ uint32_t supertile_h_in_pixels = job->tile_height * supertile_h;
+ uint32_t min_x_supertile = job->draw_min_x / supertile_w_in_pixels;
+ uint32_t min_y_supertile = job->draw_min_y / supertile_h_in_pixels;
+
+ uint32_t max_x_supertile = 0;
+ uint32_t max_y_supertile = 0;
+ if (job->draw_max_x != 0 && job->draw_max_y != 0) {
+ max_x_supertile = (job->draw_max_x - 1) / supertile_w_in_pixels;
+ max_y_supertile = (job->draw_max_y - 1) / supertile_h_in_pixels;
+ }
- for (int y = min_y_supertile; y <= max_y_supertile; y++) {
- for (int x = min_x_supertile; x <= max_x_supertile; x++) {
- cl_emit(&job->rcl, SUPERTILE_COORDINATES, coords) {
- coords.column_number_in_supertiles = x;
- coords.row_number_in_supertiles = y;
+ for (int y = min_y_supertile; y <= max_y_supertile; y++) {
+ for (int x = min_x_supertile; x <= max_x_supertile; x++) {
+ cl_emit(&job->rcl, SUPERTILE_COORDINATES, coords) {
+ coords.column_number_in_supertiles = x;
+ coords.row_number_in_supertiles = y;
+ }
}
}
}