v3d: set instance id to 0 at start of tile
[mesa.git] / src / gallium / drivers / v3d / v3dx_rcl.c
index 5be29aca1fd42f4b34114c520c0f75990664377e..739088ec3ed5632f444e2c831d27046e87f9b81e 100644 (file)
@@ -21,7 +21,7 @@
  * IN THE SOFTWARE.
  */
 
-#include "util/u_format.h"
+#include "util/format/u_format.h"
 #include "v3d_context.h"
 #include "v3d_tiling.h"
 #include "broadcom/common/v3d_macros.h"
@@ -53,7 +53,7 @@ flush_last_load(struct v3d_cl *cl)
 
 static void
 load_general(struct v3d_cl *cl, struct pipe_surface *psurf, int buffer,
-             uint32_t pipe_bit, uint32_t *loads_pending)
+             int layer, uint32_t pipe_bit, uint32_t *loads_pending)
 {
         struct v3d_surface *surf = v3d_surface(psurf);
         bool separate_stencil = surf->separate_stencil && buffer == STENCIL;
@@ -64,9 +64,12 @@ load_general(struct v3d_cl *cl, struct pipe_surface *psurf, int buffer,
 
         struct v3d_resource *rsc = v3d_resource(psurf->texture);
 
+        uint32_t layer_offset =
+                v3d_layer_offset(&rsc->base, psurf->u.tex.level,
+                                 psurf->u.tex.first_layer + layer);
         cl_emit(cl, LOAD_TILE_BUFFER_GENERAL, load) {
                 load.buffer_to_load = buffer;
-                load.address = cl_address(rsc->bo, surf->offset);
+                load.address = cl_address(rsc->bo, layer_offset);
 
 #if V3D_VERSION >= 40
                 load.memory_format = surf->tiling;
@@ -74,6 +77,7 @@ load_general(struct v3d_cl *cl, struct pipe_surface *psurf, int buffer,
                         load.input_image_format = V3D_OUTPUT_IMAGE_FORMAT_S8;
                 else
                         load.input_image_format = surf->format;
+                load.r_b_swap = surf->swap_rb;
 
                 if (surf->tiling == VC5_TILING_UIF_NO_XOR ||
                     surf->tiling == VC5_TILING_UIF_XOR) {
@@ -108,8 +112,9 @@ load_general(struct v3d_cl *cl, struct pipe_surface *psurf, int buffer,
 
 static void
 store_general(struct v3d_job *job,
-              struct v3d_cl *cl, struct pipe_surface *psurf, int buffer,
-              int pipe_bit, uint32_t *stores_pending, bool general_color_clear)
+              struct v3d_cl *cl, struct pipe_surface *psurf,
+              int layer, int buffer, int pipe_bit,
+              uint32_t *stores_pending, bool general_color_clear)
 {
         struct v3d_surface *surf = v3d_surface(psurf);
         bool separate_stencil = surf->separate_stencil && buffer == STENCIL;
@@ -125,21 +130,22 @@ store_general(struct v3d_job *job,
 
         rsc->writes++;
 
+        uint32_t layer_offset =
+                v3d_layer_offset(&rsc->base, psurf->u.tex.level,
+                                 psurf->u.tex.first_layer + layer);
         cl_emit(cl, STORE_TILE_BUFFER_GENERAL, store) {
                 store.buffer_to_store = buffer;
-                store.address = cl_address(rsc->bo, surf->offset);
+                store.address = cl_address(rsc->bo, layer_offset);
 
 #if V3D_VERSION >= 40
-                store.clear_buffer_being_stored =
-                        ((job->cleared & pipe_bit) &&
-                         (general_color_clear ||
-                          !(pipe_bit & PIPE_CLEAR_COLOR_BUFFERS)));
+                store.clear_buffer_being_stored = false;
 
                 if (separate_stencil)
                         store.output_image_format = V3D_OUTPUT_IMAGE_FORMAT_S8;
                 else
                         store.output_image_format = surf->format;
 
+                store.r_b_swap = surf->swap_rb;
                 store.memory_format = surf->tiling;
 
                 if (surf->tiling == VC5_TILING_UIF_NO_XOR ||
@@ -164,18 +170,18 @@ store_general(struct v3d_job *job,
                 assert(buffer != ZSTENCIL);
                 store.raw_mode = true;
                 if (!last_store) {
-                        store.disable_colour_buffers_clear_on_write = true;
+                        store.disable_color_buffers_clear_on_write = true;
                         store.disable_z_buffer_clear_on_write = true;
                         store.disable_stencil_buffer_clear_on_write = true;
                 } else {
-                        store.disable_colour_buffers_clear_on_write =
+                        store.disable_color_buffers_clear_on_write =
                                 !(((pipe_bit & PIPE_CLEAR_COLOR_BUFFERS) &&
                                    general_color_clear &&
-                                   (job->cleared & pipe_bit)));
+                                   (job->clear & pipe_bit)));
                         store.disable_z_buffer_clear_on_write =
-                                !(job->cleared & PIPE_CLEAR_DEPTH);
+                                !(job->clear & PIPE_CLEAR_DEPTH);
                         store.disable_stencil_buffer_clear_on_write =
-                                !(job->cleared & PIPE_CLEAR_STENCIL);
+                                !(job->clear & PIPE_CLEAR_STENCIL);
                 }
                 store.padded_height_of_output_image_in_uif_blocks =
                         surf->padded_height_of_output_image_in_uif_blocks;
@@ -204,11 +210,11 @@ zs_buffer_from_pipe_bits(int pipe_clear_bits)
 }
 
 static void
-v3d_rcl_emit_loads(struct v3d_job *job, struct v3d_cl *cl)
+v3d_rcl_emit_loads(struct v3d_job *job, struct v3d_cl *cl, int layer)
 {
-        uint32_t loads_pending = job->resolve & ~job->cleared;
+        uint32_t loads_pending = job->load;
 
-        for (int i = 0; i < VC5_MAX_DRAW_BUFFERS; i++) {
+        for (int i = 0; i < V3D_MAX_DRAW_BUFFERS; i++) {
                 uint32_t bit = PIPE_CLEAR_COLOR0 << i;
                 if (!(loads_pending & bit))
                         continue;
@@ -219,7 +225,7 @@ v3d_rcl_emit_loads(struct v3d_job *job, struct v3d_cl *cl)
                         continue;
                 }
 
-                load_general(cl, psurf, RENDER_TARGET_0 + i,
+                load_general(cl, psurf, RENDER_TARGET_0 + i, layer,
                              bit, &loads_pending);
         }
 
@@ -231,7 +237,7 @@ v3d_rcl_emit_loads(struct v3d_job *job, struct v3d_cl *cl)
                 if (rsc->separate_stencil &&
                     (loads_pending & PIPE_CLEAR_STENCIL)) {
                         load_general(cl, job->zsbuf,
-                                     STENCIL,
+                                     STENCIL, layer,
                                      PIPE_CLEAR_STENCIL,
                                      &loads_pending);
                 }
@@ -239,6 +245,7 @@ v3d_rcl_emit_loads(struct v3d_job *job, struct v3d_cl *cl)
                 if (loads_pending & PIPE_CLEAR_DEPTHSTENCIL) {
                         load_general(cl, job->zsbuf,
                                      zs_buffer_from_pipe_bits(loads_pending),
+                                     layer,
                                      loads_pending & PIPE_CLEAR_DEPTHSTENCIL,
                                      &loads_pending);
                 }
@@ -249,8 +256,8 @@ v3d_rcl_emit_loads(struct v3d_job *job, struct v3d_cl *cl)
          * tile coordinates.
          */
         if (loads_pending) {
-                cl_emit(cl, RELOAD_TILE_COLOUR_BUFFER, load) {
-                        load.disable_colour_buffer_load =
+                cl_emit(cl, RELOAD_TILE_COLOR_BUFFER, load) {
+                        load.disable_color_buffer_load =
                                 (~loads_pending &
                                  PIPE_CLEAR_COLOR_BUFFERS) >>
                                 PIPE_FIRST_COLOR_BUFFER_BIT;
@@ -267,11 +274,12 @@ v3d_rcl_emit_loads(struct v3d_job *job, struct v3d_cl *cl)
 }
 
 static void
-v3d_rcl_emit_stores(struct v3d_job *job, struct v3d_cl *cl)
+v3d_rcl_emit_stores(struct v3d_job *job, struct v3d_cl *cl, int layer)
 {
-        MAYBE_UNUSED bool needs_color_clear = job->cleared & PIPE_CLEAR_COLOR_BUFFERS;
-        MAYBE_UNUSED bool needs_z_clear = job->cleared & PIPE_CLEAR_DEPTH;
-        MAYBE_UNUSED bool needs_s_clear = job->cleared & PIPE_CLEAR_STENCIL;
+#if V3D_VERSION < 40
+        UNUSED bool needs_color_clear = job->clear & PIPE_CLEAR_COLOR_BUFFERS;
+        UNUSED bool needs_z_clear = job->clear & PIPE_CLEAR_DEPTH;
+        UNUSED bool needs_s_clear = job->clear & PIPE_CLEAR_STENCIL;
 
         /* For clearing color in a TLB general on V3D 3.3:
          *
@@ -288,10 +296,13 @@ v3d_rcl_emit_stores(struct v3d_job *job, struct v3d_cl *cl)
          * TLB color buffers.
          */
         bool general_color_clear = (needs_color_clear &&
-                                    (job->cleared & PIPE_CLEAR_COLOR_BUFFERS) ==
-                                    (job->resolve & PIPE_CLEAR_COLOR_BUFFERS));
+                                    (job->clear & PIPE_CLEAR_COLOR_BUFFERS) ==
+                                    (job->store & PIPE_CLEAR_COLOR_BUFFERS));
+#else
+        bool general_color_clear = false;
+#endif
 
-        uint32_t stores_pending = job->resolve;
+        uint32_t stores_pending = job->store;
 
         /* For V3D 4.1, use general stores for all TLB stores.
          *
@@ -302,9 +313,9 @@ v3d_rcl_emit_stores(struct v3d_job *job, struct v3d_cl *cl)
          * perspective.  Non-MSAA surfaces will use
          * STORE_MULTI_SAMPLE_RESOLVED_TILE_COLOR_BUFFER_EXTENDED.
          */
-        for (int i = 0; i < VC5_MAX_DRAW_BUFFERS; i++) {
+        for (int i = 0; i < V3D_MAX_DRAW_BUFFERS; i++) {
                 uint32_t bit = PIPE_CLEAR_COLOR0 << i;
-                if (!(job->resolve & bit))
+                if (!(job->store & bit))
                         continue;
 
                 struct pipe_surface *psurf = job->cbufs[i];
@@ -313,37 +324,37 @@ v3d_rcl_emit_stores(struct v3d_job *job, struct v3d_cl *cl)
                         continue;
                 }
 
-                store_general(job, cl, psurf, RENDER_TARGET_0 + i, bit,
+                store_general(job, cl, psurf, layer, RENDER_TARGET_0 + i, bit,
                               &stores_pending, general_color_clear);
         }
 
-        if (job->resolve & PIPE_CLEAR_DEPTHSTENCIL && job->zsbuf &&
+        if (job->store & PIPE_CLEAR_DEPTHSTENCIL && job->zsbuf &&
             !(V3D_VERSION < 40 && job->zsbuf->texture->nr_samples <= 1)) {
                 struct v3d_resource *rsc = v3d_resource(job->zsbuf->texture);
                 if (rsc->separate_stencil) {
-                        if (job->resolve & PIPE_CLEAR_DEPTH) {
-                                store_general(job, cl, job->zsbuf, Z,
-                                              PIPE_CLEAR_DEPTH,
+                        if (job->store & PIPE_CLEAR_DEPTH) {
+                                store_general(job, cl, job->zsbuf, layer,
+                                              Z, PIPE_CLEAR_DEPTH,
                                               &stores_pending,
                                               general_color_clear);
                         }
 
-                        if (job->resolve & PIPE_CLEAR_STENCIL) {
-                                store_general(job, cl, job->zsbuf, STENCIL,
-                                              PIPE_CLEAR_STENCIL,
+                        if (job->store & PIPE_CLEAR_STENCIL) {
+                                store_general(job, cl, job->zsbuf, layer,
+                                              STENCIL, PIPE_CLEAR_STENCIL,
                                               &stores_pending,
                                               general_color_clear);
                         }
                 } else {
-                        store_general(job, cl, job->zsbuf,
-                                      zs_buffer_from_pipe_bits(job->resolve),
-                                      job->resolve & PIPE_CLEAR_DEPTHSTENCIL,
+                        store_general(job, cl, job->zsbuf, layer,
+                                      zs_buffer_from_pipe_bits(job->store),
+                                      job->store & PIPE_CLEAR_DEPTHSTENCIL,
                                       &stores_pending, general_color_clear);
                 }
         }
 
-        if (stores_pending) {
 #if V3D_VERSION < 40
+        if (stores_pending) {
                 cl_emit(cl, STORE_MULTI_SAMPLE_RESOLVED_TILE_COLOR_BUFFER_EXTENDED, store) {
 
                         store.disable_color_buffer_write =
@@ -355,34 +366,49 @@ v3d_rcl_emit_stores(struct v3d_job *job, struct v3d_cl *cl)
                         /* Note that when set this will clear all of the color
                          * buffers.
                          */
-                        store.disable_colour_buffers_clear_on_write =
+                        store.disable_color_buffers_clear_on_write =
                                 !needs_color_clear;
                         store.disable_z_buffer_clear_on_write =
                                 !needs_z_clear;
                         store.disable_stencil_buffer_clear_on_write =
                                 !needs_s_clear;
                 };
-#else /* V3D_VERSION >= 40 */
-                unreachable("All color buffers should have been stored.");
-#endif /* V3D_VERSION >= 40 */
         } else if (needs_color_clear && !general_color_clear) {
                 /* If we didn't do our color clears in the general packet,
                  * then emit a packet to clear all the TLB color buffers now.
                  */
-#if V3D_VERSION < 40
                 cl_emit(cl, STORE_TILE_BUFFER_GENERAL, store) {
                         store.buffer_to_store = NONE;
                 }
+        }
 #else /* V3D_VERSION >= 40 */
+        /* If we're emitting an RCL with GL_ARB_framebuffer_no_attachments,
+         * we still need to emit some sort of store.
+         */
+        if (!job->store) {
+                cl_emit(cl, STORE_TILE_BUFFER_GENERAL, store) {
+                        store.buffer_to_store = NONE;
+                }
+        }
+
+        assert(!stores_pending);
+
+        /* GFXH-1461/GFXH-1689: The per-buffer store command's clear
+         * buffer bit is broken for depth/stencil.  In addition, the
+         * clear packet's Z/S bit is broken, but the RTs bit ends up
+         * clearing Z/S.
+         */
+        if (job->clear) {
                 cl_emit(cl, CLEAR_TILE_BUFFERS, clear) {
+                        clear.clear_z_stencil_buffer = true;
                         clear.clear_all_render_targets = true;
                 }
-#endif /* V3D_VERSION >= 40 */
         }
+#endif /* V3D_VERSION >= 40 */
 }
 
 static void
-v3d_rcl_emit_generic_per_tile_list(struct v3d_job *job, int last_cbuf)
+v3d_rcl_emit_generic_per_tile_list(struct v3d_job *job, int layer)
 {
         /* Emit the generic list in our indirect state -- the rcl will just
          * have pointers into it.
@@ -398,7 +424,7 @@ v3d_rcl_emit_generic_per_tile_list(struct v3d_job *job, int last_cbuf)
                 cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords);
         }
 
-        v3d_rcl_emit_loads(job, cl);
+        v3d_rcl_emit_loads(job, cl, layer);
 
         if (V3D_VERSION < 40) {
                 /* Tile Coordinates triggers the last reload and sets where
@@ -410,14 +436,20 @@ v3d_rcl_emit_generic_per_tile_list(struct v3d_job *job, int last_cbuf)
         /* The binner starts out writing tiles assuming that the initial mode
          * is triangles, so make sure that's the case.
          */
-        cl_emit(cl, PRIMITIVE_LIST_FORMAT, fmt) {
-                fmt.data_type = LIST_INDEXED;
+        cl_emit(cl, PRIM_LIST_FORMAT, fmt) {
                 fmt.primitive_type = LIST_TRIANGLES;
         }
 
+#if V3D_VERSION >= 41
+        /* The PTB assumes the instance ID is 0 at the start of each tile, but the HW does not reset it, so set it explicitly. */
+        cl_emit(cl, SET_INSTANCEID, set) {
+           set.instance_id = 0;
+        }
+#endif
+
         cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch);
 
-        v3d_rcl_emit_stores(job, cl);
+        v3d_rcl_emit_stores(job, cl, layer);
 
 #if V3D_VERSION >= 40
         cl_emit(cl, END_OF_TILE_MARKER, end);
@@ -451,7 +483,7 @@ static void
 v3d_emit_z_stencil_config(struct v3d_job *job, struct v3d_surface *surf,
                           struct v3d_resource *rsc, bool is_separate_stencil)
 {
-        cl_emit(&job->rcl, TILE_RENDERING_MODE_CONFIGURATION_Z_STENCIL_CONFIG, zs) {
+        cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_Z_STENCIL, zs) {
                 zs.address = cl_address(rsc->bo, surf->offset);
 
                 if (!is_separate_stencil) {
@@ -468,9 +500,9 @@ v3d_emit_z_stencil_config(struct v3d_job *job, struct v3d_surface *surf,
                 zs.memory_format = surf->tiling;
         }
 
-        if (job->resolve & (is_separate_stencil ?
-                            PIPE_CLEAR_STENCIL :
-                            PIPE_CLEAR_DEPTHSTENCIL)) {
+        if (job->store & (is_separate_stencil ?
+                          PIPE_CLEAR_STENCIL :
+                          PIPE_CLEAR_DEPTHSTENCIL)) {
                 rsc->writes++;
         }
 }
@@ -478,32 +510,149 @@ v3d_emit_z_stencil_config(struct v3d_job *job, struct v3d_surface *surf,
 
 #define div_round_up(a, b) (((a) + (b) - 1) / b)
 
+static void
+emit_render_layer(struct v3d_job *job, uint32_t layer)
+{
+        uint32_t supertile_w = 1, supertile_h = 1; /* in tiles; grown below */
+
+        /* Per-layer offset into job->tile_alloc.  If doing multicore binning,
+         * we would need to initialize each core's tile list here.
+         */
+        uint32_t tile_alloc_offset =
+                layer * job->draw_tiles_x * job->draw_tiles_y * 64;
+        cl_emit(&job->rcl, MULTICORE_RENDERING_TILE_LIST_SET_BASE, list) {
+                list.address = cl_address(job->tile_alloc, tile_alloc_offset);
+        }
+
+        cl_emit(&job->rcl, MULTICORE_RENDERING_SUPERTILE_CFG, config) {
+                uint32_t frame_w_in_supertiles, frame_h_in_supertiles;
+                const uint32_t max_supertiles = 256;
+
+                /* Grow the supertiles until fewer than max_supertiles cover the frame. */
+                for (;;) {
+                        frame_w_in_supertiles = div_round_up(job->draw_tiles_x,
+                                                             supertile_w);
+                        frame_h_in_supertiles = div_round_up(job->draw_tiles_y,
+                                                             supertile_h);
+                        if (frame_w_in_supertiles *
+                                frame_h_in_supertiles < max_supertiles) {
+                                break;
+                        }
+
+                        if (supertile_w < supertile_h)
+                                supertile_w++;
+                        else
+                                supertile_h++;
+                }
+
+                config.number_of_bin_tile_lists = 1;
+                config.total_frame_width_in_tiles = job->draw_tiles_x;
+                config.total_frame_height_in_tiles = job->draw_tiles_y;
+
+                config.supertile_width_in_tiles = supertile_w;
+                config.supertile_height_in_tiles = supertile_h;
+
+                config.total_frame_width_in_supertiles = frame_w_in_supertiles;
+                config.total_frame_height_in_supertiles = frame_h_in_supertiles;
+        }
+
+        /* Start by clearing the tile buffer. */
+        cl_emit(&job->rcl, TILE_COORDINATES, coords) {
+                coords.tile_column_number = 0;
+                coords.tile_row_number = 0;
+        }
+
+        /* Emit an initial clear of the tile buffers.  This is necessary
+         * for any buffers that should be cleared (since clearing
+         * normally happens at the *end* of the generic tile list), but
+         * it's also nice to clear everything so the first tile doesn't
+         * inherit any contents from some previous frame.
+         *
+         * Also, implement the GFXH-1742 workaround.  There's a race in
+         * the HW between the RCL updating the TLB's internal type/size
+         * and the spawning of the QPU instances using the TLB's current
+         * internal type/size.  To make sure the QPUs get the right
+         * state, we need 1 dummy store in between internal type/size
+         * changes on V3D 3.x, and 2 dummy stores on 4.x.
+         */
+#if V3D_VERSION < 40
+        cl_emit(&job->rcl, STORE_TILE_BUFFER_GENERAL, store) {
+                store.buffer_to_store = NONE;
+        }
+#else
+        for (int i = 0; i < 2; i++) {
+                if (i > 0)
+                        cl_emit(&job->rcl, TILE_COORDINATES, coords);
+                cl_emit(&job->rcl, END_OF_LOADS, end);
+                cl_emit(&job->rcl, STORE_TILE_BUFFER_GENERAL, store) {
+                        store.buffer_to_store = NONE;
+                }
+                if (i == 0) {
+                        cl_emit(&job->rcl, CLEAR_TILE_BUFFERS, clear) {
+                                clear.clear_z_stencil_buffer = true;
+                                clear.clear_all_render_targets = true;
+                        }
+                }
+                cl_emit(&job->rcl, END_OF_TILE_MARKER, end);
+        }
+#endif
+
+        cl_emit(&job->rcl, FLUSH_VCD_CACHE, flush);
+
+        v3d_rcl_emit_generic_per_tile_list(job, layer);
+
+        /* XXX perf: We should expose GL_MESA_tile_raster_order to
+         * improve X11 performance, but we should use Morton order
+         * otherwise to improve cache locality.
+         */
+        uint32_t supertile_w_in_pixels = job->tile_width * supertile_w;
+        uint32_t supertile_h_in_pixels = job->tile_height * supertile_h;
+        uint32_t min_x_supertile = job->draw_min_x / supertile_w_in_pixels;
+        uint32_t min_y_supertile = job->draw_min_y / supertile_h_in_pixels;
+
+        uint32_t max_x_supertile = 0;
+        uint32_t max_y_supertile = 0;
+        if (job->draw_max_x != 0 && job->draw_max_y != 0) {
+                max_x_supertile = (job->draw_max_x - 1) / supertile_w_in_pixels;
+                max_y_supertile = (job->draw_max_y - 1) / supertile_h_in_pixels;
+        }
+
+        for (int y = min_y_supertile; y <= max_y_supertile; y++) {
+                for (int x = min_x_supertile; x <= max_x_supertile; x++) {
+                        cl_emit(&job->rcl, SUPERTILE_COORDINATES, coords) {
+                                coords.column_number_in_supertiles = x;
+                                coords.row_number_in_supertiles = y;
+                        }
+                }
+        }
+}
+
 void
 v3dX(emit_rcl)(struct v3d_job *job)
 {
         /* The RCL list should be empty. */
         assert(!job->rcl.bo);
 
-        v3d_cl_ensure_space_with_branch(&job->rcl, 200 + 256 *
+        v3d_cl_ensure_space_with_branch(&job->rcl, 200 +
+                                        MAX2(job->num_layers, 1) * 256 *
                                         cl_packet_length(SUPERTILE_COORDINATES));
         job->submit.rcl_start = job->rcl.bo->offset;
         v3d_job_add_bo(job, job->rcl.bo);
 
         int nr_cbufs = 0;
-        for (int i = 0; i < VC5_MAX_DRAW_BUFFERS; i++) {
+        for (int i = 0; i < V3D_MAX_DRAW_BUFFERS; i++) {
                 if (job->cbufs[i])
                         nr_cbufs = i + 1;
         }
 
-        /* Comon config must be the first TILE_RENDERING_MODE_CONFIGURATION
+        /* Common config must be the first TILE_RENDERING_MODE_CFG
          * and Z_STENCIL_CLEAR_VALUES must be last.  The ones in between are
          * optional updates to the previous HW state.
          */
-        cl_emit(&job->rcl, TILE_RENDERING_MODE_CONFIGURATION_COMMON_CONFIGURATION,
-                config) {
+        cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_COMMON, config) {
 #if V3D_VERSION < 40
-                config.enable_z_store = job->resolve & PIPE_CLEAR_DEPTH;
-                config.enable_stencil_store = job->resolve & PIPE_CLEAR_STENCIL;
+                config.enable_z_store = job->store & PIPE_CLEAR_DEPTH;
+                config.enable_stencil_store = job->store & PIPE_CLEAR_STENCIL;
 #else /* V3D_VERSION >= 40 */
                 if (job->zsbuf) {
                         struct v3d_surface *surf = v3d_surface(job->zsbuf);
@@ -532,8 +681,7 @@ v3dX(emit_rcl)(struct v3d_job *job)
                 config.image_width_pixels = job->draw_width;
                 config.image_height_pixels = job->draw_height;
 
-                config.number_of_render_targets_minus_1 =
-                        MAX2(nr_cbufs, 1) - 1;
+                config.number_of_render_targets = MAX2(nr_cbufs, 1);
 
                 config.multisample_mode_4x = job->msaa;
 
@@ -547,7 +695,7 @@ v3dX(emit_rcl)(struct v3d_job *job)
                 struct v3d_surface *surf = v3d_surface(psurf);
                 struct v3d_resource *rsc = v3d_resource(psurf->texture);
 
-                MAYBE_UNUSED uint32_t config_pad = 0;
+                UNUSED uint32_t config_pad = 0;
                 uint32_t clear_pad = 0;
 
                 /* XXX: Set the pad for raster. */
@@ -567,7 +715,7 @@ v3dX(emit_rcl)(struct v3d_job *job)
                 }
 
 #if V3D_VERSION < 40
-                cl_emit(&job->rcl, TILE_RENDERING_MODE_CONFIGURATION_RENDER_TARGET_CONFIG, rt) {
+                cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) {
                         rt.address = cl_address(rsc->bo, surf->offset);
                         rt.internal_type = surf->internal_type;
                         rt.output_image_format = surf->format;
@@ -576,12 +724,12 @@ v3dX(emit_rcl)(struct v3d_job *job)
                         rt.render_target_number = i;
                         rt.pad = config_pad;
 
-                        if (job->resolve & PIPE_CLEAR_COLOR0 << i)
+                        if (job->store & PIPE_CLEAR_COLOR0 << i)
                                 rsc->writes++;
                 }
 #endif /* V3D_VERSION < 40 */
 
-                cl_emit(&job->rcl, TILE_RENDERING_MODE_CONFIGURATION_CLEAR_COLORS_PART1,
+                cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART1,
                         clear) {
                         clear.clear_color_low_32_bits = job->clear_color[i][0];
                         clear.clear_color_next_24_bits = job->clear_color[i][1] & 0xffffff;
@@ -589,7 +737,7 @@ v3dX(emit_rcl)(struct v3d_job *job)
                 };
 
                 if (surf->internal_bpp >= V3D_INTERNAL_BPP_64) {
-                        cl_emit(&job->rcl, TILE_RENDERING_MODE_CONFIGURATION_CLEAR_COLORS_PART2,
+                        cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART2,
                                 clear) {
                                 clear.clear_color_mid_low_32_bits =
                                         ((job->clear_color[i][1] >> 24) |
@@ -602,7 +750,7 @@ v3dX(emit_rcl)(struct v3d_job *job)
                 }
 
                 if (surf->internal_bpp >= V3D_INTERNAL_BPP_128 || clear_pad) {
-                        cl_emit(&job->rcl, TILE_RENDERING_MODE_CONFIGURATION_CLEAR_COLORS_PART3,
+                        cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART3,
                                 clear) {
                                 clear.uif_padded_height_in_uif_blocks = clear_pad;
                                 clear.clear_color_high_16_bits = job->clear_color[i][3] >> 16;
@@ -612,7 +760,7 @@ v3dX(emit_rcl)(struct v3d_job *job)
         }
 
 #if V3D_VERSION >= 40
-        cl_emit(&job->rcl, TILE_RENDERING_MODE_CONFIGURATION_RENDER_TARGET_CONFIG, rt) {
+        cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) {
                 v3d_setup_render_target(job, 0,
                                         &rt.render_target_0_internal_bpp,
                                         &rt.render_target_0_internal_type,
@@ -654,10 +802,10 @@ v3dX(emit_rcl)(struct v3d_job *job)
 #endif /* V3D_VERSION < 40 */
 
         /* Ends rendering mode config. */
-        cl_emit(&job->rcl, TILE_RENDERING_MODE_CONFIGURATION_Z_STENCIL_CLEAR_VALUES,
+        cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES,
                 clear) {
                 clear.z_clear_value = job->clear_z;
-                clear.stencil_vg_mask_clear_value = job->clear_s;
+                clear.stencil_clear_value = job->clear_s;
         };
 
         /* Always set initial block size before the first branch, which needs
@@ -669,114 +817,15 @@ v3dX(emit_rcl)(struct v3d_job *job)
                         TILE_ALLOCATION_BLOCK_SIZE_64B;
         }
 
-        uint32_t supertile_w = 1, supertile_h = 1;
-
-        /* If doing multicore binning, we would need to initialize each core's
-         * tile list here.
-         */
-        cl_emit(&job->rcl, MULTICORE_RENDERING_TILE_LIST_SET_BASE, list) {
-                list.address = cl_address(job->tile_alloc, 0);
-        }
-
-        cl_emit(&job->rcl, MULTICORE_RENDERING_SUPERTILE_CONFIGURATION, config) {
-                uint32_t frame_w_in_supertiles, frame_h_in_supertiles;
-                const uint32_t max_supertiles = 256;
-
-                /* Size up our supertiles until we get under the limit. */
-                for (;;) {
-                        frame_w_in_supertiles = div_round_up(job->draw_tiles_x,
-                                                             supertile_w);
-                        frame_h_in_supertiles = div_round_up(job->draw_tiles_y,
-                                                             supertile_h);
-                        if (frame_w_in_supertiles * frame_h_in_supertiles <
-                            max_supertiles) {
-                                break;
-                        }
-
-                        if (supertile_w < supertile_h)
-                                supertile_w++;
-                        else
-                                supertile_h++;
-                }
-
-                config.total_frame_width_in_tiles = job->draw_tiles_x;
-                config.total_frame_height_in_tiles = job->draw_tiles_y;
-
-                config.supertile_width_in_tiles_minus_1 = supertile_w - 1;
-                config.supertile_height_in_tiles_minus_1 = supertile_h - 1;
-
-                config.total_frame_width_in_supertiles = frame_w_in_supertiles;
-                config.total_frame_height_in_supertiles = frame_h_in_supertiles;
-        }
-
-        /* Start by clearing the tile buffer. */
-        cl_emit(&job->rcl, TILE_COORDINATES, coords) {
-                coords.tile_column_number = 0;
-                coords.tile_row_number = 0;
-        }
-
-        /* Emit an initial clear of the tile buffers.  This is necessary for
-         * any buffers that should be cleared (since clearing normally happens
-         * at the *end* of the generic tile list), but it's also nice to clear
-         * everything so the first tile doesn't inherit any contents from some
-         * previous frame.
-         *
-         * Also, implement the GFXH-1742 workaround.  There's a race in the HW
-         * between the RCL updating the TLB's internal type/size and the
-         * spawning of the QPU instances using the TLB's current internal
-         * type/size.  To make sure the QPUs get the right state,, we need 1
-         * dummy store in between internal type/size changes on V3D 3.x, and 2
-         * dummy stores on 4.x.
+        /* ARB_framebuffer_no_attachments allows rendering to happen even when
+         * the framebuffer has no attachments, the idea being that fragment
+         * shaders can still do image load/store, ssbo, etc without having to
+         * write to actual attachments, so always run at least one iteration
+         * of the loop.
          */
-#if V3D_VERSION < 40
-        cl_emit(&job->rcl, STORE_TILE_BUFFER_GENERAL, store) {
-                store.buffer_to_store = NONE;
-        }
-#else
-        for (int i = 0; i < 2; i++) {
-                if (i > 0)
-                        cl_emit(&job->rcl, TILE_COORDINATES, coords);
-                cl_emit(&job->rcl, END_OF_LOADS, end);
-                cl_emit(&job->rcl, STORE_TILE_BUFFER_GENERAL, store) {
-                        store.buffer_to_store = NONE;
-                }
-                if (i == 0) {
-                        cl_emit(&job->rcl, CLEAR_TILE_BUFFERS, clear) {
-                                clear.clear_z_stencil_buffer = true;
-                                clear.clear_all_render_targets = true;
-                        }
-                }
-                cl_emit(&job->rcl, END_OF_TILE_MARKER, end);
-        }
-#endif
-
-        cl_emit(&job->rcl, FLUSH_VCD_CACHE, flush);
-
-        v3d_rcl_emit_generic_per_tile_list(job, nr_cbufs - 1);
-
-        cl_emit(&job->rcl, WAIT_ON_SEMAPHORE, sem);
-
-        /* XXX: Use Morton order */
-        uint32_t supertile_w_in_pixels = job->tile_width * supertile_w;
-        uint32_t supertile_h_in_pixels = job->tile_height * supertile_h;
-        uint32_t min_x_supertile = job->draw_min_x / supertile_w_in_pixels;
-        uint32_t min_y_supertile = job->draw_min_y / supertile_h_in_pixels;
-
-        uint32_t max_x_supertile = 0;
-        uint32_t max_y_supertile = 0;
-        if (job->draw_max_x != 0 && job->draw_max_y != 0) {
-                max_x_supertile = (job->draw_max_x - 1) / supertile_w_in_pixels;
-                max_y_supertile = (job->draw_max_y - 1) / supertile_h_in_pixels;
-        }
-
-        for (int y = min_y_supertile; y <= max_y_supertile; y++) {
-                for (int x = min_x_supertile; x <= max_x_supertile; x++) {
-                        cl_emit(&job->rcl, SUPERTILE_COORDINATES, coords) {
-                                coords.column_number_in_supertiles = x;
-                                coords.row_number_in_supertiles = y;
-                        }
-                }
-        }
+        assert(job->num_layers > 0 || (job->load == 0 && job->store == 0));
+        for (int layer = 0; layer < MAX2(1, job->num_layers); layer++)
+                emit_render_layer(job, layer);
 
         cl_emit(&job->rcl, END_OF_RENDERING, end);
 }