turnip: rework format table to support r5g5b5a1_unorm/b5g5r5a1_unorm
[mesa.git] / src / freedreno / vulkan / tu_cmd_buffer.c
index cc6c0a8b55369582d8461c1fd7f163398cb4618f..5c99808b00497d5091d0a02577890978005a0a28 100644 (file)
@@ -111,6 +111,66 @@ tu_bo_list_merge(struct tu_bo_list *list, const struct tu_bo_list *other)
    return VK_SUCCESS;
 }
 
+static bool
+is_linear_mipmapped(const struct tu_image_view *iview)
+{
+   const bool linear = iview->image->layout.tile_mode == TILE6_LINEAR;
+   return linear && iview->base_mip != iview->image->level_count - 1;
+}
+
+/* True when a stored attachment is linear and not at its last miplevel, and
+ * render area's bottom edge, aligned to device->tile_align_h, is > fb->height.
+ */
+static bool
+force_sysmem(const struct tu_cmd_buffer *cmd,
+             const struct VkRect2D *render_area)
+{
+   const struct tu_framebuffer *fb = cmd->state.framebuffer;
+   const struct tu_physical_device *device = cmd->device->physical_device;
+   const struct tu_render_pass *pass = cmd->state.pass;
+   bool has_linear_mipmapped_store = false;
+
+   /* Iterate over all the places we call tu6_emit_store_attachment() */
+   for (unsigned i = 0; i < pass->subpass_count; i++) {
+      const struct tu_subpass *subpass = &pass->subpasses[i];
+      if (!subpass->resolve_attachments)
+         continue;
+      /* Distinct index name so the subpass index above is not shadowed. */
+      for (unsigned j = 0; j < subpass->color_count; j++) {
+         uint32_t a = subpass->resolve_attachments[j].attachment;
+         if (a != VK_ATTACHMENT_UNUSED &&
+             pass->attachments[a].store_op == VK_ATTACHMENT_STORE_OP_STORE &&
+             is_linear_mipmapped(fb->attachments[a].attachment)) {
+            has_linear_mipmapped_store = true;
+            break;
+         }
+      }
+   }
+
+   for (unsigned i = 0; i < pass->attachment_count; i++) {
+      if (pass->attachments[i].gmem_offset >= 0 &&
+          pass->attachments[i].store_op == VK_ATTACHMENT_STORE_OP_STORE &&
+          is_linear_mipmapped(fb->attachments[i].attachment)) {
+         has_linear_mipmapped_store = true;
+         break;
+      }
+   }
+
+   /* Linear textures cannot have any padding between mipmap levels and their
+    * height isn't padded, while at the same time the GMEM->MEM resolve does
+    * not have per-pixel granularity, so if the image height isn't aligned to
+    * the resolve granularity and the render area is tall enough, we may wind
+    * up writing past the bottom of the image into the next miplevel or even
+    * past the end of the image. For the last miplevel, the layout code should
+    * insert enough padding so that the overdraw writes to the padding.  To
+    * work around this, we force-enable sysmem rendering.
+    */
+   const uint32_t y2 = render_area->offset.y + render_area->extent.height;
+   const uint32_t aligned_y2 = ALIGN_POT(y2, device->tile_align_h);
+
+   return has_linear_mipmapped_store && aligned_y2 > fb->height;
+}
+
 static void
 tu_tiling_config_update_tile_layout(struct tu_tiling_config *tiling,
                                     const struct tu_device *dev,
@@ -142,11 +202,19 @@ tu_tiling_config_update_tile_layout(struct tu_tiling_config *tiling,
       .height = align(ra_height, tile_align_h),
    };
 
+   if (unlikely(dev->physical_device->instance->debug_flags & TU_DEBUG_FORCEBIN)) {
+      /* start with 2x2 tiles */
+      tiling->tile_count.width = 2;
+      tiling->tile_count.height = 2;
+      tiling->tile0.extent.width = align(DIV_ROUND_UP(ra_width, 2), tile_align_w);
+      tiling->tile0.extent.height = align(DIV_ROUND_UP(ra_height, 2), tile_align_h);
+   }
+
    /* do not exceed max tile width */
    while (tiling->tile0.extent.width > max_tile_width) {
       tiling->tile_count.width++;
       tiling->tile0.extent.width =
-         align(ra_width / tiling->tile_count.width, tile_align_w);
+         align(DIV_ROUND_UP(ra_width, tiling->tile_count.width), tile_align_w);
    }
 
    /* do not exceed gmem size */
@@ -303,12 +371,6 @@ tu6_index_size(VkIndexType type)
    }
 }
 
-static void
-tu6_emit_marker(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
-{
-   tu_cs_emit_write_reg(cs, cmd->marker_reg, ++cmd->marker_seqno);
-}
-
 unsigned
 tu6_emit_event_write(struct tu_cmd_buffer *cmd,
                      struct tu_cs *cs,
@@ -433,15 +495,14 @@ tu6_emit_mrt(struct tu_cmd_buffer *cmd,
       if (vk_format_is_srgb(iview->vk_format))
          srgb_cntl |= (1 << i);
 
-      const struct tu_native_format *format =
-         tu6_get_native_format(iview->vk_format);
-      assert(format && format->rb >= 0);
+      const struct tu_native_format format =
+         tu6_format_color(iview->vk_format, iview->image->layout.tile_mode);
 
       tu_cs_emit_regs(cs,
                       A6XX_RB_MRT_BUF_INFO(i,
                                            .color_tile_mode = tile_mode,
-                                           .color_format = format->rb,
-                                           .color_swap = format->swap),
+                                           .color_format = format.fmt,
+                                           .color_swap = format.swap),
                       A6XX_RB_MRT_PITCH(i, tu_image_stride(iview->image, iview->base_mip)),
                       A6XX_RB_MRT_ARRAY_PITCH(i, iview->image->layout.layer_size),
                       A6XX_RB_MRT_BASE(i, tu_image_view_base_ref(iview)),
@@ -449,7 +510,7 @@ tu6_emit_mrt(struct tu_cmd_buffer *cmd,
 
       tu_cs_emit_regs(cs,
                       A6XX_SP_FS_MRT_REG(i,
-                                         .color_format = format->rb,
+                                         .color_format = format.fmt,
                                          .color_sint = vk_format_is_sint(iview->vk_format),
                                          .color_uint = vk_format_is_uint(iview->vk_format)));
 
@@ -571,6 +632,7 @@ tu6_emit_render_cntl(struct tu_cmd_buffer *cmd,
        * that means the packets we're emitting also happen during binning. So
        * we need to guard the write on !BINNING at CP execution time.
        */
+      tu_cs_reserve(cs, 3 + 4);
       tu_cs_emit_pkt7(cs, CP_COND_REG_EXEC, 2);
       tu_cs_emit(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
                      CP_COND_REG_EXEC_0_GMEM | CP_COND_REG_EXEC_0_SYSMEM);
@@ -615,9 +677,8 @@ tu6_emit_blit_info(struct tu_cmd_buffer *cmd,
    tu_cs_emit_regs(cs,
                    A6XX_RB_BLIT_INFO(.unk0 = !resolve, .gmem = !resolve));
 
-   const struct tu_native_format *format =
-      tu6_get_native_format(iview->vk_format);
-   assert(format && format->rb >= 0);
+   const struct tu_native_format format =
+      tu6_format_color(iview->vk_format, iview->image->layout.tile_mode);
 
    enum a6xx_tile_mode tile_mode =
       tu6_get_image_tile_mode(iview->image, iview->base_mip);
@@ -625,8 +686,8 @@ tu6_emit_blit_info(struct tu_cmd_buffer *cmd,
                    A6XX_RB_BLIT_DST_INFO(
                       .tile_mode = tile_mode,
                       .samples = tu_msaa_samples(iview->image->samples),
-                      .color_format = format->rb,
-                      .color_swap = format->swap,
+                      .color_format = format.fmt,
+                      .color_swap = format.swap,
                       .flags = iview->image->layout.ubwc_layer_size != 0),
                    A6XX_RB_BLIT_DST(tu_image_view_base_ref(iview)),
                    A6XX_RB_BLIT_DST_PITCH(tu_image_stride(iview->image, iview->base_mip)),
@@ -645,9 +706,7 @@ tu6_emit_blit_info(struct tu_cmd_buffer *cmd,
 static void
 tu6_emit_blit(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
 {
-   tu6_emit_marker(cmd, cs);
    tu6_emit_event_write(cmd, cs, BLIT, false);
-   tu6_emit_marker(cmd, cs);
 }
 
 static void
@@ -694,6 +753,9 @@ use_hw_binning(struct tu_cmd_buffer *cmd)
    if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_NOBIN))
       return false;
 
+   if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_FORCEBIN))
+      return true;
+
    return (tiling->tile_count.width * tiling->tile_count.height) > 2;
 }
 
@@ -703,7 +765,7 @@ use_sysmem_rendering(struct tu_cmd_buffer *cmd)
    if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_SYSMEM))
       return true;
 
-   return false;
+   return cmd->state.tiling_config.force_sysmem;
 }
 
 static void
@@ -712,12 +774,10 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
                      const struct tu_tile *tile)
 {
    tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
-   tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(0x7));
+   tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_YIELD));
 
-   tu6_emit_marker(cmd, cs);
    tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
-   tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_GMEM) | 0x10);
-   tu6_emit_marker(cmd, cs);
+   tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_GMEM));
 
    const uint32_t x1 = tile->begin.x;
    const uint32_t y1 = tile->begin.y;
@@ -740,6 +800,7 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
                      A6XX_CP_REG_TEST_0_BIT(0) |
                      A6XX_CP_REG_TEST_0_WAIT_FOR_ME);
 
+      tu_cs_reserve(cs, 3 + 11);
       tu_cs_emit_pkt7(cs, CP_COND_REG_EXEC, 2);
       tu_cs_emit(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST));
       tu_cs_emit(cs, CP_COND_REG_EXEC_1_DWORDS(11));
@@ -859,24 +920,14 @@ tu6_emit_predicated_blit(struct tu_cmd_buffer *cmd,
                          uint32_t gmem_a,
                          bool resolve)
 {
-   const uint32_t space = 14 + 6;
-   struct tu_cond_exec_state state;
-
-   VkResult result = tu_cond_exec_start(cmd->device, cs, &state,
-                                        CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
-                                        CP_COND_REG_EXEC_0_GMEM,
-                                        space);
-   if (result != VK_SUCCESS) {
-      cmd->record_result = result;
-      return;
-   }
+   tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
 
    tu6_emit_blit_info(cmd, cs,
                       cmd->state.framebuffer->attachments[a].attachment,
                       cmd->state.pass->attachments[gmem_a].gmem_offset, resolve);
    tu6_emit_blit(cmd, cs);
 
-   tu_cond_exec_end(cs, &state);
+   tu_cond_exec_end(cs);
 }
 
 static void
@@ -910,21 +961,9 @@ static void tu6_emit_resolve(struct tu_cmd_buffer *cmd,
 
    tu6_emit_predicated_blit(cmd, cs, a, gmem_a, true);
 
-   const struct tu_framebuffer *fb = cmd->state.framebuffer;
-   const uint32_t space = 25 + 66 * fb->layers + 17;
-   struct tu_cond_exec_state state;
-
-   VkResult result = tu_cond_exec_start(cmd->device, cs, &state,
-                                        CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
-                                        CP_COND_REG_EXEC_0_SYSMEM,
-                                        space);
-   if (result != VK_SUCCESS) {
-      cmd->record_result = result;
-      return;
-   }
-
+   tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
    tu6_emit_sysmem_resolve(cmd, cs, a, gmem_a);
-   tu_cond_exec_end(cs, &state);
+   tu_cond_exec_end(cs);
 }
 
 static void
@@ -958,10 +997,8 @@ tu6_emit_tile_store(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
    tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
    tu_cs_emit(cs, 0x0);
 
-   tu6_emit_marker(cmd, cs);
    tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
-   tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_RESOLVE) | 0x10);
-   tu6_emit_marker(cmd, cs);
+   tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_RESOLVE));
 
    tu6_emit_blit_scissor(cmd, cs, true);
 
@@ -990,12 +1027,6 @@ tu6_emit_restart_index(struct tu_cs *cs, uint32_t restart_index)
 static void
 tu6_init_hw(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
 {
-   VkResult result = tu_cs_reserve_space(cmd->device, cs, 256);
-   if (result != VK_SUCCESS) {
-      cmd->record_result = result;
-      return;
-   }
-
    tu6_emit_cache_flush(cmd, cs);
 
    tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UPDATE_CNTL, 0xfffff);
@@ -1027,14 +1058,11 @@ tu6_init_hw(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
 
    tu_cs_emit_write_reg(cs, REG_A6XX_RB_SRGB_CNTL, 0);
 
-   tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_8101, 0);
-   tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_SAMPLE_CNTL, 0);
    tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_8110, 0);
 
    tu_cs_emit_write_reg(cs, REG_A6XX_RB_RENDER_CONTROL0, 0x401);
    tu_cs_emit_write_reg(cs, REG_A6XX_RB_RENDER_CONTROL1, 0);
    tu_cs_emit_write_reg(cs, REG_A6XX_RB_FS_OUTPUT_CNTL0, 0);
-   tu_cs_emit_write_reg(cs, REG_A6XX_RB_SAMPLE_CNTL, 0);
    tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8818, 0);
    tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8819, 0);
    tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881A, 0);
@@ -1086,8 +1114,6 @@ tu6_init_hw(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
    tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8879, 0);
    tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_CONTROL_5_REG, 0xfc);
 
-   tu6_emit_marker(cmd, cs);
-
    tu_cs_emit_write_reg(cs, REG_A6XX_VFD_MODE_CNTL, 0x00000000);
 
    tu_cs_emit_write_reg(cs, REG_A6XX_VFD_UNKNOWN_A008, 0);
@@ -1263,6 +1289,7 @@ emit_vsc_overflow_test(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
          A6XX_CP_REG_TEST_0_BIT(0) |
          A6XX_CP_REG_TEST_0_WAIT_FOR_ME);
 
+   tu_cs_reserve(cs, 3 + 7);
    tu_cs_emit_pkt7(cs, CP_COND_REG_EXEC, 2);
    tu_cs_emit(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST));
    tu_cs_emit(cs, CP_COND_REG_EXEC_1_DWORDS(7));
@@ -1301,10 +1328,8 @@ tu6_emit_binning_pass(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
 
    tu6_emit_window_scissor(cmd, cs, x1, y1, x2, y2);
 
-   tu6_emit_marker(cmd, cs);
    tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
    tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BINNING));
-   tu6_emit_marker(cmd, cs);
 
    tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
    tu_cs_emit(cs, 0x1);
@@ -1409,45 +1434,44 @@ tu_emit_sysmem_clear_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
 }
 
 static void
-tu_cmd_prepare_sysmem_clear_ib(struct tu_cmd_buffer *cmd,
-                               const VkRenderPassBeginInfo *info)
+tu_emit_load_clear(struct tu_cmd_buffer *cmd,
+                   const VkRenderPassBeginInfo *info)
 {
-   const struct tu_framebuffer *fb = cmd->state.framebuffer;
-   const uint32_t blit_cmd_space = 25 + 66 * fb->layers + 17;
-   const uint32_t clear_space =
-       blit_cmd_space * cmd->state.pass->attachment_count + 5;
+   struct tu_cs *cs = &cmd->draw_cs;
 
-   struct tu_cs sub_cs;
+   tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
 
-   VkResult result = tu_cs_begin_sub_stream(cmd->device, &cmd->sub_cs,
-                                            clear_space, &sub_cs);
-   if (result != VK_SUCCESS) {
-      cmd->record_result = result;
-      return;
-   }
+   tu6_emit_blit_scissor(cmd, cs, true);
 
    for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i)
-      tu_emit_sysmem_clear_attachment(cmd, &sub_cs, i, info);
+      tu6_emit_load_attachment(cmd, cs, i);
+
+   tu6_emit_blit_scissor(cmd, cs, false);
 
-   /* TODO: We shouldn't need this flush, but without it we'd have an empty IB
-    * when nothing clears which we currently can't handle.
+   for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i)
+      tu6_emit_clear_attachment(cmd, cs, i, info);
+
+   tu_cond_exec_end(cs);
+
+   /* invalidate because reading input attachments will cache GMEM and
+    * the cache isn't updated when GMEM is written
+    * TODO: is there a no-cache bit for textures?
     */
-   tu_cs_reserve_space(cmd->device, &sub_cs, 5);
-   tu6_emit_event_write(cmd, &sub_cs, UNK_1D, true);
+   if (cmd->state.subpass->input_count)
+      tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE, false);
+
+   tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
 
-   cmd->state.sysmem_clear_ib = tu_cs_end_sub_stream(&cmd->sub_cs, &sub_cs);
+   for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i)
+      tu_emit_sysmem_clear_attachment(cmd, cs, i, info);
+
+   tu_cond_exec_end(cs);
 }
 
 static void
 tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
                         const struct VkRect2D *renderArea)
 {
-   VkResult result = tu_cs_reserve_space(cmd->device, cs, 1024);
-   if (result != VK_SUCCESS) {
-      cmd->record_result = result;
-      return;
-   }
-
    const struct tu_framebuffer *fb = cmd->state.framebuffer;
    if (fb->width > 0 && fb->height > 0) {
       tu6_emit_window_scissor(cmd, cs,
@@ -1460,14 +1484,10 @@ tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
 
    tu6_emit_bin_size(cs, 0, 0, 0xc00000); /* 0xc00000 = BYPASS? */
 
-   tu_cs_emit_ib(cs, &cmd->state.sysmem_clear_ib);
-
    tu6_emit_lrz_flush(cmd, cs);
 
-   tu6_emit_marker(cmd, cs);
    tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
-   tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BYPASS) | 0x10);
-   tu6_emit_marker(cmd, cs);
+   tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BYPASS));
 
    tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
    tu_cs_emit(cs, 0x0);
@@ -1510,13 +1530,6 @@ tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
       }
    }
 
-   const uint32_t space = 14 + tu_cs_get_call_size(&cmd->draw_epilogue_cs);
-   VkResult result = tu_cs_reserve_space(cmd->device, cs, space);
-   if (result != VK_SUCCESS) {
-      cmd->record_result = result;
-      return;
-   }
-
    tu_cs_emit_call(cs, &cmd->draw_epilogue_cs);
 
    tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
@@ -1524,8 +1537,8 @@ tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
 
    tu6_emit_lrz_flush(cmd, cs);
 
-   tu6_emit_event_write(cmd, cs, UNK_1C, true);
-   tu6_emit_event_write(cmd, cs, UNK_1D, true);
+   tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS, true);
+   tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS, true);
 
    tu_cs_sanity_check(cs);
 }
@@ -1536,12 +1549,6 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
 {
    struct tu_physical_device *phys_dev = cmd->device->physical_device;
 
-   VkResult result = tu_cs_reserve_space(cmd->device, cs, 1024);
-   if (result != VK_SUCCESS) {
-      cmd->record_result = result;
-      return;
-   }
-
    tu6_emit_lrz_flush(cmd, cs);
 
    /* lrz clear? */
@@ -1596,15 +1603,7 @@ tu6_render_tile(struct tu_cmd_buffer *cmd,
                 struct tu_cs *cs,
                 const struct tu_tile *tile)
 {
-   const uint32_t render_tile_space = 256 + tu_cs_get_call_size(&cmd->draw_cs);
-   VkResult result = tu_cs_reserve_space(cmd->device, cs, render_tile_space);
-   if (result != VK_SUCCESS) {
-      cmd->record_result = result;
-      return;
-   }
-
    tu6_emit_tile_select(cmd, cs, tile);
-   tu_cs_emit_ib(cs, &cmd->state.tile_load_ib);
 
    tu_cs_emit_call(cs, &cmd->draw_cs);
    cmd->wait_for_idle = true;
@@ -1615,13 +1614,14 @@ tu6_render_tile(struct tu_cmd_buffer *cmd,
                      A6XX_CP_REG_TEST_0_BIT(0) |
                      A6XX_CP_REG_TEST_0_WAIT_FOR_ME);
 
+      tu_cs_reserve(cs, 3 + 2);
       tu_cs_emit_pkt7(cs, CP_COND_REG_EXEC, 2);
-      tu_cs_emit(cs, 0x10000000);
-      tu_cs_emit(cs, 2);  /* conditionally execute next 2 dwords */
+      tu_cs_emit(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST));
+      tu_cs_emit(cs, CP_COND_REG_EXEC_1_DWORDS(2));
 
       /* if (no overflow) */ {
          tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
-         tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(0x5) | 0x10);
+         tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_ENDVIS));
       }
    }
 
@@ -1633,13 +1633,6 @@ tu6_render_tile(struct tu_cmd_buffer *cmd,
 static void
 tu6_tile_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
 {
-   const uint32_t space = 16 + tu_cs_get_call_size(&cmd->draw_epilogue_cs);
-   VkResult result = tu_cs_reserve_space(cmd->device, cs, space);
-   if (result != VK_SUCCESS) {
-      cmd->record_result = result;
-      return;
-   }
-
    tu_cs_emit_call(cs, &cmd->draw_epilogue_cs);
 
    tu_cs_emit_regs(cs,
@@ -1677,65 +1670,20 @@ tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd)
 
    tu6_sysmem_render_begin(cmd, &cmd->cs, &tiling->render_area);
 
-   const uint32_t space = tu_cs_get_call_size(&cmd->draw_cs);
-   VkResult result = tu_cs_reserve_space(cmd->device, &cmd->cs, space);
-   if (result != VK_SUCCESS) {
-      cmd->record_result = result;
-      return;
-   }
-
    tu_cs_emit_call(&cmd->cs, &cmd->draw_cs);
    cmd->wait_for_idle = true;
 
    tu6_sysmem_render_end(cmd, &cmd->cs);
 }
 
-static void
-tu_cmd_prepare_tile_load_ib(struct tu_cmd_buffer *cmd,
-                            const VkRenderPassBeginInfo *info)
-{
-   const uint32_t tile_load_space =
-      2 * 3 /* blit_scissor */ +
-      (20 /* load */ + 19 /* clear */) * cmd->state.pass->attachment_count +
-      2 /* cache invalidate */;
-
-   struct tu_cs sub_cs;
-
-   VkResult result = tu_cs_begin_sub_stream(cmd->device, &cmd->sub_cs,
-                                            tile_load_space, &sub_cs);
-   if (result != VK_SUCCESS) {
-      cmd->record_result = result;
-      return;
-   }
-
-   tu6_emit_blit_scissor(cmd, &sub_cs, true);
-
-   for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i)
-      tu6_emit_load_attachment(cmd, &sub_cs, i);
-
-   tu6_emit_blit_scissor(cmd, &sub_cs, false);
-
-   for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i)
-      tu6_emit_clear_attachment(cmd, &sub_cs, i, info);
-
-   /* invalidate because reading input attachments will cache GMEM and
-    * the cache isn''t updated when GMEM is written
-    * TODO: is there a no-cache bit for textures?
-    */
-   if (cmd->state.subpass->input_count)
-      tu6_emit_event_write(cmd, &sub_cs, CACHE_INVALIDATE, false);
-
-   cmd->state.tile_load_ib = tu_cs_end_sub_stream(&cmd->sub_cs, &sub_cs);
-}
-
 static void
 tu_cmd_prepare_tile_store_ib(struct tu_cmd_buffer *cmd)
 {
    const uint32_t tile_store_space = 32 + 23 * cmd->state.pass->attachment_count;
    struct tu_cs sub_cs;
 
-   VkResult result = tu_cs_begin_sub_stream(cmd->device, &cmd->sub_cs,
-                                            tile_store_space, &sub_cs);
+   VkResult result =
+      tu_cs_begin_sub_stream(&cmd->sub_cs, tile_store_space, &sub_cs);
    if (result != VK_SUCCESS) {
       cmd->record_result = result;
       return;
@@ -1755,6 +1703,7 @@ tu_cmd_update_tiling_config(struct tu_cmd_buffer *cmd,
    struct tu_tiling_config *tiling = &cmd->state.tiling_config;
 
    tiling->render_area = *render_area;
+   tiling->force_sysmem = force_sysmem(cmd, render_area);
 
    tu_tiling_config_update_tile_layout(tiling, dev, cmd->state.pass->gmem_pixels);
    tu_tiling_config_update_pipe_layout(tiling, dev);
@@ -1932,18 +1881,15 @@ tu_create_cmd_buffer(struct tu_device *device,
    }
 
    tu_bo_list_init(&cmd_buffer->bo_list);
-   tu_cs_init(&cmd_buffer->cs, TU_CS_MODE_GROW, 4096);
-   tu_cs_init(&cmd_buffer->draw_cs, TU_CS_MODE_GROW, 4096);
-   tu_cs_init(&cmd_buffer->draw_epilogue_cs, TU_CS_MODE_GROW, 4096);
-   tu_cs_init(&cmd_buffer->sub_cs, TU_CS_MODE_SUB_STREAM, 2048);
+   tu_cs_init(&cmd_buffer->cs, device, TU_CS_MODE_GROW, 4096);
+   tu_cs_init(&cmd_buffer->draw_cs, device, TU_CS_MODE_GROW, 4096);
+   tu_cs_init(&cmd_buffer->draw_epilogue_cs, device, TU_CS_MODE_GROW, 4096);
+   tu_cs_init(&cmd_buffer->sub_cs, device, TU_CS_MODE_SUB_STREAM, 2048);
 
    *pCommandBuffer = tu_cmd_buffer_to_handle(cmd_buffer);
 
    list_inithead(&cmd_buffer->upload.list);
 
-   cmd_buffer->marker_reg = REG_A6XX_CP_SCRATCH_REG(
-      cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY ? 7 : 6);
-
    VkResult result = tu_bo_init_new(device, &cmd_buffer->scratch_bo, 0x1000);
    if (result != VK_SUCCESS)
       goto fail_scratch_bo;
@@ -1971,10 +1917,10 @@ tu_cmd_buffer_destroy(struct tu_cmd_buffer *cmd_buffer)
    for (unsigned i = 0; i < VK_PIPELINE_BIND_POINT_RANGE_SIZE; i++)
       free(cmd_buffer->descriptors[i].push_set.set.mapped_ptr);
 
-   tu_cs_finish(cmd_buffer->device, &cmd_buffer->cs);
-   tu_cs_finish(cmd_buffer->device, &cmd_buffer->draw_cs);
-   tu_cs_finish(cmd_buffer->device, &cmd_buffer->draw_epilogue_cs);
-   tu_cs_finish(cmd_buffer->device, &cmd_buffer->sub_cs);
+   tu_cs_finish(&cmd_buffer->cs);
+   tu_cs_finish(&cmd_buffer->draw_cs);
+   tu_cs_finish(&cmd_buffer->draw_epilogue_cs);
+   tu_cs_finish(&cmd_buffer->sub_cs);
 
    tu_bo_list_destroy(&cmd_buffer->bo_list);
    vk_free(&cmd_buffer->pool->alloc, cmd_buffer);
@@ -1988,10 +1934,10 @@ tu_reset_cmd_buffer(struct tu_cmd_buffer *cmd_buffer)
    cmd_buffer->record_result = VK_SUCCESS;
 
    tu_bo_list_reset(&cmd_buffer->bo_list);
-   tu_cs_reset(cmd_buffer->device, &cmd_buffer->cs);
-   tu_cs_reset(cmd_buffer->device, &cmd_buffer->draw_cs);
-   tu_cs_reset(cmd_buffer->device, &cmd_buffer->draw_epilogue_cs);
-   tu_cs_reset(cmd_buffer->device, &cmd_buffer->sub_cs);
+   tu_cs_reset(&cmd_buffer->cs);
+   tu_cs_reset(&cmd_buffer->draw_cs);
+   tu_cs_reset(&cmd_buffer->draw_epilogue_cs);
+   tu_cs_reset(&cmd_buffer->sub_cs);
 
    for (unsigned i = 0; i < VK_PIPELINE_BIND_POINT_RANGE_SIZE; i++) {
       cmd_buffer->descriptors[i].valid = 0;
@@ -2107,7 +2053,6 @@ tu_BeginCommandBuffer(VkCommandBuffer commandBuffer,
    tu_cs_begin(&cmd_buffer->draw_cs);
    tu_cs_begin(&cmd_buffer->draw_epilogue_cs);
 
-   cmd_buffer->marker_seqno = 0;
    cmd_buffer->scratch_seqno = 0;
 
    /* setup initial configuration into command buffer */
@@ -2164,11 +2109,6 @@ tu_CmdBindIndexBuffer(VkCommandBuffer commandBuffer,
    /* initialize/update the restart index */
    if (!cmd->state.index_buffer || cmd->state.index_type != indexType) {
       struct tu_cs *draw_cs = &cmd->draw_cs;
-      VkResult result = tu_cs_reserve_space(cmd->device, draw_cs, 2);
-      if (result != VK_SUCCESS) {
-         cmd->record_result = result;
-         return;
-      }
 
       tu6_emit_restart_index(
          draw_cs, indexType == VK_INDEX_TYPE_UINT32 ? 0xffffffff : 0xffff);
@@ -2314,12 +2254,6 @@ tu_CmdSetViewport(VkCommandBuffer commandBuffer,
    TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
    struct tu_cs *draw_cs = &cmd->draw_cs;
 
-   VkResult result = tu_cs_reserve_space(cmd->device, draw_cs, 12);
-   if (result != VK_SUCCESS) {
-      cmd->record_result = result;
-      return;
-   }
-
    assert(firstViewport == 0 && viewportCount == 1);
    tu6_emit_viewport(draw_cs, pViewports);
 
@@ -2335,12 +2269,6 @@ tu_CmdSetScissor(VkCommandBuffer commandBuffer,
    TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
    struct tu_cs *draw_cs = &cmd->draw_cs;
 
-   VkResult result = tu_cs_reserve_space(cmd->device, draw_cs, 3);
-   if (result != VK_SUCCESS) {
-      cmd->record_result = result;
-      return;
-   }
-
    assert(firstScissor == 0 && scissorCount == 1);
    tu6_emit_scissor(draw_cs, pScissors);
 
@@ -2367,12 +2295,6 @@ tu_CmdSetDepthBias(VkCommandBuffer commandBuffer,
    TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
    struct tu_cs *draw_cs = &cmd->draw_cs;
 
-   VkResult result = tu_cs_reserve_space(cmd->device, draw_cs, 4);
-   if (result != VK_SUCCESS) {
-      cmd->record_result = result;
-      return;
-   }
-
    tu6_emit_depth_bias(draw_cs, depthBiasConstantFactor, depthBiasClamp,
                        depthBiasSlopeFactor);
 
@@ -2386,12 +2308,6 @@ tu_CmdSetBlendConstants(VkCommandBuffer commandBuffer,
    TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
    struct tu_cs *draw_cs = &cmd->draw_cs;
 
-   VkResult result = tu_cs_reserve_space(cmd->device, draw_cs, 5);
-   if (result != VK_SUCCESS) {
-      cmd->record_result = result;
-      return;
-   }
-
    tu6_emit_blend_constants(draw_cs, blendConstants);
 
    tu_cs_sanity_check(draw_cs);
@@ -2592,15 +2508,9 @@ tu_CmdBeginRenderPass(VkCommandBuffer commandBuffer,
    cmd->state.framebuffer = fb;
 
    tu_cmd_update_tiling_config(cmd, &pRenderPassBegin->renderArea);
-   tu_cmd_prepare_sysmem_clear_ib(cmd, pRenderPassBegin);
-   tu_cmd_prepare_tile_load_ib(cmd, pRenderPassBegin);
    tu_cmd_prepare_tile_store_ib(cmd);
 
-   VkResult result = tu_cs_reserve_space(cmd->device, &cmd->draw_cs, 1024);
-   if (result != VK_SUCCESS) {
-      cmd->record_result = result;
-      return;
-   }
+   tu_emit_load_clear(cmd, pRenderPassBegin);
 
    tu6_emit_zs(cmd, cmd->state.subpass, &cmd->draw_cs);
    tu6_emit_mrt(cmd, cmd->state.subpass, &cmd->draw_cs);
@@ -2650,12 +2560,6 @@ tu_CmdNextSubpass(VkCommandBuffer commandBuffer, VkSubpassContents contents)
       }
    }
 
-   VkResult result = tu_cs_reserve_space(cmd->device, &cmd->draw_cs, 1024);
-   if (result != VK_SUCCESS) {
-      cmd->record_result = result;
-      return;
-   }
-
    /* invalidate because reading input attachments will cache GMEM and
     * the cache isn''t updated when GMEM is written
     * TODO: is there a no-cache bit for textures?
@@ -2672,8 +2576,8 @@ tu_CmdNextSubpass(VkCommandBuffer commandBuffer, VkSubpassContents contents)
    /* Emit flushes so that input attachments will read the correct value. This
     * is for sysmem only, although it shouldn't do much harm on gmem.
     */
-   tu6_emit_event_write(cmd, cs, UNK_1C, true);
-   tu6_emit_event_write(cmd, cs, UNK_1D, true);
+   tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS, true);
+   tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS, true);
 
    /* TODO:
     * since we don't know how to do GMEM->GMEM resolve,
@@ -3070,7 +2974,7 @@ tu6_emit_consts(struct tu_cmd_buffer *cmd,
                 gl_shader_stage type)
 {
    struct tu_cs cs;
-   tu_cs_begin_sub_stream(cmd->device, &cmd->sub_cs, 512, &cs); /* TODO: maximum size? */
+   tu_cs_begin_sub_stream(&cmd->sub_cs, 512, &cs); /* TODO: maximum size? */
 
    tu6_emit_user_consts(&cs, pipeline, descriptors_state, type, cmd->push_constants);
    tu6_emit_ubos(&cs, pipeline, descriptors_state, type);
@@ -3094,7 +2998,7 @@ tu6_emit_vs_params(struct tu_cmd_buffer *cmd,
       return VK_SUCCESS;
    }
 
-   VkResult result = tu_cs_begin_sub_stream(cmd->device, &cmd->sub_cs, 8, &cs);
+   VkResult result = tu_cs_begin_sub_stream(&cmd->sub_cs, 8, &cs);
    if (result != VK_SUCCESS)
       return result;
 
@@ -3127,7 +3031,6 @@ tu6_emit_textures(struct tu_cmd_buffer *cmd,
                   bool *needs_border,
                   bool is_sysmem)
 {
-   struct tu_device *device = cmd->device;
    struct tu_cs *draw_state = &cmd->sub_cs;
    const struct tu_program_descriptor_linkage *link =
       &pipeline->program.link[type];
@@ -3140,7 +3043,7 @@ tu6_emit_textures(struct tu_cmd_buffer *cmd,
 
    /* allocate and fill texture state */
    struct ts_cs_memory tex_const;
-   result = tu_cs_alloc(device, draw_state, link->texture_map.num_desc,
+   result = tu_cs_alloc(draw_state, link->texture_map.num_desc,
                         A6XX_TEX_CONST_DWORDS, &tex_const);
    if (result != VK_SUCCESS)
       return result;
@@ -3158,7 +3061,7 @@ tu6_emit_textures(struct tu_cmd_buffer *cmd,
    /* allocate and fill sampler state */
    struct ts_cs_memory tex_samp = { 0 };
    if (link->sampler_map.num_desc) {
-      result = tu_cs_alloc(device, draw_state, link->sampler_map.num_desc,
+      result = tu_cs_alloc(draw_state, link->sampler_map.num_desc,
                            A6XX_TEX_SAMP_DWORDS, &tex_samp);
       if (result != VK_SUCCESS)
          return result;
@@ -3203,7 +3106,7 @@ tu6_emit_textures(struct tu_cmd_buffer *cmd,
    }
 
    struct tu_cs cs;
-   result = tu_cs_begin_sub_stream(device, draw_state, 16, &cs);
+   result = tu_cs_begin_sub_stream(draw_state, 16, &cs);
    if (result != VK_SUCCESS)
       return result;
 
@@ -3247,7 +3150,6 @@ tu6_emit_ibo(struct tu_cmd_buffer *cmd,
              gl_shader_stage type,
              struct tu_cs_entry *entry)
 {
-   struct tu_device *device = cmd->device;
    struct tu_cs *draw_state = &cmd->sub_cs;
    const struct tu_program_descriptor_linkage *link =
       &pipeline->program.link[type];
@@ -3261,7 +3163,7 @@ tu6_emit_ibo(struct tu_cmd_buffer *cmd,
    }
 
    struct ts_cs_memory ibo_const;
-   result = tu_cs_alloc(device, draw_state, num_desc,
+   result = tu_cs_alloc(draw_state, num_desc,
                         A6XX_TEX_CONST_DWORDS, &ibo_const);
    if (result != VK_SUCCESS)
       return result;
@@ -3305,7 +3207,7 @@ tu6_emit_ibo(struct tu_cmd_buffer *cmd,
    assert(ssbo_index == num_desc);
 
    struct tu_cs cs;
-   result = tu_cs_begin_sub_stream(device, draw_state, 7, &cs);
+   result = tu_cs_begin_sub_stream(draw_state, 7, &cs);
    if (result != VK_SUCCESS)
       return result;
 
@@ -3415,7 +3317,7 @@ tu6_emit_border_color(struct tu_cmd_buffer *cmd,
       &pipeline->program.link[MESA_SHADER_FRAGMENT].sampler_map;
    struct ts_cs_memory ptr;
 
-   VkResult result = tu_cs_alloc(cmd->device, &cmd->sub_cs,
+   VkResult result = tu_cs_alloc(&cmd->sub_cs,
                                  vs_sampler->num_desc + fs_sampler->num_desc,
                                  128 / 4,
                                  &ptr);
@@ -3454,14 +3356,11 @@ tu6_bind_draw_states(struct tu_cmd_buffer *cmd,
    const struct tu_dynamic_state *dynamic = &cmd->state.dynamic;
    struct tu_draw_state_group draw_state_groups[TU_DRAW_STATE_COUNT];
    uint32_t draw_state_group_count = 0;
+   VkResult result;
 
    struct tu_descriptor_state *descriptors_state =
       &cmd->descriptors[VK_PIPELINE_BIND_POINT_GRAPHICS];
 
-   VkResult result = tu_cs_reserve_space(cmd->device, cs, 256);
-   if (result != VK_SUCCESS)
-      return result;
-
    /* TODO lrz */
 
    tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9806, 0);
@@ -3770,22 +3669,12 @@ tu_draw(struct tu_cmd_buffer *cmd, const struct tu_draw_info *draw)
       return;
    }
 
-   result = tu_cs_reserve_space(cmd->device, cs, 32);
-   if (result != VK_SUCCESS) {
-      cmd->record_result = result;
-      return;
-   }
-
    if (draw->indirect) {
       tu_finishme("indirect draw");
       return;
    }
 
-   /* TODO tu6_emit_marker should pick different regs depending on cs */
-
-   tu6_emit_marker(cmd, cs);
    tu6_emit_draw_direct(cmd, cs, draw);
-   tu6_emit_marker(cmd, cs);
 
    cmd->wait_for_idle = true;
 
@@ -3944,12 +3833,7 @@ tu_dispatch(struct tu_cmd_buffer *cmd,
    struct tu_pipeline *pipeline = cmd->state.compute_pipeline;
    struct tu_descriptor_state *descriptors_state =
       &cmd->descriptors[VK_PIPELINE_BIND_POINT_COMPUTE];
-
-   VkResult result = tu_cs_reserve_space(cmd->device, cs, 256);
-   if (result != VK_SUCCESS) {
-      cmd->record_result = result;
-      return;
-   }
+   VkResult result;
 
    if (cmd->state.dirty & TU_CMD_DIRTY_COMPUTE_PIPELINE)
       tu_cs_emit_ib(cs, &pipeline->program.state_ib);
@@ -4004,7 +3888,7 @@ tu_dispatch(struct tu_cmd_buffer *cmd,
    cmd->state.dirty = TU_CMD_DIRTY_PIPELINE;
 
    tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
-   tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(0x8));
+   tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_COMPUTE));
 
    const uint32_t *local_size = pipeline->compute.local_size;
    const uint32_t *num_groups = info->blocks;
@@ -4177,12 +4061,6 @@ write_event(struct tu_cmd_buffer *cmd, struct tu_event *event, unsigned value)
 {
    struct tu_cs *cs = &cmd->cs;
 
-   VkResult result = tu_cs_reserve_space(cmd->device, cs, 4);
-   if (result != VK_SUCCESS) {
-      cmd->record_result = result;
-      return;
-   }
-
    tu_bo_list_add(&cmd->bo_list, &event->bo, MSM_SUBMIT_BO_WRITE);
 
    /* TODO: any flush required before/after ? */
@@ -4230,16 +4108,10 @@ tu_CmdWaitEvents(VkCommandBuffer commandBuffer,
    TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
    struct tu_cs *cs = &cmd->cs;
 
-   VkResult result = tu_cs_reserve_space(cmd->device, cs, eventCount * 7);
-   if (result != VK_SUCCESS) {
-      cmd->record_result = result;
-      return;
-   }
-
    /* TODO: any flush required before/after? (CP_WAIT_FOR_ME?) */
 
    for (uint32_t i = 0; i < eventCount; i++) {
-      const struct tu_event *event = (const struct tu_event*) pEvents[i];
+      TU_FROM_HANDLE(tu_event, event, pEvents[i]);
 
       tu_bo_list_add(&cmd->bo_list, &event->bo, MSM_SUBMIT_BO_READ);